Writing to page-mapped DMA buffers in the kernel


I've been working on modifying the Intel ixgbe kernel driver to work with my PCIe device (an FPGA, but that's not especially important). The kernel and the PCIe device negotiate quite well: configuration headers are exchanged and communication seems to function. However, when attempting a DMA_FROM_DEVICE transfer, I have a problem that I don't understand, and I'm hoping for help.

rx_ring->desc = dma_alloc_coherent(dev, ///Allocates a coherent DMA region of rx_ring->size bytes on device dev with flag GFP_KERNEL
                       rx_ring->size,
                       &rx_ring->dma,   ///This DMA handle may be cast to an unsigned integer of the same bus width and given to dev as the DMA base address
                       GFP_KERNEL);

page = dev_alloc_pages(0);
dma = dma_map_page(rx_ring->dev, page, 0, acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);

//Write to the PCI device the base address it should place data into.
writel(q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->hw_region2.hw_addr+0x08+ACC_PCI_IPCONT_DATA_OFFSET);
writel(q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x0C+ACC_PCI_IPCONT_DATA_OFFSET);
//This will perfectly read data I place onto the PCIe bus.
rx_ring->desc->wb.upper.length

//This seems to read some garbage memory.
dma_sync_single_range_for_cpu(rx_ring->dev,
                      rx_buffer->dma,
                      rx_buffer->page_offset,
                      acc_rx_bufsz(rx_ring),
                      DMA_FROM_DEVICE);
unsigned char *va = page_address(page) + rx_buffer->page_offset;
memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));

//Some code later
dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
                 new_buff->page_offset,
                 acc_rx_bufsz(rx_ring),
                 DMA_FROM_DEVICE);

I've tried to pare the code down to just the points of interest, but here's the brief rundown. I allocate space for the DMA, creating the virtual and bus addresses, via dma_alloc_coherent. I allocate a page of memory for the DMA and map it via the dev_alloc_pages and dma_map_page calls. I pass the DMA bus address to my PCIe device so it can write to the proper offset, via the writel calls (I know iowrite32 is the preferred interface, but this is on Red Hat).
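To make the distinction explicit, here is a minimal sketch of the two mappings as I understand them (the dev_info calls are hypothetical debug code, not part of the driver):

/* Coherent mapping: rx_ring->desc (CPU virtual address) and
 * rx_ring->dma (bus address) refer to the same descriptor-ring memory. */
rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
                                   &rx_ring->dma, GFP_KERNEL);

/* Streaming mapping: dma_map_page() returns a second, independent bus
 * address for the receive page; it is unrelated to rx_ring->dma. */
page = dev_alloc_pages(0);
dma = dma_map_page(dev, page, 0, acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);

/* Hypothetical debug output: the two bus addresses differ. */
dev_info(dev, "desc ring bus addr: 0x%llx\n", (unsigned long long)rx_ring->dma);
dev_info(dev, "rx page bus addr:   0x%llx\n", (unsigned long long)dma);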

From here, there are two ways the original ixgbe driver reads data from the PCIe bus. First, it reads directly from the DMA region's allocated virtual address (desc), but this is only used for configuration information (in the driver I am working from). The second method uses page_address(page) to, I believe, get a virtual address for the page of memory. The problem is that there is only garbage memory there.
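Condensed, the two read paths look like this (same names as in my code above; the sync call is there because the page is a streaming mapping):

/* Path 1: coherent descriptor ring, read directly through the virtual
 * address returned by dma_alloc_coherent(). */
u16 len = le16_to_cpu(rx_ring->desc->wb.upper.length);

/* Path 2: streaming page mapping, synced back to the CPU before reading
 * through the virtual address from page_address(). */
dma_sync_single_range_for_cpu(rx_ring->dev, rx_buffer->dma,
                              rx_buffer->page_offset,
                              acc_rx_bufsz(rx_ring), DMA_FROM_DEVICE);
unsigned char *va = page_address(rx_buffer->page) + rx_buffer->page_offset;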

So here is my confusion. Where is page pointing, and how do I place data into page via the PCI bus? I assumed that dma_map_page would in effect merge the two virtual addresses into one, so that my write to the DMA bus address would land in the page, but this doesn't seem to be the case. What base address should my PCI device write to in order to land in this page of memory?
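As a sanity check, I have been dumping the handle dma_map_page() gave me next to what virt_to_bus() computes for the page (hypothetical debug code; I know virt_to_bus is deprecated):

/* rx_buffer->dma came from dma_map_page(); the second value is what the
 * legacy virt_to_bus() interface derives from the page's virtual address. */
printk(KERN_INFO "acc mapped=0x%llx virt_to_bus=0x%llx\n",
       (unsigned long long)rx_buffer->dma,
       (unsigned long long)virt_to_bus(page_address(rx_buffer->page)));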

I'm working on Red Hat, specifically CentOS with kernel version 3.10.0, which makes for some problems since the Red Hat kernel is quite different from the mainline kernel, but hopefully someone can help. Thank you for any pointers.

EDIT: Added the dma_sync calls, which I forgot to include in the original post.

EDIT2: Added a more complete code base. Note that I'm still not including some of the struct definitions or top-level function calls (probe, for instance), but hopefully this is much more complete. Sorry for the length.

//These functions are called during configuration
int acc_setup_rx_resources(struct acc_ring *rx_ring)
{
    struct device *dev = rx_ring->dev;
    int orig_node = dev_to_node(dev);
    int numa_node = -1;
    int size;

    size = sizeof(struct acc_rx_buffer) * rx_ring->count;

    if (rx_ring->q_vector)
        numa_node = rx_ring->q_vector->numa_node;

    rx_ring->rx_buffer_info = vzalloc_node(size, numa_node);
    if (!rx_ring->rx_buffer_info)
        rx_ring->rx_buffer_info = vzalloc(size);
    if (!rx_ring->rx_buffer_info)
        goto err;

    /* Round up to nearest 4K */
    rx_ring->size = rx_ring->count * sizeof(union acc_adv_rx_desc);
    rx_ring->size = ALIGN(rx_ring->size, 4096);

    set_dev_node(dev, numa_node);
    rx_ring->desc = dma_alloc_coherent(dev, 
                       rx_ring->size,
                       &rx_ring->dma,   
                       GFP_KERNEL);
    set_dev_node(dev, orig_node);
    if (!rx_ring->desc)
        rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
                           &rx_ring->dma, GFP_KERNEL);
    if (!rx_ring->desc)
        goto err;

    rx_ring->next_to_clean = 0;
    rx_ring->next_to_use = 0;

    return 0;
err:
    vfree(rx_ring->rx_buffer_info);
    rx_ring->rx_buffer_info = NULL;
    dev_err(dev, "Unable to allocate memory for the Rx descriptor ring\n");
    return -ENOMEM;
}
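
For completeness, my teardown mirrors ixgbe_free_rx_resources; a sketch, assuming the same struct fields as above:

void acc_free_rx_resources(struct acc_ring *rx_ring)
{
    vfree(rx_ring->rx_buffer_info);
    rx_ring->rx_buffer_info = NULL;

    /* nothing to free if the descriptor ring was never allocated */
    if (!rx_ring->desc)
        return;

    dma_free_coherent(rx_ring->dev, rx_ring->size,
                      rx_ring->desc, rx_ring->dma);
    rx_ring->desc = NULL;
}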

static bool acc_alloc_mapped_page(struct acc_ring *rx_ring,
                    struct acc_rx_buffer *bi)
{
    struct page *page = bi->page;
    dma_addr_t dma = bi->dma;

    if (likely(page))
        return true;

    page = dev_alloc_pages(0);
    if(unlikely(!page)){
        rx_ring->rx_stats.alloc_rx_page_failed++;
        return false;
    }

    /* map page for use */
    dma = dma_map_page(rx_ring->dev, page, 0,
               acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);

    if (dma_mapping_error(rx_ring->dev, dma)) {
        __free_pages(page, acc_rx_pg_order(rx_ring));
        bi->page = NULL;

        rx_ring->rx_stats.alloc_rx_page_failed++;
        return false;
    }
    bi->dma = dma;
    bi->page = page; 
    bi->page_offset = 0;
    page_ref_add(page, USHRT_MAX - 1);  //This seems to exist in redhat kernel but not 3.10 base kernel... keep?

    return true;
}

void acc_alloc_rx_buffers(struct acc_ring *rx_ring, u16 cleaned_count)
{
    union acc_adv_rx_desc *rx_desc;
    struct acc_rx_buffer *bi;
    u16 i = rx_ring->next_to_use;   
    printk(KERN_INFO "acc Attempting to allocate rx buffers\n");

    /* nothing to do */
    if (!cleaned_count)
        return;

    rx_desc = ACC_RX_DESC(rx_ring, i);  
    bi = &rx_ring->rx_buffer_info[i];   
    i -= rx_ring->count;    

    do {
        if (!acc_alloc_mapped_page(rx_ring, bi)){
            printk(KERN_INFO "acc Failed to allocate and map the page to dma\n");
            break;
        }
        printk(KERN_INFO "acc happily allocated and mapped page to dma\n");

        /*
         * Refresh the desc even if buffer_addrs didn't change
         * because each write-back erases this info.
         */
        rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);

        rx_desc++;
        bi++;   ///Move to the next buffer
        i++;
        if (unlikely(!i)) {
            rx_desc = ACC_RX_DESC(rx_ring, 0);
            bi = rx_ring->rx_buffer_info;
            i -= rx_ring->count;
        }

        /* clear the hdr_addr for the next_to_use descriptor */
        rx_desc->read.hdr_addr = 0;

        cleaned_count--;
    } while (cleaned_count); 

    i += rx_ring->count;

    if (rx_ring->next_to_use != i)
        acc_release_rx_desc(rx_ring, i);
}

//This function is called via napi_schedule, which fires when an MSI interrupt is raised by my PCIe device (this all works fine).
int acc_poll(struct napi_struct *napi, int budget)
{
    struct acc_q_vector *q_vector =
                container_of(napi, struct acc_q_vector, napi);
    struct acc_adapter *adapter = q_vector->adapter;
    struct acc_ring *ring;
    int per_ring_budget;
    bool clean_complete = true;

    e_dev_info("Landed in acc_poll\n");

    e_dev_info("Attempting to read register space 0x00=%x\t0x04=%x\n", \
        readl(q_vector->adapter->hw.hw_addr), readl(q_vector->adapter->hw.hw_addr+0x04));
    e_dev_info("Attempting to write to pci ctl\n");
    e_dev_info("Target address %.8x%.8x\n",q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF);
    e_dev_info("Attempted page address %.8x%.8x\n",virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) >> 32, virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) & 0xFFFFFFFF);
    writeq(0x0000000000000001, q_vector->adapter->hw_region2.hw_addr+ACC_PCI_IPCONT_DATA_OFFSET);  //These should be iowrite64, but iowrite64 on Red Hat appears to differ and only supports the copy form (to, from, size).

    writel(q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->hw_region2.hw_addr+0x08+ACC_PCI_IPCONT_DATA_OFFSET);
    writel(q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x0C+ACC_PCI_IPCONT_DATA_OFFSET);

    writel(virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) >> 32, q_vector->adapter->hw_region2.hw_addr+0x10+ACC_PCI_IPCONT_DATA_OFFSET);
    writel(virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x14+ACC_PCI_IPCONT_DATA_OFFSET);

    writeq(0xFF00000000000000, q_vector->adapter->hw_region2.hw_addr+0x18+ACC_PCI_IPCONT_DATA_OFFSET);
    writeq(0x0000000CC0000000, q_vector->adapter->hw_region2.hw_addr+0x20+ACC_PCI_IPCONT_DATA_OFFSET);
    writeq(0x0000000CC0000000, q_vector->adapter->hw_region2.hw_addr+0x28+ACC_PCI_IPCONT_DATA_OFFSET);
    writeq(0x0003344000005500, q_vector->adapter->hw_region2.hw_addr+0x30+ACC_PCI_IPCONT_DATA_OFFSET);

    //Send the start command to the block
    writeq(0x0000000000000001, q_vector->adapter->hw_region2.hw_addr);


    acc_for_each_ring(ring, q_vector->tx)
        clean_complete &= !!acc_clean_tx_irq(q_vector, ring);

    if (q_vector->rx.count > 1)
        per_ring_budget = max(budget/q_vector->rx.count, 1);
    else
        per_ring_budget = budget;

    acc_for_each_ring(ring, q_vector->rx){
        e_dev_info("Calling clean_rx_irq\n");
        clean_complete &= acc_clean_rx_irq(q_vector, ring,  
                             per_ring_budget);
    }

    /* If all work not completed, return budget and keep polling */
    if (!clean_complete)
        return budget;

    e_dev_info("Clean complete\n");

    /* all work done, exit the polling mode */
    napi_complete(napi);
    if (adapter->rx_itr_setting & 1)
        acc_set_itr(q_vector);
    if (!test_bit(__ACC_DOWN, &adapter->state))
        acc_irq_enable_queues(adapter, ((u64)1 << q_vector->v_idx));

    e_dev_info("Exiting acc_poll\n");

    return 0;
}

static bool acc_clean_rx_irq(struct acc_q_vector *q_vector,
                   struct acc_ring *rx_ring,
                   const int budget)
{
    printk(KERN_INFO "acc Entered clean_rx_irq\n");
    unsigned int total_rx_bytes = 0, total_rx_packets = 0;
    u16 cleaned_count = acc_desc_unused(rx_ring);   /// First pass this is count-1 because ntc and ntu are 0 so this is 512-1=511

    printk(KERN_INFO "acc RX irq Clean count = %d\n", cleaned_count);

    do {
        union acc_adv_rx_desc *rx_desc;
        struct sk_buff *skb;

        /* return some buffers to hardware, one at a time is too slow */
        if (cleaned_count >= ACC_RX_BUFFER_WRITE) { //When the clean count is >=16, allocate more buffers to bring it back down. On the first pass this happens.
            acc_alloc_rx_buffers(rx_ring, cleaned_count);
            cleaned_count = 0;
        }

        rx_desc = ACC_RX_DESC(rx_ring, rx_ring->next_to_clean);

        printk(KERN_INFO "acc inside RX do while, acquired description\n");

        printk(KERN_INFO "acc Everything I can about the rx_ring desc (acc_rx_buffer). status_error=%d\t \
        length=%d\n", rx_desc->wb.upper.status_error, rx_desc->wb.upper.length);

        if (!acc_test_staterr(rx_desc, ACC_RXD_STAT_DD))
            break;

        printk(KERN_INFO "acc inside RX past status_error check\n");
        /*
         * This memory barrier is needed to keep us from reading
         * any other fields out of the rx_desc until we know the
         * RXD_STAT_DD bit is set
         */
        rmb();

        /* retrieve a buffer from the ring */
        skb = acc_fetch_rx_buffer(rx_ring, rx_desc);

        /* exit if we failed to retrieve a buffer */
        if (!skb)
            break;

        printk(KERN_INFO "acc successfully retrieved a buffer\n");

        cleaned_count++;

        /* place incomplete frames back on ring for completion */
        if (acc_is_non_eop(rx_ring, rx_desc, skb))
            continue;

        /* verify the packet layout is correct */
        if (acc_cleanup_headers(rx_ring, rx_desc, skb))
            continue;

        /* probably a little skewed due to removing CRC */
        total_rx_bytes += skb->len;

        /* populate checksum, timestamp, VLAN, and protocol */
        acc_process_skb_fields(rx_ring, rx_desc, skb);

        acc_rx_skb(q_vector, skb);  ///I believe this hands the data up to the kernel networking stack

        /* update budget accounting */
        total_rx_packets++;
    } while (likely(total_rx_packets < budget));

    printk(KERN_INFO "acc rx irq exited the while loop\n");

    u64_stats_update_begin(&rx_ring->syncp);
    rx_ring->stats.packets += total_rx_packets;
    rx_ring->stats.bytes += total_rx_bytes;
    u64_stats_update_end(&rx_ring->syncp);
    q_vector->rx.total_packets += total_rx_packets;
    q_vector->rx.total_bytes += total_rx_bytes;

    if (cleaned_count)
        acc_alloc_rx_buffers(rx_ring, cleaned_count);

    printk(KERN_INFO "acc rx irq returning happily\n");

    return (total_rx_packets < budget);
}

static struct sk_buff *acc_fetch_rx_buffer(struct acc_ring *rx_ring,
                         union acc_adv_rx_desc *rx_desc)
{
    struct acc_rx_buffer *rx_buffer;
    struct sk_buff *skb;
    struct page *page;

    printk(KERN_INFO "acc Attempting to fetch rx buffer\n");

    rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
    page = rx_buffer->page; //This page is created in acc_alloc_mapped_page and linked to the dma via dma_map_page
    prefetchw(page);    ///Prefetch the page cacheline for writing

    skb = rx_buffer->skb;   ///I believe this links the skb to the dma-mapped page.

    if (likely(!skb)) {
        printk(KERN_INFO "acc attempting to allocate netdrv space for page.\n");
        void *page_addr = page_address(page) +  //get the virtual page address of this page.
                  rx_buffer->page_offset;

        /* prefetch first cache line of first page */
        prefetch(page_addr);
#if L1_CACHE_BYTES < 128
        prefetch(page_addr + L1_CACHE_BYTES);
#endif

        /* allocate a skb to store the frags */
        skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
                        ACC_RX_HDR_SIZE);
        if (unlikely(!skb)) {
            rx_ring->rx_stats.alloc_rx_buff_failed++;
            return NULL;
        }

        /*
         * we will be copying header into skb->data in
         * pskb_may_pull so it is in our interest to prefetch
         * it now to avoid a possible cache miss
         */
        prefetchw(skb->data);

        /*
         * Delay unmapping of the first packet. It carries the
         * header information, HW may still access the header
         * after the writeback.  Only unmap it when EOP is
         * reached
         */
        if (likely(acc_test_staterr(rx_desc, ACC_RXD_STAT_EOP)))
            goto dma_sync;

        ACC_CB(skb)->dma = rx_buffer->dma;
    } else {
        if (acc_test_staterr(rx_desc, ACC_RXD_STAT_EOP))
            acc_dma_sync_frag(rx_ring, skb);

dma_sync:
        /* we are reusing so sync this buffer for CPU use */
        printk(KERN_INFO "acc attempting to sync the dma and the device.\n");
        dma_sync_single_range_for_cpu(rx_ring->dev, //Sync this dma buffer, at this page offset, for CPU access after the device-to-memory transfer
                          rx_buffer->dma,
                          rx_buffer->page_offset,
                          acc_rx_bufsz(rx_ring),
                          DMA_FROM_DEVICE);
    }

    /* pull page into skb */
    if (acc_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
        //This is again temporary to try and create blockers around the problem.
        return skb;
        /* hand second half of page back to the ring */
        acc_reuse_rx_page(rx_ring, rx_buffer);
    } else if (ACC_CB(skb)->dma == rx_buffer->dma) {
        /* the page has been released from the ring */
        ACC_CB(skb)->page_released = true;
    } else {
        /* we are not reusing the buffer so unmap it */
        dma_unmap_page(rx_ring->dev, rx_buffer->dma,
                   acc_rx_pg_size(rx_ring),
                   DMA_FROM_DEVICE);
    }

    /* clear contents of buffer_info */
    rx_buffer->skb = NULL;
    rx_buffer->dma = 0;
    rx_buffer->page = NULL;

    printk(KERN_INFO "acc returning from fetch_rx_buffer.\n");

    return skb;
}

static bool acc_add_rx_frag(struct acc_ring *rx_ring,
                  struct acc_rx_buffer *rx_buffer,
                  union acc_adv_rx_desc *rx_desc,
                  struct sk_buff *skb)
{
    printk(KERN_INFO "acc Attempting to add rx_frag from page.\n");
    struct page *page = rx_buffer->page;
    unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
#if (PAGE_SIZE < 8192)
    unsigned int truesize = acc_rx_bufsz(rx_ring);
#else
    unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
    unsigned int last_offset = acc_rx_pg_size(rx_ring) -
                   acc_rx_bufsz(rx_ring);
#endif

    if ((size <= ACC_RX_HDR_SIZE) && !skb_is_nonlinear(skb)) {
        printk(KERN_INFO "acc Inside the size check.\n");
        unsigned char *va = page_address(page) + rx_buffer->page_offset;
        printk(KERN_INFO "page:%p\tpage_address:%p\tpage_offset:%d\n",page,page_address(page),rx_buffer->page_offset);
        printk(KERN_INFO "acc First 4 bytes of string:%x  %x  %x  %x\n",va[0],va[1],va[2],va[3]); //FIXME: I can now read this page table but there is still no meaningful data in it. (appear to be reading garbage)
        printk(KERN_INFO "acc 32 bytes in:%x %x %x %x\n",va[32],va[33],va[34],va[35]);
        return true;

        memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));    

        /* we can reuse buffer as-is, just make sure it is local */
        if (likely(page_to_nid(page) == numa_node_id()))
            return true;

        /* this page cannot be reused so discard it */
        put_page(page);
        return false;
    }

    skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
            rx_buffer->page_offset, size, truesize);

    /* avoid re-using remote pages */
    if (unlikely(page_to_nid(page) != numa_node_id()))
        return false;

#if (PAGE_SIZE < 8192)
    /* if we are only owner of page we can reuse it */
    if (unlikely(page_count(page) != 1))
        return false;

    /* flip page offset to other buffer */
    rx_buffer->page_offset ^= truesize;

    /*
     * since we are the only owner of the page and we need to
     * increment it, just set the value to 2 in order to avoid
     * an unnecessary locked operation
     */
    atomic_set(&page->_count, 2);
#else
    /* move offset up to the next cache line */
    rx_buffer->page_offset += truesize;

    if (rx_buffer->page_offset > last_offset)
        return false;

    /* bump ref count on page before it is given to the stack */
    get_page(page);
#endif

    return true;
}
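
For reference, acc_reuse_rx_page (called from acc_fetch_rx_buffer above, and the home of the dma_sync_single_range_for_device call quoted at the top of the post) follows the ixgbe pattern; a condensed sketch, assuming a next_to_alloc field as in ixgbe:

static void acc_reuse_rx_page(struct acc_ring *rx_ring,
                              struct acc_rx_buffer *old_buff)
{
    struct acc_rx_buffer *new_buff;
    u16 nta = rx_ring->next_to_alloc;   /* assumed field, as in ixgbe */

    new_buff = &rx_ring->rx_buffer_info[nta];

    /* update, and store, next to alloc */
    nta++;
    rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;

    /* transfer the page from the old buffer to the new buffer */
    *new_buff = *old_buff;

    /* sync the buffer for use by the device */
    dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
                                     new_buff->page_offset,
                                     acc_rx_bufsz(rx_ring),
                                     DMA_FROM_DEVICE);
}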