I've been working on modifying the Intel ixgbe kernel driver to work with my PCIe device (an FPGA, but that's not terribly important). The kernel and the PCIe device negotiate well enough, configuration headers are exchanged, and the link appears to be functioning. However, in trying to write the DMA_FROM_DEVICE path I've hit a small problem that I don't understand, and I'm hoping for some help.
rx_ring->desc = dma_alloc_coherent(dev,   ///Allocates a coherent DMA region of rx_ring->size bytes on device dev with GFP_KERNEL
                    rx_ring->size,
                    &rx_ring->dma,        ///This dma handle can be cast to an unsigned integer of the bus width and handed to dev as the DMA base address
                    GFP_KERNEL);
page = dev_alloc_pages(0);
dma = dma_map_page(rx_ring->dev, page, 0, acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
//Write to the PCI device the base address it should place data into.
writel(q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->hw_region2.hw_addr+0x08+ACC_PCI_IPCONT_DATA_OFFSET);
writel(q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x0C+ACC_PCI_IPCONT_DATA_OFFSET);
//This will perfectly read data I place onto the PCIe bus.
rx_ring->desc->wb.upper.length
//This seems to read some garbage memory.
dma_sync_single_range_for_cpu(rx_ring->dev,
rx_buffer->dma,
rx_buffer->page_offset,
acc_rx_bufsz(rx_ring),
DMA_FROM_DEVICE);
unsigned char *va = page_address(page) + rx_buffer->page_offset;
memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
//Some code later
dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
new_buff->page_offset,
acc_rx_bufsz(rx_ring),
DMA_FROM_DEVICE);
I've tried to pare the code down to the parts of interest, but here's a quick overview. I allocate DMA space, getting both a virtual and a bus address, via dma_alloc_coherent. I allocate a page of memory for DMA and map that page for DMA with dev_alloc_pages and dma_map_page. I hand the DMA bus address to my PCIe device so it can write to the proper offset, using writel (I know about iowrite32, but this is on redhat).
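Since 64-bit MMIO has been a portability headache here, the paired writel calls above boil down to this small helper (a sketch under my own naming; upper_32_bits()/lower_32_bits() are standard kernel helpers, and the high/low register layout is my device's, not ixgbe's):

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/types.h>

/* Sketch: split a 64-bit bus address across two 32-bit BAR registers,
 * high word first, mirroring the writel pairs used in this post. */
static inline void acc_write_dma_addr(void __iomem *reg_hi,
				      void __iomem *reg_lo,
				      dma_addr_t addr)
{
	writel(upper_32_bits(addr), reg_hi);
	writel(lower_32_bits(addr), reg_lo);
}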
From there, the original ixgbe driver reads data off the PCIe bus in two ways. First, it reads the coherent DMA virtual address (desc) directly, but that is only used for configuration information (in the driver I'm working from). Second, it uses page_address(page) to get the virtual address of the mapped memory page. The problem is that there is nothing but garbage memory there.
So here is my confusion. Where does the page point, and how do I get data into the page across the PCI bus? I had assumed dma_map_page would unite the two virtual addresses into one, so that my device's write to the DMA bus address would land in the page, but that doesn't seem to be the case. What base address should my PCI device write to in order to land in this memory page?
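For reference, here is the streaming-DMA lifecycle as I understand it from Documentation/DMA-API-HOWTO.txt, boiled down to a minimal sketch (acc_program_rx_addr is a hypothetical stand-in for my register writes, not a real kernel API):

#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/* Hypothetical stand-in for the writel pairs that program my FPGA's
 * RX base-address registers; not a real kernel API. */
static void acc_program_rx_addr(dma_addr_t dma)
{
}

static int acc_rx_one_page(struct device *dev, struct page *page)
{
	void *va;
	/* Map the page; 'dma' is the bus address the DEVICE writes to. */
	dma_addr_t dma = dma_map_page(dev, page, 0, PAGE_SIZE,
				      DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, dma))
		return -ENOMEM;

	acc_program_rx_addr(dma);  /* device DMA-writes into 'dma' */
	/* ... wait for the device's MSI to signal completion ... */

	/* Give the CPU ownership before reading what the device wrote. */
	dma_sync_single_for_cpu(dev, dma, PAGE_SIZE, DMA_FROM_DEVICE);
	va = page_address(page);   /* CPU-side view of the same memory */
	/* ... consume the data at 'va' ... */

	/* Hand the buffer back to the device for the next transfer. */
	dma_sync_single_for_device(dev, dma, PAGE_SIZE, DMA_FROM_DEVICE);
	return 0;
}

If that model is right, then the device-side target is exactly the dma_addr_t that dma_map_page returned (bi->dma in the code below), which is what the descriptor write rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset) seems to assume.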
I'm working on redhat, specifically a CentOS 3.10.0 kernel, which creates some problems since the redhat kernel differs significantly from the base kernel, but hopefully someone can help. Thanks for any pointers.
EDIT: Added the dma_sync calls that I forgot to include in the original post.
EDIT2: Added a more complete code base. As a note, I'm still not including some struct definitions or the top-level calls (e.g. probe), but hopefully this is much more complete. Sorry for the length.
//These functions are called during configuration
int acc_setup_rx_resources(struct acc_ring *rx_ring)
{
struct device *dev = rx_ring->dev;
int orig_node = dev_to_node(dev);
int numa_node = -1;
int size;
size = sizeof(struct acc_rx_buffer) * rx_ring->count;
if (rx_ring->q_vector)
numa_node = rx_ring->q_vector->numa_node;
rx_ring->rx_buffer_info = vzalloc_node(size, numa_node);
if (!rx_ring->rx_buffer_info)
rx_ring->rx_buffer_info = vzalloc(size);
if (!rx_ring->rx_buffer_info)
goto err;
/* Round up to nearest 4K */
rx_ring->size = rx_ring->count * sizeof(union acc_adv_rx_desc);
rx_ring->size = ALIGN(rx_ring->size, 4096);
set_dev_node(dev, numa_node);
rx_ring->desc = dma_alloc_coherent(dev,
rx_ring->size,
&rx_ring->dma,
GFP_KERNEL);
set_dev_node(dev, orig_node);
if (!rx_ring->desc)
rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
&rx_ring->dma, GFP_KERNEL);
if (!rx_ring->desc)
goto err;
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
return 0;
err:
vfree(rx_ring->rx_buffer_info);
rx_ring->rx_buffer_info = NULL;
dev_err(dev, "Unable to allocate memory for the Rx descriptor ring\n");
return -ENOMEM;
}
static bool acc_alloc_mapped_page(struct acc_ring *rx_ring,
struct acc_rx_buffer *bi)
{
struct page *page = bi->page;
dma_addr_t dma = bi->dma;
if (likely(page))
return true;
page = dev_alloc_pages(0);
if(unlikely(!page)){
rx_ring->rx_stats.alloc_rx_page_failed++;
return false;
}
/* map page for use */
dma = dma_map_page(rx_ring->dev, page, 0,
acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
if (dma_mapping_error(rx_ring->dev, dma)) {
__free_pages(page, acc_rx_pg_order(rx_ring));
bi->page = NULL;
rx_ring->rx_stats.alloc_rx_page_failed++;
return false;
}
bi->dma = dma;
bi->page = page;
bi->page_offset = 0;
	page_ref_add(page, USHRT_MAX - 1); //This seems to exist in the redhat kernel but not the 3.10 base kernel... keep? (see the compat sketch after this function)
return true;
}
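On the page_ref_add() question in the comment above: redhat backports make plain version checks unreliable, but the best I have so far is a compat helper along these lines (untested sketch; page->_count is the pre-4.6 name of the field that later became _refcount):

#include <linux/mm.h>
#include <linux/version.h>

/* Untested compat sketch for page_ref_add(), which upstream gained in
 * 4.6: on older kernels fall back to bumping the raw _count field, the
 * way pre-4.6 ixgbe used atomic_add(). A redhat backport may defeat
 * the version check, so treat this as a starting point only. */
static inline void acc_page_ref_add_compat(struct page *page, int nr)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
	page_ref_add(page, nr);
#else
	atomic_add(nr, &page->_count);
#endif
}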
void acc_alloc_rx_buffers(struct acc_ring *rx_ring, u16 cleaned_count)
{
union acc_adv_rx_desc *rx_desc;
struct acc_rx_buffer *bi;
u16 i = rx_ring->next_to_use;
printk(KERN_INFO "acc Attempting to allocate rx buffers\n");
/* nothing to do */
if (!cleaned_count)
return;
rx_desc = ACC_RX_DESC(rx_ring, i);
bi = &rx_ring->rx_buffer_info[i];
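	/* i becomes a negative offset from the end of the ring here, so the wrap check inside the loop below is just "!i" (same trick as stock ixgbe) */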
i -= rx_ring->count;
do {
if (!acc_alloc_mapped_page(rx_ring, bi)){
printk(KERN_INFO "acc Failed to allocate and map the page to dma\n");
break;
}
printk(KERN_INFO "acc happily allocated and mapped page to dma\n");
/*
* Refresh the desc even if buffer_addrs didn't change
* because each write-back erases this info.
*/
rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
rx_desc++;
bi++; ///Move to the next buffer
i++;
if (unlikely(!i)) {
rx_desc = ACC_RX_DESC(rx_ring, 0);
bi = rx_ring->rx_buffer_info;
i -= rx_ring->count;
}
/* clear the hdr_addr for the next_to_use descriptor */
rx_desc->read.hdr_addr = 0;
cleaned_count--;
} while (cleaned_count);
i += rx_ring->count;
if (rx_ring->next_to_use != i)
acc_release_rx_desc(rx_ring, i);
}
//This function is called via a napi_schedule command which fires when an MSI interrupt is thrown from my PCIe device (all works fine).
int acc_poll(struct napi_struct *napi, int budget)
{
struct acc_q_vector *q_vector =
container_of(napi, struct acc_q_vector, napi);
struct acc_adapter *adapter = q_vector->adapter;
struct acc_ring *ring;
int per_ring_budget;
bool clean_complete = true;
e_dev_info("Landed in acc_poll\n");
e_dev_info("Attempting to read register space 0x00=%x\t0x04=%x\n", \
readl(q_vector->adapter->hw.hw_addr), readl(q_vector->adapter->hw.hw_addr+0x04));
e_dev_info("Attempting to write to pci ctl\n");
e_dev_info("Target address %.8x%.8x\n",q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF);
e_dev_info("Attempted page address %.8x%.8x\n",virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) >> 32, virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) & 0xFFFFFFFF);
writeq(0x0000000000000001, q_vector->adapter->hw_region2.hw_addr+ACC_PCI_IPCONT_DATA_OFFSET); //These are supposed to be iowrite64 but it seems iowrite64 is different in redhat and only supports the copy function (to,from,size). yay redhat think different.
writel(q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->hw_region2.hw_addr+0x08+ACC_PCI_IPCONT_DATA_OFFSET);
writel(q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x0C+ACC_PCI_IPCONT_DATA_OFFSET);
writel(virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) >> 32, q_vector->adapter->hw_region2.hw_addr+0x10+ACC_PCI_IPCONT_DATA_OFFSET);
writel(virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x14+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0xFF00000000000000, q_vector->adapter->hw_region2.hw_addr+0x18+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0x0000000CC0000000, q_vector->adapter->hw_region2.hw_addr+0x20+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0x0000000CC0000000, q_vector->adapter->hw_region2.hw_addr+0x28+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0x0003344000005500, q_vector->adapter->hw_region2.hw_addr+0x30+ACC_PCI_IPCONT_DATA_OFFSET);
//Send the start command to the block
writeq(0x0000000000000001, q_vector->adapter->hw_region2.hw_addr);
acc_for_each_ring(ring, q_vector->tx)
clean_complete &= !!acc_clean_tx_irq(q_vector, ring);
if (q_vector->rx.count > 1)
per_ring_budget = max(budget/q_vector->rx.count, 1);
else
per_ring_budget = budget;
acc_for_each_ring(ring, q_vector->rx){
e_dev_info("Calling clean_rx_irq\n");
clean_complete &= acc_clean_rx_irq(q_vector, ring,
per_ring_budget);
}
/* If all work not completed, return budget and keep polling */
if (!clean_complete)
return budget;
e_dev_info("Clean complete\n");
/* all work done, exit the polling mode */
napi_complete(napi);
if (adapter->rx_itr_setting & 1)
acc_set_itr(q_vector);
if (!test_bit(__ACC_DOWN, &adapter->state))
acc_irq_enable_queues(adapter, ((u64)1 << q_vector->v_idx));
e_dev_info("Exiting acc_poll\n");
return 0;
}
static bool acc_clean_rx_irq(struct acc_q_vector *q_vector,
struct acc_ring *rx_ring,
const int budget)
{
printk(KERN_INFO "acc Entered clean_rx_irq\n");
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = acc_desc_unused(rx_ring); /// First pass this is count-1 because ntc and ntu are 0 so this is 512-1=511
printk(KERN_INFO "acc RX irq Clean count = %d\n", cleaned_count);
do {
union acc_adv_rx_desc *rx_desc;
struct sk_buff *skb;
/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count >= ACC_RX_BUFFER_WRITE) { //When the clean count is >16 allocate some more buffers to get the clean count down. First pass this happens.
acc_alloc_rx_buffers(rx_ring, cleaned_count);
cleaned_count = 0;
}
rx_desc = ACC_RX_DESC(rx_ring, rx_ring->next_to_clean);
printk(KERN_INFO "acc inside RX do while, acquired description\n");
printk(KERN_INFO "acc Everything I can about the rx_ring desc (acc_rx_buffer). status_error=%d\t \
length=%d\n", rx_desc->wb.upper.status_error, rx_desc->wb.upper.length);
if (!acc_test_staterr(rx_desc, ACC_RXD_STAT_DD))
break;
printk(KERN_INFO "acc inside RX past status_error check\n");
/*
* This memory barrier is needed to keep us from reading
* any other fields out of the rx_desc until we know the
* RXD_STAT_DD bit is set
*/
rmb();
/* retrieve a buffer from the ring */
skb = acc_fetch_rx_buffer(rx_ring, rx_desc);
/* exit if we failed to retrieve a buffer */
if (!skb)
break;
printk(KERN_INFO "acc successfully retrieved a buffer\n");
cleaned_count++;
/* place incomplete frames back on ring for completion */
if (acc_is_non_eop(rx_ring, rx_desc, skb))
continue;
/* verify the packet layout is correct */
if (acc_cleanup_headers(rx_ring, rx_desc, skb))
continue;
/* probably a little skewed due to removing CRC */
total_rx_bytes += skb->len;
/* populate checksum, timestamp, VLAN, and protocol */
acc_process_skb_fields(rx_ring, rx_desc, skb);
acc_rx_skb(q_vector, skb); ///I believe this sends data to the kernel network stuff and then the generic OS
/* update budget accounting */
total_rx_packets++;
} while (likely(total_rx_packets < budget));
printk(KERN_INFO "acc rx irq exited the while loop\n");
u64_stats_update_begin(&rx_ring->syncp);
rx_ring->stats.packets += total_rx_packets;
rx_ring->stats.bytes += total_rx_bytes;
u64_stats_update_end(&rx_ring->syncp);
q_vector->rx.total_packets += total_rx_packets;
q_vector->rx.total_bytes += total_rx_bytes;
if (cleaned_count)
acc_alloc_rx_buffers(rx_ring, cleaned_count);
printk(KERN_INFO "acc rx irq returning happily\n");
return (total_rx_packets < budget);
}
static struct sk_buff *acc_fetch_rx_buffer(struct acc_ring *rx_ring,
union acc_adv_rx_desc *rx_desc)
{
struct acc_rx_buffer *rx_buffer;
struct sk_buff *skb;
struct page *page;
printk(KERN_INFO "acc Attempting to fetch rx buffer\n");
rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
	page = rx_buffer->page; //This page was allocated in acc_alloc_mapped_page (dev_alloc_pages) and tied to the dma handle there via dma_map_page
prefetchw(page); ///Prefetch the page cacheline for writing
skb = rx_buffer->skb; ///This does the mapping between skb and dma page table I believe.
if (likely(!skb)) {
printk(KERN_INFO "acc attempting to allocate netdrv space for page.\n");
void *page_addr = page_address(page) + //get the virtual page address of this page.
rx_buffer->page_offset;
/* prefetch first cache line of first page */
prefetch(page_addr);
#if L1_CACHE_BYTES < 128
prefetch(page_addr + L1_CACHE_BYTES);
#endif
/* allocate a skb to store the frags */
skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
ACC_RX_HDR_SIZE);
if (unlikely(!skb)) {
rx_ring->rx_stats.alloc_rx_buff_failed++;
return NULL;
}
/*
* we will be copying header into skb->data in
* pskb_may_pull so it is in our interest to prefetch
* it now to avoid a possible cache miss
*/
prefetchw(skb->data);
/*
* Delay unmapping of the first packet. It carries the
* header information, HW may still access the header
* after the writeback. Only unmap it when EOP is
* reached
*/
		if (likely(acc_test_staterr(rx_desc, ACC_RXD_STAT_EOP)))
goto dma_sync;
ACC_CB(skb)->dma = rx_buffer->dma;
} else {
if (acc_test_staterr(rx_desc, ACC_RXD_STAT_EOP))
acc_dma_sync_frag(rx_ring, skb);
dma_sync:
/* we are reusing so sync this buffer for CPU use */
printk(KERN_INFO "acc attempting to sync the dma and the device.\n");
		dma_sync_single_range_for_cpu(rx_ring->dev, //Give the CPU ownership of this buffer range (dma handle + page offset, one buffer's worth) before reading what the device wrote
rx_buffer->dma,
rx_buffer->page_offset,
acc_rx_bufsz(rx_ring),
DMA_FROM_DEVICE);
}
/* pull page into skb */
if (acc_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
//This is again temporary to try and create blockers around the problem.
return skb;
/* hand second half of page back to the ring */
acc_reuse_rx_page(rx_ring, rx_buffer);
} else if (ACC_CB(skb)->dma == rx_buffer->dma) {
/* the page has been released from the ring */
ACC_CB(skb)->page_released = true;
} else {
/* we are not reusing the buffer so unmap it */
dma_unmap_page(rx_ring->dev, rx_buffer->dma,
acc_rx_pg_size(rx_ring),
DMA_FROM_DEVICE);
}
/* clear contents of buffer_info */
rx_buffer->skb = NULL;
rx_buffer->dma = 0;
rx_buffer->page = NULL;
printk(KERN_INFO "acc returning from fetch_rx_buffer.\n");
return skb;
}
static bool acc_add_rx_frag(struct acc_ring *rx_ring,
struct acc_rx_buffer *rx_buffer,
union acc_adv_rx_desc *rx_desc,
struct sk_buff *skb)
{
printk(KERN_INFO "acc Attempting to add rx_frag from page.\n");
struct page *page = rx_buffer->page;
unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
#if (PAGE_SIZE < 8192)
unsigned int truesize = acc_rx_bufsz(rx_ring);
#else
unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
unsigned int last_offset = acc_rx_pg_size(rx_ring) -
acc_rx_bufsz(rx_ring);
#endif
if ((size <= ACC_RX_HDR_SIZE) && !skb_is_nonlinear(skb)) {
printk(KERN_INFO "acc Inside the size check.\n");
unsigned char *va = page_address(page) + rx_buffer->page_offset;
printk(KERN_INFO "page:%p\tpage_address:%p\tpage_offset:%d\n",page,page_address(page),rx_buffer->page_offset);
printk(KERN_INFO "acc First 4 bytes of string:%x %x %x %x\n",va[0],va[1],va[2],va[3]); //FIXME: I can now read this page table but there is still no meaningful data in it. (appear to be reading garbage)
printk(KERN_INFO "acc 32 bytes in:%x %x %x %x\n",va[32],va[33],va[34],va[35]);
return true;
memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
/* we can reuse buffer as-is, just make sure it is local */
if (likely(page_to_nid(page) == numa_node_id()))
return true;
/* this page cannot be reused so discard it */
put_page(page);
return false;
}
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
rx_buffer->page_offset, size, truesize);
/* avoid re-using remote pages */
if (unlikely(page_to_nid(page) != numa_node_id()))
return false;
#if (PAGE_SIZE < 8192)
/* if we are only owner of page we can reuse it */
if (unlikely(page_count(page) != 1))
return false;
/* flip page offset to other buffer */
rx_buffer->page_offset ^= truesize;
/*
* since we are the only owner of the page and we need to
* increment it, just set the value to 2 in order to avoid
	 * an unnecessary locked operation
*/
atomic_set(&page->_count, 2);
#else
/* move offset up to the next cache line */
rx_buffer->page_offset += truesize;
if (rx_buffer->page_offset > last_offset)
return false;
/* bump ref count on page before it is given to the stack */
get_page(page);
#endif
return true;
}