On Tue, May 15, 2018 at 12:06 PM, Björn Töpel <bjorn.to...@gmail.com> wrote:
> From: Björn Töpel <bjorn.to...@intel.com>
>
> A lot of things here. First we add support for the new
> XDP_SETUP_XSK_UMEM command in ndo_bpf. This allows the AF_XDP socket
> to pass a UMEM to the driver. The driver will then DMA map all the
> frames in the UMEM. Next, the Rx code will allocate frames from the
> UMEM fill queue, instead of the regular page allocator.
>
> Externally, for the rest of the XDP code, the driver-internal UMEM
> allocator will appear as a MEM_TYPE_ZERO_COPY.
>
> Keep in mind that having frames coming from userland requires some
> extra care taken when passing them to the regular kernel stack. In
> these cases the ZC frame must be copied.
>
> The commit also introduces completely new clean_rx_irq/allocator
> functions for zero-copy, and means (function pointers) to set
> allocators and clean_rx functions.
>
> Finally, a lot of things are *not* implemented here. To mention some:
>
> * No passing to the stack via XDP_PASS (clone/copy to skb).
> * No XDP redirect to other than sockets (convert_to_xdp_frame does not
>   clone the frame yet).
>
> And yes, too much C&P and too big commit. :-)
>
> Signed-off-by: Björn Töpel <bjorn.to...@intel.com>
A few minor comments below. > --- > drivers/net/ethernet/intel/i40e/i40e.h | 20 ++ > drivers/net/ethernet/intel/i40e/i40e_main.c | 202 +++++++++++++- > drivers/net/ethernet/intel/i40e/i40e_txrx.c | 400 > ++++++++++++++++++++++++++-- > drivers/net/ethernet/intel/i40e/i40e_txrx.h | 30 ++- > 4 files changed, 619 insertions(+), 33 deletions(-) > > diff --git a/drivers/net/ethernet/intel/i40e/i40e.h > b/drivers/net/ethernet/intel/i40e/i40e.h > index 7a80652e2500..e6ee6c9bf094 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e.h > +++ b/drivers/net/ethernet/intel/i40e/i40e.h > @@ -786,6 +786,12 @@ struct i40e_vsi { > > /* VSI specific handlers */ > irqreturn_t (*irq_handler)(int irq, void *data); > + > + /* AF_XDP zero-copy */ > + struct xdp_umem **xsk_umems; > + u16 num_xsk_umems_used; > + u16 num_xsk_umems; > + > } ____cacheline_internodealigned_in_smp; > > struct i40e_netdev_priv { > @@ -1090,6 +1096,20 @@ static inline bool i40e_enabled_xdp_vsi(struct > i40e_vsi *vsi) > return !!vsi->xdp_prog; > } > > +static inline struct xdp_umem *i40e_xsk_umem(struct i40e_ring *ring) > +{ > + bool xdp_on = i40e_enabled_xdp_vsi(ring->vsi); > + int qid = ring->queue_index; > + > + if (ring_is_xdp(ring)) > + qid -= ring->vsi->alloc_queue_pairs; > + > + if (!ring->vsi->xsk_umems || !ring->vsi->xsk_umems[qid] || !xdp_on) > + return NULL; > + > + return ring->vsi->xsk_umems[qid]; > +} > + > int i40e_create_queue_channel(struct i40e_vsi *vsi, struct i40e_channel *ch); > int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate); > int i40e_add_del_cloud_filter(struct i40e_vsi *vsi, > diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c > b/drivers/net/ethernet/intel/i40e/i40e_main.c > index b4c23cf3979c..dc3d668a741e 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_main.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c > @@ -5,6 +5,7 @@ > #include <linux/of_net.h> > #include <linux/pci.h> > #include <linux/bpf.h> > +#include <net/xdp_sock.h> > > /* Local includes */ > #include "i40e.h" > @@ -3054,6 +3055,9 @@ static int i40e_configure_tx_ring(struct i40e_ring > *ring) > i40e_status err = 0; > u32 qtx_ctl = 0; > > + if (ring_is_xdp(ring)) > + ring->xsk_umem = i40e_xsk_umem(ring); > + > /* some ATR related tx ring init */ > if (vsi->back->flags & I40E_FLAG_FD_ATR_ENABLED) { > ring->atr_sample_rate = vsi->back->atr_sample_rate; > @@ -3163,13 +3167,31 @@ static int i40e_configure_rx_ring(struct i40e_ring > *ring) > struct i40e_hw *hw = &vsi->back->hw; > struct i40e_hmc_obj_rxq rx_ctx; > i40e_status err = 0; > + int ret; > > bitmap_zero(ring->state, __I40E_RING_STATE_NBITS); > > /* clear the context structure first */ > memset(&rx_ctx, 0, sizeof(rx_ctx)); > > - ring->rx_buf_len = vsi->rx_buf_len; > + ring->xsk_umem = i40e_xsk_umem(ring); > + if (ring->xsk_umem) { > + ring->clean_rx_irq = i40e_clean_rx_irq_zc; > + ring->alloc_rx_buffers = i40e_alloc_rx_buffers_zc; > + ring->rx_buf_len = ring->xsk_umem->props.frame_size - > + ring->xsk_umem->frame_headroom - > + XDP_PACKET_HEADROOM; > + ring->zca.free = i40e_zca_free; > + ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, > + MEM_TYPE_ZERO_COPY, > + &ring->zca); > + if (ret) > + return ret; > + } else { > + ring->clean_rx_irq = i40e_clean_rx_irq; > + ring->alloc_rx_buffers = i40e_alloc_rx_buffers; > + ring->rx_buf_len = vsi->rx_buf_len; > + } > > rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len, > BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT)); > @@ -3225,7 +3247,7 @@ static int i40e_configure_rx_ring(struct i40e_ring > *ring) > ring->tail = hw->hw_addr + 
I40E_QRX_TAIL(pf_q); > writel(0, ring->tail); > > - i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); > + ring->alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); > > return 0; > } > @@ -12050,6 +12072,179 @@ static int i40e_queue_pair_enable(struct i40e_vsi > *vsi, int queue_pair) > return err; > } > > +static int i40e_alloc_xsk_umems(struct i40e_vsi *vsi) > +{ > + if (vsi->xsk_umems) > + return 0; > + > + vsi->num_xsk_umems_used = 0; > + vsi->num_xsk_umems = vsi->alloc_queue_pairs; > + vsi->xsk_umems = kcalloc(vsi->num_xsk_umems, sizeof(*vsi->xsk_umems), > + GFP_KERNEL); > + if (!vsi->xsk_umems) { > + vsi->num_xsk_umems = 0; > + return -ENOMEM; > + } > + > + return 0; > +} > + > +static int i40e_add_xsk_umem(struct i40e_vsi *vsi, struct xdp_umem *umem, > + u16 qid) > +{ > + int err; > + > + err = i40e_alloc_xsk_umems(vsi); > + if (err) > + return err; > + > + vsi->xsk_umems[qid] = umem; > + vsi->num_xsk_umems_used++; > + > + return 0; > +} > + > +static void i40e_remove_xsk_umem(struct i40e_vsi *vsi, u16 qid) > +{ > + vsi->xsk_umems[qid] = NULL; > + vsi->num_xsk_umems_used--; > + > + if (vsi->num_xsk_umems == 0) { > + kfree(vsi->xsk_umems); > + vsi->xsk_umems = NULL; > + vsi->num_xsk_umems = 0; > + } > +} > + > +static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem) > +{ > + struct i40e_pf *pf = vsi->back; > + struct device *dev; > + unsigned int i, j; > + dma_addr_t dma; > + > + dev = &pf->pdev->dev; > + > + for (i = 0; i < umem->props.nframes; i++) { > + dma = dma_map_single_attrs(dev, umem->frames[i].addr, > + umem->props.frame_size, > + DMA_BIDIRECTIONAL, > I40E_RX_DMA_ATTR); > + if (dma_mapping_error(dev, dma)) > + goto out_unmap; > + > + umem->frames[i].dma = dma; > + } > + > + return 0; > + > +out_unmap: > + for (j = 0; j < i; j++) { > + dma_unmap_single_attrs(dev, umem->frames[i].dma, > + umem->props.frame_size, > + DMA_BIDIRECTIONAL, > + I40E_RX_DMA_ATTR); > + umem->frames[i].dma = 0; > + } > + > + return -1; > +} > + > +static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem > *umem) > +{ > + struct i40e_pf *pf = vsi->back; > + struct device *dev; > + unsigned int i; > + > + dev = &pf->pdev->dev; > + > + for (i = 0; i < umem->props.nframes; i++) { > + dma_unmap_single_attrs(dev, umem->frames[i].dma, > + umem->props.frame_size, > + DMA_BIDIRECTIONAL, > + I40E_RX_DMA_ATTR); > + > + umem->frames[i].dma = 0; > + } > +} > + > +static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem, > + u16 qid) > +{ > + bool if_running; > + int err; > + > + if (vsi->type != I40E_VSI_MAIN) > + return -EINVAL; > + > + if (qid >= vsi->num_queue_pairs) > + return -EINVAL; > + > + if (vsi->xsk_umems && vsi->xsk_umems[qid]) > + return -EBUSY; > + > + err = i40e_xsk_umem_dma_map(vsi, umem); > + if (err) > + return err; > + > + if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi); > + > + if (if_running) { > + err = i40e_queue_pair_disable(vsi, qid); > + if (err) > + return err; > + } > + > + err = i40e_add_xsk_umem(vsi, umem, qid); > + if (err) > + return err; > + > + if (if_running) { > + err = i40e_queue_pair_enable(vsi, qid); > + if (err) > + return err; > + } > + > + return 0; > +} > + > +static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid) > +{ > + bool if_running; > + int err; > + > + if (!vsi->xsk_umems || qid >= vsi->num_xsk_umems || > + !vsi->xsk_umems[qid]) > + return -EINVAL; > + > + if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi); > + > + if (if_running) { > + err = 
i40e_queue_pair_disable(vsi, qid); > + if (err) > + return err; > + } > + > + i40e_xsk_umem_dma_unmap(vsi, vsi->xsk_umems[qid]); > + i40e_remove_xsk_umem(vsi, qid); > + > + if (if_running) { > + err = i40e_queue_pair_enable(vsi, qid); > + if (err) > + return err; > + } > + > + return 0; > +} > + > +static int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem, > + u16 qid) > +{ > + if (umem) > + return i40e_xsk_umem_enable(vsi, umem, qid); > + > + return i40e_xsk_umem_disable(vsi, qid); > +} > + > /** > * i40e_xdp - implements ndo_bpf for i40e > * @dev: netdevice > @@ -12071,6 +12266,9 @@ static int i40e_xdp(struct net_device *dev, > xdp->prog_attached = i40e_enabled_xdp_vsi(vsi); > xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; > return 0; > + case XDP_SETUP_XSK_UMEM: > + return i40e_xsk_umem_setup(vsi, xdp->xsk.umem, > + xdp->xsk.queue_id); > default: > return -EINVAL; > } > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c > b/drivers/net/ethernet/intel/i40e/i40e_txrx.c > index 5efa68de935b..f89ac524652c 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c > @@ -5,6 +5,7 @@ > #include <net/busy_poll.h> > #include <linux/bpf_trace.h> > #include <net/xdp.h> > +#include <net/xdp_sock.h> > #include "i40e.h" > #include "i40e_trace.h" > #include "i40e_prototype.h" > @@ -1373,31 +1374,35 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) > } > > /* Free all the Rx ring sk_buffs */ > - for (i = 0; i < rx_ring->count; i++) { > - struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; > + if (!rx_ring->xsk_umem) { > + for (i = 0; i < rx_ring->count; i++) { I'm not a fan of all this extra indenting. This could be much more easily handled with just a goto and a label. > + struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; > > - if (!rx_bi->page) > - continue; > - > - /* Invalidate cache lines that may have been written to by > - * device so that we avoid corrupting memory. > - */ > - dma_sync_single_range_for_cpu(rx_ring->dev, > - rx_bi->dma, > - rx_bi->page_offset, > - rx_ring->rx_buf_len, > - DMA_FROM_DEVICE); > - > - /* free resources associated with mapping */ > - dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma, > - i40e_rx_pg_size(rx_ring), > - DMA_FROM_DEVICE, > - I40E_RX_DMA_ATTR); > - > - __page_frag_cache_drain(rx_bi->page, rx_bi->pagecnt_bias); > + if (!rx_bi->page) > + continue; > > - rx_bi->page = NULL; > - rx_bi->page_offset = 0; > + /* Invalidate cache lines that may have been > + * written to by device so that we avoid > + * corrupting memory. 
> + */ > + dma_sync_single_range_for_cpu(rx_ring->dev, > + rx_bi->dma, > + rx_bi->page_offset, > + rx_ring->rx_buf_len, > + DMA_FROM_DEVICE); > + > + /* free resources associated with mapping */ > + dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma, > + i40e_rx_pg_size(rx_ring), > + DMA_FROM_DEVICE, > + I40E_RX_DMA_ATTR); > + > + __page_frag_cache_drain(rx_bi->page, > + rx_bi->pagecnt_bias); > + > + rx_bi->page = NULL; > + rx_bi->page_offset = 0; > + } > } > > bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count; > @@ -2214,8 +2219,6 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring > *rx_ring, > if (!xdp_prog) > goto xdp_out; > > - prefetchw(xdp->data_hard_start); /* xdp_frame write */ > - > act = bpf_prog_run_xdp(xdp_prog, xdp); > switch (act) { > case XDP_PASS: > @@ -2284,7 +2287,7 @@ static inline void i40e_xdp_ring_update_tail(struct > i40e_ring *xdp_ring) > * > * Returns amount of work completed > **/ > -static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) > +int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) > { > unsigned int total_rx_bytes = 0, total_rx_packets = 0; > struct sk_buff *skb = rx_ring->skb; > @@ -2426,6 +2429,349 @@ static int i40e_clean_rx_irq(struct i40e_ring > *rx_ring, int budget) > return failure ? budget : (int)total_rx_packets; > } > How much of the code below is actually reused anywhere else? I would almost be inclined to say that maybe the zero-copy path should be moved to a new file since so much of this is being duplicated from the original tx/rx code path. I can easily see this becoming confusing as to which is which when a bug gets found and needs to be fixed. > +static struct sk_buff *i40e_run_xdp_zc(struct i40e_ring *rx_ring, > + struct xdp_buff *xdp) > +{ > + int err, result = I40E_XDP_PASS; > + struct i40e_ring *xdp_ring; > + struct bpf_prog *xdp_prog; > + u32 act; > + > + rcu_read_lock(); > + xdp_prog = READ_ONCE(rx_ring->xdp_prog); > + > + act = bpf_prog_run_xdp(xdp_prog, xdp); > + switch (act) { > + case XDP_PASS: > + break; > + case XDP_TX: > + xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index]; > + result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring); > + break; > + case XDP_REDIRECT: > + err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); > + result = !err ? 
I40E_XDP_TX : I40E_XDP_CONSUMED; > + break; > + default: > + bpf_warn_invalid_xdp_action(act); > + case XDP_ABORTED: > + trace_xdp_exception(rx_ring->netdev, xdp_prog, act); > + /* fallthrough -- handle aborts by dropping packet */ > + case XDP_DROP: > + result = I40E_XDP_CONSUMED; > + break; > + } > + > + rcu_read_unlock(); > + return ERR_PTR(-result); > +} > + > +static bool i40e_alloc_frame_zc(struct i40e_ring *rx_ring, > + struct i40e_rx_buffer *bi) > +{ > + struct xdp_umem *umem = rx_ring->xsk_umem; > + void *addr = bi->addr; > + u32 *id; > + > + if (addr) { > + rx_ring->rx_stats.page_reuse_count++; > + return true; > + } > + > + id = xsk_umem_peek_id(umem); > + if (unlikely(!id)) { > + rx_ring->rx_stats.alloc_page_failed++; > + return false; > + } > + > + bi->dma = umem->frames[*id].dma + umem->frame_headroom + > + XDP_PACKET_HEADROOM; > + bi->addr = umem->frames[*id].addr + umem->frame_headroom + > + XDP_PACKET_HEADROOM; > + bi->id = *id; > + > + xsk_umem_discard_id(umem); > + return true; > +} > + > +bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count) > +{ > + u16 ntu = rx_ring->next_to_use; > + union i40e_rx_desc *rx_desc; > + struct i40e_rx_buffer *bi; > + > + rx_desc = I40E_RX_DESC(rx_ring, ntu); > + bi = &rx_ring->rx_bi[ntu]; > + > + do { > + if (!i40e_alloc_frame_zc(rx_ring, bi)) > + goto no_buffers; > + > + /* sync the buffer for use by the device */ > + dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 0, > + rx_ring->rx_buf_len, > + DMA_BIDIRECTIONAL); > + > + /* Refresh the desc even if buffer_addrs didn't change > + * because each write-back erases this info. > + */ > + rx_desc->read.pkt_addr = cpu_to_le64(bi->dma); > + > + rx_desc++; > + bi++; > + ntu++; > + if (unlikely(ntu == rx_ring->count)) { > + rx_desc = I40E_RX_DESC(rx_ring, 0); > + bi = rx_ring->rx_bi; > + ntu = 0; > + } > + > + /* clear the status bits for the next_to_use descriptor */ > + rx_desc->wb.qword1.status_error_len = 0; > + > + cleaned_count--; > + } while (cleaned_count); > + > + if (rx_ring->next_to_use != ntu) > + i40e_release_rx_desc(rx_ring, ntu); > + > + return false; > + > +no_buffers: > + if (rx_ring->next_to_use != ntu) > + i40e_release_rx_desc(rx_ring, ntu); > + > + /* make sure to come back via polling to try again after > + * allocation failure > + */ > + return true; > +} > + > +static struct i40e_rx_buffer *i40e_get_rx_buffer_zc(struct i40e_ring > *rx_ring, > + const unsigned int size) > +{ > + struct i40e_rx_buffer *rx_buffer; > + > + rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean]; > + > + /* we are reusing so sync this buffer for CPU use */ > + dma_sync_single_range_for_cpu(rx_ring->dev, > + rx_buffer->dma, 0, > + size, > + DMA_BIDIRECTIONAL); > + > + return rx_buffer; > +} > + > +static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring, > + struct i40e_rx_buffer *old_buff) > +{ > + struct i40e_rx_buffer *new_buff; > + u16 nta = rx_ring->next_to_alloc; > + > + new_buff = &rx_ring->rx_bi[nta]; > + > + /* update, and store next to alloc */ > + nta++; > + rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; > + > + /* transfer page from old buffer to new buffer */ > + new_buff->dma = old_buff->dma; > + new_buff->addr = old_buff->addr; > + new_buff->id = old_buff->id; > +} > + > +/* Called from the XDP return API in NAPI context. 
*/ > +void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle) > +{ > + struct i40e_rx_buffer *new_buff; > + struct i40e_ring *rx_ring; > + u16 nta; > + > + rx_ring = container_of(alloc, struct i40e_ring, zca); > + nta = rx_ring->next_to_alloc; > + > + new_buff = &rx_ring->rx_bi[nta]; > + > + /* update, and store next to alloc */ > + nta++; > + rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; > + > + new_buff->dma = rx_ring->xsk_umem->frames[handle].dma; > + new_buff->addr = rx_ring->xsk_umem->frames[handle].addr; > + new_buff->id = (u32)handle; > +} > + > +static struct sk_buff *i40e_zc_frame_to_skb(struct i40e_ring *rx_ring, > + struct i40e_rx_buffer *rx_buffer, > + struct xdp_buff *xdp) > +{ > + // XXX implement alloc skb and copy > + i40e_reuse_rx_buffer_zc(rx_ring, rx_buffer); > + return NULL; > +} > + > +static void i40e_clean_programming_status_zc(struct i40e_ring *rx_ring, > + union i40e_rx_desc *rx_desc, > + u64 qw) > +{ > + struct i40e_rx_buffer *rx_buffer; > + u32 ntc = rx_ring->next_to_clean; > + u8 id; > + > + /* fetch, update, and store next to clean */ > + rx_buffer = &rx_ring->rx_bi[ntc++]; > + ntc = (ntc < rx_ring->count) ? ntc : 0; > + rx_ring->next_to_clean = ntc; > + > + prefetch(I40E_RX_DESC(rx_ring, ntc)); > + > + /* place unused page back on the ring */ > + i40e_reuse_rx_buffer_zc(rx_ring, rx_buffer); > + rx_ring->rx_stats.page_reuse_count++; > + > + /* clear contents of buffer_info */ > + rx_buffer->addr = NULL; > + > + id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >> > + I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT; > + > + if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS) > + i40e_fd_handle_status(rx_ring, rx_desc, id); > +} > + > +int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) > +{ > + unsigned int total_rx_bytes = 0, total_rx_packets = 0; > + u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); > + bool failure = false, xdp_xmit = false; > + struct sk_buff *skb; > + struct xdp_buff xdp; > + > + xdp.rxq = &rx_ring->xdp_rxq; > + > + while (likely(total_rx_packets < (unsigned int)budget)) { > + struct i40e_rx_buffer *rx_buffer; > + union i40e_rx_desc *rx_desc; > + unsigned int size; > + u16 vlan_tag; > + u8 rx_ptype; > + u64 qword; > + u32 ntc; > + > + /* return some buffers to hardware, one at a time is too slow > */ > + if (cleaned_count >= I40E_RX_BUFFER_WRITE) { > + failure = failure || > + i40e_alloc_rx_buffers_zc(rx_ring, > + cleaned_count); > + cleaned_count = 0; > + } > + > + rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean); > + > + /* status_error_len will always be zero for unused descriptors > + * because it's cleared in cleanup, and overlaps with hdr_addr > + * which is always zero because packet split isn't used, if > the > + * hardware wrote DD then the length will be non-zero > + */ > + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); > + > + /* This memory barrier is needed to keep us from reading > + * any other fields out of the rx_desc until we have > + * verified the descriptor has been written back. 
> + */ > + dma_rmb(); > + > + if (unlikely(i40e_rx_is_programming_status(qword))) { > + i40e_clean_programming_status_zc(rx_ring, rx_desc, > + qword); > + cleaned_count++; > + continue; > + } > + size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> > + I40E_RXD_QW1_LENGTH_PBUF_SHIFT; > + if (!size) > + break; > + > + rx_buffer = i40e_get_rx_buffer_zc(rx_ring, size); > + > + /* retrieve a buffer from the ring */ > + xdp.data = rx_buffer->addr; > + xdp_set_data_meta_invalid(&xdp); > + xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; > + xdp.data_end = xdp.data + size; > + xdp.handle = rx_buffer->id; > + > + skb = i40e_run_xdp_zc(rx_ring, &xdp); > + > + if (IS_ERR(skb)) { > + if (PTR_ERR(skb) == -I40E_XDP_TX) > + xdp_xmit = true; > + else > + i40e_reuse_rx_buffer_zc(rx_ring, rx_buffer); > + total_rx_bytes += size; > + total_rx_packets++; > + } else { > + skb = i40e_zc_frame_to_skb(rx_ring, rx_buffer, &xdp); > + if (!skb) { > + rx_ring->rx_stats.alloc_buff_failed++; > + break; > + } > + } > + > + rx_buffer->addr = NULL; > + cleaned_count++; > + > + /* don't care about non-EOP frames in XDP mode */ > + ntc = rx_ring->next_to_clean + 1; > + ntc = (ntc < rx_ring->count) ? ntc : 0; > + rx_ring->next_to_clean = ntc; > + prefetch(I40E_RX_DESC(rx_ring, ntc)); > + > + if (i40e_cleanup_headers(rx_ring, skb, rx_desc)) { > + skb = NULL; > + continue; > + } > + > + /* probably a little skewed due to removing CRC */ > + total_rx_bytes += skb->len; > + > + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); > + rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> > + I40E_RXD_QW1_PTYPE_SHIFT; > + > + /* populate checksum, VLAN, and protocol */ > + i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); > + > + vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ? > + le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : > 0; > + > + i40e_receive_skb(rx_ring, skb, vlan_tag); > + skb = NULL; > + > + /* update budget accounting */ > + total_rx_packets++; > + } > + > + if (xdp_xmit) { > + struct i40e_ring *xdp_ring = > + rx_ring->vsi->xdp_rings[rx_ring->queue_index]; > + > + i40e_xdp_ring_update_tail(xdp_ring); > + xdp_do_flush_map(); > + } > + > + u64_stats_update_begin(&rx_ring->syncp); > + rx_ring->stats.packets += total_rx_packets; > + rx_ring->stats.bytes += total_rx_bytes; > + u64_stats_update_end(&rx_ring->syncp); > + rx_ring->q_vector->rx.total_packets += total_rx_packets; > + rx_ring->q_vector->rx.total_bytes += total_rx_bytes; > + > + /* guarantee a trip back through this routine if there was a failure > */ > + return failure ? 
budget : (int)total_rx_packets; > +} > + > static inline u32 i40e_buildreg_itr(const int type, u16 itr) > { > u32 val; > @@ -2576,7 +2922,7 @@ int i40e_napi_poll(struct napi_struct *napi, int budget) > budget_per_ring = max(budget/q_vector->num_ringpairs, 1); > > i40e_for_each_ring(ring, q_vector->rx) { > - int cleaned = i40e_clean_rx_irq(ring, budget_per_ring); > + int cleaned = ring->clean_rx_irq(ring, budget_per_ring); > > work_done += cleaned; > /* if we clean as many as budgeted, we must not be done */ > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h > b/drivers/net/ethernet/intel/i40e/i40e_txrx.h > index fdd2c55f03a6..9d5d9862e9f1 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h > @@ -296,13 +296,22 @@ struct i40e_tx_buffer { > > struct i40e_rx_buffer { > dma_addr_t dma; > - struct page *page; > + union { > + struct { > + struct page *page; > #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) > - __u32 page_offset; > + __u32 page_offset; > #else > - __u16 page_offset; > + __u16 page_offset; > #endif > - __u16 pagecnt_bias; > + __u16 pagecnt_bias; > + }; > + struct { > + /* for umem */ > + void *addr; > + u32 id; > + }; > + }; > }; > > struct i40e_queue_stats { > @@ -344,6 +353,8 @@ enum i40e_ring_state_t { > #define I40E_RX_SPLIT_TCP_UDP 0x4 > #define I40E_RX_SPLIT_SCTP 0x8 > > +void i40e_zc_recycle(struct zero_copy_allocator *alloc, unsigned long > handle); > + > /* struct that defines a descriptor ring, associated with a VSI */ > struct i40e_ring { > struct i40e_ring *next; /* pointer to next ring in q_vector */ > @@ -414,6 +425,12 @@ struct i40e_ring { > > struct i40e_channel *ch; > struct xdp_rxq_info xdp_rxq; > + > + int (*clean_rx_irq)(struct i40e_ring *, int); > + bool (*alloc_rx_buffers)(struct i40e_ring *, u16); > + struct xdp_umem *xsk_umem; > + > + struct zero_copy_allocator zca; /* ZC allocator anchor */ > } ____cacheline_internodealigned_in_smp; > > static inline bool ring_uses_build_skb(struct i40e_ring *ring) > @@ -474,6 +491,7 @@ static inline unsigned int i40e_rx_pg_order(struct > i40e_ring *ring) > #define i40e_rx_pg_size(_ring) (PAGE_SIZE << i40e_rx_pg_order(_ring)) > > bool i40e_alloc_rx_buffers(struct i40e_ring *rxr, u16 cleaned_count); > +bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count); > netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device > *netdev); > void i40e_clean_tx_ring(struct i40e_ring *tx_ring); > void i40e_clean_rx_ring(struct i40e_ring *rx_ring); > @@ -489,6 +507,9 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int > size); > bool __i40e_chk_linearize(struct sk_buff *skb); > int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf); > void i40e_xdp_flush(struct net_device *dev); > +int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget); > +int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget); > +void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle); > > /** > * i40e_get_head - Retrieve head from head writeback > @@ -575,4 +596,5 @@ static inline struct netdev_queue *txring_txq(const > struct i40e_ring *ring) > { > return netdev_get_tx_queue(ring->netdev, ring->queue_index); > } > + > #endif /* _I40E_TXRX_H_ */ > -- > 2.14.1 >
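
A couple of follow-ups on my comments above.

To illustrate the indentation comment on i40e_clean_rx_ring(): a goto keeps
the existing per-buffer loop body at its current indentation level instead of
wrapping it in another if. Untested sketch, label name is made up:

	if (rx_ring->xsk_umem)
		goto skip_free;

	/* Free all the Rx ring sk_buffs */
	for (i = 0; i < rx_ring->count; i++) {
		struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];

		if (!rx_bi->page)
			continue;

		/* existing unmap/free body stays exactly as it is today */
		...
	}

skip_free:
	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;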
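
On the duplication point: if the zero-copy path moved to its own file (say
i40e_xsk.c/i40e_xsk.h -- names are just my suggestion, nothing that exists
today), the only things i40e_main.c and i40e_txrx.c would need to see are
roughly the declarations below, plus an i40e_xsk.o entry in the Makefile.
That would also mean making i40e_xsk_umem_setup() non-static, but it keeps
the copy-pasted Rx routines clearly separated from the originals when a bug
needs to be chased.

	/* hypothetical i40e_xsk.h */
	#ifndef _I40E_XSK_H_
	#define _I40E_XSK_H_

	#include <linux/types.h>

	struct i40e_vsi;
	struct i40e_ring;
	struct xdp_umem;
	struct zero_copy_allocator;

	int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
				u16 qid);
	bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring,
				      u16 cleaned_count);
	int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
	void i40e_zca_free(struct zero_copy_allocator *alloc,
			   unsigned long handle);

	#endif /* _I40E_XSK_H_ */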
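
One more nit I spotted while reading i40e_xsk_umem_dma_map(): the unwind loop
iterates over j but indexes the frames array with i, so it repeatedly unmaps
the (never successfully mapped) failing frame instead of frames 0..i-1.
Presumably this is what was intended:

out_unmap:
	for (j = 0; j < i; j++) {
		dma_unmap_single_attrs(dev, umem->frames[j].dma,
				       umem->props.frame_size,
				       DMA_BIDIRECTIONAL,
				       I40E_RX_DMA_ATTR);
		umem->frames[j].dma = 0;
	}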