On Wed, 17 Apr 2019 at 21:58, Jonathan Lemon <jonathan.le...@gmail.com> wrote:
>
> When the XDP program attached to a zero-copy AF_XDP socket returns XDP_TX,
> queue the umem frame on the XDP TX ring, and arrange for it to be released
> via the ZCA free routine, which should place it back onto the reuseq.
>
There are a bunch of compiler errors, but I'll leave them out of the
comments!

> Signed-off-by: Jonathan Lemon <jonathan.le...@gmail.com>
> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>  drivers/net/ethernet/intel/i40e/i40e_xsk.c  | 52 +++++++++++++++++++--
>  include/net/xdp.h                           | 20 ++++++--
>  3 files changed, 65 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index 100e92d2982f..3e7954277737 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -274,6 +274,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
>  #define I40E_TX_FLAGS_TSYN              BIT(8)
>  #define I40E_TX_FLAGS_FD_SB             BIT(9)
>  #define I40E_TX_FLAGS_UDP_TUNNEL        BIT(10)
> +#define I40E_TX_FLAGS_ZC_FRAME          BIT(11)
>  #define I40E_TX_FLAGS_VLAN_MASK         0xffff0000
>  #define I40E_TX_FLAGS_VLAN_PRIO_MASK    0xe0000000
>  #define I40E_TX_FLAGS_VLAN_PRIO_SHIFT   29
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> index d2e212d007c3..16a31c57906a 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> @@ -188,7 +188,6 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
>  static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
>  {
>          int err, result = I40E_XDP_PASS;
> -        struct i40e_ring *xdp_ring;
>          struct bpf_prog *xdp_prog;
>          u32 act;
>
> @@ -202,9 +201,8 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
>          switch (act) {
>          case XDP_PASS:
>                  break;
> -        case XDP_TX:
> -                xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> -                result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
> +        case XDP_TX:
> +                result = i40e_xmit_rcvd_zc(rx_ring, xdp);
>                  break;
>          case XDP_REDIRECT:
>                  err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
> @@ -623,6 +621,48 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
>          return failure ? budget : (int)total_rx_packets;
>  }
>
> +static int i40e_xmit_rcvd_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
> +{
> +        struct i40e_ring *xdp_ring;
> +        struct i40e_tx_desc *tx_desc;
> +        struct i40e_tx_buffer *tx_bi;
> +        struct xdp_frame *xdpf;
> +        dma_addr_t dma;
> +
> +        xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> +
> +        if (unlikely(!I40E_DESC_UNUSED(xdp_ring))) {
> +                xdp_ring->tx_stats.tx_busy++;
> +                return I40E_XDP_CONSUMED;
> +        }
> +        xdpf = convert_to_xdp_frame_keep_zc(xdp);
> +        if (unlikely(!xdpf))
> +                return I40E_XDP_CONSUMED;
> +        xdpf->handle = xdp->handle;
> +
> +        dma = xdp_umem_get_dma(rx_ring->xsk_umem, xdp->handle);
> +        tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
> +        tx_bi->bytecount = xdpf->len;
> +        tx_bi->gso_segs = 1;
> +        tx_bi->xdpf = xdpf;
> +        tx_bi->tx_flags = I40E_TX_FLAGS_ZC_FRAME;
> +
> +        tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
> +        tx_desc->buffer_addr = cpu_to_le64(dma);
> +        tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
> +                                                  I40E_TX_DESC_CMD_EOP,
> +                                                  0, xdpf->len, 0);
> +        smp_wmb();
> +
> +        xdp_ring->next_to_use++;
> +        if (xdp_ring->next_to_use == xdp_ring->count)
> +                xdp_ring->next_to_use = 0;
> +
> +        tx_bi->next_to_watch = tx_desc;
> +
> +        return I40E_XDP_TX;
> +}

What you're basically doing here is an AF_XDP Tx, but triggered from the
Rx path, and instead of the frame being completed (after the packet is
sent) to the completion ring, it's recycled to the Rx HW ring (via
zca_free).
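(For context: the recycle path here is the zca_free callback. In i40e it
looks roughly like the below -- simplified and from memory, so treat it
as a sketch rather than the exact driver code:)

static void i40e_zca_free(struct zero_copy_allocator *alloc,
                          unsigned long handle)
{
        struct i40e_ring *rx_ring = container_of(alloc, struct i40e_ring,
                                                 zca);
        struct i40e_rx_buffer *bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
        u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;

        if (++rx_ring->next_to_alloc == rx_ring->count)
                rx_ring->next_to_alloc = 0;

        /* Mask out the offset bits, keeping the chunk base address. */
        handle &= rx_ring->xsk_umem->chunk_mask;

        bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle) + hr;
        bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle) + hr;
        bi->handle = (u64)handle + rx_ring->xsk_umem->headroom;
}

Note that it unconditionally claims the rx_bi slot at next_to_alloc;
there is no "ring full" fallback. That's the crux of the issue below.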
I like the idea, but we need more plumbing first. Let me expand:

Unfortunately, the current recycle mechanism requires that there is
space in the Rx ring at the point of recycling. In the XDP_TX case,
there's no completion ring, and we cannot guarantee that there's space
on the Rx ring (since Rx and Tx are asynchronous). IOW, Rx recycling can
currently *only* be done in an Rx context.

What I would like to do is move i40e-xsk to Jesper's page_pool, instead
of the existing recycle mechanism. Then we could return the descriptor
to the pool if the Rx ring doesn't have space for the completed/sent
buffer. (A rough sketch of what I mean is at the very bottom of this
mail, below the quoted patch.)

TL;DR version: Passing zc-frames in XDP_TX cannot be done properly until
the Rx recycle mechanism is more robust. :-(

(I think Maciej is looking into using the page_pool in the ice driver.)

But again, I like the idea!

Thanks,
Björn

> +
>  /**
>   * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
>   * @xdp_ring: XDP Tx ring
> @@ -689,6 +729,10 @@ static void i40e_clean_xdp_tx_buffer(struct i40e_ring *tx_ring,
>                                       struct i40e_tx_buffer *tx_bi)
>  {
>          xdp_return_frame(tx_bi->xdpf);
> +        if (tx_bi->tx_flags & I40E_TX_FLAGS_ZC_FRAME) {
> +                tx_bi->tx_flags = 0;
> +                return;
> +        }
>          dma_unmap_single(tx_ring->dev,
>                           dma_unmap_addr(tx_bi, dma),
>                           dma_unmap_len(tx_bi, len), DMA_TO_DEVICE);
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 0f25b3675c5c..191359c5ebdd 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -82,6 +82,7 @@ struct xdp_frame {
>           */
>          struct xdp_mem_info mem;
>          struct net_device *dev_rx; /* used by cpumap */
> +        unsigned long handle;
>  };
>
>  /* Clear kernel pointers in xdp_frame */
> @@ -95,15 +96,12 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
>
>  /* Convert xdp_buff to xdp_frame */
>  static inline
> -struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> +struct xdp_frame *__convert_to_xdp_frame(struct xdp_buff *xdp)
>  {
>          struct xdp_frame *xdp_frame;
>          int metasize;
>          int headroom;
>
> -        if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
> -                return xdp_convert_zc_to_xdp_frame(xdp);
> -
>          /* Assure headroom is available for storing info */
>          headroom = xdp->data - xdp->data_hard_start;
>          metasize = xdp->data - xdp->data_meta;
> @@ -125,6 +123,20 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
>          return xdp_frame;
>  }
>
> +static inline
> +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> +{
> +        if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
> +                return xdp_convert_zc_to_xdp_frame(xdp);
> +        return __convert_to_xdp_frame(xdp);
> +}
> +
> +static inline
> +struct xdp_frame *convert_to_xdp_frame_keep_zc(struct xdp_buff *xdp)
> +{
> +        return __convert_to_xdp_frame(xdp);
> +}
> +
>  void xdp_return_frame(struct xdp_frame *xdpf);
>  void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
>  void xdp_return_buff(struct xdp_buff *xdp);
> --
> 2.17.1
>
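P.S. To make the page_pool direction above a bit more concrete, the
Tx-completion path could then do something along these lines. All of the
helpers (i40e_rx_ring_has_room(), i40e_reuse_zc_handle(),
i40e_pool_put_zc_handle()) and the chunk_pool member are made up for
illustration -- none of this exists today:

/* Hypothetical completion-side recycling once the umem chunks are
 * backed by a pool. Only the control flow matters here.
 */
static void i40e_recycle_zc_frame(struct i40e_ring *rx_ring,
                                  unsigned long handle)
{
        if (i40e_rx_ring_has_room(rx_ring)) {
                /* Fast path, same as today's zca_free: hand the chunk
                 * straight back to the HW Rx ring.
                 */
                i40e_reuse_zc_handle(rx_ring, handle);
                return;
        }

        /* Rx ring is full (Tx completion runs asynchronously to Rx),
         * so park the chunk in the pool; the Rx allocation path can
         * pick it up later instead of grabbing a fresh chunk.
         */
        i40e_pool_put_zc_handle(rx_ring->chunk_pool, handle);
}

With something like that in place, XDP_TX completions would no longer
depend on there being room in the Rx ring at exactly the right moment.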