> -----Original Message-----
> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Pei, Yulong
> Sent: Saturday, April 1, 2017 3:02 AM
> To: Vladyslav Buslov <vladyslav.bus...@harmonicinc.com>; Zhang, Helin
> <helin.zh...@intel.com>; Wu, Jingjing <jingjing...@intel.com>;
> Yigit, Ferruh <ferruh.yi...@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] net/i40e: add packet prefetch
>
> Hi All,
>
> In Non-vector mode, without this patch, single core performance can reach
> 37.576Mpps with 64Byte packets, but after applying this patch, single core
> performance drops to 34.343Mpps with 64Byte packets.
>
> Best Regards
> Yulong Pei
>
> -----Original Message-----
> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Vladyslav Buslov
> Sent: Wednesday, March 1, 2017 6:57 PM
> To: Zhang, Helin <helin.zh...@intel.com>; Wu, Jingjing
> <jingjing...@intel.com>; Yigit, Ferruh <ferruh.yi...@intel.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH] net/i40e: add packet prefetch
>
> Prefetch both cache lines of mbuf and first cache line of payload if
> CONFIG_RTE_PMD_PACKET_PREFETCH is set.
>
> Signed-off-by: Vladyslav Buslov <vladyslav.bus...@harmonicinc.com>
> ---
>  drivers/net/i40e/i40e_rxtx.c | 20 ++++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
> index 48429cc..2b4e5c9 100644
> --- a/drivers/net/i40e/i40e_rxtx.c
> +++ b/drivers/net/i40e/i40e_rxtx.c
> @@ -100,6 +100,12 @@
>  #define I40E_TX_OFFLOAD_NOTSUP_MASK \
>  		(PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_MASK)
>
> +#ifdef RTE_PMD_PACKET_PREFETCH
> +#define rte_packet_prefetch(p)	rte_prefetch0(p)
> +#else
> +#define rte_packet_prefetch(p)	do {} while (0)
> +#endif
> +
>  static uint16_t i40e_xmit_pkts_simple(void *tx_queue,
> 				      struct rte_mbuf **tx_pkts,
> 				      uint16_t nb_pkts);
> @@ -495,6 +501,9 @@ i40e_rx_scan_hw_ring(struct i40e_rx_queue *rxq)
>  		/* Translate descriptor info to mbuf parameters */
>  		for (j = 0; j < nb_dd; j++) {
>  			mb = rxep[j].mbuf;
> +			rte_packet_prefetch(
> +				RTE_PTR_ADD(mb->buf_addr,
> +					    RTE_PKTMBUF_HEADROOM));
>  			qword1 = rte_le_to_cpu_64(\
>  				rxdp[j].wb.qword1.status_error_len);
>  			pkt_len = ((qword1 & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
> @@ -578,9 +587,11 @@
>  i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
>
>  	rxdp = &rxq->rx_ring[alloc_idx];
>  	for (i = 0; i < rxq->rx_free_thresh; i++) {
> -		if (likely(i < (rxq->rx_free_thresh - 1)))
> +		if (likely(i < (rxq->rx_free_thresh - 1))) {
>  			/* Prefetch next mbuf */
> -			rte_prefetch0(rxep[i + 1].mbuf);
> +			rte_packet_prefetch(rxep[i + 1].mbuf->cacheline0);
> +			rte_packet_prefetch(rxep[i + 1].mbuf->cacheline1);

As far as I can see, the line above is the only real difference in that patch.
If that is so, might it be worth re-running the perf tests without that line
(see the sketch at the end of this mail)?
Konstantin

> +		}
>
>  		mb = rxep[i].mbuf;
>  		rte_mbuf_refcnt_set(mb, 1);
> @@ -752,7 +763,8 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> uint16_t nb_pkts)
>  			I40E_RXD_QW1_LENGTH_PBUF_SHIFT) - rxq->crc_len;
>
>  		rxm->data_off = RTE_PKTMBUF_HEADROOM;
> -		rte_prefetch0(RTE_PTR_ADD(rxm->buf_addr, RTE_PKTMBUF_HEADROOM));
> +		rte_packet_prefetch(RTE_PTR_ADD(rxm->buf_addr,
> +				RTE_PKTMBUF_HEADROOM));
>  		rxm->nb_segs = 1;
>  		rxm->next = NULL;
>  		rxm->pkt_len = rx_packet_len;
> @@ -939,7 +951,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
>  		first_seg->ol_flags |= pkt_flags;
>
>  		/* Prefetch data of first segment, if configured to do so. */
> -		rte_prefetch0(RTE_PTR_ADD(first_seg->buf_addr,
> +		rte_packet_prefetch(RTE_PTR_ADD(first_seg->buf_addr,
>  				first_seg->data_off));
>  		rx_pkts[nb_rx++] = first_seg;
>  		first_seg = NULL;
> --
> 2.1.4
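
To be concrete, a rough sketch of the variant to re-test: the refill loop in
i40e_rx_alloc_bufs() with only the first-cache-line prefetch of the next mbuf
kept, i.e. the extra cacheline1 prefetch dropped. This is an illustration
only, not a submitted patch; all identifiers are taken from the quoted diff
and the rest of the loop is unchanged.

	rxdp = &rxq->rx_ring[alloc_idx];
	for (i = 0; i < rxq->rx_free_thresh; i++) {
		if (likely(i < (rxq->rx_free_thresh - 1))) {
			/* Prefetch only the first cache line of the next
			 * mbuf, as the pre-patch rte_prefetch0() did; the
			 * second-cache-line prefetch is removed for the
			 * test run.
			 */
			rte_packet_prefetch(rxep[i + 1].mbuf);
		}

		mb = rxep[i].mbuf;
		rte_mbuf_refcnt_set(mb, 1);
		/* rest of the refill loop stays as in the patch */
	}

If I read the mbuf markers right, prefetching cacheline0 is equivalent to the
old rte_prefetch0(rxep[i + 1].mbuf), so the new cost in this loop should only
be the additional cacheline1 prefetch issued per buffer.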