> -----Original Message-----
> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Pei, Yulong
> Sent: Saturday, April 1, 2017 3:02 AM
> To: Vladyslav Buslov <vladyslav.bus...@harmonicinc.com>; Zhang, Helin 
> <helin.zh...@intel.com>; Wu, Jingjing <jingjing...@intel.com>;
> Yigit, Ferruh <ferruh.yi...@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] net/i40e: add packet prefetch
> 
> Hi All
> 
> In non-vector mode, without this patch, single-core performance can reach
> 37.576 Mpps with 64-byte packets,
> but after applying this patch, single-core performance drops to
> 34.343 Mpps with 64-byte packets.
> 
> Best Regards
> Yulong Pei
> 
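(For scale, that is a drop of roughly (37.576 - 34.343) / 37.576 ≈ 8.6% in the 64-byte test.)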
> -----Original Message-----
> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Vladyslav Buslov
> Sent: Wednesday, March 1, 2017 6:57 PM
> To: Zhang, Helin <helin.zh...@intel.com>; Wu, Jingjing 
> <jingjing...@intel.com>; Yigit, Ferruh <ferruh.yi...@intel.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH] net/i40e: add packet prefetch
> 
> Prefetch both cache lines of mbuf and first cache line of payload if 
> CONFIG_RTE_PMD_PACKET_PREFETCH is set.
> 
> Signed-off-by: Vladyslav Buslov <vladyslav.bus...@harmonicinc.com>
> ---
>  drivers/net/i40e/i40e_rxtx.c | 20 ++++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c 
> index 48429cc..2b4e5c9 100644
> --- a/drivers/net/i40e/i40e_rxtx.c
> +++ b/drivers/net/i40e/i40e_rxtx.c
> @@ -100,6 +100,12 @@
>  #define I40E_TX_OFFLOAD_NOTSUP_MASK \
>               (PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_MASK)
> 
> +#ifdef RTE_PMD_PACKET_PREFETCH
> +#define rte_packet_prefetch(p)       rte_prefetch0(p)
> +#else
> +#define rte_packet_prefetch(p)       do {} while (0)
> +#endif
> +
>  static uint16_t i40e_xmit_pkts_simple(void *tx_queue,
>                                     struct rte_mbuf **tx_pkts,
>                                     uint16_t nb_pkts);
> @@ -495,6 +501,9 @@ i40e_rx_scan_hw_ring(struct i40e_rx_queue *rxq)
>               /* Translate descriptor info to mbuf parameters */
>               for (j = 0; j < nb_dd; j++) {
>                       mb = rxep[j].mbuf;
> +                     rte_packet_prefetch(
> +                             RTE_PTR_ADD(mb->buf_addr,
> +                                             RTE_PKTMBUF_HEADROOM));
>                       qword1 = rte_le_to_cpu_64(\
>                               rxdp[j].wb.qword1.status_error_len);
>                       pkt_len = ((qword1 & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> 
> @@ -578,9 +587,11 @@
> i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
> 
>       rxdp = &rxq->rx_ring[alloc_idx];
>       for (i = 0; i < rxq->rx_free_thresh; i++) {
> -             if (likely(i < (rxq->rx_free_thresh - 1)))
> +             if (likely(i < (rxq->rx_free_thresh - 1))) {
>                       /* Prefetch next mbuf */
> -                     rte_prefetch0(rxep[i + 1].mbuf);
> +                     rte_packet_prefetch(rxep[i + 1].mbuf->cacheline0);
> +                     rte_packet_prefetch(rxep[i + 1].mbuf->cacheline1);

As far as I can see, the line above is the only real difference in that patch.
If so, it might be worth re-running the perf tests without that line?
Konstantin

> +             }
> 
>               mb = rxep[i].mbuf;
>               rte_mbuf_refcnt_set(mb, 1);
> @@ -752,7 +763,8 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, 
> uint16_t nb_pkts)
>                               I40E_RXD_QW1_LENGTH_PBUF_SHIFT) - rxq->crc_len;
> 
>               rxm->data_off = RTE_PKTMBUF_HEADROOM;
> -             rte_prefetch0(RTE_PTR_ADD(rxm->buf_addr, RTE_PKTMBUF_HEADROOM));
> +             rte_packet_prefetch(RTE_PTR_ADD(rxm->buf_addr,
> +                                             RTE_PKTMBUF_HEADROOM));
>               rxm->nb_segs = 1;
>               rxm->next = NULL;
>               rxm->pkt_len = rx_packet_len;
> @@ -939,7 +951,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
>               first_seg->ol_flags |= pkt_flags;
> 
>               /* Prefetch data of first segment, if configured to do so. */
> -             rte_prefetch0(RTE_PTR_ADD(first_seg->buf_addr,
> +             rte_packet_prefetch(RTE_PTR_ADD(first_seg->buf_addr,
>                       first_seg->data_off));
>               rx_pkts[nb_rx++] = first_seg;
>               first_seg = NULL;
> --
> 2.1.4
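
For reference, a minimal standalone sketch of the two prefetch patterns the
patch introduces (not part of the patch itself; it assumes the rte_mbuf
cacheline0/cacheline1 markers and RTE_PKTMBUF_HEADROOM from DPDK of this era,
and the helper names are illustrative only):

#include <rte_common.h>
#include <rte_mbuf.h>
#include <rte_prefetch.h>

#ifdef RTE_PMD_PACKET_PREFETCH
#define rte_packet_prefetch(p)	rte_prefetch0(p)
#else
#define rte_packet_prefetch(p)	do {} while (0)
#endif

/*
 * As in i40e_rx_alloc_bufs() with the patch applied: warm both cache
 * lines of the mbuf struct. The pre-patch code issued only
 * rte_prefetch0(mbuf), which covers cacheline0 alone, so the cacheline1
 * prefetch is the new work on this path.
 */
static inline void
prefetch_mbuf_struct(struct rte_mbuf *mb)
{
	rte_packet_prefetch(mb->cacheline0);
	rte_packet_prefetch(mb->cacheline1);
}

/*
 * As in i40e_rx_scan_hw_ring()/i40e_recv_pkts() with the patch applied:
 * warm the first cache line of the packet data, which starts right after
 * the headroom in the data buffer.
 */
static inline void
prefetch_packet_data(struct rte_mbuf *mb)
{
	rte_packet_prefetch(RTE_PTR_ADD(mb->buf_addr, RTE_PKTMBUF_HEADROOM));
}

With CONFIG_RTE_PMD_PACKET_PREFETCH disabled in the build config the macro
compiles out to a no-op, which is one way to take all of these prefetches out
of the picture when re-running the single-core test.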
