Hi All,

In non-vector mode, without this patch, single-core performance can reach 37.576 Mpps with 64-byte packets, but after applying this patch, single-core performance drops to 34.343 Mpps with 64-byte packets.
Best Regards
Yulong Pei

-----Original Message-----
From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Vladyslav Buslov
Sent: Wednesday, March 1, 2017 6:57 PM
To: Zhang, Helin <helin.zh...@intel.com>; Wu, Jingjing <jingjing...@intel.com>; Yigit, Ferruh <ferruh.yi...@intel.com>
Cc: dev@dpdk.org
Subject: [dpdk-dev] [PATCH] net/i40e: add packet prefetch

Prefetch both cache lines of mbuf and first cache line of payload if CONFIG_RTE_PMD_PACKET_PREFETCH is set.

Signed-off-by: Vladyslav Buslov <vladyslav.bus...@harmonicinc.com>
---
 drivers/net/i40e/i40e_rxtx.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 48429cc..2b4e5c9 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -100,6 +100,12 @@
 #define I40E_TX_OFFLOAD_NOTSUP_MASK \
 		(PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_MASK)

+#ifdef RTE_PMD_PACKET_PREFETCH
+#define rte_packet_prefetch(p) rte_prefetch0(p)
+#else
+#define rte_packet_prefetch(p) do {} while (0)
+#endif
+
 static uint16_t i40e_xmit_pkts_simple(void *tx_queue,
 				      struct rte_mbuf **tx_pkts,
 				      uint16_t nb_pkts);

@@ -495,6 +501,9 @@ i40e_rx_scan_hw_ring(struct i40e_rx_queue *rxq)
 		/* Translate descriptor info to mbuf parameters */
 		for (j = 0; j < nb_dd; j++) {
 			mb = rxep[j].mbuf;
+			rte_packet_prefetch(
+				RTE_PTR_ADD(mb->buf_addr,
+					RTE_PKTMBUF_HEADROOM));
 			qword1 = rte_le_to_cpu_64(\
 				rxdp[j].wb.qword1.status_error_len);
 			pkt_len = ((qword1 & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>

@@ -578,9 +587,11 @@ i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
 	rxdp = &rxq->rx_ring[alloc_idx];
 	for (i = 0; i < rxq->rx_free_thresh; i++) {
-		if (likely(i < (rxq->rx_free_thresh - 1)))
+		if (likely(i < (rxq->rx_free_thresh - 1))) {
 			/* Prefetch next mbuf */
-			rte_prefetch0(rxep[i + 1].mbuf);
+			rte_packet_prefetch(rxep[i + 1].mbuf->cacheline0);
+			rte_packet_prefetch(rxep[i + 1].mbuf->cacheline1);
+		}

 		mb = rxep[i].mbuf;
 		rte_mbuf_refcnt_set(mb, 1);

@@ -752,7 +763,8 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 			I40E_RXD_QW1_LENGTH_PBUF_SHIFT) - rxq->crc_len;
 		rxm->data_off = RTE_PKTMBUF_HEADROOM;
-		rte_prefetch0(RTE_PTR_ADD(rxm->buf_addr, RTE_PKTMBUF_HEADROOM));
+		rte_packet_prefetch(RTE_PTR_ADD(rxm->buf_addr,
+			RTE_PKTMBUF_HEADROOM));
 		rxm->nb_segs = 1;
 		rxm->next = NULL;
 		rxm->pkt_len = rx_packet_len;

@@ -939,7 +951,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 		first_seg->ol_flags |= pkt_flags;
 		/* Prefetch data of first segment, if configured to do so. */
-		rte_prefetch0(RTE_PTR_ADD(first_seg->buf_addr,
+		rte_packet_prefetch(RTE_PTR_ADD(first_seg->buf_addr,
 				first_seg->data_off));
 		rx_pkts[nb_rx++] = first_seg;
 		first_seg = NULL;
--
2.1.4
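
For readers who want to try the pattern outside the DPDK tree, below is a minimal standalone sketch of the conditional packet-prefetch idiom the patch introduces. It is an illustration under stated assumptions, not DPDK code: GCC/Clang's __builtin_prefetch stands in for rte_prefetch0, a local PACKET_PREFETCH define stands in for CONFIG_RTE_PMD_PACKET_PREFETCH, and fake_mbuf, packet_prefetch, ring, and BURST are hypothetical names invented for this sketch.

/*
 * Standalone sketch of the conditional-prefetch pattern in the patch
 * above. Assumptions: __builtin_prefetch replaces rte_prefetch0,
 * PACKET_PREFETCH replaces CONFIG_RTE_PMD_PACKET_PREFETCH, and
 * fake_mbuf is a stand-in for struct rte_mbuf with two metadata
 * cache lines plus payload.
 */
#include <stdio.h>
#include <stdlib.h>

#ifdef PACKET_PREFETCH
#define packet_prefetch(p) __builtin_prefetch((p), 0, 3)
#else
#define packet_prefetch(p) do {} while (0)
#endif

struct fake_mbuf {
	char cacheline0[64];	/* first metadata cache line */
	char cacheline1[64];	/* second metadata cache line */
	char payload[128];	/* packet data */
};

int main(void)
{
	enum { BURST = 32 };
	struct fake_mbuf *ring[BURST];
	unsigned long sum = 0;
	int i;

	for (i = 0; i < BURST; i++) {
		ring[i] = calloc(1, sizeof(struct fake_mbuf));
		if (ring[i] == NULL)
			return 1;
	}

	for (i = 0; i < BURST; i++) {
		/* Prefetch both cache lines of the *next* mbuf while the
		 * current one is processed (the i40e_rx_alloc_bufs hunk). */
		if (i < BURST - 1) {
			packet_prefetch(ring[i + 1]->cacheline0);
			packet_prefetch(ring[i + 1]->cacheline1);
		}
		/* Prefetch this packet's payload before parsing it
		 * (the i40e_recv_pkts hunk). */
		packet_prefetch(ring[i]->payload);
		sum += (unsigned char)ring[i]->payload[0];
	}

	printf("sum: %lu\n", sum);
	for (i = 0; i < BURST; i++)
		free(ring[i]);
	return 0;
}

Compile with "gcc -O2 -DPACKET_PREFETCH sketch.c" to enable the prefetches, or drop the define to compile them out entirely, which mirrors how the build option gates the rte_packet_prefetch macro in the patch.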