To avoid multiple stores on the fast path, Ethernet drivers aggregate the writes to data_off, refcnt, nb_segs and port into a single uint64_t value and write it in one shot through a uint64_t pointer at the &mbuf->rearm_data address.
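For illustration, the pattern looks roughly like the sketch below. This is
not code from any particular driver; the helper names and field values are
examples, and it assumes rte_mbuf.h and RTE_PKTMBUF_HEADROOM are available:

#include <stdint.h>
#include <string.h>

#include <rte_mbuf.h>

/* Illustrative sketch only: compose the 64-bit rearm value once at queue
 * setup, then rearm each mbuf with a single 8-byte store at rearm_data. */
static uint64_t
make_mbuf_initializer(uint8_t port_id)
{
	struct rte_mbuf mb_def;

	memset(&mb_def, 0, sizeof(mb_def));
	mb_def.nb_segs = 1;
	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
	mb_def.port = port_id;
	rte_mbuf_refcnt_set(&mb_def, 1);

	/* data_off, refcnt, nb_segs and port all sit inside the 8-byte
	 * window marked by rearm_data, so one load captures them all. */
	return *(uint64_t *)(uintptr_t)&mb_def.rearm_data;
}

static inline void
rearm_one(struct rte_mbuf *mb, uint64_t mbuf_initializer)
{
	/* one 64-bit store instead of four narrow stores; this is the store
	 * that wants &mb->rearm_data to be naturally aligned */
	*(uint64_t *)(uintptr_t)&mb->rearm_data = mbuf_initializer;
}

In the vector PMDs the composed value is kept as rxq->mbuf_initializer and
reused for every rearm, as the hunks below show.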
Some non-IA platforms incur a store-operation penalty when the store address
is not naturally aligned. This patch fixes that performance issue on those
targets.

Signed-off-by: Jerin Jacob <jerin.jacob at caviumnetworks.com>
---
Tested this patch on IA and non-IA (ThunderX) platforms. It shows a
400Kpps/core improvement in a ThunderX + ixgbe + vector environment, and it
adds no overhead on the IA platform.

I also tried a similar approach that replaces "buf_len" with "pad" (in this
patch's context); since it needs an additional read plus a mask to keep
"buf_len" intact, it did not show much improvement.
ref: http://dpdk.org/ml/archives/dev/2016-May/038914.html

(A compile-time layout-check sketch is appended below the patch for
reference.)
---
 drivers/net/fm10k/fm10k_rxtx_vec.c                            | 3 ---
 drivers/net/i40e/i40e_rxtx_vec.c                              | 5 +----
 drivers/net/ixgbe/ixgbe_rxtx_vec.c                            | 3 ---
 lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h | 4 ++--
 lib/librte_mbuf/rte_mbuf.h                                    | 6 +++---
 5 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
index 03e4a5c..f3ef1a1 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -314,9 +314,6 @@ fm10k_rxq_rearm(struct fm10k_rx_queue *rxq)
 	/* Flush mbuf with pkt template.
 	 * Data to be rearmed is 6 bytes long.
-	 * Though, RX will overwrite ol_flags that are coming next
-	 * anyway. So overwrite whole 8 bytes with one load:
-	 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
 	 */
 	p0 = (uintptr_t)&mb0->rearm_data;
 	*(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/drivers/net/i40e/i40e_rxtx_vec.c b/drivers/net/i40e/i40e_rxtx_vec.c
index f7a62a8..162ce4e 100644
--- a/drivers/net/i40e/i40e_rxtx_vec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec.c
@@ -86,11 +86,8 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 	mb0 = rxep[0].mbuf;
 	mb1 = rxep[1].mbuf;
-	/* Flush mbuf with pkt template.
+	/* Flush mbuf with pkt template.
 	 * Data to be rearmed is 6 bytes long.
-	 * Though, RX will overwrite ol_flags that are coming next
-	 * anyway. So overwrite whole 8 bytes with one load:
-	 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
 	 */
 	p0 = (uintptr_t)&mb0->rearm_data;
 	*(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index c4d709b..33b378d 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -89,9 +89,6 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
 	/*
 	 * Flush mbuf with pkt template.
 	 * Data to be rearmed is 6 bytes long.
-	 * Though, RX will overwrite ol_flags that are coming next
-	 * anyway. So overwrite whole 8 bytes with one load:
-	 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
 	 */
 	p0 = (uintptr_t)&mb0->rearm_data;
 	*(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index 2acdfd9..26f61f8 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -111,11 +111,11 @@ struct rte_kni_fifo {
  */
 struct rte_kni_mbuf {
 	void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE)));
-	char pad0[10];
+	char pad0[8];
 	uint16_t data_off;      /**< Start address of data in segment buffer. */
 	char pad1[2];
 	uint8_t nb_segs;        /**< Number of segments. */
-	char pad4[1];
+	char pad4[3];
 	uint64_t ol_flags;      /**< Offload features. */
 	char pad2[4];
 	uint32_t pkt_len;       /**< Total pkt len: sum of all segment data_len. */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 7b92b88..6bc47ed 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -733,10 +733,8 @@ struct rte_mbuf {
 	void *buf_addr;           /**< Virtual address of segment buffer. */
 	phys_addr_t buf_physaddr; /**< Physical address of segment buffer. */
-	uint16_t buf_len;         /**< Length of segment buffer. */
-	/* next 6 bytes are initialised on RX descriptor rearm */
-	MARKER8 rearm_data;
+	MARKER64 rearm_data;
 	uint16_t data_off;
 	/**
@@ -753,6 +751,7 @@ struct rte_mbuf {
 	};
 	uint8_t nb_segs;          /**< Number of segments. */
 	uint8_t port;             /**< Input port. */
+	uint16_t pad;             /**< 2B pad for naturally aligned ol_flags */
 	uint64_t ol_flags;        /**< Offload features. */
@@ -806,6 +805,7 @@ struct rte_mbuf {
 	uint16_t vlan_tci_outer;  /**< Outer VLAN Tag Control Identifier (CPU order) */
+	uint16_t buf_len;         /**< Length of segment buffer. */
 	/* second cache line - fields only used in slow path or on TX */
 	MARKER cacheline1 __rte_cache_min_aligned;
-- 
2.5.5
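For reference, the intended layout after this change can be expressed as
compile-time checks. This is only an illustrative sketch, not part of the
patch; it assumes a C11 toolchain (_Static_assert) and DPDK headers on the
include path:

#include <stddef.h>
#include <stdint.h>

#include <rte_mbuf.h>

/* Not part of the patch: with the 2-byte pad after 'port', ol_flags lands on
 * an 8-byte boundary and the rearm area (data_off .. pad) spans exactly the
 * 8 bytes written by the one-shot store. */
_Static_assert(offsetof(struct rte_mbuf, rearm_data) % 8 == 0,
	"rearm_data must be 8-byte aligned for the single 64-bit store");
_Static_assert(offsetof(struct rte_mbuf, ol_flags) % 8 == 0,
	"ol_flags must be naturally aligned");
_Static_assert(offsetof(struct rte_mbuf, ol_flags) -
	offsetof(struct rte_mbuf, rearm_data) == sizeof(uint64_t),
	"rearm area must cover exactly 8 bytes");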