To avoid multiple stores on the fast path, Ethernet drivers
aggregate the writes to data_off, refcnt, nb_segs and port
into a single uint64_t value and write it in one shot
through a uint64_t * at the &mbuf->rearm_data address.

Some non-IA platforms incur store overhead if the store
address is not naturally aligned. This patch fixes the
performance issue on those targets by making rearm_data
naturally aligned for a 64-bit store.

Signed-off-by: Jerin Jacob <jerin.jacob at caviumnetworks.com>
---

Tested this patch on IA and non-IA (ThunderX) platforms.
This patch shows a 400Kpps/core improvement in a ThunderX + ixgbe + vector
environment, and it does not add any overhead on the IA platform.

I also tried another, similar approach: replacing "buf_len" with "pad"
(in the context of this patch).
Since that adds the overhead of a read and then a mask to keep "buf_len" intact,
it did not show much improvement.
ref: http://dpdk.org/ml/archives/dev/2016-May/038914.html

---
 drivers/net/fm10k/fm10k_rxtx_vec.c                            | 3 ---
 drivers/net/i40e/i40e_rxtx_vec.c                              | 5 +----
 drivers/net/ixgbe/ixgbe_rxtx_vec.c                            | 3 ---
 lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h | 4 ++--
 lib/librte_mbuf/rte_mbuf.h                                    | 6 +++---
 5 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c 
b/drivers/net/fm10k/fm10k_rxtx_vec.c
index 03e4a5c..f3ef1a1 100644
--- a/drivers/net/fm10k/fm10k_rxtx_vec.c
+++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
@@ -314,9 +314,6 @@ fm10k_rxq_rearm(struct fm10k_rx_queue *rxq)

                /* Flush mbuf with pkt template.
                 * Data to be rearmed is 6 bytes long.
-                * Though, RX will overwrite ol_flags that are coming next
-                * anyway. So overwrite whole 8 bytes with one load:
-                * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
                 */
                p0 = (uintptr_t)&mb0->rearm_data;
                *(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/drivers/net/i40e/i40e_rxtx_vec.c b/drivers/net/i40e/i40e_rxtx_vec.c
index f7a62a8..162ce4e 100644
--- a/drivers/net/i40e/i40e_rxtx_vec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec.c
@@ -86,11 +86,8 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
                mb0 = rxep[0].mbuf;
                mb1 = rxep[1].mbuf;

-                /* Flush mbuf with pkt template.
+               /* Flush mbuf with pkt template.
                 * Data to be rearmed is 6 bytes long.
-                * Though, RX will overwrite ol_flags that are coming next
-                * anyway. So overwrite whole 8 bytes with one load:
-                * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
                 */
                p0 = (uintptr_t)&mb0->rearm_data;
                *(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c 
b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index c4d709b..33b378d 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -89,9 +89,6 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
                /*
                 * Flush mbuf with pkt template.
                 * Data to be rearmed is 6 bytes long.
-                * Though, RX will overwrite ol_flags that are coming next
-                * anyway. So overwrite whole 8 bytes with one load:
-                * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
                 */
                p0 = (uintptr_t)&mb0->rearm_data;
                *(uint64_t *)p0 = rxq->mbuf_initializer;
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h 
b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index 2acdfd9..26f61f8 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -111,11 +111,11 @@ struct rte_kni_fifo {
  */
 struct rte_kni_mbuf {
        void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE)));
-       char pad0[10];
+       char pad0[8];
        uint16_t data_off;      /**< Start address of data in segment buffer. */
        char pad1[2];
        uint8_t nb_segs;        /**< Number of segments. */
-       char pad4[1];
+       char pad4[3];
        uint64_t ol_flags;      /**< Offload features. */
        char pad2[4];
        uint32_t pkt_len;       /**< Total pkt len: sum of all segment 
data_len. */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 7b92b88..6bc47ed 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -733,10 +733,8 @@ struct rte_mbuf {
        void *buf_addr;           /**< Virtual address of segment buffer. */
        phys_addr_t buf_physaddr; /**< Physical address of segment buffer. */

-       uint16_t buf_len;         /**< Length of segment buffer. */
-
        /* next 6 bytes are initialised on RX descriptor rearm */
-       MARKER8 rearm_data;
+       MARKER64 rearm_data;
        uint16_t data_off;

        /**
@@ -753,6 +751,7 @@ struct rte_mbuf {
        };
        uint8_t nb_segs;          /**< Number of segments. */
        uint8_t port;             /**< Input port. */
+       uint16_t pad;             /**< 2B pad for naturally aligned ol_flags */

        uint64_t ol_flags;        /**< Offload features. */

@@ -806,6 +805,7 @@ struct rte_mbuf {

        uint16_t vlan_tci_outer;  /**< Outer VLAN Tag Control Identifier (CPU 
order) */

+       uint16_t buf_len;         /**< Length of segment buffer. */
        /* second cache line - fields only used in slow path or on TX */
        MARKER cacheline1 __rte_cache_min_aligned;

-- 
2.5.5

Reply via email to