DPDK v1.8.0 makes significant changes to struct rte_mbuf, including
removal of the 'pkt' and 'data' fields. The latter, formally a
pointer, is now calculated via an offset from the start of the
segment buffer. These fields are referenced by OVS when accessing
the data section of an ofpbuf.

The following changes are required to add support for DPDK 1.8:
v3:
- revert splitting of data across ofpbufs
- use the mbuf's 'udata64' field to keep track of the mbuf's data
  state (i.e. NULL/not NULL), and also to store the upper 16 bits
  of the ofpbuf's data offset when that value is greater than
  UINT16_MAX (i.e. the current limit for mbufs)
- Update DPDK version to 1.8.0 for travis installation scripts and
  INSTALL.DPDK.md

v2:
- split large amounts of data across multiple ofpbufs; with the
  removal of the mbuf's 'data' pointer, and replacement with a
  'data_off' field, it is necessary to limit the size of data
  contained in an ofpbuf to UINT16_MAX when mbufs are used
  (data_off and data_len are both of type uint16_t).
  Were data not split across multiple ofpbufs, values larger
  than UINT16_MAX for 'data_len' and 'data_off' would result
  in wrap-around, and consequently, data corruption. Changes
  introduced in this patch prevent this from occurring.

v1:
- update affected functions to use the correct rte_mbuf fields
- remove init function from netdev-dpdk (no longer required as
  rte_eal_pci_probe is now invoked from eal_init).

Signed-off-by: Mark Kavanagh <mark.b.kavan...@intel.com>
Signed-off-by: Rory Sexton <rory.sex...@intel.com>
Signed-off-by: Kevin Traynor <kevin.tray...@intel.com>
---
 .travis/build.sh  |    4 +-
 INSTALL.DPDK.md   |    6 +-
 lib/netdev-dpdk.c |   31 ++++---------
 lib/ofpbuf.c      |   12 +++--
 lib/ofpbuf.h      |  126 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 lib/packet-dpif.h |    4 +-
 6 files changed, 143 insertions(+), 40 deletions(-)

diff --git a/.travis/build.sh b/.travis/build.sh
index a8a515b..942265a 100755
--- a/.travis/build.sh
+++ b/.travis/build.sh
@@ -67,10 +67,10 @@ fi
 
 if [ "$DPDK" ]; then
     if [ -z "$DPDK_VER" ]; then
-           DPDK_VER="1.7.1"
+           DPDK_VER="1.8.0"
     fi
     install_dpdk $DPDK_VER
-    # Disregard bad function cassts until DPDK is fixed
+    # Disregard bad function casts until DPDK is fixed
     CFLAGS="$CFLAGS -Wno-error=bad-function-cast -Wno-error=cast-align"
     EXTRA_OPTS+="--with-dpdk=./dpdk-$DPDK_VER/build"
 elif [ $CC != "clang" ]; then
diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
index 4c443e5..276fe56 100644
--- a/INSTALL.DPDK.md
+++ b/INSTALL.DPDK.md
@@ -16,13 +16,13 @@ OVS needs a system with 1GB hugepages support.
 Building and Installing:
 ------------------------
 
-Required DPDK 1.7
+Required DPDK 1.8.0
 
 1. Configure build & install DPDK:
   1. Set `$DPDK_DIR`
 
      ```
-     export DPDK_DIR=/usr/src/dpdk-1.7.1
+     export DPDK_DIR=/usr/src/dpdk-1.8.0
      cd $DPDK_DIR
      ```
 
@@ -158,7 +158,7 @@ Using the DPDK with ovs-vswitchd:
    ```
 
 6. Add bridge & ports
-          
+
    To use ovs-vswitchd with DPDK, create a bridge with datapath_type
    "netdev" in the configuration database.  For example:
 
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index aea2016..2364455 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -28,6 +28,9 @@
 #include <unistd.h>
 #include <stdio.h>
 
+#include "rte_config.h"
+#include "rte_mbuf.h"
+
 #include "dpif-netdev.h"
 #include "list.h"
 #include "netdev-dpdk.h"
@@ -265,13 +268,12 @@ __rte_pktmbuf_init(struct rte_mempool *mp,
     m->buf_len = (uint16_t)buf_len;
 
     /* keep some headroom between start of buffer and data */
-    m->pkt.data = (char*) m->buf_addr + RTE_MIN(RTE_PKTMBUF_HEADROOM, 
m->buf_len);
+    m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, m->buf_len);
 
     /* init some constant fields */
-    m->type = RTE_MBUF_PKT;
     m->pool = mp;
-    m->pkt.nb_segs = 1;
-    m->pkt.in_port = 0xff;
+    m->nb_segs = 1;
+    m->port = 0xff;
 }
 
 static void
@@ -825,7 +827,8 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct 
dpif_packet ** pkts,
         }
 
         /* We have to do a copy for now */
-        memcpy(mbufs[newcnt]->pkt.data, ofpbuf_data(&pkts[i]->ofpbuf), size);
+        memcpy(rte_pktmbuf_mtod(mbufs[newcnt], void *),
+                ofpbuf_data(&pkts[i]->ofpbuf), size);
 
         rte_pktmbuf_data_len(mbufs[newcnt]) = size;
         rte_pktmbuf_pkt_len(mbufs[newcnt]) = size;
@@ -1270,22 +1273,6 @@ dpdk_common_init(void)
     ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
 }
 
-static int
-dpdk_class_init(void)
-{
-    int result;
-
-    result = rte_eal_pci_probe();
-    if (result) {
-        VLOG_ERR("Cannot probe PCI");
-        return -result;
-    }
-
-    VLOG_INFO("Ethernet Device Count: %d", (int)rte_eth_dev_count());
-
-    return 0;
-}
-
 /* Client Rings */
 
 static int
@@ -1510,7 +1497,7 @@ dpdk_init(int argc, char **argv)
 const struct netdev_class dpdk_class =
     NETDEV_DPDK_CLASS(
         "dpdk",
-        dpdk_class_init,
+        NULL,
         netdev_dpdk_construct,
         netdev_dpdk_set_multiq,
         netdev_dpdk_eth_send);
diff --git a/lib/ofpbuf.c b/lib/ofpbuf.c
index 4946e6f..71e396e 100644
--- a/lib/ofpbuf.c
+++ b/lib/ofpbuf.c
@@ -15,11 +15,12 @@
  */
 
 #include <config.h>
-#include "ofpbuf.h"
 #include <stdlib.h>
 #include <string.h>
+
 #include "dynamic-string.h"
 #include "netdev-dpdk.h"
+#include "ofpbuf.h"
 #include "util.h"
 
 static void
@@ -247,11 +248,14 @@ ofpbuf_copy__(struct ofpbuf *b, uint8_t *new_base,
 static void
 ofpbuf_resize__(struct ofpbuf *b, size_t new_headroom, size_t new_tailroom)
 {
-    void *new_base, *new_data;
+    void *new_base, *new_data, *old_data;
     size_t new_allocated;
 
     new_allocated = new_headroom + ofpbuf_size(b) + new_tailroom;
 
+    /* Keep a record of the location of the old data */
+    old_data = ofpbuf_data(b);
+
     switch (b->source) {
     case OFPBUF_DPDK:
         OVS_NOT_REACHED();
@@ -283,9 +287,9 @@ ofpbuf_resize__(struct ofpbuf *b, size_t new_headroom, 
size_t new_tailroom)
     ofpbuf_set_base(b, new_base);
 
     new_data = (char *) new_base + new_headroom;
-    if (ofpbuf_data(b) != new_data) {
+    if (old_data != new_data) {
         if (b->frame) {
-            uintptr_t data_delta = (char *) new_data - (char *) ofpbuf_data(b);
+            uintptr_t data_delta = (char *) new_data - (char *) old_data;
 
             b->frame = (char *) b->frame + data_delta;
         }
diff --git a/lib/ofpbuf.h b/lib/ofpbuf.h
index 4e7038d0..38156b0 100644
--- a/lib/ofpbuf.h
+++ b/lib/ofpbuf.h
@@ -19,15 +19,51 @@
 
 #include <stddef.h>
 #include <stdint.h>
+
 #include "list.h"
+#include "netdev-dpdk.h"
 #include "packets.h"
 #include "util.h"
-#include "netdev-dpdk.h"
+
+#ifdef DPDK_NETDEV
+#include "rte_common.h"
+#endif
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
+#ifdef DPDK_NETDEV
+/* DPDK 1.8 makes significant changes to struct rte_ mbuf that impact OVS.
+ * Chief among these is the replacement of the 'data' pointer (previously used
+ * to store an ofpbuf's data) with an offset field - the data pointer may be
+ * calculated by adding the data_off to the mbuf segment address. This breaks
+ * the ofpbuf_data API, as there is no way to distinguish an mbuf containing no
+ * data from an mbuf whose data starts at the very beginning of the mbuf (i.e.a
+ * NULL pointer cannot be returned). Furthermore, the data_off field is only 16
+ * bits in length, which restricts the distance between the beginning of an
+ * ofpbuf/mbuf and its actual data. This affects certain control ofpbufs.
+ *
+ * To overcome these limitations, store some information about the ofpbuf in
+ * the mbuf's user data field (mbuf.udata64):
+ * - data_state: flag describing state of data in the mbuf (udata64 bit 0)
+ *               1 = NULL, 0 = non-NULL. When this bit is set, a call to
+ *               ofpbuf_data returns NULL, restoring API compatibility.
+ * - data_off:   upper 16 bits of ofpbuf's data offset (udata64 bits 16-31)
+ *               When the distance between the start of an mbuf segment and its
+ *               actual data exceeds UINT16_MAX, split the 32 bit offset across
+ *               two mbuf fields: the lower 16 bits are stored in mbuf.data_off
+ *               while the upper 16 are stored in bits 16-31 of mbuf.udata64.
+ */
+#define DATA_OFFSET_MASK     (0xFFFF0000)
+#define DATA_OFFSET_UPPER(d) ((d) & DATA_OFFSET_MASK)
+
+enum OVS_PACKED_ENUM ofpbuf_data_state {
+    OFPBUF_DATA_NON_NULL,       /* ofpbuf contains an mbuf with valid data */
+    OFPBUF_DATA_NULL            /* ofpbuf's mbuf holds NULL data */
+};
+#endif
+
 enum OVS_PACKED_ENUM ofpbuf_source {
     OFPBUF_MALLOC,              /* Obtained via malloc(). */
     OFPBUF_STACK,               /* Un-movable stack space or static buffer. */
@@ -84,6 +120,12 @@ static inline void ofpbuf_set_data(struct ofpbuf *, void *);
 static inline void * ofpbuf_base(const struct ofpbuf *);
 static inline void ofpbuf_set_base(struct ofpbuf *, void *);
 
+#ifdef DPDK_NETDEV
+static inline bool ofpbuf_data_is_null(const struct ofpbuf *b);
+static inline void ofpbuf_set_data_null(struct ofpbuf *b);
+static inline void ofpbuf_set_data_non_null(struct ofpbuf *b);
+#endif
+
 static inline uint32_t ofpbuf_size(const struct ofpbuf *);
 static inline void ofpbuf_set_size(struct ofpbuf *, uint32_t);
 
@@ -384,14 +426,71 @@ static inline const void *ofpbuf_get_nd_payload(const 
struct ofpbuf *b)
 #ifdef DPDK_NETDEV
 BUILD_ASSERT_DECL(offsetof(struct ofpbuf, mbuf) == 0);
 
+static inline bool ofpbuf_data_is_null(const struct ofpbuf *b)
+{
+    return b->mbuf.udata64 & OFPBUF_DATA_NULL;
+}
+
+static inline void ofpbuf_set_data_null(struct ofpbuf *b)
+{
+    b->mbuf.udata64 |= OFPBUF_DATA_NULL;
+}
+
+static inline void ofpbuf_set_data_non_null(struct ofpbuf *b)
+{
+    b->mbuf.udata64 &= ~OFPBUF_DATA_NULL;
+}
+
 static inline void * ofpbuf_data(const struct ofpbuf *b)
 {
-    return b->mbuf.pkt.data;
+    uint32_t offset = 0;
+
+    if (OVS_UNLIKELY(ofpbuf_data_is_null(b))) {
+        return NULL;
+    }
+
+    offset = b->mbuf.data_off;
+
+    /* If data_off is partially stored in b->mbuf->udata64, it'll need to
+     * be reassembled.
+     */
+    if (OVS_UNLIKELY(b->mbuf.udata64 & DATA_OFFSET_MASK)) {
+        offset |= DATA_OFFSET_UPPER(b->mbuf.udata64);
+    }
+
+    return (void *)((char *)b->mbuf.buf_addr + offset);
 }
 
 static inline void ofpbuf_set_data(struct ofpbuf *b, void *d)
 {
-    b->mbuf.pkt.data = d;
+    uint64_t data_delta = 0;
+
+    /* NULL 'd' value is valid */
+    if (OVS_UNLIKELY(d == NULL)) {
+        b->mbuf.data_off = 0;
+        b->mbuf.udata64 = 0;
+        ofpbuf_set_data_null(b);
+    } else if (OVS_UNLIKELY(d < b->mbuf.buf_addr)) {
+        OVS_NOT_REACHED();
+    } else {
+        /* Work out the offset between the start of segment buffer and 'd' */
+        data_delta = RTE_PTR_DIFF(d, b->mbuf.buf_addr);
+
+        /* Set lower 16 bits of total data offset, allowing truncation */
+        b->mbuf.data_off = (uint16_t)data_delta;
+
+        /* Reset udata64 in case any junk remains */
+        b->mbuf.udata64 = 0;
+
+        /* If data_delta is > UINT16_MAX, store the upper 16 bits in bits 16-31
+         * of b->mbuf->udata64.
+         */
+        if (OVS_UNLIKELY(data_delta > UINT16_MAX)) {
+            b->mbuf.udata64 = DATA_OFFSET_UPPER(data_delta);
+        }
+
+        ofpbuf_set_data_non_null(b);
+    }
 }
 
 static inline void * ofpbuf_base(const struct ofpbuf *b)
@@ -406,14 +505,27 @@ static inline void ofpbuf_set_base(struct ofpbuf *b, void 
*d)
 
 static inline uint32_t ofpbuf_size(const struct ofpbuf *b)
 {
-    return b->mbuf.pkt.pkt_len;
+    return b->mbuf.pkt_len;
 }
 
 static inline void ofpbuf_set_size(struct ofpbuf *b, uint32_t v)
 {
-    b->mbuf.pkt.data_len = v;    /* Current seg length. */
-    b->mbuf.pkt.pkt_len = v;     /* Total length of all segments linked to
-                                  * this segment. */
+    /* netdev-dpdk does not currently support segmentation; consequently, for
+     * all intents and purposes, 'data_len' (16 bit) and 'pkt_len' (32 bit) may
+     * be used interchangably.
+     *
+     * On the datapath, it is expected that the size of packets
+     * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
+     * loss of accuracy in assigning 'v' to 'data_len'.
+     *
+     * However, control ofpbufs may well be larger than UINT16_MAX (i.e. 'v' >
+     * UINT16_MAX); even though the value is truncated when assigned to
+     * 'data_len', loss of accuracy is avoided in this situation by using
+     * 'pkt_len' to represent the packet size.
+     */
+    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
+    b->mbuf.pkt_len = v;             /* Total length of all segments linked to
+                                      * this segment. */
 }
 
 #else
diff --git a/lib/packet-dpif.h b/lib/packet-dpif.h
index 1a5efb6..692a81a 100644
--- a/lib/packet-dpif.h
+++ b/lib/packet-dpif.h
@@ -50,7 +50,7 @@ static inline void dpif_packet_delete(struct dpif_packet *p)
 static inline uint32_t dpif_packet_get_dp_hash(struct dpif_packet *p)
 {
 #ifdef DPDK_NETDEV
-    return p->ofpbuf.mbuf.pkt.hash.rss;
+    return p->ofpbuf.mbuf.hash.rss;
 #else
     return p->dp_hash;
 #endif
@@ -60,7 +60,7 @@ static inline void dpif_packet_set_dp_hash(struct dpif_packet 
*p,
                                            uint32_t hash)
 {
 #ifdef DPDK_NETDEV
-    p->ofpbuf.mbuf.pkt.hash.rss = hash;
+    p->ofpbuf.mbuf.hash.rss = hash;
 #else
     p->dp_hash = hash;
 #endif
-- 
1.7.4.1

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to