Hi Mark,

This patch besides adding Jumbo Frame support also cleans up
the mbuf initialization (by changing the macros, adding
dpdk_buf_size, and rewriting __ovs_rte_pktmbuf_init), so thanks
for this.  I think it makes sense to split the patch in two:
one that does the clenup, and one that allows configuring the
MTU.

I agree with Flavio's comments as well, more inline

Thanks

On 01/02/2016 02:18, "Mark Kavanagh" <mark.b.kavan...@intel.com> wrote:

>Add support for Jumbo Frames to DPDK-enabled port types,
>using single-segment-mbufs.
>
>Using this approach, the amount of memory allocated for each mbuf
>to store frame data is increased to a value greater than 1518B
>(typical Ethernet maximum frame length). The increased space
>available in the mbuf means that an entire Jumbo Frame can be carried
>in a single mbuf, as opposed to partitioning it across multiple mbuf
>segments.
>
>The amount of space allocated to each mbuf to hold frame data is
>defined dynamically by the user when adding a DPDK port to a bridge.
>If an MTU value is not supplied, or the user-supplied value is invalid,
>the MTU for the port defaults to standard Ethernet MTU (i.e. 1500B).
>
>Signed-off-by: Mark Kavanagh <mark.b.kavan...@intel.com>
>---
> INSTALL.DPDK.md   |  59 +++++++++-
> NEWS              |   2 +
> lib/netdev-dpdk.c | 347
>+++++++++++++++++++++++++++++++++++++++++-------------
> 3 files changed, 328 insertions(+), 80 deletions(-)
>
>diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
>index 96b686c..64ccd15 100644
>--- a/INSTALL.DPDK.md
>+++ b/INSTALL.DPDK.md
>@@ -859,10 +859,61 @@ by adding the following string:
> to <interface> sections of all network devices used by DPDK. Parameter
>'N'
> determines how many queues can be used by the guest.
> 
>+
>+Jumbo Frames
>+------------
>+
>+Support for Jumbo Frames may be enabled at run-time for DPDK-type ports.
>+
>+To avail of Jumbo Frame support, add the 'dpdk-mtu' option to the
>ovs-vsctl
>+'add-port' command-line, along with the required MTU for the port.
>+e.g.
>+
>+     ```
>+     ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk
>options:dpdk-mtu=9000
>+     ```
>+
>+When Jumbo Frames are enabled, the size of a DPDK port's mbuf segments
>are
>+increased, such that a full Jumbo Frame may be accommodated inside a
>single
>+mbuf segment. Once set, the MTU for a DPDK port is immutable.

Why is it immutable?

>+
>+Jumbo frame support has been validated against 13312B frames, using the
>+DPDK `igb_uio` driver, but larger frames and other DPDK NIC drivers may
>+theoretically be supported. Supported port types excludes vHost-Cuse
>ports, as
>+this feature is pending deprecation.
>+
>+
>+vHost Ports and Jumbo Frames
>+----------------------------
>+Jumbo frame support is available for DPDK vHost-User ports only. Some
>additional
>+configuration is needed to take advantage of this feature:
>+
>+  1. `mergeable buffers` must be enabled for vHost ports, as
>demonstrated in
>+      the QEMU command line snippet below:
>+
>+      ```
>+      '-netdev type=vhost-user,id=mynet1,chardev=char0,vhostforce \'
>+      '-device 
>virtio-net-pci,mac=00:00:00:00:00:01,netdev=mynet1,mrg_rxbuf=on'
>+      ```
>+
>+  2. Where virtio devices are bound to the Linux kernel driver in a guest
>+     environment (i.e. interfaces are not bound to an in-guest DPDK
>driver), the
>+     MTU of those logical network interfaces must also be increased. This
>+     avoids segmentation of Jumbo Frames in the guest. Note that 'MTU'
>refers
>+     to the length of the IP packet only, and not that of the entire
>frame.
>+
>+     e.g. To calculate the exact MTU of a standard IPv4 frame, subtract
>the L2
>+     header and CRC lengths (i.e. 18B) from the max supported frame size.
>+     So, to set the MTU for a 13312B Jumbo Frame:
>+
>+      ```
>+      ifconfig eth1 mtu 13294
>+      ```
>+
>+
> Restrictions:
> -------------
> 
>-  - Work with 1500 MTU, needs few changes in DPDK lib to fix this issue.
>   - Currently DPDK port does not make use any offload functionality.
>   - DPDK-vHost support works with 1G huge pages.
> 
>@@ -903,6 +954,12 @@ Restrictions:
>     the next release of DPDK (which includes the above patch) is
>available and
>     integrated into OVS.
> 
>+  Jumbo Frames:
>+  - `virtio-pmd`: DPDK apps in the guest do not exit gracefully. The
>source of
>+     this issue is currently being investigated.
>+  - vHost-Cuse: Jumbo Frame support is not available for vHost Cuse
>ports.
>+
>+
> Bug Reporting:
> --------------
> 
>diff --git a/NEWS b/NEWS
>index 5c18867..cd563e0 100644
>--- a/NEWS
>+++ b/NEWS
>@@ -46,6 +46,8 @@ v2.5.0 - xx xxx xxxx
>      abstractions, such as virtual L2 and L3 overlays and security
>groups.
>    - RHEL packaging:
>      * DPDK ports may now be created via network scripts (see
>README.RHEL).
>+   - netdev-dpdk:
>+     * Add Jumbo Frame Support

I don't think we should add this to 2.5

> 
> 
> v2.4.0 - 20 Aug 2015
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index de7e488..76a5dcc 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -62,20 +62,25 @@ static struct vlog_rate_limit rl =
>VLOG_RATE_LIMIT_INIT(5, 20);
> #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
> #define OVS_VPORT_DPDK "ovs_dpdk"
> 
>+#define NETDEV_DPDK_JUMBO_FRAME_ENABLED     1

I don't see particular value in the above #define.

>+#define NETDEV_DPDK_DEFAULT_RX_BUFSIZE      1024
>+
> /*
>  * need to reserve tons of extra space in the mbufs so we can align the
>  * DMA addresses to 4KB.
>  * The minimum mbuf size is limited to avoid scatter behaviour and drop
>in
>  * performance for standard Ethernet MTU.
>  */
>-#define MTU_TO_MAX_LEN(mtu)  ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
>-#define MBUF_SIZE_MTU(mtu)   (MTU_TO_MAX_LEN(mtu)        \
>-                              + sizeof(struct dp_packet) \
>-                              + RTE_PKTMBUF_HEADROOM)
>-#define MBUF_SIZE_DRIVER     (2048                       \
>-                              + sizeof (struct rte_mbuf) \
>-                              + RTE_PKTMBUF_HEADROOM)
>-#define MBUF_SIZE(mtu)       MAX(MBUF_SIZE_MTU(mtu), MBUF_SIZE_DRIVER)
>+#define MTU_TO_FRAME_LEN(mtu)       ((mtu) + ETHER_HDR_LEN +
>ETHER_CRC_LEN)
>+#define FRAME_LEN_TO_MTU(frame_len) ((frame_len)- ETHER_HDR_LEN -
>ETHER_CRC_LEN)
>+#define MBUF_SEGMENT_SIZE(mtu)      ( MTU_TO_FRAME_LEN(mtu)      \
>+                                    + sizeof(struct dp_packet)   \
>+                                    + RTE_PKTMBUF_HEADROOM)
>+
>+/* This value should be specified as a multiple of the DPDK NIC driver's
>+ * 'min_rx_bufsize' attribute (currently 1024B for 'igb_uio').
>+ */
>+#define NETDEV_DPDK_MAX_FRAME_LEN    13312
> 
> /* Max and min number of packets in the mempool.  OVS tries to allocate a
>  * mempool with MAX_NB_MBUF: if this fails (because the system doesn't
>have
>@@ -86,6 +91,8 @@ static struct vlog_rate_limit rl =
>VLOG_RATE_LIMIT_INIT(5, 20);
> #define MIN_NB_MBUF          (4096 * 4)
> #define MP_CACHE_SZ          RTE_MEMPOOL_CACHE_MAX_SIZE
> 
>+#define DPDK_VLAN_TAG_LEN    4
>+
> /* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
> BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF)
>== 0);
> 
>@@ -114,7 +121,6 @@ static const struct rte_eth_conf port_conf = {
>         .header_split   = 0, /* Header Split disabled */
>         .hw_ip_checksum = 0, /* IP checksum offload disabled */
>         .hw_vlan_filter = 0, /* VLAN filtering disabled */
>-        .jumbo_frame    = 0, /* Jumbo Frame Support disabled */
>         .hw_strip_crc   = 0,
>     },
>     .rx_adv_conf = {
>@@ -254,6 +260,41 @@ is_dpdk_class(const struct netdev_class *class)
>     return class->construct == netdev_dpdk_construct;
> }
> 
>+/* DPDK NIC drivers allocate RX buffers at a particular granularity
>+ * (specified by rte_eth_dev_info.min_rx_bufsize - currently 1K for
>igb_uio).
>+ * If 'frame_len' is not a multiple of this value, insufficient buffers
>are
>+ * allocated to accomodate the packet in its entirety. Furthermore, the
>igb_uio
>+ * driver needs to ensure that there is also sufficient space in the Rx
>buffer
>+ * to accommodate two VLAN tags (for QinQ frames). If the RX buffer is
>too
>+ * small, then the driver enables scatter RX behaviour, which reduces
>+ * performance. To prevent this, use a buffer size that is closest to
>+ * 'frame_len', but which satisfies the aforementioned criteria.
>+ */
>+static uint32_t
>+dpdk_buf_size(struct netdev_dpdk *netdev, int frame_len)
>+{
>+    struct rte_eth_dev_info info;
>+    uint32_t buf_size;
>+    /* XXX: This is a workaround for DPDK v2.2, and needs to be
>refactored with a
>+     * future DPDK release. */

Could you elaborate on that?

>+    uint32_t len = frame_len + (2 * DPDK_VLAN_TAG_LEN);
>+
>+    if(netdev->type == DPDK_DEV_ETH) {
>+        rte_eth_dev_info_get(netdev->port_id, &info);
>+        buf_size = (info.min_rx_bufsize == 0) ?
>+                   NETDEV_DPDK_DEFAULT_RX_BUFSIZE :
>+                   info.min_rx_bufsize;
>+    } else {
>+        buf_size = NETDEV_DPDK_DEFAULT_RX_BUFSIZE;
>+    }
>+
>+    if(len % buf_size) {
>+        len = buf_size * ((len/buf_size) + 1);
>+    }

I think this looks better with the ROUND_UP macro.

>+
>+    return len;
>+}
>+
> /* XXX: use dpdk malloc for entire OVS. in fact huge page should be used
>  * for all other segments data, bss and text. */
> 
>@@ -280,26 +321,65 @@ free_dpdk_buf(struct dp_packet *p)
> }
> 
> static void
>-__rte_pktmbuf_init(struct rte_mempool *mp,
>-                   void *opaque_arg OVS_UNUSED,
>-                   void *_m,
>-                   unsigned i OVS_UNUSED)
>+ovs_rte_pktmbuf_pool_init(struct rte_mempool *mp, void *opaque_arg)
> {
>-    struct rte_mbuf *m = _m;
>-    uint32_t buf_len = mp->elt_size - sizeof(struct dp_packet);
>+    struct rte_pktmbuf_pool_private *user_mbp_priv, *mbp_priv;
>+    struct rte_pktmbuf_pool_private default_mbp_priv;
>+    uint16_t roomsz;
> 
>     RTE_MBUF_ASSERT(mp->elt_size >= sizeof(struct dp_packet));
> 
>+    /* if no structure is provided, assume no mbuf private area */
>+
>+    user_mbp_priv = opaque_arg;
>+    if (user_mbp_priv == NULL) {
>+        default_mbp_priv.mbuf_priv_size = 0;
>+        if (mp->elt_size > sizeof(struct dp_packet)) {
>+            roomsz = mp->elt_size - sizeof(struct dp_packet);
>+        } else {
>+            roomsz = 0;
>+        }
>+        default_mbp_priv.mbuf_data_room_size = roomsz;
>+        user_mbp_priv = &default_mbp_priv;
>+    }
>+
>+    RTE_MBUF_ASSERT(mp->elt_size >= sizeof(struct dp_packet) +
>+        user_mbp_priv->mbuf_data_room_size +
>+        user_mbp_priv->mbuf_priv_size);
>+
>+    mbp_priv = rte_mempool_get_priv(mp);
>+    memcpy(mbp_priv, user_mbp_priv, sizeof(*mbp_priv));
>+}
>+
>+/* Initialise some fields in the mbuf structure that are not modified by
>the
>+ * user once created (origin pool, buffer start address, etc.*/
>+static void
>+__ovs_rte_pktmbuf_init(struct rte_mempool *mp,
>+                       void *opaque_arg OVS_UNUSED,
>+                       void *_m,
>+                       unsigned i OVS_UNUSED)
>+{
>+    struct rte_mbuf *m = _m;
>+    uint32_t buf_size, buf_len, priv_size;
>+
>+    priv_size = rte_pktmbuf_priv_size(mp);
>+    buf_size = sizeof(struct dp_packet) + priv_size;
>+    buf_len = rte_pktmbuf_data_room_size(mp);
>+
>+    RTE_MBUF_ASSERT(RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) ==
>priv_size);
>+    RTE_MBUF_ASSERT(mp->elt_size >= buf_size);
>+    RTE_MBUF_ASSERT(buf_len <= UINT16_MAX);
>+
>     memset(m, 0, mp->elt_size);
> 
>-    /* start of buffer is just after mbuf structure */
>-    m->buf_addr = (char *)m + sizeof(struct dp_packet);
>-    m->buf_physaddr = rte_mempool_virt2phy(mp, m) +
>-                    sizeof(struct dp_packet);
>+    /* start of buffer is after dp_packet structure and priv data */
>+    m->priv_size = priv_size;
>+    m->buf_addr = (char *)m + buf_size;
>+    m->buf_physaddr = rte_mempool_virt2phy(mp, m) + buf_size;
>     m->buf_len = (uint16_t)buf_len;
> 
>     /* keep some headroom between start of buffer and data */
>-    m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, m->buf_len);
>+    m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m->buf_len);
> 
>     /* init some constant fields */
>     m->pool = mp;
>@@ -315,7 +395,7 @@ ovs_rte_pktmbuf_init(struct rte_mempool *mp,
> {
>     struct rte_mbuf *m = _m;
> 
>-    __rte_pktmbuf_init(mp, opaque_arg, _m, i);
>+    __ovs_rte_pktmbuf_init(mp, opaque_arg, m, i);
> 
>     dp_packet_init_dpdk((struct dp_packet *) m, m->buf_len);
> }
>@@ -326,6 +406,7 @@ dpdk_mp_get(int socket_id, int mtu)
>OVS_REQUIRES(dpdk_mutex)
>     struct dpdk_mp *dmp = NULL;
>     char mp_name[RTE_MEMPOOL_NAMESIZE];
>     unsigned mp_size;
>+    struct rte_pktmbuf_pool_private mbp_priv;
> 
>     LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
>         if (dmp->socket_id == socket_id && dmp->mtu == mtu) {
>@@ -338,6 +419,8 @@ dpdk_mp_get(int socket_id, int mtu)
>OVS_REQUIRES(dpdk_mutex)
>     dmp->socket_id = socket_id;
>     dmp->mtu = mtu;
>     dmp->refcount = 1;
>+    mbp_priv.mbuf_data_room_size = MTU_TO_FRAME_LEN(mtu) +
>RTE_PKTMBUF_HEADROOM;
>+    mbp_priv.mbuf_priv_size = 0;
> 
>     mp_size = MAX_NB_MBUF;
>     do {
>@@ -346,10 +429,10 @@ dpdk_mp_get(int socket_id, int mtu)
>OVS_REQUIRES(dpdk_mutex)
>             return NULL;
>         }
> 
>-        dmp->mp = rte_mempool_create(mp_name, mp_size, MBUF_SIZE(mtu),
>+        dmp->mp = rte_mempool_create(mp_name, mp_size,
>MBUF_SEGMENT_SIZE(mtu),
>                                      MP_CACHE_SZ,
>                                      sizeof(struct
>rte_pktmbuf_pool_private),
>-                                     rte_pktmbuf_pool_init, NULL,
>+                                     ovs_rte_pktmbuf_pool_init,
>&mbp_priv,
>                                      ovs_rte_pktmbuf_init, NULL,
>                                      socket_id, 0);
>     } while (!dmp->mp && rte_errno == ENOMEM && (mp_size /= 2) >=
>MIN_NB_MBUF);

Ok, this is quite intricated. I believe the reason that OVS used its own
__rte_pktmbuf_init() was that it wasn't possible to have custom metadata
in mbufs before DPDK 2.1.

If I'm not mistaken, with DPDK commit b507905ff407 there's a way to do that
without copying any code from DPDK with the following incremental (from
master)

---8<---

@@ -278,42 +272,12 @@ free_dpdk_buf(struct dp_packet *p)
 }
 
 static void
-__rte_pktmbuf_init(struct rte_mempool *mp,
-                   void *opaque_arg OVS_UNUSED,
-                   void *_m,
-                   unsigned i OVS_UNUSED)
-{
-    struct rte_mbuf *m = _m;
-    uint32_t buf_len = mp->elt_size - sizeof(struct dp_packet);
-
-    RTE_MBUF_ASSERT(mp->elt_size >= sizeof(struct dp_packet));
-
-    memset(m, 0, mp->elt_size);
-
-    /* start of buffer is just after mbuf structure */
-    m->buf_addr = (char *)m + sizeof(struct dp_packet);
-    m->buf_physaddr = rte_mempool_virt2phy(mp, m) +
-                    sizeof(struct dp_packet);
-    m->buf_len = (uint16_t)buf_len;
-
-    /* keep some headroom between start of buffer and data */
-    m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, m->buf_len);
-
-    /* init some constant fields */
-    m->pool = mp;
-    m->nb_segs = 1;
-    m->port = 0xff;
-}
-
-static void
-ovs_rte_pktmbuf_init(struct rte_mempool *mp,
-                     void *opaque_arg OVS_UNUSED,
-                     void *_m,
-                     unsigned i OVS_UNUSED)
+ovs_rte_pktmbuf_init(struct rte_mempool *mp, void *opaque_arg,
+                     void *_m, unsigned i)
 {
     struct rte_mbuf *m = _m;
 
-    __rte_pktmbuf_init(mp, opaque_arg, _m, i);
+    rte_pktmbuf_init(mp, opaque_arg, _m, i);
 
     dp_packet_init_dpdk((struct dp_packet *) m, m->buf_len);
 }
@@ -339,15 +303,21 @@ dpdk_mp_get(int socket_id, int mtu)
OVS_REQUIRES(dpdk_mutex)
 
     mp_size = MAX_NB_MBUF;
     do {
+        struct rte_pktmbuf_pool_private mbuf_sizes;
+
         if (snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, "ovs_mp_%d_%d_%u",
                      dmp->mtu, dmp->socket_id, mp_size) < 0) {
             return NULL;
         }
 
+        mbuf_sizes.mbuf_priv_size = sizeof (struct dp_packet)
+                                    - sizeof (struct rte_mbuf);
+        mbuf_sizes.mbuf_data_room_size = MBUF_SIZE(mtu)
+                                         - sizeof(struct dp_packet);
         dmp->mp = rte_mempool_create(mp_name, mp_size, MBUF_SIZE(mtu),
                                      MP_CACHE_SZ,
                                      sizeof(struct
rte_pktmbuf_pool_private),
-                                     rte_pktmbuf_pool_init, NULL,
+                                     rte_pktmbuf_pool_init, &mbuf_sizes,
                                      ovs_rte_pktmbuf_init, NULL,
                                      socket_id, 0);
     } while (!dmp->mp && rte_errno == ENOMEM && (mp_size /= 2) >=
MIN_NB_MBUF);

---8<---

I think this will make the patch cleaner. Do you think this will work with
the increased MTU as well?

>@@ -433,6 +516,7 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
>n_rxq, int n_txq)
> {
>     int diag = 0;
>     int i;
>+    struct rte_eth_conf conf = port_conf;
> 
>     /* A device may report more queues than it makes available (this has
>      * been observed for Intel xl710, which reserves some of them for
>@@ -444,7 +528,12 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev,
>int n_rxq, int n_txq)
>             VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq,
>n_txq);
>         }
> 
>-        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq,
>&port_conf);
>+        if(OVS_UNLIKELY(dev->mtu > ETHER_MTU)) {
>+            conf.rxmode.jumbo_frame = NETDEV_DPDK_JUMBO_FRAME_ENABLED;
>+            conf.rxmode.max_rx_pkt_len = MTU_TO_FRAME_LEN(dev->mtu);
>+        }
>+
>+        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
>         if (diag) {
>             break;
>         }
>@@ -586,6 +675,7 @@ netdev_dpdk_init(struct netdev *netdev_, unsigned int
>port_no,
>     struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
>     int sid;
>     int err = 0;
>+    uint32_t buf_size;
> 
>     ovs_mutex_init(&netdev->mutex);
>     ovs_mutex_lock(&netdev->mutex);
>@@ -605,10 +695,16 @@ netdev_dpdk_init(struct netdev *netdev_, unsigned
>int port_no,
>     netdev->port_id = port_no;
>     netdev->type = type;
>     netdev->flags = 0;
>+
>+    /* Initialize port's MTU and frame len to the default Ethernet
>values.
>+     * Larger, user-specified (jumbo) frame buffers are accommodated in
>+     * netdev_dpdk_set_config.
>+     */
>+    netdev->max_packet_len = ETHER_MAX_LEN;
>     netdev->mtu = ETHER_MTU;
>-    netdev->max_packet_len = MTU_TO_MAX_LEN(netdev->mtu);
> 
>-    netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu);
>+    buf_size = dpdk_buf_size(netdev, ETHER_MAX_LEN);
>+    netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id,
>FRAME_LEN_TO_MTU(buf_size));
>     if (!netdev->dpdk_mp) {
>         err = ENOMEM;
>         goto unlock;
>@@ -651,6 +747,27 @@ dpdk_dev_parse_name(const char dev_name[], const
>char prefix[],
>     return 0;
> }
> 
>+static void
>+dpdk_dev_parse_mtu(const struct smap *args, int *mtu)
>+{
>+    const char *mtu_str = smap_get(args, "dpdk-mtu");
>+    char *end_ptr = NULL;
>+    int local_mtu;
>+
>+    if(mtu_str) {
>+        local_mtu = strtoul(mtu_str, &end_ptr, 0);
>+    }
>+    if(!mtu_str || local_mtu < ETHER_MTU ||
>+                   local_mtu >
>FRAME_LEN_TO_MTU(NETDEV_DPDK_MAX_FRAME_LEN) ||
>+                   *end_ptr != '\0') {
>+        local_mtu = ETHER_MTU;
>+        VLOG_WARN("Invalid or missing dpdk-mtu parameter - defaulting to
>%d.\n",
>+                   local_mtu);
>+    }
>+
>+    *mtu = local_mtu;
>+}
>+
> static int
> vhost_construct_helper(struct netdev *netdev_) OVS_REQUIRES(dpdk_mutex)
> {
>@@ -777,11 +894,77 @@ netdev_dpdk_get_config(const struct netdev
>*netdev_, struct smap *args)
>     smap_add_format(args, "configured_rx_queues", "%d", netdev_->n_rxq);
>     smap_add_format(args, "requested_tx_queues", "%d", netdev_->n_txq);
>     smap_add_format(args, "configured_tx_queues", "%d", dev->real_n_txq);
>+    smap_add_format(args, "mtu", "%d", dev->mtu);
>     ovs_mutex_unlock(&dev->mutex);
> 
>     return 0;
> }
> 
>+/* Set the mtu of DPDK_DEV_ETH ports */
>+static int
>+netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
>+{
>+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>+    int old_mtu, err;
>+    uint32_t buf_size;
>+    int dpdk_mtu;
>+    struct dpdk_mp *old_mp;
>+    struct dpdk_mp *mp;
>+
>+    ovs_mutex_lock(&dpdk_mutex);
>+    ovs_mutex_lock(&dev->mutex);
>+    if (dev->mtu == mtu) {
>+        err = 0;
>+        goto out;
>+    }
>+
>+    buf_size = dpdk_buf_size(dev, MTU_TO_FRAME_LEN(mtu));
>+    dpdk_mtu = FRAME_LEN_TO_MTU(buf_size);
>+
>+    mp = dpdk_mp_get(dev->socket_id, dpdk_mtu);
>+    if (!mp) {
>+        err = ENOMEM;
>+        goto out;
>+    }
>+
>+    rte_eth_dev_stop(dev->port_id);
>+
>+    old_mtu = dev->mtu;
>+    old_mp = dev->dpdk_mp;
>+    dev->dpdk_mp = mp;
>+    dev->mtu = mtu;
>+    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
>+
>+    err = dpdk_eth_dev_init(dev);
>+    if (err) {
>+        VLOG_WARN("Unable to set MTU '%d' for '%s'; reverting to last
>known "
>+                  "good value '%d'\n", mtu, dev->up.name, old_mtu);
>+        dpdk_mp_put(mp);
>+        dev->mtu = old_mtu;
>+        dev->dpdk_mp = old_mp;
>+        dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
>+        dpdk_eth_dev_init(dev);
>+        goto out;
>+    } else {
>+        dpdk_mp_put(old_mp);
>+        netdev_change_seq_changed(netdev);
>+    }
>+out:
>+    ovs_mutex_unlock(&dev->mutex);
>+    ovs_mutex_unlock(&dpdk_mutex);
>+    return err;
>+}
>+
>+static int
>+netdev_dpdk_set_config(struct netdev *netdev_, const struct smap *args)
>+{
>+    int mtu;
>+
>+    dpdk_dev_parse_mtu(args, &mtu);
>+
>+    return netdev_dpdk_set_mtu(netdev_, mtu);
>+}
>+
> static int
> netdev_dpdk_get_numa_id(const struct netdev *netdev_)
> {
>@@ -1358,54 +1541,6 @@ netdev_dpdk_get_mtu(const struct netdev *netdev,
>int *mtup)
> 
>     return 0;
> }
>-
>-static int
>-netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
>-{
>-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>-    int old_mtu, err;
>-    struct dpdk_mp *old_mp;
>-    struct dpdk_mp *mp;
>-
>-    ovs_mutex_lock(&dpdk_mutex);
>-    ovs_mutex_lock(&dev->mutex);
>-    if (dev->mtu == mtu) {
>-        err = 0;
>-        goto out;
>-    }
>-
>-    mp = dpdk_mp_get(dev->socket_id, dev->mtu);
>-    if (!mp) {
>-        err = ENOMEM;
>-        goto out;
>-    }
>-
>-    rte_eth_dev_stop(dev->port_id);
>-
>-    old_mtu = dev->mtu;
>-    old_mp = dev->dpdk_mp;
>-    dev->dpdk_mp = mp;
>-    dev->mtu = mtu;
>-    dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
>-
>-    err = dpdk_eth_dev_init(dev);
>-    if (err) {
>-        dpdk_mp_put(mp);
>-        dev->mtu = old_mtu;
>-        dev->dpdk_mp = old_mp;
>-        dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
>-        dpdk_eth_dev_init(dev);
>-        goto out;
>-    }
>-
>-    dpdk_mp_put(old_mp);
>-    netdev_change_seq_changed(netdev);
>-out:
>-    ovs_mutex_unlock(&dev->mutex);
>-    ovs_mutex_unlock(&dpdk_mutex);
>-    return err;
>-}
>-
> static int
> netdev_dpdk_get_carrier(const struct netdev *netdev_, bool *carrier);
> 
>@@ -1682,7 +1817,7 @@ netdev_dpdk_get_status(const struct netdev
>*netdev_, struct smap *args)
>     smap_add_format(args, "numa_id", "%d",
>rte_eth_dev_socket_id(dev->port_id));
>     smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
>     smap_add_format(args, "min_rx_bufsize", "%u",
>dev_info.min_rx_bufsize);
>-    smap_add_format(args, "max_rx_pktlen", "%u", dev_info.max_rx_pktlen);
>+    smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
>     smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
>     smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
>     smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
>@@ -1904,6 +2039,51 @@ dpdk_vhost_user_class_init(void)
>     return 0;
> }
> 
>+/* Set the mtu of DPDK_DEV_VHOST ports */
>+static int
>+netdev_dpdk_vhost_set_mtu(const struct netdev *netdev, int mtu)
>+{
>+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>+    int err = 0;
>+    struct dpdk_mp *old_mp;
>+    struct dpdk_mp *mp;
>+
>+    ovs_mutex_lock(&dpdk_mutex);
>+    ovs_mutex_lock(&dev->mutex);
>+    if (dev->mtu == mtu) {
>+        err = 0;
>+        goto out;
>+    }
>+
>+    mp = dpdk_mp_get(dev->socket_id, mtu);
>+    if (!mp) {
>+        err = ENOMEM;
>+        goto out;
>+    }
>+
>+    old_mp = dev->dpdk_mp;
>+    dev->dpdk_mp = mp;
>+    dev->mtu = mtu;
>+    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
>+
>+    dpdk_mp_put(old_mp);
>+    netdev_change_seq_changed(netdev);
>+out:
>+    ovs_mutex_unlock(&dev->mutex);
>+    ovs_mutex_unlock(&dpdk_mutex);
>+    return err;
>+}
>+
>+static int
>+netdev_dpdk_vhost_set_config(struct netdev *netdev_, const struct smap
>*args)
>+{
>+    int mtu;
>+
>+    dpdk_dev_parse_mtu(args, &mtu);
>+
>+    return netdev_dpdk_vhost_set_mtu(netdev_, mtu);
>+}
>+
> static void
> dpdk_common_init(void)
> {
>@@ -2040,8 +2220,9 @@ unlock_dpdk:
>     return err;
> }
> 
>-#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, MULTIQ, SEND,
>\
>-    GET_CARRIER, GET_STATS, GET_FEATURES, GET_STATUS, RXQ_RECV)
>\
>+#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, SET_CONFIG, \
>+        MULTIQ, SEND, SET_MTU, GET_CARRIER, GET_STATS, GET_FEATURES,   \
>+        GET_STATUS, RXQ_RECV)                                          \
> {                                                             \
>     NAME,                                                     \
>     INIT,                       /* init */                    \
>@@ -2053,7 +2234,7 @@ unlock_dpdk:
>     DESTRUCT,                                                 \
>     netdev_dpdk_dealloc,                                      \
>     netdev_dpdk_get_config,                                   \
>-    NULL,                       /* netdev_dpdk_set_config */  \
>+    SET_CONFIG,                                               \
>     NULL,                       /* get_tunnel_config */       \
>     NULL,                       /* build header */            \
>     NULL,                       /* push header */             \
>@@ -2067,7 +2248,7 @@ unlock_dpdk:
>     netdev_dpdk_set_etheraddr,                                \
>     netdev_dpdk_get_etheraddr,                                \
>     netdev_dpdk_get_mtu,                                      \
>-    netdev_dpdk_set_mtu,                                      \
>+    SET_MTU,                                                  \
>     netdev_dpdk_get_ifindex,                                  \
>     GET_CARRIER,                                              \
>     netdev_dpdk_get_carrier_resets,                           \
>@@ -2213,8 +2394,10 @@ static const struct netdev_class dpdk_class =
>         NULL,
>         netdev_dpdk_construct,
>         netdev_dpdk_destruct,
>+        netdev_dpdk_set_config,
>         netdev_dpdk_set_multiq,
>         netdev_dpdk_eth_send,
>+        netdev_dpdk_set_mtu,
>         netdev_dpdk_get_carrier,
>         netdev_dpdk_get_stats,
>         netdev_dpdk_get_features,
>@@ -2227,8 +2410,10 @@ static const struct netdev_class dpdk_ring_class =
>         NULL,
>         netdev_dpdk_ring_construct,
>         netdev_dpdk_destruct,
>+        netdev_dpdk_set_config,
>         netdev_dpdk_set_multiq,
>         netdev_dpdk_ring_send,
>+        netdev_dpdk_set_mtu,
>         netdev_dpdk_get_carrier,
>         netdev_dpdk_get_stats,
>         netdev_dpdk_get_features,
>@@ -2241,8 +2426,10 @@ static const struct netdev_class OVS_UNUSED
>dpdk_vhost_cuse_class =
>         dpdk_vhost_cuse_class_init,
>         netdev_dpdk_vhost_cuse_construct,
>         netdev_dpdk_vhost_destruct,
>+        NULL,
>         netdev_dpdk_vhost_set_multiq,
>         netdev_dpdk_vhost_send,
>+        NULL,
>         netdev_dpdk_vhost_get_carrier,
>         netdev_dpdk_vhost_get_stats,
>         NULL,
>@@ -2255,8 +2442,10 @@ static const struct netdev_class OVS_UNUSED
>dpdk_vhost_user_class =
>         dpdk_vhost_user_class_init,
>         netdev_dpdk_vhost_user_construct,
>         netdev_dpdk_vhost_destruct,
>+        netdev_dpdk_vhost_set_config,
>         netdev_dpdk_vhost_set_multiq,
>         netdev_dpdk_vhost_send,
>+        netdev_dpdk_vhost_set_mtu,
>         netdev_dpdk_vhost_get_carrier,
>         netdev_dpdk_vhost_get_stats,
>         NULL,
>-- 
>1.9.3
>
>_______________________________________________
>dev mailing list
>dev@openvswitch.org
>http://openvswitch.org/mailman/listinfo/dev

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to