Hi Ian,

I haven't looked at every detail nor tested the patch yet,
I just wanted to give some early feedback.

Thanks,

Daniele

On 24/02/2016 06:17, "dev on behalf of Ian Stokes"
<dev-boun...@openvswitch.org on behalf of ian.sto...@intel.com> wrote:

>This patch provides the modifications required in netdev-dpdk.c and
>vswitch.xml to enable ingress policing for DPDK interfaces.
>
>This patch implements the necessary netdev functions to netdev-dpdk.c as
>well as various helper functions required for ingress policing.
>
>The vswitch.xml has been modified to explain the expected parameters and
>behaviour when using ingress policing.
>
>The INSTALL.DPDK.md guide has been modified to provide an example
>configuration of ingress policing.
>
>Signed-off-by: Ian Stokes <ian.sto...@intel.com>
>---
> INSTALL.DPDK.md      |   19 +++++
> NEWS                 |    1 +
> lib/netdev-dpdk.c    |  187
>++++++++++++++++++++++++++++++++++++++++++++++++--
> vswitchd/vswitch.xml |   24 ++++---
> 4 files changed, 217 insertions(+), 14 deletions(-)
>
>diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
>index ca49106..c42cc8a 100644
>--- a/INSTALL.DPDK.md
>+++ b/INSTALL.DPDK.md
>@@ -207,6 +207,25 @@ Using the DPDK with ovs-vswitchd:
>    ./ovs-ofctl add-flow br0 in_port=2,action=output:1
>    ```
> 
>+8. Ingress Policing Example
>+
>+   Assuming you have a vhost-user port receiving traffic consisting of
>+   packets of size 64 bytes, the following command would limit the
>reception
>+   rate of the port to ~1,000,000 packets per second:
>+
>+   `ovs-vsctl set interface vhost-user0 ingress_policing_rate=46000000
>+    ingress_policing_burst=4096`
>+
>+   To examine the ingress policer configuration of the port:
>+
>+   `ovs-vsctl list interface vhost-user0`
>+
>+   To clear the ingress policer configuration from the port use the
>following:
>+
>+   `ovs-vsctl set interface vhost-user0 ingress_policing_rate=0`
>+
>+   For more details regarding ingress-policer see the vswitch.xml.
>+
> Performance Tuning:
> -------------------
> 
>diff --git a/NEWS b/NEWS
>index 57a250e..fb67f86 100644
>--- a/NEWS
>+++ b/NEWS
>@@ -18,6 +18,7 @@ Post-v2.5.0
>      * New appctl command 'dpif-netdev/pmd-rxq-show' to check the
>port/rxq
>        assignment.
>      * Type of log messages from PMD threads changed from INFO to DBG.
>+     * Add ingress policing functionality.
>    - ovs-benchmark: This utility has been removed due to lack of use and
>      bitrot.
>    - ovs-appctl:
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 71034a0..faf3583 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -53,6 +53,7 @@
> 
> #include "rte_config.h"
> #include "rte_mbuf.h"
>+#include "rte_meter.h"
> #include "rte_virtio_net.h"
> 
> VLOG_DEFINE_THIS_MODULE(dpdk);
>@@ -193,6 +194,11 @@ struct dpdk_ring {
>     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
> };
> 
>+struct ingress_policer {
>+    struct rte_meter_srtcm_params app_srtcm_params;
>+    struct rte_meter_srtcm in_policer;
>+};
>+
> struct netdev_dpdk {
>     struct netdev up;
>     int port_id;
>@@ -231,6 +237,13 @@ struct netdev_dpdk {
>     /* Identifier used to distinguish vhost devices from each other */
>     char vhost_id[PATH_MAX];
> 
>+    /* Ingress Policer */
>+    rte_spinlock_t policer_lock;
>+    struct ingress_policer *ingress_policer;
>+

I would prefer not to have a lock at this level.

I think it would make more sense to make the ingress_policer
pointer RCU protected and embed the spinlock into
struct ingress_policer, to protect the token bucket.


>+    uint32_t policer_rate;
>+    uint32_t policer_burst;

Why do we need to keep these parameter here?
Aren't these values kept in rte_meter_srtcm_params?

>+
>     /* In dpdk_list. */
>     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
> };
>@@ -617,6 +630,11 @@ netdev_dpdk_init(struct netdev *netdev_, unsigned
>int port_no,
>     netdev_->requested_n_rxq = NR_QUEUE;
>     netdev->real_n_txq = NR_QUEUE;
> 
>+    rte_spinlock_init(&netdev->policer_lock);
>+    netdev->ingress_policer = NULL;
>+    netdev->policer_rate = 0;
>+    netdev->policer_burst = 0;
>+
>     if (type == DPDK_DEV_ETH) {
>         netdev_dpdk_alloc_txq(netdev, NR_QUEUE);
>         err = dpdk_eth_dev_init(netdev);
>@@ -1012,12 +1030,14 @@ is_vhost_running(struct virtio_net *dev)
> 
> static inline void
> netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
>-                                     struct dp_packet **packets, int
>count)
>+                                     struct dp_packet **packets, int
>count,
>+                                                                       int 
>dropped)
> {
>     int i;
>     struct dp_packet *packet;
> 
>     stats->rx_packets += count;
>+    stats->rx_dropped += dropped;
>     for (i = 0; i < count; i++) {
>         packet = packets[i];
> 
>@@ -1039,6 +1059,48 @@ netdev_dpdk_vhost_update_rx_counters(struct
>netdev_stats *stats,
>     }
> }
> 
>+static inline int
>+policer_pkt_handle__(struct rte_meter_srtcm *meter,
>+                            struct rte_mbuf *pkt, uint64_t time)
>+{
>+    uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct
>ether_hdr);
>+
>+    /* color input is not used for blind modes */
>+    if (rte_meter_srtcm_color_blind_check(meter, time, pkt_len) !=
>e_RTE_METER_GREEN) {
>+        return 1;
>+    }
>+
>+    return 0;
>+}

Maybe you thought about that, but I think this function is identical
to egress_policer_pkt_handle__().  I think it's worth renaming that and
sharing the implementation.

>+
>+static int
>+policer_run(struct netdev *netdev_, struct rte_mbuf **pkts,
>+                        int pkt_cnt)
>+{
>+    int i = 0;
>+    int cnt = 0;
>+    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
>+    struct rte_mbuf *pkt = NULL;
>+    uint64_t current_time = rte_rdtsc();
>+
>+    for (i = 0; i < pkt_cnt; i++) {
>+        pkt = pkts[i];
>+        /* Handle current packet */
>+        if (policer_pkt_handle__(&netdev->ingress_policer->in_policer,
>pkt,
>+                                        current_time) != 1) {
>+            if (cnt != i) {
>+                pkts[cnt] = pkt;
>+            }
>+            cnt++;
>+        }
>+        else {
>+            rte_pktmbuf_free(pkt);
>+        }
>+    }
>+
>+    return cnt;
>+}

Even this is very similar to egress_policer_run(), so it probably
makes sense to share the implementation.

I guess this patch is based on a tree where there was no egress policing,
so maybe you though about this already

>+
> /*
>  * The receive path for the vhost port is the TX path out from guest.
>  */
>@@ -1052,6 +1114,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq_,
>     struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(vhost_dev);
>     int qid = rxq_->queue_id;
>     uint16_t nb_rx = 0;
>+    uint16_t dropped = 0;
> 
>     if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {
>         return EAGAIN;
>@@ -1069,8 +1132,16 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq_,
>         return EAGAIN;
>     }
> 
>+    rte_spinlock_lock(&vhost_dev->policer_lock);
>+    if (vhost_dev->ingress_policer != NULL) {
>+        dropped = nb_rx;
>+        nb_rx = policer_run(netdev, (struct rte_mbuf **)packets, nb_rx);
>+        dropped -= nb_rx;
>+    }
>+    rte_spinlock_unlock(&vhost_dev->policer_lock);
>+
>     rte_spinlock_lock(&vhost_dev->stats_lock);
>-    netdev_dpdk_vhost_update_rx_counters(&vhost_dev->stats, packets,
>nb_rx);
>+    netdev_dpdk_vhost_update_rx_counters(&vhost_dev->stats, packets,
>nb_rx, dropped);
>     rte_spinlock_unlock(&vhost_dev->stats_lock);
> 
>     *c = (int) nb_rx;
>@@ -1085,6 +1156,7 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_,
>struct dp_packet **packets,
>     struct netdev *netdev = rx->up.netdev;
>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>     int nb_rx;
>+    int dropped = 0;
> 
>     /* There is only one tx queue for this core.  Do not flush other
>      * queues.
>@@ -1102,6 +1174,21 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_,
>struct dp_packet **packets,
>         return EAGAIN;
>     }

I would really prefer to avoid taking unnecessary locks
in the fast path.  If we use RCU for the ingress_policer
pointer, like we can avoid that.

I hope that this way we can minimize the overhead for
devices with policing not enabled.

> 
>+    rte_spinlock_lock(&dev->policer_lock);
>+    if (dev->ingress_policer != NULL) {
>+        dropped = nb_rx;
>+        nb_rx = policer_run(netdev, (struct rte_mbuf **) packets, nb_rx);
>+        dropped -= nb_rx;
>+    }
>+    rte_spinlock_unlock(&dev->policer_lock);
>+
>+    /* Update stats to reflect dropped packets */
>+    if (OVS_UNLIKELY(dropped)) {
>+        rte_spinlock_lock(&dev->stats_lock);
>+        dev->stats.rx_dropped += dropped;
>+        rte_spinlock_unlock(&dev->stats_lock);
>+    }
>+
>     *c = nb_rx;
> 
>     return 0;
>@@ -1502,12 +1589,12 @@ netdev_dpdk_vhost_get_stats(const struct netdev
>*netdev,
>     stats->tx_fifo_errors = UINT64_MAX;
>     stats->tx_heartbeat_errors = UINT64_MAX;
>     stats->tx_window_errors = UINT64_MAX;
>-    stats->rx_dropped += UINT64_MAX;
> 
>     rte_spinlock_lock(&dev->stats_lock);
>     /* Supported Stats */
>     stats->rx_packets += dev->stats.rx_packets;
>     stats->tx_packets += dev->stats.tx_packets;
>+    stats->rx_dropped += dev->stats.rx_dropped;
>     stats->tx_dropped += dev->stats.tx_dropped;
>     stats->multicast = dev->stats.multicast;
>     stats->rx_bytes = dev->stats.rx_bytes;
>@@ -1545,11 +1632,12 @@ netdev_dpdk_get_stats(const struct netdev
>*netdev, struct netdev_stats *stats)
> 
>     rte_spinlock_lock(&dev->stats_lock);
>     stats->tx_dropped = dev->stats.tx_dropped;
>+    stats->rx_dropped = dev->stats.rx_dropped;
>     rte_spinlock_unlock(&dev->stats_lock);
> 
>     /* These are the available DPDK counters for packets not received
>due to
>      * local resource constraints in DPDK and NIC respectively. */
>-    stats->rx_dropped = rte_stats.rx_nombuf + rte_stats.imissed;
>+    stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
>     stats->collisions = UINT64_MAX;
> 
>     stats->rx_length_errors = UINT64_MAX;
>@@ -1617,6 +1705,95 @@ netdev_dpdk_get_features(const struct netdev
>*netdev_,
> }
> 
> static int
>+netdev_dpdk_policer_construct__(struct netdev *netdev_, uint32_t rate,
>+                                uint32_t burst)
>+{
>+    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
>+    struct ingress_policer *policer;
>+    int err = 0;
>+
>+    rte_spinlock_lock(&netdev->policer_lock);
>+    policer = xmalloc(sizeof *policer);
>+    netdev->ingress_policer = policer;
>+
>+    policer->app_srtcm_params.cir = rate;
>+    policer->app_srtcm_params.cbs = burst;
>+    policer->app_srtcm_params.ebs = 0;
>+    err = rte_meter_srtcm_config(&policer->in_policer,
>+                                    &policer->app_srtcm_params);
>+    rte_spinlock_unlock(&netdev->policer_lock);
>+
>+    return err;
>+}
>+
>+static int
>+netdev_dpdk_policer_destruct__(struct netdev *netdev_)
>+{
>+      struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
>+      if (netdev->ingress_policer == NULL)    {
>+              return 0;
>+      }
>+      else {
>+        rte_spinlock_lock(&netdev->policer_lock);
>+        free(netdev->ingress_policer);
>+        netdev->ingress_policer = NULL;
>+          netdev->policer_rate = 0;
>+        netdev->policer_burst = 0;
>+        rte_spinlock_unlock(&netdev->policer_lock);
>+    }
>+      return 0;
>+}
>+
>+static int
>+netdev_dpdk_set_policing(struct netdev* netdev_, uint32_t policer_rate,
>+                            uint32_t policer_burst)
>+{
>+      struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
>+    int err = 0;
>+
>+    policer_burst = (!policer_rate ? 0      /* Force to 0 if no rate
>specified. */
>+                    : !policer_burst ? 4096 /* Default to 4096 bytes if
>0. */

netdev-linux has a much higher default value.
Any reason you choose this? Just curious

>+                    : policer_burst);       /* Stick with user-specified
>value. */
>+
>+    ovs_mutex_lock(&netdev->mutex);
>+
>+      if (netdev->policer_rate == policer_rate &&
>+        netdev->policer_burst == policer_burst) {
>+              /* Assume that settings haven't changed since we last set them. 
>*/
>+          ovs_mutex_unlock(&netdev->mutex);
>+              return err;
>+      }
>+
>+    /* Destroy any existing ingress policer for the device if one exists
>+     * and the policer_rate and policer_burst are being set to 0
>+     */
>+    if ((netdev->ingress_policer != NULL) &&
>+        (policer_rate == 0) && (policer_burst == 0)) {
>+
>+        /* User has set rate and burst to 0, destroy the policer */
>+              err = netdev_dpdk_policer_destruct__(netdev_);
>+          ovs_mutex_unlock(&netdev->mutex);
>+        return err;
>+    }
>+
>+    /* Destroy existing policer */
>+    if (netdev->ingress_policer != NULL) {
>+        err = netdev_dpdk_policer_destruct__(netdev_);
>+    }
>+
>+    /* Add an ingress policer */
>+    err = netdev_dpdk_policer_construct__(netdev_, policer_rate,
>policer_burst);
>+
>+    /* Update policer_rate and policer_burst for the netdev */
>+    netdev->policer_rate = policer_rate;
>+    netdev->policer_burst = policer_burst;
>+
>+    ovs_mutex_unlock(&netdev->mutex);
>+
>+      return err;
>+}
>+
>+static int
> netdev_dpdk_get_ifindex(const struct netdev *netdev)
> {
>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>@@ -2193,7 +2370,7 @@ unlock_dpdk:
>     GET_FEATURES,                                             \
>     NULL,                       /* set_advertisements */      \
>                                                               \
>-    NULL,                       /* set_policing */            \
>+    netdev_dpdk_set_policing,   /* set_policing */            \
>     NULL,                       /* get_qos_types */           \
>     NULL,                       /* get_qos_capabilities */    \
>     NULL,                       /* get_qos */                 \
>diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
>index c2ec914..4ef9de9 100644
>--- a/vswitchd/vswitch.xml
>+++ b/vswitchd/vswitch.xml
>@@ -2384,8 +2384,8 @@
>         table="Queue"/> tables).
>       </p>
>       <p>
>-        Policing is currently implemented only on Linux.  The Linux
>-        implementation uses a simple ``token bucket'' approach:
>+        Policing is implemented on Linux and DPDK interfaces. Both
>+        implementations use a simple ``token bucket'' approach:
>       </p>
>       <ul>
>         <li>
>@@ -2422,17 +2422,23 @@
>       </p>
>       <column name="ingress_policing_rate">
>         <p>

I don't see why we need to use a different unit with DPDK devices.

Also, the comments in netdev-provider.h and netdev.c for set_policing()
clearly specify kbits, so those are not honored by this implementation.


>-          Maximum rate for data received on this interface, in kbps.
>Data
>-          received faster than this rate is dropped.  Set to
><code>0</code>
>-          (the default) to disable policing.
>+          Maximum rate for data received on this interface. The metric
>used
>+          for Linux and DPDK interfaces differ. Linux interfaces expect
>the
>+          value to be specified in kbps. DPDK interfaces expect the
>value to
>+          be specified in bytes per second. Data received faster than
>this
>+          rate is dropped. Set to <code>0</code>(the default) to disable
>+          policing for both interface types.
>         </p>
>       </column>
> 
>       <column name="ingress_policing_burst">
>-        <p>Maximum burst size for data received on this interface, in
>kb.  The
>-        default burst size if set to <code>0</code> is 1000 kb.  This
>value
>-        has no effect if <ref column="ingress_policing_rate"/>
>-        is <code>0</code>.</p>
>+        <p>Maximum burst size for data received on this interface. The
>metric
>+        used for Linux and DPDK interfaces differ. Linux interfaces
>expect a
>+        value specified in kb. DPDK interfaces expect a value specified
>in
>+        bytes. The default burst size if set to <code>0</code> is 1000
>kb for
>+        Linux interfaces and 4096 bytes for DPDK. This value has no
>effect if
>+        <ref column="ingress_policing_rate"/> is <code>0</code> for
>either
>+        interface type.</p>
>         <p>
>           Specifying a larger burst size lets the algorithm be more
>forgiving,
>           which is important for protocols like TCP that react severely
>to
>-- 
>1.7.4.1
>
>_______________________________________________
>dev mailing list
>dev@openvswitch.org
>http://openvswitch.org/mailman/listinfo/dev

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to