The user-space datapath (dpif-netdev) consists of a first-level "exact match 
cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With many 
parallel packet flows (e.g. TCP connections) the EMC becomes inefficient, and 
the OVS forwarding performance is determined by the megaflow classifier.
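
For context, here is a minimal standalone sketch of the EMC idea under the
5-tuple simplification used above (the names, table size, and flow_id stand-in
are illustrative assumptions; the real EMC in lib/dpif-netdev.c keys on the
packet's full flow key and is organized differently):

#include <stdbool.h>
#include <stdint.h>

#define EMC_SKETCH_ENTRIES 8192   /* Illustrative size, not the OVS value. */

struct five_tuple {
    uint32_t src_ip, dst_ip;
    uint16_t src_port, dst_port;
    uint8_t proto;
};

struct emc_sketch_entry {
    bool occupied;
    struct five_tuple key;
    int flow_id;              /* Stand-in for the cached flow pointer. */
};

/* Exact-match lookup: hash the 5-tuple to one slot and compare every field;
 * any mismatch falls through to the megaflow classifier. */
static int
emc_sketch_lookup(const struct emc_sketch_entry table[],
                  const struct five_tuple *key, uint32_t hash)
{
    const struct emc_sketch_entry *e = &table[hash % EMC_SKETCH_ENTRIES];

    if (e->occupied
        && e->key.src_ip == key->src_ip && e->key.dst_ip == key->dst_ip
        && e->key.src_port == key->src_port
        && e->key.dst_port == key->dst_port
        && e->key.proto == key->proto) {
        return e->flow_id;
    }
    return -1;                /* EMC miss. */
}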

The megaflow classifier (dpcls) consists of a variable number of hash tables 
(aka subtables), each containing megaflow entries with the same mask of packet 
header and metadata fields to match upon. A dpcls lookup matches a given packet 
against all subtables in sequence until it hits a match. As megaflow cache 
entries are by construction non-overlapping, the first match is the only match.
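
For illustration, a minimal standalone sketch of this lookup scheme (the
structures are simplified stand-ins; the real dpcls uses one cmap hash table
per subtable and a pvector to iterate them, as in the diff below):

#include <stdint.h>
#include <stdio.h>

/* One subtable: all entries share the same mask. The hash table is reduced
 * to a single stored masked key for brevity. */
struct subtable {
    uint64_t mask;
    uint64_t entry;
};

/* Sequential dpcls-style lookup: mask the key with each subtable's mask and
 * probe that subtable until the first hit. Megaflow entries are
 * non-overlapping, so the first match is the only match. */
static int
sketch_dpcls_lookup(const struct subtable *tables, int n, uint64_t key)
{
    for (int i = 0; i < n; i++) {
        if ((key & tables[i].mask) == tables[i].entry) {
            return i;
        }
    }
    return -1;  /* Miss: no subtable matched. */
}

int
main(void)
{
    struct subtable tables[] = {
        { .mask = 0xff00, .entry = 0x1200 },  /* Hits e.g. key 0x12ab. */
        { .mask = 0x00ff, .entry = 0x0034 },  /* Hits e.g. key 0xcd34. */
    };

    printf("key 0xcd34 hits subtable %d\n",
           sketch_dpcls_lookup(tables, 2, 0xcd34));  /* Prints 1. */
    return 0;
}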

Today the order of the subtables in the dpcls is essentially random, so that on 
average a dpcls lookup has to visit N/2 subtables for a hit, where N is the 
total number of subtables. Even though every single hash-table lookup is fast, 
the performance of the current dpcls degrades when there are many subtables.

How the patch addresses this issue:

In reality there is often a strong correlation between the ingress port and a 
small subset of subtables that have hits. The entire megaflow cache typically 
decomposes nicely into partitions that are hit only by packets entering from a 
range of similar ports (e.g. traffic from Phy -> VM vs. traffic from VM -> 
Phy).

Therefore, keeping a separate list of subtables per ingress port, sorted by 
frequency of hits, reduces the average number of subtable lookups in the dpcls 
to a minimum, even if the total number of subtables gets large.

The patch introduces 32 subtable vectors per dpcls and hashes the ingress port 
to select the subtable vector. It also counts hits in 32 slots per vector 
(hashing the subtable pointer to obtain the slot) and re-sorts each vector by 
hit frequency every second.
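
A standalone sketch of this mechanism (toy_hash, the array layout, and the
selection sort are illustrative assumptions; the patch itself uses OVS's
hash_port_no() and hash_pointer() and re-prioritizes a pvector instead of
sorting an array, see dpcls_try_optimize() in the diff below):

#include <stdint.h>
#include <string.h>

#define NUM_SUBTABLE_VECTORS 32
#define NUM_SUBTABLE_SLOTS 32

/* Toy integer hash, standing in for hash_port_no()/hash_pointer(). */
static uint32_t
toy_hash(uint32_t x)
{
    x ^= x >> 16;
    x *= 0x45d9f3b;
    x ^= x >> 16;
    return x;
}

struct sketch_cls {
    /* vec[v][i]: the i'th subtable id to probe for vector v, kept sorted by
     * descending hit count after every optimization round. */
    int vec[NUM_SUBTABLE_VECTORS][NUM_SUBTABLE_SLOTS];
    int n_subtables;
    uint64_t hit_cnt[NUM_SUBTABLE_VECTORS][NUM_SUBTABLE_SLOTS];
};

/* The ingress port selects the vector to search. */
static uint32_t
vector_idx(uint32_t port_no)
{
    return toy_hash(port_no) % NUM_SUBTABLE_VECTORS;
}

/* On a dpcls hit, bump the counter slot of this (port, subtable) pair. */
static void
count_hit(struct sketch_cls *cls, uint32_t port_no, int subtable_id)
{
    uint32_t slot = toy_hash(subtable_id) % NUM_SUBTABLE_SLOTS;

    cls->hit_cnt[vector_idx(port_no)][slot]++;
}

/* Once per second, re-sort one vector by hit count (descending) and reset
 * its counters, analogous to dpcls_try_optimize() in the patch. */
static void
optimize_vector(struct sketch_cls *cls, uint32_t v)
{
    for (int i = 0; i < cls->n_subtables; i++) {
        int best = i;
        for (int j = i + 1; j < cls->n_subtables; j++) {
            uint32_t sj = toy_hash(cls->vec[v][j]) % NUM_SUBTABLE_SLOTS;
            uint32_t sb = toy_hash(cls->vec[v][best]) % NUM_SUBTABLE_SLOTS;
            if (cls->hit_cnt[v][sj] > cls->hit_cnt[v][sb]) {
                best = j;
            }
        }
        int tmp = cls->vec[v][i];
        cls->vec[v][i] = cls->vec[v][best];
        cls->vec[v][best] = tmp;
    }
    memset(cls->hit_cnt[v], 0, sizeof cls->hit_cnt[v]);
}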

To monitor the effectiveness of the patch, we have enhanced the ovs-appctl 
dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups 
per hit" to report the average number of subtable lookups needed for a megaflow 
match. Ideally, this should be close to 1 and much smaller than N/2.
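
With the patch applied, the enhanced per-PMD output contains the new line,
e.g. (the counter values here are purely illustrative; the line format follows
the format string added in the diff below):

    emc hits:0
    megaflow hits:13614869
    avg. subtable lookups per hit: 1.25
    miss:0
    lost:0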

I have benchmarked a cloud L3 overlay pipeline with a VXLAN overlay mesh. With 
pure L3 tenant traffic between VMs on different nodes the resulting netdev 
dpcls contains N=4 subtables.

With the EMC disabled, I have measured a baseline performance (in+out) of 
~1.32 Mpps (64-byte packets, 1000 L4 flows). The average number of subtable 
lookups per dpcls match is 2.5.

With the patch, the average number of subtable lookups per dpcls match goes 
down to 1.25 (apparently two ports of different nature still hash to the same 
vector; otherwise it should be exactly one). Even so, the forwarding 
performance grows by ~30% to 1.72 Mpps.

As the number of subtables will often be higher in reality, we can assume that 
this is at the lower end of the speed-up one can expect from this optimization. 
Just running a parallel ping between the VXLAN tunnel endpoints increases the 
number of subtables, and hence the average number of subtable lookups, from 2.5 
to 3.5, with a corresponding decrease in throughput to 1.14 Mpps. With the 
patch, the parallel ping has no impact on the average number of subtable 
lookups or on performance. The performance gain is then ~50%.

Signed-off-by: Jan Scheurich <jan.scheur...@ericsson.com>

---

 lib/dpif-netdev.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 92 insertions(+), 18 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 8d39d9e..c491110 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -149,9 +149,15 @@ struct emc_cache {
 
 /* Simple non-wildcarding single-priority classifier. */

+#define NUM_SUBTABLE_VECTORS 32
+#define NUM_SUBTABLE_SLOTS 32
+#define DPCLS_OPTIMIZATION_INTERVAL 1000  /* Time in ms between re-sorts. */
+
 struct dpcls {
     struct cmap subtables_map;
-    struct pvector subtables;
+    struct pvector subtables[NUM_SUBTABLE_VECTORS];
+    uint64_t hit_cnt[NUM_SUBTABLE_VECTORS][NUM_SUBTABLE_SLOTS];
+    long long int next_optimization;
 };

 /* A rule to be inserted to the classifier. */
@@ -164,12 +170,14 @@ struct dpcls_rule {

 static void dpcls_init(struct dpcls *);
 static void dpcls_destroy(struct dpcls *);
+static void dpcls_try_optimize(struct dpcls *);
 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
                          const struct netdev_flow_key *mask);
 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
-static bool dpcls_lookup(const struct dpcls *cls,
+static bool dpcls_lookup(struct dpcls *cls,
                          const struct netdev_flow_key keys[],
-                         struct dpcls_rule **rules, size_t cnt);
+                         struct dpcls_rule **rules, size_t cnt,
+                         const odp_port_t port_no, int *num_lookups_p);
 
 /* Datapath based on the network device interface from netdev.h.
  *
@@ -239,6 +247,7 @@ enum dp_stat_type {
     DP_STAT_MASKED_HIT,         /* Packets that matched in the flow table. */
     DP_STAT_MISS,               /* Packets that did not match. */
     DP_STAT_LOST,               /* Packets not passed up to the client. */
+    DP_STAT_LOOKUP_HIT,         /* Number of subtable lookups for flow table hits. */
     DP_N_STATS
 };

@@ -657,8 +666,10 @@ pmd_info_show_stats(struct ds *reply,

     ds_put_format(reply,
                   "\temc hits:%llu\n\tmegaflow hits:%llu\n"
+                  "\tavg. subtable lookups per hit:%5.2f\n"
                   "\tmiss:%llu\n\tlost:%llu\n",
                   stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
+                  stats[DP_STAT_MASKED_HIT] > 0 ? (1.0 * stats[DP_STAT_LOOKUP_HIT]) / stats[DP_STAT_MASKED_HIT] : 0,
                   stats[DP_STAT_MISS], stats[DP_STAT_LOST]);

     if (total_cycles == 0) {
@@ -1855,13 +1866,13 @@ emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
 }

 static struct dp_netdev_flow *
-dp_netdev_pmd_lookup_flow(const struct dp_netdev_pmd_thread *pmd,
+dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
                           const struct netdev_flow_key *key)
 {
     struct dp_netdev_flow *netdev_flow;
     struct dpcls_rule *rule;

-    dpcls_lookup(&pmd->cls, key, &rule, 1);
+    dpcls_lookup(&pmd->cls, key, &rule, 1, 0, NULL);  /* Use 0 as dummy in_port. */
     netdev_flow = dp_netdev_flow_cast(rule);

     return netdev_flow;
@@ -2874,6 +2885,7 @@ reload:

             emc_cache_slow_sweep(&pmd->flow_cache);
             coverage_try_clear();
+            dpcls_try_optimize(&pmd->cls);
             ovsrcu_quiesce();

             atomic_read_relaxed(&pmd->change_seq, &seq);
@@ -3814,7 +3826,8 @@ static inline void
 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet_batch *packets_,
                      struct netdev_flow_key *keys,
-                     struct packet_batch_per_flow batches[], size_t *n_batches)
+                     struct packet_batch_per_flow batches[], size_t *n_batches,
+                     odp_port_t port_no)
 {
     int cnt = packets_->count;
 #if !defined(__CHECKER__) && !defined(_WIN32)
@@ -3827,7 +3840,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
     struct dpcls_rule *rules[PKT_ARRAY_SIZE];
     struct dp_netdev *dp = pmd->dp;
     struct emc_cache *flow_cache = &pmd->flow_cache;
-    int miss_cnt = 0, lost_cnt = 0;
+    int miss_cnt = 0, lost_cnt = 0, lookup_cnt = 0;
     bool any_miss;
     size_t i;

@@ -3835,7 +3848,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
         /* Key length is needed in all the cases, hash computed on demand. */
         keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
     }
-    any_miss = !dpcls_lookup(&pmd->cls, keys, rules, cnt);
+    any_miss = !dpcls_lookup(&pmd->cls, keys, rules, cnt, port_no, &lookup_cnt);
     if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
         uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
         struct ofpbuf actions, put_actions;
@@ -3893,6 +3906,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
     }

     dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
+    dp_netdev_count_packet(pmd, DP_STAT_LOOKUP_HIT, lookup_cnt);
     dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
     dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
 }
@@ -3925,7 +3939,7 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
                             md_is_valid, port_no);
     if (OVS_UNLIKELY(newcnt)) {
         packets->count = newcnt;
-        fast_path_processing(pmd, packets, keys, batches, &n_batches);
+        fast_path_processing(pmd, packets, keys, batches, &n_batches, port_no);
     }

     for (i = 0; i < n_batches; i++) {
@@ -4354,14 +4368,24 @@ struct dpcls_subtable {
 static void
 dpcls_init(struct dpcls *cls)
 {
+    int i;
+
     cmap_init(&cls->subtables_map);
-    pvector_init(&cls->subtables);
+    for (i = 0; i < NUM_SUBTABLE_VECTORS; i++) {
+        pvector_init(&cls->subtables[i]);
+    }
+    memset(cls->hit_cnt, 0, sizeof cls->hit_cnt);
+    cls->next_optimization = time_msec() + DPCLS_OPTIMIZATION_INTERVAL;
 }

 static void
 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
 {
-    pvector_remove(&cls->subtables, subtable);
+    int i;
+
+    for (i = 0; i < NUM_SUBTABLE_VECTORS; i++) {
+        pvector_remove(&cls->subtables[i], subtable);
+    }
     cmap_remove(&cls->subtables_map, &subtable->cmap_node,
                 subtable->mask.hash);
     cmap_destroy(&subtable->rules);
@@ -4376,13 +4400,16 @@ dpcls_destroy(struct dpcls *cls)
 {
     if (cls) {
         struct dpcls_subtable *subtable;
+        int i;

         CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
             ovs_assert(cmap_count(&subtable->rules) == 0);
             dpcls_destroy_subtable(cls, subtable);
         }
         cmap_destroy(&cls->subtables_map);
-        pvector_destroy(&cls->subtables);
+        for (i = 0; i < NUM_SUBTABLE_VECTORS; i++) {
+            pvector_destroy(&cls->subtables[i]);
+        }
     }
 }

@@ -4390,6 +4417,7 @@ static struct dpcls_subtable *
 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
 {
     struct dpcls_subtable *subtable;
+    int i;

     /* Need to add one. */
     subtable = xmalloc(sizeof *subtable
@@ -4397,8 +4425,11 @@ dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
     cmap_init(&subtable->rules);
     netdev_flow_key_clone(&subtable->mask, mask);
     cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
-    pvector_insert(&cls->subtables, subtable, 0);
-    pvector_publish(&cls->subtables);
+    for (i = 0; i < NUM_SUBTABLE_VECTORS; i++) {
+        /* Insert new subtable with priority zero (no hits yet). */
+        pvector_insert(&cls->subtables[i], subtable, 0);
+        pvector_publish(&cls->subtables[i]);
+    }

     return subtable;
 }
@@ -4417,6 +4448,33 @@ dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
     return dpcls_create_subtable(cls, mask);
 }

+
+/* Periodically sort the subtable vectors according to hit counts. */
+static inline void
+dpcls_try_optimize(struct dpcls *cls)
+{
+    long long int now = time_msec();
+
+    if (now > cls->next_optimization) {
+        struct dpcls_subtable *subtable;
+        int port_idx, subtable_idx;
+
+        for (port_idx = 0; port_idx < NUM_SUBTABLE_VECTORS; port_idx++) {
+            struct pvector *pvec = &cls->subtables[port_idx];
+            PVECTOR_FOR_EACH (subtable, pvec) {
+                subtable_idx = hash_pointer(subtable, 210365) % NUM_SUBTABLE_SLOTS;
+                pvector_change_priority(pvec, subtable,
+                        cls->hit_cnt[port_idx][subtable_idx]);
+            }
+            pvector_publish(pvec);
+        }
+
+        /* Start a new measurement interval. */
+        memset(cls->hit_cnt, 0, sizeof cls->hit_cnt);
+        cls->next_optimization = now + DPCLS_OPTIMIZATION_INTERVAL;
+    }
+}
+
 /* Insert 'rule' into 'cls'. */
 static void
 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
@@ -4433,6 +4491,7 @@ static void
 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
 {
     struct dpcls_subtable *subtable;
+    int i;

     ovs_assert(rule->mask);

@@ -4441,7 +4500,9 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
     if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
         == 0) {
         dpcls_destroy_subtable(cls, subtable);
-        pvector_publish(&cls->subtables);
+        for (i = 0; i < NUM_SUBTABLE_VECTORS; i++) {
+            pvector_publish(&cls->subtables[i]);
+        }
     }
 }

@@ -4474,8 +4535,9 @@ dpcls_rule_matches_key(const struct dpcls_rule *rule,
  *
  * Returns true if all flows found a corresponding rule. */
 static bool
-dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
-             struct dpcls_rule **rules, const size_t cnt)
+dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
+             struct dpcls_rule **rules, const size_t cnt,
+             const odp_port_t port_no, int *num_lookups_p)
 {
     /* The batch size 16 was experimentally found faster than 8 or 32. */
     typedef uint16_t map_type;
@@ -4495,7 +4557,12 @@ dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
     }
     memset(rules, 0, cnt * sizeof *rules);

-    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
+    /* Select the subtable vector for the in_port. */
+    uint32_t port_idx = hash_port_no(port_no) % NUM_SUBTABLE_VECTORS;
+    uint64_t *hit_cnt = cls->hit_cnt[port_idx];
+    int lookups_match = 0, subtable_pos = 1;
+
+    PVECTOR_FOR_EACH (subtable, &cls->subtables[port_idx]) {
         const struct netdev_flow_key *mkeys = keys;
         struct dpcls_rule **mrules = rules;
         map_type remains = 0;
@@ -4527,6 +4594,10 @@ dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
                 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
                     if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
                         mrules[i] = rule;
+                        /* Update the subtable hit statistics for the in_port vector. */
+                        int subtable_idx = hash_pointer(subtable, 210365) % NUM_SUBTABLE_SLOTS;
+                        hit_cnt[subtable_idx]++;
+                        lookups_match += subtable_pos;
                         goto next;
                     }
                 }
@@ -4538,8 +4609,11 @@ dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
             remains |= maps[m];
         }
         if (!remains) {
+            if (num_lookups_p) *num_lookups_p = lookups_match;
             return true;              /* All found. */
         }
+        subtable_pos++;
     }
+    if (num_lookups_p) *num_lookups_p = lookups_match;
     return false;                     /* Some misses. */
 }
