Keep kernel flow stats for each NUMA node rather than for each
(logical) CPU.  This nearly eliminates the kernel-side OVS locking
overhead that otherwise shows up at the top of the perf report, and
improves TCP_CRR test scores by roughly 50% (with 4 handler and 2
revalidator threads, and a TCP port flow that forces all connection
setups to userspace).  My limited testing showed no performance
regression for tests where only two wildcarded kernel flows are used.

Signed-off-by: Jarno Rajahalme <jrajaha...@nicira.com>
---
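For reference, a minimal userspace sketch (not part of the patch) of
the per-node stats layout used here: one cache-line-aligned struct
flow_stats per NUMA node, over-allocated and aligned by hand much like
the kzalloc() + L1_CACHE_ALIGN() combination in ovs_flow_alloc(), and
indexed by node id the way numa_node_id() is used in
ovs_flow_stats_update().  NUM_NODES, CACHE_LINE and the fixed node id
are illustrative placeholders for ovs_numa_nodes, SMP_CACHE_BYTES and
numa_node_id().

  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>

  #define NUM_NODES  2              /* stand-in for ovs_numa_nodes */
  #define CACHE_LINE 64             /* stand-in for SMP_CACHE_BYTES */

  struct flow_stats {
          uint64_t packet_count;
          uint64_t byte_count;
  };

  int main(void)
  {
          /* Over-allocate so the array can be aligned to a cache line,
           * mirroring the kzalloc() + L1_CACHE_ALIGN() trick above. */
          char *mem = calloc(1, sizeof(struct flow_stats) * NUM_NODES
                                + CACHE_LINE - 1);
          struct flow_stats *numa_stats;
          int node = 0;             /* numa_node_id() in the kernel */

          if (!mem)
                  return 1;

          numa_stats = (struct flow_stats *)
                  (((uintptr_t)mem + CACHE_LINE - 1)
                   & ~(uintptr_t)(CACHE_LINE - 1));

          /* Update the stats slot for the local node only. */
          numa_stats[node].packet_count++;
          numa_stats[node].byte_count += 1500;

          printf("node %d: %ju packets, %ju bytes\n", node,
                 (uintmax_t)numa_stats[node].packet_count,
                 (uintmax_t)numa_stats[node].byte_count);

          free(mem);                /* free the original, unaligned pointer */
          return 0;
  }
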
 datapath/flow.c       |   71 +++++++++++++++----------------------------------
 datapath/flow.h       |    4 +--
 datapath/flow_table.c |   41 ++++++++++++++++++----------
 datapath/flow_table.h |    2 ++
 4 files changed, 52 insertions(+), 66 deletions(-)

diff --git a/datapath/flow.c b/datapath/flow.c
index b1b2596..0002759 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -67,10 +67,10 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
        struct flow_stats *stats;
        __be16 tcp_flags = 0;
 
-       if (!flow->stats.is_percpu)
+       if (!flow->stats.per_numa_mem)
                stats = flow->stats.stat;
        else
-               stats = this_cpu_ptr(flow->stats.cpu_stats);
+               stats = &flow->stats.numa_stats[numa_node_id()];
 
        if ((flow->key.eth.type == htons(ETH_P_IP) ||
             flow->key.eth.type == htons(ETH_P_IPV6)) &&
@@ -79,23 +79,20 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
                tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
        }
 
-       spin_lock(&stats->lock);
+       spin_lock_bh(&stats->lock);
        stats->used = jiffies;
        stats->packet_count++;
        stats->byte_count += skb->len;
        stats->tcp_flags |= tcp_flags;
-       spin_unlock(&stats->lock);
+       spin_unlock_bh(&stats->lock);
 }
 
-static void stats_read(struct flow_stats *stats, bool lock_bh,
+static void stats_read(struct flow_stats *stats,
                       struct ovs_flow_stats *ovs_stats,
                       unsigned long *used, __be16 *tcp_flags)
 {
        if (stats->packet_count) {
-               if (lock_bh)
-                       spin_lock_bh(&stats->lock);
-               else
-                       spin_lock(&stats->lock);
+               spin_lock_bh(&stats->lock);
 
                if (time_after(stats->used, *used))
                        *used = stats->used;
@@ -103,73 +100,47 @@ static void stats_read(struct flow_stats *stats, bool lock_bh,
                ovs_stats->n_packets += stats->packet_count;
                ovs_stats->n_bytes += stats->byte_count;
 
-               if (lock_bh)
-                       spin_unlock_bh(&stats->lock);
-               else
-                       spin_unlock(&stats->lock);
+               spin_unlock_bh(&stats->lock);
        }
 }
 
 void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
                        unsigned long *used, __be16 *tcp_flags)
 {
-       int cpu, cur_cpu;
-
        *used = 0;
        *tcp_flags = 0;
        memset(ovs_stats, 0, sizeof(*ovs_stats));
 
-       if (!flow->stats.is_percpu) {
-               stats_read(flow->stats.stat, true, ovs_stats, used, tcp_flags);
+       if (!flow->stats.per_numa_mem) {
+               stats_read(flow->stats.stat, ovs_stats, used, tcp_flags);
        } else {
-               cur_cpu = get_cpu();
-
-               for_each_possible_cpu(cpu) {
-                       struct flow_stats *stats;
-                       bool lock_bh;
-
-                       stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-                       lock_bh = (cpu == cur_cpu);
-                       stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
-               }
-               put_cpu();
+               int node;
+               for (node = 0; node < ovs_numa_nodes; node++)
+                       stats_read(&flow->stats.numa_stats[node],
+                                  ovs_stats, used, tcp_flags);
        }
 }
 
-static void stats_reset(struct flow_stats *stats, bool lock_bh)
+static void stats_reset(struct flow_stats *stats)
 {
-       if (lock_bh)
-               spin_lock_bh(&stats->lock);
-       else
-               spin_lock(&stats->lock);
+       spin_lock_bh(&stats->lock);
 
        stats->used = 0;
        stats->packet_count = 0;
        stats->byte_count = 0;
        stats->tcp_flags = 0;
 
-       if (lock_bh)
-               spin_unlock_bh(&stats->lock);
-       else
-               spin_unlock(&stats->lock);
+       spin_unlock_bh(&stats->lock);
 }
 
 void ovs_flow_stats_clear(struct sw_flow *flow)
 {
-       int cpu, cur_cpu;
-
-       if (!flow->stats.is_percpu) {
-               stats_reset(flow->stats.stat, true);
+       if (!flow->stats.per_numa_mem) {
+               stats_reset(flow->stats.stat);
        } else {
-               cur_cpu = get_cpu();
-
-               for_each_possible_cpu(cpu) {
-                       bool lock_bh;
-
-                       lock_bh = (cpu == cur_cpu);
-                       stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu), lock_bh);
-               }
-               put_cpu();
+               int node;
+               for (node = 0; node < ovs_numa_nodes; node++)
+                       stats_reset(&flow->stats.numa_stats[node]);
        }
 }
 
diff --git a/datapath/flow.h b/datapath/flow.h
index eafcfd8..14ea93f 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -158,10 +158,10 @@ struct flow_stats {
 };
 
 struct sw_flow_stats {
-       bool is_percpu;
+       char *per_numa_mem;             /* NULL for shared stats in 'stat'. */
        union {
                struct flow_stats *stat;
-               struct flow_stats __percpu *cpu_stats;
+               struct flow_stats *numa_stats;
        };
 };
 
diff --git a/datapath/flow_table.c b/datapath/flow_table.c
index 4232b82..ec8e5a4 100644
--- a/datapath/flow_table.c
+++ b/datapath/flow_table.c
@@ -72,10 +72,9 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
                *d++ = *s++ & *m++;
 }
 
-struct sw_flow *ovs_flow_alloc(bool percpu_stats)
+struct sw_flow *ovs_flow_alloc(bool pernumanode_stats)
 {
        struct sw_flow *flow;
-       int cpu;
 
        flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
        if (!flow)
@@ -84,25 +83,28 @@ struct sw_flow *ovs_flow_alloc(bool percpu_stats)
        flow->sf_acts = NULL;
        flow->mask = NULL;
 
-       flow->stats.is_percpu = percpu_stats;
+       if (!pernumanode_stats || !ovs_numa_nodes) {
+               flow->stats.per_numa_mem = NULL;
 
-       if (!percpu_stats) {
                flow->stats.stat = kzalloc(sizeof(*flow->stats.stat), GFP_KERNEL);
                if (!flow->stats.stat)
                        goto err;
 
                spin_lock_init(&flow->stats.stat->lock);
        } else {
-               flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
-               if (!flow->stats.cpu_stats)
-                       goto err;
+               int node;
 
-               for_each_possible_cpu(cpu) {
-                       struct flow_stats *cpu_stats;
+               flow->stats.per_numa_mem
+                       = kzalloc(sizeof(struct flow_stats) * ovs_numa_nodes
+                                 + (SMP_CACHE_BYTES - 1), GFP_KERNEL);
 
-                       cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-                       spin_lock_init(&cpu_stats->lock);
-               }
+               if (!flow->stats.per_numa_mem)
+                       goto err;
+
+               flow->stats.numa_stats
+                       = (struct flow_stats *)L1_CACHE_ALIGN((u64)flow->stats.per_numa_mem);
+               for (node = 0; node < ovs_numa_nodes; node++)
+                       spin_lock_init(&flow->stats.numa_stats[node].lock);
        }
        return flow;
 err:
@@ -141,8 +143,8 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
 static void flow_free(struct sw_flow *flow)
 {
        kfree((struct sf_flow_acts __force *)flow->sf_acts);
-       if (flow->stats.is_percpu)
-               free_percpu(flow->stats.cpu_stats);
+       if (flow->stats.per_numa_mem)
+               kfree(flow->stats.per_numa_mem);
        else
                kfree(flow->stats.stat);
        kmem_cache_free(flow_cache, flow);
@@ -601,10 +603,14 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
        return 0;
 }
 
+int ovs_numa_nodes = 0;
+
 /* Initializes the flow module.
  * Returns zero if successful or a negative error code. */
 int ovs_flow_init(void)
 {
+       int node;
+
        BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
        BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
 
@@ -613,6 +619,13 @@ int ovs_flow_init(void)
        if (flow_cache == NULL)
                return -ENOMEM;
 
+       /* Count the NUMA nodes. */
+       for_each_node_with_cpus(node) {
+               if (node >= ovs_numa_nodes)
+                       ovs_numa_nodes = node + 1;
+       }
+       pr_info("ovs_flow_init: ovs_numa_nodes: %d.\n", ovs_numa_nodes);
+
        return 0;
 }
 
diff --git a/datapath/flow_table.h b/datapath/flow_table.h
index 1996e34..3ff1077 100644
--- a/datapath/flow_table.h
+++ b/datapath/flow_table.h
@@ -52,6 +52,8 @@ struct flow_table {
        unsigned int count;
 };
 
+extern int ovs_numa_nodes;
+
 int ovs_flow_init(void);
 void ovs_flow_exit(void);
 
-- 
1.7.10.4
