Keep kernel flow stats for each NUMA node rather than for each (logical) CPU. This nearly eliminates the kernel-side OVS locking overhead that would otherwise sit at the top of the perf report, and improves TCP_CRR test scores by roughly 50% (with 4 handler and 2 revalidator threads, and a TCP port flow forcing all connection setups to userspace).

My limited testing revealed no performance regression for tests where only two wildcarded kernel flows are used.
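To illustrate the core of the change: on the packet path the flow no longer dereferences a per-CPU stats pointer, but instead indexes a small per-flow array by the current NUMA node, and both the update and the read side take that node's spinlock with bottom halves disabled. Below is a minimal kernel-style sketch of the update path; the flow_stats field names are the ones used by the diff that follows, but the field types and the stats_update() helper are illustrative stand-ins, not part of the patch:

#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/topology.h>	/* numa_node_id() */
#include <linux/types.h>

struct flow_stats {
	u64 packet_count;	/* Number of packets matched. */
	u64 byte_count;		/* Number of bytes matched. */
	unsigned long used;	/* Last used time (in jiffies). */
	spinlock_t lock;	/* Lock protecting the fields above. */
	__be16 tcp_flags;	/* Union of seen TCP flags. */
};

/* Sketch of the new update path: 'numa_stats' holds one flow_stats
 * slot per NUMA node, so only CPUs on the same node contend for the
 * same lock. */
static void stats_update(struct flow_stats *numa_stats, struct sk_buff *skb)
{
	struct flow_stats *stats = &numa_stats[numa_node_id()];

	/* Taking the lock with BHs disabled everywhere lets the
	 * process-context read side use the very same locking, which
	 * is what removes the old lock_bh special-casing. */
	spin_lock_bh(&stats->lock);
	stats->used = jiffies;
	stats->packet_count++;
	stats->byte_count += skb->len;
	spin_unlock_bh(&stats->lock);
}

Reading the stats correspondingly becomes a loop over ovs_numa_nodes array slots instead of a walk over every possible CPU.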
Signed-off-by: Jarno Rajahalme <jrajaha...@nicira.com>
---
 datapath/flow.c       |   71 +++++++++++++++----------------
 datapath/flow.h       |    4 +--
 datapath/flow_table.c |   41 ++++++++++++++++++----------
 datapath/flow_table.h |    2 ++
 4 files changed, 52 insertions(+), 66 deletions(-)

diff --git a/datapath/flow.c b/datapath/flow.c
index b1b2596..0002759 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -67,10 +67,10 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
 	struct flow_stats *stats;
 	__be16 tcp_flags = 0;
 
-	if (!flow->stats.is_percpu)
+	if (!flow->stats.per_numa_mem)
 		stats = flow->stats.stat;
 	else
-		stats = this_cpu_ptr(flow->stats.cpu_stats);
+		stats = &flow->stats.numa_stats[numa_node_id()];
 
 	if ((flow->key.eth.type == htons(ETH_P_IP) ||
 	     flow->key.eth.type == htons(ETH_P_IPV6)) &&
@@ -79,23 +79,20 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
 		tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
 	}
 
-	spin_lock(&stats->lock);
+	spin_lock_bh(&stats->lock);
 	stats->used = jiffies;
 	stats->packet_count++;
 	stats->byte_count += skb->len;
 	stats->tcp_flags |= tcp_flags;
-	spin_unlock(&stats->lock);
+	spin_unlock_bh(&stats->lock);
 }
 
-static void stats_read(struct flow_stats *stats, bool lock_bh,
+static void stats_read(struct flow_stats *stats,
 		       struct ovs_flow_stats *ovs_stats,
 		       unsigned long *used, __be16 *tcp_flags)
 {
 	if (stats->packet_count) {
-		if (lock_bh)
-			spin_lock_bh(&stats->lock);
-		else
-			spin_lock(&stats->lock);
+		spin_lock_bh(&stats->lock);
 		if (time_after(stats->used, *used))
 			*used = stats->used;
 
@@ -103,73 +100,47 @@ static void stats_read(struct flow_stats *stats, bool lock_bh,
 		ovs_stats->n_packets += stats->packet_count;
 		ovs_stats->n_bytes += stats->byte_count;
 
-		if (lock_bh)
-			spin_unlock_bh(&stats->lock);
-		else
-			spin_unlock(&stats->lock);
+		spin_unlock_bh(&stats->lock);
 	}
 }
 
 void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
 			unsigned long *used, __be16 *tcp_flags)
 {
-	int cpu, cur_cpu;
-
 	*used = 0;
 	*tcp_flags = 0;
 	memset(ovs_stats, 0, sizeof(*ovs_stats));
 
-	if (!flow->stats.is_percpu) {
-		stats_read(flow->stats.stat, true, ovs_stats, used, tcp_flags);
+	if (!flow->stats.per_numa_mem) {
+		stats_read(flow->stats.stat, ovs_stats, used, tcp_flags);
 	} else {
-		cur_cpu = get_cpu();
-
-		for_each_possible_cpu(cpu) {
-			struct flow_stats *stats;
-			bool lock_bh;
-
-			stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-			lock_bh = (cpu == cur_cpu);
-			stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
-		}
-		put_cpu();
+		int node;
+		for (node = 0; node < ovs_numa_nodes; node++)
+			stats_read(&flow->stats.numa_stats[node],
+				   ovs_stats, used, tcp_flags);
 	}
 }
 
-static void stats_reset(struct flow_stats *stats, bool lock_bh)
+static void stats_reset(struct flow_stats *stats)
 {
-	if (lock_bh)
-		spin_lock_bh(&stats->lock);
-	else
-		spin_lock(&stats->lock);
+	spin_lock_bh(&stats->lock);
 	stats->used = 0;
 	stats->packet_count = 0;
 	stats->byte_count = 0;
 	stats->tcp_flags = 0;
-	if (lock_bh)
-		spin_unlock_bh(&stats->lock);
-	else
-		spin_unlock(&stats->lock);
+	spin_unlock_bh(&stats->lock);
 }
 
 void ovs_flow_stats_clear(struct sw_flow *flow)
 {
-	int cpu, cur_cpu;
-
-	if (!flow->stats.is_percpu) {
-		stats_reset(flow->stats.stat, true);
+	if (!flow->stats.per_numa_mem) {
+		stats_reset(flow->stats.stat);
 	} else {
-		cur_cpu = get_cpu();
-
-		for_each_possible_cpu(cpu) {
-			bool lock_bh;
-
-			lock_bh = (cpu == cur_cpu);
-			stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu), lock_bh);
-		}
-		put_cpu();
+		int node;
+		for (node = 0; node < ovs_numa_nodes; node++)
+			stats_reset(&flow->stats.numa_stats[node]);
 	}
 }
diff --git a/datapath/flow.h b/datapath/flow.h
index eafcfd8..14ea93f 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -158,10 +158,10 @@ struct flow_stats {
 };
 
 struct sw_flow_stats {
-	bool is_percpu;
+	char *per_numa_mem;	/* NULL for shared stats on 'stat'. */
 	union {
 		struct flow_stats *stat;
-		struct flow_stats __percpu *cpu_stats;
+		struct flow_stats *numa_stats;
 	};
 };
 
diff --git a/datapath/flow_table.c b/datapath/flow_table.c
index 4232b82..ec8e5a4 100644
--- a/datapath/flow_table.c
+++ b/datapath/flow_table.c
@@ -72,10 +72,9 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
 		*d++ = *s++ & *m++;
 }
 
-struct sw_flow *ovs_flow_alloc(bool percpu_stats)
+struct sw_flow *ovs_flow_alloc(bool pernumanode_stats)
 {
 	struct sw_flow *flow;
-	int cpu;
 
 	flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
 	if (!flow)
@@ -84,25 +83,28 @@ struct sw_flow *ovs_flow_alloc(bool percpu_stats)
 	flow->sf_acts = NULL;
 	flow->mask = NULL;
 
-	flow->stats.is_percpu = percpu_stats;
+	if (!pernumanode_stats || !ovs_numa_nodes) {
+		flow->stats.per_numa_mem = NULL;
 
-	if (!percpu_stats) {
 		flow->stats.stat = kzalloc(sizeof(*flow->stats.stat), GFP_KERNEL);
 		if (!flow->stats.stat)
 			goto err;
 
 		spin_lock_init(&flow->stats.stat->lock);
 	} else {
-		flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
-		if (!flow->stats.cpu_stats)
-			goto err;
+		int node;
 
-		for_each_possible_cpu(cpu) {
-			struct flow_stats *cpu_stats;
+		flow->stats.per_numa_mem
+			= kzalloc(sizeof(struct flow_stats) * ovs_numa_nodes
+				  + (SMP_CACHE_BYTES - 1), GFP_KERNEL);
 
-			cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-			spin_lock_init(&cpu_stats->lock);
-		}
+		if (!flow->stats.per_numa_mem)
+			goto err;
+
+		flow->stats.numa_stats = (struct flow_stats *)
+			L1_CACHE_ALIGN((unsigned long)flow->stats.per_numa_mem);
+		for (node = 0; node < ovs_numa_nodes; node++)
+			spin_lock_init(&flow->stats.numa_stats[node].lock);
 	}
 	return flow;
 err:
@@ -141,8 +143,8 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
 static void flow_free(struct sw_flow *flow)
 {
 	kfree((struct sf_flow_acts __force *)flow->sf_acts);
-	if (flow->stats.is_percpu)
-		free_percpu(flow->stats.cpu_stats);
+	if (flow->stats.per_numa_mem)
+		kfree(flow->stats.per_numa_mem);
 	else
 		kfree(flow->stats.stat);
 	kmem_cache_free(flow_cache, flow);
@@ -601,10 +603,14 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
 	return 0;
 }
 
+int ovs_numa_nodes = 0;
+
 /* Initializes the flow module.
 * Returns zero if successful or a negative error code. */
 int ovs_flow_init(void)
 {
+	int node;
+
 	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
 	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
 
@@ -613,6 +619,13 @@ int ovs_flow_init(void)
 	if (flow_cache == NULL)
 		return -ENOMEM;
 
+	/* Count the NUMA nodes. */
+	for_each_node_with_cpus(node) {
+		if (node >= ovs_numa_nodes)
+			ovs_numa_nodes = node + 1;
+	}
+	pr_info("ovs_flow_init: ovs_numa_nodes: %d.\n", ovs_numa_nodes);
+
 	return 0;
 }
 
diff --git a/datapath/flow_table.h b/datapath/flow_table.h
index 1996e34..3ff1077 100644
--- a/datapath/flow_table.h
+++ b/datapath/flow_table.h
@@ -52,6 +52,8 @@ struct flow_table {
 	unsigned int count;
 };
 
+extern int ovs_numa_nodes;
+
 int ovs_flow_init(void);
 void ovs_flow_exit(void);
-- 
1.7.10.4
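A note on the allocation pattern in ovs_flow_alloc() above: kzalloc() does not guarantee cache-line alignment for an arbitrary size, so the patch over-allocates by SMP_CACHE_BYTES - 1 bytes, keeps the raw pointer in per_numa_mem for the eventual kfree(), and points numa_stats at the first cache-line boundary inside the buffer. The same pattern is sketched below with alloc_numa_stats() as a hypothetical helper (the patch open-codes this directly in ovs_flow_alloc()):

#include <linux/cache.h>	/* L1_CACHE_ALIGN, SMP_CACHE_BYTES */
#include <linux/slab.h>		/* kzalloc(), kfree() */

/* Hypothetical helper showing the aligned-allocation pattern.
 * 'per_numa_mem' keeps the address kzalloc() returned so it can be
 * handed back to kfree(); 'numa_stats' is the cache-line-aligned
 * view indexed on the fast path. */
static int alloc_numa_stats(struct sw_flow_stats *stats, int nodes)
{
	stats->per_numa_mem = kzalloc(sizeof(struct flow_stats) * nodes
				      + (SMP_CACHE_BYTES - 1), GFP_KERNEL);
	if (!stats->per_numa_mem)
		return -ENOMEM;

	/* Rounding up to the next L1 cache line cannot overrun the
	 * buffer; the SMP_CACHE_BYTES - 1 slack covers the worst case. */
	stats->numa_stats = (struct flow_stats *)
		L1_CACHE_ALIGN((unsigned long)stats->per_numa_mem);
	return 0;
}

Note that only the start of the array is aligned: if sizeof(struct flow_stats) is not a multiple of the cache-line size, slots of adjacent nodes can still share a cache line.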