On Thu, Feb 6, 2014 at 3:13 PM, Jarno Rajahalme <jrajaha...@nicira.com> wrote:
> Keep kernel flow stats for each NUMA node rather than each (logical)
> CPU. This avoids using the per-CPU allocator and removes most of the
>
> diff --git a/datapath/flow.c b/datapath/flow.c
> index abe6789..e86034e 100644
> --- a/datapath/flow.c
> +++ b/datapath/flow.c
> @@ -64,14 +64,10 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies)
>
>  void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
>  {
> -        struct flow_stats *stats;
> +        int node = numa_node_id();
> +        struct flow_stats *stats = flow->stats[node];
>          __be16 tcp_flags = 0;
>
> -        if (!flow->stats.is_percpu)
> -                stats = flow->stats.stat;
> -        else
> -                stats = this_cpu_ptr(flow->stats.cpu_stats);
> -
>          if ((flow->key.eth.type == htons(ETH_P_IP) ||
>               flow->key.eth.type == htons(ETH_P_IPV6)) &&
>              flow->key.ip.frag != OVS_FRAG_TYPE_LATER &&
> @@ -80,96 +76,126 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
>                  tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
>          }
>
> -        spin_lock(&stats->lock);
> +        /* Check if already have node-specific stats. */
> +        if (likely(stats))
> +                spin_lock(&stats->lock);
> +        else {
> +                stats = flow->stats[0]; /* Pre-allocated. */
> +                spin_lock(&stats->lock);
> +
> +                /* If the current NUMA-node is the only writer on the
> +                 * pre-allocated stats keep using them.
> +                 * A previous locker may have already allocated the stats,
> +                 * so we need to check again.  If the node-specific stats
> +                 * were already allocated, we update the pre-allocated stats
> +                 * as we have already locked them. */
> +                if (likely(stats->last_writer != node && stats->last_writer >= 0
> +                           && !flow->stats[node])) {
> +                        /* Try to allocate node-specific stats. */
> +                        struct flow_stats *new_stats;
> +
> +                        new_stats = kmem_cache_alloc_node(flow_stats_cache,
> +                                                          GFP_THISNODE |
> +                                                          __GFP_NOMEMALLOC,
> +                                                          node);
> +                        if (likely(new_stats)) {
> +                                new_stats->used = jiffies;
> +                                new_stats->packet_count = 1;
> +                                new_stats->byte_count = skb->len;
> +                                new_stats->tcp_flags = tcp_flags;
> +                                new_stats->last_writer = node;
> +                                spin_lock_init(&new_stats->lock);
> +
> +                                flow->stats[node] = new_stats;
> +                                goto unlock; /* Unlock the pre-allocated stats. */
One more thing I noticed: we need a memory barrier when publishing a
pointer into the per-NUMA-node stats array, since another process could
be reading the stats from a different CPU. I sketched what I have in
mind at the end of this mail.

> +                        }
> +                }
> +        }
> +
>          stats->used = jiffies;
>          stats->packet_count++;
>          stats->byte_count += skb->len;
>          stats->tcp_flags |= tcp_flags;
> +        stats->last_writer = node;
> +unlock:
>          spin_unlock(&stats->lock);
>  }
>
> -static void stats_read(struct flow_stats *stats, bool lock_bh,
> -                       struct ovs_flow_stats *ovs_stats,
> -                       unsigned long *used, __be16 *tcp_flags)
> -{
> -        if (lock_bh)
> -                spin_lock_bh(&stats->lock);
> -        else
> -                spin_lock(&stats->lock);
> -
> -        if (time_after(stats->used, *used))
> -                *used = stats->used;
> -        *tcp_flags |= stats->tcp_flags;
> -        ovs_stats->n_packets += stats->packet_count;
> -        ovs_stats->n_bytes += stats->byte_count;
> -
> -        if (lock_bh)
> -                spin_unlock_bh(&stats->lock);
> -        else
> -                spin_unlock(&stats->lock);
> -}
> -
>  void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
>                          unsigned long *used, __be16 *tcp_flags)
>  {
> -        int cpu, cur_cpu;
> +        int node, local_node;
> +        struct flow_stats *stats;
>
> -        *used = 0;
> -        *tcp_flags = 0;
> -        memset(ovs_stats, 0, sizeof(*ovs_stats));
> +        preempt_disable();
> +        local_node = numa_node_id();
>
> -        if (!flow->stats.is_percpu) {
> -                stats_read(flow->stats.stat, true, ovs_stats, used, tcp_flags);
> +        /* Start from the local node. */
> +        stats = flow->stats[local_node];
> +        if (stats && likely(stats->packet_count)) {
> +                spin_lock_bh(&stats->lock);
> +                *used = stats->used;
> +                *tcp_flags = stats->tcp_flags;
> +                ovs_stats->n_packets = stats->packet_count;
> +                ovs_stats->n_bytes = stats->byte_count;
> +                spin_unlock_bh(&stats->lock);
>          } else {
> -                cur_cpu = get_cpu();
> -
> -                for_each_possible_cpu(cpu) {
> -                        struct flow_stats *stats;
> -                        bool lock_bh;
> +                *used = 0;
> +                *tcp_flags = 0;
> +                ovs_stats->n_packets = 0;
> +                ovs_stats->n_bytes = 0;
> +        }
>
> -                        stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
> -                        lock_bh = (cpu == cur_cpu);
> -                        stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
> +        /* Collect stats from other nodes. */
> +        for_each_node(node) {
> +                if (node == local_node)
> +                        continue; /* Done already. */
> +                stats = flow->stats[node];
> +                if (stats && likely(stats->packet_count)) {
> +                        spin_lock(&stats->lock);
> +                        if (time_after(stats->used, *used))
> +                                *used = stats->used;
> +                        *tcp_flags |= stats->tcp_flags;
> +                        ovs_stats->n_packets += stats->packet_count;
> +                        ovs_stats->n_bytes += stats->byte_count;
> +                        spin_unlock(&stats->lock);
>                  }
> -                put_cpu();
>          }
> -}
> -
> -static void stats_reset(struct flow_stats *stats, bool lock_bh)
> -{
> -        if (lock_bh)
> -                spin_lock_bh(&stats->lock);
> -        else
> -                spin_lock(&stats->lock);
> -
> -        stats->used = 0;
> -        stats->packet_count = 0;
> -        stats->byte_count = 0;
> -        stats->tcp_flags = 0;
> -
> -        if (lock_bh)
> -                spin_unlock_bh(&stats->lock);
> -        else
> -                spin_unlock(&stats->lock);
> +        preempt_enable();
>  }
>
>  void ovs_flow_stats_clear(struct sw_flow *flow)
>  {
> -        int cpu, cur_cpu;
> -
> -        if (!flow->stats.is_percpu) {
> -                stats_reset(flow->stats.stat, true);
> -        } else {
> -                cur_cpu = get_cpu();
> +        int node, local_node;
> +        struct flow_stats *stats;
>
> -                for_each_possible_cpu(cpu) {
> -                        bool lock_bh;
> +        preempt_disable();
> +        local_node = numa_node_id();
>
> -                        lock_bh = (cpu == cur_cpu);
> -                        stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu), lock_bh);
> +        /* Start from the current node. */
> +        stats = flow->stats[local_node];
> +        if (stats) {
> +                spin_lock_bh(&stats->lock);
> +                stats->used = 0;
> +                stats->packet_count = 0;
> +                stats->byte_count = 0;
> +                stats->tcp_flags = 0;
> +                spin_unlock_bh(&stats->lock);
> +        }
> +        for_each_node(node) {
> +                if (node == local_node)
> +                        continue; /* Done already. */
> +                stats = flow->stats[node];
> +                if (stats) {
> +                        spin_lock(&stats->lock);
> +                        stats->used = 0;
> +                        stats->packet_count = 0;
> +                        stats->byte_count = 0;
> +                        stats->tcp_flags = 0;
> +                        spin_unlock(&stats->lock);
>                  }
> -                put_cpu();
>          }
> +        preempt_enable();
>  }
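
To make the memory-barrier comment above concrete: as the patch stands, a
reader in ovs_flow_stats_get() on another CPU could observe the new
flow->stats[node] pointer before the stores that initialized *new_stats.
Publishing the pointer with rcu_assign_pointer() and reading it with
rcu_dereference() is the usual kernel idiom for this; a bare smp_wmb()
before the plain assignment would also work on the writer side. Untested
sketch of what I mean, against the patch above:

        /* Writer side, in ovs_flow_stats_update(): initialize the new
         * stats completely, then publish the pointer.  The write barrier
         * in rcu_assign_pointer() ensures no reader can see the pointer
         * before its contents. */
        new_stats->used = jiffies;
        new_stats->packet_count = 1;
        new_stats->byte_count = skb->len;
        new_stats->tcp_flags = tcp_flags;
        new_stats->last_writer = node;
        spin_lock_init(&new_stats->lock);

        rcu_assign_pointer(flow->stats[node], new_stats);
        /* instead of: flow->stats[node] = new_stats; */

        /* Reader side, in ovs_flow_stats_update()/_get()/_clear(): pair
         * the publish with a dependency-ordered load before dereferencing
         * the per-node stats. */
        struct flow_stats *stats = rcu_dereference(flow->stats[node]);

        if (stats) {
                spin_lock(&stats->lock);
                /* ... update, read, or reset the counters ... */
                spin_unlock(&stats->lock);
        }

This would also mean annotating the stats array as __rcu so that sparse
can check the accesses.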
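
For reference, since the flow.h side of the patch is not quoted above,
my reading of the new layout is roughly the following. This is only a
sketch reconstructed from the uses in flow.c, not the actual header
change; in particular, the -1 initial value for last_writer is my
assumption based on the "stats->last_writer >= 0" check:

        struct flow_stats {
                u64 packet_count;       /* Number of packets matched. */
                u64 byte_count;         /* Number of bytes matched. */
                unsigned long used;     /* Last used time (in jiffies). */
                spinlock_t lock;        /* Lock for atomic stats update. */
                __be16 tcp_flags;       /* Union of seen TCP flags. */
                int last_writer;        /* NUMA node id of the last writer,
                                         * or (I assume) -1 before the
                                         * first update. */
        };

        struct sw_flow {
                /* ... */
                struct flow_stats *stats[];     /* One pointer per NUMA node;
                                                 * stats[0] is pre-allocated
                                                 * with the flow, the rest
                                                 * start out NULL and are
                                                 * allocated on the first
                                                 * update from that node. */
        };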