On Tue, Feb 4, 2014 at 9:03 AM, Jarno Rajahalme <jrajaha...@nicira.com> wrote:
> Pravin,
>
> Thanks for your thorough review. Some comments below,
>
> Jarno
>
> On Feb 3, 2014, at 9:14 AM, Pravin Shelar <pshe...@nicira.com> wrote:
>
> On Thu, Jan 23, 2014 at 4:43 PM, Jarno Rajahalme <jrajaha...@nicira.com> wrote:
>
> ...
>
> +static void get_skb_stats(struct flow_stats *stat, const struct sw_flow *flow,
> +                          const struct sk_buff *skb)
> +{
> +       int node = numa_node_id();
> +
> +       /* Prepare for writing later. */
> +       if (unlikely(!flow->stats[node]))
> +               node = 0;
> +       spin_lock_prefetch(&flow->stats[node]->lock);
> +
>
> The lock prefetch looks pretty aggressive, especially since the path between
> the prefetch and the actual use is not pre-defined; it depends on the actions
> list. Executing the actions has a much larger memory footprint when it comes
> to an output action to a tunnel port.
>
> The risk being that the prefetch is too early in that case? Do you have a
> setup where you could test that case?
>
Yes, it seems a bit early. I will give it a try on my setup.
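To make the concern concrete, here is a condensed sketch of the receive path as
the patch arranges it. The wrapper function, its name, and the comments are
illustrative only; just the prefetch/lock calls and the node fallback come from
the patch:

    /* How far the prefetch can end up from the locked update it is meant
     * to warm up.  Illustrative sketch, not the actual patch. */
    static void rx_path_sketch(struct datapath *dp, struct sw_flow *flow,
                               struct sk_buff *skb)
    {
            int node = numa_node_id();

            if (unlikely(!flow->stats[node]))
                    node = 0;
            /* Prefetch the stats lock for the later write... */
            spin_lock_prefetch(&flow->stats[node]->lock);

            /* ...then run the actions.  A plain output action is cheap, but
             * output to a tunnel port does encapsulation, routing lookups and
             * possibly skb reallocation, which can evict the prefetched line
             * before it is ever used... */
            ovs_execute_actions(dp, skb);

            /* ...and only here is the lock finally taken. */
            spin_lock(&flow->stats[node]->lock);
            /* fold used/packet_count/byte_count/tcp_flags into flow->stats[node] */
            spin_unlock(&flow->stats[node]->lock);
    }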
>
> +       stat->used = jiffies;
> +       stat->packet_count = 1;
> +       stat->byte_count = skb->len;
> +       stat->tcp_flags = 0;
> +
> +       if ((flow->key.eth.type == htons(ETH_P_IP) ||
> +            flow->key.eth.type == htons(ETH_P_IPV6)) &&
> +           flow->key.ip.proto == IPPROTO_TCP &&
> +           likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
> +               stat->tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
> +       }
> +}
> +
>  /* Must be called with rcu_read_lock. */
>  void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>  {
> @@ -221,6 +244,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>         struct sw_flow *flow;
>         struct dp_stats_percpu *stats;
>         struct sw_flow_key key;
> +       struct flow_stats stat;
>         u64 *stats_counter;
>         u32 n_mask_hit;
>         int error;
> @@ -252,8 +276,9 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>         OVS_CB(skb)->flow = flow;
>         OVS_CB(skb)->pkt_key = &key;
>
> -       ovs_flow_stats_update(OVS_CB(skb)->flow, skb);
> +       get_skb_stats(&stat, flow, skb);
>         ovs_execute_actions(dp, skb);
> +       ovs_flow_stats_update(flow, &stat);
>         stats_counter = &stats->n_hit;
>
> I am not sure why you would divide the stats update into two steps. We could
> have just done the prefetch.
>
> Applying the actions can change the skb, and we want to record the size
> before any modifications.
>
ok.
> ...
>
> +static inline bool alloc_stats(struct sw_flow *flow,
> +                              const struct flow_stats *stat, int node)
> +{
> +       struct flow_stats *stats;
> +       /* Alloc stats for the 'node', but with minimal latency. */
> +       stats = kmem_cache_alloc_node(flow_stats_cache, GFP_THISNODE |
> +                                     __GFP_NOTRACK | __GFP_NOMEMALLOC, node);
>
> __GFP_NOTRACK is hardly used outside of the memory allocator. Why would we
> need it in OVS? It only affects memory allocation if debugging
> (CONFIG_KMEMCHECK) is on, and CONFIG_KMEMCHECK is pretty heavyweight tracing,
> so it is not usually on.
>
> OK, thanks, will take __GFP_NOTRACK off then.
>
> ...
>
> -               stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
> -               lock_bh = (cpu == cur_cpu);
> -               stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
> +       /* Collect per NUMA-node stats from other nodes. */
> +       for (node = 0; node < ovs_numa_nodes; node++) {
> +               if (node == current_node)
> +                       continue; /* Done already. */
> +               stats = flow->stats[node];
> +               if (stats && likely(stats->packet_count)) {
> +                       spin_lock_bh(&stats->lock);
> +                       if (time_after(stats->used, *used))
> +                               *used = stats->used;
> +                       *tcp_flags |= stats->tcp_flags;
> +                       ovs_stats->n_packets += stats->packet_count;
> +                       ovs_stats->n_bytes += stats->byte_count;
> +                       spin_unlock_bh(&stats->lock);
>                 }
> -       put_cpu();
>
> We can selectively use spin_lock_bh for locking the stats on the local node,
> to improve interrupt latency.
>
> I.e., use spin_lock() for nodes other than the local node? OK.
>
As done currently, you need to disable preemption using get/put_cpu (see the
sketch below).
> ...
>
>  static struct kmem_cache *flow_cache;
> +struct kmem_cache *flow_stats_cache;
> +
>
> Can you mark it __read_mostly?
>
> +int ovs_numa_nodes = 0;
>
> I think I'll get rid of this. Or did you mean I should mark the
> flow_stats_cache __read_mostly?
>
ok.
> ...
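Going back to the spin_lock_bh exchange above, this is roughly the shape of the
selective locking plus the get/put_cpu point. The function name is made up and
this is only a sketch of the idea, not a tested patch. Note that because
get_skb_stats() falls back to node 0 when the local node has no stats yet, the
local softirq can also write node 0's stats, so node 0 is treated like the
local node here:

    static void flow_stats_fold_sketch(const struct sw_flow *flow,
                                       struct ovs_flow_stats *ovs_stats,
                                       unsigned long *used, __be16 *tcp_flags)
    {
            int node;
            int current_node = cpu_to_node(get_cpu()); /* pin the CPU so "local" stays stable */

            for (node = 0; node < ovs_numa_nodes; node++) {
                    struct flow_stats *stats = flow->stats[node];
                    bool local = (node == current_node || node == 0);

                    if (!stats)
                            continue;

                    if (local)
                            spin_lock_bh(&stats->lock);     /* local softirq may take this lock */
                    else
                            spin_lock(&stats->lock);        /* only written from remote CPUs */

                    if (time_after(stats->used, *used))
                            *used = stats->used;
                    *tcp_flags |= stats->tcp_flags;
                    ovs_stats->n_packets += stats->packet_count;
                    ovs_stats->n_bytes += stats->byte_count;

                    if (local)
                            spin_unlock_bh(&stats->lock);
                    else
                            spin_unlock(&stats->lock);
            }
            put_cpu();
    }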
>
> -               spin_lock_init(&flow->stats.stat->lock);
> -       } else {
> -               flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
> -               if (!flow->stats.cpu_stats)
> -                       goto err;
> +       spin_lock_init(&flow->stats[0]->lock);
> +       flow->stats[0]->last_writer = -1;
>
> -               for_each_possible_cpu(cpu) {
> -                       struct flow_stats *cpu_stats;
> +       for (node = 1; node < ovs_numa_nodes; node++)
> +               flow->stats[node] = NULL;
>
> -                       cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
> -                       spin_lock_init(&cpu_stats->lock);
> -               }
> -       }
>
> Can you use for_each_node() here? This works better when the NUMA nodes are
> sparse.
>
> OK.
>
>         return flow;
>  err:
>         kmem_cache_free(flow_cache, flow);
> @@ -142,11 +137,12 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
>
>  static void flow_free(struct sw_flow *flow)
>  {
> +       int node;
> +
>         kfree((struct sf_flow_acts __force *)flow->sf_acts);
> -       if (flow->stats.is_percpu)
> -               free_percpu(flow->stats.cpu_stats);
> -       else
> -               kfree(flow->stats.stat);
> +       for (node = 0; node < ovs_numa_nodes; node++)
> +               if (flow->stats[node])
> +                       kmem_cache_free(flow_stats_cache, flow->stats[node]);
>
> Same as above.
>
> OK.
>
>         kmem_cache_free(flow_cache, flow);
>  }
>
> @@ -603,19 +599,41 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
>   * Returns zero if successful or a negative error code. */
>  int ovs_flow_init(void)
>  {
> +       int node;
> +
>         BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
>         BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
>
> -       flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
> -                                      0, NULL);
> +       /* Count the NUMA nodes. */
> +       for_each_node_with_cpus(node) {
> +               if (node >= ovs_numa_nodes)
> +                       ovs_numa_nodes = node + 1;
> +       }
> +       pr_info("ovs_flow_init: ovs_numa_nodes: %d.\n", ovs_numa_nodes);
> +
>
> This does not handle CPU hotplug. A cpu/node can come online after the ovs
> kernel module is loaded, and that can cause a segmentation fault on stats
> access. We can directly use num_possible_nodes().
>
> OK.
>
> +       flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
> +                                      + (ovs_numa_nodes
> +                                         * sizeof(struct flow_stats *)),
> +                                      __alignof__(struct sw_flow),
> +                                      SLAB_HWCACHE_ALIGN, NULL);
>
> Without any explicit alignment, struct sw_flow gets the platform alignment
> from the compiler, which is 8 bytes for x86-64. When the SLAB_HWCACHE_ALIGN
> flag is specified, the alignment arg does not help kmem_cache_create, since
> SLAB_HWCACHE_ALIGN means L1 cache alignment, which is larger than the default
> alignment.
>
> So I should use a 0 align arg if I know that the compiler alignment for the
> struct is less than the L1 cache alignment?
>
That should be fine; see the sketch at the end for how both calls would look.
>
>         if (flow_cache == NULL)
>                 return -ENOMEM;
>
> +       flow_stats_cache
> +               = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
> +                                   L1_CACHE_BYTES, SLAB_HWCACHE_ALIGN, NULL);
>
> Same as above, I am not sure how specifying both the alignment and the
> alignment flag helps.
>
> If you want to align struct flow_stats to the L1 cache, then maybe we can get
> rid of the explicit alignment done for it at its definition. In general we
> should get rid of aligning the structure and instead create the cache with
> the correct alignment here.
>
> OK.
>
> Jarno
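Putting the cache-creation comments together, ovs_flow_init() would end up
roughly like the following. This is only a sketch of the suggestions above
(num_possible_nodes() for sizing, 0 align args with SLAB_HWCACHE_ALIGN, and no
explicit alignment on the struct flow_stats definition), not the actual next
revision of the patch:

    int ovs_flow_init(void)
    {
            BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
            BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));

            /* Size the stats pointer array for every possible node, so a node
             * coming online later cannot index past the allocation (assumes
             * node ids are reasonably dense). */
            flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
                                           + (num_possible_nodes()
                                              * sizeof(struct flow_stats *)),
                                           0, SLAB_HWCACHE_ALIGN, NULL);
            if (flow_cache == NULL)
                    return -ENOMEM;

            /* Let the cache, not the struct definition, provide the L1
             * alignment for the per-node stats. */
            flow_stats_cache
                    = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
                                        0, SLAB_HWCACHE_ALIGN, NULL);
            if (flow_stats_cache == NULL) {
                    kmem_cache_destroy(flow_cache);
                    flow_cache = NULL;
                    return -ENOMEM;
            }

            return 0;
    }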