Pravin,

Thanks for your thorough review. Some comments below,
  Jarno

On Feb 3, 2014, at 9:14 AM, Pravin Shelar <pshe...@nicira.com> wrote:

> On Thu, Jan 23, 2014 at 4:43 PM, Jarno Rajahalme <jrajaha...@nicira.com> wrote:
>> ...
>> +static void get_skb_stats(struct flow_stats *stat, const struct sw_flow *flow,
>> +                          const struct sk_buff *skb)
>> +{
>> +        int node = numa_node_id();
>> +
>> +        /* Prepare for writing later. */
>> +        if (unlikely(!flow->stats[node]))
>> +                node = 0;
>> +        spin_lock_prefetch(&flow->stats[node]->lock);
>> +
> lock-prefetch looks pretty aggressive, specially when path between
> prefetch and actual use is not pre-defined, it depends on actions
> list. execute actions has lot more memory footprint when it come to
> output action to tunnel port.

The risk being that the prefetch is too early in that case? Do you have a setup where you could test that case?

> 
>> +        stat->used = jiffies;
>> +        stat->packet_count = 1;
>> +        stat->byte_count = skb->len;
>> +        stat->tcp_flags = 0;
>> +
>> +        if ((flow->key.eth.type == htons(ETH_P_IP) ||
>> +             flow->key.eth.type == htons(ETH_P_IPV6)) &&
>> +            flow->key.ip.proto == IPPROTO_TCP &&
>> +            likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
>> +                stat->tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
>> +        }
>> +}
>> +
>>  /* Must be called with rcu_read_lock. */
>>  void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>>  {
>> @@ -221,6 +244,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>>          struct sw_flow *flow;
>>          struct dp_stats_percpu *stats;
>>          struct sw_flow_key key;
>> +        struct flow_stats stat;
>>          u64 *stats_counter;
>>          u32 n_mask_hit;
>>          int error;
>> @@ -252,8 +276,9 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>>          OVS_CB(skb)->flow = flow;
>>          OVS_CB(skb)->pkt_key = &key;
>> 
>> -        ovs_flow_stats_update(OVS_CB(skb)->flow, skb);
>> +        get_skb_stats(&stat, flow, skb);
>>          ovs_execute_actions(dp, skb);
>> +        ovs_flow_stats_update(flow, &stat);
>>          stats_counter = &stats->n_hit;
>> 
> I am not sure why would you divide stats update in two steps? We could
> have just done prefetch.

Applying the actions can change the skb, and we want to record the size before any modifications.

...

>> 
>> +static inline bool alloc_stats(struct sw_flow *flow,
>> +                               const struct flow_stats *stat, int node)
>> +{
>> +        struct flow_stats *stats;
>> +        /* Alloc stats for the 'node', but with minimal latency. */
>> +        stats = kmem_cache_alloc_node(flow_stats_cache, GFP_THISNODE |
>> +                                      __GFP_NOTRACK | __GFP_NOMEMALLOC, node);
> 
> __GFP_NOTRACK is hardly used out side of memory allocator. Why would
> we need it in OVS. It only affects memory allocation if debugging
> (CONFIG_KMEMCHECK) is on. CONFIG_KMEMCHECK is pretty heavy weight
> tracing so it is not usually on.
> 

OK, thanks, will take __GFP_NOTRACK off then.

...

>> -                stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
>> -                lock_bh = (cpu == cur_cpu);
>> -                stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
>> +        /* Collect per NUMA-node stats from other nodes. */
>> +        for (node = 0; node < ovs_numa_nodes; node++) {
>> +                if (node == current_node)
>> +                        continue; /* Done already. */
>> +                stats = flow->stats[node];
>> +                if (stats && likely(stats->packet_count)) {
>> +                        spin_lock_bh(&stats->lock);
>> +                        if (time_after(stats->used, *used))
>> +                                *used = stats->used;
>> +                        *tcp_flags |= stats->tcp_flags;
>> +                        ovs_stats->n_packets += stats->packet_count;
>> +                        ovs_stats->n_bytes += stats->byte_count;
>> +                        spin_unlock_bh(&stats->lock);
>>                  }
>> -        put_cpu();
> 
> We can selectively use spin_lock_bh for locking stats on local node to
> improve interrupt latency.

I.e., use spin_lock() for nodes other than the local node? OK.

...

>>  static struct kmem_cache *flow_cache;
>> +struct kmem_cache *flow_stats_cache;
>> +
> Can you mark it __read_mostly.
> 
>> +int ovs_numa_nodes = 0;
>> 

I think I’ll get rid of this. Or did you mean I should mark the flow_stats_cache __read_mostly?

…

>> -                spin_lock_init(&flow->stats.stat->lock);
>> -        } else {
>> -                flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
>> -                if (!flow->stats.cpu_stats)
>> -                        goto err;
>> +        spin_lock_init(&flow->stats[0]->lock);
>> +        flow->stats[0]->last_writer = -1;
>> 
>> -                for_each_possible_cpu(cpu) {
>> -                        struct flow_stats *cpu_stats;
>> +        for (node = 1; node < ovs_numa_nodes; node++)
>> +                flow->stats[node] = NULL;
>> 
>> -                        cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
>> -                        spin_lock_init(&cpu_stats->lock);
>> -                }
>> -        }
> Can you use for_each_node() here? this works better when numa node are sparse.
> 

OK.

> 
>>          return flow;
>>  err:
>>          kmem_cache_free(flow_cache, flow);
>> @@ -142,11 +137,12 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
>> 
>>  static void flow_free(struct sw_flow *flow)
>>  {
>> +        int node;
>> +
>>          kfree((struct sf_flow_acts __force *)flow->sf_acts);
>> -        if (flow->stats.is_percpu)
>> -                free_percpu(flow->stats.cpu_stats);
>> -        else
>> -                kfree(flow->stats.stat);
>> +        for (node = 0; node < ovs_numa_nodes; node++)
>> +                if (flow->stats[node])
>> +                        kmem_cache_free(flow_stats_cache, flow->stats[node]);
> same as above.
> 

OK.

>>          kmem_cache_free(flow_cache, flow);
>>  }
>> 
>> @@ -603,19 +599,41 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
>>   * Returns zero if successful or a negative error code. */
>>  int ovs_flow_init(void)
>>  {
>> +        int node;
>> +
>>          BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
>>          BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
>> 
>> -        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
>> -                                       0, NULL);
>> +        /* Count the NUMA nodes. */
>> +        for_each_node_with_cpus(node) {
>> +                if (node >= ovs_numa_nodes)
>> +                        ovs_numa_nodes = node + 1;
>> +        }
>> +        pr_info("ovs_flow_init: ovs_numa_nodes: %d.\n", ovs_numa_nodes);
>> +
> 
> This does not handle hotplug cpu. cpu/node can become online after ovs
> kernel module is loaded. That can cause segment fault on stats access.
> We can directly use num_possible_nodes().
> 

OK.

> 
>> +        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
>> +                                       + (ovs_numa_nodes
>> +                                          * sizeof(struct flow_stats *)),
>> +                                       __alignof__(struct sw_flow),
>> +                                       SLAB_HWCACHE_ALIGN, NULL);
> 
> without any explicit alignment struct sw_flow gets platform alignment
> by compiler which is 8 bytes for x86-64. When HWCACHE-ALIGN flag is
> specified the alignment arg does not help kmem-cache-create since
> HW-CACHE-ALIGN is L1 cache align which is larger than default
> alignment.

So I should use a 0 align arg if I know that the compiler alignment for the struct is less than L1 cache alignment?
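If so, something along these lines is what I would try for the next version (untested sketch; this also assumes switching to num_possible_nodes() as you suggested above, so the node-counting loop and ovs_numa_nodes would go away):

        /* Per-node stats pointer array is allocated together with the flow;
         * SLAB_HWCACHE_ALIGN provides the cache line alignment, so pass 0 as
         * the align argument. */
        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
                                       + (num_possible_nodes()
                                          * sizeof(struct flow_stats *)),
                                       0, SLAB_HWCACHE_ALIGN, NULL);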
> 
>>          if (flow_cache == NULL)
>>                  return -ENOMEM;
>> 
>> +        flow_stats_cache
>> +                = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
>> +                                    L1_CACHE_BYTES, SLAB_HWCACHE_ALIGN, NULL);
> same as above, I am not sure how specifying alignment and alignment-flag
> helps.
> 
> If you want to align struct flow_stats at L1_cache then may be we can
> get rid of explicit alignment done for it at its definition.
> In general we should get rid of aligning structure and create cache
> with correct alignment here.

OK.

  Jarno
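PS. To check that I read your last point right: drop the explicit alignment from the struct flow_stats definition and let the cache creation take care of it, roughly like this (sketch only, not tested):

        /* No explicit alignment on struct flow_stats itself; the cache
         * aligns each stats object to an L1 cache line. */
        flow_stats_cache
                = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
                                    0, SLAB_HWCACHE_ALIGN, NULL);
        if (flow_stats_cache == NULL) {
                kmem_cache_destroy(flow_cache);
                return -ENOMEM;
        }

Or did you mean passing L1_CACHE_BYTES as the align argument instead of using SLAB_HWCACHE_ALIGN?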