Pravin,

Thanks for your thorough review. Some comments below,

  Jarno

On Feb 3, 2014, at 9:14 AM, Pravin Shelar <pshe...@nicira.com> wrote:

> On Thu, Jan 23, 2014 at 4:43 PM, Jarno Rajahalme <jrajaha...@nicira.com> 
> wrote:
>> 
...
>> +static void get_skb_stats(struct flow_stats *stat, const struct sw_flow 
>> *flow,
>> +                         const struct sk_buff *skb)
>> +{
>> +       int node = numa_node_id();
>> +
>> +       /* Prepare for writing later. */
>> +       if (unlikely(!flow->stats[node]))
>> +               node = 0;
>> +       spin_lock_prefetch(&flow->stats[node]->lock);
>> +
> lock-prefetch looks pretty aggressive, especially since the path between
> the prefetch and the actual use is not pre-defined; it depends on the
> actions list. Executing the actions has a much larger memory footprint
> when it comes to the output action to a tunnel port.

The risk being that the prefetch would come too early in that case? Do you 
have a setup where you could test that?

> 
>> +       stat->used = jiffies;
>> +       stat->packet_count = 1;
>> +       stat->byte_count = skb->len;
>> +       stat->tcp_flags = 0;
>> +
>> +       if ((flow->key.eth.type == htons(ETH_P_IP) ||
>> +            flow->key.eth.type == htons(ETH_P_IPV6)) &&
>> +           flow->key.ip.proto == IPPROTO_TCP &&
>> +           likely(skb->len >= skb_transport_offset(skb) + sizeof(struct 
>> tcphdr))) {
>> +               stat->tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
>> +       }
>> +}
>> +
>> /* Must be called with rcu_read_lock. */
>> void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>> {
>> @@ -221,6 +244,7 @@ void ovs_dp_process_received_packet(struct vport *p, 
>> struct sk_buff *skb)
>>        struct sw_flow *flow;
>>        struct dp_stats_percpu *stats;
>>        struct sw_flow_key key;
>> +       struct flow_stats stat;
>>        u64 *stats_counter;
>>        u32 n_mask_hit;
>>        int error;
>> @@ -252,8 +276,9 @@ void ovs_dp_process_received_packet(struct vport *p, 
>> struct sk_buff *skb)
>>        OVS_CB(skb)->flow = flow;
>>        OVS_CB(skb)->pkt_key = &key;
>> 
>> -       ovs_flow_stats_update(OVS_CB(skb)->flow, skb);
>> +       get_skb_stats(&stat, flow, skb);
>>        ovs_execute_actions(dp, skb);
>> +       ovs_flow_stats_update(flow, &stat);
>>        stats_counter = &stats->n_hit;
>> 
> I am not sure why you would divide the stats update into two steps? We
> could have just done the prefetch.

Applying the actions can change the skb, and we want to record the size before 
any modifications.
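
To spell out the intent, the hunk above reads (same code, with comments added):

        get_skb_stats(&stat, flow, skb);    /* Snapshot len/used/TCP flags from
                                             * the unmodified skb. */
        ovs_execute_actions(dp, skb);       /* May modify or consume the skb. */
        ovs_flow_stats_update(flow, &stat); /* Fold the snapshot into the
                                             * flow's per-node stats. */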

...
>> 
>> +static inline bool alloc_stats(struct sw_flow *flow,
>> +                              const struct flow_stats *stat, int node)
>> +{
>> +       struct flow_stats *stats;
>> +       /* Alloc stats for the 'node', but with minimal latency. */
>> +       stats = kmem_cache_alloc_node(flow_stats_cache, GFP_THISNODE |
>> +                                     __GFP_NOTRACK | __GFP_NOMEMALLOC, 
>> node);
> 
> __GFP_NOTRACK is hardly used outside of the memory allocator. Why would
> we need it in OVS? It only affects memory allocation if debugging
> (CONFIG_KMEMCHECK) is on. CONFIG_KMEMCHECK is pretty heavyweight
> tracing, so it is not usually on.
> 

OK, thanks, will take __GFP_NOTRACK off then.
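
I.e., the allocation would then be just (a sketch):

        /* Alloc stats for the 'node', but with minimal latency. */
        stats = kmem_cache_alloc_node(flow_stats_cache,
                                      GFP_THISNODE | __GFP_NOMEMALLOC, node);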

...
>> -                       stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
>> -                       lock_bh = (cpu == cur_cpu);
>> -                       stats_read(stats, lock_bh, ovs_stats, used, 
>> tcp_flags);
>> +       /* Collect per NUMA-node stats from other nodes. */
>> +       for (node = 0; node < ovs_numa_nodes; node++) {
>> +               if (node == current_node)
>> +                       continue; /* Done already. */
>> +               stats = flow->stats[node];
>> +               if (stats && likely(stats->packet_count)) {
>> +                       spin_lock_bh(&stats->lock);
>> +                       if (time_after(stats->used, *used))
>> +                               *used = stats->used;
>> +                       *tcp_flags |= stats->tcp_flags;
>> +                       ovs_stats->n_packets += stats->packet_count;
>> +                       ovs_stats->n_bytes += stats->byte_count;
>> +                       spin_unlock_bh(&stats->lock);
>>                }
>> -               put_cpu();
> 
> We can selectively use spin_lock_bh only for locking the stats on the
> local node, to improve interrupt latency.

I.e., use spin_lock() for nodes other than the local node? OK.
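
Something like this, then (a sketch of the collection loop; the local node
would still be read under spin_lock_bh() before this loop, as in the patch):

        /* Collect per NUMA-node stats from other nodes. */
        for (node = 0; node < ovs_numa_nodes; node++) {
                if (node == current_node)
                        continue; /* Done already, under spin_lock_bh(). */
                stats = flow->stats[node];
                if (stats && likely(stats->packet_count)) {
                        /* Per your suggestion: plain spin_lock() for remote
                         * nodes avoids disabling BHs on this CPU. */
                        spin_lock(&stats->lock);
                        if (time_after(stats->used, *used))
                                *used = stats->used;
                        *tcp_flags |= stats->tcp_flags;
                        ovs_stats->n_packets += stats->packet_count;
                        ovs_stats->n_bytes += stats->byte_count;
                        spin_unlock(&stats->lock);
                }
        }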

...
>> static struct kmem_cache *flow_cache;
>> +struct kmem_cache *flow_stats_cache;
>> +
> Can you mark it __read_mostly?
> 
>> +int ovs_numa_nodes = 0;
>> 

I think I’ll get rid of ovs_numa_nodes. Or did you mean I should mark the 
flow_stats_cache __read_mostly?
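
If you meant the latter, that would simply be:

struct kmem_cache *flow_stats_cache __read_mostly;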

...
>> -               spin_lock_init(&flow->stats.stat->lock);
>> -       } else {
>> -               flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
>> -               if (!flow->stats.cpu_stats)
>> -                       goto err;
>> +       spin_lock_init(&flow->stats[0]->lock);
>> +       flow->stats[0]->last_writer = -1;
>> 
>> -               for_each_possible_cpu(cpu) {
>> -                       struct flow_stats *cpu_stats;
>> +       for (node = 1; node < ovs_numa_nodes; node++)
>> +               flow->stats[node] = NULL;
>> 
>> -                       cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
>> -                       spin_lock_init(&cpu_stats->lock);
>> -               }
>> -       }
> Can you use for_each_node() here? That works better when NUMA nodes are sparse.
> 

OK.
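
I.e., the initialization in ovs_flow_alloc() would become something like
(a sketch):

        /* Only node 0 stats are allocated up front; the other nodes'
         * pointers start out NULL and are allocated on first use. */
        for_each_node(node)
                if (node != 0)
                        flow->stats[node] = NULL;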

> 
>>        return flow;
>> err:
>>        kmem_cache_free(flow_cache, flow);
>> @@ -142,11 +137,12 @@ static struct flex_array *alloc_buckets(unsigned int 
>> n_buckets)
>> 
>> static void flow_free(struct sw_flow *flow)
>> {
>> +       int node;
>> +
>>        kfree((struct sf_flow_acts __force *)flow->sf_acts);
>> -       if (flow->stats.is_percpu)
>> -               free_percpu(flow->stats.cpu_stats);
>> -       else
>> -               kfree(flow->stats.stat);
>> +       for (node = 0; node < ovs_numa_nodes; node++)
>> +               if (flow->stats[node])
>> +                       kmem_cache_free(flow_stats_cache, flow->stats[node]);
> same as above.
> 

OK.
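
And the matching loop in flow_free() (a sketch):

        for_each_node(node)
                if (flow->stats[node])
                        kmem_cache_free(flow_stats_cache, flow->stats[node]);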

>>        kmem_cache_free(flow_cache, flow);
>> }
>> 
>> @@ -603,19 +599,41 @@ int ovs_flow_tbl_insert(struct flow_table *table, 
>> struct sw_flow *flow,
>>  * Returns zero if successful or a negative error code. */
>> int ovs_flow_init(void)
>> {
>> +       int node;
>> +
>>        BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
>>        BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
>> 
>> -       flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
>> -                                       0, NULL);
>> +       /* Count the NUMA nodes. */
>> +       for_each_node_with_cpus(node) {
>> +               if (node >= ovs_numa_nodes)
>> +                       ovs_numa_nodes = node + 1;
>> +       }
>> +       pr_info("ovs_flow_init: ovs_numa_nodes: %d.\n", ovs_numa_nodes);
>> +
> 
> This does not handle CPU hotplug. A CPU/node can come online after the
> OVS kernel module is loaded. That can cause a segmentation fault on
> stats access. We can directly use num_possible_nodes().
> 

OK.
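
I.e., drop the counting loop and size the trailing stats pointer array from
the possible-node map instead; roughly:

        /* num_possible_nodes() also covers nodes that may only come
         * online after the module is loaded. */
        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
                                       + (num_possible_nodes()
                                          * sizeof(struct flow_stats *)),
                                       __alignof__(struct sw_flow),
                                       SLAB_HWCACHE_ALIGN, NULL);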

> 
>> +       flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
>> +                                      + (ovs_numa_nodes
>> +                                         * sizeof(struct flow_stats *)),
>> +                                      __alignof__(struct sw_flow),
>> +                                      SLAB_HWCACHE_ALIGN, NULL);
> 
> Without any explicit alignment, struct sw_flow gets the platform
> alignment from the compiler, which is 8 bytes on x86-64. When the
> SLAB_HWCACHE_ALIGN flag is specified, the alignment arg does not help
> kmem_cache_create(), since SLAB_HWCACHE_ALIGN means L1 cache alignment,
> which is larger than the default alignment.

So I should use a 0 align arg if I know that the compiler alignment for the 
struct is less than L1 cache alignment?
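
I.e., assuming I read you right, and combined with the num_possible_nodes()
change above (a sketch):

        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
                                       + (num_possible_nodes()
                                          * sizeof(struct flow_stats *)),
                                       0, SLAB_HWCACHE_ALIGN, NULL);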

> 
>>        if (flow_cache == NULL)
>>                return -ENOMEM;
>> 
>> +       flow_stats_cache
>> +               = kmem_cache_create("sw_flow_stats", sizeof(struct 
>> flow_stats),
>> +                                   L1_CACHE_BYTES, SLAB_HWCACHE_ALIGN, 
>> NULL);
> Same as above; I am not sure how specifying both the alignment and the
> alignment flag helps.
> 
> If you want to align struct flow_stats to the L1 cache then maybe we can
> get rid of the explicit alignment done for it at its definition.
> In general we should get rid of aligning the structure and create the
> cache with the correct alignment here.

OK.
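
So the stats cache would be created with the explicit alignment only, and the
explicit alignment at the struct flow_stats definition dropped; a sketch:

        flow_stats_cache
                = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
                                    L1_CACHE_BYTES, 0, NULL);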

  Jarno
