On 10/15/20 11:46 AM, Eelco Chaudron wrote:
> The flow_lookup() function uses per CPU variables, which must not be
> preempted. However, this is fine in the general napi use case where
> the local BH is disabled. But, it's also called in the netlink
> context, which is preemptible. The below patch makes sure that even
> in the netlink path, preemption is disabled.
>
> In addition, the u64_stats_update_begin() sync point was not protected,
> making the sync point part of the per CPU variable fixed this.
>
> Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage")
> Reported-by: Juri Lelli <jle...@redhat.com>
> Signed-off-by: Eelco Chaudron <echau...@redhat.com>
> ---
> v2: - Add u64_stats_update_begin() sync point protection
>     - Moved patch to net from net-next branch
>
>  net/openvswitch/flow_table.c | 56 +++++++++++++++++++++++++-----------------
>  net/openvswitch/flow_table.h |  8 +++++-
>  2 files changed, 39 insertions(+), 25 deletions(-)
>
> diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
> index e2235849a57e..d90b4af6f539 100644
> --- a/net/openvswitch/flow_table.c
> +++ b/net/openvswitch/flow_table.c
> @@ -172,7 +172,7 @@ static struct table_instance *table_instance_alloc(int new_size)
>
>  static void __mask_array_destroy(struct mask_array *ma)
>  {
> -	free_percpu(ma->masks_usage_cntr);
> +	free_percpu(ma->masks_usage_stats);
>  	kfree(ma);
>  }
>
> @@ -196,15 +196,15 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
>  		ma->masks_usage_zero_cntr[i] = 0;
>
>  		for_each_possible_cpu(cpu) {
> -			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
> -							  cpu);
> +			struct mask_array_stats *stats;
>  			unsigned int start;
>  			u64 counter;
>
> +			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>  			do {
> -				start = u64_stats_fetch_begin_irq(&ma->syncp);
> -				counter = usage_counters[i];
> -			} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
> +				start = u64_stats_fetch_begin_irq(&stats->syncp);
> +				counter = stats->usage_cntrs[i];
> +			} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
>
>  			ma->masks_usage_zero_cntr[i] += counter;
>  		}
> @@ -227,9 +227,10 @@ static struct mask_array *tbl_mask_array_alloc(int size)
>  					     sizeof(struct sw_flow_mask *) *
>  					     size);
>
> -	new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
> -					       __alignof__(u64));
> -	if (!new->masks_usage_cntr) {
> +	new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
> +						sizeof(u64) * size,
> +						__alignof__(u64));
> +	if (!new->masks_usage_stats) {
>  		kfree(new);
>  		return NULL;
>  	}
> @@ -732,7 +733,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>  				   u32 *n_cache_hit,
>  				   u32 *index)
>  {
> -	u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
> +	struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
>  	struct sw_flow *flow;
>  	struct sw_flow_mask *mask;
>  	int i;
> @@ -742,9 +743,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>  	if (mask) {
>  		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>  		if (flow) {
> -			u64_stats_update_begin(&ma->syncp);
> -			usage_counters[*index]++;
> -			u64_stats_update_end(&ma->syncp);
> +			u64_stats_update_begin(&stats->syncp);
> +			stats->usage_cntrs[*index]++;
> +			u64_stats_update_end(&stats->syncp);
>  			(*n_cache_hit)++;
>  			return flow;
>  		}
> @@ -763,9 +764,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>  		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>  		if (flow) { /* Found */
>  			*index = i;
> -			u64_stats_update_begin(&ma->syncp);
> -			usage_counters[*index]++;
> -			u64_stats_update_end(&ma->syncp);
> +			u64_stats_update_begin(&stats->syncp);
> +			stats->usage_cntrs[*index]++;
> +			u64_stats_update_end(&stats->syncp);
>  			return flow;
>  		}
>  	}
> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
>  	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>  	u32 __always_unused n_mask_hit;
>  	u32 __always_unused n_cache_hit;
> +	struct sw_flow *flow;
>  	u32 index = 0;
>
> -	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
> +	/* This function gets called trough the netlink interface and therefore
> +	 * is preemptible. However, flow_lookup() function needs to be called
> +	 * with preemption disabled due to CPU specific variables.
Is it possible to add some kind of assertion inside flow_lookup() to
avoid this kind of issue in the future? It might also be good to
update the comment for the flow_lookup() function itself; see the
rough sketch at the bottom of this mail for what I have in mind.

> +	 */
> +	local_bh_disable();
> +	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
> +	local_bh_enable();
> +	return flow;
>  }
>
>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
> @@ -1109,7 +1118,6 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>
>  	for (i = 0; i < ma->max; i++) {
>  		struct sw_flow_mask *mask;
> -		unsigned int start;
>  		int cpu;
>
>  		mask = rcu_dereference_ovsl(ma->masks[i]);
> @@ -1120,14 +1128,16 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>  			masks_and_count[i].counter = 0;
>
>  			for_each_possible_cpu(cpu) {
> -				u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
> -								  cpu);
> +				struct mask_array_stats *stats;
>  				unsigned int start;
>  				u64 counter;
>
> +				stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>  				do {
> -					start = u64_stats_fetch_begin_irq(&ma->syncp);
> -					counter = usage_counters[i];
> -				} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
> +					start = u64_stats_fetch_begin_irq(&stats->syncp);
> +					counter = stats->usage_cntrs[i];
> +				} while (u64_stats_fetch_retry_irq(&stats->syncp,
> +								   start));
>
>  				masks_and_count[i].counter += counter;
>  			}
> diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
> index 6e7d4ac59353..43144396e192 100644
> --- a/net/openvswitch/flow_table.h
> +++ b/net/openvswitch/flow_table.h
> @@ -38,12 +38,16 @@ struct mask_count {
>  	u64 counter;
>  };
>
> +struct mask_array_stats {
> +	struct u64_stats_sync syncp;
> +	u64 usage_cntrs[];
> +};
> +
>  struct mask_array {
>  	struct rcu_head rcu;
>  	int count, max;
> -	u64 __percpu *masks_usage_cntr;
> +	struct mask_array_stats __percpu *masks_usage_stats;
>  	u64 *masks_usage_zero_cntr;
> -	struct u64_stats_sync syncp;
>  	struct sw_flow_mask __rcu *masks[];
>  };
>
>
> _______________________________________________
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
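To clarify the assertion idea above, here is a rough, untested sketch of
what the top of flow_lookup() could look like on top of this patch. The
WARN_ON_ONCE(!in_softirq()) check is just the first thing that came to my
mind for catching callers that run with BH enabled; maybe there is a nicer
lockdep-style helper for this, and the comment wording is only a suggestion:

/* flow_lookup() does a full lookup of the flow table, starting at the
 * mask index passed in via *index.
 *
 * Note: must be called with BH disabled, as it uses the per-CPU mask
 * usage statistics and the matching u64_stats sequence counter.
 */
static struct sw_flow *flow_lookup(struct flow_table *tbl,
				   struct table_instance *ti,
				   struct mask_array *ma,
				   const struct sw_flow_key *key,
				   u32 *n_mask_hit,
				   u32 *n_cache_hit,
				   u32 *index)
{
	struct mask_array_stats *stats;
	struct sw_flow *flow;
	struct sw_flow_mask *mask;
	int i;

	/* Complain (once) if a caller forgot to disable BH; the per-CPU
	 * pointer below is not safe against preemption/CPU migration.
	 */
	WARN_ON_ONCE(!in_softirq());

	stats = this_cpu_ptr(ma->masks_usage_stats);

	/* ... rest of the function unchanged ... */

Note that in_softirq() is also true when BH is merely disabled, not only
when running from actual softirq context, so the netlink path with the
local_bh_disable() added by this patch would still pass the check.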