On 10/15/20 12:54 PM, Eelco Chaudron wrote:
>
>
> On 15 Oct 2020, at 12:27, Ilya Maximets wrote:
>
>> On 10/15/20 11:46 AM, Eelco Chaudron wrote:
>>> The flow_lookup() function uses per CPU variables, which must not be
>>> preempted. However, this is fine in the general napi use case where
>>> the local BH is disabled. But, it's also called in the netlink
>>> context, which is preemptible. The below patch makes sure that even
>>> in the netlink path, preemption is disabled.
>>>
>>> In addition, the u64_stats_update_begin() sync point was not protected,
>>> making the sync point part of the per CPU variable fixed this.
>>>
>>> Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage")
>>> Reported-by: Juri Lelli <jle...@redhat.com>
>>> Signed-off-by: Eelco Chaudron <echau...@redhat.com>
>>> ---
>>> v2: - Add u64_stats_update_begin() sync point protection
>>>     - Moved patch to net from net-next branch
>>>
>>>  net/openvswitch/flow_table.c | 56 +++++++++++++++++++++++++-----------------
>>>  net/openvswitch/flow_table.h |  8 +++++-
>>>  2 files changed, 39 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
>>> index e2235849a57e..d90b4af6f539 100644
>>> --- a/net/openvswitch/flow_table.c
>>> +++ b/net/openvswitch/flow_table.c
>>> @@ -172,7 +172,7 @@ static struct table_instance *table_instance_alloc(int new_size)
>>>
>>>  static void __mask_array_destroy(struct mask_array *ma)
>>>  {
>>> -        free_percpu(ma->masks_usage_cntr);
>>> +        free_percpu(ma->masks_usage_stats);
>>>          kfree(ma);
>>>  }
>>>
>>> @@ -196,15 +196,15 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
>>>                  ma->masks_usage_zero_cntr[i] = 0;
>>>
>>>                  for_each_possible_cpu(cpu) {
>>> -                        u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>>> -                                                          cpu);
>>> +                        struct mask_array_stats *stats;
>>>                          unsigned int start;
>>>                          u64 counter;
>>>
>>> +                        stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>>                          do {
>>> -                                start = u64_stats_fetch_begin_irq(&ma->syncp);
>>> -                                counter = usage_counters[i];
>>> -                        } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>>> +                                start = u64_stats_fetch_begin_irq(&stats->syncp);
>>> +                                counter = stats->usage_cntrs[i];
>>> +                        } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
>>>
>>>                          ma->masks_usage_zero_cntr[i] += counter;
>>>                  }
>>> @@ -227,9 +227,10 @@ static struct mask_array *tbl_mask_array_alloc(int size)
>>>                                               sizeof(struct sw_flow_mask *) *
>>>                                               size);
>>>
>>> -        new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
>>> -                                               __alignof__(u64));
>>> -        if (!new->masks_usage_cntr) {
>>> +        new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
>>> +                                                sizeof(u64) * size,
>>> +                                                __alignof__(u64));
>>> +        if (!new->masks_usage_stats) {
>>>                  kfree(new);
>>>                  return NULL;
>>>          }
>>> @@ -732,7 +733,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>                                     u32 *n_cache_hit,
>>>                                     u32 *index)
>>>  {
>>> -        u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
>>> +        struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
>>>          struct sw_flow *flow;
>>>          struct sw_flow_mask *mask;
>>>          int i;
>>> @@ -742,9 +743,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>          if (mask) {
>>>                  flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>>>                  if (flow) {
>>> -                        u64_stats_update_begin(&ma->syncp);
>>> -                        usage_counters[*index]++;
>>> -                        u64_stats_update_end(&ma->syncp);
>>> +                        u64_stats_update_begin(&stats->syncp);
>>> +                        stats->usage_cntrs[*index]++;
>>> +                        u64_stats_update_end(&stats->syncp);
>>>                          (*n_cache_hit)++;
>>>                          return flow;
>>>                  }
>>> @@ -763,9 +764,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>                  flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>>>                  if (flow) { /* Found */
>>>                          *index = i;
>>> -                        u64_stats_update_begin(&ma->syncp);
>>> -                        usage_counters[*index]++;
>>> -                        u64_stats_update_end(&ma->syncp);
>>> +                        u64_stats_update_begin(&stats->syncp);
>>> +                        stats->usage_cntrs[*index]++;
>>> +                        u64_stats_update_end(&stats->syncp);
>>>                          return flow;
>>>                  }
>>>          }
>>> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
>>>          struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>>>          u32 __always_unused n_mask_hit;
>>>          u32 __always_unused n_cache_hit;
>>> +        struct sw_flow *flow;
>>>          u32 index = 0;
>>>
>>> -        return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
>>> +        /* This function gets called trough the netlink interface and therefore
>>> +         * is preemptible. However, flow_lookup() function needs to be called
>>> +         * with preemption disabled due to CPU specific variables.
>>
>> Is it possible to add some kind of assertion inside flow_lookup() to avoid
>> this kind of issues in the future?
>
> We could do something like WARN_ON_ONCE(preemptible()) but do not think such
> check should be added to the fast path.

I meant something like lockdep_assert_preemption_disabled(). This will not
impact the fast path if CONFIG_PROVE_LOCKING is disabled, but it will allow
catching such issues during development (a rough sketch is included below
the quoted patch).

>
>> It might be also good to update the comment for flow_lookup() function
>> itself.
>
> Good idea, will do this in a v3
>
>>> +         */
>>> +        local_bh_disable();
>>> +        flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
>>> +        local_bh_enable();
>>> +        return flow;
>>>  }
>>>
>>>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
>>> @@ -1109,7 +1118,6 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>>>
>>>          for (i = 0; i < ma->max; i++) {
>>>                  struct sw_flow_mask *mask;
>>> -                unsigned int start;
>>>                  int cpu;
>>>
>>>                  mask = rcu_dereference_ovsl(ma->masks[i]);
>>> @@ -1120,14 +1128,16 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>>>                  masks_and_count[i].counter = 0;
>>>
>>>                  for_each_possible_cpu(cpu) {
>>> -                        u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>>> -                                                          cpu);
>>> +                        struct mask_array_stats *stats;
>>> +                        unsigned int start;
>>>                          u64 counter;
>>>
>>> +                        stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>>                          do {
>>> -                                start = u64_stats_fetch_begin_irq(&ma->syncp);
>>> -                                counter = usage_counters[i];
>>> -                        } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>>> +                                start = u64_stats_fetch_begin_irq(&stats->syncp);
>>> +                                counter = stats->usage_cntrs[i];
>>> +                        } while (u64_stats_fetch_retry_irq(&stats->syncp,
>>> +                                                           start));
>>>
>>>                          masks_and_count[i].counter += counter;
>>>                  }
>>> diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
>>> index 6e7d4ac59353..43144396e192 100644
>>> --- a/net/openvswitch/flow_table.h
>>> +++ b/net/openvswitch/flow_table.h
>>> @@ -38,12 +38,16 @@ struct mask_count {
>>>          u64 counter;
>>>  };
>>>
>>> +struct mask_array_stats {
>>> +        struct u64_stats_sync syncp;
>>> +        u64 usage_cntrs[];
>>> +};
>>> +
>>>  struct mask_array {
>>>          struct rcu_head rcu;
>>>          int count, max;
>>> -        u64 __percpu *masks_usage_cntr;
>>> +        struct mask_array_stats __percpu *masks_usage_stats;
>>>          u64 *masks_usage_zero_cntr;
>>> -        struct u64_stats_sync syncp;
>>>          struct sw_flow_mask __rcu *masks[];
>>>  };
>>>
>>>
>>> _______________________________________________
>>> dev mailing list
>>> d...@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>
>
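For illustration, a rough and untested sketch of where such an assertion could
sit; this is not part of the posted patch, the leading parameters of
flow_lookup() are reconstructed from the call site quoted above, and the
helper comes from <linux/lockdep.h> (it expands to nothing without
CONFIG_PROVE_LOCKING):

static struct sw_flow *flow_lookup(struct flow_table *tbl,
                                   struct table_instance *ti,
                                   struct mask_array *ma,
                                   const struct sw_flow_key *key,
                                   u32 *n_mask_hit,
                                   u32 *n_cache_hit,
                                   u32 *index)
{
        struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
        struct sw_flow *flow;
        struct sw_flow_mask *mask;
        int i;

        /* The per-CPU stats pointer above and the u64_stats syncp updates
         * below require the caller to run with BHs (and hence preemption)
         * disabled; assert that here so a preemptible caller is caught
         * during development, at no cost to the fast path otherwise.
         */
        lockdep_assert_preemption_disabled();

        /* ... existing mask cache / mask array lookup unchanged ... */
}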