On Fri, Aug 7, 2020 at 8:33 AM Johan Knöös <jkn...@google.com> wrote: > > On Tue, Aug 4, 2020 at 8:52 AM Gregory Rose <gvrose8...@gmail.com> wrote: > > > > > > > > On 8/3/2020 12:01 PM, Johan Knöös via discuss wrote: > > > Hi Open vSwitch contributors, > > > > > > We have found openvswitch is causing double-freeing of memory. The > > > issue was not present in kernel version 5.5.17 but is present in > > > 5.6.14 and newer kernels. > > > > > > After reverting the RCU commits below for debugging, enabling > > > slub_debug, lockdep, and KASAN, we see the warnings at the end of this > > > email in the kernel log (the last one shows the double-free). When I > > > revert 50b0e61b32ee890a75b4377d5fbe770a86d6a4c1 ("net: openvswitch: > > > fix possible memleak on destroy flow-table"), the symptoms disappear. > > > While I have a reliable way to reproduce the issue, I unfortunately > > > don't yet have a process that's amenable to sharing. Please take a > > > look. > > > > > > 189a6883dcf7 rcu: Remove kfree_call_rcu_nobatch() > > > 77a40f97030b rcu: Remove kfree_rcu() special casing and lazy-callback > > > handling > > > e99637becb2e rcu: Add support for debug_objects debugging for kfree_rcu() > > > 0392bebebf26 rcu: Add multiple in-flight batches of kfree_rcu() work > > > 569d767087ef rcu: Make kfree_rcu() use a non-atomic ->monitor_todo > > > a35d16905efc rcu: Add basic support for kfree_rcu() batching > > > > > > Thanks, > > > Johan Knöös > > > > Let's add the author of the patch you reverted and the Linux netdev > > mailing list. > > > > - Greg > > I found we also sometimes get warnings from > https://elixir.bootlin.com/linux/v5.5.17/source/kernel/rcu/tree.c#L2239 > under similar conditions even on kernel 5.5.17, which I believe may be > related. However, it's much rarer and I don't have a reliable way of > reproducing it. Perhaps 50b0e61b32ee890a75b4377d5fbe770a86d6a4c1 only > increases the frequency of a pre-existing bug.
It seems clear we have a double free on table->mask_array when the reallocation is triggered on the destroy path. Are you able to test the attached patch (compile tested only)? Also note: it is generated against the latest net tree, it may not be applied cleanly to any earlier stable release. Thanks!
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 8c12675cbb67..cc7859db445a 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -294,7 +294,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl, } static void tbl_mask_array_del_mask(struct flow_table *tbl, - struct sw_flow_mask *mask) + struct sw_flow_mask *mask, bool destroy) { struct mask_array *ma = ovsl_dereference(tbl->mask_array); int i, ma_count = READ_ONCE(ma->count); @@ -314,6 +314,11 @@ static void tbl_mask_array_del_mask(struct flow_table *tbl, rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]); RCU_INIT_POINTER(ma->masks[ma_count -1], NULL); + if (destroy) { + kfree(mask); + return; + } + kfree_rcu(mask, rcu); /* Shrink the mask array if necessary. */ @@ -326,7 +331,8 @@ static void tbl_mask_array_del_mask(struct flow_table *tbl, } /* Remove 'mask' from the mask list, if it is not needed any more. */ -static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask, + bool destroy) { if (mask) { /* ovs-lock is required to protect mask-refcount and @@ -337,7 +343,7 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) mask->ref_count--; if (!mask->ref_count) - tbl_mask_array_del_mask(tbl, mask); + tbl_mask_array_del_mask(tbl, mask, destroy); } } @@ -470,7 +476,7 @@ static void table_instance_flow_free(struct flow_table *table, table->ufid_count--; } - flow_mask_remove(table, flow->mask); + flow_mask_remove(table, flow->mask, !count); } static void table_instance_destroy(struct flow_table *table, @@ -521,9 +527,9 @@ void ovs_flow_tbl_destroy(struct flow_table *table) struct mask_cache *mc = rcu_dereference_raw(table->mask_cache); struct mask_array *ma = rcu_dereference_raw(table->mask_array); + table_instance_destroy(table, ti, ufid_ti, false); call_rcu(&mc->rcu, mask_cache_rcu_cb); call_rcu(&ma->rcu, mask_array_rcu_cb); - table_instance_destroy(table, ti, ufid_ti, false); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,