On Tue, 2019-04-23 at 07:43 +0000, Vlad Buslov wrote:
> On Mon 22 Apr 2019 at 23:34, Saeed Mahameed <sae...@mellanox.com> wrote:
> > On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
> > > Recent changes that introduced unlocked flower did not properly
> > > account for the case when reoffload is initiated concurrently with
> > > filter updates. To fix the issue, extend flower with a 'hw_filters'
> > > list that is used to store filters that don't have the 'skip_hw'
> > > flag set. A filter is added to the list when it is inserted to
> > > hardware and only removed from it after being unoffloaded from all
> > > drivers that the parent block is attached to. This ensures that
> > > concurrent reoffload can still access a filter that is being
> > > deleted, and prevents a race condition where the driver callback
> > > can be removed while the filter is no longer accessible through
> > > idr, but is still present in hardware.
> > >
> > > Refactor fl_change() to respect the new filter reference counter
> > > and to release the filter reference with __fl_put() in case of
> > > error, instead of directly deallocating filter memory. This allows
> > > for concurrent access to the filter from fl_reoffload() and
> > > protects it with reference counting. Refactor fl_reoffload() to
> > > iterate over the hw_filters list instead of idr. Implement the
> > > fl_get_next_hw_filter() helper function that is used to iterate
> > > over the hw_filters list with reference counting and skips filters
> > > that are being concurrently deleted.
> > >
> > > Fixes: 92149190067d ("net: sched: flower: set unlocked flag for flower proto ops")
> > > Signed-off-by: Vlad Buslov <vla...@mellanox.com>
> > > Reported-by: Jakub Kicinski <jakub.kicin...@netronome.com>
> > > ---
> > >  net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
> > >  1 file changed, 57 insertions(+), 20 deletions(-)
> > >
> > > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> > > index 4b5585358699..524b86560af3 100644
> > > --- a/net/sched/cls_flower.c
> > > +++ b/net/sched/cls_flower.c
> > > @@ -90,6 +90,7 @@ struct cls_fl_head {
> > >  	struct rhashtable ht;
> > >  	spinlock_t masks_lock; /* Protect masks list */
> > >  	struct list_head masks;
> > > +	struct list_head hw_filters;
> > >  	struct rcu_work rwork;
> > >  	struct idr handle_idr;
> > >  };
> > > @@ -102,6 +103,7 @@ struct cls_fl_filter {
> > >  	struct tcf_result res;
> > >  	struct fl_flow_key key;
> > >  	struct list_head list;
> > > +	struct list_head hw_list;
> > >  	u32 handle;
> > >  	u32 flags;
> > >  	u32 in_hw_count;
> > > @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
> > >
> > >  	spin_lock_init(&head->masks_lock);
> > >  	INIT_LIST_HEAD_RCU(&head->masks);
> > > +	INIT_LIST_HEAD(&head->hw_filters);
> > >  	rcu_assign_pointer(tp->root, head);
> > >  	idr_init(&head->handle_idr);
> > >
> > > @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
> > >  	return true;
> > >  }
> > >
> > > +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
> > > +{
> > > +	/* Flower classifier only changes root pointer during init and destroy.
> > > +	 * Users must obtain reference to tcf_proto instance before calling its
> > > +	 * API, so tp->root pointer is protected from concurrent call to
> > > +	 * fl_destroy() by reference counting.
> > > +	 */
> > > +	return rcu_dereference_raw(tp->root);
> > > +}
> > > +
> > >  static void __fl_destroy_filter(struct cls_fl_filter *f)
> > >  {
> > >  	tcf_exts_destroy(&f->exts);
> > > @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
> > >
> > >  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
> > >  	spin_lock(&tp->lock);
> > > +	if (!list_empty(&f->hw_list))
> > > +		list_del_init(&f->hw_list);
> > >  	tcf_block_offload_dec(block, &f->flags);
> > >  	spin_unlock(&tp->lock);
> > >
> > > @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
> > >  				struct cls_fl_filter *f, bool rtnl_held,
> > >  				struct netlink_ext_ack *extack)
> > >  {
> > > +	struct cls_fl_head *head = fl_head_dereference(tp);
> > >  	struct tc_cls_flower_offload cls_flower = {};
> > >  	struct tcf_block *block = tp->chain->block;
> > >  	bool skip_sw = tc_skip_sw(f->flags);
> > > @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
> > >  		goto errout;
> > >  	}
> > >
> > > +	spin_lock(&tp->lock);
> > > +	list_add(&f->hw_list, &head->hw_filters);
> > > +	spin_unlock(&tp->lock);
> > >  errout:
> > >  	if (!rtnl_held)
> > >  		rtnl_unlock();
> > > @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
> > >  		rtnl_unlock();
> > >  }
> > >
> > > -static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
> > > -{
> > > -	/* Flower classifier only changes root pointer during init and destroy.
> > > -	 * Users must obtain reference to tcf_proto instance before calling its
> > > -	 * API, so tp->root pointer is protected from concurrent call to
> > > -	 * fl_destroy() by reference counting.
> > > -	 */
> > > -	return rcu_dereference_raw(tp->root);
> > > -}
> > > -
> > >  static void __fl_put(struct cls_fl_filter *f)
> > >  {
> > >  	if (!refcount_dec_and_test(&f->refcnt))
> > >  		return;
> > >
> > > -	WARN_ON(!f->deleted);
> > > -
> > >  	if (tcf_exts_get_net(&f->exts))
> > >  		tcf_queue_work(&f->rwork, fl_destroy_filter_work);
> > >  	else
> > > @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> > >  		err = -ENOBUFS;
> > >  		goto errout_tb;
> > >  	}
> > > +	INIT_LIST_HEAD(&fnew->hw_list);
> > >  	refcount_set(&fnew->refcnt, 1);
> > >
> > >  	err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
> > > @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> > >  		goto errout_hw;
> > >  	}
> > >
> > > -	refcount_inc(&fnew->refcnt);
> > >  	if (fold) {
> > >  		/* Fold filter was deleted concurrently. Retry lookup.
> > >  		 */
> > >  		if (fold->deleted) {
> > > @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> > >  			in_ht = true;
> > >  		}
> > >
> > > +		refcount_inc(&fnew->refcnt);
> > >  		rhashtable_remove_fast(&fold->mask->ht,
> > >  				       &fold->ht_node,
> > >  				       fold->mask->filter_ht_params);
> > > @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> > >  		if (err)
> > >  			goto errout_hw;
> > >
> > > +		refcount_inc(&fnew->refcnt);
> > >  		fnew->handle = handle;
> > >  		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
> > >  		spin_unlock(&tp->lock);
> > > @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> > >  	kfree(mask);
> > >  	return 0;
> > >
> > > +errout_ht:
> > > +	spin_lock(&tp->lock);
> > >  errout_hw:
> > > +	fnew->deleted = true;
> > >  	spin_unlock(&tp->lock);
> > >  	if (!tc_skip_hw(fnew->flags))
> > >  		fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
> > > -errout_ht:
> > >  	if (in_ht)
> > >  		rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
> > >  				       fnew->mask->filter_ht_params);
> > >  errout_mask:
> > >  	fl_mask_put(head, fnew->mask);
> > >  errout:
> > > -	tcf_exts_get_net(&fnew->exts);
> > > -	tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
> > > +	__fl_put(fnew);
> > >  errout_tb:
> > >  	kfree(tb);
> > >  errout_mask_alloc:
> > > @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
> > >  	}
> > >  }
> > >
> > > +static struct cls_fl_filter *
> > > +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
> > > +{
> > > +	struct cls_fl_head *head = fl_head_dereference(tp);
> > > +
> > > +	spin_lock(&tp->lock);
> > > +	if (!f) {
> > > +		if (list_empty(&head->hw_filters)) {
> > > +			spin_unlock(&tp->lock);
> > > +			return NULL;
> > > +		}
> > 
> > Shouldn't this be a pre-condition to the whole function? I mean,
> > regardless of whether 'f' is NULL or not?
> 
> The list can't be empty if we already have an element of the list (f),
> so why check this on every iteration?
> 

Because you release the lock on every iteration, you can't just assume
that the list is still intact.

> > > +
> > > +		f = list_first_entry(&head->hw_filters, struct cls_fl_filter,
> > > +				     hw_list);
> > > +	} else {
> > > +		f = list_next_entry(f, hw_list);
> > > +	}
> > > +
> > 
> > Maybe if you use list_for_each_entry_continue below, it might
> > simplify the above logic. It is weird that you need to figure out the
> > next entry and then call list_for_each_entry_from; the list
> > 'continue' variation is made for such use cases.
> 
> list_for_each_entry_continue requires an initialized cursor and will
> skip the first element if we obtain the initial cursor with
> list_first_entry(). We can have two loops - one that uses
> list_for_each_entry for the initial iteration, and another one that
> uses list_for_each_entry_continue for the case when f != NULL, but I
> don't see how that would be any simpler.
> 

You can set the initial cursor to the list head and not the first
entry. It would be simpler if you do:

	if (list_empty())
		return NULL;

	pos = f ? f : list->head;
	list_for_each_entry_continue(pos, ...);
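Concretely, the lookup could then collapse to something like this (an
untested sketch only; the one trick it adds is using list_entry() on
the list head itself as a dummy cursor, everything else reuses the
names from your patch):

	struct cls_fl_filter *pos;

	spin_lock(&tp->lock);
	if (list_empty(&head->hw_filters)) {
		spin_unlock(&tp->lock);
		return NULL;
	}

	/* When f is NULL, a dummy cursor made from the list head makes
	 * the 'continue' variant start at the first real entry.
	 */
	pos = f ? f : list_entry(&head->hw_filters, struct cls_fl_filter,
				 hw_list);
	list_for_each_entry_continue(pos, &head->hw_filters, hw_list) {
		if (!(add && pos->deleted) &&
		    refcount_inc_not_zero(&pos->refcnt)) {
			spin_unlock(&tp->lock);
			return pos;
		}
	}
	spin_unlock(&tp->lock);

	return NULL;

This also makes the list_empty() check unconditional, which addresses
the point above.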
> > > +	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
> > > +		if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
> > > +			spin_unlock(&tp->lock);
> > > +			return f;
> > > +		}
> > > +	}
> > > +
> > > +	spin_unlock(&tp->lock);
> > > +	return NULL;
> > > +}
> > > +
> > >  static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
> > >  			void *cb_priv, struct netlink_ext_ack *extack)
> > >  {
> > >  	struct tc_cls_flower_offload cls_flower = {};
> > >  	struct tcf_block *block = tp->chain->block;
> > > -	unsigned long handle = 0;
> > > -	struct cls_fl_filter *f;
> > > +	struct cls_fl_filter *f = NULL;
> > >  	int err;
> > >
> > > -	while ((f = fl_get_next_filter(tp, &handle))) {
> > > +	while ((f = fl_get_next_hw_filter(tp, f, add))) {
> > >  		if (tc_skip_hw(f->flags))
> > >  			goto next_flow;
> > 
> > This can never be true, as it is already a pre-condition for
> > fl_hw_replace_filter, which actually adds to the hw_filters list, so
> > I think it needs to be removed. If it is kept at all, then I think it
> > should be part of fl_get_next_hw_filter and not the caller's
> > responsibility.
> 
> Good catch.
> 
> > > @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
> > >  						 add);
> > >  			spin_unlock(&tp->lock);
> > >  next_flow:
> > > -		handle++;
> > >  		__fl_put(f);
> > >  	}
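If you do decide to keep the check, folding it into the helper's loop
could look along these lines (again an untested sketch on top of the
code above):

	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
		/* Filters with skip_hw set are never added to
		 * hw_filters by fl_hw_replace_filter(), so this
		 * check is purely defensive here.
		 */
		if (!(add && f->deleted) && !tc_skip_hw(f->flags) &&
		    refcount_inc_not_zero(&f->refcnt)) {
			spin_unlock(&tp->lock);
			return f;
		}
	}

That way fl_reoffload() stays a plain consumer of whatever the helper
hands back.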