On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
> Recent changes that introduced unlocked flower did not properly
> account for case when reoffload is initiated concurrently with filter
> updates. To fix the issue, extend flower with 'hw_filters' list that
> is used to store filters that don't have 'skip_hw' flag set. Filter
> is added to the list when it is inserted to hardware and only removed
> from it after being unoffloaded from all drivers that parent block is
> attached to. This ensures that concurrent reoffload can still access
> filter that is being deleted and prevents race condition when driver
> callback can be removed when filter is no longer accessible through
> idr, but is still present in hardware.
>
> Refactor fl_change() to respect new filter reference counter and to
> release filter reference with __fl_put() in case of error, instead of
> directly deallocating filter memory. This allows for concurrent
> access to filter from fl_reoffload() and protects it with reference
> counting. Refactor fl_reoffload() to iterate over hw_filters list
> instead of idr. Implement fl_get_next_hw_filter() helper function
> that is used to iterate over hw_filters list with reference counting
> and skips filters that are being concurrently deleted.
>
> Fixes: 92149190067d ("net: sched: flower: set unlocked flag for flower proto ops")
> Signed-off-by: Vlad Buslov <vla...@mellanox.com>
> Reported-by: Jakub Kicinski <jakub.kicin...@netronome.com>
> ---
>  net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 57 insertions(+), 20 deletions(-)
>
> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> index 4b5585358699..524b86560af3 100644
> --- a/net/sched/cls_flower.c
> +++ b/net/sched/cls_flower.c
> @@ -90,6 +90,7 @@ struct cls_fl_head {
>  	struct rhashtable ht;
>  	spinlock_t masks_lock; /* Protect masks list */
>  	struct list_head masks;
> +	struct list_head hw_filters;
>  	struct rcu_work rwork;
>  	struct idr handle_idr;
>  };
> @@ -102,6 +103,7 @@ struct cls_fl_filter {
>  	struct tcf_result res;
>  	struct fl_flow_key key;
>  	struct list_head list;
> +	struct list_head hw_list;
>  	u32 handle;
>  	u32 flags;
>  	u32 in_hw_count;
> @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
>
>  	spin_lock_init(&head->masks_lock);
>  	INIT_LIST_HEAD_RCU(&head->masks);
> +	INIT_LIST_HEAD(&head->hw_filters);
>  	rcu_assign_pointer(tp->root, head);
>  	idr_init(&head->handle_idr);
>
> @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
>  	return true;
>  }
>
> +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
> +{
> +	/* Flower classifier only changes root pointer during init and destroy.
> +	 * Users must obtain reference to tcf_proto instance before calling its
> +	 * API, so tp->root pointer is protected from concurrent call to
> +	 * fl_destroy() by reference counting.
> +	 */
> +	return rcu_dereference_raw(tp->root);
> +}
> +
>  static void __fl_destroy_filter(struct cls_fl_filter *f)
>  {
>  	tcf_exts_destroy(&f->exts);
> @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
>
>  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
>  	spin_lock(&tp->lock);
> +	if (!list_empty(&f->hw_list))
> +		list_del_init(&f->hw_list);
>  	tcf_block_offload_dec(block, &f->flags);
>  	spin_unlock(&tp->lock);
>
> @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
>  				struct cls_fl_filter *f, bool rtnl_held,
>  				struct netlink_ext_ack *extack)
>  {
> +	struct cls_fl_head *head = fl_head_dereference(tp);
>  	struct tc_cls_flower_offload cls_flower = {};
>  	struct tcf_block *block = tp->chain->block;
>  	bool skip_sw = tc_skip_sw(f->flags);
> @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
>  		goto errout;
>  	}
>
> +	spin_lock(&tp->lock);
> +	list_add(&f->hw_list, &head->hw_filters);
> +	spin_unlock(&tp->lock);
>  errout:
>  	if (!rtnl_held)
>  		rtnl_unlock();
> @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
>  		rtnl_unlock();
>  }
>
> -static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
> -{
> -	/* Flower classifier only changes root pointer during init and destroy.
> -	 * Users must obtain reference to tcf_proto instance before calling its
> -	 * API, so tp->root pointer is protected from concurrent call to
> -	 * fl_destroy() by reference counting.
> -	 */
> -	return rcu_dereference_raw(tp->root);
> -}
> -
>  static void __fl_put(struct cls_fl_filter *f)
>  {
>  	if (!refcount_dec_and_test(&f->refcnt))
>  		return;
>
> -	WARN_ON(!f->deleted);
> -
>  	if (tcf_exts_get_net(&f->exts))
>  		tcf_queue_work(&f->rwork, fl_destroy_filter_work);
>  	else
> @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  		err = -ENOBUFS;
>  		goto errout_tb;
>  	}
> +	INIT_LIST_HEAD(&fnew->hw_list);
>  	refcount_set(&fnew->refcnt, 1);
>
>  	err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
> @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  		goto errout_hw;
>  	}
>
> -	refcount_inc(&fnew->refcnt);
>  	if (fold) {
>  		/* Fold filter was deleted concurrently. Retry lookup.
>  		 */
>  		if (fold->deleted) {
> @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  			in_ht = true;
>  		}
>
> +		refcount_inc(&fnew->refcnt);
>  		rhashtable_remove_fast(&fold->mask->ht,
>  				       &fold->ht_node,
>  				       fold->mask->filter_ht_params);
> @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  		if (err)
>  			goto errout_hw;
>
> +		refcount_inc(&fnew->refcnt);
>  		fnew->handle = handle;
>  		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
>  		spin_unlock(&tp->lock);
> @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  	kfree(mask);
>  	return 0;
>
> +errout_ht:
> +	spin_lock(&tp->lock);
>  errout_hw:
> +	fnew->deleted = true;
>  	spin_unlock(&tp->lock);
>  	if (!tc_skip_hw(fnew->flags))
>  		fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
> -errout_ht:
>  	if (in_ht)
>  		rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
>  				       fnew->mask->filter_ht_params);
>  errout_mask:
>  	fl_mask_put(head, fnew->mask);
>  errout:
> -	tcf_exts_get_net(&fnew->exts);
> -	tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
> +	__fl_put(fnew);
>  errout_tb:
>  	kfree(tb);
>  errout_mask_alloc:
> @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
>  	}
>  }
>
> +static struct cls_fl_filter *
> +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
> +{
> +	struct cls_fl_head *head = fl_head_dereference(tp);
> +
> +	spin_lock(&tp->lock);
> +	if (!f) {
> +		if (list_empty(&head->hw_filters)) {
> +			spin_unlock(&tp->lock);
> +			return NULL;
> +		}
Shouldn't this list_empty() check be a precondition for the whole
function, i.e. apply regardless of whether 'f' is NULL?

> +
> +		f = list_first_entry(&head->hw_filters, struct cls_fl_filter,
> +				     hw_list);
> +	} else {
> +		f = list_next_entry(f, hw_list);
> +	}
>

Maybe if you use list_for_each_entry_continue() below it would
simplify the logic above. It is weird to have to compute the next
entry by hand and then call list_for_each_entry_from(); the list
'continue' variation is made exactly for such use cases. See the
sketch at the end of this mail.

> +	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
> +		if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
> +			spin_unlock(&tp->lock);
> +			return f;
> +		}
> +	}
> +
> +	spin_unlock(&tp->lock);
> +	return NULL;
> +}
> +
>  static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>  			void *cb_priv, struct netlink_ext_ack *extack)
>  {
>  	struct tc_cls_flower_offload cls_flower = {};
>  	struct tcf_block *block = tp->chain->block;
> -	unsigned long handle = 0;
> -	struct cls_fl_filter *f;
> +	struct cls_fl_filter *f = NULL;
>  	int err;
>
> -	while ((f = fl_get_next_filter(tp, &handle))) {
> +	while ((f = fl_get_next_hw_filter(tp, f, add))) {
>  		if (tc_skip_hw(f->flags))
>  			goto next_flow;

This can never be true: !tc_skip_hw() is already a precondition for
fl_hw_replace_filter(), which is what adds filters to the hw_filters
list, so I think the check needs to be removed. And if it is kept, it
should be part of fl_get_next_hw_filter() and not the caller's
responsibility.

>
> @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>  					   add);
>  		spin_unlock(&tp->lock);
> next_flow:
> -		handle++;
>  		__fl_put(f);
>  	}
>
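To illustrate the two points above, something like this is roughly
what I had in mind (completely untested, just a sketch on top of your
patch): list_prepare_entry() gives a valid cursor whether or not 'f'
is NULL, list_for_each_entry_continue() then walks from the following
entry, and the empty-list special case disappears because the loop
body simply never runs. The skip_hw check also becomes unnecessary in
the caller, since only filters that passed !tc_skip_hw() in
fl_hw_replace_filter() are ever added to hw_filters:

static struct cls_fl_filter *
fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
{
	struct cls_fl_head *head = fl_head_dereference(tp);

	spin_lock(&tp->lock);
	/* If f is NULL start from the list head, otherwise continue from
	 * the entry following f; list_prepare_entry() covers both cases.
	 */
	f = list_prepare_entry(f, &head->hw_filters, hw_list);
	list_for_each_entry_continue(f, &head->hw_filters, hw_list) {
		/* Skip filters that are concurrently being deleted on the
		 * add path, and take a reference on the one we return.
		 */
		if (!(add && f->deleted) &&
		    refcount_inc_not_zero(&f->refcnt)) {
			spin_unlock(&tp->lock);
			return f;
		}
	}
	spin_unlock(&tp->lock);

	return NULL;
}

With that, fl_reoffload() keeps its
while ((f = fl_get_next_hw_filter(tp, f, add))) loop unchanged and can
drop the tc_skip_hw() test entirely.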