On Mon 22 Apr 2019 at 23:34, Saeed Mahameed <sae...@mellanox.com> wrote:
> On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
>> Recent changes that introduced unlocked flower did not properly account
>> for the case when reoffload is initiated concurrently with filter
>> updates. To fix the issue, extend flower with a 'hw_filters' list that
>> is used to store filters that don't have the 'skip_hw' flag set. A
>> filter is added to the list when it is inserted to hardware and only
>> removed from it after being unoffloaded from all drivers that the
>> parent block is attached to. This ensures that concurrent reoffload can
>> still access a filter that is being deleted and prevents a race
>> condition where the driver callback can be removed while the filter is
>> no longer accessible through idr, but is still present in hardware.
>>
>> Refactor fl_change() to respect the new filter reference counter and to
>> release the filter reference with __fl_put() in case of error, instead
>> of directly deallocating filter memory. This allows for concurrent
>> access to the filter from fl_reoffload() and protects it with reference
>> counting. Refactor fl_reoffload() to iterate over the hw_filters list
>> instead of the idr. Implement the fl_get_next_hw_filter() helper
>> function that is used to iterate over the hw_filters list with
>> reference counting and that skips filters that are being concurrently
>> deleted.
>>
>> Fixes: 92149190067d ("net: sched: flower: set unlocked flag for flower proto ops")
>> Signed-off-by: Vlad Buslov <vla...@mellanox.com>
>> Reported-by: Jakub Kicinski <jakub.kicin...@netronome.com>
>> ---
>>  net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++-----------
>>  1 file changed, 57 insertions(+), 20 deletions(-)
>>
>> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
>> index 4b5585358699..524b86560af3 100644
>> --- a/net/sched/cls_flower.c
>> +++ b/net/sched/cls_flower.c
>> @@ -90,6 +90,7 @@ struct cls_fl_head {
>>  	struct rhashtable ht;
>>  	spinlock_t masks_lock; /* Protect masks list */
>>  	struct list_head masks;
>> +	struct list_head hw_filters;
>>  	struct rcu_work rwork;
>>  	struct idr handle_idr;
>>  };
>> @@ -102,6 +103,7 @@ struct cls_fl_filter {
>>  	struct tcf_result res;
>>  	struct fl_flow_key key;
>>  	struct list_head list;
>> +	struct list_head hw_list;
>>  	u32 handle;
>>  	u32 flags;
>>  	u32 in_hw_count;
>> @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
>>
>>  	spin_lock_init(&head->masks_lock);
>>  	INIT_LIST_HEAD_RCU(&head->masks);
>> +	INIT_LIST_HEAD(&head->hw_filters);
>>  	rcu_assign_pointer(tp->root, head);
>>  	idr_init(&head->handle_idr);
>>
>> @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
>>  	return true;
>>  }
>>
>> +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
>> +{
>> +	/* Flower classifier only changes root pointer during init and destroy.
>> +	 * Users must obtain reference to tcf_proto instance before calling its
>> +	 * API, so tp->root pointer is protected from concurrent call to
>> +	 * fl_destroy() by reference counting.
>> +	 */
>> +	return rcu_dereference_raw(tp->root);
>> +}
>> +
>>  static void __fl_destroy_filter(struct cls_fl_filter *f)
>>  {
>>  	tcf_exts_destroy(&f->exts);
>> @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
>>
>>  	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
>>  	spin_lock(&tp->lock);
>> +	if (!list_empty(&f->hw_list))
>> +		list_del_init(&f->hw_list);
>>  	tcf_block_offload_dec(block, &f->flags);
>>  	spin_unlock(&tp->lock);
>>
>> @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
>>  				struct cls_fl_filter *f, bool rtnl_held,
>>  				struct netlink_ext_ack *extack)
>>  {
>> +	struct cls_fl_head *head = fl_head_dereference(tp);
>>  	struct tc_cls_flower_offload cls_flower = {};
>>  	struct tcf_block *block = tp->chain->block;
>>  	bool skip_sw = tc_skip_sw(f->flags);
>> @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
>>  		goto errout;
>>  	}
>>
>> +	spin_lock(&tp->lock);
>> +	list_add(&f->hw_list, &head->hw_filters);
>> +	spin_unlock(&tp->lock);
>>  errout:
>>  	if (!rtnl_held)
>>  		rtnl_unlock();
>> @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
>>  	rtnl_unlock();
>>  }
>>
>> -static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
>> -{
>> -	/* Flower classifier only changes root pointer during init and destroy.
>> -	 * Users must obtain reference to tcf_proto instance before calling its
>> -	 * API, so tp->root pointer is protected from concurrent call to
>> -	 * fl_destroy() by reference counting.
>> -	 */
>> -	return rcu_dereference_raw(tp->root);
>> -}
>> -
>>  static void __fl_put(struct cls_fl_filter *f)
>>  {
>>  	if (!refcount_dec_and_test(&f->refcnt))
>>  		return;
>>
>> -	WARN_ON(!f->deleted);
>> -
>>  	if (tcf_exts_get_net(&f->exts))
>>  		tcf_queue_work(&f->rwork, fl_destroy_filter_work);
>>  	else
>> @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>>  		err = -ENOBUFS;
>>  		goto errout_tb;
>>  	}
>> +	INIT_LIST_HEAD(&fnew->hw_list);
>>  	refcount_set(&fnew->refcnt, 1);
>>
>>  	err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
>> @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>>  		goto errout_hw;
>>  	}
>>
>> -	refcount_inc(&fnew->refcnt);
>>  	if (fold) {
>>  		/* Fold filter was deleted concurrently. Retry lookup. */
>>  		if (fold->deleted) {
>> @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>>  			in_ht = true;
>>  		}
>>
>> +		refcount_inc(&fnew->refcnt);
>>  		rhashtable_remove_fast(&fold->mask->ht,
>>  				       &fold->ht_node,
>>  				       fold->mask->filter_ht_params);
>> @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>>  		if (err)
>>  			goto errout_hw;
>>
>> +		refcount_inc(&fnew->refcnt);
>>  		fnew->handle = handle;
>>  		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
>>  		spin_unlock(&tp->lock);
>> @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>>  	kfree(mask);
>>  	return 0;
>>
>> +errout_ht:
>> +	spin_lock(&tp->lock);
>>  errout_hw:
>> +	fnew->deleted = true;
>>  	spin_unlock(&tp->lock);
>>  	if (!tc_skip_hw(fnew->flags))
>>  		fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
>> -errout_ht:
>>  	if (in_ht)
>>  		rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
>>  				       fnew->mask->filter_ht_params);
>>  errout_mask:
>>  	fl_mask_put(head, fnew->mask);
>>  errout:
>> -	tcf_exts_get_net(&fnew->exts);
>> -	tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
>> +	__fl_put(fnew);
>>  errout_tb:
>>  	kfree(tb);
>>  errout_mask_alloc:
>> @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
>>  	}
>>  }
>>
>> +static struct cls_fl_filter *
>> +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
>> +{
>> +	struct cls_fl_head *head = fl_head_dereference(tp);
>> +
>> +	spin_lock(&tp->lock);
>> +	if (!f) {
>> +		if (list_empty(&head->hw_filters)) {
>> +			spin_unlock(&tp->lock);
>> +			return NULL;
>> +		}
>
> Shouldn't this be a pre-condition to the whole function? I mean,
> regardless of whether 'f' is NULL or not?
The list can't be empty if we already have an element of it (f), so why
check this on every iteration?

>
>> +
>> +		f = list_first_entry(&head->hw_filters, struct cls_fl_filter,
>> +				     hw_list);
>> +	} else {
>> +		f = list_next_entry(f, hw_list);
>> +	}
>> +
>
> Maybe if you use list_for_each_entry_continue below, it might simplify
> the above logic. It is weird that you need to figure out the next entry
> and then call list_for_each_entry_from; the list 'continue' variation
> is made for such use cases.

list_for_each_entry_continue() requires an initialized cursor and will
skip the first element if we obtain the initial cursor with
list_first_entry(). We could have two loops - one that uses
list_for_each_entry() for the initial iteration, and another that uses
list_for_each_entry_continue() for the case when f != NULL - but I
don't see how that would be any simpler.

>
>> +	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
>> +		if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
>> +			spin_unlock(&tp->lock);
>> +			return f;
>> +		}
>> +	}
>> +
>> +	spin_unlock(&tp->lock);
>> +	return NULL;
>> +}
>> +
>>  static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>>  			void *cb_priv, struct netlink_ext_ack *extack)
>>  {
>>  	struct tc_cls_flower_offload cls_flower = {};
>>  	struct tcf_block *block = tp->chain->block;
>> -	unsigned long handle = 0;
>> -	struct cls_fl_filter *f;
>> +	struct cls_fl_filter *f = NULL;
>>  	int err;
>>
>> -	while ((f = fl_get_next_filter(tp, &handle))) {
>> +	while ((f = fl_get_next_hw_filter(tp, f, add))) {
>>  		if (tc_skip_hw(f->flags))
>>  			goto next_flow;
>
> This can never be true as it is already a pre-condition for
> fl_hw_replace_filter(), which actually adds to the hw_filters list, so
> I think it needs to be removed. If it is, then I think it should be
> part of fl_get_next_hw_filter() and not the caller's responsibility.

Good catch.

>
>>
>> @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>>  						  add);
>>  		spin_unlock(&tp->lock);
>>  next_flow:
>> -		handle++;
>>  		__fl_put(f);
>>
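For concreteness, a rough and untested sketch of how the helper could
absorb the skip_hw check instead of leaving it to the caller (all names
are taken from the patch quoted above; this only illustrates the
suggestion, it is not the actual follow-up change):

static struct cls_fl_filter *
fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
{
	struct cls_fl_head *head = fl_head_dereference(tp);

	spin_lock(&tp->lock);
	if (!f) {
		if (list_empty(&head->hw_filters)) {
			spin_unlock(&tp->lock);
			return NULL;
		}

		f = list_first_entry(&head->hw_filters, struct cls_fl_filter,
				     hw_list);
	} else {
		f = list_next_entry(f, hw_list);
	}

	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
		/* skip_hw filters are never added to hw_filters, but keeping
		 * the check inside the helper means fl_reoffload() does not
		 * have to repeat it.
		 */
		if (tc_skip_hw(f->flags))
			continue;
		if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
			spin_unlock(&tp->lock);
			return f;
		}
	}

	spin_unlock(&tp->lock);
	return NULL;
}

With something like that, the "if (tc_skip_hw(f->flags)) goto next_flow;"
check in fl_reoffload() could simply be dropped.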