On Wed, Apr 3, 2019 at 5:42 PM Vlad Buslov <vla...@mellanox.com> wrote:
>
>
> On Wed 03 Apr 2019 at 15:37, John Hurley <john.hur...@netronome.com> wrote:
> > Recent refactoring of fl_change aims to use the classifier spinlock to
> > avoid the need for rtnl lock. In doing so, the fl_hw_replace_filter()
> > function was moved to before the lock is taken. This can create problems
> > for drivers if duplicate filters are created (common in ovs tc offload
> > due to filters being triggered by user-space matches).
> >
> > Drivers registered for such filters will now receive multiple copies of
> > the same rule, each with a different cookie value. This means that the
> > drivers would need to do a full match field lookup to determine
> > duplicates, repeating work that will happen in flower __fl_lookup().
> > Currently, drivers do not expect to receive duplicate filters.
> >
> > Fix this by moving the fl_hw_replace_filter() call to after the software
> > filter processing. This way, offload messages are only triggered after
> > they are verified. Add a new flag to each filter that indicates that it
> > is being sent to hw. This prevents the flow from being deleted before
> > processing is finished, even if the tp lock is released.
> >
> > NOTE:
> > There may still be a race condition here with the reoffload function. When
> > the __SENDING_TO_HW bit is set we do not know if the filter has actually
> > been sent to HW or not at the time of reoffload. This means we could
> > potentially not offload a valid flow here (or not delete one).
> >
> > Fixes: 620da4860827 ("net: sched: flower: refactor fl_change")
> > Signed-off-by: John Hurley <john.hur...@netronome.com>
> > Reviewed-by: Simon Horman <simon.hor...@netronome.com>
> > Acked-by: Jakub Kicinski <jakub.kicin...@netronome.com>
> > ---
> >  net/sched/cls_flower.c | 110 ++++++++++++++++++++++++++++++++++++++++---------
> >  1 file changed, 90 insertions(+), 20 deletions(-)
> >
> > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> > index 0638f17..e2c144f 100644
> > --- a/net/sched/cls_flower.c
> > +++ b/net/sched/cls_flower.c
> > @@ -94,6 +94,10 @@ struct cls_fl_head {
> >  	struct idr handle_idr;
> >  };
> >
> > +enum cls_fl_filter_state_t {
> > +	__SENDING_TO_HW,
> > +};
> > +
> >  struct cls_fl_filter {
> >  	struct fl_flow_mask *mask;
> >  	struct rhash_head ht_node;
> > @@ -113,6 +117,7 @@ struct cls_fl_filter {
> >  	 */
> >  	refcount_t refcnt;
> >  	bool deleted;
> > +	unsigned long atomic_flags;
> >  };
> >
> >  static const struct rhashtable_params mask_ht_params = {
> > @@ -542,12 +547,18 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
> >
> >  	*last = false;
> >
> > +replay:
> >  	spin_lock(&tp->lock);
> >  	if (f->deleted) {
> >  		spin_unlock(&tp->lock);
> >  		return -ENOENT;
> >  	}
> >
> > +	if (test_bit(__SENDING_TO_HW, &f->atomic_flags)) {
> > +		spin_unlock(&tp->lock);
> > +		goto replay;
> > +	}
> > +
> >  	f->deleted = true;
> >  	rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
> >  			       f->mask->filter_ht_params);
> > @@ -1528,15 +1539,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >  	if (err)
> >  		goto errout;
> >
> > -	if (!tc_skip_hw(fnew->flags)) {
> > -		err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
> > -		if (err)
> > -			goto errout_mask;
> > -	}
> > -
> > -	if (!tc_in_hw(fnew->flags))
> > -		fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
> > -
> >  	spin_lock(&tp->lock);
> >
> >  	/* tp was deleted concurrently. -EAGAIN will cause caller to lookup
> > @@ -1544,15 +1546,18 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >  	 */
> >  	if (tp->deleting) {
> >  		err = -EAGAIN;
> > -		goto errout_hw;
> > +		goto errout_mask;
> >  	}
> >
> >  	refcount_inc(&fnew->refcnt);
> >  	if (fold) {
> > -		/* Fold filter was deleted concurrently. Retry lookup. */
> > -		if (fold->deleted) {
> > +		/* Fold filter was deleted or is being added to hw concurrently.
> > +		 * Retry lookup.
> > +		 */
> > +		if (fold->deleted ||
> > +		    test_bit(__SENDING_TO_HW, &fold->atomic_flags)) {
> >  			err = -EAGAIN;
> > -			goto errout_hw;
> > +			goto errout_mask;
> >  		}
> >
> >  		fnew->handle = handle;
> > @@ -1560,7 +1565,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >  		err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
> >  					     fnew->mask->filter_ht_params);
> >  		if (err)
> > -			goto errout_hw;
> > +			goto errout_mask;
> >
> >  		rhashtable_remove_fast(&fold->mask->ht,
> >  				       &fold->ht_node,
> > @@ -1568,9 +1573,23 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >  		idr_replace(&head->handle_idr, fnew, fnew->handle);
> >  		list_replace_rcu(&fold->list, &fnew->list);
> >  		fold->deleted = true;
> > +		if (!tc_skip_hw(fnew->flags))
> > +			set_bit(__SENDING_TO_HW, &fnew->atomic_flags);
> >
> >  		spin_unlock(&tp->lock);
> >
> > +		if (!tc_skip_hw(fnew->flags)) {
> > +			err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
> > +			if (err) {
> > +				spin_lock(&tp->lock);
> > +				goto errout_hw_replace;
> > +			}
> > +		}
> > +
> > +		if (!tc_in_hw(fnew->flags))
> > +			fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
> > +
> > +		clear_bit(__SENDING_TO_HW, &fnew->atomic_flags);
> >  		fl_mask_put(head, fold->mask, true);
> >  		if (!tc_skip_hw(fold->flags))
> >  			fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
> > @@ -1584,7 +1603,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >  	} else {
> >  		if (__fl_lookup(fnew->mask, &fnew->mkey)) {
> >  			err = -EEXIST;
> > -			goto errout_hw;
> > +			goto errout_mask;
> >  		}
> >
> >  		if (handle) {
> > @@ -1606,7 +1625,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >  					    INT_MAX, GFP_ATOMIC);
> >  		}
> >  		if (err)
> > -			goto errout_hw;
> > +			goto errout_mask;
> >
> >  		fnew->handle = handle;
> >
> > @@ -1616,7 +1635,22 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >  			goto errout_idr;
> >
> >  		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
> > +		if (!tc_skip_hw(fnew->flags))
> > +			set_bit(__SENDING_TO_HW, &fnew->atomic_flags);
> > +
> >  		spin_unlock(&tp->lock);
> > +
> > +		if (!tc_skip_hw(fnew->flags)) {
> > +			err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
> > +			if (err) {
> > +				spin_lock(&tp->lock);
> > +				goto errout_rhash;
> > +			}
> > +		}
> > +
> > +		if (!tc_in_hw(fnew->flags))
> > +			fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
> > +		clear_bit(__SENDING_TO_HW, &fnew->atomic_flags);
> >  	}
> >
> >  	*arg = fnew;
> > @@ -1625,13 +1659,37 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >  	kfree(mask);
> >  	return 0;
> >
> > +errout_hw_replace:
> > +	if (rhashtable_insert_fast(&fold->mask->ht, &fold->ht_node,
> > +				   fold->mask->filter_ht_params)) {
> > +		NL_SET_ERR_MSG(extack, "Filter replace failed and could not revert.");
> > +		fl_mask_put(head, fold->mask, true);
> > +		if (!tc_skip_hw(fold->flags))
> > +			fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
> > +		tcf_unbind_filter(tp, &fold->res);
> > +		tcf_exts_get_net(&fold->exts);
> > +		refcount_dec(&fold->refcnt);
> > +		__fl_put(fold);
> > +	} else {
> > +		idr_replace(&head->handle_idr, fold, fold->handle);
> > +		list_replace_rcu(&fnew->list, &fold->list);
> > +		fold->deleted = false;
> > +	}
> > +errout_rhash:
> > +	if (fnew->deleted) {
> > +		spin_unlock(&tp->lock);
> > +		kfree(tb);
> > +		kfree(mask);
> > +		return err;
> > +	}
> > +	list_del_rcu(&fnew->list);
> > +	rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
> > +			       fnew->mask->filter_ht_params);
> > +	fnew->deleted = true;
> >  errout_idr:
> >  	idr_remove(&head->handle_idr, fnew->handle);
> > -errout_hw:
> > -	spin_unlock(&tp->lock);
> > -	if (!tc_skip_hw(fnew->flags))
> > -		fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
> >  errout_mask:
> > +	spin_unlock(&tp->lock);
> >  	fl_mask_put(head, fnew->mask, true);
> >  errout:
> >  	tcf_exts_destroy(&fnew->exts);
> > @@ -1669,6 +1727,12 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
> >  	arg->count = arg->skip;
> >
> >  	while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) {
> > +		if (test_bit(__SENDING_TO_HW, &f->atomic_flags)) {
> > +			__fl_put(f);
> > +			arg->cookie++;
> > +			continue;
> > +		}
> > +
> >  		if (arg->fn(tp, f, arg) < 0) {
> >  			__fl_put(f);
> >  			arg->stop = 1;
> > @@ -1695,6 +1759,9 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
> >  		if (tc_skip_hw(f->flags))
> >  			continue;
> >
> > +		if (test_bit(__SENDING_TO_HW, &f->atomic_flags))
> > +			continue;
> > +
> >  		cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts));
> >  		if (!cls_flower.rule)
> > @@ -2273,6 +2340,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
> >  	if (!f)
> >  		return skb->len;
> >
> > +	if (test_bit(__SENDING_TO_HW, &f->atomic_flags))
> > +		return skb->len;
> > +
> >  	t->tcm_handle = f->handle;
> >
> >  	nest = nla_nest_start(skb, TCA_OPTIONS);
>
> Hi John,
>
> I understand the problem that you are trying to fix, but I intentionally
> made fl_change() offload filters before making them visible to
> concurrent tasks through the flower data structures (hashtable, idr,
> linked list) because it removes the headache that you are dealing with
> by means of the __SENDING_TO_HW flag.
>
> Maybe we can come up with something simpler? For example, we can check
> for duplicates and insert the filter into the hash table only, before
> calling hw offloads, and mark it with something like your
> __SENDING_TO_HW flag. The hashtable is only used for fast path lookup
> and for checking for duplicates in fl_change(), which means that the
> flow is not visible to concurrent accesses through fl_walk() and
> fl_get(). With this, we can modify fl_lookup() to return NULL for all
> flows marked with the __SENDING_TO_HW flag, so that the flow is ignored
> by the fast path (see the rough sketch at the end of this mail).
>
> I might be missing something, but such an implementation would be much
> simpler and less error prone, and wouldn't race with reoffload.
>
> You don't need to fix my bug for me, so please let me know whether you
> prefer to work on it yourself or would rather let me do it.
>
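> To make the idea concrete, the fl_lookup() change I have in mind would
> look roughly like the sketch below. It is untested and only for
> illustration; it reuses the __SENDING_TO_HW bit and the atomic_flags
> field from your patch, and assumes fl_change() keeps calling
> __fl_lookup() directly for its duplicate check, so pending filters are
> still found there:
>
> static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
> 				       struct fl_flow_key *mkey,
> 				       struct fl_flow_key *key)
> {
> 	struct cls_fl_filter *f;
>
> 	/* ... existing range-key handling elided ... */
>
> 	f = __fl_lookup(mask, mkey);
>
> 	/* The filter already sits in the hashtable so that fl_change()
> 	 * can reject duplicates, but it must stay invisible to the
> 	 * fast path until the hw offload call has finished.
> 	 */
> 	if (f && test_bit(__SENDING_TO_HW, &f->atomic_flags))
> 		return NULL;
>
> 	return f;
> }
>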
Hi Vlad,

Yes, I think that would work. Feel free to post a fix. Alternatively,
I'm happy to do it myself if you do not have the time.

John

> Thanks,
> Vlad