On Wed, Apr 3, 2019 at 5:42 PM Vlad Buslov <vla...@mellanox.com> wrote:
>
>
> On Wed 03 Apr 2019 at 15:37, John Hurley <john.hur...@netronome.com> wrote:
> > Recent refactoring of fl_change aims to use the classifier spinlock to
> > avoid the need for rtnl lock. In doing so, the fl_hw_replace_filter()
> > function was moved to before the lock is taken. This can create problems
> > for drivers if duplicate filters are created (common in ovs tc offload
> > due to filters being triggered by user-space matches).
> >
> > Drivers registered for such filters will now receive multiple copies of
> > the same rule, each with a different cookie value. This means that the
> > drivers would need to do a full match field lookup to detect
> > duplicates, repeating work that will later happen in flower's __fl_lookup().
> > Currently, drivers do not expect to receive duplicate filters.
> >
> > Fix this by moving the fl_hw_replace_filter() call to after the software
> > filter processing. This way, offload messages are only triggered after the
> > filter has been verified. Add a new flag to each filter that indicates that
> > it is being sent to hw. This prevents the flow from being deleted before
> > processing is finished, even if the tp lock is released.
> >
> > NOTE:
> > There may still be a race condition here with the reoffload function. When
> > the __SENDING_TO_HW bit is set we do not know, at the time of reoffload,
> > whether the filter has actually been sent to HW. This means we could
> > potentially fail to offload a valid flow here (or fail to delete one).
> >
> > Fixes: 620da4860827 ("net: sched: flower: refactor fl_change")
> > Signed-off-by: John Hurley <john.hur...@netronome.com>
> > Reviewed-by: Simon Horman <simon.hor...@netronome.com>
> > Acked-by: Jakub Kicinski <jakub.kicin...@netronome.com>
> > ---
> >  net/sched/cls_flower.c | 110 ++++++++++++++++++++++++++++++++++++++++---------
> >  1 file changed, 90 insertions(+), 20 deletions(-)
> >
> > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> > index 0638f17..e2c144f 100644
> > --- a/net/sched/cls_flower.c
> > +++ b/net/sched/cls_flower.c
> > @@ -94,6 +94,10 @@ struct cls_fl_head {
> >       struct idr handle_idr;
> >  };
> >
> > +enum cls_fl_filter_state_t {
> > +     __SENDING_TO_HW,
> > +};
> > +
> >  struct cls_fl_filter {
> >       struct fl_flow_mask *mask;
> >       struct rhash_head ht_node;
> > @@ -113,6 +117,7 @@ struct cls_fl_filter {
> >        */
> >       refcount_t refcnt;
> >       bool deleted;
> > +     unsigned long atomic_flags;
> >  };
> >
> >  static const struct rhashtable_params mask_ht_params = {
> > @@ -542,12 +547,18 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
> >
> >       *last = false;
> >
> > +replay:
> >       spin_lock(&tp->lock);
> >       if (f->deleted) {
> >               spin_unlock(&tp->lock);
> >               return -ENOENT;
> >       }
> >
> > +     if (test_bit(__SENDING_TO_HW, &f->atomic_flags)) {
> > +             spin_unlock(&tp->lock);
> > +             goto replay;
> > +     }
> > +
> >       f->deleted = true;
> >       rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
> >                              f->mask->filter_ht_params);
> > @@ -1528,15 +1539,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >       if (err)
> >               goto errout;
> >
> > -     if (!tc_skip_hw(fnew->flags)) {
> > -             err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
> > -             if (err)
> > -                     goto errout_mask;
> > -     }
> > -
> > -     if (!tc_in_hw(fnew->flags))
> > -             fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
> > -
> >       spin_lock(&tp->lock);
> >
> >       /* tp was deleted concurrently. -EAGAIN will cause caller to lookup
> > @@ -1544,15 +1546,18 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >        */
> >       if (tp->deleting) {
> >               err = -EAGAIN;
> > -             goto errout_hw;
> > +             goto errout_mask;
> >       }
> >
> >       refcount_inc(&fnew->refcnt);
> >       if (fold) {
> > -             /* Fold filter was deleted concurrently. Retry lookup. */
> > -             if (fold->deleted) {
> > +             /* Fold filter was deleted or is being added to hw concurrently.
> > +              * Retry lookup.
> > +              */
> > +             if (fold->deleted ||
> > +                 test_bit(__SENDING_TO_HW, &fold->atomic_flags)) {
> >                       err = -EAGAIN;
> > -                     goto errout_hw;
> > +                     goto errout_mask;
> >               }
> >
> >               fnew->handle = handle;
> > @@ -1560,7 +1565,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >               err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
> >                                            fnew->mask->filter_ht_params);
> >               if (err)
> > -                     goto errout_hw;
> > +                     goto errout_mask;
> >
> >               rhashtable_remove_fast(&fold->mask->ht,
> >                                      &fold->ht_node,
> > @@ -1568,9 +1573,23 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >               idr_replace(&head->handle_idr, fnew, fnew->handle);
> >               list_replace_rcu(&fold->list, &fnew->list);
> >               fold->deleted = true;
> > +             if (!tc_skip_hw(fnew->flags))
> > +                     set_bit(__SENDING_TO_HW, &fnew->atomic_flags);
> >
> >               spin_unlock(&tp->lock);
> >
> > +             if (!tc_skip_hw(fnew->flags)) {
> > +                     err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
> > +                     if (err) {
> > +                             spin_lock(&tp->lock);
> > +                             goto errout_hw_replace;
> > +                     }
> > +             }
> > +
> > +             if (!tc_in_hw(fnew->flags))
> > +                     fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
> > +
> > +             clear_bit(__SENDING_TO_HW, &fnew->atomic_flags);
> >               fl_mask_put(head, fold->mask, true);
> >               if (!tc_skip_hw(fold->flags))
> >                       fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
> > @@ -1584,7 +1603,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >       } else {
> >               if (__fl_lookup(fnew->mask, &fnew->mkey)) {
> >                       err = -EEXIST;
> > -                     goto errout_hw;
> > +                     goto errout_mask;
> >               }
> >
> >               if (handle) {
> > @@ -1606,7 +1625,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >                                           INT_MAX, GFP_ATOMIC);
> >               }
> >               if (err)
> > -                     goto errout_hw;
> > +                     goto errout_mask;
> >
> >               fnew->handle = handle;
> >
> > @@ -1616,7 +1635,22 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >                       goto errout_idr;
> >
> >               list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
> > +             if (!tc_skip_hw(fnew->flags))
> > +                     set_bit(__SENDING_TO_HW, &fnew->atomic_flags);
> > +
> >               spin_unlock(&tp->lock);
> > +
> > +             if (!tc_skip_hw(fnew->flags)) {
> > +                     err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
> > +                     if (err) {
> > +                             spin_lock(&tp->lock);
> > +                             goto errout_rhash;
> > +                     }
> > +             }
> > +
> > +             if (!tc_in_hw(fnew->flags))
> > +                     fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
> > +             clear_bit(__SENDING_TO_HW, &fnew->atomic_flags);
> >       }
> >
> >       *arg = fnew;
> > @@ -1625,13 +1659,37 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> >       kfree(mask);
> >       return 0;
> >
> > +errout_hw_replace:
> > +     if (rhashtable_insert_fast(&fold->mask->ht, &fold->ht_node,
> > +                                fold->mask->filter_ht_params)) {
> > +             NL_SET_ERR_MSG(extack, "Filter replace failed and could not revert.");
> > +             fl_mask_put(head, fold->mask, true);
> > +             if (!tc_skip_hw(fold->flags))
> > +                     fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
> > +             tcf_unbind_filter(tp, &fold->res);
> > +             tcf_exts_get_net(&fold->exts);
> > +             refcount_dec(&fold->refcnt);
> > +             __fl_put(fold);
> > +     } else {
> > +             idr_replace(&head->handle_idr, fold, fold->handle);
> > +             list_replace_rcu(&fnew->list, &fold->list);
> > +             fold->deleted = false;
> > +     }
> > +errout_rhash:
> > +     if (fnew->deleted) {
> > +             spin_unlock(&tp->lock);
> > +             kfree(tb);
> > +             kfree(mask);
> > +             return err;
> > +     }
> > +     list_del_rcu(&fnew->list);
> > +     rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
> > +                            fnew->mask->filter_ht_params);
> > +     fnew->deleted = true;
> >  errout_idr:
> >       idr_remove(&head->handle_idr, fnew->handle);
> > -errout_hw:
> > -     spin_unlock(&tp->lock);
> > -     if (!tc_skip_hw(fnew->flags))
> > -             fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
> >  errout_mask:
> > +     spin_unlock(&tp->lock);
> >       fl_mask_put(head, fnew->mask, true);
> >  errout:
> >       tcf_exts_destroy(&fnew->exts);
> > @@ -1669,6 +1727,12 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
> >       arg->count = arg->skip;
> >
> >       while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) {
> > +             if (test_bit(__SENDING_TO_HW, &f->atomic_flags)) {
> > +                     __fl_put(f);
> > +                     arg->cookie++;
> > +                     continue;
> > +             }
> > +
> >               if (arg->fn(tp, f, arg) < 0) {
> >                       __fl_put(f);
> >                       arg->stop = 1;
> > @@ -1695,6 +1759,9 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
> >                       if (tc_skip_hw(f->flags))
> >                               continue;
> >
> > +                     if (test_bit(__SENDING_TO_HW, &f->atomic_flags))
> > +                             continue;
> > +
> >                       cls_flower.rule =
> >                               flow_rule_alloc(tcf_exts_num_actions(&f->exts));
> >                       if (!cls_flower.rule)
> > @@ -2273,6 +2340,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
> >       if (!f)
> >               return skb->len;
> >
> > +     if (test_bit(__SENDING_TO_HW, &f->atomic_flags))
> > +             return skb->len;
> > +
> >       t->tcm_handle = f->handle;
> >
> >       nest = nla_nest_start(skb, TCA_OPTIONS);
>
> Hi John,
>
> I understand the problem that you are trying to fix, but I intentionally
> made fl_change() offload filters before making them visible to
> concurrent tasks through the flower data structures (hashtable, idr,
> linked list), because it avoids the headache that you are dealing with
> by means of the __SENDING_TO_HW flag.
>
> Maybe we can come up with something simpler? For example, before calling
> hw offloads we can check for duplicates and insert the filter into the
> hashtable only, marking it with something like your __SENDING_TO_HW
> flag. The hashtable is only used for fast path lookup and for checking
> for duplicates in fl_change(), which means that the flow is not visible
> to concurrent accesses through fl_walk() and fl_get(). With this, we can
> modify fl_lookup() to return NULL for all flows marked with the
> __SENDING_TO_HW flag so that they are ignored by the fast path.
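>
> Something like this (rough sketch only, untested; signature abbreviated
> from memory, and I'm reusing the __SENDING_TO_HW atomic_flags bit from
> your patch):
>
> static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
>                                        struct fl_flow_key *mkey)
> {
>         struct cls_fl_filter *f;
>
>         f = __fl_lookup(mask, mkey);
>         /* The filter is inserted into the hashtable before its hw
>          * offload has completed, so hide it from the fast path until
>          * the flag is cleared.
>          */
>         if (f && test_bit(__SENDING_TO_HW, &f->atomic_flags))
>                 return NULL;
>         return f;
> }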
>
> I might be missing something, but such an implementation would be much
> simpler and less error prone, and wouldn't race with reoffload.
>
> You shouldn't have to fix my bug for me, so please let me know whether
> you prefer to work on it yourself or would rather let me do it.
>


Hi Vlad,
Yes, I think that would work.
Feel free to post a fix.
Alternatively, I'm happy to do so if you do not have the time.

John


> Thanks,
> Vlad
