On Tue, 2019-04-23 at 07:43 +0000, Vlad Buslov wrote:
> On Mon 22 Apr 2019 at 23:34, Saeed Mahameed <sae...@mellanox.com>
> wrote:
> > On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
> > > Recent changes that introduced unlocked flower did not properly
> > > account for the case when reoffload is initiated concurrently
> > > with filter updates. To fix the issue, extend flower with an
> > > 'hw_filters' list that is used to store filters that don't have
> > > the 'skip_hw' flag set. A filter is added to the list when it is
> > > inserted into hardware and is only removed from it after being
> > > unoffloaded from all drivers that the parent block is attached
> > > to. This ensures that concurrent reoffload can still access a
> > > filter that is being deleted and prevents the race condition
> > > where a driver callback can be removed while the filter is no
> > > longer accessible through the idr but is still present in
> > > hardware.
> > > 
> > > Refactor fl_change() to respect the new filter reference counter
> > > and to release the filter reference with __fl_put() in case of
> > > error, instead of directly deallocating the filter memory. This
> > > allows concurrent access to the filter from fl_reoffload() and
> > > protects it with reference counting. Refactor fl_reoffload() to
> > > iterate over the hw_filters list instead of the idr. Implement
> > > the fl_get_next_hw_filter() helper function that is used to
> > > iterate over the hw_filters list with reference counting and
> > > that skips filters being concurrently deleted.
> > > 
> > > Fixes: 92149190067d ("net: sched: flower: set unlocked flag for flower proto ops")
> > > Signed-off-by: Vlad Buslov <vla...@mellanox.com>
> > > Reported-by: Jakub Kicinski <jakub.kicin...@netronome.com>
> > > ---
> > >  net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++-----------
> > >  1 file changed, 57 insertions(+), 20 deletions(-)
> > > 
> > > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> > > index 4b5585358699..524b86560af3 100644
> > > --- a/net/sched/cls_flower.c
> > > +++ b/net/sched/cls_flower.c
> > > @@ -90,6 +90,7 @@ struct cls_fl_head {
> > >   struct rhashtable ht;
> > >   spinlock_t masks_lock; /* Protect masks list */
> > >   struct list_head masks;
> > > + struct list_head hw_filters;
> > >   struct rcu_work rwork;
> > >   struct idr handle_idr;
> > >  };
> > > @@ -102,6 +103,7 @@ struct cls_fl_filter {
> > >   struct tcf_result res;
> > >   struct fl_flow_key key;
> > >   struct list_head list;
> > > + struct list_head hw_list;
> > >   u32 handle;
> > >   u32 flags;
> > >   u32 in_hw_count;
> > > @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
> > > 
> > >   spin_lock_init(&head->masks_lock);
> > >   INIT_LIST_HEAD_RCU(&head->masks);
> > > + INIT_LIST_HEAD(&head->hw_filters);
> > >   rcu_assign_pointer(tp->root, head);
> > >   idr_init(&head->handle_idr);
> > > 
> > > @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
> > >   return true;
> > >  }
> > > 
> > > +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
> > > +{
> > > + /* Flower classifier only changes root pointer during init and destroy.
> > > +  * Users must obtain reference to tcf_proto instance before calling its
> > > +  * API, so tp->root pointer is protected from concurrent call to
> > > +  * fl_destroy() by reference counting.
> > > +  */
> > > + return rcu_dereference_raw(tp->root);
> > > +}
> > > +
> > >  static void __fl_destroy_filter(struct cls_fl_filter *f)
> > >  {
> > >   tcf_exts_destroy(&f->exts);
> > > @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
> > > 
> > >   tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
> > >   spin_lock(&tp->lock);
> > > + if (!list_empty(&f->hw_list))
> > > +         list_del_init(&f->hw_list);
> > >   tcf_block_offload_dec(block, &f->flags);
> > >   spin_unlock(&tp->lock);
> > > 
> > > @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
> > >                           struct cls_fl_filter *f, bool rtnl_held,
> > >                           struct netlink_ext_ack *extack)
> > >  {
> > > + struct cls_fl_head *head = fl_head_dereference(tp);
> > >   struct tc_cls_flower_offload cls_flower = {};
> > >   struct tcf_block *block = tp->chain->block;
> > >   bool skip_sw = tc_skip_sw(f->flags);
> > > @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
> > >           goto errout;
> > >   }
> > > 
> > > + spin_lock(&tp->lock);
> > > + list_add(&f->hw_list, &head->hw_filters);
> > > + spin_unlock(&tp->lock);
> > >  errout:
> > >   if (!rtnl_held)
> > >           rtnl_unlock();
> > > @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
> > >           rtnl_unlock();
> > >  }
> > > 
> > > -static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
> > > -{
> > > - /* Flower classifier only changes root pointer during init and destroy.
> > > -  * Users must obtain reference to tcf_proto instance before calling its
> > > -  * API, so tp->root pointer is protected from concurrent call to
> > > -  * fl_destroy() by reference counting.
> > > -  */
> > > - return rcu_dereference_raw(tp->root);
> > > -}
> > > -
> > >  static void __fl_put(struct cls_fl_filter *f)
> > >  {
> > >   if (!refcount_dec_and_test(&f->refcnt))
> > >           return;
> > > 
> > > - WARN_ON(!f->deleted);
> > > -
> > >   if (tcf_exts_get_net(&f->exts))
> > >           tcf_queue_work(&f->rwork, fl_destroy_filter_work);
> > >   else
> > > @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> > >           err = -ENOBUFS;
> > >           goto errout_tb;
> > >   }
> > > + INIT_LIST_HEAD(&fnew->hw_list);
> > >   refcount_set(&fnew->refcnt, 1);
> > > 
> > >   err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
> > > @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> > >           goto errout_hw;
> > >   }
> > > 
> > > - refcount_inc(&fnew->refcnt);
> > >   if (fold) {
> > >           /* Fold filter was deleted concurrently. Retry lookup. */
> > >           if (fold->deleted) {
> > > @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> > >                   in_ht = true;
> > >           }
> > > 
> > > +         refcount_inc(&fnew->refcnt);
> > >           rhashtable_remove_fast(&fold->mask->ht,
> > >                                  &fold->ht_node,
> > >                                  fold->mask->filter_ht_params);
> > > @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> > >           if (err)
> > >                   goto errout_hw;
> > > 
> > > +         refcount_inc(&fnew->refcnt);
> > >           fnew->handle = handle;
> > >           list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
> > >           spin_unlock(&tp->lock);
> > > @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
> > >   kfree(mask);
> > >   return 0;
> > > 
> > > +errout_ht:
> > > + spin_lock(&tp->lock);
> > >  errout_hw:
> > > + fnew->deleted = true;
> > >   spin_unlock(&tp->lock);
> > >   if (!tc_skip_hw(fnew->flags))
> > >           fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
> > > -errout_ht:
> > >   if (in_ht)
> > >           rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
> > >                                  fnew->mask->filter_ht_params);
> > >  errout_mask:
> > >   fl_mask_put(head, fnew->mask);
> > >  errout:
> > > - tcf_exts_get_net(&fnew->exts);
> > > - tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
> > > + __fl_put(fnew);
> > >  errout_tb:
> > >   kfree(tb);
> > >  errout_mask_alloc:
> > > @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
> > >   }
> > >  }
> > > 
> > > +static struct cls_fl_filter *
> > > +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
> > > +{
> > > + struct cls_fl_head *head = fl_head_dereference(tp);
> > > +
> > > + spin_lock(&tp->lock);
> > > + if (!f) {
> > > +         if (list_empty(&head->hw_filters)) {
> > > +                 spin_unlock(&tp->lock);
> > > +                 return NULL;
> > > +         }
> > 
> > Shouldn't this be a precondition for the whole function? I mean,
> > regardless of whether 'f' is NULL or not?
> 
> List can't be empty if we already have an element of the list (f), so
> why check this on every iteration?
> 

Because you release the lock on every iteration, you can't just assume
that the list is still intact.
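
Roughly, as an untested sketch of what I mean - do the emptiness check
up front under the lock on every call, before touching 'f' at all:

	spin_lock(&tp->lock);
	if (list_empty(&head->hw_filters)) {
		/* list may have been emptied since the last iteration */
		spin_unlock(&tp->lock);
		return NULL;
	}
	/* only now pick the first or next entry */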

> > > +
> > > +         f = list_first_entry(&head->hw_filters, struct cls_fl_filter,
> > > +                              hw_list);
> > > + } else {
> > > +         f = list_next_entry(f, hw_list);
> > > + }
> > > +
> > 
> > Maybe if you use list_for_each_entry_continue below, it might
> > simplify the above logic. It is weird that you need to figure out
> > the next entry and then call list_for_each_entry_from; the list
> > 'continue' variation is made for such use cases.
> 
> list_for_each_entry_continue requires an initialized cursor and will
> skip the first element if we obtain the initial cursor with
> list_first_entry(). We could have two loops - one that uses
> list_for_each_entry for the initial iteration, and another that uses
> list_for_each_entry_continue for the case when f != NULL - but I
> don't see how that would be any simpler.
> 

You can set the initial cursor to the list head itself rather than the
first entry.

It would be simpler if you do:

if (list_empty())
    return NULL;
pos = f ? f : list->head;
list_for_each_entry_continue(pos, ...);
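
Fleshed out, something like this (rough and untested, and it assumes
'f' is still linked on hw_filters when it is passed back in). The
list head itself serves as the initial cursor; it is never
dereferenced as a cls_fl_filter because the 'continue' variant
advances the cursor before its first use:

static struct cls_fl_filter *
fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
{
	struct cls_fl_head *head = fl_head_dereference(tp);

	spin_lock(&tp->lock);
	if (list_empty(&head->hw_filters)) {
		spin_unlock(&tp->lock);
		return NULL;
	}

	if (!f)
		/* cursor at the head node itself */
		f = list_entry(&head->hw_filters, struct cls_fl_filter,
			       hw_list);

	list_for_each_entry_continue(f, &head->hw_filters, hw_list) {
		/* skip filters that are being concurrently deleted */
		if (!(add && f->deleted) &&
		    refcount_inc_not_zero(&f->refcnt)) {
			spin_unlock(&tp->lock);
			return f;
		}
	}

	spin_unlock(&tp->lock);
	return NULL;
}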

> > > + list_for_each_entry_from(f, &head->hw_filters, hw_list) {
> > > +         if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
> > > +                 spin_unlock(&tp->lock);
> > > +                 return f;
> > > +         }
> > > + }
> > > +
> > > + spin_unlock(&tp->lock);
> > > + return NULL;
> > > +}
> > > +
> > >  static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
> > >                   void *cb_priv, struct netlink_ext_ack *extack)
> > >  {
> > >   struct tc_cls_flower_offload cls_flower = {};
> > >   struct tcf_block *block = tp->chain->block;
> > > - unsigned long handle = 0;
> > > - struct cls_fl_filter *f;
> > > + struct cls_fl_filter *f = NULL;
> > >   int err;
> > > 
> > > - while ((f = fl_get_next_filter(tp, &handle))) {
> > > + while ((f = fl_get_next_hw_filter(tp, f, add))) {
> > >           if (tc_skip_hw(f->flags))
> > >                   goto next_flow;
> > 
> > This can never be true, as it is already a precondition for
> > fl_hw_replace_filter(), which actually adds to the hw_filters
> > list, so I think it needs to be removed. If the check is needed at
> > all, then I think it should be part of fl_get_next_hw_filter() and
> > not the caller's responsibility.
> 
> Good catch.
> 
> > > @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
> > >                                     add);
> > >           spin_unlock(&tp->lock);
> > >  next_flow:
> > > -         handle++;
> > >           __fl_put(f);
> > >   }
> > > 
