On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
> Recent changes that introduced unlocked flower did not properly account
> for the case when reoffload is initiated concurrently with filter
> updates. To fix the issue, extend flower with a 'hw_filters' list that
> is used to store filters that don't have the 'skip_hw' flag set. A
> filter is added to the list when it is inserted into hardware and only
> removed from it after being unoffloaded from all drivers that the
> parent block is attached to. This ensures that concurrent reoffload can
> still access a filter that is being deleted, and prevents a race
> condition where the driver callback can be removed while the filter is
> no longer accessible through the idr but is still present in hardware.
> 
> Refactor fl_change() to respect new filter reference counter and to
> release filter reference with __fl_put() in case of error, instead of
> directly deallocating filter memory. This allows for concurrent access
> to filter from fl_reoffload() and protects it with reference counting.
> Refactor fl_reoffload() to iterate over hw_filters list instead of idr.
> Implement fl_get_next_hw_filter() helper function that is used to
> iterate over hw_filters list with reference counting and skips filters
> that are being concurrently deleted.
> 
> Fixes: 92149190067d ("net: sched: flower: set unlocked flag for flower proto ops")
> Signed-off-by: Vlad Buslov <vla...@mellanox.com>
> Reported-by: Jakub Kicinski <jakub.kicin...@netronome.com>
> ---
>  net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 57 insertions(+), 20 deletions(-)
> 
> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> index 4b5585358699..524b86560af3 100644
> --- a/net/sched/cls_flower.c
> +++ b/net/sched/cls_flower.c
> @@ -90,6 +90,7 @@ struct cls_fl_head {
>       struct rhashtable ht;
>       spinlock_t masks_lock; /* Protect masks list */
>       struct list_head masks;
> +     struct list_head hw_filters;
>       struct rcu_work rwork;
>       struct idr handle_idr;
>  };
> @@ -102,6 +103,7 @@ struct cls_fl_filter {
>       struct tcf_result res;
>       struct fl_flow_key key;
>       struct list_head list;
> +     struct list_head hw_list;
>       u32 handle;
>       u32 flags;
>       u32 in_hw_count;
> @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
>  
>       spin_lock_init(&head->masks_lock);
>       INIT_LIST_HEAD_RCU(&head->masks);
> +     INIT_LIST_HEAD(&head->hw_filters);
>       rcu_assign_pointer(tp->root, head);
>       idr_init(&head->handle_idr);
>  
> @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
>       return true;
>  }
>  
> +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
> +{
> +     /* Flower classifier only changes root pointer during init and destroy.
> +      * Users must obtain reference to tcf_proto instance before calling its
> +      * API, so tp->root pointer is protected from concurrent call to
> +      * fl_destroy() by reference counting.
> +      */
> +     return rcu_dereference_raw(tp->root);
> +}
> +
>  static void __fl_destroy_filter(struct cls_fl_filter *f)
>  {
>       tcf_exts_destroy(&f->exts);
> @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
>  
>       tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
>       spin_lock(&tp->lock);
> +     if (!list_empty(&f->hw_list))
> +             list_del_init(&f->hw_list);
>       tcf_block_offload_dec(block, &f->flags);
>       spin_unlock(&tp->lock);
>  
> @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
>                               struct cls_fl_filter *f, bool rtnl_held,
>                               struct netlink_ext_ack *extack)
>  {
> +     struct cls_fl_head *head = fl_head_dereference(tp);
>       struct tc_cls_flower_offload cls_flower = {};
>       struct tcf_block *block = tp->chain->block;
>       bool skip_sw = tc_skip_sw(f->flags);
> @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
>               goto errout;
>       }
>  
> +     spin_lock(&tp->lock);
> +     list_add(&f->hw_list, &head->hw_filters);
> +     spin_unlock(&tp->lock);
>  errout:
>       if (!rtnl_held)
>               rtnl_unlock();
> @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
>               rtnl_unlock();
>  }
>  
> -static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
> -{
> -     /* Flower classifier only changes root pointer during init and destroy.
> -      * Users must obtain reference to tcf_proto instance before calling its
> -      * API, so tp->root pointer is protected from concurrent call to
> -      * fl_destroy() by reference counting.
> -      */
> -     return rcu_dereference_raw(tp->root);
> -}
> -
>  static void __fl_put(struct cls_fl_filter *f)
>  {
>       if (!refcount_dec_and_test(&f->refcnt))
>               return;
>  
> -     WARN_ON(!f->deleted);
> -
>       if (tcf_exts_get_net(&f->exts))
>               tcf_queue_work(&f->rwork, fl_destroy_filter_work);
>       else
> @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>               err = -ENOBUFS;
>               goto errout_tb;
>       }
> +     INIT_LIST_HEAD(&fnew->hw_list);
>       refcount_set(&fnew->refcnt, 1);
>  
>       err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
> @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>               goto errout_hw;
>       }
>  
> -     refcount_inc(&fnew->refcnt);
>       if (fold) {
>               /* Fold filter was deleted concurrently. Retry lookup. */
>               if (fold->deleted) {
> @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>                       in_ht = true;
>               }
>  
> +             refcount_inc(&fnew->refcnt);
>               rhashtable_remove_fast(&fold->mask->ht,
>                                      &fold->ht_node,
>                                      fold->mask->filter_ht_params);
> @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>               if (err)
>                       goto errout_hw;
>  
> +             refcount_inc(&fnew->refcnt);
>               fnew->handle = handle;
>               list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
>               spin_unlock(&tp->lock);
> @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>       kfree(mask);
>       return 0;
>  
> +errout_ht:
> +     spin_lock(&tp->lock);
>  errout_hw:
> +     fnew->deleted = true;
>       spin_unlock(&tp->lock);
>       if (!tc_skip_hw(fnew->flags))
>               fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
> -errout_ht:
>       if (in_ht)
>               rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
>                                      fnew->mask->filter_ht_params);
>  errout_mask:
>       fl_mask_put(head, fnew->mask);
>  errout:
> -     tcf_exts_get_net(&fnew->exts);
> -     tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
> +     __fl_put(fnew);
>  errout_tb:
>       kfree(tb);
>  errout_mask_alloc:
> @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
>       }
>  }
>  
> +static struct cls_fl_filter *
> +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
> +{
> +     struct cls_fl_head *head = fl_head_dereference(tp);
> +
> +     spin_lock(&tp->lock);
> +     if (!f) {
> +             if (list_empty(&head->hw_filters)) {
> +                     spin_unlock(&tp->lock);
> +                     return NULL;
> +             }

Shouldn't this be a precondition for the whole function, i.e. checked
regardless of whether 'f' is NULL or not?
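Rough, untested sketch of the shape I mean, keeping the rest of the
function as in the patch:

	spin_lock(&tp->lock);
	/* make the empty-list check a precondition of the whole function,
	 * not only of the !f branch
	 */
	if (list_empty(&head->hw_filters)) {
		spin_unlock(&tp->lock);
		return NULL;
	}

	if (!f)
		f = list_first_entry(&head->hw_filters, struct cls_fl_filter,
				     hw_list);
	else
		f = list_next_entry(f, hw_list);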

> +
> +             f = list_first_entry(&head->hw_filters, struct cls_fl_filter,
> +                                  hw_list);
> +     } else {
> +             f = list_next_entry(f, hw_list);
> +     }
> +

Maybe using list_for_each_entry_continue below would simplify the above
logic. It is odd to have to figure out the next entry manually and then
call list_for_each_entry_from; the list 'continue' variant is made for
exactly such use cases.
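Untested sketch of what that could look like (same locking and refcount
handling as in the patch, only the cursor handling changes):

static struct cls_fl_filter *
fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
{
	struct cls_fl_head *head = fl_head_dereference(tp);

	spin_lock(&tp->lock);
	/* Seed the cursor with the list head itself so that the 'continue'
	 * variant starts at the first real entry; an empty list then simply
	 * falls through the loop.
	 */
	if (!f)
		f = list_entry(&head->hw_filters, struct cls_fl_filter, hw_list);

	list_for_each_entry_continue(f, &head->hw_filters, hw_list) {
		if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
			spin_unlock(&tp->lock);
			return f;
		}
	}

	spin_unlock(&tp->lock);
	return NULL;
}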

> +     list_for_each_entry_from(f, &head->hw_filters, hw_list) {
> +             if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
> +                     spin_unlock(&tp->lock);
> +                     return f;
> +             }
> +     }
> +
> +     spin_unlock(&tp->lock);
> +     return NULL;
> +}
> +
>  static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>                       void *cb_priv, struct netlink_ext_ack *extack)
>  {
>       struct tc_cls_flower_offload cls_flower = {};
>       struct tcf_block *block = tp->chain->block;
> -     unsigned long handle = 0;
> -     struct cls_fl_filter *f;
> +     struct cls_fl_filter *f = NULL;
>       int err;
>  
> -     while ((f = fl_get_next_filter(tp, &handle))) {
> +     while ((f = fl_get_next_hw_filter(tp, f, add))) {
>               if (tc_skip_hw(f->flags))
>                       goto next_flow;

This can never be true, since !tc_skip_hw() is already a precondition
for fl_hw_replace_filter(), which is what actually adds filters to the
hw_filters list, so I think the check should be removed. And if such a
check is needed at all, it should be part of fl_get_next_hw_filter()
rather than the caller's responsibility.
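If it is kept, something along these lines inside the iteration loop
(untested) would keep the check out of the caller:

	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
		/* hw_filters should only hold offloaded filters, so this is
		 * purely defensive; skip here instead of in fl_reoffload()
		 */
		if (tc_skip_hw(f->flags))
			continue;
		if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
			spin_unlock(&tp->lock);
			return f;
		}
	}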

>  
> @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>                                         add);
>               spin_unlock(&tp->lock);
>  next_flow:
> -             handle++;
>               __fl_put(f);
>       }
>  
