在 2024/8/1 5:42, Andrii Nakryiko 写道:
> To avoid unnecessarily taking a (brief) refcount on uprobe during
> breakpoint handling in handle_swbp for entry uprobes, make find_uprobe()
> not take refcount, but protect the lifetime of a uprobe instance with
> RCU. This improves scalability, as refcount gets quite expensive due to
> cache line bouncing between multiple CPUs.
> 
> Specifically, we utilize our own uprobe-specific SRCU instance for this
> RCU protection. put_uprobe() will delay actual kfree() using call_srcu().
> 
> For now, uretprobe and single-stepping handling will still acquire
> refcount as necessary. We'll address these issues in follow up patches
> by making them use SRCU with timeout.
> 
> Signed-off-by: Andrii Nakryiko <and...@kernel.org>
> ---
>  kernel/events/uprobes.c | 93 ++++++++++++++++++++++++-----------------
>  1 file changed, 55 insertions(+), 38 deletions(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 23dde3ec5b09..6d5c3f4b210f 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -41,6 +41,8 @@ static struct rb_root uprobes_tree = RB_ROOT;
>  
>  static DEFINE_RWLOCK(uprobes_treelock);      /* serialize rbtree access */
>  
> +DEFINE_STATIC_SRCU(uprobes_srcu);
> +
>  #define UPROBES_HASH_SZ      13
>  /* serialize uprobe->pending_list */
>  static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
> @@ -59,6 +61,7 @@ struct uprobe {
>       struct list_head        pending_list;
>       struct uprobe_consumer  *consumers;
>       struct inode            *inode;         /* Also hold a ref to inode */
> +     struct rcu_head         rcu;
>       loff_t                  offset;
>       loff_t                  ref_ctr_offset;
>       unsigned long           flags;
> @@ -612,6 +615,13 @@ static inline bool uprobe_is_active(struct uprobe 
> *uprobe)
>       return !RB_EMPTY_NODE(&uprobe->rb_node);
>  }
>  
> +static void uprobe_free_rcu(struct rcu_head *rcu)
> +{
> +     struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> +
> +     kfree(uprobe);
> +}
> +
>  static void put_uprobe(struct uprobe *uprobe)
>  {
>       if (!refcount_dec_and_test(&uprobe->ref))
> @@ -632,6 +642,8 @@ static void put_uprobe(struct uprobe *uprobe)
>       mutex_lock(&delayed_uprobe_lock);
>       delayed_uprobe_remove(uprobe, NULL);
>       mutex_unlock(&delayed_uprobe_lock);
> +
> +     call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
>  }
>  
>  static __always_inline
> @@ -673,33 +685,25 @@ static inline int __uprobe_cmp(struct rb_node *a, const 
> struct rb_node *b)
>       return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
>  }
>  
> -static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
> +/*
> + * Assumes being inside RCU protected region.
> + * No refcount is taken on returned uprobe.
> + */
> +static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
>  {
>       struct __uprobe_key key = {
>               .inode = inode,
>               .offset = offset,
>       };
> -     struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
> -
> -     if (node)
> -             return try_get_uprobe(__node_2_uprobe(node));
> +     struct rb_node *node;
>  
> -     return NULL;
> -}
> -
> -/*
> - * Find a uprobe corresponding to a given inode:offset
> - * Acquires uprobes_treelock
> - */
> -static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
> -{
> -     struct uprobe *uprobe;
> +     lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
>  
>       read_lock(&uprobes_treelock);
> -     uprobe = __find_uprobe(inode, offset);
> +     node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
>       read_unlock(&uprobes_treelock);
>  
> -     return uprobe;
> +     return node ? __node_2_uprobe(node) : NULL;
>  }
>  
>  /*
> @@ -1073,10 +1077,10 @@ register_for_each_vma(struct uprobe *uprobe, struct 
> uprobe_consumer *new)
>                       goto free;
>               /*
>                * We take mmap_lock for writing to avoid the race with
> -              * find_active_uprobe() which takes mmap_lock for reading.
> +              * find_active_uprobe_rcu() which takes mmap_lock for reading.
>                * Thus this install_breakpoint() can not make
> -              * is_trap_at_addr() true right after find_uprobe()
> -              * returns NULL in find_active_uprobe().
> +              * is_trap_at_addr() true right after find_uprobe_rcu()
> +              * returns NULL in find_active_uprobe_rcu().
>                */
>               mmap_write_lock(mm);
>               vma = find_vma(mm, info->vaddr);
> @@ -1885,9 +1889,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, 
> struct pt_regs *regs)
>               return;
>       }
>  
> +     /* we need to bump refcount to store uprobe in utask */
> +     if (!try_get_uprobe(uprobe))
> +             return;
> +
>       ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
>       if (!ri)
> -             return;
> +             goto fail;
>  
>       trampoline_vaddr = uprobe_get_trampoline_vaddr();
>       orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, 
> regs);
> @@ -1914,11 +1922,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, 
> struct pt_regs *regs)
>               }
>               orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
>       }
> -      /*
> -       * uprobe's refcnt is positive, held by caller, so it's safe to
> -       * unconditionally bump it one more time here
> -       */
> -     ri->uprobe = get_uprobe(uprobe);
> +     ri->uprobe = uprobe;
>       ri->func = instruction_pointer(regs);
>       ri->stack = user_stack_pointer(regs);
>       ri->orig_ret_vaddr = orig_ret_vaddr;
> @@ -1929,8 +1933,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, 
> struct pt_regs *regs)
>       utask->return_instances = ri;
>  
>       return;
> - fail:
> +fail:
>       kfree(ri);
> +     put_uprobe(uprobe);
>  }
>  
>  /* Prepare to single-step probed instruction out of line. */
> @@ -1945,9 +1950,14 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, 
> unsigned long bp_vaddr)
>       if (!utask)
>               return -ENOMEM;
>  
> +     if (!try_get_uprobe(uprobe))
> +             return -EINVAL;
> +
>       xol_vaddr = xol_get_insn_slot(uprobe);
> -     if (!xol_vaddr)
> -             return -ENOMEM;
> +     if (!xol_vaddr) {
> +             err = -ENOMEM;
> +             goto err_out;
> +     }
>  
>       utask->xol_vaddr = xol_vaddr;
>       utask->vaddr = bp_vaddr;
> @@ -1955,12 +1965,15 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs 
> *regs, unsigned long bp_vaddr)
>       err = arch_uprobe_pre_xol(&uprobe->arch, regs);
>       if (unlikely(err)) {
>               xol_free_insn_slot(current);
> -             return err;
> +             goto err_out;
>       }
>  
>       utask->active_uprobe = uprobe;
>       utask->state = UTASK_SSTEP;
>       return 0;
> +err_out:
> +     put_uprobe(uprobe);
> +     return err;
>  }
>  
>  /*
> @@ -2044,7 +2057,8 @@ static int is_trap_at_addr(struct mm_struct *mm, 
> unsigned long vaddr)
>       return is_trap_insn(&opcode);
>  }
>  
> -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int 
> *is_swbp)
> +/* assumes being inside RCU protected region */
> +static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int 
> *is_swbp)
>  {
>       struct mm_struct *mm = current->mm;
>       struct uprobe *uprobe = NULL;
> @@ -2057,7 +2071,7 @@ static struct uprobe *find_active_uprobe(unsigned long 
> bp_vaddr, int *is_swbp)
>                       struct inode *inode = file_inode(vma->vm_file);
>                       loff_t offset = vaddr_to_offset(vma, bp_vaddr);
>  
> -                     uprobe = find_uprobe(inode, offset);
> +                     uprobe = find_uprobe_rcu(inode, offset);
>               }
>  
>               if (!uprobe)
> @@ -2201,13 +2215,15 @@ static void handle_swbp(struct pt_regs *regs)
>  {
>       struct uprobe *uprobe;
>       unsigned long bp_vaddr;
> -     int is_swbp;
> +     int is_swbp, srcu_idx;
>  
>       bp_vaddr = uprobe_get_swbp_addr(regs);
>       if (bp_vaddr == uprobe_get_trampoline_vaddr())
>               return uprobe_handle_trampoline(regs);
>  
> -     uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
> +     srcu_idx = srcu_read_lock(&uprobes_srcu);
> +
> +     uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
>       if (!uprobe) {
>               if (is_swbp > 0) {
>                       /* No matching uprobe; signal SIGTRAP. */
> @@ -2223,6 +2239,7 @@ static void handle_swbp(struct pt_regs *regs)
>                        */
>                       instruction_pointer_set(regs, bp_vaddr);
>               }
> +             srcu_read_unlock(&uprobes_srcu, srcu_idx);
>               return;
>       }
>  
> @@ -2258,12 +2275,12 @@ static void handle_swbp(struct pt_regs *regs)
>       if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
>               goto out;
>  
> -     if (!pre_ssout(uprobe, regs, bp_vaddr))
> -             return;
> +     if (pre_ssout(uprobe, regs, bp_vaddr))
> +             goto out;
>  

Regardless what pre_ssout() returns, it always reach the label 'out', so the
if block is unnecessary.


> -     /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
>  out:
> -     put_uprobe(uprobe);
> +     /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
> +     srcu_read_unlock(&uprobes_srcu, srcu_idx);
>  }
>  
>  /*

-- 
BR
Liao, Chang

Reply via email to