Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:

    if (reference_counter > 0) {
        Execute marker instructions;
    }

Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.

Implement the reference counter logic in core uprobe. User will be
able to use it from trace_uprobe as well as from kernel module. New
trace_uprobe definition with reference counter will now be:

    <path>:<offset>[(ref_ctr_offset)]

where ref_ctr_offset is an optional field. For kernel module, new
variant of uprobe_register() has been introduced:

    uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer)

No new variant for uprobe_unregister() because it's assumed to have
only one reference counter for one uprobe.

[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation

Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are referring it as 'reference
counter' in kernel / perf code.

Signed-off-by: Ravi Bangoria <ravi.bango...@linux.ibm.com>
Reviewed-by: Masami Hiramatsu <mhira...@kernel.org>
[Only trace_uprobe.c]
---
 include/linux/uprobes.h     |   5 +
 kernel/events/uprobes.c     | 232 ++++++++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace.c        |   2 +-
 kernel/trace/trace_uprobe.c |  38 +++++++-
 4 files changed, 267 insertions(+), 10 deletions(-)

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index bb9d2084af03..103a48a48872 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -123,6 +123,7 @@ extern unsigned long uprobe_get_swbp_addr(struct pt_regs 
*regs);
 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct 
*mm, unsigned long vaddr, uprobe_opcode_t);
 extern int uprobe_register(struct inode *inode, loff_t offset, struct 
uprobe_consumer *uc);
+extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t 
ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct inode *inode, loff_t offset, struct 
uprobe_consumer *uc, bool);
 extern void uprobe_unregister(struct inode *inode, loff_t offset, struct 
uprobe_consumer *uc);
 extern int uprobe_mmap(struct vm_area_struct *vma);
@@ -160,6 +161,10 @@ uprobe_register(struct inode *inode, loff_t offset, struct 
uprobe_consumer *uc)
 {
        return -ENOSYS;
 }
+static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, 
loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+{
+       return -ENOSYS;
+}
 static inline int
 uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, 
bool add)
 {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c0418ba52ba8..ad92fed11526 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -73,6 +73,7 @@ struct uprobe {
        struct uprobe_consumer  *consumers;
        struct inode            *inode;         /* Also hold a ref to inode */
        loff_t                  offset;
+       loff_t                  ref_ctr_offset;
        unsigned long           flags;
 
        /*
@@ -88,6 +89,15 @@ struct uprobe {
        struct arch_uprobe      arch;
 };
 
+struct delayed_uprobe {
+       struct list_head list;
+       struct uprobe *uprobe;
+       struct mm_struct *mm;
+};
+
+static DEFINE_MUTEX(delayed_uprobe_lock);
+static LIST_HEAD(delayed_uprobe_list);
+
 /*
  * Execute out of line area: anonymous executable mapping installed
  * by the probed task to execute the copy of the original instruction
@@ -282,6 +292,154 @@ static int verify_opcode(struct page *page, unsigned long 
vaddr, uprobe_opcode_t
        return 1;
 }
 
+static struct delayed_uprobe *
+delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
+{
+       struct delayed_uprobe *du;
+
+       list_for_each_entry(du, &delayed_uprobe_list, list)
+               if (du->uprobe == uprobe && du->mm == mm)
+                       return du;
+       return NULL;
+}
+
+static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
+{
+       struct delayed_uprobe *du;
+
+       if (delayed_uprobe_check(uprobe, mm))
+               return 0;
+
+       du  = kzalloc(sizeof(*du), GFP_KERNEL);
+       if (!du)
+               return -ENOMEM;
+
+       du->uprobe = uprobe;
+       du->mm = mm;
+       list_add(&du->list, &delayed_uprobe_list);
+       return 0;
+}
+
+static void delayed_uprobe_delete(struct delayed_uprobe *du)
+{
+       if (!du)
+               return;
+       list_del(&du->list);
+       kfree(du);
+}
+
+static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
+{
+       struct list_head *pos, *q;
+       struct delayed_uprobe *du;
+
+       list_for_each_safe(pos, q, &delayed_uprobe_list) {
+               du = list_entry(pos, struct delayed_uprobe, list);
+               if (uprobe) {
+                       if (du->uprobe == uprobe && du->mm == mm)
+                               delayed_uprobe_delete(du);
+               } else if (du->mm == mm) {
+                       delayed_uprobe_delete(du);
+               }
+       }
+}
+
+static bool valid_ref_ctr_vma(struct uprobe *uprobe,
+                             struct vm_area_struct *vma)
+{
+       unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);
+
+       return uprobe->ref_ctr_offset &&
+               vma->vm_file &&
+               file_inode(vma->vm_file) == uprobe->inode &&
+               vma->vm_flags & VM_WRITE &&
+               !(vma->vm_flags & VM_SHARED) &&
+               vma->vm_start <= vaddr &&
+               vma->vm_end > vaddr;
+}
+
+static struct vm_area_struct *
+find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
+{
+       struct vm_area_struct *tmp;
+
+       for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+               if (valid_ref_ctr_vma(uprobe, tmp))
+                       return tmp;
+
+       return NULL;
+}
+
+static int
+__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
+{
+       void *kaddr;
+       struct page *page;
+       struct vm_area_struct *vma;
+       int ret = 0;
+       short *ptr;
+
+       if (vaddr == 0 || d == 0)
+               return -EINVAL;
+
+       ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+                       FOLL_WRITE, &page, &vma, NULL);
+       if (unlikely(ret <= 0)) {
+               /*
+                * We are asking for 1 page. If get_user_pages_remote() fails,
+                * it may return 0, in that case we have to return error.
+                */
+               ret = (ret == 0) ? -EBUSY : ret;
+               pr_warn("Failed to %s ref_ctr. (%d)\n",
+                       d > 0 ? "increment" : "decrement", ret);
+               return ret;
+       }
+
+       kaddr = kmap_atomic(page);
+       ptr = kaddr + (vaddr & ~PAGE_MASK);
+
+       if (unlikely(*ptr + d < 0)) {
+               pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
+                       "curr val: %d, delta: %d\n", vaddr, *ptr, d);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       *ptr += d;
+       ret = 0;
+out:
+       kunmap_atomic(kaddr);
+       put_page(page);
+       return ret;
+}
+
+static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
+                         bool is_register)
+{
+       struct vm_area_struct *rc_vma;
+       unsigned long rc_vaddr;
+       int ret = 0;
+
+       rc_vma = find_ref_ctr_vma(uprobe, mm);
+
+       if (rc_vma) {
+               rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
+               ret = __update_ref_ctr(mm, rc_vaddr, is_register ? 1 : -1);
+
+               if (is_register)
+                       return ret;
+       }
+
+       mutex_lock(&delayed_uprobe_lock);
+       if (is_register)
+               ret = delayed_uprobe_add(uprobe, mm);
+       else
+               delayed_uprobe_remove(uprobe, mm);
+       mutex_unlock(&delayed_uprobe_lock);
+
+       return ret;
+}
+
 /*
  * NOTE:
  * Expect the breakpoint instruction to be the smallest size instruction for
@@ -302,9 +460,13 @@ static int verify_opcode(struct page *page, unsigned long 
vaddr, uprobe_opcode_t
 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
                        unsigned long vaddr, uprobe_opcode_t opcode)
 {
+       struct uprobe *uprobe;
        struct page *old_page, *new_page;
        struct vm_area_struct *vma;
-       int ret;
+       int ret, is_register, ref_ctr_updated = 0;
+
+       is_register = is_swbp_insn(&opcode);
+       uprobe = container_of(auprobe, struct uprobe, arch);
 
 retry:
        /* Read the page with vaddr into memory */
@@ -317,6 +479,15 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, 
struct mm_struct *mm,
        if (ret <= 0)
                goto put_old;
 
+       /* We are going to replace instruction, update ref_ctr. */
+       if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
+               ret = update_ref_ctr(uprobe, mm, is_register);
+               if (ret)
+                       goto put_old;
+
+               ref_ctr_updated = 1;
+       }
+
        ret = anon_vma_prepare(vma);
        if (ret)
                goto put_old;
@@ -337,6 +508,11 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, 
struct mm_struct *mm,
 
        if (unlikely(ret == -EAGAIN))
                goto retry;
+
+       /* Revert back reference counter if instruction update failed. */
+       if (ret && is_register && ref_ctr_updated)
+               update_ref_ctr(uprobe, mm, false);
+
        return ret;
 }
 
@@ -484,7 +660,8 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
        return u;
 }
 
-static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
+                                  loff_t ref_ctr_offset)
 {
        struct uprobe *uprobe, *cur_uprobe;
 
@@ -494,6 +671,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, 
loff_t offset)
 
        uprobe->inode = inode;
        uprobe->offset = offset;
+       uprobe->ref_ctr_offset = ref_ctr_offset;
        init_rwsem(&uprobe->register_rwsem);
        init_rwsem(&uprobe->consumer_rwsem);
 
@@ -895,7 +1073,7 @@ EXPORT_SYMBOL_GPL(uprobe_unregister);
  * else return 0 (success)
  */
 static int __uprobe_register(struct inode *inode, loff_t offset,
-                            struct uprobe_consumer *uc)
+                            loff_t ref_ctr_offset, struct uprobe_consumer *uc)
 {
        struct uprobe *uprobe;
        int ret;
@@ -912,7 +1090,7 @@ static int __uprobe_register(struct inode *inode, loff_t 
offset,
                return -EINVAL;
 
  retry:
-       uprobe = alloc_uprobe(inode, offset);
+       uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
        if (!uprobe)
                return -ENOMEM;
        /*
@@ -938,10 +1116,17 @@ static int __uprobe_register(struct inode *inode, loff_t 
offset,
 int uprobe_register(struct inode *inode, loff_t offset,
                    struct uprobe_consumer *uc)
 {
-       return __uprobe_register(inode, offset, uc);
+       return __uprobe_register(inode, offset, 0, uc);
 }
 EXPORT_SYMBOL_GPL(uprobe_register);
 
+int uprobe_register_refctr(struct inode *inode, loff_t offset,
+                          loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+{
+       return __uprobe_register(inode, offset, ref_ctr_offset, uc);
+}
+EXPORT_SYMBOL_GPL(uprobe_register_refctr);
+
 /*
  * uprobe_apply - unregister a already registered probe.
  * @inode: the file in which the probe has to be removed.
@@ -1060,6 +1245,30 @@ static void build_probe_list(struct inode *inode,
        spin_unlock(&uprobes_treelock);
 }
 
+static int delayed_uprobe_install(struct vm_area_struct *vma)
+{
+       struct list_head *pos, *q;
+       struct delayed_uprobe *du;
+       unsigned long vaddr;
+       int ret = 0, err = 0;
+
+       mutex_lock(&delayed_uprobe_lock);
+       list_for_each_safe(pos, q, &delayed_uprobe_list) {
+               du = list_entry(pos, struct delayed_uprobe, list);
+
+               if (!valid_ref_ctr_vma(du->uprobe, vma))
+                       continue;
+
+               vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
+               ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
+               /* Record an error and continue. */
+               err = ret & !err ? ret : err;
+               delayed_uprobe_delete(du);
+       }
+       mutex_unlock(&delayed_uprobe_lock);
+       return err;
+}
+
 /*
  * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
  *
@@ -1072,7 +1281,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
        struct uprobe *uprobe, *u;
        struct inode *inode;
 
-       if (no_uprobe_events() || !valid_vma(vma, true))
+       if (no_uprobe_events())
+               return 0;
+
+       if (vma->vm_flags & VM_WRITE &&
+           test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
+               delayed_uprobe_install(vma);
+
+       if (!valid_vma(vma, true))
                return 0;
 
        inode = file_inode(vma->vm_file);
@@ -1246,6 +1462,10 @@ void uprobe_clear_state(struct mm_struct *mm)
 {
        struct xol_area *area = mm->uprobes_state.xol_area;
 
+       mutex_lock(&delayed_uprobe_lock);
+       delayed_uprobe_remove(NULL, mm);
+       mutex_unlock(&delayed_uprobe_lock);
+
        if (!area)
                return;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 823687997b01..616160b85860 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4620,7 +4620,7 @@ static const char readme_msg[] =
   "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
-       "\t    place: <path>:<offset>\n"
+  "   place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n"
 #endif
        "\t     args: <name>=fetcharg[:type]\n"
        "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index bf89a51e740d..bf2be098eb08 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -59,6 +59,7 @@ struct trace_uprobe {
        struct inode                    *inode;
        char                            *filename;
        unsigned long                   offset;
+       unsigned long                   ref_ctr_offset;
        unsigned long                   nhit;
        struct trace_probe              tp;
 };
@@ -364,10 +365,10 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
 static int create_trace_uprobe(int argc, char **argv)
 {
        struct trace_uprobe *tu;
-       char *arg, *event, *group, *filename;
+       char *arg, *event, *group, *filename, *rctr, *rctr_end;
        char buf[MAX_EVENT_NAME_LEN];
        struct path path;
-       unsigned long offset;
+       unsigned long offset, ref_ctr_offset;
        bool is_delete, is_return;
        int i, ret;
 
@@ -376,6 +377,7 @@ static int create_trace_uprobe(int argc, char **argv)
        is_return = false;
        event = NULL;
        group = NULL;
+       ref_ctr_offset = 0;
 
        /* argc must be >= 1 */
        if (argv[0][0] == '-')
@@ -450,6 +452,26 @@ static int create_trace_uprobe(int argc, char **argv)
                goto fail_address_parse;
        }
 
+       /* Parse reference counter offset if specified. */
+       rctr = strchr(arg, '(');
+       if (rctr) {
+               rctr_end = strchr(rctr, ')');
+               if (rctr > rctr_end || *(rctr_end + 1) != 0) {
+                       ret = -EINVAL;
+                       pr_info("Invalid reference counter offset.\n");
+                       goto fail_address_parse;
+               }
+
+               *rctr++ = '\0';
+               *rctr_end = '\0';
+               ret = kstrtoul(rctr, 0, &ref_ctr_offset);
+               if (ret) {
+                       pr_info("Invalid reference counter offset.\n");
+                       goto fail_address_parse;
+               }
+       }
+
+       /* Parse uprobe offset. */
        ret = kstrtoul(arg, 0, &offset);
        if (ret)
                goto fail_address_parse;
@@ -484,6 +506,7 @@ static int create_trace_uprobe(int argc, char **argv)
                goto fail_address_parse;
        }
        tu->offset = offset;
+       tu->ref_ctr_offset = ref_ctr_offset;
        tu->path = path;
        tu->filename = kstrdup(filename, GFP_KERNEL);
 
@@ -602,6 +625,9 @@ static int probes_seq_show(struct seq_file *m, void *v)
                        trace_event_name(&tu->tp.call), tu->filename,
                        (int)(sizeof(void *) * 2), tu->offset);
 
+       if (tu->ref_ctr_offset)
+               seq_printf(m, "(0x%lx)", tu->ref_ctr_offset);
+
        for (i = 0; i < tu->tp.nr_args; i++)
                seq_printf(m, " %s=%s", tu->tp.args[i].name, 
tu->tp.args[i].comm);
 
@@ -917,7 +943,13 @@ probe_event_enable(struct trace_uprobe *tu, struct 
trace_event_file *file,
 
        tu->consumer.filter = filter;
        tu->inode = d_real_inode(tu->path.dentry);
-       ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+       if (tu->ref_ctr_offset) {
+               ret = uprobe_register_refctr(tu->inode, tu->offset,
+                               tu->ref_ctr_offset, &tu->consumer);
+       } else {
+               ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+       }
+
        if (ret)
                goto err_buffer;
 
-- 
2.14.4

Reply via email to