On Thu, 27 Jun 2019 16:53:52 +0530 "Naveen N. Rao" <naveen.n....@linux.vnet.ibm.com> wrote:
> With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to
> enable function tracing and profiling. So far, with dynamic ftrace, we
> used to only patch out the branch to _mcount(). However, mflr is
> executed by the branch unit that can only execute one per cycle on
> POWER9 and shared with branches, so it would be nice to avoid it where
> possible.
> 
> We cannot simply nop out the mflr either. When enabling function
> tracing, there can be a race if tracing is enabled when some thread was
> interrupted after executing a nop'ed out mflr. In this case, the thread
> would execute the now-patched-in branch to _mcount() without having
> executed the preceding mflr.
> 
> To solve this, we now enable function tracing in 2 steps: patch in the
> mflr instruction, use 'smp_call_function(isync);
> synchronize_rcu_tasks()' to ensure all existing threads make progress,
> and then patch in the branch to _mcount(). We override
> ftrace_replace_code() with a powerpc64 variant for this purpose.

You may want to explain that you do the reverse when patching it out:
that is, patch out the "bl _mcount" into a nop first, and then patch
out the "mflr r0". Interestingly, though, I don't see a
synchronize_rcu_tasks() call on that path.
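
To make the ordering concrete, this is roughly what the two directions
look like (a simplified sketch as I read the patch; "bl_inst" stands in
for the branch instruction that ftrace_call_replace() actually
computes):

	/* enable: two steps, with full synchronization in between */
	patch_instruction((unsigned int *)(ip - 4), PPC_INST_MFLR);
	smp_call_function(do_isync, NULL, 1);	/* all cpus see the mflr */
	synchronize_rcu_tasks();		/* all tasks make progress */
	patch_instruction((unsigned int *)ip, bl_inst);

	/* disable: the reverse order, with no synchronization step in
	 * the patch as posted -- a lone 'mflr r0' is harmless */
	patch_instruction((unsigned int *)ip, PPC_INST_NOP);
	patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP);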
> 
> Suggested-by: Nicholas Piggin <npig...@gmail.com>
> Reviewed-by: Nicholas Piggin <npig...@gmail.com>
> Signed-off-by: Naveen N. Rao <naveen.n....@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/trace/ftrace.c | 258 ++++++++++++++++++++++++++---
>  1 file changed, 236 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
> index 517662a56bdc..86c2b50dcaa9 100644
> --- a/arch/powerpc/kernel/trace/ftrace.c
> +++ b/arch/powerpc/kernel/trace/ftrace.c
> @@ -125,7 +125,7 @@ __ftrace_make_nop(struct module *mod,
>  {
>  	unsigned long entry, ptr, tramp;
>  	unsigned long ip = rec->ip;
> -	unsigned int op, pop;
> +	unsigned int op;
>  
>  	/* read where this goes */
>  	if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
> @@ -160,8 +160,6 @@ __ftrace_make_nop(struct module *mod,
>  
>  #ifdef CONFIG_MPROFILE_KERNEL
>  	/* When using -mkernel_profile there is no load to jump over */
> -	pop = PPC_INST_NOP;
> -
>  	if (probe_kernel_read(&op, (void *)(ip - 4), 4)) {
>  		pr_err("Fetching instruction at %lx failed.\n", ip - 4);
>  		return -EFAULT;
> @@ -169,26 +167,23 @@ __ftrace_make_nop(struct module *mod,
>  
>  	/* We expect either a mflr r0, or a std r0, LRSAVE(r1) */
>  	if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) {
> -		pr_err("Unexpected instruction %08x around bl _mcount\n", op);
> +		pr_err("Unexpected instruction %08x before bl _mcount\n", op);
>  		return -EINVAL;
>  	}
> -#else
> -	/*
> -	 * Our original call site looks like:
> -	 *
> -	 * bl <tramp>
> -	 * ld r2,XX(r1)
> -	 *
> -	 * Milton Miller pointed out that we can not simply nop the branch.
> -	 * If a task was preempted when calling a trace function, the nops
> -	 * will remove the way to restore the TOC in r2 and the r2 TOC will
> -	 * get corrupted.
> -	 *
> -	 * Use a b +8 to jump over the load.
> -	 */
> 
> -	pop = PPC_INST_BRANCH | 8;	/* b +8 */
> +	/* We should patch out the bl to _mcount first */
> +	if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) {
> +		pr_err("Patching NOP failed.\n");
> +		return -EPERM;
> +	}
> 
> +	/* then, nop out the preceding 'mflr r0' as an optimization */
> +	if (op == PPC_INST_MFLR &&
> +	    patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP)) {
> +		pr_err("Patching NOP failed.\n");
> +		return -EPERM;
> +	}
> +#else
>  	/*
>  	 * Check what is in the next instruction. We can see ld r2,40(r1), but
>  	 * on first pass after boot we will see mflr r0.
> @@ -202,12 +197,25 @@ __ftrace_make_nop(struct module *mod,
>  		pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op);
>  		return -EINVAL;
>  	}
> -#endif /* CONFIG_MPROFILE_KERNEL */
>  
> -	if (patch_instruction((unsigned int *)ip, pop)) {
> +	/*
> +	 * Our original call site looks like:
> +	 *
> +	 * bl <tramp>
> +	 * ld r2,XX(r1)
> +	 *
> +	 * Milton Miller pointed out that we can not simply nop the branch.
> +	 * If a task was preempted when calling a trace function, the nops
> +	 * will remove the way to restore the TOC in r2 and the r2 TOC will
> +	 * get corrupted.
> +	 *
> +	 * Use a b +8 to jump over the load.
> +	 */
> +	if (patch_instruction((unsigned int *)ip, PPC_INST_BRANCH | 8)) {
>  		pr_err("Patching NOP failed.\n");
>  		return -EPERM;
>  	}
> +#endif /* CONFIG_MPROFILE_KERNEL */
>  
>  	return 0;
>  }
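
As an aside, the "b +8" in the hunk above is easiest to see spelled
out. With 4-byte instructions, the ELFv1 call site ends up as (my
sketch, with XX standing in for the actual TOC-save offset):

	b	+8		<-- was 'bl <tramp>'; jumps over the load
	ld	r2,XX(r1)	<-- left intact, so a task that was preempted
				    inside the tracer still returns here and
				    restores its TOC from the stack
	...			<-- the b +8 lands here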
> @@ -421,6 +429,26 @@ static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr)
>  		return -EPERM;
>  	}
>  
> +#ifdef CONFIG_MPROFILE_KERNEL

I would think you need to break this up into two parts as well, with a
synchronize_rcu_tasks() in between.

Imagine this scenario:

	<func>:
	nop <-- interrupt comes here, and preempts the task
	nop

First change.

	<func>:
	mflr	r0
	nop

Second change.

	<func>:
	mflr	r0
	bl	_mcount

Task returns from interrupt:

	<func>:
	mflr	r0
	bl	_mcount <-- executes here

It never executed the mflr r0, because the last instruction it executed
before being interrupted was a nop. And yes, it can be interrupted on a
nop!

-- Steve

> +	/* Nop out the preceding 'mflr r0' as an optimization */
> +	if (probe_kernel_read(&op, (void *)(ip - 4), 4)) {
> +		pr_err("Fetching instruction at %lx failed.\n", ip - 4);
> +		return -EFAULT;
> +	}
> +
> +	/* We expect either a mflr r0, or a std r0, LRSAVE(r1) */
> +	if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) {
> +		pr_err("Unexpected instruction %08x before bl _mcount\n", op);
> +		return -EINVAL;
> +	}
> +
> +	if (op == PPC_INST_MFLR &&
> +	    patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP)) {
> +		pr_err("Patching NOP failed.\n");
> +		return -EPERM;
> +	}
> +#endif
> +
>  	return 0;
>  }
>  
> @@ -429,6 +457,7 @@ int ftrace_make_nop(struct module *mod,
>  {
>  	unsigned long ip = rec->ip;
>  	unsigned int old, new;
> +	int rc;
>  
>  	/*
>  	 * If the calling address is more that 24 bits away,
> @@ -439,7 +468,34 @@ int ftrace_make_nop(struct module *mod,
>  		/* within range */
>  		old = ftrace_call_replace(ip, addr, 1);
>  		new = PPC_INST_NOP;
> -		return ftrace_modify_code(ip, old, new);
> +		rc = ftrace_modify_code(ip, old, new);
> +#ifdef CONFIG_MPROFILE_KERNEL
> +		if (rc)
> +			return rc;
> +
> +		/*
> +		 * For -mprofile-kernel, we patch out the preceding 'mflr r0'
> +		 * instruction, as an optimization. It is important to nop out
> +		 * the branch to _mcount() first, as a lone 'mflr r0' is
> +		 * harmless.
> +		 */
> +		if (probe_kernel_read(&old, (void *)(ip - 4), 4)) {
> +			pr_err("Fetching instruction at %lx failed.\n", ip - 4);
> +			return -EFAULT;
> +		}
> +
> +		/* We expect either a mflr r0, or a std r0, LRSAVE(r1) */
> +		if (old != PPC_INST_MFLR && old != PPC_INST_STD_LR) {
> +			pr_err("Unexpected instruction %08x before bl _mcount\n",
> +			       old);
> +			return -EINVAL;
> +		}
> +
> +		if (old == PPC_INST_MFLR)
> +			rc = patch_instruction((unsigned int *)(ip - 4),
> +					       PPC_INST_NOP);
> +#endif
> +		return rc;
>  	} else if (core_kernel_text(ip))
>  		return __ftrace_make_nop_kernel(rec, addr);
>  
> @@ -567,6 +623,37 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
>  		return -EINVAL;
>  	}
>  
> +#ifdef CONFIG_MPROFILE_KERNEL
> +	/*
> +	 * We could end up here without having called __ftrace_make_call_prep()
> +	 * if function tracing is enabled before a module is loaded.
> +	 *
> +	 * ftrace_module_enable() --> ftrace_replace_code_rec() -->
> +	 *	ftrace_make_call() --> __ftrace_make_call()
> +	 *
> +	 * In this scenario, the previous instruction will be a NOP. It is
> +	 * safe to patch it with a 'mflr r0' since we know for a fact that
> +	 * this code is not yet being run.
> +	 */
> +	ip -= MCOUNT_INSN_SIZE;
> +
> +	/* read where this goes */
> +	if (probe_kernel_read(op, ip, MCOUNT_INSN_SIZE))
> +		return -EFAULT;
> +
> +	/*
> +	 * nothing to do if this is using the older -mprofile-kernel
> +	 * instruction sequence
> +	 */
> +	if (op[0] != PPC_INST_NOP)
> +		return 0;
> +
> +	if (patch_instruction((unsigned int *)ip, PPC_INST_MFLR)) {
> +		pr_err("Patching MFLR failed.\n");
> +		return -EPERM;
> +	}
> +#endif
> +
>  	return 0;
>  }
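
Worth noting for contrast: in this module case the two-step dance is
not needed at all. Since the module code is not yet running, the call
site can simply be rewritten in place (before/after sketch):

	nop			mflr	r0
	nop		-->	bl	_mcount

with no intervening isync broadcast or synchronize_rcu_tasks().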
> 
> @@ -863,6 +950,133 @@ void arch_ftrace_update_code(int command)
>  	ftrace_modify_all_code(command);
>  }
>  
> +#ifdef CONFIG_MPROFILE_KERNEL
> +/* Returns 1 if we patched in the mflr */
> +static int __ftrace_make_call_prep(struct dyn_ftrace *rec)
> +{
> +	void *ip = (void *)rec->ip - MCOUNT_INSN_SIZE;
> +	unsigned int op[2];
> +
> +	/* read where this goes */
> +	if (probe_kernel_read(op, ip, sizeof(op)))
> +		return -EFAULT;
> +
> +	if (op[1] != PPC_INST_NOP) {
> +		pr_err("Unexpected call sequence at %p: %x %x\n",
> +		       ip, op[0], op[1]);
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * nothing to do if this is using the older -mprofile-kernel
> +	 * instruction sequence
> +	 */
> +	if (op[0] != PPC_INST_NOP)
> +		return 0;
> +
> +	if (patch_instruction((unsigned int *)ip, PPC_INST_MFLR)) {
> +		pr_err("Patching MFLR failed.\n");
> +		return -EPERM;
> +	}
> +
> +	return 1;
> +}
> +
> +static void do_isync(void *info __maybe_unused)
> +{
> +	isync();
> +}
> +
> +/*
> + * When enabling function tracing for -mprofile-kernel that uses a
> + * 2-instruction sequence of 'mflr r0, bl _mcount()', we use a 2 step process:
> + * 1. Patch in the 'mflr r0' instruction
> + * 1a. flush icache on all cpus, so that the updated instruction is seen
> + * 1b. synchronize_rcu_tasks() to ensure that any cpus that had executed
> + *     the earlier nop there make progress (and hence do not branch into
> + *     ftrace without executing the preceding mflr)
> + * 2. Patch in the branch to ftrace
> + */
> +void ftrace_replace_code(int mod_flags)
> +{
> +	int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
> +	int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL;
> +	int ret, failed, make_call = 0;
> +	struct ftrace_rec_iter *iter;
> +	struct dyn_ftrace *rec;
> +
> +	if (unlikely(!ftrace_enabled))
> +		return;
> +
> +	for_ftrace_rec_iter(iter) {
> +		rec = ftrace_rec_iter_record(iter);
> +
> +		if (rec->flags & FTRACE_FL_DISABLED)
> +			continue;
> +
> +		failed = 0;
> +		ret = ftrace_test_record(rec, enable);
> +		if (ret == FTRACE_UPDATE_MAKE_CALL) {
> +			failed = __ftrace_make_call_prep(rec);
> +			if (failed < 0) {
> +				ftrace_bug(failed, rec);
> +				return;
> +			} else if (failed == 1)
> +				make_call++;
> +		}
> +
> +		if (!failed) {
> +			/* We can patch the record directly */
> +			failed = ftrace_replace_code_rec(rec, enable);
> +			if (failed) {
> +				ftrace_bug(failed, rec);
> +				return;
> +			}
> +		}
> +
> +		if (schedulable)
> +			cond_resched();
> +	}
> +
> +	if (!make_call)
> +		/* No records needed patching a preceding mflr */
> +		return;
> +
> +	/* Make sure all cpus see the new instruction */
> +	smp_call_function(do_isync, NULL, 1);
> +
> +	/*
> +	 * We also need to ensure that all cpus make progress:
> +	 * - With !CONFIG_PREEMPT, we want to be sure that cpus return from
> +	 *   any interrupts they may be handling, and make progress.
> +	 * - With CONFIG_PREEMPT, we want to be additionally sure that there
> +	 *   are no pre-empted tasks that have executed the earlier nop, and
> +	 *   might end up executing the subsequently patched branch to ftrace.
> +	 */
> +	synchronize_rcu_tasks();
> +
> +	for_ftrace_rec_iter(iter) {
> +		rec = ftrace_rec_iter_record(iter);
> +
> +		if (rec->flags & FTRACE_FL_DISABLED)
> +			continue;
> +
> +		ret = ftrace_test_record(rec, enable);
> +		if (ret == FTRACE_UPDATE_MAKE_CALL)
> +			failed = ftrace_replace_code_rec(rec, enable);
> +
> +		if (failed) {
> +			ftrace_bug(failed, rec);
> +			return;
> +		}
> +
> +		if (schedulable)
> +			cond_resched();
> +	}
> +
> +}
> +#endif
> +
>  #ifdef CONFIG_PPC64
>  #define PACATOC offsetof(struct paca_struct, kernel_toc)
> 