Let's continue by replacing the existing int3-based framework in ftrace with the new generic function introduced by commit fd4363fff3d9 ("x86: Introduce int3 (breakpoint)-based instruction patching").
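To make the comparison below concrete, here is a minimal sketch (illustration only, not part of the patch) of the straightforward per-record conversion that was measured. It assumes the four-argument text_poke_bp(addr, opcode, len, handler) prototype from the commit above, relies on the static helpers that already live in arch/x86/kernel/ftrace.c (ftrace_nop_replace(), ftrace_call_replace(), get_ftrace_addr()), and the function name ftrace_replace_code_simple() is made up for this example:

static void ftrace_replace_code_simple(int enable)
{
	struct ftrace_rec_iter *iter;

	for (iter = ftrace_rec_iter_start(); iter;
	     iter = ftrace_rec_iter_next(iter)) {
		struct dyn_ftrace *rec = ftrace_rec_iter_record(iter);
		unsigned const char *new;

		switch (ftrace_test_record(rec, enable)) {
		case FTRACE_UPDATE_IGNORE:
			continue;
		case FTRACE_UPDATE_MAKE_NOP:
			new = ftrace_nop_replace();
			break;
		default: /* MAKE_CALL and MODIFY_CALL* variants */
			new = ftrace_call_replace(rec->ip, get_ftrace_addr(rec));
			break;
		}

		/*
		 * Each call plants an int3 and triggers its own rounds of
		 * on_each_cpu() syncing; the handler address makes the
		 * site behave like a nop while it is being patched.
		 */
		text_poke_bp((void *)rec->ip, new, MCOUNT_INSN_SIZE,
			     (void *)(rec->ip + MCOUNT_INSN_SIZE));

		/* commit the new state in the ftrace bookkeeping */
		ftrace_update_record(rec, enable);
	}
}

Because every record pays for its own global syncs in such a loop, the whole pass gets roughly twice as slow, which is what the numbers below show; text_poke_bp_iter batches all records so that only a few global syncs are needed for the complete pass.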
This time, use it in ftrace_replace_code(), which modifies all the watched function calls.

If we used text_poke_bp() in ftrace_make_nop(), ftrace_make_call(), and ftrace_modify_call(), it would be possible to get rid of the x86-specific ftrace_replace_code() implementation and use the generic code. That would be a really lovely change. Unfortunately, the code would be slow because syncing on each CPU is a relatively expensive operation.

For example, I tried to switch between 7 tracers: blk, branch, function_graph, wakeup_rt, irqsoff, function, and nop. Every tracer was also enabled and disabled. With 500 cycles, I got these times with the original code:

    real    16m14.390s    16m15.200s    16m19.632s
    user    0m0.028s      0m0.024s      0m0.028s
    sys     0m23.788s     0m23.812s     0m23.804s

It slowed down when text_poke_bp() was used for each record separately:

    real    29m45.785s    29m47.504s    29m44.016s
    user    0m0.004s      0m0.004s      0m0.004s
    sys     0m15.776s     0m16.088s     0m16.192s

Hence, we implemented the more complex text_poke_bp_iter. When we use it, we get these times:

    real    17m9.753s     17m12.272s    17m11.424s
    user    0m0.004s      0m0.004s      0m0.004s
    sys     0m18.244s     0m18.252s     0m18.308s

It is slightly slower than the ftrace-specific code, but that is the cost of a generic API that can also be used in other locations.

Signed-off-by: Petr Mladek <pmla...@suse.cz>
---
---
 arch/x86/kernel/alternative.c |   4 +-
 arch/x86/kernel/ftrace.c      | 428 +++++++----------------------------------
 arch/x86/kernel/traps.c       |  10 -
 include/linux/ftrace.h        |   6 -
 4 files changed, 75 insertions(+), 373 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 95e8a7f..c7c7ad8 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -603,7 +603,7 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
  *
  * Note: Must be called under text_mutex.
  */
-static int text_poke_part(void *addr, const void *opcode, size_t len)
+static int notrace text_poke_part(void *addr, const void *opcode, size_t len)
 {
 	/*
 	 * On x86_64, kernel text mappings are mapped read-only with
@@ -647,7 +647,7 @@ static void *bp_int3_handler, *bp_int3_addr;
 static size_t bp_int3_len;
 static void *(*bp_int3_is_handled)(const unsigned long ip);
 
-int poke_int3_handler(struct pt_regs *regs)
+int notrace poke_int3_handler(struct pt_regs *regs)
 {
 	/* bp_patching_in_progress */
 	smp_rmb();
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 5ade40e..92fe8ca 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -127,23 +127,6 @@ static const unsigned char *ftrace_nop_replace(void)
 }
 
 static int
-ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
-			  unsigned const char *new_code)
-{
-	int ret;
-
-	ret = ftrace_check_code(ip, old_code);
-
-	/* replace the text with the new text */
-	if (!ret && do_ftrace_mod_code(ip, new_code))
-		return -EPERM;
-
-	sync_core();
-
-	return 0;
-}
-
-static int
 ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
 		   unsigned const char *new_code)
 {
@@ -168,20 +151,7 @@ int ftrace_make_nop(struct module *mod,
 	old = ftrace_call_replace(ip, addr);
 	new = ftrace_nop_replace();
 
-	/*
-	 * On boot up, and when modules are loaded, the MCOUNT_ADDR
-	 * is converted to a nop, and will never become MCOUNT_ADDR
-	 * again. This code is either running before SMP (on boot up)
-	 * or before the code will ever be executed (module load).
-	 * We do not want to use the breakpoint version in this case,
-	 * just modify the code directly.
-	 */
-	if (addr == MCOUNT_ADDR)
-		return ftrace_modify_code_direct(rec->ip, old, new);
-
-	/* Normal cases use add_brk_on_nop */
-	WARN_ONCE(1, "invalid use of ftrace_make_nop");
-	return -EINVAL;
+	return ftrace_modify_code(ip, old, new);
 }
 
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -192,44 +162,10 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 	old = ftrace_nop_replace();
 	new = ftrace_call_replace(ip, addr);
 
-	/* Should only be called when module is loaded */
-	return ftrace_modify_code_direct(rec->ip, old, new);
+	return ftrace_modify_code(ip, old, new);
 }
 
 /*
- * The modifying_ftrace_code is used to tell the breakpoint
- * handler to call ftrace_int3_handler(). If it fails to
- * call this handler for a breakpoint added by ftrace, then
- * the kernel may crash.
- *
- * As atomic_writes on x86 do not need a barrier, we do not
- * need to add smp_mb()s for this to work. It is also considered
- * that we can not read the modifying_ftrace_code before
- * executing the breakpoint. That would be quite remarkable if
- * it could do that. Here's the flow that is required:
- *
- *   CPU-0                          CPU-1
- *
- *   atomic_inc(mfc);
- *   write int3s
- *				<trap-int3> // implicit (r)mb
- *				if (atomic_read(mfc))
- *					call ftrace_int3_handler()
- *
- * Then when we are finished:
- *
- *   atomic_dec(mfc);
- *
- * If we hit a breakpoint that was not set by ftrace, it does not
- * matter if ftrace_int3_handler() is called or not. It will
- * simply be ignored. But it is crucial that a ftrace nop/caller
- * breakpoint is handled. No other user should ever place a
- * breakpoint on an ftrace nop/caller location. It must only
- * be done by this code.
- */
-atomic_t modifying_ftrace_code __read_mostly;
-
-/*
  * Should never be called:
  * As it is only called by __ftrace_replace_code() which is called by
  * ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
@@ -267,80 +203,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 }
 
 /*
- * A breakpoint was added to the code address we are about to
- * modify, and this is the handle that will just skip over it.
- * We are either changing a nop into a trace call, or a trace
- * call to a nop. While the change is taking place, we treat
- * it just like it was a nop.
- */
-int ftrace_int3_handler(struct pt_regs *regs)
-{
-	if (WARN_ON_ONCE(!regs))
-		return 0;
-
-	if (!ftrace_location(regs->ip - 1))
-		return 0;
-
-	regs->ip += MCOUNT_INSN_SIZE - 1;
-
-	return 1;
-}
-
-static int ftrace_write(unsigned long ip, const char *val, int size)
-{
-	/*
-	 * On x86_64, kernel text mappings are mapped read-only with
-	 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
-	 * of the kernel text mapping to modify the kernel text.
-	 *
-	 * For 32bit kernels, these mappings are same and we can use
-	 * kernel identity mapping to modify code.
-	 */
-	if (within(ip, (unsigned long)_text, (unsigned long)_etext))
-		ip = (unsigned long)__va(__pa_symbol(ip));
-
-	return probe_kernel_write((void *)ip, val, size);
-}
-
-static int add_break(unsigned long ip, const char *old)
-{
-	unsigned char replaced[MCOUNT_INSN_SIZE];
-	unsigned char brk = BREAKPOINT_INSTRUCTION;
-
-	if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	/* Make sure it is what we expect it to be */
-	if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
-		return -EINVAL;
-
-	if (ftrace_write(ip, &brk, 1))
-		return -EPERM;
-
-	return 0;
-}
-
-static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned const char *old;
-	unsigned long ip = rec->ip;
-
-	old = ftrace_call_replace(ip, addr);
-
-	return add_break(rec->ip, old);
-}
-
-
-static int add_brk_on_nop(struct dyn_ftrace *rec)
-{
-	unsigned const char *old;
-
-	old = ftrace_nop_replace();
-
-	return add_break(rec->ip, old);
-}
-
-/*
  * If the record has the FTRACE_FL_REGS set, that means that it
  * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
  * is not not set, then it wants to convert to the normal callback.
@@ -366,275 +228,131 @@ static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec)
 	return (unsigned long)FTRACE_ADDR;
 }
 
-static int add_breakpoints(struct dyn_ftrace *rec, int enable)
-{
-	unsigned long ftrace_addr;
-	int ret;
-
-	ret = ftrace_test_record(rec, enable);
-
-	ftrace_addr = get_ftrace_addr(rec);
-
-	switch (ret) {
-	case FTRACE_UPDATE_IGNORE:
-		return 0;
-
-	case FTRACE_UPDATE_MAKE_CALL:
-		/* converting nop to call */
-		return add_brk_on_nop(rec);
-
-	case FTRACE_UPDATE_MODIFY_CALL_REGS:
-	case FTRACE_UPDATE_MODIFY_CALL:
-		ftrace_addr = get_ftrace_old_addr(rec);
-		/* fall through */
-	case FTRACE_UPDATE_MAKE_NOP:
-		/* converting a call to a nop */
-		return add_brk_on_call(rec, ftrace_addr);
-	}
-	return 0;
-}
+struct ftrace_tp_iter {
+	struct ftrace_rec_iter *rec_iter;
+	struct dyn_ftrace *rec;
+	int enable;
+};
 
-/*
- * On error, we need to remove breakpoints. This needs to
- * be done caefully. If the address does not currently have a
- * breakpoint, we know we are done. Otherwise, we look at the
- * remaining 4 bytes of the instruction. If it matches a nop
- * we replace the breakpoint with the nop. Otherwise we replace
- * it with the call instruction.
- */
-static int remove_breakpoint(struct dyn_ftrace *rec)
+static struct ftrace_tp_iter *ftrace_tp_set_record(struct ftrace_tp_iter *tp_iter)
 {
-	unsigned char ins[MCOUNT_INSN_SIZE];
-	unsigned char brk = BREAKPOINT_INSTRUCTION;
-	const unsigned char *nop;
-	unsigned long ftrace_addr;
-	unsigned long ip = rec->ip;
-
-	/* If we fail the read, just give up */
-	if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
+	if (!tp_iter->rec_iter)
+		return NULL;
 
-	/* If this does not have a breakpoint, we are done */
-	if (ins[0] != brk)
-		return -1;
-
-	nop = ftrace_nop_replace();
-
-	/*
-	 * If the last 4 bytes of the instruction do not match
-	 * a nop, then we assume that this is a call to ftrace_addr.
-	 */
-	if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
-		/*
-		 * For extra paranoidism, we check if the breakpoint is on
-		 * a call that would actually jump to the ftrace_addr.
-		 * If not, don't touch the breakpoint, we make just create
-		 * a disaster.
-		 */
-		ftrace_addr = get_ftrace_addr(rec);
-		nop = ftrace_call_replace(ip, ftrace_addr);
-
-		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
-			goto update;
-
-		/* Check both ftrace_addr and ftrace_old_addr */
-		ftrace_addr = get_ftrace_old_addr(rec);
-		nop = ftrace_call_replace(ip, ftrace_addr);
-
-		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
-			return -EINVAL;
-	}
-
- update:
-	return probe_kernel_write((void *)ip, &nop[0], 1);
+	tp_iter->rec = ftrace_rec_iter_record(tp_iter->rec_iter);
+	return tp_iter;
 }
 
-static int add_update_code(unsigned long ip, unsigned const char *new)
+void *ftrace_tp_iter_start(void *init)
 {
-	/* skip breakpoint */
-	ip++;
-	new++;
-	if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
-		return -EPERM;
-	return 0;
+	struct ftrace_tp_iter *tp_iter = init;
+
+	tp_iter->rec_iter = ftrace_rec_iter_start();
+	return ftrace_tp_set_record(tp_iter);
 }
 
-static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
+void *ftrace_tp_iter_next(void *cur)
 {
-	unsigned long ip = rec->ip;
-	unsigned const char *new;
+	struct ftrace_tp_iter *tp_iter = cur;
 
-	new = ftrace_call_replace(ip, addr);
-	return add_update_code(ip, new);
+	tp_iter->rec_iter = ftrace_rec_iter_next(tp_iter->rec_iter);
+	return ftrace_tp_set_record(tp_iter);
 }
 
-static int add_update_nop(struct dyn_ftrace *rec)
+void *ftrace_tp_iter_get_addr(void *cur)
 {
-	unsigned long ip = rec->ip;
-	unsigned const char *new;
+	struct ftrace_tp_iter *tp_iter = cur;
 
-	new = ftrace_nop_replace();
-	return add_update_code(ip, new);
+	return (void *)(tp_iter->rec->ip);
 }
 
-static int add_update(struct dyn_ftrace *rec, int enable)
+const void *ftrace_tp_iter_get_opcode(void *cur)
 {
-	unsigned long ftrace_addr;
+	struct ftrace_tp_iter *tp_iter = cur;
+	unsigned long addr;
 	int ret;
 
-	ret = ftrace_test_record(rec, enable);
-
-	ftrace_addr = get_ftrace_addr(rec);
+	ret = ftrace_test_record(tp_iter->rec, tp_iter->enable);
 
 	switch (ret) {
-	case FTRACE_UPDATE_IGNORE:
-		return 0;
+	case FTRACE_UPDATE_MAKE_NOP:
+		return ftrace_nop_replace();
 
-	case FTRACE_UPDATE_MODIFY_CALL_REGS:
-	case FTRACE_UPDATE_MODIFY_CALL:
 	case FTRACE_UPDATE_MAKE_CALL:
-		/* converting nop to call */
-		return add_update_call(rec, ftrace_addr);
+	case FTRACE_UPDATE_MODIFY_CALL:
+	case FTRACE_UPDATE_MODIFY_CALL_REGS:
+		addr = get_ftrace_addr(tp_iter->rec);
+		return ftrace_call_replace(tp_iter->rec->ip, addr);
 
-	case FTRACE_UPDATE_MAKE_NOP:
-		/* converting a call to a nop */
-		return add_update_nop(rec);
+	case FTRACE_UPDATE_IGNORE:
+	default: /* unknown ftrace bug */
+		return NULL;
 	}
-
-	return 0;
-}
-
-static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned long ip = rec->ip;
-	unsigned const char *new;
-
-	new = ftrace_call_replace(ip, addr);
-
-	if (ftrace_write(ip, new, 1))
-		return -EPERM;
-
-	return 0;
-}
-
-static int finish_update_nop(struct dyn_ftrace *rec)
-{
-	unsigned long ip = rec->ip;
-	unsigned const char *new;
-
-	new = ftrace_nop_replace();
-
-	if (ftrace_write(ip, new, 1))
-		return -EPERM;
-	return 0;
 }
 
-static int finish_update(struct dyn_ftrace *rec, int enable)
+const void *ftrace_tp_iter_get_old_opcode(void *cur)
 {
-	unsigned long ftrace_addr;
+	struct ftrace_tp_iter *tp_iter = cur;
+	unsigned long old_addr;
 	int ret;
 
-	ret = ftrace_update_record(rec, enable);
-
-	ftrace_addr = get_ftrace_addr(rec);
+	ret = ftrace_test_record(tp_iter->rec, tp_iter->enable);
 
 	switch (ret) {
-	case FTRACE_UPDATE_IGNORE:
-		return 0;
-
-	case FTRACE_UPDATE_MODIFY_CALL_REGS:
-	case FTRACE_UPDATE_MODIFY_CALL:
 	case FTRACE_UPDATE_MAKE_CALL:
-		/* converting nop to call */
-		return finish_update_call(rec, ftrace_addr);
+		return ftrace_nop_replace();
 
 	case FTRACE_UPDATE_MAKE_NOP:
-		/* converting a call to a nop */
-		return finish_update_nop(rec);
-	}
+	case FTRACE_UPDATE_MODIFY_CALL:
+	case FTRACE_UPDATE_MODIFY_CALL_REGS:
+		old_addr = get_ftrace_old_addr(tp_iter->rec);
+		return ftrace_call_replace(tp_iter->rec->ip, old_addr);
 
-	return 0;
+	case FTRACE_UPDATE_IGNORE:
+	default: /* unknown ftrace bug */
+		return NULL;
+	}
 }
 
-static void do_sync_core(void *data)
+int ftrace_tp_iter_finish(void *cur)
 {
-	sync_core();
-}
+	struct ftrace_tp_iter *tp_iter = cur;
 
-static void run_sync(void)
-{
-	int enable_irqs = irqs_disabled();
-
-	/* We may be called with interrupts disbled (on bootup). */
-	if (enable_irqs)
-		local_irq_enable();
-	on_each_cpu(do_sync_core, NULL, 1);
-	if (enable_irqs)
-		local_irq_disable();
+	ftrace_update_record(tp_iter->rec, tp_iter->enable);
+	return 0;
 }
 
 void ftrace_replace_code(int enable)
 {
-	struct ftrace_rec_iter *iter;
-	struct dyn_ftrace *rec;
-	const char *report = "adding breakpoints";
-	int count = 0;
+	struct text_poke_bp_iter tp_bp_iter;
+	struct ftrace_tp_iter tp_iter;
 	int ret;
 
-	for_ftrace_rec_iter(iter) {
-		rec = ftrace_rec_iter_record(iter);
-
-		ret = add_breakpoints(rec, enable);
-		if (ret)
-			goto remove_breakpoints;
-		count++;
-	}
-
-	run_sync();
-
-	report = "updating code";
-
-	for_ftrace_rec_iter(iter) {
-		rec = ftrace_rec_iter_record(iter);
-
-		ret = add_update(rec, enable);
-		if (ret)
-			goto remove_breakpoints;
-	}
+	tp_iter.enable = enable;
 
-	run_sync();
+	tp_bp_iter.init = (void *)&tp_iter;
+	tp_bp_iter.len = MCOUNT_INSN_SIZE;
+	tp_bp_iter.start = ftrace_tp_iter_start;
+	tp_bp_iter.next = ftrace_tp_iter_next;
+	tp_bp_iter.get_addr = ftrace_tp_iter_get_addr;
+	tp_bp_iter.get_opcode = ftrace_tp_iter_get_opcode;
+	tp_bp_iter.get_old_opcode = ftrace_tp_iter_get_old_opcode;
+	tp_bp_iter.finish = ftrace_tp_iter_finish;
+	tp_bp_iter.is_handled = (void *(*)(const unsigned long))ftrace_location;
 
-	report = "removing breakpoints";
+	ret = text_poke_bp_list(&tp_bp_iter);
 
-	for_ftrace_rec_iter(iter) {
-		rec = ftrace_rec_iter_record(iter);
-
-		ret = finish_update(rec, enable);
-		if (ret)
-			goto remove_breakpoints;
-	}
-
-	run_sync();
-
-	return;
-
- remove_breakpoints:
-	ftrace_bug(ret, rec ? rec->ip : 0);
-	printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
-	for_ftrace_rec_iter(iter) {
-		rec = ftrace_rec_iter_record(iter);
-		remove_breakpoint(rec);
-	}
+	if (ret)
+		ftrace_bug(ret, (unsigned long)(tp_bp_iter.fail_addr));
 }
 
+/*
+ * The code is modified using Int3 guard,
+ * so we do not need to call the stop machine
+ */
 void arch_ftrace_update_code(int command)
 {
-	/* See comment above by declaration of modifying_ftrace_code */
-	atomic_inc(&modifying_ftrace_code);
-
 	ftrace_modify_all_code(command);
-
-	atomic_dec(&modifying_ftrace_code);
 }
 
 int __init ftrace_dyn_arch_init(void *data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 729aa77..16bf450 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -50,7 +50,6 @@
 #include <asm/processor.h>
 #include <asm/debugreg.h>
 #include <linux/atomic.h>
-#include <asm/ftrace.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
@@ -319,15 +318,6 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
 {
 	enum ctx_state prev_state;
 
-#ifdef CONFIG_DYNAMIC_FTRACE
-	/*
-	 * ftrace must be first, everything else may cause a recursive crash.
-	 * See note by declaration of modifying_ftrace_code in ftrace.c
-	 */
-	if (unlikely(atomic_read(&modifying_ftrace_code)) &&
-	    ftrace_int3_handler(regs))
-		return;
-#endif
 	if (poke_int3_handler(regs))
 		return;
 
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 9f15c00..40ec039 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -383,12 +383,6 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void);
 struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter);
 struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter);
 
-#define for_ftrace_rec_iter(iter)		\
-	for (iter = ftrace_rec_iter_start();	\
-	     iter;				\
-	     iter = ftrace_rec_iter_next(iter))
-
-
 int ftrace_update_record(struct dyn_ftrace *rec, int enable);
 int ftrace_test_record(struct dyn_ftrace *rec, int enable);
 void ftrace_run_stop_machine(int command);
-- 
1.8.4