On Fri, 3 May 2019 16:07:59 -0700
Linus Torvalds <torva...@linux-foundation.org> wrote:

>     static struct pt_regs *emulate_call(struct pt_regs *regs, unsigned
> long return, unsigned long target)
>     {
>     #ifdef CONFIG_X86_32
>             /* BIG comment about how we need to move pt_regs to make
> room and to update the return 'sp' */
>             struct pt_regs *new = (void *)regs - 4;
>             unsigned long *sp = (unsigned long *)(new + 1);

I tried this, and it crashed. After a bit of debugging, I found that
the issue is that we can't use sizeof(*regs), because we don't have a
full pt_regs. We are missing the first entry (which is why we have
kernel_stack_pointer() in the first place!)


>             memmove(new, regs, sizeof(*regs));

This would need to be something like:

#define SAVED_REGS_SIZE (sizeof(struct pt_regs) - 4)

                struct pt_regs *new = (void *)regs - 4;
                unsigned long *sp = (unsigned long *)
                        ((void *)new + SAVED_REGS_SIZE);

                memmove(new, regs, SAVED_REGS_SIZE);

>             regs = new;
>     #else
>             unsigned long *sp = regs->sp;
>             regs->sp -= 4;

should be:

                unsigned long *sp;

                regs->sp -= sizeof(long);
                sp = (unsigned long *)regs->sp;

>     #endif
>             *sp = value;
>             regs->ip = target;
>             return regs;
>     }

I got it sorta working. And it appears that printk() no longer flushes
partial lines, and I couldn't figure out where my self tests broke as
it does:

        pr_info("Testing ....: ");
        ret = do_test();
        if (ret)
                pr_info("PASSED\n");
        else
                pr_info("FAILED\n");

but because it doesn't flush anymore, I don't see what test it is :-p
But that's another issue.

With some "early_printk" in the code, I do see it doing the calls, but
for some reason, the start up test never moves forward.

Below is the patch that I tried. I started with Peter's changes (this
works on x86_64) and tried to work your ideas in for x86_32. This still
gets stuck (but doesn't crash). 

Also note that this method prevents the die notifiers from doing this,
which means I had to disable Peter's selftest because that no longer
will work the way it is written.

-- Steve

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index d309f30cf7af..f0fd3e17bedd 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -67,9 +67,20 @@
 # define preempt_stop(clobbers)        DISABLE_INTERRUPTS(clobbers); 
TRACE_IRQS_OFF
 #else
 # define preempt_stop(clobbers)
-# define resume_kernel         restore_all_kernel
 #endif
 
+.macro RETINT_PREEMPT
+#ifdef CONFIG_PREEMPT
+       DISABLE_INTERRUPTS(CLBR_ANY)
+       cmpl    $0, PER_CPU_VAR(__preempt_count)
+       jnz     .Lend_\@
+       testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception 
path) ?
+       jz      .Lend_\@
+       call    preempt_schedule_irq
+.Lend_\@:
+#endif
+.endm
+
 .macro TRACE_IRQS_IRET
 #ifdef CONFIG_TRACE_IRQFLAGS
        testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp)     # interrupts off?
@@ -377,6 +388,7 @@
 
 #define CS_FROM_ENTRY_STACK    (1 << 31)
 #define CS_FROM_USER_CR3       (1 << 30)
+#define CS_FROM_INT3           (1 << 29)
 
 .macro SWITCH_TO_KERNEL_STACK
 
@@ -753,7 +765,7 @@ ret_from_intr:
        andl    $SEGMENT_RPL_MASK, %eax
 #endif
        cmpl    $USER_RPL, %eax
-       jb      resume_kernel                   # not returning to v8086 or 
userspace
+       jb      restore_all_kernel              # not returning to v8086 or 
userspace
 
 ENTRY(resume_userspace)
        DISABLE_INTERRUPTS(CLBR_ANY)
@@ -763,19 +775,6 @@ ENTRY(resume_userspace)
        jmp     restore_all
 END(ret_from_exception)
 
-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
-       DISABLE_INTERRUPTS(CLBR_ANY)
-.Lneed_resched:
-       cmpl    $0, PER_CPU_VAR(__preempt_count)
-       jnz     restore_all_kernel
-       testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception 
path) ?
-       jz      restore_all_kernel
-       call    preempt_schedule_irq
-       jmp     .Lneed_resched
-END(resume_kernel)
-#endif
-
 GLOBAL(__begin_SYSENTER_singlestep_region)
 /*
  * All code from here through __end_SYSENTER_singlestep_region is subject
@@ -1026,6 +1025,7 @@ restore_all:
        INTERRUPT_RETURN
 
 restore_all_kernel:
+       RETINT_PREEMPT
        TRACE_IRQS_IRET
        PARANOID_EXIT_TO_KERNEL_MODE
        BUG_IF_WRONG_CR3
@@ -1476,6 +1476,27 @@ END(nmi)
 
 ENTRY(int3)
        ASM_CLAC
+
+#ifdef CONFIG_VM86
+       testl   $X86_EFLAGS_VM, 8(%esp)
+       jnz     .Lfrom_usermode_no_gap
+#endif
+       testl   $SEGMENT_RPL_MASK, 4(%esp)
+       jnz     .Lfrom_usermode_no_gap
+
+       pushl   $-1                             # mark this as an int
+
+       SAVE_ALL switch_stacks=1
+       ENCODE_FRAME_POINTER
+       TRACE_IRQS_OFF
+       movl    %esp, %eax                      # pt_regs pointer
+       subl    $8, %esp
+       call    do_kernel_int3
+       movl    %eax, %esp
+       jmp     ret_from_exception
+
+.Lfrom_usermode_no_gap:
+
        pushl   $-1                             # mark this as an int
 
        SAVE_ALL switch_stacks=1
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1f0efdb7b629..834ec1397dab 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -879,7 +879,7 @@ apicinterrupt IRQ_WORK_VECTOR                       
irq_work_interrupt              smp_irq_work_interrupt
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 
create_gap=0
 ENTRY(\sym)
        UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -899,6 +899,16 @@ ENTRY(\sym)
        jnz     .Lfrom_usermode_switch_stack_\@
        .endif
 
+       .if \create_gap == 1
+       testb   $3, CS-ORIG_RAX(%rsp)
+       jnz     .Lfrom_usermode_no_gap_\@
+       .rept 6
+       pushq   5*8(%rsp)
+       .endr
+       UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+       .endif
+
        .if \paranoid
        call    paranoid_entry
        .else
@@ -1130,7 +1140,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \
 #endif /* CONFIG_HYPERV */
 
 idtentry debug                 do_debug                has_error_code=0        
paranoid=1 shift_ist=DEBUG_STACK
-idtentry int3                  do_int3                 has_error_code=0
+idtentry int3                  do_int3                 has_error_code=0        
create_gap=1
 idtentry stack_segment         do_stack_segment        has_error_code=1
 
 #ifdef CONFIG_XEN_PV
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index cf350639e76d..4b335ac5afcc 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -37,7 +37,7 @@ struct dyn_arch_ftrace {
        /* No extra data needed for x86 */
 };
 
-int ftrace_int3_handler(struct pt_regs *regs);
+int ftrace_int3_handler(struct pt_regs **regs);
 
 #define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
 
diff --git a/arch/x86/include/asm/text-patching.h 
b/arch/x86/include/asm/text-patching.h
index e85ff65c43c3..8e638231cf52 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -39,4 +39,33 @@ extern int poke_int3_handler(struct pt_regs *regs);
 extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void 
*handler);
 extern int after_bootmem;
 
+static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+       regs->ip = ip;
+}
+
+#define INT3_INSN_SIZE 1
+#define CALL_INSN_SIZE 5
+
+static inline struct pt_regs * int3_emulate_call(struct pt_regs *regs, 
unsigned long func)
+{
+#ifdef CONFIG_X86_32
+#define SAVED_REGS_SIZE (sizeof(struct pt_regs) - 8)
+       /* BIG comment about how we need to move pt_regs to make
+          room and to update the return 'sp' */
+       struct pt_regs *new = (void *)regs - sizeof(long);
+       unsigned long *sp = (unsigned long *)
+               ((void *)new + SAVED_REGS_SIZE);
+       memmove(new, regs, SAVED_REGS_SIZE);
+       regs = new;
+#else
+       unsigned long *sp;
+       regs->sp -= sizeof(long);
+       sp = (unsigned long *)regs->sp;
+#endif
+       *sp = regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE;
+       regs->ip = func;
+       return regs;
+}
+
 #endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9a79c7808f9c..c0e9002e2708 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -613,11 +613,84 @@ extern struct paravirt_patch_site 
__start_parainstructions[],
        __stop_parainstructions[];
 #endif /* CONFIG_PARAVIRT */
 
+/*
+ * Self-test for the INT3 based CALL emulation code.
+ *
+ * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
+ * properly and that there is a stack gap between the INT3 frame and the
+ * previous context. Without this gap doing a virtual PUSH on the interrupted
+ * stack would corrupt the INT3 IRET frame.
+ *
+ * See entry_{32,64}.S for more details.
+ */
+static void __init int3_magic(unsigned int *ptr)
+{
+       *ptr = 1;
+}
+
+extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
+
+static int __init
+int3_exception_notify(struct notifier_block *self, unsigned long val, void 
*data)
+{
+       struct die_args *args = data;
+       struct pt_regs *regs = args->regs;
+
+       if (!regs || user_mode(regs))
+               return NOTIFY_DONE;
+
+       if (val != DIE_INT3)
+               return NOTIFY_DONE;
+
+       if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
+               return NOTIFY_DONE;
+
+       int3_emulate_call(regs, (unsigned long)&int3_magic);
+       return NOTIFY_STOP;
+}
+
+static void __init int3_selftest(void)
+{
+       static __initdata struct notifier_block int3_exception_nb = {
+               .notifier_call  = int3_exception_notify,
+               .priority       = INT_MAX-1, /* last */
+       };
+       unsigned int val = 0;
+
+       return;
+       BUG_ON(register_die_notifier(&int3_exception_nb));
+
+       /*
+        * Basically: int3_magic(&val); but really complicated :-)
+        *
+        * Stick the address of the INT3 instruction into int3_selftest_ip,
+        * then trigger the INT3, padded with NOPs to match a CALL instruction
+        * length.
+        */
+       asm volatile ("1: int3; nop; nop; nop; nop\n\t"
+                     ".pushsection .init.data,\"aw\"\n\t"
+                     ".align " __ASM_SEL(4, 8) "\n\t"
+                     ".type int3_selftest_ip, @object\n\t"
+                     ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
+                     "int3_selftest_ip:\n\t"
+                     __ASM_SEL(.long, .quad) " 1b\n\t"
+                     ".popsection\n\t"
+                     : : __ASM_SEL_RAW(a, D) (&val) : "memory");
+
+       BUG_ON(val != 1);
+
+       unregister_die_notifier(&int3_exception_nb);
+}
+
 void __init alternative_instructions(void)
 {
-       /* The patching is not fully atomic, so try to avoid local interruptions
-          that might execute the to be patched code.
-          Other CPUs are not running. */
+       int3_selftest();
+
+       /*
+        * The patching is not fully atomic, so try to avoid local
+        * interruptions that might execute the to be patched code.
+        * Other CPUs are not running.
+        */
        stop_nmi();
 
        /*
@@ -642,10 +715,11 @@ void __init alternative_instructions(void)
                                            _text, _etext);
        }
 
-       if (!uniproc_patched || num_possible_cpus() == 1)
+       if (!uniproc_patched || num_possible_cpus() == 1) {
                free_init_pages("SMP alternatives",
                                (unsigned long)__smp_locks,
                                (unsigned long)__smp_locks_end);
+       }
 #endif
 
        apply_paravirt(__parainstructions, __parainstructions_end);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..c5e2ae1efbd8 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -29,6 +29,7 @@
 #include <asm/kprobes.h>
 #include <asm/ftrace.h>
 #include <asm/nops.h>
+#include <asm/text-patching.h>
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
@@ -231,6 +232,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned 
long old_addr,
 }
 
 static unsigned long ftrace_update_func;
+static unsigned long ftrace_update_func_call;
 
 static int update_ftrace_func(unsigned long ip, void *new)
 {
@@ -259,6 +261,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
        unsigned char *new;
        int ret;
 
+       ftrace_update_func_call = (unsigned long)func;
+
        new = ftrace_call_replace(ip, (unsigned long)func);
        ret = update_ftrace_func(ip, new);
 
@@ -287,20 +291,30 @@ static nokprobe_inline int is_ftrace_caller(unsigned long 
ip)
  * call to a nop. While the change is taking place, we treat
  * it just like it was a nop.
  */
-int ftrace_int3_handler(struct pt_regs *regs)
+int ftrace_int3_handler(struct pt_regs **pregs)
 {
+       struct pt_regs *regs = *pregs;
        unsigned long ip;
 
        if (WARN_ON_ONCE(!regs))
                return 0;
 
-       ip = regs->ip - 1;
-       if (!ftrace_location(ip) && !is_ftrace_caller(ip))
-               return 0;
+       ip = regs->ip - INT3_INSN_SIZE;
 
-       regs->ip += MCOUNT_INSN_SIZE - 1;
+       if (ftrace_location(ip)) {
+               *pregs = int3_emulate_call(regs, (unsigned 
long)ftrace_regs_caller);
+//             early_printk("ip=%pS was=%px is=%px\n", (void *)ip, regs, 
*pregs);
+               return 1;
+       } else if (is_ftrace_caller(ip)) {
+               if (!ftrace_update_func_call) {
+                       int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
+                       return 1;
+               }
+               *pregs = int3_emulate_call(regs, ftrace_update_func_call);
+               return 1;
+       }
 
-       return 1;
+       return 0;
 }
 NOKPROBE_SYMBOL(ftrace_int3_handler);
 
@@ -859,6 +873,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 
        func = ftrace_ops_get_func(ops);
 
+       ftrace_update_func_call = (unsigned long)func;
+
        /* Do a safe modify in case the trampoline is executing */
        new = ftrace_call_replace(ip, (unsigned long)func);
        ret = update_ftrace_func(ip, new);
@@ -960,6 +976,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
 {
        unsigned char *new;
 
+       ftrace_update_func_call = 0UL;
        new = ftrace_jmp_replace(ip, (unsigned long)func);
 
        return update_ftrace_func(ip, new);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 4b8ee05dd6ad..03ba69a6e594 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -163,6 +163,9 @@ static inline bool invalid_selector(u16 value)
  * stack pointer we fall back to regs as stack if no previous stack
  * exists.
  *
+ * There is a special case for INT3, there we construct a full pt_regs
+ * environment. We can detect this case by a high bit in regs->cs
+ *
  * This is valid only for kernel mode traps.
  */
 unsigned long kernel_stack_pointer(struct pt_regs *regs)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d26f9e9c3d83..17e0c56326c9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -570,7 +570,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 }
 NOKPROBE_SYMBOL(do_general_protection);
 
-dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage struct pt_regs * notrace do_kernel_int3(struct pt_regs *regs)
 {
 #ifdef CONFIG_DYNAMIC_FTRACE
        /*
@@ -578,11 +578,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, 
long error_code)
         * See note by declaration of modifying_ftrace_code in ftrace.c
         */
        if (unlikely(atomic_read(&modifying_ftrace_code)) &&
-           ftrace_int3_handler(regs))
-               return;
+           ftrace_int3_handler(&regs))
+               return regs;
 #endif
        if (poke_int3_handler(regs))
-               return;
+               return regs;
 
        /*
         * Use ist_enter despite the fact that we don't use an IST stack.
@@ -594,7 +594,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, 
long error_code)
        ist_enter(regs);
        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
-       if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+       if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
                                SIGTRAP) == NOTIFY_STOP)
                goto exit;
 #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
@@ -604,16 +604,26 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, 
long error_code)
                goto exit;
 #endif
 
-       if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+       if (notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
                        SIGTRAP) == NOTIFY_STOP)
                goto exit;
 
        cond_local_irq_enable(regs);
-       do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL);
+       do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL);
        cond_local_irq_disable(regs);
 
 exit:
        ist_exit(regs);
+       return regs;
+}
+NOKPROBE_SYMBOL(do_kernel_int3);
+
+dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+{
+       struct pt_regs *ret_regs;
+
+       ret_regs = do_kernel_int3(regs);
+       WARN_ON_ONCE(ret_regs != regs);
 }
 NOKPROBE_SYMBOL(do_int3);
 

Reply via email to