From: Rik van Riel <r...@redhat.com>

On syscall entry with nohz_full on, we enable interrupts, call user_exit,
disable interrupts, do the syscall entry work, re-enable interrupts, and
go on our merry way.

Profiling shows that a large amount of the nohz_full overhead comes
from the extraneous disabling and re-enabling of interrupts. Andy
suggested simply not enabling interrupts until after the context
tracking code has done its thing, which allows us to skip a whole
interrupt disable & re-enable cycle.

This patch builds on top of these patches by Paolo:
https://lkml.org/lkml/2015/4/28/188
https://lkml.org/lkml/2015/4/29/139

Together with the patch I posted earlier this week, the syscall path
on a nohz_full cpu seems to be about 10% faster.
https://lkml.org/lkml/2015/4/24/394

My test is a simple microbenchmark that calls getpriority() in a loop
10 million times:

                run time        system time
vanilla         5.49s           2.08s
__acct patch    5.21s           1.92s
both patches    4.88s           1.71s

Cc: Frederic Weisbecker <fweis...@redhat.com>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Heiko Carstens <heiko.carst...@de.ibm.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Suggested-by: Andy Lutomirski <aml...@amacapital.net>
Signed-off-by: Rik van Riel <r...@redhat.com>
---
 arch/x86/kernel/entry_32.S       |  4 ++--
 arch/x86/kernel/entry_64.S       |  4 ++--
 arch/x86/kernel/ptrace.c         |  6 +++++-
 include/linux/context_tracking.h | 11 +++++++++++
 4 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 1c309763e321..0bdf8c7057e4 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -406,7 +406,6 @@ ENTRY(ia32_sysenter_target)
 
        pushl_cfi %eax
        SAVE_ALL
-       ENABLE_INTERRUPTS(CLBR_NONE)
 
 /*
  * Load the potential sixth argument from user stack.
@@ -424,6 +423,7 @@ ENTRY(ia32_sysenter_target)
 
        testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
        jnz sysenter_audit
+       ENABLE_INTERRUPTS(CLBR_NONE)
 sysenter_do_call:
        cmpl $(NR_syscalls), %eax
        jae sysenter_badsys
@@ -647,7 +647,7 @@ END(work_pending)
 syscall_trace_entry:
        movl $-ENOSYS,PT_EAX(%esp)
        movl %esp, %eax
-       call syscall_trace_enter
+       call syscall_trace_enter        /* returns with irqs enabled */
        /* What it returned is what we'll actually use.  */
        cmpl $(NR_syscalls), %eax
        jnae syscall_call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 02c2eff7478d..f7751da7b53e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -228,7 +228,6 @@ GLOBAL(system_call_after_swapgs)
         * task preemption. We must enable interrupts only after we're done
         * with using rsp_scratch:
         */
-       ENABLE_INTERRUPTS(CLBR_NONE)
        pushq_cfi       %r11                    /* pt_regs->flags */
        pushq_cfi       $__USER_CS              /* pt_regs->cs */
        pushq_cfi       %rcx                    /* pt_regs->ip */
@@ -248,6 +247,7 @@ GLOBAL(system_call_after_swapgs)
 
        testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, 
SIZEOF_PTREGS)
        jnz tracesys
+       ENABLE_INTERRUPTS(CLBR_NONE)
 system_call_fastpath:
 #if __SYSCALL_MASK == ~0
        cmpq $__NR_syscall_max,%rax
@@ -313,7 +313,7 @@ GLOBAL(system_call_after_swapgs)
 tracesys:
        movq %rsp, %rdi
        movl $AUDIT_ARCH_X86_64, %esi
-       call syscall_trace_enter_phase1
+       call syscall_trace_enter_phase1 /* returns with interrupts enabled */
        test %rax, %rax
        jnz tracesys_phase2             /* if needed, run the slow path */
        RESTORE_C_REGS_EXCEPT_RAX       /* else restore clobbered regs */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a7bc79480719..066c86d0b68c 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1456,6 +1456,8 @@ static void do_audit_syscall_entry(struct pt_regs *regs, 
u32 arch)
  *
  * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
  * are fully functional.
+ * Called with IRQs disabled, to be enabled after the context tracking
+ * code has run.
  *
  * For phase 2's benefit, our return value is:
  * 0:                  resume the syscall
@@ -1477,10 +1479,12 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs 
*regs, u32 arch)
         * doing anything that could touch RCU.
         */
        if (work & _TIF_NOHZ) {
-               user_exit();
+               user_exit_irqsoff();
                work &= ~_TIF_NOHZ;
        }
 
+       local_irq_enable();
+
 #ifdef CONFIG_SECCOMP
        /*
         * Do seccomp first -- it should minimize exposure of other
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 5d3719aed958..dc3b169b2b70 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -25,12 +25,23 @@ static inline void user_enter(void)
                context_tracking_enter(CONTEXT_USER);
 
 }
+
 static inline void user_exit(void)
 {
        if (context_tracking_is_enabled())
                context_tracking_exit(CONTEXT_USER);
 }
 
+/* Called with IRQs already disabled. */
+static inline void user_exit_irqsoff(void)
+{
+       if (in_interrupt())
+               return;
+
+       if (context_tracking_is_enabled())
+               __context_tracking_exit(CONTEXT_USER);
+}
+
 static inline enum ctx_state exception_enter(void)
 {
        enum ctx_state prev_ctx;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to