fpu->last_cpu records the last CPU that a given FPU context structure was
used on. This enables an important optimization: if a task schedules out and
gets scheduled back in after only kernel threads that don't use the FPU have
run, the FPU state in the registers is still intact and the FPU restore can
be skipped - speeding up the context switch.
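
For reference, the current validity check keyed on ->last_cpu looks roughly
like this (simplified from the fpregs_state_valid() hunk further below):

    static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
    {
            /* Valid only if this context owns the registers AND they were saved on this CPU: */
            return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) &&
                   cpu == fpu->last_cpu;
    }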

The same logic can be implemented in a slightly simpler way, using a single
boolean flag: fpu->fpregs_cached tells us whether the context's FPU registers
are currently cached in the CPU.
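
With the flag, the CPU number drops out of the check entirely; as in the
patch below, the test becomes:

    static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
    {
            /* Valid if this context owns the registers AND they are still cached: */
            return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) &&
                   fpu->fpregs_cached;
    }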

The only difference is that this flag has to be invalidated when a task is
migrated away from its CPU - but migration is a slow path compared to
context switching.
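
The invalidation itself is a one-liner in a new arch callback, invoked from
set_task_cpu() in the scheduler - see the switch_to.h and sched.h hunks below:

    /* The task-migration arch callback clears the FPU registers cache: */
    static inline void arch_task_migrate(struct task_struct *p)
    {
            p->thread.fpu.fpregs_cached = 0;
    }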

Cc: Andy Lutomirski <l...@kernel.org>
Cc: Borislav Petkov <b...@alien8.de>
Cc: Dave Hansen <dave.han...@linux.intel.com>
Cc: Fenghua Yu <fenghua...@intel.com>
Cc: H. Peter Anvin <h...@zytor.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Oleg Nesterov <o...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Rik van Riel <r...@redhat.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Yu-cheng Yu <yu-cheng...@intel.com>
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 arch/x86/include/asm/fpu/internal.h | 15 ++++++++-------
 arch/x86/include/asm/fpu/types.h    | 24 ++++++++++--------------
 arch/x86/include/asm/switch_to.h    | 10 ++++++++++
 arch/x86/kernel/fpu/core.c          |  2 +-
 kernel/sched/core.c                 |  2 ++
 kernel/sched/sched.h                |  8 ++++++++
 6 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 255645f60ca2..2eaf93cf11cc 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -490,7 +490,7 @@ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
 
 /*
  * The in-register FPU state for an FPU context on a CPU is assumed to be
- * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
+ * valid if fpu->fpregs_cached is still set, and if the fpu_fpregs_owner_ctx
  * matches the FPU.
  *
  * If the FPU register state is valid, the kernel can skip restoring the
@@ -512,12 +512,12 @@ static inline void __cpu_invalidate_fpregs_state(void)
 
 static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
 {
-       fpu->last_cpu = -1;
+       fpu->fpregs_cached = 0;
 }
 
 static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
 {
-       return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+       return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && fpu->fpregs_cached;
 }
 
 /*
@@ -573,15 +573,16 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 {
        if (old_fpu->fpregs_active) {
                if (!copy_fpregs_to_fpstate(old_fpu))
-                       old_fpu->last_cpu = -1;
+                       old_fpu->fpregs_cached = 0;
                else
-                       old_fpu->last_cpu = cpu;
+                       old_fpu->fpregs_cached = 1;
 
                /* But leave fpu_fpregs_owner_ctx! */
                old_fpu->fpregs_active = 0;
                trace_x86_fpu_regs_deactivated(old_fpu);
-       } else
-               old_fpu->last_cpu = -1;
+       } else {
+               old_fpu->fpregs_cached = 0;
+       }
 }
 
 /*
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 3c80f5b9c09d..3090b0d7b232 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -276,20 +276,6 @@ union fpregs_state {
  */
 struct fpu {
        /*
-        * @last_cpu:
-        *
-        * Records the last CPU on which this context was loaded into
-        * FPU registers. (In the lazy-restore case we might be
-        * able to reuse FPU registers across multiple context switches
-        * this way, if no intermediate task used the FPU.)
-        *
-        * A value of -1 is used to indicate that the FPU state in context
-        * memory is newer than the FPU state in registers, and that the
-        * FPU state should be reloaded next time the task is run.
-        */
-       unsigned int                    last_cpu;
-
-       /*
         * @fpstate_active:
         *
         * This flag indicates whether this context is active: if the task
@@ -322,6 +308,16 @@ struct fpu {
        unsigned char                   fpregs_active;
 
        /*
+        * @fpregs_cached:
+        *
+        * This flag tells us whether this context's FPU registers are
+        * currently cached in this CPU's registers.
+        *
+        * It is cleared when the task is migrated to another CPU.
+        */
+       unsigned char                   fpregs_cached;
+
+       /*
         * @state:
         *
         * In-memory copy of all FPU registers that we save/restore
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index fcc5cd387fd1..a7146dadb31d 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -72,4 +72,14 @@ do {                                                          \
        ((last) = __switch_to_asm((prev), (next)));                     \
 } while (0)
 
+
+/*
+ * The task-migration arch callback clears the FPU registers cache:
+ */
+static inline void arch_task_migrate(struct task_struct *p)
+{
+       p->thread.fpu.fpregs_cached = 0;
+}
+#define arch_task_migrate arch_task_migrate
+
 #endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index e1114f070c2d..287f1cb32b59 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -190,7 +190,7 @@ EXPORT_SYMBOL_GPL(fpstate_init);
 int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 {
        dst_fpu->fpregs_active = 0;
-       dst_fpu->last_cpu = -1;
+       dst_fpu->fpregs_cached = 0;
 
        if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU))
                return 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c56fb57f2991..7eb2f3041fde 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1253,6 +1253,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                        p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
+
+               arch_task_migrate(p);
        }
 
        __set_task_cpu(p, new_cpu);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7b34c7826ca5..ff8a894132e4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1824,3 +1824,11 @@ static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
 #else /* arch_scale_freq_capacity */
 #define arch_scale_freq_invariant()    (false)
 #endif
+
+/*
+ * Default task-migration arch callback:
+ */
+#ifndef arch_task_migrate
+static inline void arch_task_migrate(struct task_struct *p) { }
+#endif
+
-- 
2.7.4
