From: Vineeth Pillai <virem...@linux.microsoft.com>

There are use cases where kernel protection is not needed. One
example is using core scheduling for non-security purposes, such as
dynamically isolating a core for a particular process. Another is
testing or benchmarking the overhead of kernel protection.

Add a compile-time and a boot-time option to disable the feature.
CONFIG_SCHED_CORE_KERNEL_PROTECTION enables the feature at compile
time and defaults to y when CONFIG_SCHED_CORE=y. The
sched_core_kernel_protection= boot parameter controls it at boot time;
a value of 0 disables the feature.
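
For example, booting with:

    sched_core_kernel_protection=0

on the kernel command line disables kernel protection while core
scheduling itself stays enabled.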

Signed-off-by: Vineeth Pillai <virem...@linux.microsoft.com>
---
 .../admin-guide/kernel-parameters.txt         |  9 +++++
 include/linux/sched.h                         |  2 +-
 kernel/Kconfig.preempt                        | 13 +++++++
 kernel/sched/core.c                           | 39 ++++++++++++++++++-
 kernel/sched/sched.h                          |  2 +
 5 files changed, 63 insertions(+), 2 deletions(-)
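
Note (illustrative only, not part of the diff below): entry code is
expected to bracket kernel entry/exit with these calls. A minimal
sketch, where the hook names are hypothetical and the ti_check mask is
assumed to include the TIF_UNSAFE_RET flag used by this series:

	/* Hypothetical call sites; not functions added by this patch. */
	static void kernel_entry_hook(void)
	{
		/* Mark the core unsafe; siblings in user mode get paused. */
		sched_core_unsafe_enter();
	}

	static void exit_to_user_hook(unsigned long ti_check)
	{
		/*
		 * Drop out of the unsafe section and wait until no sibling
		 * is in the kernel, rechecking the ti_check flags meanwhile.
		 */
		sched_core_unsafe_exit_wait(ti_check);
	}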

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a1068742a6df..01e442388e4a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4611,6 +4611,15 @@
 
        sbni=           [NET] Granch SBNI12 leased line adapter
 
+       sched_core_kernel_protection=
+                       [SCHED_CORE, SCHED_CORE_KERNEL_PROTECTION] Pause SMT
+                       siblings of a core that are running in user mode if at
+                       least one sibling of the core is running in the kernel.
+                       This guarantees that kernel data is not leaked to tasks
+                       that are not trusted by the kernel.
+                       This feature is valid only when core scheduling is
+                       enabled (CONFIG_SCHED_CORE).
+
        sched_debug     [KNL] Enables verbose scheduler debug messages.
 
        schedstats=     [KNL,X86] Enable or disable scheduled statistics.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e04ffe689cb..4d9ae6b4dcc9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2055,7 +2055,7 @@ int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
-#ifdef CONFIG_SCHED_CORE
+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
 void sched_core_unsafe_enter(void);
 void sched_core_unsafe_exit(void);
 void sched_core_unsafe_exit_wait(unsigned long ti_check);
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 4488fbf4d3a8..52f86739f910 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -86,3 +86,16 @@ config SCHED_CORE
        default y
        depends on SCHED_SMT
 
+config SCHED_CORE_KERNEL_PROTECTION
+       bool "Kernel protection for Core Scheduling"
+       default y
+       depends on SCHED_CORE
+       help
+         This option enables pausing all SMT siblings of a core that are in
+         user mode when at least one sibling is running in the kernel. This
+         enforces security such that kernel information is not leaked to
+         untrusted tasks running on siblings. This option is valid only if
+         Core Scheduling (CONFIG_SCHED_CORE) is enabled.
+
+         If in doubt, select 'Y' when CONFIG_SCHED_CORE=y.
+
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0dc9172be04d..34238fd67f31 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -75,6 +75,24 @@ __read_mostly int scheduler_running;
 
 #ifdef CONFIG_SCHED_CORE
 
+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
+
+DEFINE_STATIC_KEY_TRUE(sched_core_kernel_protection);
+static int __init set_sched_core_kernel_protection(char *str)
+{
+       unsigned long val = 0;
+
+       if (!str)
+               return 0;
+
+       if (!kstrtoul(str, 0, &val) && !val)
+               static_branch_disable(&sched_core_kernel_protection);
+
+       return 1;
+}
+__setup("sched_core_kernel_protection=", set_sched_core_kernel_protection);
+#endif
+
 DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
 
 /* kernel prio, less is more */
@@ -4600,6 +4618,8 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
        return a->core_cookie == b->core_cookie;
 }
 
+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
+
 /*
  * Handler to attempt to enter kernel. It does nothing because the exit to
  * usermode or guest mode will do the actual work (of waiting if needed).
@@ -4609,6 +4629,11 @@ static void sched_core_irq_work(struct irq_work *work)
        return;
 }
 
+static inline void init_sched_core_irq_work(struct rq *rq)
+{
+       init_irq_work(&rq->core_irq_work, sched_core_irq_work);
+}
+
 /*
  * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
  * exits the core-wide unsafe state. Obviously the CPU calling this function
@@ -4684,6 +4709,9 @@ void sched_core_unsafe_enter(void)
        struct rq *rq;
        int i, cpu;
 
+       if (!static_branch_likely(&sched_core_kernel_protection))
+               return;
+
        /* Ensure that on return to user/guest, we check whether to wait. */
        if (current->core_cookie)
                set_tsk_thread_flag(current, TIF_UNSAFE_RET);
@@ -4769,6 +4797,9 @@ void sched_core_unsafe_exit(void)
        struct rq *rq;
        int cpu;
 
+       if (!static_branch_likely(&sched_core_kernel_protection))
+               return;
+
        local_irq_save(flags);
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
@@ -4807,9 +4838,15 @@ void sched_core_unsafe_exit(void)
 
 void sched_core_unsafe_exit_wait(unsigned long ti_check)
 {
+       if (!static_branch_likely(&sched_core_kernel_protection))
+               return;
+
        sched_core_unsafe_exit();
        sched_core_wait_till_safe(ti_check);
 }
+#else
+static inline void init_sched_core_irq_work(struct rq *rq) {}
+#endif /* CONFIG_SCHED_CORE_KERNEL_PROTECTION */
 
 // XXX fairness/fwd progress conditions
 /*
@@ -7795,7 +7832,7 @@ int sched_cpu_starting(unsigned int cpu)
                        rq = cpu_rq(i);
                        if (rq->core && rq->core == rq)
                                core_rq = rq;
-                       init_irq_work(&rq->core_irq_work, sched_core_irq_work);
+                       init_sched_core_irq_work(rq);
                }
 
                if (!core_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dbd8416ddaba..676818bdb9df 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1058,8 +1058,10 @@ struct rq {
        unsigned int            core_sched_seq;
        struct rb_root          core_tree;
        unsigned char           core_forceidle;
+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
        struct irq_work         core_irq_work; /* To force HT into kernel */
        unsigned int            core_this_unsafe_nest;
+#endif
 
        /* shared state */
        unsigned int            core_task_seq;
-- 
2.17.1