Hi,

new thread since someone forgot to CC scheduler maintainers on actual
scheduler patches and I can't be arsed to look up the original thread.

The patch below boots as far as wanting to mount a root filesystem with
CONFIG_PREEMPT=y using kvm -smp 4.

I suppose we might want to move TIF_NEED_RESCHED into the preempt_count
just as we might want to move PREEMPT_ACTIVE out of it.

Adding TIF_NEED_RESCHED into the preempt count would allow a single test
in preempt_check_resched() instead of still needing the TI (thread_info).
Removing PREEMPT_ACTIVE from the preempt count should allow us to get rid
of ti::preempt_count altogether.

The only problem with TIF_NEED_RESCHED is that it's cross-CPU, which would
make the entire thing atomic, which would suck donkey balls, so maybe we
need two separate per-cpu variables?

---
 arch/x86/kernel/entry_64.S |  2 +-
 include/linux/preempt.h    |  9 ++++++---
 kernel/context_tracking.c  |  3 +--
 kernel/sched/core.c        | 20 +++++++++++++++-----
 lib/smp_processor_id.c     |  3 +--
 5 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1b69951..5ea77d2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1118,7 +1118,7 @@ ENTRY(native_iret)
        /* Returning to kernel space. Check if we need preemption */
        /* rcx:  threadinfo. interrupts off. */
 ENTRY(retint_kernel)
-       cmpl $0,TI_preempt_count(%rcx)
+       cmpl $0,PER_CPU_VAR(__preempt_count_var)
        jnz  retint_restore_args
        bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
        jnc  retint_restore_args
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index f5d4723..2ca9c8ff 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -6,7 +6,7 @@
  * preempt_count (used for kernel preemption, interrupt count, etc.)
  */
 
-#include <linux/thread_info.h>
+#include <asm/percpu.h>
 #include <linux/linkage.h>
 #include <linux/list.h>
 
@@ -21,7 +21,9 @@
 #define inc_preempt_count() add_preempt_count(1)
 #define dec_preempt_count() sub_preempt_count(1)
 
-#define preempt_count()        (current_thread_info()->preempt_count)
+DECLARE_PER_CPU(int, __preempt_count_var);
+
+#define preempt_count() __raw_get_cpu_var(__preempt_count_var)
 
 #ifdef CONFIG_PREEMPT
 
@@ -29,7 +31,8 @@ asmlinkage void preempt_schedule(void);
 
 #define preempt_check_resched() \
 do { \
-       if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+       if (unlikely(preempt_count() == 0 && \
+                    test_thread_flag(TIF_NEED_RESCHED))) \
                preempt_schedule(); \
 } while (0)
 
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f823..6d113d8 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -87,10 +87,9 @@ void user_enter(void)
  */
 void __sched notrace preempt_schedule_context(void)
 {
-       struct thread_info *ti = current_thread_info();
        enum ctx_state prev_ctx;
 
-       if (likely(ti->preempt_count || irqs_disabled()))
+       if (likely(preempt_count() || irqs_disabled()))
                return;
 
        /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 54957a6..59d0b6e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -89,6 +89,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
+DEFINE_PER_CPU(int, __preempt_count_var) = INIT_PREEMPT_COUNT;
+
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
        unsigned long delta;
@@ -2013,6 +2015,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 #endif
 
+#ifdef CONFIG_PREEMPT_COUNT
+       /*
+        * If it weren't for PREEMPT_ACTIVE we could guarantee that the
+        * preempt_count() of all tasks was equal here and this wouldn't be
+        * needed at all -- try and move PREEMPT_ACTIVE into TI_flags?
+        */
+       task_thread_info(prev)->preempt_count = preempt_count();
+       preempt_count() = task_thread_info(next)->preempt_count;
+#endif
+
        context_tracking_task_switch(prev, next);
        /* Here we just switch the register state and the stack. */
        switch_to(prev, next, prev);
@@ -2515,13 +2527,11 @@ void __sched schedule_preempt_disabled(void)
  */
 asmlinkage void __sched notrace preempt_schedule(void)
 {
-       struct thread_info *ti = current_thread_info();
-
        /*
         * If there is a non-zero preempt_count or interrupts are disabled,
         * we do not want to preempt the current task. Just return..
         */
-       if (likely(ti->preempt_count || irqs_disabled()))
+       if (likely(preempt_count() || irqs_disabled()))
                return;
 
        do {
@@ -2546,11 +2556,10 @@ EXPORT_SYMBOL(preempt_schedule);
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
-       struct thread_info *ti = current_thread_info();
        enum ctx_state prev_state;
 
        /* Catch callers which need to be fixed */
-       BUG_ON(ti->preempt_count || !irqs_disabled());
+       BUG_ON(preempt_count() || !irqs_disabled());
 
        prev_state = exception_enter();
 
@@ -4218,6 +4227,7 @@ void init_idle(struct task_struct *idle, int cpu)
 
        /* Set the preempt count _outside_ the spinlocks! */
        task_thread_info(idle)->preempt_count = 0;
+       per_cpu(__preempt_count_var, cpu) = 0;
 
        /*
         * The idle tasks have their own, simple scheduling class:
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 4c0d0e5..04abe53 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -9,10 +9,9 @@
 
 notrace unsigned int debug_smp_processor_id(void)
 {
-       unsigned long preempt_count = preempt_count();
        int this_cpu = raw_smp_processor_id();
 
-       if (likely(preempt_count))
+       if (likely(preempt_count()))
                goto out;
 
        if (irqs_disabled())
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to