From: Joel Fernandes <[email protected]>

Move NMI nesting tracking out of the preempt_count bits and into a separate
per-CPU counter (nmi_nesting). This frees three of the four NMI bits in
preempt_count so that they can be repurposed for other uses.

Reduce NMI_BITS from 4 to 1; the remaining bit is only a flag indicating that
we are in NMI context. Nesting depth is still capped at 15, now enforced by a
BUG_ON() on the per-CPU counter in __nmi_enter().

[boqun: Solve Steven Rostedt's comment on the BUG_ON() condition]

Suggested-by: Boqun Feng <[email protected]>
Signed-off-by: Joel Fernandes <[email protected]>
Signed-off-by: Lyude Paul <[email protected]>
Signed-off-by: Boqun Feng <[email protected]>
Link: https://patch.msgid.link/[email protected]
---
 include/linux/hardirq.h                        | 17 +++++++++++++----
 include/linux/preempt.h                        |  9 +++++++--
 kernel/softirq.c                               |  2 ++
 tools/testing/selftests/bpf/bpf_experimental.h |  2 +-
 4 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d57cab4d4c06..1a0360a1000f 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -10,6 +10,8 @@
 #include <linux/vtime.h>
 #include <asm/hardirq.h>
 
+DECLARE_PER_CPU(unsigned int, nmi_nesting);
+
 extern void synchronize_irq(unsigned int irq);
 extern bool synchronize_hardirq(unsigned int irq);
 
@@ -102,14 +104,17 @@ void irq_exit_rcu(void);
  */
 
 /*
- * nmi_enter() can nest up to 15 times; see NMI_BITS.
+ * nmi_enter() can nest up to 15 times, counted in per-CPU nmi_nesting.
  */
 #define __nmi_enter()                                          \
        do {                                                    \
                lockdep_off();                                  \
                arch_nmi_enter();                               \
-               BUG_ON(in_nmi() == NMI_MASK);                   \
-               __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);       \
+               /* Maximum NMI nesting is 15. */                \
+               BUG_ON(__this_cpu_read(nmi_nesting) >= 15);     \
+               __this_cpu_inc(nmi_nesting);                    \
+               __preempt_count_add(HARDIRQ_OFFSET);            \
+               preempt_count_set(preempt_count() | NMI_MASK);  \
        } while (0)
 
 #define nmi_enter()                                            \
@@ -124,8 +129,12 @@ void irq_exit_rcu(void);
 
 #define __nmi_exit()                                           \
        do {                                                    \
+               unsigned int nesting;                           \
                BUG_ON(!in_nmi());                              \
-               __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);       \
+               __preempt_count_sub(HARDIRQ_OFFSET);            \
+               nesting = __this_cpu_dec_return(nmi_nesting);   \
+               if (!nesting)                                   \
+                       __preempt_count_sub(NMI_OFFSET);        \
                arch_nmi_exit();                                \
                lockdep_on();                                   \
        } while (0)
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index d964f965c8ff..586f96688325 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -17,6 +17,8 @@
  *
  * - bits 0-7 are the preemption count (max preemption depth: 256)
  * - bits 8-15 are the softirq count (max # of softirqs: 256)
+ * - bits 16-19 are the hardirq count (max # of hardirqs: 16)
+ * - bit 20 is the NMI flag (no nesting count, tracked separately)
  *
  * The hardirq count could in theory be the same as the number of
  * interrupts in the system, but we run all interrupt handlers with
@@ -24,16 +26,19 @@
  * there are a few palaeontologic drivers which reenable interrupts in
  * the handler, so we need more than one bit here.
  *
+ * NMI nesting depth is tracked in a separate per-CPU variable
+ * (nmi_nesting) to save bits in preempt_count.
+ *
  *         PREEMPT_MASK:       0x000000ff
  *         SOFTIRQ_MASK:       0x0000ff00
  *         HARDIRQ_MASK:       0x000f0000
- *             NMI_MASK:       0x00f00000
+ *             NMI_MASK:       0x00100000
  * PREEMPT_NEED_RESCHED:       0x80000000
  */
 #define PREEMPT_BITS   8
 #define SOFTIRQ_BITS   8
 #define HARDIRQ_BITS   4
-#define NMI_BITS       4
+#define NMI_BITS       1
 
 #define PREEMPT_SHIFT  0
 #define SOFTIRQ_SHIFT  (PREEMPT_SHIFT + PREEMPT_BITS)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4425d8dce44b..10af5ed859e7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -88,6 +88,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
 EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
 #endif
 
+DEFINE_PER_CPU(unsigned int, nmi_nesting);
+
 /*
  * SOFTIRQ_OFFSET usage:
  *
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 2234bd6bc9d3..2d4256ff471f 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -449,7 +449,7 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str,
 #define PREEMPT_BITS   8
 #define SOFTIRQ_BITS   8
 #define HARDIRQ_BITS   4
-#define NMI_BITS       4
+#define NMI_BITS       1
 
 #define PREEMPT_SHIFT  0
 #define SOFTIRQ_SHIFT  (PREEMPT_SHIFT + PREEMPT_BITS)
-- 
2.50.1 (Apple Git-155)


Reply via email to