Threads running on nohz_full CPUs are not considered by
synchronize_sched(), but they should be covered by a membarrier system
call with the MEMBARRIER_CMD_SHARED command.
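
As a rough user-space sketch (not part of this patch; fast_path(),
slow_path() and the membarrier() wrapper around syscall(2) are
hypothetical), this is the pairing the command is meant to provide: the
frequently-executed side uses only a compiler barrier, and the other
side replaces its matching full memory barrier with the system call:

  #include <linux/membarrier.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #define barrier()  __asm__ __volatile__("" ::: "memory")

  static int membarrier(int cmd, int flags)
  {
          return syscall(__NR_membarrier, cmd, flags);
  }

  /* Fast path, e.g. a worker thread running on a nohz_full CPU: */
  static void fast_path(void)
  {
          /* ... access shared data ... */
          barrier();              /* compiler barrier only */
          /* ... access shared data ... */
  }

  /* Slow path, any thread of any process: */
  static void slow_path(void)
  {
          /* ... access shared data ... */
          /* Pairs with barrier() above; orders memory on all running threads. */
          membarrier(MEMBARRIER_CMD_SHARED, 0);
          /* ... access shared data ... */
  }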

Introduce two new commands to membarrier:
MEMBARRIER_CMD_REGISTER_EXPEDITED and
MEMBARRIER_CMD_UNREGISTER_EXPEDITED.

Nohz_full threads that need to receive interrupts, so that pairing
compiler barriers with the membarrier system call still provides the
intended memory ordering, should register as "expedited" threads.
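
As a further hedged sketch (again not part of this patch;
nohz_full_worker() is a hypothetical function and the membarrier()
wrapper from the previous example is reused), such a thread would
bracket its fast-path work with the new commands:

  static void nohz_full_worker(void)
  {
          /* Ask to be IPI'd by other threads' MEMBARRIER_CMD_SHARED. */
          if (membarrier(MEMBARRIER_CMD_REGISTER_EXPEDITED, 0))
                  return;         /* e.g. kernel without this command */

          /* ... fast path relying on barrier() rather than smp_mb() ... */

          membarrier(MEMBARRIER_CMD_UNREGISTER_EXPEDITED, 0);
  }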

[ This RFC patch lacks documentation. I mainly want feedback to see if
  everyone is OK with the general approach. ]

Signed-off-by: Mathieu Desnoyers <mathieu.desnoy...@efficios.com>
Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
Cc: Josh Triplett <j...@joshtriplett.org>
Cc: KOSAKI Motohiro <kosaki.motoh...@jp.fujitsu.com>
Cc: Steven Rostedt <rost...@goodmis.org>
Cc: Nicholas Miell <nmi...@comcast.net>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Alan Cox <gno...@lxorguk.ukuu.org.uk>
Cc: Lai Jiangshan <la...@cn.fujitsu.com>
Cc: Stephen Hemminger <step...@networkplumber.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: David Howells <dhowe...@redhat.com>
Cc: Pranith Kumar <bobby.pr...@gmail.com>
Cc: Michael Kerrisk <mtk.manpa...@gmail.com>
Cc: Shuah Khan <shua...@osg.samsung.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
---
 fs/exec.c                       |  1 +
 include/linux/sched.h           | 27 +++++++++++++++
 include/uapi/linux/membarrier.h |  6 ++++
 kernel/fork.c                   |  2 ++
 kernel/membarrier.c             | 77 +++++++++++++++++++++++++++++++++++++++--
 5 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index e579466..2cf1f87 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1771,6 +1771,7 @@ static int do_execveat_common(int fd, struct filename *filename,
        /* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
+       membarrier_execve(current);
        acct_update_integrals(current);
        task_numa_free(current);
        free_bprm(bprm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad3ec9e..1242eb9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1998,6 +1998,9 @@ struct task_struct {
        /* A live task holds one reference. */
        atomic_t stack_refcount;
 #endif
+#ifdef CONFIG_MEMBARRIER
+       unsigned int membarrier_expedited;
+#endif
 /* CPU-specific state of this task */
        struct thread_struct thread;
 /*
@@ -3671,4 +3674,28 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
 void cpufreq_remove_update_util_hook(int cpu);
 #endif /* CONFIG_CPU_FREQ */
 
+#ifdef CONFIG_MEMBARRIER
+static inline void membarrier_fork(struct task_struct *t,
+               unsigned long clone_flags)
+{
+       if (clone_flags & CLONE_THREAD)
+               t->membarrier_expedited = 0;
+       else
+               t->membarrier_expedited = current->membarrier_expedited;
+}
+
+static inline void membarrier_execve(struct task_struct *t)
+{
+       t->membarrier_expedited = 0;
+}
+#else
+static inline void membarrier_fork(struct task_struct *t,
+               unsigned long clone_flags)
+{
+}
+static inline void membarrier_execve(struct task_struct *t)
+{
+}
+#endif
+
 #endif
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108b..4b78f07 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,6 +40,10 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_REGISTER_EXPEDITED:
+ *                          Register the calling thread as "expedited".
+ * @MEMBARRIER_CMD_UNREGISTER_EXPEDITED:
+ *                          Unregister the calling thread as "expedited".
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
@@ -48,6 +52,8 @@
 enum membarrier_cmd {
        MEMBARRIER_CMD_QUERY = 0,
        MEMBARRIER_CMD_SHARED = (1 << 0),
+       MEMBARRIER_CMD_REGISTER_EXPEDITED = (1 << 1),
+       MEMBARRIER_CMD_UNREGISTER_EXPEDITED = (1 << 2),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8a..cec23e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1769,6 +1769,8 @@ static __latent_entropy struct task_struct *copy_process(
         */
        copy_seccomp(p);
 
+       membarrier_fork(p, clone_flags);
+
        /*
         * Process group and session signals need to be delivered to just the
         * parent before the fork or both the parent and the child after the
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 536c727..65a6fbf 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -16,12 +16,79 @@
 
 #include <linux/syscalls.h>
 #include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+
+/*
+ * TODO: private sched.h is needed for runqueue. Should we move the
+ * sched code under kernel/sched/ ?
+ */
+#include "sched/sched.h"
 
 /*
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
  */
-#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
+#define MEMBARRIER_CMD_BITMASK \
+       (MEMBARRIER_CMD_SHARED \
+       | MEMBARRIER_CMD_REGISTER_EXPEDITED \
+       | MEMBARRIER_CMD_UNREGISTER_EXPEDITED)
+
+static int membarrier_register_expedited(struct task_struct *t)
+{
+       struct rq *rq;
+
+       if (t->membarrier_expedited == UINT_MAX)
+               return -EOVERFLOW;
+       rq = this_rq();
+       raw_spin_lock(&rq->lock);
+       t->membarrier_expedited++;
+       raw_spin_unlock(&rq->lock);
+       return 0;
+}
+
+static int membarrier_unregister_expedited(struct task_struct *t)
+{
+       struct rq *rq;
+
+       if (!t->membarrier_expedited)
+               return -ENOENT;
+       rq = this_rq();
+       raw_spin_lock(&rq->lock);
+       t->membarrier_expedited--;
+       raw_spin_unlock(&rq->lock);
+       return 0;
+}
+
+static void memory_barrier(void *info)
+{
+       smp_mb();
+}
+
+static void membarrier_nohz_full_expedited(void)
+{
+       int cpu;
+
+       if (!tick_nohz_full_enabled())
+               return;
+       for_each_cpu(cpu, tick_nohz_full_mask) {
+               struct rq *rq;
+               struct task_struct *t;
+
+               rq = cpu_rq(cpu);
+               raw_spin_lock(&rq->lock);
+               t = rq->curr;
+               if (t->membarrier_expedited) {
+                       int ret;
+
+                       ret = smp_call_function_single(cpu, memory_barrier,
+                                       NULL, 1);
+                       WARN_ON_ONCE(ret);
+               }
+               raw_spin_unlock(&rq->lock);
+       }
+}
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
@@ -57,9 +124,15 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
        case MEMBARRIER_CMD_QUERY:
                return MEMBARRIER_CMD_BITMASK;
        case MEMBARRIER_CMD_SHARED:
-               if (num_online_cpus() > 1)
+               if (num_online_cpus() > 1) {
                        synchronize_sched();
+                       membarrier_nohz_full_expedited();
+               }
                return 0;
+       case MEMBARRIER_CMD_REGISTER_EXPEDITED:
+               return membarrier_register_expedited(current);
+       case MEMBARRIER_CMD_UNREGISTER_EXPEDITED:
+               return membarrier_unregister_expedited(current);
        default:
                return -EINVAL;
        }
-- 
2.1.4
