Threads running on nohz_full CPUs are not taken into account by synchronize_sched(), but they should still be covered by a membarrier system call issued with the MEMBARRIER_CMD_SHARED command.
Introduce two new membarrier commands: MEMBARRIER_CMD_REGISTER_EXPEDITED and MEMBARRIER_CMD_UNREGISTER_EXPEDITED. Threads running on nohz_full CPUs that need to receive interrupts in order to ensure correct memory ordering, when pairing compiler barriers with the membarrier system call, should register as "expedited" threads. [ This RFC patch lacks documentation. I mainly want feedback to see if everyone is OK with the general approach. ] Signed-off-by: Mathieu Desnoyers <mathieu.desnoy...@efficios.com> Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com> Cc: Josh Triplett <j...@joshtriplett.org> Cc: KOSAKI Motohiro <kosaki.motoh...@jp.fujitsu.com> Cc: Steven Rostedt <rost...@goodmis.org> Cc: Nicholas Miell <nmi...@comcast.net> Cc: Ingo Molnar <mi...@redhat.com> Cc: Alan Cox <gno...@lxorguk.ukuu.org.uk> Cc: Lai Jiangshan <la...@cn.fujitsu.com> Cc: Stephen Hemminger <step...@networkplumber.org> Cc: Thomas Gleixner <t...@linutronix.de> Cc: Peter Zijlstra <pet...@infradead.org> Cc: David Howells <dhowe...@redhat.com> Cc: Pranith Kumar <bobby.pr...@gmail.com> Cc: Michael Kerrisk <mtk.manpa...@gmail.com> Cc: Shuah Khan <shua...@osg.samsung.com> Cc: Andrew Morton <a...@linux-foundation.org> Cc: Linus Torvalds <torva...@linux-foundation.org> --- fs/exec.c | 1 + include/linux/sched.h | 27 +++++++++++++++ include/uapi/linux/membarrier.h | 6 ++++ kernel/fork.c | 2 ++ kernel/membarrier.c | 77 +++++++++++++++++++++++++++++++++++++++-- 5 files changed, 111 insertions(+), 2 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index e579466..2cf1f87 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1771,6 +1771,7 @@ static int do_execveat_common(int fd, struct filename *filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; + membarrier_execve(current); acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); diff --git a/include/linux/sched.h b/include/linux/sched.h index ad3ec9e..1242eb9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1998,6 +1998,9 @@ struct 
task_struct { /* A live task holds one reference. */ atomic_t stack_refcount; #endif +#ifdef CONFIG_MEMBARRIER + unsigned int membarrier_expedited; +#endif /* CPU-specific state of this task */ struct thread_struct thread; /* @@ -3671,4 +3674,28 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, void cpufreq_remove_update_util_hook(int cpu); #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_MEMBARRIER +static inline void membarrier_fork(struct task_struct *t, + unsigned long clone_flags) +{ + if (clone_flags & CLONE_THREAD) + t->membarrier_expedited = 0; + else + t->membarrier_expedited = current->membarrier_expedited; +} + +static inline void membarrier_execve(struct task_struct *t) +{ + t->membarrier_expedited = 0; +} +#else +static inline void membarrier_fork(struct task_struct *t, + unsigned long clone_flags) +{ +} +static inline void membarrier_execve(struct task_struct *t) +{ +} +#endif + #endif diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h index e0b108b..4b78f07 100644 --- a/include/uapi/linux/membarrier.h +++ b/include/uapi/linux/membarrier.h @@ -40,6 +40,10 @@ * (non-running threads are de facto in such a * state). This covers threads from all processes * running on the system. This command returns 0. + * @MEMBARRIER_CMD_REGISTER_EXPEDITED: + * TODO + * @MEMBARRIER_CMD_UNREGISTER_EXPEDITED: + * TODO * * Command to be passed to the membarrier system call. 
The commands need to * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to @@ -48,6 +52,8 @@ enum membarrier_cmd { MEMBARRIER_CMD_QUERY = 0, MEMBARRIER_CMD_SHARED = (1 << 0), + MEMBARRIER_CMD_REGISTER_EXPEDITED = (1 << 1), + MEMBARRIER_CMD_UNREGISTER_EXPEDITED = (1 << 2), }; #endif /* _UAPI_LINUX_MEMBARRIER_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8a..cec23e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1769,6 +1769,8 @@ static __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + membarrier_fork(p, clone_flags); + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the diff --git a/kernel/membarrier.c b/kernel/membarrier.c index 536c727..65a6fbf 100644 --- a/kernel/membarrier.c +++ b/kernel/membarrier.c @@ -16,12 +16,79 @@ #include <linux/syscalls.h> #include <linux/membarrier.h> +#include <linux/tick.h> +#include <linux/smp.h> +#include <linux/sched.h> + +/* + * TODO: private sched.h is needed for runqueue. Should we move the + * sched code under kernel/sched/ ? + */ +#include "sched/sched.h" /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, * except MEMBARRIER_CMD_QUERY. 
*/ -#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_SHARED \ + | MEMBARRIER_CMD_REGISTER_EXPEDITED \ + | MEMBARRIER_CMD_UNREGISTER_EXPEDITED) + +static int membarrier_register_expedited(struct task_struct *t) +{ + struct rq *rq; + + if (t->membarrier_expedited == UINT_MAX) + return -EOVERFLOW; + rq = this_rq(); + raw_spin_lock(&rq->lock); + t->membarrier_expedited++; + raw_spin_unlock(&rq->lock); + return 0; +} + +static int membarrier_unregister_expedited(struct task_struct *t) +{ + struct rq *rq; + + if (!t->membarrier_expedited) + return -ENOENT; + rq = this_rq(); + raw_spin_lock(&rq->lock); + t->membarrier_expedited--; + raw_spin_unlock(&rq->lock); + return 0; +} + +static void memory_barrier(void *info) +{ + smp_mb(); +} + +static void membarrier_nohz_full_expedited(void) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + for_each_cpu(cpu, tick_nohz_full_mask) { + struct rq *rq; + struct task_struct *t; + + rq = cpu_rq(cpu); + raw_spin_lock(&rq->lock); + t = rq->curr; + if (t->membarrier_expedited) { + int ret; + + ret = smp_call_function_single(cpu, memory_barrier, + NULL, 1); + WARN_ON_ONCE(ret); + } + raw_spin_unlock(&rq->lock); + } +} /** * sys_membarrier - issue memory barriers on a set of threads @@ -57,9 +124,15 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) case MEMBARRIER_CMD_QUERY: return MEMBARRIER_CMD_BITMASK; case MEMBARRIER_CMD_SHARED: - if (num_online_cpus() > 1) + if (num_online_cpus() > 1) { synchronize_sched(); + membarrier_nohz_full_expedited(); + } return 0; + case MEMBARRIER_CMD_REGISTER_EXPEDITED: + return membarrier_register_expedited(current); + case MEMBARRIER_CMD_UNREGISTER_EXPEDITED: + return membarrier_unregister_expedited(current); default: return -EINVAL; } -- 2.1.4