Excerpts from Andy Lutomirski's message of June 16, 2021 1:21 pm:
> The old sync_core_before_usermode() comments suggested that a 
> non-icache-syncing
> return-to-usermode instruction is x86-specific and that all other
> architectures automatically notice cross-modified code on return to
> userspace.
> 
> This is misleading.  The incantation needed to modify code from one
> CPU and execute it on another CPU is highly architecture dependent.
> On x86, according to the SDM, one must modify the code, issue SFENCE
> if the modification was WC or nontemporal, and then issue a "serializing
> instruction" on the CPU that will execute the code.  membarrier() can do
> the latter.
> 
> On arm64 and powerpc, one must flush the icache and then flush the pipeline
> on the target CPU, although the CPU manuals don't necessarily use this
> language.
> 
> So let's drop any pretense that we can have a generic way to define or
> implement membarrier's SYNC_CORE operation and instead require all
> architectures to define the helper and supply their own documentation as to
> how to use it.  This means x86, arm64, and powerpc for now.  Let's also
> rename the function from sync_core_before_usermode() to
> membarrier_sync_core_before_usermode() because the precise flushing details
> may very well be specific to membarrier, and even the concept of
> "sync_core" in the kernel is mostly an x86-ism.
> 
> (It may well be the case that, on real x86 processors, synchronizing the
>  icache (which requires no action at all) and "flushing the pipeline" is
>  sufficient, but trying to use this language would be confusing at best.
>  LFENCE does something awfully like "flushing the pipeline", but the SDM
>  does not permit LFENCE as an alternative to a "serializing instruction"
>  for this purpose.)
> 
> Cc: Michael Ellerman <m...@ellerman.id.au>
> Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
> Cc: Paul Mackerras <pau...@samba.org>
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Nicholas Piggin <npig...@gmail.com>
> Cc: Catalin Marinas <catalin.mari...@arm.com>
> Cc: Will Deacon <w...@kernel.org>
> Cc: linux-arm-ker...@lists.infradead.org
> Cc: Mathieu Desnoyers <mathieu.desnoy...@efficios.com>
> Cc: Nicholas Piggin <npig...@gmail.com>
> Cc: Peter Zijlstra <pet...@infradead.org>
> Cc: x...@kernel.org
> Cc: sta...@vger.kernel.org
> Fixes: 70216e18e519 ("membarrier: Provide core serializing command, 
> *_SYNC_CORE")
> Signed-off-by: Andy Lutomirski <l...@kernel.org>
> ---
>  .../membarrier-sync-core/arch-support.txt     | 68 ++++++-------------
>  arch/arm64/include/asm/sync_core.h            | 19 ++++++
>  arch/powerpc/include/asm/sync_core.h          | 14 ++++
>  arch/x86/Kconfig                              |  1 -
>  arch/x86/include/asm/sync_core.h              |  7 +-
>  arch/x86/kernel/alternative.c                 |  2 +-
>  arch/x86/kernel/cpu/mce/core.c                |  2 +-
>  arch/x86/mm/tlb.c                             |  3 +-
>  drivers/misc/sgi-gru/grufault.c               |  2 +-
>  drivers/misc/sgi-gru/gruhandles.c             |  2 +-
>  drivers/misc/sgi-gru/grukservices.c           |  2 +-
>  include/linux/sched/mm.h                      |  1 -
>  include/linux/sync_core.h                     | 21 ------
>  init/Kconfig                                  |  3 -
>  kernel/sched/membarrier.c                     | 15 ++--
>  15 files changed, 75 insertions(+), 87 deletions(-)
>  create mode 100644 arch/arm64/include/asm/sync_core.h
>  create mode 100644 arch/powerpc/include/asm/sync_core.h
>  delete mode 100644 include/linux/sync_core.h
> 
> diff --git 
> a/Documentation/features/sched/membarrier-sync-core/arch-support.txt 
> b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> index 883d33b265d6..41c9ebcb275f 100644
> --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
> @@ -5,51 +5,25 @@
>  #
>  # Architecture requirements
>  #
> -# * arm/arm64/powerpc
>  #
> -# Rely on implicit context synchronization as a result of exception return
> -# when returning from IPI handler, and when returning to user-space.
> -#
> -# * x86
> -#
> -# x86-32 uses IRET as return from interrupt, which takes care of the IPI.
> -# However, it uses both IRET and SYSEXIT to go back to user-space. The IRET
> -# instruction is core serializing, but not SYSEXIT.
> -#
> -# x86-64 uses IRET as return from interrupt, which takes care of the IPI.
> -# However, it can return to user-space through either SYSRETL (compat code),
> -# SYSRETQ, or IRET.
> -#
> -# Given that neither SYSRET{L,Q}, nor SYSEXIT, are core serializing, we rely
> -# instead on write_cr3() performed by switch_mm() to provide core 
> serialization
> -# after changing the current mm, and deal with the special case of kthread ->
> -# uthread (temporarily keeping current mm into active_mm) by issuing a
> -# sync_core_before_usermode() in that specific case.
> -#
> -    -----------------------
> -    |         arch |status|
> -    -----------------------
> -    |       alpha: | TODO |
> -    |         arc: | TODO |
> -    |         arm: |  ok  |
> -    |       arm64: |  ok  |
> -    |        csky: | TODO |
> -    |       h8300: | TODO |
> -    |     hexagon: | TODO |
> -    |        ia64: | TODO |
> -    |        m68k: | TODO |
> -    |  microblaze: | TODO |
> -    |        mips: | TODO |
> -    |       nds32: | TODO |
> -    |       nios2: | TODO |
> -    |    openrisc: | TODO |
> -    |      parisc: | TODO |
> -    |     powerpc: |  ok  |
> -    |       riscv: | TODO |
> -    |        s390: | TODO |
> -    |          sh: | TODO |
> -    |       sparc: | TODO |
> -    |          um: | TODO |
> -    |         x86: |  ok  |
> -    |      xtensa: | TODO |
> -    -----------------------
> +# An architecture that wants to support
> +# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE needs to define precisely what 
> it
> +# is supposed to do and implement membarrier_sync_core_before_usermode() to
> +# make it do that.  Then it can select ARCH_HAS_MEMBARRIER_SYNC_CORE via
> +# Kconfig.Unfortunately, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is not a
> +# fantastic API and may not make sense on all architectures.  Once an
> +# architecture meets these requirements,
> +#
> +# On x86, a program can safely modify code, issue
> +# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, and then execute that code, via
> +# the modified address or an alias, from any thread in the calling process.
> +#
> +# On arm64, a program can modify code, flush the icache as needed, and issue
> +# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE to force a "context 
> synchronizing
> +# event", aka pipeline flush on all CPUs that might run the calling process.
> +# Then the program can execute the modified code as long as it is executed
> +# from an address consistent with the icache flush and the CPU's cache type.
> +#
> +# On powerpc, a program can use MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE
> +# similarly to arm64.  It would be nice if the powerpc maintainers could
> +# add a more clear explanantion.
> diff --git a/arch/arm64/include/asm/sync_core.h 
> b/arch/arm64/include/asm/sync_core.h
> new file mode 100644
> index 000000000000..74996bf533bb
> --- /dev/null
> +++ b/arch/arm64/include/asm/sync_core.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_ARM64_SYNC_CORE_H
> +#define _ASM_ARM64_SYNC_CORE_H
> +
> +#include <asm/barrier.h>
> +
> +/*
> + * On arm64, anyone trying to use membarrier() to handle JIT code is
> + * required to first flush the icache and then do SYNC_CORE.  All that's
> + * needed after the icache flush is to execute a "context synchronization
> + * event".  Right now, ERET does this, and we are guaranteed to ERET before
> + * any user code runs.  If Linux ever programs the CPU to make ERET stop
> + * being a context synchronizing event, then this will need to be adjusted.
> + */
> +static inline void membarrier_sync_core_before_usermode(void)
> +{
> +}
> +
> +#endif /* _ASM_ARM64_SYNC_CORE_H */
> diff --git a/arch/powerpc/include/asm/sync_core.h 
> b/arch/powerpc/include/asm/sync_core.h
> new file mode 100644
> index 000000000000..589fdb34beab
> --- /dev/null
> +++ b/arch/powerpc/include/asm/sync_core.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_POWERPC_SYNC_CORE_H
> +#define _ASM_POWERPC_SYNC_CORE_H
> +
> +#include <asm/barrier.h>
> +
> +/*
> + * XXX: can a powerpc person put an appropriate comment here?
> + */
> +static inline void membarrier_sync_core_before_usermode(void)
> +{
> +}
> +
> +#endif /* _ASM_POWERPC_SYNC_CORE_H */

powerpc's can just go in asm/membarrier.h

/*
 * The RFI family of instructions are context synchronising, and
 * that is how we return to userspace, so nothing is required here.
 */

> diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
> index c32c32a2441e..f72a6ab3fac2 100644
> --- a/kernel/sched/membarrier.c
> +++ b/kernel/sched/membarrier.c
> @@ -5,6 +5,9 @@
>   * membarrier system call
>   */
>  #include "sched.h"
> +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
> +#include <asm/sync_core.h>
> +#endif

Can you

#else
static inline void membarrier_sync_core_before_usermode(void)
{
 /* this gets constant folded out */
}
#endif

And avoid adding the ifdefs in the following code?

Otherwise I think this is good.

Acked-by: Nicholas Piggin <npig...@gmail.com>

Thanks,
Nick

>  
>  /*
>   * The basic principle behind the regular memory barrier mode of membarrier()
> @@ -221,6 +224,7 @@ static void ipi_mb(void *info)
>       smp_mb();       /* IPIs should be serializing but paranoid. */
>  }
>  
> +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
>  static void ipi_sync_core(void *info)
>  {
>       /*
> @@ -230,13 +234,14 @@ static void ipi_sync_core(void *info)
>        * the big comment at the top of this file.
>        *
>        * A sync_core() would provide this guarantee, but
> -      * sync_core_before_usermode() might end up being deferred until
> -      * after membarrier()'s smp_mb().
> +      * membarrier_sync_core_before_usermode() might end up being deferred
> +      * until after membarrier()'s smp_mb().
>        */
>       smp_mb();       /* IPIs should be serializing but paranoid. */
>  
> -     sync_core_before_usermode();
> +     membarrier_sync_core_before_usermode();
>  }
> +#endif
>  
>  static void ipi_rseq(void *info)
>  {
> @@ -368,12 +373,14 @@ static int membarrier_private_expedited(int flags, int 
> cpu_id)
>       smp_call_func_t ipi_func = ipi_mb;
>  
>       if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
> -             if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
> +#ifndef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
>                       return -EINVAL;
> +#else
>               if (!(atomic_read(&mm->membarrier_state) &
>                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
>                       return -EPERM;
>               ipi_func = ipi_sync_core;
> +#endif
>       } else if (flags == MEMBARRIER_FLAG_RSEQ) {
>               if (!IS_ENABLED(CONFIG_RSEQ))
>                       return -EINVAL;
> -- 
> 2.31.1
> 
> 

Reply via email to