stncisync provides weaker ordering than lwsync (it orders only cacheable store-store, not load-load or load-store), so it should be as cheap or cheaper.
Microbenchmarks with no actual loads to order show that the basic execution cost is the same on POWER10. Signed-off-by: Nicholas Piggin <npig...@gmail.com> --- arch/powerpc/include/asm/barrier.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index b95b666f0374..f0ff5737b0d8 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -6,6 +6,8 @@ #define _ASM_POWERPC_BARRIER_H #include <asm/asm-const.h> +#include <asm/cputable.h> +#include <asm/feature-fixups.h> #ifndef __ASSEMBLY__ #include <asm/ppc-opcode.h> @@ -41,7 +43,12 @@ /* The sub-arch has lwsync */ #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC) -# define SMPWMB LWSYNC +# define SMPWMB \ + BEGIN_FTR_SECTION; \ + LWSYNC; \ + FTR_SECTION_ELSE; \ + .long PPC_RAW_STNCISYNC(); \ + ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_31) #elif defined(CONFIG_BOOKE) # define SMPWMB mbar #else -- 2.40.1