Module Name: src Committed By: riastradh Date: Sat Jul 30 14:11:01 UTC 2022
Modified Files: src/common/lib/libc/arch/i386/atomic: atomic.S src/common/lib/libc/arch/x86_64/atomic: atomic.S src/sys/arch/amd64/include: frameasm.h src/sys/arch/i386/include: frameasm.h src/sys/arch/x86/x86: patch.c Log Message: x86: Eliminate mfence hotpatch for membar_sync. The more-compatible LOCK ADD $0,-N(%rsp) turns out to be cheaper than MFENCE anyway. Let's save some space and maintenance and rip out the hotpatching for it. To generate a diff of this commit: cvs rdiff -u -r1.35 -r1.36 src/common/lib/libc/arch/i386/atomic/atomic.S cvs rdiff -u -r1.28 -r1.29 src/common/lib/libc/arch/x86_64/atomic/atomic.S cvs rdiff -u -r1.54 -r1.55 src/sys/arch/amd64/include/frameasm.h cvs rdiff -u -r1.34 -r1.35 src/sys/arch/i386/include/frameasm.h cvs rdiff -u -r1.50 -r1.51 src/sys/arch/x86/x86/patch.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/common/lib/libc/arch/i386/atomic/atomic.S diff -u src/common/lib/libc/arch/i386/atomic/atomic.S:1.35 src/common/lib/libc/arch/i386/atomic/atomic.S:1.36 --- src/common/lib/libc/arch/i386/atomic/atomic.S:1.35 Sat Apr 9 23:32:51 2022 +++ src/common/lib/libc/arch/i386/atomic/atomic.S Sat Jul 30 14:11:00 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: atomic.S,v 1.35 2022/04/09 23:32:51 riastradh Exp $ */ +/* $NetBSD: atomic.S,v 1.36 2022/07/30 14:11:00 riastradh Exp $ */ /*- * Copyright (c) 2007 The NetBSD Foundation, Inc. @@ -46,11 +46,9 @@ #include "opt_xen.h" #include <machine/frameasm.h> #define LOCK HOTPATCH(HP_NAME_NOLOCK, 1); lock -#define HOTPATCH_SSE2_MFENCE HOTPATCH(HP_NAME_SSE2_MFENCE, 7); #define HOTPATCH_CAS_64 HOTPATCH(HP_NAME_CAS_64, 49); #else #define LOCK lock -#define HOTPATCH_SSE2_MFENCE /* nothing */ #define HOTPATCH_CAS_64 /* nothing */ #endif @@ -198,13 +196,22 @@ END(_membar_release) ENTRY(_membar_sync) /* - * MFENCE, or a serializing instruction like a locked addq, + * MFENCE, or a serializing instruction like a locked ADDL, * is necessary to order store-before-load. Every other * ordering -- load-before-anything, anything-before-store -- * is already guaranteed without explicit barriers. + * + * Empirically it turns out locked ADDL is cheaper than MFENCE, + * so we use that, with an offset below the return address on + * the stack to avoid a false dependency with RET. (It might + * even be better to use a much lower offset, say -128, to + * avoid false dependencies for subsequent callees of the + * caller.) + * + * https://pvk.ca/Blog/2014/10/19/performance-optimisation-~-writing-an-essay/ + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ + * https://www.agner.org/optimize/instruction_tables.pdf */ - HOTPATCH_SSE2_MFENCE - /* 7 bytes of instructions */ LOCK addl $0, -4(%esp) ret @@ -406,13 +413,3 @@ STRONG_ALIAS(_membar_consumer,_membar_ac STRONG_ALIAS(_membar_producer,_membar_release) STRONG_ALIAS(_membar_enter,_membar_sync) STRONG_ALIAS(_membar_exit,_membar_release) - -#ifdef _HARDKERNEL - .section .rodata - -LABEL(sse2_mfence) - mfence - ret - nop; nop; nop; -LABEL(sse2_mfence_end) -#endif /* _HARDKERNEL */ Index: src/common/lib/libc/arch/x86_64/atomic/atomic.S diff -u src/common/lib/libc/arch/x86_64/atomic/atomic.S:1.28 src/common/lib/libc/arch/x86_64/atomic/atomic.S:1.29 --- src/common/lib/libc/arch/x86_64/atomic/atomic.S:1.28 Sat Apr 9 23:32:52 2022 +++ src/common/lib/libc/arch/x86_64/atomic/atomic.S Sat Jul 30 14:11:00 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: atomic.S,v 1.28 2022/04/09 23:32:52 riastradh Exp $ */ +/* $NetBSD: atomic.S,v 1.29 2022/07/30 14:11:00 riastradh Exp $ */ /*- * Copyright (c) 2007 The NetBSD Foundation, Inc. @@ -38,15 +38,6 @@ #define ALIAS(f, t) WEAK_ALIAS(f,t) #endif -#ifdef _HARDKERNEL -#include <machine/frameasm.h> -#define LOCK HOTPATCH(HP_NAME_NOLOCK, 1); lock -#define HOTPATCH_SSE2_MFENCE HOTPATCH(HP_NAME_SSE2_MFENCE, 8); -#else -#define LOCK lock -#define HOTPATCH_SSE2_MFENCE /* nothing */ -#endif - .text /* 32-bit */ @@ -273,13 +264,22 @@ END(_membar_release) ENTRY(_membar_sync) /* - * MFENCE, or a serializing instruction like a locked addq, + * MFENCE, or a serializing instruction like a locked ADDQ, * is necessary to order store-before-load. Every other * ordering -- load-before-anything, anything-before-store -- * is already guaranteed without explicit barriers. + * + * Empirically it turns out locked ADDQ is cheaper than MFENCE, + * so we use that, with an offset below the return address on + * the stack to avoid a false dependency with RET. (It might + * even be better to use a much lower offset, say -128, to + * avoid false dependencies for subsequent callees of the + * caller.) + * + * https://pvk.ca/Blog/2014/10/19/performance-optimisation-~-writing-an-essay/ + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ + * https://www.agner.org/optimize/instruction_tables.pdf */ - HOTPATCH_SSE2_MFENCE - /* 8 bytes of instructions */ LOCK addq $0, -8(%rsp) ret @@ -429,13 +429,3 @@ STRONG_ALIAS(_membar_consumer,_membar_ac STRONG_ALIAS(_membar_producer,_membar_release) STRONG_ALIAS(_membar_enter,_membar_sync) STRONG_ALIAS(_membar_exit,_membar_release) - -#ifdef _HARDKERNEL - .section .rodata - -LABEL(sse2_mfence) - mfence - ret - nop; nop; nop; nop; -LABEL(sse2_mfence_end) -#endif /* _HARDKERNEL */ Index: src/sys/arch/amd64/include/frameasm.h diff -u src/sys/arch/amd64/include/frameasm.h:1.54 src/sys/arch/amd64/include/frameasm.h:1.55 --- src/sys/arch/amd64/include/frameasm.h:1.54 Sat Apr 9 12:07:00 2022 +++ src/sys/arch/amd64/include/frameasm.h Sat Jul 30 14:11:00 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: frameasm.h,v 1.54 2022/04/09 12:07:00 riastradh Exp $ */ +/* $NetBSD: frameasm.h,v 1.55 2022/07/30 14:11:00 riastradh Exp $ */ #ifndef _AMD64_MACHINE_FRAMEASM_H #define _AMD64_MACHINE_FRAMEASM_H @@ -63,7 +63,6 @@ #define HP_NAME_SVS_ENTER_NMI 11 #define HP_NAME_SVS_LEAVE_NMI 12 #define HP_NAME_MDS_LEAVE 13 -#define HP_NAME_SSE2_MFENCE 14 #define HOTPATCH(name, size) \ 123: ; \ Index: src/sys/arch/i386/include/frameasm.h diff -u src/sys/arch/i386/include/frameasm.h:1.34 src/sys/arch/i386/include/frameasm.h:1.35 --- src/sys/arch/i386/include/frameasm.h:1.34 Sat Apr 9 12:07:00 2022 +++ src/sys/arch/i386/include/frameasm.h Sat Jul 30 14:11:00 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: frameasm.h,v 1.34 2022/04/09 12:07:00 riastradh Exp $ */ +/* $NetBSD: frameasm.h,v 1.35 2022/07/30 14:11:00 riastradh Exp $ */ #ifndef _I386_FRAMEASM_H_ #define _I386_FRAMEASM_H_ @@ -48,10 +48,9 @@ #define HP_NAME_STAC 2 #define HP_NAME_NOLOCK 3 #define HP_NAME_RETFENCE 4 -#define HP_NAME_SSE2_MFENCE 5 -#define HP_NAME_CAS_64 6 -#define HP_NAME_SPLLOWER 7 -#define HP_NAME_MUTEX_EXIT 8 +#define HP_NAME_CAS_64 5 +#define HP_NAME_SPLLOWER 6 +#define HP_NAME_MUTEX_EXIT 7 #define HOTPATCH(name, size) \ 123: ; \ Index: src/sys/arch/x86/x86/patch.c diff -u src/sys/arch/x86/x86/patch.c:1.50 src/sys/arch/x86/x86/patch.c:1.51 --- src/sys/arch/x86/x86/patch.c:1.50 Sat Apr 9 12:07:00 2022 +++ src/sys/arch/x86/x86/patch.c Sat Jul 30 14:11:00 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: patch.c,v 1.50 2022/04/09 12:07:00 riastradh Exp $ */ +/* $NetBSD: patch.c,v 1.51 2022/07/30 14:11:00 riastradh Exp $ */ /*- * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc. @@ -34,7 +34,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: patch.c,v 1.50 2022/04/09 12:07:00 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: patch.c,v 1.51 2022/07/30 14:11:00 riastradh Exp $"); #include "opt_lockdebug.h" #ifdef i386 @@ -117,19 +117,6 @@ static const struct x86_hotpatch_descrip }; __link_set_add_rodata(x86_hotpatch_descriptors, hp_nolock_desc); -/* Use MFENCE if available, part of SSE2. */ -extern uint8_t sse2_mfence, sse2_mfence_end; -static const struct x86_hotpatch_source hp_sse2_mfence_source = { - .saddr = &sse2_mfence, - .eaddr = &sse2_mfence_end -}; -static const struct x86_hotpatch_descriptor hp_sse2_mfence_desc = { - .name = HP_NAME_SSE2_MFENCE, - .nsrc = 1, - .srcs = { &hp_sse2_mfence_source } -}; -__link_set_add_rodata(x86_hotpatch_descriptors, hp_sse2_mfence_desc); - #ifdef i386 /* CAS_64. */ extern uint8_t _atomic_cas_cx8, _atomic_cas_cx8_end; @@ -327,20 +314,6 @@ x86_patch(bool early) #endif } - if (!early && (cpu_feature[0] & CPUID_SSE2) != 0) { - /* - * Faster memory barriers. The only barrier x86 ever - * requires for MI synchronization between CPUs is - * MFENCE for store-before-load ordering; all other - * ordering is guaranteed already -- every load is a - * load-acquire and every store is a store-release. - * - * LFENCE and SFENCE are relevant only for MD logic - * involving I/O devices or non-temporal stores. - */ - x86_hotpatch(HP_NAME_SSE2_MFENCE, 0); - } - #ifdef i386 /* * Patch early and late. Second time around the 'lock' prefix