Module Name:    src
Committed By:   riastradh
Date:           Sat Jul 30 14:11:01 UTC 2022

Modified Files:
        src/common/lib/libc/arch/i386/atomic: atomic.S
        src/common/lib/libc/arch/x86_64/atomic: atomic.S
        src/sys/arch/amd64/include: frameasm.h
        src/sys/arch/i386/include: frameasm.h
        src/sys/arch/x86/x86: patch.c

Log Message:
x86: Eliminate mfence hotpatch for membar_sync.

The more-compatible  LOCK ADD $0,-N(%rsp)  turns out to be cheaper
than MFENCE anyway.  Let's save some space and maintenance and rip
out the hotpatching for it.


To generate a diff of this commit:
cvs rdiff -u -r1.35 -r1.36 src/common/lib/libc/arch/i386/atomic/atomic.S
cvs rdiff -u -r1.28 -r1.29 src/common/lib/libc/arch/x86_64/atomic/atomic.S
cvs rdiff -u -r1.54 -r1.55 src/sys/arch/amd64/include/frameasm.h
cvs rdiff -u -r1.34 -r1.35 src/sys/arch/i386/include/frameasm.h
cvs rdiff -u -r1.50 -r1.51 src/sys/arch/x86/x86/patch.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/common/lib/libc/arch/i386/atomic/atomic.S
diff -u src/common/lib/libc/arch/i386/atomic/atomic.S:1.35 src/common/lib/libc/arch/i386/atomic/atomic.S:1.36
--- src/common/lib/libc/arch/i386/atomic/atomic.S:1.35	Sat Apr  9 23:32:51 2022
+++ src/common/lib/libc/arch/i386/atomic/atomic.S	Sat Jul 30 14:11:00 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: atomic.S,v 1.35 2022/04/09 23:32:51 riastradh Exp $	*/
+/*	$NetBSD: atomic.S,v 1.36 2022/07/30 14:11:00 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2007 The NetBSD Foundation, Inc.
@@ -46,11 +46,9 @@
 #include "opt_xen.h"
 #include <machine/frameasm.h>
 #define LOCK			HOTPATCH(HP_NAME_NOLOCK, 1); lock
-#define HOTPATCH_SSE2_MFENCE	HOTPATCH(HP_NAME_SSE2_MFENCE, 7);
 #define HOTPATCH_CAS_64		HOTPATCH(HP_NAME_CAS_64, 49);
 #else
 #define LOCK			lock
-#define HOTPATCH_SSE2_MFENCE	/* nothing */
 #define HOTPATCH_CAS_64		/* nothing */
 #endif
 
@@ -198,13 +196,22 @@ END(_membar_release)
 
 ENTRY(_membar_sync)
 	/*
-	 * MFENCE, or a serializing instruction like a locked addq,
+	 * MFENCE, or a serializing instruction like a locked ADDL,
 	 * is necessary to order store-before-load.  Every other
 	 * ordering -- load-before-anything, anything-before-store --
 	 * is already guaranteed without explicit barriers.
+	 *
+	 * Empirically it turns out locked ADDL is cheaper than MFENCE,
+	 * so we use that, with an offset below the return address on
+	 * the stack to avoid a false dependency with RET.  (It might
+	 * even be better to use a much lower offset, say -128, to
+	 * avoid false dependencies for subsequent callees of the
+	 * caller.)
+	 *
+	 * https://pvk.ca/Blog/2014/10/19/performance-optimisation-~-writing-an-essay/
+	 * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+	 * https://www.agner.org/optimize/instruction_tables.pdf
 	 */
-	HOTPATCH_SSE2_MFENCE
-	/* 7 bytes of instructions */
 	LOCK
 	addl	$0, -4(%esp)
 	ret
@@ -406,13 +413,3 @@ STRONG_ALIAS(_membar_consumer,_membar_ac
 STRONG_ALIAS(_membar_producer,_membar_release)
 STRONG_ALIAS(_membar_enter,_membar_sync)
 STRONG_ALIAS(_membar_exit,_membar_release)
-
-#ifdef _HARDKERNEL
-	.section .rodata
-
-LABEL(sse2_mfence)
-	mfence
-	ret
-	nop; nop; nop;
-LABEL(sse2_mfence_end)
-#endif	/* _HARDKERNEL */

Index: src/common/lib/libc/arch/x86_64/atomic/atomic.S
diff -u src/common/lib/libc/arch/x86_64/atomic/atomic.S:1.28 src/common/lib/libc/arch/x86_64/atomic/atomic.S:1.29
--- src/common/lib/libc/arch/x86_64/atomic/atomic.S:1.28	Sat Apr  9 23:32:52 2022
+++ src/common/lib/libc/arch/x86_64/atomic/atomic.S	Sat Jul 30 14:11:00 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: atomic.S,v 1.28 2022/04/09 23:32:52 riastradh Exp $	*/
+/*	$NetBSD: atomic.S,v 1.29 2022/07/30 14:11:00 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2007 The NetBSD Foundation, Inc.
@@ -38,15 +38,6 @@
 #define	ALIAS(f, t)	WEAK_ALIAS(f,t)
 #endif
 
-#ifdef _HARDKERNEL
-#include <machine/frameasm.h>
-#define LOCK			HOTPATCH(HP_NAME_NOLOCK, 1); lock
-#define HOTPATCH_SSE2_MFENCE	HOTPATCH(HP_NAME_SSE2_MFENCE, 8);
-#else
-#define LOCK			lock
-#define HOTPATCH_SSE2_MFENCE	/* nothing */
-#endif
-
 	.text
 
 /* 32-bit */
@@ -273,13 +264,22 @@ END(_membar_release)
 
 ENTRY(_membar_sync)
 	/*
-	 * MFENCE, or a serializing instruction like a locked addq,
+	 * MFENCE, or a serializing instruction like a locked ADDQ,
 	 * is necessary to order store-before-load.  Every other
 	 * ordering -- load-before-anything, anything-before-store --
 	 * is already guaranteed without explicit barriers.
+	 *
+	 * Empirically it turns out locked ADDQ is cheaper than MFENCE,
+	 * so we use that, with an offset below the return address on
+	 * the stack to avoid a false dependency with RET.  (It might
+	 * even be better to use a much lower offset, say -128, to
+	 * avoid false dependencies for subsequent callees of the
+	 * caller.)
+	 *
+	 * https://pvk.ca/Blog/2014/10/19/performance-optimisation-~-writing-an-essay/
+	 * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+	 * https://www.agner.org/optimize/instruction_tables.pdf
 	 */
-	HOTPATCH_SSE2_MFENCE
-	/* 8 bytes of instructions */
 	LOCK
 	addq	$0, -8(%rsp)
 	ret
@@ -429,13 +429,3 @@ STRONG_ALIAS(_membar_consumer,_membar_ac
 STRONG_ALIAS(_membar_producer,_membar_release)
 STRONG_ALIAS(_membar_enter,_membar_sync)
 STRONG_ALIAS(_membar_exit,_membar_release)
-
-#ifdef _HARDKERNEL
-	.section .rodata
-
-LABEL(sse2_mfence)
-	mfence
-	ret
-	nop; nop; nop; nop;
-LABEL(sse2_mfence_end)
-#endif	/* _HARDKERNEL */

Index: src/sys/arch/amd64/include/frameasm.h
diff -u src/sys/arch/amd64/include/frameasm.h:1.54 src/sys/arch/amd64/include/frameasm.h:1.55
--- src/sys/arch/amd64/include/frameasm.h:1.54	Sat Apr  9 12:07:00 2022
+++ src/sys/arch/amd64/include/frameasm.h	Sat Jul 30 14:11:00 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: frameasm.h,v 1.54 2022/04/09 12:07:00 riastradh Exp $	*/
+/*	$NetBSD: frameasm.h,v 1.55 2022/07/30 14:11:00 riastradh Exp $	*/
 
 #ifndef _AMD64_MACHINE_FRAMEASM_H
 #define _AMD64_MACHINE_FRAMEASM_H
@@ -63,7 +63,6 @@
 #define HP_NAME_SVS_ENTER_NMI	11
 #define HP_NAME_SVS_LEAVE_NMI	12
 #define HP_NAME_MDS_LEAVE	13
-#define HP_NAME_SSE2_MFENCE	14
 
 #define HOTPATCH(name, size) \
 123:						; \

Index: src/sys/arch/i386/include/frameasm.h
diff -u src/sys/arch/i386/include/frameasm.h:1.34 src/sys/arch/i386/include/frameasm.h:1.35
--- src/sys/arch/i386/include/frameasm.h:1.34	Sat Apr  9 12:07:00 2022
+++ src/sys/arch/i386/include/frameasm.h	Sat Jul 30 14:11:00 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: frameasm.h,v 1.34 2022/04/09 12:07:00 riastradh Exp $	*/
+/*	$NetBSD: frameasm.h,v 1.35 2022/07/30 14:11:00 riastradh Exp $	*/
 
 #ifndef _I386_FRAMEASM_H_
 #define _I386_FRAMEASM_H_
@@ -48,10 +48,9 @@
 #define HP_NAME_STAC		2
 #define HP_NAME_NOLOCK		3
 #define HP_NAME_RETFENCE	4
-#define HP_NAME_SSE2_MFENCE	5
-#define HP_NAME_CAS_64		6
-#define HP_NAME_SPLLOWER	7
-#define HP_NAME_MUTEX_EXIT	8
+#define HP_NAME_CAS_64		5
+#define HP_NAME_SPLLOWER	6
+#define HP_NAME_MUTEX_EXIT	7
 
 #define HOTPATCH(name, size) \
 123:						; \

Index: src/sys/arch/x86/x86/patch.c
diff -u src/sys/arch/x86/x86/patch.c:1.50 src/sys/arch/x86/x86/patch.c:1.51
--- src/sys/arch/x86/x86/patch.c:1.50	Sat Apr  9 12:07:00 2022
+++ src/sys/arch/x86/x86/patch.c	Sat Jul 30 14:11:00 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: patch.c,v 1.50 2022/04/09 12:07:00 riastradh Exp $	*/
+/*	$NetBSD: patch.c,v 1.51 2022/07/30 14:11:00 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: patch.c,v 1.50 2022/04/09 12:07:00 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: patch.c,v 1.51 2022/07/30 14:11:00 riastradh Exp $");
 
 #include "opt_lockdebug.h"
 #ifdef i386
@@ -117,19 +117,6 @@ static const struct x86_hotpatch_descrip
 };
 __link_set_add_rodata(x86_hotpatch_descriptors, hp_nolock_desc);
 
-/* Use MFENCE if available, part of SSE2. */
-extern uint8_t sse2_mfence, sse2_mfence_end;
-static const struct x86_hotpatch_source hp_sse2_mfence_source = {
-	.saddr = &sse2_mfence,
-	.eaddr = &sse2_mfence_end
-};
-static const struct x86_hotpatch_descriptor hp_sse2_mfence_desc = {
-	.name = HP_NAME_SSE2_MFENCE,
-	.nsrc = 1,
-	.srcs = { &hp_sse2_mfence_source }
-};
-__link_set_add_rodata(x86_hotpatch_descriptors, hp_sse2_mfence_desc);
-
 #ifdef i386
 /* CAS_64. */
 extern uint8_t _atomic_cas_cx8, _atomic_cas_cx8_end;
@@ -327,20 +314,6 @@ x86_patch(bool early)
 #endif
 	}
 
-	if (!early && (cpu_feature[0] & CPUID_SSE2) != 0) {
-		/*
-		 * Faster memory barriers.  The only barrier x86 ever
-		 * requires for MI synchronization between CPUs is
-		 * MFENCE for store-before-load ordering; all other
-		 * ordering is guaranteed already -- every load is a
-		 * load-acquire and every store is a store-release.
-		 *
-		 * LFENCE and SFENCE are relevant only for MD logic
-		 * involving I/O devices or non-temporal stores.
-		 */
-		x86_hotpatch(HP_NAME_SSE2_MFENCE, 0);
-	}
-
 #ifdef i386
 	/*
 	 * Patch early and late.  Second time around the 'lock' prefix

Reply via email to