Author: kib
Date: Sun Jun 28 05:04:08 2015
New Revision: 284901
URL: https://svnweb.freebsd.org/changeset/base/284901

Log:
  Remove the unneeded data dependency, currently imposed by
  atomic_load_acq(9), on its source, for x86.
  
  Right now, atomic_load_acq() on x86 is sequentially consistent with
  other atomics; the code ensures this by issuing a Store/Load barrier,
  implemented as a locked nop on the source.  Provide a separate
  primitive, __storeload_barrier(), implemented as a locked nop on a
  cpu-private variable, and issue __storeload_barrier() before the
  load.  This keeps the seq_cst semantics but avoids introducing a
  false dependency on the non-modification of the source for its later
  use.
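
  For illustration only, a minimal sketch (not part of the commit) of
  what the new amd64 SMP kernel path boils down to.  The asm and the
  0x180 offset are taken from the diff below; the *_sketch names are
  made up, and u_int is assumed to come from sys/types.h:

        static __inline void
        __storeload_barrier_sketch(void)
        {

                /*
                 * Locked no-op addition on the per-CPU monitor buffer
                 * (%gs:OFFSETOF_MONITORBUF): acts as a full Store/Load
                 * barrier without touching the location that is loaded
                 * afterwards.
                 */
                __asm __volatile("lock; addl $0,%%gs:%0"
                    : "+m" (*(u_int *)0x180) : : "memory", "cc");
        }

        static __inline u_int
        atomic_load_acq_int_sketch(volatile u_int *p)
        {
                u_int res;

                __storeload_barrier_sketch();        /* drain the store buffer */
                res = *p;                            /* plain load of the source */
                __asm __volatile("" : : : "memory"); /* i.e. __compiler_membar() */
                return (res);
        }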
  
  Note that the seq_cst property of the x86 atomic_load_acq() is not
  documented and is not provided by the atomics implementations on
  other architectures, although some kernel code relies on the
  behaviour.  This commit does not intend to change that.
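
  As a hypothetical illustration of the store-then-load pattern that
  the seq_cst note above is about (the flags and function names below
  are made up, not taken from the tree):

        #include <sys/types.h>
        #include <machine/atomic.h>

        static volatile u_int want0, want1;   /* hypothetical flags */

        static void
        cpu0_try_enter(void)
        {

                atomic_store_rel_int(&want0, 1);
                if (atomic_load_acq_int(&want1) == 0) {
                        /* CPU 0 concludes that CPU 1 is not interested. */
                }
        }

        static void
        cpu1_try_enter(void)
        {

                atomic_store_rel_int(&want1, 1);
                if (atomic_load_acq_int(&want0) == 0) {
                        /* CPU 1 concludes that CPU 0 is not interested. */
                }
        }

  With only a plain acquire load, both stores could still sit in their
  CPUs' store buffers when the loads execute, so both functions could
  take the branch.  The lock-prefixed addition that __storeload_barrier()
  issues before each load makes the CPU's own prior store globally
  visible first, so at most one of the two branches can be taken.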
  
  Reviewed by:  alc
  Discussed with:       bde
  Tested by:    pho
  Sponsored by: The FreeBSD Foundation
  MFC after:    2 weeks

Modified:
  head/sys/amd64/amd64/atomic.c
  head/sys/amd64/amd64/vm_machdep.c
  head/sys/amd64/include/atomic.h
  head/sys/i386/i386/vm_machdep.c
  head/sys/i386/include/atomic.h

Modified: head/sys/amd64/amd64/atomic.c
==============================================================================
--- head/sys/amd64/amd64/atomic.c       Sun Jun 28 03:22:26 2015        (r284900)
+++ head/sys/amd64/amd64/atomic.c       Sun Jun 28 05:04:08 2015        (r284901)
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
 #undef ATOMIC_ASM
 
 /* Make atomic.h generate public functions */
+static __inline void __storeload_barrier(void);
 #define WANT_FUNCTIONS
 #define static
 #undef __inline

Modified: head/sys/amd64/amd64/vm_machdep.c
==============================================================================
--- head/sys/amd64/amd64/vm_machdep.c   Sun Jun 28 03:22:26 2015        (r284900)
+++ head/sys/amd64/amd64/vm_machdep.c   Sun Jun 28 05:04:08 2015        (r284901)
@@ -93,6 +93,8 @@ _Static_assert(OFFSETOF_CURTHREAD == off
     "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread.");
 _Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb),
     "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb.");
+_Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
+    "OFFSETOF_MONINORBUF does not correspond with offset of pc_monitorbuf.");
 
 struct savefpu *
 get_pcb_user_save_td(struct thread *td)

Modified: head/sys/amd64/include/atomic.h
==============================================================================
--- head/sys/amd64/include/atomic.h     Sun Jun 28 03:22:26 2015        (r284900)
+++ head/sys/amd64/include/atomic.h     Sun Jun 28 05:04:08 2015        (r284901)
@@ -85,7 +85,7 @@ u_long        atomic_fetchadd_long(volatile u_l
 int    atomic_testandset_int(volatile u_int *p, u_int v);
 int    atomic_testandset_long(volatile u_long *p, u_int v);
 
-#define        ATOMIC_LOAD(TYPE, LOP)                                  \
+#define        ATOMIC_LOAD(TYPE)                                       \
 u_##TYPE       atomic_load_acq_##TYPE(volatile u_##TYPE *p)
 #define        ATOMIC_STORE(TYPE)                                      \
 void           atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
@@ -245,53 +245,79 @@ atomic_testandset_long(volatile u_long *
  * We assume that a = b will do atomic loads and stores.  Due to the
  * IA32 memory model, a simple store guarantees release semantics.
  *
- * However, loads may pass stores, so for atomic_load_acq we have to
- * ensure a Store/Load barrier to do the load in SMP kernels.  We use
- * "lock cmpxchg" as recommended by the AMD Software Optimization
- * Guide, and not mfence.  For UP kernels, however, the cache of the
- * single processor is always consistent, so we only need to take care
- * of the compiler.
+ * However, a load may pass a store if they are performed on distinct
+ * addresses, so for atomic_load_acq we introduce a Store/Load barrier
+ * before the load in SMP kernels.  We use "lock addl $0,mem", as
+ * recommended by the AMD Software Optimization Guide, and not mfence.
+ * In the kernel, we use a private per-cpu cache line as the target
+ * for the locked addition, to avoid introducing false data
+ * dependencies.  In userspace, a word in the red zone on the stack
+ * (-8(%rsp)) is utilized.
+ *
+ * For UP kernels, however, the memory of the single processor is
+ * always consistent, so we only need to stop the compiler from
+ * reordering accesses in a way that violates the semantics of acquire
+ * and release.
  */
-#define        ATOMIC_STORE(TYPE)                              \
-static __inline void                                   \
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{                                                      \
-       __compiler_membar();                            \
-       *p = v;                                         \
-}                                                      \
-struct __hack
 
-#if defined(_KERNEL) && !defined(SMP)
+#if defined(_KERNEL)
 
-#define        ATOMIC_LOAD(TYPE, LOP)                          \
-static __inline u_##TYPE                               \
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)           \
-{                                                      \
-       u_##TYPE tmp;                                   \
-                                                       \
-       tmp = *p;                                       \
-       __compiler_membar();                            \
-       return (tmp);                                   \
-}                                                      \
-struct __hack
+/*
+ * OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
+ *
+ * The open-coded number is used instead of the symbolic expression to
+ * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
+ * An assertion in amd64/vm_machdep.c ensures that the value is correct.
+ */
+#define        OFFSETOF_MONITORBUF     0x180
 
-#else /* !(_KERNEL && !SMP) */
+#if defined(SMP)
+static __inline void
+__storeload_barrier(void)
+{
 
-#define        ATOMIC_LOAD(TYPE, LOP)                          \
-static __inline u_##TYPE                               \
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)           \
-{                                                      \
-       u_##TYPE res;                                   \
-                                                       \
-       __asm __volatile(MPLOCKED LOP                   \
-       : "=a" (res),                   /* 0 */         \
-         "+m" (*p)                     /* 1 */         \
-       : : "memory", "cc");                            \
-       return (res);                                   \
-}                                                      \
+       __asm __volatile("lock; addl $0,%%gs:%0"
+           : "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
+}
+#else /* _KERNEL && UP */
+static __inline void
+__storeload_barrier(void)
+{
+
+       __compiler_membar();
+}
+#endif /* SMP */
+#else /* !_KERNEL */
+static __inline void
+__storeload_barrier(void)
+{
+
+       __asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc");
+}
+#endif /* _KERNEL */
+
+#define        ATOMIC_LOAD(TYPE)                                       \
+static __inline u_##TYPE                                       \
+atomic_load_acq_##TYPE(volatile u_##TYPE *p)                   \
+{                                                              \
+       u_##TYPE res;                                           \
+                                                               \
+       __storeload_barrier();                                  \
+       res = *p;                                               \
+       __compiler_membar();                                    \
+       return (res);                                           \
+}                                                              \
 struct __hack
 
-#endif /* _KERNEL && !SMP */
+#define        ATOMIC_STORE(TYPE)                                      \
+static __inline void                                           \
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)      \
+{                                                              \
+                                                               \
+       __compiler_membar();                                    \
+       *p = v;                                                 \
+}                                                              \
+struct __hack
 
 #endif /* KLD_MODULE || !__GNUCLIKE_ASM */
 
@@ -315,20 +341,19 @@ ATOMIC_ASM(clear,    long,  "andq %1,%0"
 ATOMIC_ASM(add,             long,  "addq %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, long,  "subq %1,%0",  "ir",  v);
 
-ATOMIC_LOAD(char,  "cmpxchgb %b0,%1");
-ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
-ATOMIC_LOAD(int,   "cmpxchgl %0,%1");
-ATOMIC_LOAD(long,  "cmpxchgq %0,%1");
-
-ATOMIC_STORE(char);
-ATOMIC_STORE(short);
-ATOMIC_STORE(int);
-ATOMIC_STORE(long);
+#define        ATOMIC_LOADSTORE(TYPE)                                  \
+       ATOMIC_LOAD(TYPE);                                      \
+       ATOMIC_STORE(TYPE)
+
+ATOMIC_LOADSTORE(char);
+ATOMIC_LOADSTORE(short);
+ATOMIC_LOADSTORE(int);
+ATOMIC_LOADSTORE(long);
 
 #undef ATOMIC_ASM
 #undef ATOMIC_LOAD
 #undef ATOMIC_STORE
-
+#undef ATOMIC_LOADSTORE
 #ifndef WANT_FUNCTIONS
 
 /* Read the current value and store a new value in the destination. */

Modified: head/sys/i386/i386/vm_machdep.c
==============================================================================
--- head/sys/i386/i386/vm_machdep.c     Sun Jun 28 03:22:26 2015        (r284900)
+++ head/sys/i386/i386/vm_machdep.c     Sun Jun 28 05:04:08 2015        (r284901)
@@ -111,6 +111,8 @@ _Static_assert(OFFSETOF_CURTHREAD == off
     "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread.");
 _Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb),
     "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb.");
+_Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
+    "OFFSETOF_MONINORBUF does not correspond with offset of pc_monitorbuf.");
 
 static void    cpu_reset_real(void);
 #ifdef SMP

Modified: head/sys/i386/include/atomic.h
==============================================================================
--- head/sys/i386/include/atomic.h      Sun Jun 28 03:22:26 2015        (r284900)
+++ head/sys/i386/include/atomic.h      Sun Jun 28 05:04:08 2015        (r284901)
@@ -87,7 +87,7 @@ int   atomic_cmpset_int(volatile u_int *ds
 u_int  atomic_fetchadd_int(volatile u_int *p, u_int v);
 int    atomic_testandset_int(volatile u_int *p, u_int v);
 
-#define        ATOMIC_LOAD(TYPE, LOP)                                  \
+#define        ATOMIC_LOAD(TYPE)                                       \
 u_##TYPE       atomic_load_acq_##TYPE(volatile u_##TYPE *p)
 #define        ATOMIC_STORE(TYPE)                                      \
 void           atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
@@ -228,53 +228,78 @@ atomic_testandset_int(volatile u_int *p,
  * We assume that a = b will do atomic loads and stores.  Due to the
  * IA32 memory model, a simple store guarantees release semantics.
  *
- * However, loads may pass stores, so for atomic_load_acq we have to
- * ensure a Store/Load barrier to do the load in SMP kernels.  We use
- * "lock cmpxchg" as recommended by the AMD Software Optimization
- * Guide, and not mfence.  For UP kernels, however, the cache of the
- * single processor is always consistent, so we only need to take care
- * of the compiler.
+ * However, a load may pass a store if they are performed on distinct
+ * addresses, so for atomic_load_acq we introduce a Store/Load barrier
+ * before the load in SMP kernels.  We use "lock addl $0,mem", as
+ * recommended by the AMD Software Optimization Guide, and not mfence.
+ * In the kernel, we use a private per-cpu cache line as the target
+ * for the locked addition, to avoid introducing false data
+ * dependencies.  In userspace, a word at the top of the stack is
+ * utilized.
+ *
+ * For UP kernels, however, the memory of the single processor is
+ * always consistent, so we only need to stop the compiler from
+ * reordering accesses in a way that violates the semantics of acquire
+ * and release.
  */
-#define        ATOMIC_STORE(TYPE)                              \
-static __inline void                                   \
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{                                                      \
-       __compiler_membar();                            \
-       *p = v;                                         \
-}                                                      \
-struct __hack
+#if defined(_KERNEL)
 
-#if defined(_KERNEL) && !defined(SMP)
+/*
+ * OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
+ *
+ * The open-coded number is used instead of the symbolic expression to
+ * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
+ * An assertion in i386/vm_machdep.c ensures that the value is correct.
+ */
+#define        OFFSETOF_MONITORBUF     0x180
 
-#define        ATOMIC_LOAD(TYPE, LOP)                          \
-static __inline u_##TYPE                               \
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)           \
-{                                                      \
-       u_##TYPE tmp;                                   \
-                                                       \
-       tmp = *p;                                       \
-       __compiler_membar();                            \
-       return (tmp);                                   \
-}                                                      \
-struct __hack
+#if defined(SMP)
+static __inline void
+__storeload_barrier(void)
+{
 
-#else /* !(_KERNEL && !SMP) */
+       __asm __volatile("lock; addl $0,%%fs:%0"
+           : "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
+}
+#else /* _KERNEL && UP */
+static __inline void
+__storeload_barrier(void)
+{
 
-#define        ATOMIC_LOAD(TYPE, LOP)                          \
-static __inline u_##TYPE                               \
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)           \
-{                                                      \
-       u_##TYPE res;                                   \
-                                                       \
-       __asm __volatile(MPLOCKED LOP                   \
-       : "=a" (res),                   /* 0 */         \
-         "+m" (*p)                     /* 1 */         \
-       : : "memory", "cc");                            \
-       return (res);                                   \
-}                                                      \
+       __compiler_membar();
+}
+#endif /* SMP */
+#else /* !_KERNEL */
+static __inline void
+__storeload_barrier(void)
+{
+
+       __asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc");
+}
+#endif /* _KERNEL */
+
+#define        ATOMIC_LOAD(TYPE)                                       \
+static __inline u_##TYPE                                       \
+atomic_load_acq_##TYPE(volatile u_##TYPE *p)                   \
+{                                                              \
+       u_##TYPE res;                                           \
+                                                               \
+       __storeload_barrier();                                  \
+       res = *p;                                               \
+       __compiler_membar();                                    \
+       return (res);                                           \
+}                                                              \
 struct __hack
 
-#endif /* _KERNEL && !SMP */
+#define        ATOMIC_STORE(TYPE)                                      \
+static __inline void                                           \
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)      \
+{                                                              \
+                                                               \
+       __compiler_membar();                                    \
+       *p = v;                                                 \
+}                                                              \
+struct __hack
 
 #ifdef _KERNEL
 
@@ -511,19 +536,19 @@ ATOMIC_ASM(clear,    long,  "andl %1,%0"
 ATOMIC_ASM(add,             long,  "addl %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, long,  "subl %1,%0",  "ir",  v);
 
-ATOMIC_LOAD(char,  "cmpxchgb %b0,%1");
-ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
-ATOMIC_LOAD(int,   "cmpxchgl %0,%1");
-ATOMIC_LOAD(long,  "cmpxchgl %0,%1");
-
-ATOMIC_STORE(char);
-ATOMIC_STORE(short);
-ATOMIC_STORE(int);
-ATOMIC_STORE(long);
+#define        ATOMIC_LOADSTORE(TYPE)                          \
+       ATOMIC_LOAD(TYPE);                              \
+       ATOMIC_STORE(TYPE)
+
+ATOMIC_LOADSTORE(char);
+ATOMIC_LOADSTORE(short);
+ATOMIC_LOADSTORE(int);
+ATOMIC_LOADSTORE(long);
 
 #undef ATOMIC_ASM
 #undef ATOMIC_LOAD
 #undef ATOMIC_STORE
+#undef ATOMIC_LOADSTORE
 
 #ifndef WANT_FUNCTIONS
 