On Sat, Aug 31, 2019 at 03:30:26PM -0700, Noah Misch wrote:
> On Sat, Aug 31, 2019 at 02:27:55PM -0400, Tom Lane wrote:
> > Noah Misch <[email protected]> writes:
> > > Done. fetch-add-variable-test-v1.patch just adds tests for non-constant
> > > addends and 16-bit edge cases. Today's implementation handles those,
> > > PostgreSQL doesn't use them, and I might easily have broken them.
> > > fetch-add-xlc-asm-v1.patch moves xlc builds from the __fetch_and_add()
> > > intrinsic to inline asm. fetch-add-gcc-xlc-unify-v1.patch moves
> > > fetch_add to
> > > inline asm for all other ppc compilers. gcc-7.2.0 generates equivalent
> > > code
> > > before and after. I plan to keep the third patch HEAD-only,
> > > back-patching the
> > > other two. I tested with xlc v12 and v13.
> >
> > Hm, no objection to the first two patches, but I don't understand
> > why the third patch goes to so much effort just to use "addi" rather
> > than (one assumes) "li" then "add"? It doesn't seem likely that
> > that's buying much.
>
> Changing an addi to li+add may not show up on benchmarks, but I can't claim
> it's immaterial. I shouldn't unify the code if that makes the compiled code
> materially worse than what the gcc intrinsics produce today, hence the
> nontrivial (~50 line) bits to match the intrinsics' capabilities.
The first two patches have worked so far, but fetch-add-gcc-xlc-unify-v1.patch
broke older gcc: https://postgr.es/m/flat/[email protected]. That
side thread settled on putting pg_atomic_compare_exchange_u{32,64}_impl
implementations in arch-ppc.h. Attached fetch-add-gcc-xlc-unify-v2.patch does
so. For compare_exchange, the generated code doesn't match gcc intrinsics
exactly; code comments discuss this. Compared to v1, its other change is to
extract common __asm__ templates into macros instead of repeating them four
times (32/64bit, variable/const).
diff --git a/configure b/configure
index 7a6bfc2..d6ecb33 100755
--- a/configure
+++ b/configure
@@ -14650,6 +14650,46 @@ $as_echo "$pgac_cv_have_ppc_mutex_hint" >&6; }
$as_echo "#define HAVE_PPC_LWARX_MUTEX_HINT 1" >>confdefs.h
fi
+ # Check if compiler accepts "i"(x) when __builtin_constant_p(x).
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether
__builtin_constant_p(x) implies \"i\"(x) acceptance" >&5
+$as_echo_n "checking whether __builtin_constant_p(x) implies \"i\"(x)
acceptance... " >&6; }
+if ${pgac_cv_have_i_constraint__builtin_constant_p+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+static inline int
+ addi(int ra, int si)
+ {
+ int res = 0;
+ if (__builtin_constant_p(si))
+ __asm__ __volatile__(
+ " addi %0,%1,%2\n" : "=r"(res) : "r"(ra), "i"(si));
+ return res;
+ }
+ int test_adds(int x) { return addi(3, x) + addi(x, 5); }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ pgac_cv_have_i_constraint__builtin_constant_p=yes
+else
+ pgac_cv_have_i_constraint__builtin_constant_p=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result:
$pgac_cv_have_i_constraint__builtin_constant_p" >&5
+$as_echo "$pgac_cv_have_i_constraint__builtin_constant_p" >&6; }
+ if test x"$pgac_cv_have_i_constraint__builtin_constant_p" = xyes ; then
+
+$as_echo "#define HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P 1" >>confdefs.h
+
+ fi
;;
esac
diff --git a/configure.in b/configure.in
index dde3eec..510bd76 100644
--- a/configure.in
+++ b/configure.in
@@ -1541,6 +1541,26 @@ case $host_cpu in
if test x"$pgac_cv_have_ppc_mutex_hint" = xyes ; then
AC_DEFINE(HAVE_PPC_LWARX_MUTEX_HINT, 1, [Define to 1 if the assembler
supports PPC's LWARX mutex hint bit.])
fi
+ # Check if compiler accepts "i"(x) when __builtin_constant_p(x).
+ AC_CACHE_CHECK([whether __builtin_constant_p(x) implies "i"(x) acceptance],
+ [pgac_cv_have_i_constraint__builtin_constant_p],
+ [AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+ [static inline int
+ addi(int ra, int si)
+ {
+ int res = 0;
+ if (__builtin_constant_p(si))
+ __asm__ __volatile__(
+ " addi %0,%1,%2\n" : "=r"(res) : "r"(ra), "i"(si));
+ return res;
+ }
+ int test_adds(int x) { return addi(3, x) + addi(x, 5); }], [])],
+ [pgac_cv_have_i_constraint__builtin_constant_p=yes],
+ [pgac_cv_have_i_constraint__builtin_constant_p=no])])
+ if test x"$pgac_cv_have_i_constraint__builtin_constant_p" = xyes ; then
+ AC_DEFINE(HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P, 1,
+ [Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance.])
+ fi
;;
esac
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 512213a..9be84fc 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -332,6 +332,9 @@
/* Define to 1 if you have isinf(). */
#undef HAVE_ISINF
+/* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */
+#undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+
/* Define to 1 if you have the <langinfo.h> header file. */
#undef HAVE_LANGINFO_H
diff --git a/src/include/port/atomics.h b/src/include/port/atomics.h
index f93009e..e4292eb 100644
--- a/src/include/port/atomics.h
+++ b/src/include/port/atomics.h
@@ -87,14 +87,11 @@
* using compiler intrinsics are a good idea.
*/
/*
- * Given a gcc-compatible xlc compiler, prefer the xlc implementation. The
- * ppc64le "IBM XL C/C++ for Linux, V13.1.2" implements both interfaces, but
- * __sync_lock_test_and_set() of one-byte types elicits SIGSEGV.
+ * gcc or compatible, including clang and icc. Exclude xlc. The ppc64le "IBM
+ * XL C/C++ for Linux, V13.1.2" emulates gcc, but __sync_lock_test_and_set()
+ * of one-byte types elicits SIGSEGV.
*/
-#if defined(__IBMC__) || defined(__IBMCPP__)
-#include "port/atomics/generic-xlc.h"
-/* gcc or compatible, including clang and icc */
-#elif defined(__GNUC__) || defined(__INTEL_COMPILER)
+#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !(defined(__IBMC__) || defined(__IBMCPP__))
#include "port/atomics/generic-gcc.h"
#elif defined(_MSC_VER)
#include "port/atomics/generic-msvc.h"
diff --git a/src/include/port/atomics/arch-ppc.h b/src/include/port/atomics/arch-ppc.h
index 344b394..f8e527b 100644
--- a/src/include/port/atomics/arch-ppc.h
+++ b/src/include/port/atomics/arch-ppc.h
@@ -25,5 +25,177 @@
#define pg_write_barrier_impl() __asm__ __volatile__ ("lwsync" : : : "memory")
#endif
+#define PG_HAVE_ATOMIC_U32_SUPPORT
+typedef struct pg_atomic_uint32
+{
+ volatile uint32 value;
+} pg_atomic_uint32;
+
+/* 64bit atomics are only supported in 64bit mode */
+#ifdef __64BIT__
+#define PG_HAVE_ATOMIC_U64_SUPPORT
+typedef struct pg_atomic_uint64
+{
+ volatile uint64 value pg_attribute_aligned(8);
+} pg_atomic_uint64;
+
+#endif /* __64BIT__ */
+
+/* Macros for building function bodies below. */
+#define PG_PPC_COMPARE_EXCHANGE(load_insn, cmp_insn, store_insn, \
+	expected_constraint) \
+ __asm__ __volatile__( \
+ " sync \n" \
+ " " load_insn " %0,0,%5 \n" \
+ " " cmp_insn " %0,%3 \n" \
+ " bne $+12 \n" /*
branch to isync */ \
+ " " store_insn " %4,0,%5 \n" \
+ " bne $-16 \n" /*
branch to load_insn */ \
+ " isync \n" \
+ " mfcr %1 \n" \
+: "=&r"(found), "=r"(condition_register), "+m"(ptr->value) \
+: expected_constraint(*expected), "r"(newval), "r"(&ptr->value) \
+: "memory", "cc")
+
+#define PG_PPC_FETCH_ADD(load_insn, add_insn, store_insn, add_constraint) \
+ __asm__ __volatile__( \
+ " sync \n" \
+ " " load_insn " %1,0,%4 \n" \
+ " " add_insn " %0,%1,%3 \n" \
+ " " store_insn " %0,0,%4 \n" \
+ " bne $-12 \n" /*
branch to load_insn */ \
+ " isync \n" \
+: "=&r"(_t), "=&r"(res), "+m"(ptr->value) \
+: add_constraint(add_), "r"(&ptr->value) \
+: "memory", "cc")
+
+/*
+ * This mimics gcc __atomic_compare_exchange_n(..., __ATOMIC_SEQ_CST), but
+ * code generation differs at the end. __atomic_compare_exchange_n():
+ * 100: isync
+ * 104: mfcr r3
+ * 108: rlwinm r3,r3,3,31,31
+ * 10c: bne 120 <.eb+0x10>
+ * 110: clrldi r3,r3,63
+ * 114: addi r1,r1,112
+ * 118: blr
+ * 11c: nop
+ * 120: clrldi r3,r3,63
+ * 124: stw r9,0(r4)
+ * 128: addi r1,r1,112
+ * 12c: blr
+ *
+ * This:
+ * f0: isync
+ * f4: mfcr r9
+ * f8: rldicl. r3,r9,35,63
+ * fc: bne 104 <.eb>
+ * 100: stw r10,0(r4)
+ * 104: addi r1,r1,112
+ * 108: blr
+ *
+ * This implementation may or may not have materially different performance.
+ * It's not exploiting the fact that cr0 still holds the relevant comparison
+ * bits, set during the __asm__. One could fix that by moving more code into
+ * the __asm__. (That would remove the freedom to eliminate dead stores when
+ * the caller ignores "expected", but few callers do.)
+ *
+ * The cmpwi variant may be dead code. In gcc 7.2.0,
+ * __builtin_constant_p(*expected) always reports false.
+ * __atomic_compare_exchange_n() does use cmpwi when its second argument
+ * points to a constant. Hence, using this instead of
+ * __atomic_compare_exchange_n() nominally penalizes the generic.h
+ * pg_atomic_test_set_flag_impl(). Modern GCC will use the generic-gcc.h
+ * version, making the penalty theoretical only.
+ *
+ * Recognizing constant "newval" would be superfluous, because there's no
+ * immediate-operand version of stwcx.
+ */
+#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
+static inline bool
+pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
+ uint32 *expected, uint32 newval)
+{
+ uint32 found;
+ uint32 condition_register;
+ bool ret;
+
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+ if (__builtin_constant_p(*expected) &&
+ *expected <= PG_INT16_MAX && *expected >= PG_INT16_MIN)
+ PG_PPC_COMPARE_EXCHANGE("lwarx", "cmpwi", "stwcx.", "i");
+ else
+#endif
+ PG_PPC_COMPARE_EXCHANGE("lwarx", "cmpw", "stwcx.", "r");
+
+ ret = (condition_register >> 29) & 1; /* test eq bit of cr0 */
+ if (!ret)
+ *expected = found;
+ return ret;
+}
+
+#define PG_HAVE_ATOMIC_FETCH_ADD_U32
+static inline uint32
+pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
+{
+ uint32 _t;
+ uint32 res;
+
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+ if (__builtin_constant_p(add_) &&
+ add_ <= PG_INT16_MAX && add_ >= PG_INT16_MIN)
+ PG_PPC_FETCH_ADD("lwarx", "addi", "stwcx.", "i");
+ else
+#endif
+ PG_PPC_FETCH_ADD("lwarx", "add", "stwcx.", "r");
+
+ return res;
+}
+
+#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
+
+#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
+static inline bool
+pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
+ uint64 *expected, uint64 newval)
+{
+ uint64 found;
+ uint32 condition_register;
+ bool ret;
+
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+ if (__builtin_constant_p(*expected) &&
+ *expected <= PG_INT16_MAX && *expected >= PG_INT16_MIN)
+ PG_PPC_COMPARE_EXCHANGE("ldarx", "cmpdi", "stdcx.", "i");
+ else
+#endif
+ PG_PPC_COMPARE_EXCHANGE("ldarx", "cmpd", "stdcx.", "r");
+
+ ret = (condition_register >> 29) & 1; /* test eq bit of cr0 */
+ if (!ret)
+ *expected = found;
+ return ret;
+}
+
+#define PG_HAVE_ATOMIC_FETCH_ADD_U64
+static inline uint64
+pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
+{
+ uint64 _t;
+ uint64 res;
+
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+ if (__builtin_constant_p(add_) &&
+ add_ <= PG_INT16_MAX && add_ >= PG_INT16_MIN)
+ PG_PPC_FETCH_ADD("ldarx", "addi", "stdcx.", "i");
+ else
+#endif
+ PG_PPC_FETCH_ADD("ldarx", "add", "stdcx.", "r");
+
+ return res;
+}
+
+#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
+
/* per architecture manual doubleword accesses have single copy atomicity */
#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
diff --git a/src/include/port/atomics/generic-xlc.h b/src/include/port/atomics/generic-xlc.h
deleted file mode 100644
index 8b5c732..0000000
--- a/src/include/port/atomics/generic-xlc.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * generic-xlc.h
- * Atomic operations for IBM's CC
- *
- * Portions Copyright (c) 2013-2019, PostgreSQL Global Development Group
- *
- * NOTES:
- *
- * Documentation:
- * * Synchronization and atomic built-in functions
- *
http://www-01.ibm.com/support/knowledgecenter/SSGH3R_13.1.2/com.ibm.xlcpp131.aix.doc/compiler_ref/bifs_sync_atomic.html
- *
- * src/include/port/atomics/generic-xlc.h
- *
- * -------------------------------------------------------------------------
- */
-
-#if defined(HAVE_ATOMICS)
-
-#define PG_HAVE_ATOMIC_U32_SUPPORT
-typedef struct pg_atomic_uint32
-{
- volatile uint32 value;
-} pg_atomic_uint32;
-
-
-/* 64bit atomics are only supported in 64bit mode */
-#ifdef __64BIT__
-#define PG_HAVE_ATOMIC_U64_SUPPORT
-typedef struct pg_atomic_uint64
-{
- volatile uint64 value pg_attribute_aligned(8);
-} pg_atomic_uint64;
-
-#endif /* __64BIT__ */
-
-#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
-static inline bool
-pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
- uint32
*expected, uint32 newval)
-{
- bool ret;
-
- /*
- * atomics.h specifies sequential consistency ("full barrier semantics")
- * for this interface. Since "lwsync" provides acquire/release
- * consistency only, do not use it here. GCC atomics observe the same
- * restriction; see its rs6000_pre_atomic_barrier().
- */
- __asm__ __volatile__ (" sync \n" ::: "memory");
-
- /*
- * XXX: __compare_and_swap is defined to take signed parameters, but that
- * shouldn't matter since we don't perform any arithmetic operations.
- */
- ret = __compare_and_swap((volatile int*)&ptr->value,
- (int *)expected, (int)newval);
-
- /*
- * xlc's documentation tells us:
- * "If __compare_and_swap is used as a locking primitive, insert a call
to
- * the __isync built-in function at the start of any critical sections."
- *
- * The critical section begins immediately after __compare_and_swap().
- */
- __isync();
-
- return ret;
-}
-
-#define PG_HAVE_ATOMIC_FETCH_ADD_U32
-static inline uint32
-pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
-{
- uint32 _t;
- uint32 res;
-
- /*
- * xlc has a no-longer-documented __fetch_and_add() intrinsic. In xlc
- * 12.01.0000.0000, it emits a leading "sync" and trailing "isync". In
- * xlc 13.01.0003.0004, it emits neither. Hence, using the intrinsic
- * would add redundant syncs on xlc 12.
- */
- __asm__ __volatile__(
- " sync \n"
- " lwarx %1,0,%4 \n"
- " add %0,%1,%3 \n"
- " stwcx. %0,0,%4 \n"
- " bne $-12 \n" /* branch to
lwarx */
- " isync \n"
-: "=&r"(_t), "=&r"(res), "+m"(ptr->value)
-: "r"(add_), "r"(&ptr->value)
-: "memory", "cc");
-
- return res;
-}
-
-#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
-
-#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
-static inline bool
-pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
- uint64 *expected, uint64 newval)
-{
- bool ret;
-
- __asm__ __volatile__ (" sync \n" ::: "memory");
-
- ret = __compare_and_swaplp((volatile long*)&ptr->value,
- (long *)expected, (long)newval);
-
- __isync();
-
- return ret;
-}
-
-#define PG_HAVE_ATOMIC_FETCH_ADD_U64
-static inline uint64
-pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
-{
- uint64 _t;
- uint64 res;
-
- /* Like u32, but s/lwarx/ldarx/; s/stwcx/stdcx/ */
- __asm__ __volatile__(
- " sync \n"
- " ldarx %1,0,%4 \n"
- " add %0,%1,%3 \n"
- " stdcx. %0,0,%4 \n"
- " bne $-12 \n" /* branch to
ldarx */
- " isync \n"
-: "=&r"(_t), "=&r"(res), "+m"(ptr->value)
-: "r"(add_), "r"(&ptr->value)
-: "memory", "cc");
-
- return res;
-}
-
-#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
-
-#endif /* defined(HAVE_ATOMICS) */
diff --git a/src/tools/pginclude/cpluspluscheck b/src/tools/pginclude/cpluspluscheck
index b2060f3..1a35016 100755
--- a/src/tools/pginclude/cpluspluscheck
+++ b/src/tools/pginclude/cpluspluscheck
@@ -84,7 +84,6 @@ do
test "$f" = src/include/port/atomics/generic-gcc.h && continue
test "$f" = src/include/port/atomics/generic-msvc.h && continue
test "$f" = src/include/port/atomics/generic-sunpro.h && continue
- test "$f" = src/include/port/atomics/generic-xlc.h && continue
# rusagestub.h is also platform-specific, and will be included
# by utils/pg_rusage.h if necessary.