On 2019-Feb-14, Tom Lane wrote:

> Some further thoughts here ...
> 
> Does the "lzcnt" runtime probe actually do anything useful?
> On the x86_64 compilers I tried (gcc 8.2.1 and 4.4.7), __builtin_clz
> and __builtin_ctz compile to sequences involving bsrq and bsfq
> regardless of -mpopcnt.  It's fairly hard to see how lzcnt would
> buy anything over those sequences even if there were zero overhead
> involved in using it.

Hah, I just realized you have to add -mlzcnt in order for these builtins
to use the lzcnt instructions.  It goes from something like

        bsrq    %rax, %rax
        xorq    $63, %rax

to
        lzcntq  %rax, %rax

Significant?

I have this patch now, written before I realized the above; I'll augment
it to cater to this (adding -mlzcnt and a new set of functions --
perhaps a new file "lzcnt.c" or maybe put the lot in pg_popcount.c and
rename it?) and resubmit after an errand I have to run now.

-- 
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
commit 477cc89802effed5d80d4a82891c7ef3b5e61e63
Author:     Alvaro Herrera <alvhe...@alvh.no-ip.org>
AuthorDate: Thu Feb 14 15:18:03 2019 -0300
CommitDate: Thu Feb 14 15:31:32 2019 -0300

    fix popcount etc

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 148c5550573..53787a7ef32 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -23,4 +23,8 @@ extern int (*pg_leftmost_one64) (uint64 word);
 
 extern uint64 pg_popcount(const char *buf, int bytes);
 
+/* in pg_popcount.c */
+extern int pg_popcount32_sse42(uint32 word);
+extern int pg_popcount64_sse42(uint64 word);
+
 #endif							/* PG_BITUTILS_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index 2da73260a13..d7290573c65 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -41,6 +41,13 @@ OBJS = $(LIBOBJS) $(PG_CRC32C_OBJS) chklocale.o erand48.o inet_net_ntop.o \
 	qsort.o qsort_arg.o quotes.o snprintf.o sprompt.o strerror.o \
 	tar.o thread.o
 
+# If the compiler supports a flag for the POPCOUNT instruction, we compile
+# pg_popcount.o with it.  (Whether to actually use the functions therein is
+# determined at runtime by testing CPUID flags.)
+ifneq ($(CFLAGS_POPCNT),)
+OBJS += pg_popcount.o
+endif
+
 # libpgport.a, libpgport_shlib.a, and libpgport_srv.a contain the same files
 # foo.o, foo_shlib.o, and foo_srv.o are all built from foo.c
 OBJS_SHLIB = $(OBJS:%.o=%_shlib.o)
@@ -78,10 +85,10 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C)
 
-# pg_bitutils.c needs CFLAGS_POPCNT
-pg_bitutils.o: CFLAGS+=$(CFLAGS_POPCNT)
-pg_bitutils_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
-pg_bitutils_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
+# pg_popcount.c needs CFLAGS_POPCNT
+pg_popcount.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
 
 #
 # Shared library versions of object files
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index aac394fe927..fc8de518791 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -10,7 +10,6 @@
  *
  *-------------------------------------------------------------------------
  */
-
 #include "postgres.h"
 
 #ifdef HAVE__GET_CPUID
@@ -23,61 +22,52 @@
 
 #include "port/pg_bitutils.h"
 
-#if defined(HAVE__BUILTIN_POPCOUNT) && defined(HAVE__GET_CPUID)
+#ifdef HAVE__BUILTIN_POPCOUNT
 static bool pg_popcount_available(void);
 static int pg_popcount32_choose(uint32 word);
-static int pg_popcount32_sse42(uint32 word);
+static int pg_popcount32_builtin(uint32 word);
 static int pg_popcount64_choose(uint64 word);
-static int pg_popcount64_sse42(uint64 word);
-#endif
-static int pg_popcount32_slow(uint32 word);
-static int pg_popcount64_slow(uint64 word);
-
-#if defined(HAVE__GET_CPUID) && (defined(HAVE__BUILTIN_CTZ) || defined(HAVE__BUILTIN_CLZ))
-static bool pg_lzcnt_available(void);
-#endif
-
-#if defined(HAVE__BUILTIN_CTZ) && defined(HAVE__GET_CPUID)
-static int pg_rightmost_one32_choose(uint32 word);
-static int pg_rightmost_one32_abm(uint32 word);
-static int pg_rightmost_one64_choose(uint64 word);
-static int pg_rightmost_one64_abm(uint64 word);
-#endif
-static int pg_rightmost_one32_slow(uint32 word);
-static int pg_rightmost_one64_slow(uint64 word);
-
-#if defined(HAVE__BUILTIN_CLZ) && defined(HAVE__GET_CPUID)
-static int pg_leftmost_one32_choose(uint32 word);
-static int pg_leftmost_one32_abm(uint32 word);
-static int pg_leftmost_one64_choose(uint64 word);
-static int pg_leftmost_one64_abm(uint64 word);
-#endif
-static int pg_leftmost_one32_slow(uint32 word);
-static int pg_leftmost_one64_slow(uint64 word);
-
-#if defined(HAVE__BUILTIN_POPCOUNT) && defined(HAVE__GET_CPUID)
+static int pg_popcount64_builtin(uint64 word);
 int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 #else
+static int pg_popcount32_slow(uint32 word);
+static int pg_popcount64_slow(uint64 word);
 int (*pg_popcount32) (uint32 word) = pg_popcount32_slow;
 int (*pg_popcount64) (uint64 word) = pg_popcount64_slow;
 #endif
 
-#if defined(HAVE__BUILTIN_CTZ) && defined(HAVE__GET_CPUID)
+#if defined(HAVE__BUILTIN_CTZ) || defined(HAVE__BUILTIN_CLZ)
+static bool pg_lzcnt_available(void);
+#endif
+
+#ifdef HAVE__BUILTIN_CTZ
+static int pg_rightmost_one32_choose(uint32 word);
+static int pg_rightmost_one32_abm(uint32 word);
+static int pg_rightmost_one64_choose(uint64 word);
+static int pg_rightmost_one64_abm(uint64 word);
 int (*pg_rightmost_one32) (uint32 word) = pg_rightmost_one32_choose;
 int (*pg_rightmost_one64) (uint64 word) = pg_rightmost_one64_choose;
 #else
 int (*pg_rightmost_one32) (uint32 word) = pg_rightmost_one32_slow;
 int (*pg_rightmost_one64) (uint64 word) = pg_rightmost_one64_slow;
 #endif
+static int pg_rightmost_one32_slow(uint32 word);
+static int pg_rightmost_one64_slow(uint64 word);
 
-#if defined(HAVE__BUILTIN_CLZ) && defined(HAVE__GET_CPUID)
+#ifdef HAVE__BUILTIN_CLZ
+static int pg_leftmost_one32_choose(uint32 word);
+static int pg_leftmost_one32_abm(uint32 word);
+static int pg_leftmost_one64_choose(uint64 word);
+static int pg_leftmost_one64_abm(uint64 word);
 int (*pg_leftmost_one32) (uint32 word) = pg_leftmost_one32_choose;
 int (*pg_leftmost_one64) (uint64 word) = pg_leftmost_one64_choose;
 #else
 int (*pg_leftmost_one32) (uint32 word) = pg_leftmost_one32_slow;
 int (*pg_leftmost_one64) (uint64 word) = pg_leftmost_one64_slow;
 #endif
+static int pg_leftmost_one32_slow(uint32 word);
+static int pg_leftmost_one64_slow(uint64 word);
 
 /* Array marking the number of 1-bits for each value of 0-255. */
 static const uint8 number_of_ones[256] = {
@@ -147,27 +137,27 @@ static const uint8 leftmost_one_pos[256] = {
 	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
 };
 
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT)
 
 static bool
 pg_popcount_available(void)
 {
+#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
 	unsigned int exx[4] = { 0, 0, 0, 0 };
 
 #if defined(HAVE__GET_CPUID)
 	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
 #elif defined(HAVE__CPUID)
 	__cpuid(exx, 1);
-#else
-#error cpuid instruction not available
 #endif
 
 	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
-}
+#else		/* HAVE__GET_CPUID || HAVE__CPUID */
+
+	return false;
 #endif
+}
 
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT)
-
+#ifdef HAVE__BUILTIN_POPCOUNT
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -178,18 +168,19 @@ pg_popcount32_choose(uint32 word)
 	if (pg_popcount_available())
 		pg_popcount32 = pg_popcount32_sse42;
 	else
-		pg_popcount32 = pg_popcount32_slow;
+		pg_popcount32 = pg_popcount32_builtin;
 
 	return pg_popcount32(word);
 }
 
 static int
-pg_popcount32_sse42(uint32 word)
+pg_popcount32_builtin(uint32 word)
 {
 	return __builtin_popcount(word);
 }
 #endif
 
+#ifndef HAVE__BUILTIN_POPCOUNT
 /*
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
@@ -207,6 +198,7 @@ pg_popcount32_slow(uint32 word)
 
 	return result;
 }
+#endif
 
 /*
  * pg_popcount
@@ -254,8 +246,7 @@ pg_popcount(const char *buf, int bytes)
 	return popcnt;
 }
 
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT)
-
+#ifdef HAVE__BUILTIN_POPCOUNT
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -266,26 +257,26 @@ pg_popcount64_choose(uint64 word)
 	if (pg_popcount_available())
 		pg_popcount64 = pg_popcount64_sse42;
 	else
-		pg_popcount64 = pg_popcount64_slow;
+		pg_popcount64 = pg_popcount64_builtin;
 
 	return pg_popcount64(word);
 }
 
 static int
-pg_popcount64_sse42(uint64 word)
+pg_popcount64_builtin(uint64 word)
 {
 #if defined(HAVE_LONG_INT_64)
 	return __builtin_popcountl(word);
 #elif defined(HAVE_LONG_LONG_INT_64)
 	return __builtin_popcountll(word);
 #else
-	/* shouldn't happen */
 #error must have a working 64-bit integer datatype
 #endif
 }
 
-#endif
+#endif		/* HAVE__BUILTIN_POPCOUNT */
 
+#ifndef HAVE__BUILTIN_POPCOUNT
 /*
  * pg_popcount64_slow
  *		Return the number of 1 bits set in word
@@ -303,28 +294,28 @@ pg_popcount64_slow(uint64 word)
 
 	return result;
 }
-
-#if defined(HAVE__GET_CPUID) && (defined(HAVE__BUILTIN_CTZ) || defined(HAVE__BUILTIN_CLZ))
+#endif
 
 static bool
 pg_lzcnt_available(void)
 {
-
+#if (defined(HAVE__GET_CPUID) || defined(HAVE__CPUID))
 	unsigned int exx[4] = { 0, 0, 0, 0 };
 
 #if defined(HAVE__GET_CPUID)
 	__get_cpuid(0x80000001, &exx[0], &exx[1], &exx[2], &exx[3]);
 #elif defined(HAVE__CPUID)
 	__cpuid(exx, 0x80000001);
-#else
-#error cpuid instruction not available
 #endif
 
 	return (exx[2] & (1 << 5)) != 0;	/* LZCNT */
-}
-#endif
+#else		/* HAVE__GET_CPUID || HAVE__CPUID */
 
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CTZ)
+	return false;
+#endif
+}
+
+#ifdef HAVE__BUILTIN_CTZ
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -346,7 +337,7 @@ pg_rightmost_one32_abm(uint32 word)
 	return __builtin_ctz(word);
 }
 
-#endif
+#endif		/* HAVE__BUILTIN_CTZ */
 
 /*
  * pg_rightmost_one32_slow
@@ -370,7 +361,7 @@ pg_rightmost_one32_slow(uint32 word)
 	return result;
 }
 
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CTZ)
+#ifdef HAVE__BUILTIN_CTZ
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -394,11 +385,10 @@ pg_rightmost_one64_abm(uint64 word)
 #elif defined(HAVE_LONG_LONG_INT_64)
 	return __builtin_ctzll(word);
 #else
-	/* shouldn't happen */
 #error must have a working 64-bit integer datatype
 #endif
 }
-#endif
+#endif		/* HAVE_BUILTIN_CTZ */
 
 /*
  * pg_rightmost_one64_slow
@@ -422,7 +412,7 @@ pg_rightmost_one64_slow(uint64 word)
 	return result;
 }
 
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CLZ)
+#ifdef HAVE__BUILTIN_CLZ
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -443,7 +433,7 @@ pg_leftmost_one32_abm(uint32 word)
 {
 	return 31 - __builtin_clz(word);
 }
-#endif
+#endif		/* HAVE__BUILTIN_CLZ */
 
 /*
  * pg_leftmost_one32_slow
@@ -463,7 +453,7 @@ pg_leftmost_one32_slow(uint32 word)
 	return shift + leftmost_one_pos[(word >> shift) & 255];
 }
 
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CLZ)
+#ifdef HAVE__BUILTIN_CLZ
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -487,12 +477,10 @@ pg_leftmost_one64_abm(uint64 word)
 #elif defined(HAVE_LONG_LONG_INT_64)
 	return 63 - __builtin_clzll(word);
 #else
-	/* shouldn't happen */
 #error must have a working 64-bit integer datatype
 #endif
-
 }
-#endif
+#endif		/* HAVE__BUIILTIN_CLZ */
 
 /*
  * pg_leftmost_one64_slow
diff --git a/src/port/pg_popcount.c b/src/port/pg_popcount.c
new file mode 100644
index 00000000000..5254c41273f
--- /dev/null
+++ b/src/port/pg_popcount.c
@@ -0,0 +1,36 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount.c
+ *	  CPU-optimized implementation of pg_popcount
+ *
+ * This file must be compiled with a compiler-specific flag to enable the
+ * POPCOUNT instruction.
+ *
+ * Portions Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "port/pg_bitutils.h"
+
+int
+pg_popcount32_sse42(uint32 word)
+{
+	return __builtin_popcount(word);
+}
+
+int
+pg_popcount64_sse42(uint64 word)
+{
+#if defined(HAVE_LONG_INT_64)
+	return __builtin_popcountl(word);
+#elif defined(HAVE_LONG_LONG_INT_64)
+	return __builtin_popcountll(word);
+#else
+#error must have a working 64-bit integer datatype
+#endif
+}

Reply via email to