On 2019-Feb-14, Tom Lane wrote: > I'd bet a fair amount of money that we'd be better off *not* using > lzcnt, even if available, because then we could just expose things > along this line: > > static inline int > pg_clz(...) > { > #ifdef HAVE__BUILTIN_CLZ > return __builtin_clz(x); > #else > handwritten implementation; > #endif > } > > Avoiding a function call (that has to indirect through a pointer) probably > saves much more than the difference between lzcnt and the other way.
I see ... makes sense. That leads me to the attached patch. It creates a new file pg_popcount.c which is the only one compiled with -mpopcnt (if available); if there's no compiler switch to enable POPCNT, we just don't compile the file. I'm not sure that's kosher -- in particular I'm not sure if it can fail when POPCNT is enabled by other flags and -mpopcnt is not needed at all. I think our c-compiler.m4 stuff is a bit too simplistic there: it just assumes that -mpopcnt is always required. But what if the user passes it in CFLAGS? I left CPUID alone for the CLZ/CTZ builtins. So we either use the table, or the builtins. We never try the instructions. -- Álvaro Herrera https://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
commit 6c771a4f43da0409ae5fa9ff1b1579f381c451c0 Author: Alvaro Herrera <alvhe...@alvh.no-ip.org> AuthorDate: Thu Feb 14 15:18:03 2019 -0300 CommitDate: Thu Feb 14 19:24:03 2019 -0300 fix popcount etc diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 148c5550573..635eb9331af 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -10,17 +10,20 @@ * *------------------------------------------------------------------------ - */ - #ifndef PG_BITUTILS_H #define PG_BITUTILS_H -extern int (*pg_popcount32) (uint32 word); -extern int (*pg_popcount64) (uint64 word); -extern int (*pg_rightmost_one32) (uint32 word); -extern int (*pg_rightmost_one64) (uint64 word); -extern int (*pg_leftmost_one32) (uint32 word); -extern int (*pg_leftmost_one64) (uint64 word); +extern int pg_rightmost_one32(uint32 word); +extern int pg_rightmost_one64(uint64 word); +extern int pg_leftmost_one32(uint32 word); +extern int pg_leftmost_one64(uint64 word); +extern int (*pg_popcount32) (uint32 word); +extern int (*pg_popcount64) (uint64 word); extern uint64 pg_popcount(const char *buf, int bytes); +/* in pg_popcount.c */ +extern int pg_popcount32_sse42(uint32 word); +extern int pg_popcount64_sse42(uint64 word); + #endif /* PG_BITUTILS_H */ diff --git a/src/port/Makefile b/src/port/Makefile index 2da73260a13..d7290573c65 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -41,6 +41,13 @@ OBJS = $(LIBOBJS) $(PG_CRC32C_OBJS) chklocale.o erand48.o inet_net_ntop.o \ qsort.o qsort_arg.o quotes.o snprintf.o sprompt.o strerror.o \ tar.o thread.o +# If the compiler supports a flag for the POPCOUNT instruction, we compile +# pg_popcount.o with it. (Whether to actually use the functions therein is +# determined at runtime by testing CPUID flags.) +ifneq ($(CFLAGS_POPCNT),) +OBJS += pg_popcount.o +endif + # libpgport.a, libpgport_shlib.a, and libpgport_srv.a contain the same files # foo.o, foo_shlib.o, and foo_srv.o are all built from foo.c OBJS_SHLIB = $(OBJS:%.o=%_shlib.o) @@ -78,10 +85,10 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C) -# pg_bitutils.c needs CFLAGS_POPCNT -pg_bitutils.o: CFLAGS+=$(CFLAGS_POPCNT) -pg_bitutils_shlib.o: CFLAGS+=$(CFLAGS_POPCNT) -pg_bitutils_srv.o: CFLAGS+=$(CFLAGS_POPCNT) +# pg_popcount.c needs CFLAGS_POPCNT +pg_popcount.o: CFLAGS+=$(CFLAGS_POPCNT) +pg_popcount_shlib.o: CFLAGS+=$(CFLAGS_POPCNT) +pg_popcount_srv.o: CFLAGS+=$(CFLAGS_POPCNT) # # Shared library versions of object files diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index aac394fe927..23d317d111e 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -10,7 +10,6 @@ * *------------------------------------------------------------------------- */ - #include "postgres.h" #ifdef HAVE__GET_CPUID @@ -23,61 +22,21 @@ #include "port/pg_bitutils.h" -#if defined(HAVE__BUILTIN_POPCOUNT) && defined(HAVE__GET_CPUID) +#ifdef HAVE__BUILTIN_POPCOUNT static bool pg_popcount_available(void); -static int pg_popcount32_choose(uint32 word); -static int pg_popcount32_sse42(uint32 word); -static int pg_popcount64_choose(uint64 word); -static int pg_popcount64_sse42(uint64 word); -#endif -static int pg_popcount32_slow(uint32 word); -static int pg_popcount64_slow(uint64 word); - -#if defined(HAVE__GET_CPUID) && (defined(HAVE__BUILTIN_CTZ) || defined(HAVE__BUILTIN_CLZ)) -static bool pg_lzcnt_available(void); -#endif - -#if defined(HAVE__BUILTIN_CTZ) && defined(HAVE__GET_CPUID) -static int pg_rightmost_one32_choose(uint32 word); -static int pg_rightmost_one32_abm(uint32 word); -static int pg_rightmost_one64_choose(uint64 word); -static int pg_rightmost_one64_abm(uint64 word); -#endif -static int pg_rightmost_one32_slow(uint32 word); -static int pg_rightmost_one64_slow(uint64 word); - -#if defined(HAVE__BUILTIN_CLZ) && defined(HAVE__GET_CPUID) -static int pg_leftmost_one32_choose(uint32 word); -static int pg_leftmost_one32_abm(uint32 word); -static int pg_leftmost_one64_choose(uint64 word); -static int pg_leftmost_one64_abm(uint64 word); -#endif -static int pg_leftmost_one32_slow(uint32 word); -static int pg_leftmost_one64_slow(uint64 word); - -#if defined(HAVE__BUILTIN_POPCOUNT) && defined(HAVE__GET_CPUID) -int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; -int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; +static int pg_popcount32_choose(uint32 word); +static int pg_popcount32_builtin(uint32 word); +static int pg_popcount64_choose(uint64 word); +static int pg_popcount64_builtin(uint64 word); +int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; +int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; #else -int (*pg_popcount32) (uint32 word) = pg_popcount32_slow; -int (*pg_popcount64) (uint64 word) = pg_popcount64_slow; -#endif +static int pg_popcount32_slow(uint32 word); +static int pg_popcount64_slow(uint64 word); +int (*pg_popcount32) (uint32 word) = pg_popcount32_slow; +int (*pg_popcount64) (uint64 word) = pg_popcount64_slow; +#endif /* !HAVE_BUILTIN_POPCOUNT */ -#if defined(HAVE__BUILTIN_CTZ) && defined(HAVE__GET_CPUID) -int (*pg_rightmost_one32) (uint32 word) = pg_rightmost_one32_choose; -int (*pg_rightmost_one64) (uint64 word) = pg_rightmost_one64_choose; -#else -int (*pg_rightmost_one32) (uint32 word) = pg_rightmost_one32_slow; -int (*pg_rightmost_one64) (uint64 word) = pg_rightmost_one64_slow; -#endif - -#if defined(HAVE__BUILTIN_CLZ) && defined(HAVE__GET_CPUID) -int (*pg_leftmost_one32) (uint32 word) = pg_leftmost_one32_choose; -int (*pg_leftmost_one64) (uint64 word) = pg_leftmost_one64_choose; -#else -int (*pg_leftmost_one32) (uint32 word) = pg_leftmost_one32_slow; -int (*pg_leftmost_one64) (uint64 word) = pg_leftmost_one64_slow; -#endif /* Array marking the number of 1-bits for each value of 0-255. */ static const uint8 number_of_ones[256] = { @@ -99,6 +58,7 @@ static const uint8 number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; +#ifndef HAVE__BUILTIN_CTZ /* * Array marking the position of the right-most set bit for each value of * 1-255. We count the right-most position as the 0th bit, and the @@ -122,7 +82,9 @@ static const uint8 rightmost_one_pos[256] = { 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 }; +#endif /* !HAVE__BUILTIN_CTZ */ +#ifndef HAVE__BUILTIN_CLZ /* * Array marking the position of the left-most set bit for each value of * 1-255. We count the right-most position as the 0th bit, and the @@ -146,31 +108,36 @@ static const uint8 leftmost_one_pos[256] = { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 }; +#endif /* !HAVE_BUILTIN_CLZ */ -#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT) - +/* + * Return true iff we have CPUID support and it indicates that the POPCNT + * instruction is available. + */ static bool pg_popcount_available(void) { - unsigned int exx[4] = { 0, 0, 0, 0 }; +#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID) + unsigned int exx[4] = {0, 0, 0, 0}; #if defined(HAVE__GET_CPUID) __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUID) __cpuid(exx, 1); -#else -#error cpuid instruction not available #endif return (exx[2] & (1 << 23)) != 0; /* POPCNT */ -} +#else /* HAVE__GET_CPUID || HAVE__CPUID */ + + return false; #endif +} -#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT) - +#ifdef HAVE__BUILTIN_POPCOUNT /* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. + * This gets called on the first call to pg_popcount32. It replaces the + * function pointer so that subsequent calls are routed directly to the chosen + * implementation. */ static int pg_popcount32_choose(uint32 word) @@ -178,18 +145,17 @@ pg_popcount32_choose(uint32 word) if (pg_popcount_available()) pg_popcount32 = pg_popcount32_sse42; else - pg_popcount32 = pg_popcount32_slow; + pg_popcount32 = pg_popcount32_builtin; return pg_popcount32(word); } static int -pg_popcount32_sse42(uint32 word) +pg_popcount32_builtin(uint32 word) { return __builtin_popcount(word); } -#endif - +#else /* HAVE__BUILTIN_POPCOUNT */ /* * pg_popcount32_slow * Return the number of 1 bits set in word @@ -197,7 +163,7 @@ pg_popcount32_sse42(uint32 word) static int pg_popcount32_slow(uint32 word) { - int result = 0; + int result = 0; while (word != 0) { @@ -207,6 +173,7 @@ pg_popcount32_slow(uint32 word) return result; } +#endif /* * pg_popcount @@ -215,13 +182,13 @@ pg_popcount32_slow(uint32 word) uint64 pg_popcount(const char *buf, int bytes) { - uint64 popcnt = 0; + uint64 popcnt = 0; #if SIZEOF_VOID_P >= 8 /* Process in 64-bit chunks if the buffer is aligned. */ if (buf == (char *) TYPEALIGN(8, buf)) { - uint64 *words = (uint64 *) buf; + uint64 *words = (uint64 *) buf; while (bytes >= 8) { @@ -235,7 +202,7 @@ pg_popcount(const char *buf, int bytes) /* Process in 32-bit chunks if the buffer is aligned. */ if (buf == (char *) TYPEALIGN(4, buf)) { - uint32 *words = (uint32 *) buf; + uint32 *words = (uint32 *) buf; while (bytes >= 4) { @@ -254,11 +221,11 @@ pg_popcount(const char *buf, int bytes) return popcnt; } -#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT) - +#ifdef HAVE__BUILTIN_POPCOUNT /* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. + * This gets called on the first call to pg_popcount64. It replaces the + * function pointer so that subsequent calls are routed directly to the chosen + * implementation. */ static int pg_popcount64_choose(uint64 word) @@ -266,26 +233,24 @@ pg_popcount64_choose(uint64 word) if (pg_popcount_available()) pg_popcount64 = pg_popcount64_sse42; else - pg_popcount64 = pg_popcount64_slow; + pg_popcount64 = pg_popcount64_builtin; return pg_popcount64(word); } static int -pg_popcount64_sse42(uint64 word) +pg_popcount64_builtin(uint64 word) { #if defined(HAVE_LONG_INT_64) return __builtin_popcountl(word); #elif defined(HAVE_LONG_LONG_INT_64) return __builtin_popcountll(word); #else - /* shouldn't happen */ #error must have a working 64-bit integer datatype #endif } -#endif - +#else /* HAVE__BUILTIN_POPCOUNT */ /* * pg_popcount64_slow * Return the number of 1 bits set in word @@ -293,7 +258,7 @@ pg_popcount64_sse42(uint64 word) static int pg_popcount64_slow(uint64 word) { - int result = 0; + int result = 0; while (word != 0) { @@ -303,156 +268,77 @@ pg_popcount64_slow(uint64 word) return result; } - -#if defined(HAVE__GET_CPUID) && (defined(HAVE__BUILTIN_CTZ) || defined(HAVE__BUILTIN_CLZ)) - -static bool -pg_lzcnt_available(void) -{ - - unsigned int exx[4] = { 0, 0, 0, 0 }; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(0x80000001, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 0x80000001); -#else -#error cpuid instruction not available -#endif - - return (exx[2] & (1 << 5)) != 0; /* LZCNT */ -} -#endif - -#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CTZ) -/* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. - */ -static int -pg_rightmost_one32_choose(uint32 word) -{ - if (pg_lzcnt_available()) - pg_rightmost_one32 = pg_rightmost_one32_abm; - else - pg_rightmost_one32 = pg_rightmost_one32_slow; - - return pg_rightmost_one32(word); -} - -static int -pg_rightmost_one32_abm(uint32 word) -{ - return __builtin_ctz(word); -} - #endif /* - * pg_rightmost_one32_slow + * pg_rightmost_one32 * Returns the number of trailing 0-bits in word, starting at the least * significant bit position. word must not be 0. */ -static int -pg_rightmost_one32_slow(uint32 word) +int +pg_rightmost_one32(uint32 word) { - int result = 0; + int result = 0; Assert(word != 0); +#ifdef HAVE__BUILTIN_CTZ + result = __builtin_ctz(word); +#else while ((word & 255) == 0) { word >>= 8; result += 8; } result += rightmost_one_pos[word & 255]; +#endif /* HAVE__BUILTIN_CTZ */ return result; } -#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CTZ) /* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. + * pg_rightmost_one64 + * Returns the number of trailing 0-bits in word, starting at the least + * significant bit position. word must not be 0. */ -static int -pg_rightmost_one64_choose(uint64 word) +int +pg_rightmost_one64(uint64 word) { - if (pg_lzcnt_available()) - pg_rightmost_one64 = pg_rightmost_one64_abm; - else - pg_rightmost_one64 = pg_rightmost_one64_slow; + int result = 0; - return pg_rightmost_one64(word); -} + Assert(word != 0); -static int -pg_rightmost_one64_abm(uint64 word) -{ +#ifdef HAVE__BUILTIN_CTZ #if defined(HAVE_LONG_INT_64) return __builtin_ctzl(word); #elif defined(HAVE_LONG_LONG_INT_64) return __builtin_ctzll(word); #else - /* shouldn't happen */ #error must have a working 64-bit integer datatype #endif -} -#endif - -/* - * pg_rightmost_one64_slow - * Returns the number of trailing 0-bits in word, starting at the least - * significant bit position. word must not be 0. - */ -static int -pg_rightmost_one64_slow(uint64 word) -{ - int result = 0; - - Assert(word != 0); - +#else /* HAVE__BUILTIN_CTZ */ while ((word & 255) == 0) { word >>= 8; result += 8; } result += rightmost_one_pos[word & 255]; +#endif return result; } -#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CLZ) /* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. - */ -static int -pg_leftmost_one32_choose(uint32 word) -{ - if (pg_lzcnt_available()) - pg_leftmost_one32 = pg_leftmost_one32_abm; - else - pg_leftmost_one32 = pg_leftmost_one32_slow; - - return pg_leftmost_one32(word); -} - -static int -pg_leftmost_one32_abm(uint32 word) -{ - return 31 - __builtin_clz(word); -} -#endif - -/* - * pg_leftmost_one32_slow + * pg_leftmost_one32 * Returns the 0-based position of the most significant set bit in word * measured from the least significant bit. word must not be 0. */ -static int -pg_leftmost_one32_slow(uint32 word) +int +pg_leftmost_one32(uint32 word) { +#ifdef HAVE__BUILTIN_CLZ + return 31 - __builtin_clz(word); +#else int shift = 32 - 8; Assert(word != 0); @@ -461,53 +347,32 @@ pg_leftmost_one32_slow(uint32 word) shift -= 8; return shift + leftmost_one_pos[(word >> shift) & 255]; +#endif /* HAVE__BUILTIN_CLZ */ } -#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CLZ) /* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. + * pg_leftmost_one64 + * Returns the 0-based position of the most significant set bit in word + * measured from the least significant bit. word must not be 0. */ -static int -pg_leftmost_one64_choose(uint64 word) -{ - if (pg_lzcnt_available()) - pg_leftmost_one64 = pg_leftmost_one64_abm; - else - pg_leftmost_one64 = pg_leftmost_one64_slow; - - return pg_leftmost_one64(word); -} - -static int -pg_leftmost_one64_abm(uint64 word) +int +pg_leftmost_one64(uint64 word) { +#ifdef HAVE__BUILTIN_CLZ #if defined(HAVE_LONG_INT_64) return 63 - __builtin_clzl(word); #elif defined(HAVE_LONG_LONG_INT_64) return 63 - __builtin_clzll(word); #else - /* shouldn't happen */ #error must have a working 64-bit integer datatype #endif - -} -#endif - -/* - * pg_leftmost_one64_slow - * Returns the 0-based position of the most significant set bit in word - * measured from the least significant bit. word must not be 0. - */ -static int -pg_leftmost_one64_slow(uint64 word) -{ +#else /* HAVE__BUILTIN_CLZ */ int shift = 64 - 8; Assert(word != 0); - while ((word >> shift) == 0) shift -= 8; return shift + leftmost_one_pos[(word >> shift) & 255]; +#endif /* !HAVE__BUIILTIN_CLZ */ } diff --git a/src/port/pg_popcount.c b/src/port/pg_popcount.c new file mode 100644 index 00000000000..5254c41273f --- /dev/null +++ b/src/port/pg_popcount.c @@ -0,0 +1,36 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount.c + * CPU-optimized implementation of pg_popcount + * + * This file must be compiled with a compiler-specific flag to enable the + * POPCOUNT instruction. + * + * Portions Copyright (c) 2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "port/pg_bitutils.h" + +int +pg_popcount32_sse42(uint32 word) +{ + return __builtin_popcount(word); +} + +int +pg_popcount64_sse42(uint64 word) +{ +#if defined(HAVE_LONG_INT_64) + return __builtin_popcountl(word); +#elif defined(HAVE_LONG_LONG_INT_64) + return __builtin_popcountll(word); +#else +#error must have a working 64-bit integer datatype +#endif +}