Currently, all platforms must indirect through a function pointer to call popcount on a word-sized input, even though we don't arrange for a fast implementation on non-x86 to make it worthwhile.
0001 moves some declarations around so that "slow" popcount functions are called directly on non-x86 platforms. 0002 was an idea to simplify and unify the coding for the slow functions. Also attached is a test module for building microbenchmarks. On a Power8 machine using gcc 4.8, and running time ./inst/bin/psql -c 'select drive_popcount(100000, 1024)' I get master: 647ms 0001: 183ms 0002: 228ms So 0001 is a clear winner on that platform. 0002 is still good, but slower than 0001 for some reason, and it turns out that on master, gcc does emit a popcnt instruction from the intrinsic: 0000000000000000 <pg_popcount32_slow>: 0: f4 02 63 7c popcntw r3,r3 4: b4 07 63 7c extsw r3,r3 8: 20 00 80 4e blr ... The gcc docs mention a flag for this, but I'm not sure why it seems not to need it: https://gcc.gnu.org/onlinedocs/gcc/RS_002f6000-and-PowerPC-Options.html#RS_002f6000-and-PowerPC-Options Maybe that's because the machine I used was ppc64le, but I'm not sure a ppc binary built like this is portable to other hardware. For that reason, maybe 0002 is a good idea. -- John Naylor EDB: http://www.enterprisedb.com
diff --git a/src/test/modules/test_popcount/Makefile b/src/test/modules/test_popcount/Makefile new file mode 100644 index 0000000000..8a49d1ccfc --- /dev/null +++ b/src/test/modules/test_popcount/Makefile @@ -0,0 +1,21 @@ +MODULE_big = test_popcount +OBJS = test_popcount.o +PGFILEDESC = "test" +EXTENSION = test_popcount +DATA = test_popcount--1.0.sql + +first: all + +# needed? +test_popcount.o: test_popcount.c + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_popcount +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_popcount/test_popcount--1.0.sql b/src/test/modules/test_popcount/test_popcount--1.0.sql new file mode 100644 index 0000000000..a5975e54cd --- /dev/null +++ b/src/test/modules/test_popcount/test_popcount--1.0.sql @@ -0,0 +1,2 @@ +CREATE FUNCTION drive_popcount (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C; +CREATE FUNCTION drive_popcount64(count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_popcount/test_popcount.c b/src/test/modules/test_popcount/test_popcount.c new file mode 100644 index 0000000000..0030ed142f --- /dev/null +++ b/src/test/modules/test_popcount/test_popcount.c @@ -0,0 +1,71 @@ +/* select drive_popcount(1000000, 1024); */ + +#include "postgres.h" + +#include "fmgr.h" + +#include "port/pg_bitutils.h" + +PG_MODULE_MAGIC; + +#ifndef PG_POPCOUNT64 +#define PG_POPCOUNT32(word) pg_popcount32(word) +#define PG_POPCOUNT64(word) pg_popcount64(word) +#endif + + +/* + * drive_popcount64(count int, num int) returns bigint + * + * num is the number of 64-bit words to use + */ +PG_FUNCTION_INFO_V1(drive_popcount64); +Datum +drive_popcount64(PG_FUNCTION_ARGS) +{ + + int count = PG_GETARG_INT32(0); + int num = PG_GETARG_INT32(1); + + int len = num * sizeof(uint64); + uint64 *words = palloc(len); + int64 popcount; + + for (int i = 0; i < num; i++) + words[i] = i; + + while (count--) + { + popcount = 0; + for (int i = 0; i < num; i++) + popcount += PG_POPCOUNT64(words[i]); + } + + PG_RETURN_INT64(popcount); +} + +/* + * drive_popcount(count int, len int) returns bigint + * + * num is the number of 64-bit words to use + */ +PG_FUNCTION_INFO_V1(drive_popcount); +Datum +drive_popcount(PG_FUNCTION_ARGS) +{ + + int count = PG_GETARG_INT32(0); + int num = PG_GETARG_INT32(1); + + int len = num * sizeof(uint64); + uint64 *words = palloc(len); + int64 popcount; + + for (int i = 0; i < num; i++) + words[i] = i; + + while (count--) + popcount = pg_popcount((const char*) words, len); + + PG_RETURN_INT64(popcount); +} diff --git a/src/test/modules/test_popcount/test_popcount.control b/src/test/modules/test_popcount/test_popcount.control new file mode 100644 index 0000000000..6837d724b4 --- /dev/null +++ b/src/test/modules/test_popcount/test_popcount.control @@ -0,0 +1,4 @@ +comment = 'test' +default_version = '1.0' +module_pathname = '$libdir/test_popcount' +relocatable = true
From f65964c997cf8958dd9e53a04fb7e92098c784f1 Mon Sep 17 00:00:00 2001 From: John Naylor <john.naylor@2ndquadrant.com> Date: Wed, 11 Aug 2021 12:17:51 -0400 Subject: [PATCH v1 1/2] Use direct function calls for pg_popcount{32,64} on non-x86 platforms Previously, all pg_popcount{32,64} calls were indirected through a function pointer, even though we had no fast implementation for non-x86 platforms. To fix, use macros to cover both the direct and indirect cases, and define them appropriately similar to the CRC code. --- src/backend/access/heap/visibilitymap.c | 6 ++-- src/backend/nodes/bitmapset.c | 4 +-- src/include/port/pg_bitutils.h | 42 +++++++++++++++++++++- src/port/pg_bitutils.c | 46 ++++--------------------- 4 files changed, 52 insertions(+), 46 deletions(-) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 114fbbdd30..75b0377807 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -411,14 +411,14 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro if (all_frozen == NULL) { for (i = 0; i < MAPSIZE / sizeof(uint64); i++) - nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); + nvisible += PG_POPCOUNT64(map[i] & VISIBLE_MASK64); } else { for (i = 0; i < MAPSIZE / sizeof(uint64); i++) { - nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); - nfrozen += pg_popcount64(map[i] & FROZEN_MASK64); + nvisible += PG_POPCOUNT64(map[i] & VISIBLE_MASK64); + nfrozen += PG_POPCOUNT64(map[i] & FROZEN_MASK64); } } diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index 649478b0d4..a6c4c62914 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -57,11 +57,11 @@ #if BITS_PER_BITMAPWORD == 32 #define bmw_leftmost_one_pos(w) pg_leftmost_one_pos32(w) #define bmw_rightmost_one_pos(w) pg_rightmost_one_pos32(w) -#define bmw_popcount(w) pg_popcount32(w) +#define bmw_popcount(w) PG_POPCOUNT32(w) #elif BITS_PER_BITMAPWORD == 64 #define bmw_leftmost_one_pos(w) pg_leftmost_one_pos64(w) #define bmw_rightmost_one_pos(w) pg_rightmost_one_pos64(w) -#define bmw_popcount(w) pg_popcount64(w) +#define bmw_popcount(w) PG_POPCOUNT64(w) #else #error "invalid BITS_PER_BITMAPWORD" #endif diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 086bd08132..ea97192b3a 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -253,9 +253,49 @@ pg_ceil_log2_64(uint64 num) return pg_leftmost_one_pos64(num - 1) + 1; } -/* Count the number of one-bits in a uint32 or uint64 */ +/* + * With MSVC on x86_64 builds, try using native popcnt instructions via the + * __popcnt and __popcnt64 intrinsics. These don't work the same as GCC's + * __builtin_popcount* intrinsic functions as they always emit popcnt + * instructions. + */ +#if defined(_MSC_VER) && defined(_M_AMD64) +#define HAVE_X86_64_POPCNTQ +#endif + +/* + * On x86_64, we can use the hardware popcount instruction, but only if + * we can verify that the CPU supports it via the cpuid instruction. + * + * Otherwise, we fall back to a hand-rolled implementation. + */ +#ifdef HAVE_X86_64_POPCNTQ +#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID) +#define TRY_POPCNT_FAST 1 +#endif +#endif + +#ifdef TRY_POPCNT_FAST +/* Use the POPCNT instruction, but perform a runtime check first. */ +#define PG_POPCOUNT32(word) pg_popcount32(word) +#define PG_POPCOUNT64(word) pg_popcount64(word) + +extern int pg_popcount32_slow(uint32 word); +extern int pg_popcount64_slow(uint64 word); extern int (*pg_popcount32) (uint32 word); extern int (*pg_popcount64) (uint64 word); +extern int pg_popcount32_fast(uint32 word); +extern int pg_popcount64_fast(uint64 word); + +#else +/* Use a portable implementation */ +#define PG_POPCOUNT32(word) pg_popcount32_slow(word) +#define PG_POPCOUNT64(word) pg_popcount64_slow(word) + +extern int pg_popcount32_slow(uint32 word); +extern int pg_popcount64_slow(uint64 word); + +#endif /* TRY_POPCNT_FAST */ /* Count the number of one-bits in a byte array */ extern uint64 pg_popcount(const char *buf, int bytes); diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 10676b285c..3e90de5249 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -103,47 +103,13 @@ const uint8 pg_number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; -/* - * With MSVC on x86_64 builds, try using native popcnt instructions via the - * __popcnt and __popcnt64 intrinsics. These don't work the same as GCC's - * __builtin_popcount* intrinsic functions as they always emit popcnt - * instructions. - */ -#if defined(_MSC_VER) && defined(_M_AMD64) -#define HAVE_X86_64_POPCNTQ -#endif - -/* - * On x86_64, we can use the hardware popcount instruction, but only if - * we can verify that the CPU supports it via the cpuid instruction. - * - * Otherwise, we fall back to __builtin_popcount if the compiler has that, - * or a hand-rolled implementation if not. - */ -#ifdef HAVE_X86_64_POPCNTQ -#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID) -#define TRY_POPCNT_FAST 1 -#endif -#endif - -static int pg_popcount32_slow(uint32 word); -static int pg_popcount64_slow(uint64 word); - #ifdef TRY_POPCNT_FAST static bool pg_popcount_available(void); static int pg_popcount32_choose(uint32 word); static int pg_popcount64_choose(uint64 word); -static int pg_popcount32_fast(uint32 word); -static int pg_popcount64_fast(uint64 word); int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; -#else -int (*pg_popcount32) (uint32 word) = pg_popcount32_slow; -int (*pg_popcount64) (uint64 word) = pg_popcount64_slow; -#endif /* TRY_POPCNT_FAST */ - -#ifdef TRY_POPCNT_FAST /* * Return true if CPUID indicates that the POPCNT instruction is available. @@ -208,7 +174,7 @@ pg_popcount64_choose(uint64 word) * pg_popcount32_fast * Return the number of 1 bits set in word */ -static int +int pg_popcount32_fast(uint32 word) { #ifdef _MSC_VER @@ -225,7 +191,7 @@ __asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc"); * pg_popcount64_fast * Return the number of 1 bits set in word */ -static int +int pg_popcount64_fast(uint64 word) { #ifdef _MSC_VER @@ -245,7 +211,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); * pg_popcount32_slow * Return the number of 1 bits set in word */ -static int +int pg_popcount32_slow(uint32 word) { #ifdef HAVE__BUILTIN_POPCOUNT @@ -267,7 +233,7 @@ pg_popcount32_slow(uint32 word) * pg_popcount64_slow * Return the number of 1 bits set in word */ -static int +int pg_popcount64_slow(uint64 word) { #ifdef HAVE__BUILTIN_POPCOUNT @@ -309,7 +275,7 @@ pg_popcount(const char *buf, int bytes) while (bytes >= 8) { - popcnt += pg_popcount64(*words++); + popcnt += PG_POPCOUNT64(*words++); bytes -= 8; } @@ -323,7 +289,7 @@ pg_popcount(const char *buf, int bytes) while (bytes >= 4) { - popcnt += pg_popcount32(*words++); + popcnt += PG_POPCOUNT32(*words++); bytes -= 4; } -- 2.31.1
From 225073b1741fd89ec7728e28978e057e9dc11116 Mon Sep 17 00:00:00 2001 From: John Naylor <john.naylor@2ndquadrant.com> Date: Wed, 11 Aug 2021 12:22:24 -0400 Subject: [PATCH v1 2/2] Replace intrinsics in pg_popcount*_slow with pure C code Intrinsics are used in the hope that the compiler will access some fast hardware implementation where available. However, on x86 at least, __builtin_popcount() didn't emit a POPCNT instruction since -mpopcnt wasn't passed to the compiler. Instead, the compiler emitted bitwise operations where the intrinsic was supported. Where not supported, we used a byte-at-a-time loop using a lookup table. Since the *slow functions are fallback implementations, replace the intrinsics and the associated #ifdef maze with the bitwise operations written in C so all platforms can benefit from them. If we ever get configure support for x86-64-v2, we could use these intrinsics to emit the POPCNT instruction without a runtime check. To allow for that possibility, let's keep the configure checks around. --- src/port/pg_bitutils.c | 47 ++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 3e90de5249..500372db10 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -207,6 +207,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); #endif /* TRY_POPCNT_FAST */ +/* + * The *_slow implementations are based on + * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + */ + /* * pg_popcount32_slow * Return the number of 1 bits set in word @@ -214,19 +219,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); int pg_popcount32_slow(uint32 word) { -#ifdef HAVE__BUILTIN_POPCOUNT - return __builtin_popcount(word); -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ + word = word - ((word >> 1) & 0x55555555); + word = (word & 0x33333333) + + ((word >> 2) & 0x33333333); + word = (word + (word >> 4)) & 0xF0F0F0F; + return (int) ((word * 0x1010101) >> 24); } /* @@ -236,25 +233,11 @@ pg_popcount32_slow(uint32 word) int pg_popcount64_slow(uint64 word) { -#ifdef HAVE__BUILTIN_POPCOUNT -#if defined(HAVE_LONG_INT_64) - return __builtin_popcountl(word); -#elif defined(HAVE_LONG_LONG_INT_64) - return __builtin_popcountll(word); -#else -#error must have a working 64-bit integer datatype -#endif -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ + word = word - ((word >> 1) & UINT64CONST(0x5555555555555555)); + word = (word & UINT64CONST(0x3333333333333333)) + + ((word >> 2) & UINT64CONST(0x3333333333333333)); + word = (word + (word >> 4)) & UINT64CONST(0xF0F0F0F0F0F0F0F); + return (int) ((word * UINT64CONST(0x101010101010101)) >> 56); } -- 2.31.1