Currently, all platforms must indirect through a function pointer to call
popcount on a word-sized input, even though we don't arrange for a fast
implementation on non-x86 to make it worthwhile.

0001 moves some declarations around so that "slow" popcount functions are
called directly on non-x86 platforms.

0002 was an idea to simplify and unify the coding for the slow functions.

Also attached is a test module for building microbenchmarks.

On a Power8 machine using gcc 4.8, and running
time ./inst/bin/psql -c 'select drive_popcount(100000, 1024)'

I get

master: 647ms
0001: 183ms
0002: 228ms

So 0001 is a clear winner on that platform. 0002 is still good, but slower
than 0001 for some reason, and it turns out that on master, gcc does emit a
popcnt instruction from the intrinsic:

0000000000000000 <pg_popcount32_slow>:
   0:   f4 02 63 7c     popcntw r3,r3
   4:   b4 07 63 7c     extsw   r3,r3
   8:   20 00 80 4e     blr
        ...

The gcc docs mention a flag for this, but I'm not sure why it seems not to
need it:

https://gcc.gnu.org/onlinedocs/gcc/RS_002f6000-and-PowerPC-Options.html#RS_002f6000-and-PowerPC-Options

Maybe that's because the machine I used was ppc64le, but I'm not sure a ppc
binary built like this is portable to other hardware. For that reason,
maybe 0002 is a good idea.

-- 
John Naylor
EDB: http://www.enterprisedb.com
diff --git a/src/test/modules/test_popcount/Makefile b/src/test/modules/test_popcount/Makefile
new file mode 100644
index 0000000000..8a49d1ccfc
--- /dev/null
+++ b/src/test/modules/test_popcount/Makefile
@@ -0,0 +1,21 @@
+MODULE_big = test_popcount
+OBJS = test_popcount.o
+PGFILEDESC = "test"
+EXTENSION = test_popcount
+DATA = test_popcount--1.0.sql
+
+first: all
+
+# needed?
+test_popcount.o: test_popcount.c
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_popcount
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_popcount/test_popcount--1.0.sql b/src/test/modules/test_popcount/test_popcount--1.0.sql
new file mode 100644
index 0000000000..a5975e54cd
--- /dev/null
+++ b/src/test/modules/test_popcount/test_popcount--1.0.sql
@@ -0,0 +1,2 @@
+CREATE FUNCTION drive_popcount  (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C;
+CREATE FUNCTION drive_popcount64(count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_popcount/test_popcount.c b/src/test/modules/test_popcount/test_popcount.c
new file mode 100644
index 0000000000..0030ed142f
--- /dev/null
+++ b/src/test/modules/test_popcount/test_popcount.c
@@ -0,0 +1,71 @@
+/* select drive_popcount(1000000, 1024); */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+
+#include "port/pg_bitutils.h"
+
+PG_MODULE_MAGIC;
+
+#ifndef PG_POPCOUNT64
+#define PG_POPCOUNT32(word) pg_popcount32(word)
+#define PG_POPCOUNT64(word) pg_popcount64(word)
+#endif
+
+
+/*
+ * drive_popcount64(count int, num int) returns bigint
+ *
+ * num is the number of 64-bit words to use
+ */
+PG_FUNCTION_INFO_V1(drive_popcount64);
+Datum
+drive_popcount64(PG_FUNCTION_ARGS)
+{
+
+	int			count = PG_GETARG_INT32(0);
+	int			num   = PG_GETARG_INT32(1);
+
+	int			len = num * sizeof(uint64);
+	uint64 	   *words	= palloc(len);
+	int64		popcount;
+
+	for (int i = 0; i < num; i++)
+		words[i] = i;
+
+	while (count--)
+	{
+		popcount = 0;
+		for (int i = 0; i < num; i++)
+			popcount += PG_POPCOUNT64(words[i]);
+	}
+
+	PG_RETURN_INT64(popcount);
+}
+
+/*
+ * drive_popcount(count int, len int) returns bigint
+ *
+ * num is the number of 64-bit words to use
+ */
+PG_FUNCTION_INFO_V1(drive_popcount);
+Datum
+drive_popcount(PG_FUNCTION_ARGS)
+{
+
+	int			count = PG_GETARG_INT32(0);
+	int			num   = PG_GETARG_INT32(1);
+
+	int			len = num * sizeof(uint64);
+	uint64	   *words	= palloc(len);
+	int64		popcount;
+
+	for (int i = 0; i < num; i++)
+		words[i] = i;
+
+	while (count--)
+		popcount = pg_popcount((const char*) words, len);
+
+	PG_RETURN_INT64(popcount);
+}
diff --git a/src/test/modules/test_popcount/test_popcount.control b/src/test/modules/test_popcount/test_popcount.control
new file mode 100644
index 0000000000..6837d724b4
--- /dev/null
+++ b/src/test/modules/test_popcount/test_popcount.control
@@ -0,0 +1,4 @@
+comment = 'test'
+default_version = '1.0'
+module_pathname = '$libdir/test_popcount'
+relocatable = true
From f65964c997cf8958dd9e53a04fb7e92098c784f1 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Wed, 11 Aug 2021 12:17:51 -0400
Subject: [PATCH v1 1/2] Use direct function calls for pg_popcount{32,64} on
 non-x86 platforms

Previously, all pg_popcount{32,64} calls were indirected through
a function pointer, even though we had no fast implementation for
non-x86 platforms. To fix, use macros to cover both the direct and
indirect cases, and define them appropriately similar to the CRC code.
---
 src/backend/access/heap/visibilitymap.c |  6 ++--
 src/backend/nodes/bitmapset.c           |  4 +--
 src/include/port/pg_bitutils.h          | 42 +++++++++++++++++++++-
 src/port/pg_bitutils.c                  | 46 ++++---------------------
 4 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 114fbbdd30..75b0377807 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -411,14 +411,14 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 		if (all_frozen == NULL)
 		{
 			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
+				nvisible += PG_POPCOUNT64(map[i] & VISIBLE_MASK64);
 		}
 		else
 		{
 			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
 			{
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-				nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
+				nvisible += PG_POPCOUNT64(map[i] & VISIBLE_MASK64);
+				nfrozen += PG_POPCOUNT64(map[i] & FROZEN_MASK64);
 			}
 		}
 
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 649478b0d4..a6c4c62914 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -57,11 +57,11 @@
 #if BITS_PER_BITMAPWORD == 32
 #define bmw_leftmost_one_pos(w)		pg_leftmost_one_pos32(w)
 #define bmw_rightmost_one_pos(w)	pg_rightmost_one_pos32(w)
-#define bmw_popcount(w)				pg_popcount32(w)
+#define bmw_popcount(w)				PG_POPCOUNT32(w)
 #elif BITS_PER_BITMAPWORD == 64
 #define bmw_leftmost_one_pos(w)		pg_leftmost_one_pos64(w)
 #define bmw_rightmost_one_pos(w)	pg_rightmost_one_pos64(w)
-#define bmw_popcount(w)				pg_popcount64(w)
+#define bmw_popcount(w)				PG_POPCOUNT64(w)
 #else
 #error "invalid BITS_PER_BITMAPWORD"
 #endif
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 086bd08132..ea97192b3a 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -253,9 +253,49 @@ pg_ceil_log2_64(uint64 num)
 		return pg_leftmost_one_pos64(num - 1) + 1;
 }
 
-/* Count the number of one-bits in a uint32 or uint64 */
+/*
+ * With MSVC on x86_64 builds, try using native popcnt instructions via the
+ * __popcnt and __popcnt64 intrinsics.  These don't work the same as GCC's
+ * __builtin_popcount* intrinsic functions as they always emit popcnt
+ * instructions.
+ */
+#if defined(_MSC_VER) && defined(_M_AMD64)
+#define HAVE_X86_64_POPCNTQ
+#endif
+
+/*
+ * On x86_64, we can use the hardware popcount instruction, but only if
+ * we can verify that the CPU supports it via the cpuid instruction.
+ *
+ * Otherwise, we fall back to a hand-rolled implementation.
+ */
+#ifdef HAVE_X86_64_POPCNTQ
+#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
+#define TRY_POPCNT_FAST 1
+#endif
+#endif
+
+#ifdef TRY_POPCNT_FAST
+/* Use the POPCNT instruction, but perform a runtime check first. */
+#define PG_POPCOUNT32(word) pg_popcount32(word)
+#define PG_POPCOUNT64(word) pg_popcount64(word)
+
+extern int	pg_popcount32_slow(uint32 word);
+extern int	pg_popcount64_slow(uint64 word);
 extern int	(*pg_popcount32) (uint32 word);
 extern int	(*pg_popcount64) (uint64 word);
+extern int	pg_popcount32_fast(uint32 word);
+extern int	pg_popcount64_fast(uint64 word);
+
+#else
+/* Use a portable implementation */
+#define PG_POPCOUNT32(word) pg_popcount32_slow(word)
+#define PG_POPCOUNT64(word) pg_popcount64_slow(word)
+
+extern int	pg_popcount32_slow(uint32 word);
+extern int	pg_popcount64_slow(uint64 word);
+
+#endif							/* TRY_POPCNT_FAST */
 
 /* Count the number of one-bits in a byte array */
 extern uint64 pg_popcount(const char *buf, int bytes);
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 10676b285c..3e90de5249 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,47 +103,13 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
-/*
- * With MSVC on x86_64 builds, try using native popcnt instructions via the
- * __popcnt and __popcnt64 intrinsics.  These don't work the same as GCC's
- * __builtin_popcount* intrinsic functions as they always emit popcnt
- * instructions.
- */
-#if defined(_MSC_VER) && defined(_M_AMD64)
-#define HAVE_X86_64_POPCNTQ
-#endif
-
-/*
- * On x86_64, we can use the hardware popcount instruction, but only if
- * we can verify that the CPU supports it via the cpuid instruction.
- *
- * Otherwise, we fall back to __builtin_popcount if the compiler has that,
- * or a hand-rolled implementation if not.
- */
-#ifdef HAVE_X86_64_POPCNTQ
-#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
-#define TRY_POPCNT_FAST 1
-#endif
-#endif
-
-static int	pg_popcount32_slow(uint32 word);
-static int	pg_popcount64_slow(uint64 word);
-
 #ifdef TRY_POPCNT_FAST
 static bool pg_popcount_available(void);
 static int	pg_popcount32_choose(uint32 word);
 static int	pg_popcount64_choose(uint64 word);
-static int	pg_popcount32_fast(uint32 word);
-static int	pg_popcount64_fast(uint64 word);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-#else
-int			(*pg_popcount32) (uint32 word) = pg_popcount32_slow;
-int			(*pg_popcount64) (uint64 word) = pg_popcount64_slow;
-#endif							/* TRY_POPCNT_FAST */
-
-#ifdef TRY_POPCNT_FAST
 
 /*
  * Return true if CPUID indicates that the POPCNT instruction is available.
@@ -208,7 +174,7 @@ pg_popcount64_choose(uint64 word)
  * pg_popcount32_fast
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount32_fast(uint32 word)
 {
 #ifdef _MSC_VER
@@ -225,7 +191,7 @@ __asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
  * pg_popcount64_fast
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount64_fast(uint64 word)
 {
 #ifdef _MSC_VER
@@ -245,7 +211,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount32_slow(uint32 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -267,7 +233,7 @@ pg_popcount32_slow(uint32 word)
  * pg_popcount64_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount64_slow(uint64 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -309,7 +275,7 @@ pg_popcount(const char *buf, int bytes)
 
 		while (bytes >= 8)
 		{
-			popcnt += pg_popcount64(*words++);
+			popcnt += PG_POPCOUNT64(*words++);
 			bytes -= 8;
 		}
 
@@ -323,7 +289,7 @@ pg_popcount(const char *buf, int bytes)
 
 		while (bytes >= 4)
 		{
-			popcnt += pg_popcount32(*words++);
+			popcnt += PG_POPCOUNT32(*words++);
 			bytes -= 4;
 		}
 
-- 
2.31.1

From 225073b1741fd89ec7728e28978e057e9dc11116 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Wed, 11 Aug 2021 12:22:24 -0400
Subject: [PATCH v1 2/2] Replace intrinsics in pg_popcount*_slow with pure C
 code

Intrinsics are used in the hope that the compiler will access some fast
hardware implementation where available. However, on x86 at least,
__builtin_popcount() didn't emit a POPCNT instruction since -mpopcnt
wasn't passed to the compiler. Instead, the compiler emitted bitwise
operations where the intrinsic was supported. Where not supported,
we used a byte-at-a-time loop using a lookup table.

Since the *slow functions are fallback implementations, replace the
intrinsics and the associated #ifdef maze with the bitwise operations
written in C so all platforms can benefit from them.

If we ever get configure support for x86-64-v2, we could use these
intrinsics to emit the POPCNT instruction without a runtime check. To
allow for that possibility, let's keep the configure checks around.
---
 src/port/pg_bitutils.c | 47 ++++++++++++++----------------------------
 1 file changed, 15 insertions(+), 32 deletions(-)

diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 3e90de5249..500372db10 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -207,6 +207,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 #endif							/* TRY_POPCNT_FAST */
 
 
+/*
+ * The *_slow implementations are based on
+ * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ */
+
 /*
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
@@ -214,19 +219,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 int
 pg_popcount32_slow(uint32 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-	return __builtin_popcount(word);
-#else							/* !HAVE__BUILTIN_POPCOUNT */
-	int			result = 0;
-
-	while (word != 0)
-	{
-		result += pg_number_of_ones[word & 255];
-		word >>= 8;
-	}
-
-	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
+	word = word - ((word >> 1) & 0x55555555);
+	word = (word & 0x33333333) +
+		((word >> 2) & 0x33333333);
+	word = (word + (word >> 4)) & 0xF0F0F0F;
+	return (int) ((word * 0x1010101) >> 24);
 }
 
 /*
@@ -236,25 +233,11 @@ pg_popcount32_slow(uint32 word)
 int
 pg_popcount64_slow(uint64 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if defined(HAVE_LONG_INT_64)
-	return __builtin_popcountl(word);
-#elif defined(HAVE_LONG_LONG_INT_64)
-	return __builtin_popcountll(word);
-#else
-#error must have a working 64-bit integer datatype
-#endif
-#else							/* !HAVE__BUILTIN_POPCOUNT */
-	int			result = 0;
-
-	while (word != 0)
-	{
-		result += pg_number_of_ones[word & 255];
-		word >>= 8;
-	}
-
-	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
+	word = word - ((word >> 1) & UINT64CONST(0x5555555555555555));
+	word = (word & UINT64CONST(0x3333333333333333)) +
+		((word >> 2) & UINT64CONST(0x3333333333333333));
+	word = (word + (word >> 4)) & UINT64CONST(0xF0F0F0F0F0F0F0F);
+	return (int) ((word * UINT64CONST(0x101010101010101)) >> 56);
 }
 
 
-- 
2.31.1

Reply via email to