This is the fastest portable implementation among the ones below, as measured with GCC 4.4 on a Xeon X3430. The measeured times were, in seconds:
popcount1 25.6 popcount2 6.9 (but is not portable) popcount3 31.4 popcount4 25.6 popcount5 61.6 (and is buggy) popcount6 64.6 popcount7 32.3 popcount8 11.2 #include <limits.h> #include <stdio.h> #include <stdint.h> int popcount1(unsigned int x) { return __builtin_popcount(x); } int popcount2(unsigned int x) { unsigned int y; asm("popcnt %1, %0" : "=r" (y) : "g" (x)); return y; } int popcount3(unsigned int x) { unsigned int n; n = (x >> 1) & 033333333333; x -= n; n = (n >> 1) & 033333333333; x -= n; x = (x + (x >> 3)) & 030707070707; return x % 63; } int popcount4(unsigned int x) { x -= (x >> 1) & 0x55555555; x = (x & 0x33333333) + ((x >> 2) & 0x33333333); x = (x + (x >> 4)) & 0x0f0f0f0f; x += x >> 8; x += x >> 16; return x & 0x3f; } int popcount5(unsigned int x) { int n; n = 0; while (x) { if (x & 0xf) { n += ((0xe9949440 >> (x & 0xf)) & 3) + 1; } x >>= 4; } return n; } int popcount6(unsigned int x) { int n; n = 0; while (x) { n += (0xe994 >> (x & 7)) & 3; x >>= 3; } return n; } int popcount7(unsigned int x) { static const int table[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; return (table[x & 0xf] + table[(x >> 4) & 0xf] + table[(x >> 8) & 0xf] + table[(x >> 12) & 0xf] + table[(x >> 16) & 0xf] + table[(x >> 20) & 0xf] + table[(x >> 24) & 0xf] + table[x >> 28]); } static int popcount8(unsigned int x) { #define INIT1(X) \ ((((X) & (1 << 0)) != 0) + \ (((X) & (1 << 1)) != 0) + \ (((X) & (1 << 2)) != 0) + \ (((X) & (1 << 3)) != 0) + \ (((X) & (1 << 4)) != 0) + \ (((X) & (1 << 5)) != 0) + \ (((X) & (1 << 6)) != 0) + \ (((X) & (1 << 7)) != 0)) #define INIT2(X) INIT1(X), INIT1((X) + 1) #define INIT4(X) INIT2(X), INIT2((X) + 2) #define INIT8(X) INIT4(X), INIT4((X) + 4) #define INIT16(X) INIT8(X), INIT8((X) + 8) #define INIT32(X) INIT16(X), INIT16((X) + 16) #define INIT64(X) INIT32(X), INIT32((X) + 32) static const uint8_t popcount8[256] = { INIT64(0), INIT64(64), INIT64(128), INIT64(192) }; return (popcount8[x & 0xff] + popcount8[(x >> 8) & 0xff] + popcount8[(x >> 16) & 0xff] + popcount8[x >> 24]); } int main(void) { unsigned long long int x; int n; n = 0; for (x = 0; x <= UINT32_MAX; x++) { n += popcount8(x); } printf("%d\n", n); return 0; } Signed-off-by: Ben Pfaff <b...@nicira.com> --- lib/util.c | 34 ++++++++++++++++++++++++++++++++++ lib/util.h | 2 ++ 2 files changed, 36 insertions(+), 0 deletions(-) diff --git a/lib/util.c b/lib/util.c index cbcf693..322fc1c 100644 --- a/lib/util.c +++ b/lib/util.c @@ -738,6 +738,40 @@ ctz(uint32_t n) } } +/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */ +int +popcount(uint32_t x) +{ + /* In my testing, this implementation is over twice as fast as any other + * portable implementation that I tried, including GCC 4.4 + * __builtin_popcount(), although nonportable asm("popcnt") was over 50% + * faster. */ +#define INIT1(X) \ + ((((X) & (1 << 0)) != 0) + \ + (((X) & (1 << 1)) != 0) + \ + (((X) & (1 << 2)) != 0) + \ + (((X) & (1 << 3)) != 0) + \ + (((X) & (1 << 4)) != 0) + \ + (((X) & (1 << 5)) != 0) + \ + (((X) & (1 << 6)) != 0) + \ + (((X) & (1 << 7)) != 0)) +#define INIT2(X) INIT1(X), INIT1((X) + 1) +#define INIT4(X) INIT2(X), INIT2((X) + 2) +#define INIT8(X) INIT4(X), INIT4((X) + 4) +#define INIT16(X) INIT8(X), INIT8((X) + 8) +#define INIT32(X) INIT16(X), INIT16((X) + 16) +#define INIT64(X) INIT32(X), INIT32((X) + 32) + + static const uint8_t popcount8[256] = { + INIT64(0), INIT64(64), INIT64(128), INIT64(192) + }; + + return (popcount8[x & 0xff] + + popcount8[(x >> 8) & 0xff] + + popcount8[(x >> 16) & 0xff] + + popcount8[x >> 24]); +} + /* Returns true if the 'n' bytes starting at 'p' are zeros. */ bool is_all_zeros(const uint8_t *p, size_t n) diff --git a/lib/util.h b/lib/util.h index ca1d566..7cded28 100644 --- a/lib/util.h +++ b/lib/util.h @@ -17,6 +17,7 @@ #ifndef UTIL_H #define UTIL_H 1 +#include <limits.h> #include <stdarg.h> #include <stdbool.h> #include <stddef.h> @@ -241,6 +242,7 @@ void ignore(bool x OVS_UNUSED); int log_2_floor(uint32_t); int log_2_ceil(uint32_t); int ctz(uint32_t); +int popcount(uint32_t); bool is_all_zeros(const uint8_t *, size_t); bool is_all_ones(const uint8_t *, size_t); -- 1.7.2.5 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev