Looks good to me. The following questions are to satisfy my curiosity, I think the patch should be merged as is:
How performance critical is this popcount implementation going to be? I assume you've put all this work into testing it because the classifier will be relying on it heavily? Why do you think the gcc builtin is slow? That's surprising to me. Is it possible that in newer versions of gcc (i.e. 4.7 and later) would simply generate the assembly instruction? If it's so performance critical, could we simply check for the assembly instruction in the configure script, and if it exists use it. Of course, if it doesn't exist we would fall back to what you currently have. Ethan On Fri, Jul 20, 2012 at 4:25 PM, Ben Pfaff <b...@nicira.com> wrote: > This is the fastest portable implementation among the ones below, as > measured with GCC 4.4 on a Xeon X3430. The measeured times were, in > seconds: > > popcount1 25.6 > popcount2 6.9 (but is not portable) > popcount3 31.4 > popcount4 25.6 > popcount5 61.6 (and is buggy) > popcount6 64.6 > popcount7 32.3 > popcount8 11.2 > > #include <limits.h> > #include <stdio.h> > #include <stdint.h> > > int > popcount1(unsigned int x) > { > return __builtin_popcount(x); > } > > int > popcount2(unsigned int x) > { > unsigned int y; > asm("popcnt %1, %0" : "=r" (y) : "g" (x)); > return y; > } > > int > popcount3(unsigned int x) > { > unsigned int n; > > n = (x >> 1) & 033333333333; > x -= n; > n = (n >> 1) & 033333333333; > x -= n; > x = (x + (x >> 3)) & 030707070707; > return x % 63; > } > > int > popcount4(unsigned int x) > { > x -= (x >> 1) & 0x55555555; > x = (x & 0x33333333) + ((x >> 2) & 0x33333333); > x = (x + (x >> 4)) & 0x0f0f0f0f; > x += x >> 8; > x += x >> 16; > return x & 0x3f; > } > > int > popcount5(unsigned int x) > { > int n; > > n = 0; > while (x) { > if (x & 0xf) { > n += ((0xe9949440 >> (x & 0xf)) & 3) + 1; > } > x >>= 4; > } > return n; > } > > int > popcount6(unsigned int x) > { > int n; > > n = 0; > while (x) { > n += (0xe994 >> (x & 7)) & 3; > x >>= 3; > } > return n; > } > > int > popcount7(unsigned int x) > { > static const int table[16] = { > 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 > }; > > return (table[x & 0xf] > + table[(x >> 4) & 0xf] > + table[(x >> 8) & 0xf] > + table[(x >> 12) & 0xf] > + table[(x >> 16) & 0xf] > + table[(x >> 20) & 0xf] > + table[(x >> 24) & 0xf] > + table[x >> 28]); > } > > static int > popcount8(unsigned int x) > { > #define INIT1(X) \ > ((((X) & (1 << 0)) != 0) + \ > (((X) & (1 << 1)) != 0) + \ > (((X) & (1 << 2)) != 0) + \ > (((X) & (1 << 3)) != 0) + \ > (((X) & (1 << 4)) != 0) + \ > (((X) & (1 << 5)) != 0) + \ > (((X) & (1 << 6)) != 0) + \ > (((X) & (1 << 7)) != 0)) > #define INIT2(X) INIT1(X), INIT1((X) + 1) > #define INIT4(X) INIT2(X), INIT2((X) + 2) > #define INIT8(X) INIT4(X), INIT4((X) + 4) > #define INIT16(X) INIT8(X), INIT8((X) + 8) > #define INIT32(X) INIT16(X), INIT16((X) + 16) > #define INIT64(X) INIT32(X), INIT32((X) + 32) > > static const uint8_t popcount8[256] = { > INIT64(0), INIT64(64), INIT64(128), INIT64(192) > }; > > return (popcount8[x & 0xff] + > popcount8[(x >> 8) & 0xff] + > popcount8[(x >> 16) & 0xff] + > popcount8[x >> 24]); > } > > int > main(void) > { > unsigned long long int x; > int n; > > n = 0; > for (x = 0; x <= UINT32_MAX; x++) { > n += popcount8(x); > } > printf("%d\n", n); > > return 0; > } > > Signed-off-by: Ben Pfaff <b...@nicira.com> > --- > lib/util.c | 34 ++++++++++++++++++++++++++++++++++ > lib/util.h | 2 ++ > 2 files changed, 36 insertions(+), 0 deletions(-) > > diff --git a/lib/util.c b/lib/util.c > index cbcf693..322fc1c 100644 > --- a/lib/util.c > +++ b/lib/util.c > @@ -738,6 +738,40 @@ ctz(uint32_t n) > } > } > > +/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */ > +int > +popcount(uint32_t x) > +{ > + /* In my testing, this implementation is over twice as fast as any other > + * portable implementation that I tried, including GCC 4.4 > + * __builtin_popcount(), although nonportable asm("popcnt") was over 50% > + * faster. */ > +#define INIT1(X) \ > + ((((X) & (1 << 0)) != 0) + \ > + (((X) & (1 << 1)) != 0) + \ > + (((X) & (1 << 2)) != 0) + \ > + (((X) & (1 << 3)) != 0) + \ > + (((X) & (1 << 4)) != 0) + \ > + (((X) & (1 << 5)) != 0) + \ > + (((X) & (1 << 6)) != 0) + \ > + (((X) & (1 << 7)) != 0)) > +#define INIT2(X) INIT1(X), INIT1((X) + 1) > +#define INIT4(X) INIT2(X), INIT2((X) + 2) > +#define INIT8(X) INIT4(X), INIT4((X) + 4) > +#define INIT16(X) INIT8(X), INIT8((X) + 8) > +#define INIT32(X) INIT16(X), INIT16((X) + 16) > +#define INIT64(X) INIT32(X), INIT32((X) + 32) > + > + static const uint8_t popcount8[256] = { > + INIT64(0), INIT64(64), INIT64(128), INIT64(192) > + }; > + > + return (popcount8[x & 0xff] + > + popcount8[(x >> 8) & 0xff] + > + popcount8[(x >> 16) & 0xff] + > + popcount8[x >> 24]); > +} > + > /* Returns true if the 'n' bytes starting at 'p' are zeros. */ > bool > is_all_zeros(const uint8_t *p, size_t n) > diff --git a/lib/util.h b/lib/util.h > index ca1d566..7cded28 100644 > --- a/lib/util.h > +++ b/lib/util.h > @@ -17,6 +17,7 @@ > #ifndef UTIL_H > #define UTIL_H 1 > > +#include <limits.h> > #include <stdarg.h> > #include <stdbool.h> > #include <stddef.h> > @@ -241,6 +242,7 @@ void ignore(bool x OVS_UNUSED); > int log_2_floor(uint32_t); > int log_2_ceil(uint32_t); > int ctz(uint32_t); > +int popcount(uint32_t); > > bool is_all_zeros(const uint8_t *, size_t); > bool is_all_ones(const uint8_t *, size_t); > -- > 1.7.2.5 > _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev