Looks good to me.

The following questions are to satisfy my curiosity, I think the patch
should be merged as is:

How performance critical is this popcount implementation going to be?
I assume you've put all this work into testing it because the
classifier will be relying on it heavily?

Why do you think the gcc builtin is slow? That's surprising to me.  Is
it possible that in newer versions of gcc (i.e. 4.7 and later) would
simply generate the assembly instruction?

If it's so performance critical, could we simply check for the
assembly instruction in the configure script, and if it exists use it.
 Of course, if it doesn't exist we would fall back to what you
currently have.

Ethan



On Fri, Jul 20, 2012 at 4:25 PM, Ben Pfaff <b...@nicira.com> wrote:
> This is the fastest portable implementation among the ones below, as
> measured with GCC 4.4 on a Xeon X3430.  The measeured times were, in
> seconds:
>
> popcount1    25.6
> popcount2     6.9 (but is not portable)
> popcount3    31.4
> popcount4    25.6
> popcount5    61.6 (and is buggy)
> popcount6    64.6
> popcount7    32.3
> popcount8    11.2
>
> #include <limits.h>
> #include <stdio.h>
> #include <stdint.h>
>
> int
> popcount1(unsigned int x)
> {
>     return __builtin_popcount(x);
> }
>
> int
> popcount2(unsigned int x)
> {
>     unsigned int y;
>     asm("popcnt %1, %0" : "=r" (y) : "g" (x));
>     return y;
> }
>
> int
> popcount3(unsigned int x)
> {
>     unsigned int n;
>
>     n = (x >> 1) & 033333333333;
>     x -= n;
>     n = (n >> 1) & 033333333333;
>     x -= n;
>     x = (x + (x >> 3)) & 030707070707;
>     return x % 63;
> }
>
> int
> popcount4(unsigned int x)
> {
>     x -= (x >> 1) & 0x55555555;
>     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
>     x = (x + (x >> 4)) & 0x0f0f0f0f;
>     x += x >> 8;
>     x += x >> 16;
>     return x & 0x3f;
> }
>
> int
> popcount5(unsigned int x)
> {
>     int n;
>
>     n = 0;
>     while (x) {
>         if (x & 0xf) {
>             n += ((0xe9949440 >> (x & 0xf)) & 3) + 1;
>         }
>         x >>= 4;
>     }
>     return n;
> }
>
> int
> popcount6(unsigned int x)
> {
>     int n;
>
>     n = 0;
>     while (x) {
>         n += (0xe994 >> (x & 7)) & 3;
>         x >>= 3;
>     }
>     return n;
> }
>
> int
> popcount7(unsigned int x)
> {
>     static const int table[16] = {
>         0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
>     };
>
>     return (table[x & 0xf]
>             + table[(x >> 4) & 0xf]
>             + table[(x >> 8) & 0xf]
>             + table[(x >> 12) & 0xf]
>             + table[(x >> 16) & 0xf]
>             + table[(x >> 20) & 0xf]
>             + table[(x >> 24) & 0xf]
>             + table[x >> 28]);
> }
>
> static int
> popcount8(unsigned int x)
> {
> #define INIT1(X)                                \
>     ((((X) & (1 << 0)) != 0) +                  \
>      (((X) & (1 << 1)) != 0) +                  \
>      (((X) & (1 << 2)) != 0) +                  \
>      (((X) & (1 << 3)) != 0) +                  \
>      (((X) & (1 << 4)) != 0) +                  \
>      (((X) & (1 << 5)) != 0) +                  \
>      (((X) & (1 << 6)) != 0) +                  \
>      (((X) & (1 << 7)) != 0))
> #define INIT2(X)   INIT1(X),  INIT1((X) +  1)
> #define INIT4(X)   INIT2(X),  INIT2((X) +  2)
> #define INIT8(X)   INIT4(X),  INIT4((X) +  4)
> #define INIT16(X)  INIT8(X),  INIT8((X) +  8)
> #define INIT32(X) INIT16(X), INIT16((X) + 16)
> #define INIT64(X) INIT32(X), INIT32((X) + 32)
>
>     static const uint8_t popcount8[256] = {
>         INIT64(0), INIT64(64), INIT64(128), INIT64(192)
>     };
>
>     return (popcount8[x & 0xff] +
>             popcount8[(x >> 8) & 0xff] +
>             popcount8[(x >> 16) & 0xff] +
>             popcount8[x >> 24]);
> }
>
> int
> main(void)
> {
>     unsigned long long int x;
>     int n;
>
>     n = 0;
>     for (x = 0; x <= UINT32_MAX; x++) {
>         n += popcount8(x);
>     }
>     printf("%d\n", n);
>
>     return 0;
> }
>
> Signed-off-by: Ben Pfaff <b...@nicira.com>
> ---
>  lib/util.c |   34 ++++++++++++++++++++++++++++++++++
>  lib/util.h |    2 ++
>  2 files changed, 36 insertions(+), 0 deletions(-)
>
> diff --git a/lib/util.c b/lib/util.c
> index cbcf693..322fc1c 100644
> --- a/lib/util.c
> +++ b/lib/util.c
> @@ -738,6 +738,40 @@ ctz(uint32_t n)
>      }
>  }
>
> +/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */
> +int
> +popcount(uint32_t x)
> +{
> +    /* In my testing, this implementation is over twice as fast as any other
> +     * portable implementation that I tried, including GCC 4.4
> +     * __builtin_popcount(), although nonportable asm("popcnt") was over 50%
> +     * faster. */
> +#define INIT1(X)                                \
> +    ((((X) & (1 << 0)) != 0) +                  \
> +     (((X) & (1 << 1)) != 0) +                  \
> +     (((X) & (1 << 2)) != 0) +                  \
> +     (((X) & (1 << 3)) != 0) +                  \
> +     (((X) & (1 << 4)) != 0) +                  \
> +     (((X) & (1 << 5)) != 0) +                  \
> +     (((X) & (1 << 6)) != 0) +                  \
> +     (((X) & (1 << 7)) != 0))
> +#define INIT2(X)   INIT1(X),  INIT1((X) +  1)
> +#define INIT4(X)   INIT2(X),  INIT2((X) +  2)
> +#define INIT8(X)   INIT4(X),  INIT4((X) +  4)
> +#define INIT16(X)  INIT8(X),  INIT8((X) +  8)
> +#define INIT32(X) INIT16(X), INIT16((X) + 16)
> +#define INIT64(X) INIT32(X), INIT32((X) + 32)
> +
> +    static const uint8_t popcount8[256] = {
> +        INIT64(0), INIT64(64), INIT64(128), INIT64(192)
> +    };
> +
> +    return (popcount8[x & 0xff] +
> +            popcount8[(x >> 8) & 0xff] +
> +            popcount8[(x >> 16) & 0xff] +
> +            popcount8[x >> 24]);
> +}
> +
>  /* Returns true if the 'n' bytes starting at 'p' are zeros. */
>  bool
>  is_all_zeros(const uint8_t *p, size_t n)
> diff --git a/lib/util.h b/lib/util.h
> index ca1d566..7cded28 100644
> --- a/lib/util.h
> +++ b/lib/util.h
> @@ -17,6 +17,7 @@
>  #ifndef UTIL_H
>  #define UTIL_H 1
>
> +#include <limits.h>
>  #include <stdarg.h>
>  #include <stdbool.h>
>  #include <stddef.h>
> @@ -241,6 +242,7 @@ void ignore(bool x OVS_UNUSED);
>  int log_2_floor(uint32_t);
>  int log_2_ceil(uint32_t);
>  int ctz(uint32_t);
> +int popcount(uint32_t);
>
>  bool is_all_zeros(const uint8_t *, size_t);
>  bool is_all_ones(const uint8_t *, size_t);
> --
> 1.7.2.5
>
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to