Hi! This patch speeds up the libgcc __popcount{si,di,ti}2 implementations more than 2 times by avoiding table lookup. I've kept using the table lookup for small word size targets (unfortunately avr/rl78 cheap and thus e.g. MIN_UNITS_PER_WORD isn't reliable for those purposes). Maybe it would be nice to detect if UWtype multiplication isn't too expensive (as in, implemented using libcall) and use shits + additions instead for that case.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2013-06-27 Jakub Jelinek <ja...@redhat.com> PR middle-end/36041 * libgcc2.c (POPCOUNTCST2, POPCOUNTCST4, POPCOUNTCST8, POPCOUNTCST): Define. (__popcountSI2): For __SIZEOF_INT__ > 2 targets use arithmetics instead of table lookups. (__popcountDI2): Likewise. --- libgcc/libgcc2.c.jj 2013-02-05 09:06:55.000000000 +0100 +++ libgcc/libgcc2.c 2013-06-27 08:59:12.319971540 +0200 @@ -819,17 +819,42 @@ const UQItype __popcount_tab[256] = }; #endif +#if defined(L_popcountsi2) || defined(L_popcountdi2) +#define POPCOUNTCST2(x) (((UWtype) x << BITS_PER_UNIT) | x) +#define POPCOUNTCST4(x) (((UWtype) x << (2 * BITS_PER_UNIT)) | x) +#define POPCOUNTCST8(x) (((UWtype) x << (4 * BITS_PER_UNIT)) | x) +#if W_TYPE_SIZE == BITS_PER_UNIT +#define POPCOUNTCST(x) x +#elif W_TYPE_SIZE == 2 * BITS_PER_UNIT +#define POPCOUNTCST(x) POPCOUNTCST2 (x) +#elif W_TYPE_SIZE == 4 * BITS_PER_UNIT +#define POPCOUNTCST(x) POPCOUNTCST4 (POPCOUNTCST2 (x)) +#elif W_TYPE_SIZE == 8 * BITS_PER_UNIT +#define POPCOUNTCST(x) POPCOUNTCST8 (POPCOUNTCST4 (POPCOUNTCST2 (x))) +#endif +#endif + #ifdef L_popcountsi2 #undef int int __popcountSI2 (UWtype x) { + /* Force table lookup on targets like AVR and RL78 which only + pretend they have LIBGCC2_UNITS_PER_WORD 4, but actually + have 1, and other small word targets. */ +#if __SIZEOF_INT__ > 2 && defined (POPCOUNTCST) && BITS_PER_UNIT == 8 + x = x - ((x >> 1) & POPCOUNTCST (0x55)); + x = (x & POPCOUNTCST (0x33)) + ((x >> 2) & POPCOUNTCST (0x33)); + x = (x + (x >> 4)) & POPCOUNTCST (0x0F); + return (x * POPCOUNTCST (0x01)) >> (W_TYPE_SIZE - BITS_PER_UNIT); +#else int i, ret = 0; for (i = 0; i < W_TYPE_SIZE; i += 8) ret += __popcount_tab[(x >> i) & 0xff]; return ret; +#endif } #endif @@ -838,12 +863,28 @@ __popcountSI2 (UWtype x) int __popcountDI2 (UDWtype x) { + /* Force table lookup on targets like AVR and RL78 which only + pretend they have LIBGCC2_UNITS_PER_WORD 4, but actually + have 1, and other small word targets. */ +#if __SIZEOF_INT__ > 2 && defined (POPCOUNTCST) && BITS_PER_UNIT == 8 + const DWunion uu = {.ll = x}; + UWtype x1 = uu.s.low, x2 = uu.s.high; + x1 = x1 - ((x1 >> 1) & POPCOUNTCST (0x55)); + x2 = x2 - ((x2 >> 1) & POPCOUNTCST (0x55)); + x1 = (x1 & POPCOUNTCST (0x33)) + ((x1 >> 2) & POPCOUNTCST (0x33)); + x2 = (x2 & POPCOUNTCST (0x33)) + ((x2 >> 2) & POPCOUNTCST (0x33)); + x1 = (x1 + (x1 >> 4)) & POPCOUNTCST (0x0F); + x2 = (x2 + (x2 >> 4)) & POPCOUNTCST (0x0F); + x1 += x2; + return (x1 * POPCOUNTCST (0x01)) >> (W_TYPE_SIZE - BITS_PER_UNIT); +#else int i, ret = 0; for (i = 0; i < 2*W_TYPE_SIZE; i += 8) ret += __popcount_tab[(x >> i) & 0xff]; return ret; +#endif } #endif Jakub