On Mon, Sep 24, 2012 at 2:35 PM, George Spelvin <li...@horizon.com> wrote: >> Here is the comparison of the x86-32 assembly >> of the fragment which does "x / 10000" thing, >> before and after the patch: > >> -01 c6 add %eax,%esi >> -b8 59 17 b7 d1 mov $0xd1b71759,%eax >> -f7 e6 mul %esi >> -89 d3 mov %edx,%ebx >> -89 f2 mov %esi,%edx >> -c1 eb 0d shr $0xd,%ebx >> >> +01 c7 add %eax,%edi >> +b8 d7 c5 6d 34 mov $0x346dc5d7,%eax >> +f7 e7 mul %edi >> +89 55 e8 mov %edx,-0x18(%ebp) >> +8b 5d e8 mov -0x18(%ebp),%ebx >> +89 fa mov %edi,%edx >> +89 45 e4 mov %eax,-0x1c(%ebp) >> +c1 eb 0b shr $0xb,%ebx >> >> Poor gcc got confused, and generated somewhat >> worse code (spilling and immediately reloading upper >> part of 32x32->64 multiply). > >> Please test and benchmark your changes to this code >> before submitting them. > > Thanks for the feedback! It very much *was* intended to start a > conversation with you, but the 7 week response delay somewhat interfered > with that process. > > I was playing with it on ARM, where the results are a bit different. > > As you can see, it fell out of some other word which *did* make a > useful difference. I just hadn't tested this change in isolation,
Please find attached source of test program. You need to touch test_new.c (or make a few copies of it to experiment with different versions of code). test_header.c and test_main.c contain benchmarking code and need not be modified. It also includes a verification step, which would catch the bugs you had in your patches. Usage: $ gcc [ --static] [-m32] -O2 -Wall test_new.c -otest_new $ ./test_new Conversions per second: 8:59964000 123:48272000 123456:37852000 12345678:34216000 123456789:23528000 2^32:23520000 2^64:17616000 Conversions per second: 8:60092000 123:48536000 123456:37836000 12345678:33924000 123456789:23580000 2^32:23372000 2^64:17608000 Conversions per second: 8:60084000 123:48396000 123456:37840000 12345678:34192000 123456789:23564000 2^32:23484000 2^64:17612000 Conversions per second: 8:60108000 123:48500000 123456:37872000 12345678:33996000 123456789:23576000 2^32:23524000 2^64:17612000 Tested 14680064 ^C ^^^^^^^^^^^^^^^^^^^^^^^^ tests correctness until user interrupts it $
#include <features.h> #include <inttypes.h> #include <stdint.h> #include <stdbool.h> #include <sys/types.h> /* size_t */ #include <limits.h> #include <unistd.h> #include <stdlib.h> #include <stdarg.h> #include <stddef.h> #include <ctype.h> #include <string.h> #include <errno.h> #include <stdio.h> /* for FILE */ #include <sys/time.h> #include <time.h> #define noinline_for_stack __attribute__((noinline)) typedef uint8_t u8; typedef int16_t s16; /* * do_div() is NOT a C function. It wants to return * two values (the quotient and the remainder), but * since that doesn't work very well in C, what it * does is: * * - modifies the 64-bit dividend _in_place_ * - returns the 32-bit remainder * * This ends up being the most efficient "calling * convention" on x86. */ #if LONG_MAX == ((1U << 31)-1) #define do_div(n, base) \ ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ { \ asm("" : "=a" (__low), "=d" (__high) : "A" (n));\ __upper = __high; \ if (__high) { \ __upper = __high % (__base); \ __high = __high / (__base); \ } \ asm("divl %2" : "=a" (__low), "=d" (__mod) \ : "rm" (__base), "0" (__low), "1" (__upper)); \ asm("" : "=A" (n) : "a" (__low), "d" (__high)); \ } \ __mod; \ }) #elif LONG_MAX > ((1U << 31)-1) #define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ __rem = ((uint64_t)(n)) % __base; \ (n) = ((uint64_t)(n)) / __base; \ __rem; \ }) #endif
static const struct printf_spec dummy_spec = { .type = FORMAT_TYPE_LONG_LONG, .flags = 0, .base = 10, .qualifier = 0, .field_width = 0, .precision = 0, }; #define REPS (4000) static unsigned measure_number(time_t* tp, unsigned long long num) { char buf[64]; time_t t, n; int i; unsigned count; t = *tp; count = 0; while (t == (n = time(NULL))) { for (i = REPS; --i >= 0;) number(buf, buf + 63, num, dummy_spec)[0] = '\0'; count += REPS; } *tp = n; return count; } static void measure() { time_t t, n; unsigned count1, count2, count3, count4, count5, count6, count7; t = time(NULL); while (t == (n = time(NULL))) continue; t = n; count1 = measure_number(&t, 8); count2 = measure_number(&t, 123); count3 = measure_number(&t, 123456); count4 = measure_number(&t, 12345678); count5 = measure_number(&t, 123456789); count6 = measure_number(&t, (1ULL << 32) - 1); count7 = measure_number(&t, (0ULL - 1)); printf("Conversions per second: 8:%d 123:%d 123456:%d 12345678:%d 123456789:%d 2^32:%d 2^64:%d\n", count1, count2, count3, count4, count5, count6, count7 ); } static void check(unsigned long long v) { char buf1[64]; char buf2[64]; number(buf1, buf1 + 63, v, dummy_spec)[0] = '\0'; sprintf(buf2, "%llu", v); if (strcmp(buf1, buf2) != 0) { printf("Error in formatting %llu:'%s'\n", v, buf1); exit(1); } } int main() { measure();measure(); measure();measure(); //measure();measure(); unsigned long long i; for (i = 0; ; i++) { check(i); // check(((1ULL << 32) - 1) + i); // check(((1ULL << 32) - 1) - i); check(0ULL - i); // check(i * i); // check(i * i * i); if (!(i & 0x3ffff)) { printf("\rTested %llu ", i); fflush(NULL); } } return 0; }
#include "test_header.c" /* Decimal conversion is by far the most typical, and is used * for /proc and /sys data. This directly impacts e.g. top performance * with many processes running. We optimize it for speed * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html> * (with permission from the author, Douglas W. Jones). */ #if LONG_MAX > ((1UL<<31)-1) || LLONG_MAX > ((1ULL<<63)-1) /* Formats correctly any integer in [0, 999999999] */ static noinline_for_stack char *put_dec_full9(char *buf, unsigned q) { unsigned r; /* Possible ways to approx. divide by 10 * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit) * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul) * (x * 0x6667) >> 18 x < 43699 * (x * 0x3334) >> 17 x < 16389 * (x * 0x199a) >> 16 x < 16389 * (x * 0x0ccd) >> 15 x < 16389 * (x * 0x0667) >> 14 x < 2739 * (x * 0x0334) >> 13 x < 1029 * (x * 0x019a) >> 12 x < 1029 * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386) * (x * 0x0067) >> 10 x < 179 * (x * 0x0034) >> 9 x < 69 same * (x * 0x001a) >> 8 x < 69 same * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386) * (x * 0x0007) >> 6 x < 19 * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html> */ r = (q * (uint64_t)0x1999999a) >> 32; *buf++ = (q - 10 * r) + '0'; /* 1 */ q = (r * (uint64_t)0x1999999a) >> 32; *buf++ = (r - 10 * q) + '0'; /* 2 */ r = (q * (uint64_t)0x1999999a) >> 32; *buf++ = (q - 10 * r) + '0'; /* 3 */ q = (r * (uint64_t)0x1999999a) >> 32; *buf++ = (r - 10 * q) + '0'; /* 4 */ r = (q * (uint64_t)0x1999999a) >> 32; *buf++ = (q - 10 * r) + '0'; /* 5 */ /* Now value is under 10000, can avoid 64-bit multiply */ q = (r * 0x199a) >> 16; *buf++ = (r - 10 * q) + '0'; /* 6 */ r = (q * 0xcd) >> 11; *buf++ = (q - 10 * r) + '0'; /* 7 */ q = (r * 0xcd) >> 11; *buf++ = (r - 10 * q) + '0'; /* 8 */ *buf++ = q + '0'; /* 9 */ return buf; } #endif /* Similar to above but do not pad with zeros. * Code can be easily arranged to print 9 digits too, but our callers * always call put_dec_full9() instead when the number has 9 decimal digits. */ static noinline_for_stack char *put_dec_trunc8(char *buf, unsigned r) { unsigned q; /* Copy of previous function's body with added early returns */ q = (r * (uint64_t)0x1999999a) >> 32; *buf++ = (r - 10 * q) + '0'; /* 2 */ if (q == 0) return buf; r = (q * (uint64_t)0x1999999a) >> 32; *buf++ = (q - 10 * r) + '0'; /* 3 */ if (r == 0) return buf; q = (r * (uint64_t)0x1999999a) >> 32; *buf++ = (r - 10 * q) + '0'; /* 4 */ if (q == 0) return buf; r = (q * (uint64_t)0x1999999a) >> 32; *buf++ = (q - 10 * r) + '0'; /* 5 */ if (r == 0) return buf; q = (r * 0x199a) >> 16; *buf++ = (r - 10 * q) + '0'; /* 6 */ if (q == 0) return buf; r = (q * 0xcd) >> 11; *buf++ = (q - 10 * r) + '0'; /* 7 */ if (r == 0) return buf; q = (r * 0xcd) >> 11; *buf++ = (r - 10 * q) + '0'; /* 8 */ if (q == 0) return buf; *buf++ = q + '0'; /* 9 */ return buf; } /* There are two algorithms to print larger numbers. * One is generic: divide by 1000000000 and repeatedly print * groups of (up to) 9 digits. It's conceptually simple, * but requires a (unsigned long long) / 1000000000 division. * * Second algorithm splits 64-bit unsigned long long into 16-bit chunks, * manipulates them cleverly and generates groups of 4 decimal digits. * It so happens that it does NOT require long long division. * * If long is > 32 bits, division of 64-bit values is relatively easy, * and we will use the first algorithm. * If long long is > 64 bits (strange architecture with VERY large long long), * second algorithm can't be used, and we again use the first one. * * Else (if long is 32 bits and long long is 64 bits) we use second one. */ #if LONG_MAX > ((1UL<<31)-1) || LLONG_MAX > ((1ULL<<63)-1) /* First algorithm: generic */ static char *put_dec(char *buf, unsigned long long n) { if (n >= 100*1000*1000) { while (n >= 1000*1000*1000) buf = put_dec_full9(buf, do_div(n, 1000*1000*1000)); if (n >= 100*1000*1000) return put_dec_full9(buf, n); } return put_dec_trunc8(buf, n); } #else /* Second algorithm: valid only for 32-bit longs, 64-bit long longs */ static noinline_for_stack char *put_dec_full4(char *buf, unsigned q) { unsigned r; r = (q * 0xcccd) >> 19; *buf++ = (q - 10 * r) + '0'; q = (r * 0x199a) >> 16; *buf++ = (r - 10 * q) + '0'; r = (q * 0xcd) >> 11; *buf++ = (q - 10 * r) + '0'; *buf++ = r + '0'; return buf; } /* Based on code by Douglas W. Jones found at * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour> * (with permission from the author). * Performs no 64-bit division and hence should be fast on 32-bit machines. */ static char *put_dec(char *buf, unsigned long long n) { uint32_t d3, d2, d1, q, h; if (n < 100*1000*1000) return put_dec_trunc8(buf, n); d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */ h = (n >> 32); d2 = (h ) & 0xffff; d3 = (h >> 16); /* implicit "& 0xffff" */ q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); buf = put_dec_full4(buf, q % 10000); q = q / 10000; d1 = q + 7671 * d3 + 9496 * d2 + 6 * d1; buf = put_dec_full4(buf, d1 % 10000); q = d1 / 10000; d2 = q + 4749 * d3 + 42 * d2; buf = put_dec_full4(buf, d2 % 10000); q = d2 / 10000; d3 = q + 281 * d3; if (!d3) goto done; buf = put_dec_full4(buf, d3 % 10000); q = d3 / 10000; if (!q) goto done; buf = put_dec_full4(buf, q); done: while (buf[-1] == '0') --buf; return buf; } #endif /* * Convert passed number to decimal string. * Returns the length of string. On buffer overflow, returns 0. * * If speed is not important, use snprintf(). It's easy to read the code. */ int num_to_str(char *buf, int size, unsigned long long num) { char tmp[sizeof(num) * 3]; int idx, len; /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ if (num <= 9) { tmp[0] = '0' + num; len = 1; } else { len = put_dec(tmp, num) - tmp; } if (len > size) return 0; for (idx = 0; idx < len; ++idx) buf[idx] = tmp[len - idx - 1]; return len; } #define ZEROPAD 1 /* pad with zero */ #define SIGN 2 /* unsigned/signed long */ #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define LEFT 16 /* left justified */ #define SMALL 32 /* use lowercase in hex (must be 32 == 0x20) */ #define SPECIAL 64 /* prefix hex with "0x", octal with "0" */ enum format_type { FORMAT_TYPE_NONE, /* Just a string part */ FORMAT_TYPE_WIDTH, FORMAT_TYPE_PRECISION, FORMAT_TYPE_CHAR, FORMAT_TYPE_STR, FORMAT_TYPE_PTR, FORMAT_TYPE_PERCENT_CHAR, FORMAT_TYPE_INVALID, FORMAT_TYPE_LONG_LONG, FORMAT_TYPE_ULONG, FORMAT_TYPE_LONG, FORMAT_TYPE_UBYTE, FORMAT_TYPE_BYTE, FORMAT_TYPE_USHORT, FORMAT_TYPE_SHORT, FORMAT_TYPE_UINT, FORMAT_TYPE_INT, FORMAT_TYPE_NRCHARS, FORMAT_TYPE_SIZE_T, FORMAT_TYPE_PTRDIFF }; struct printf_spec { u8 type; /* format_type enum */ u8 flags; /* flags to number() */ u8 base; /* number base, 8, 10 or 16 only */ u8 qualifier; /* number qualifier, one of 'hHlLtzZ' */ s16 field_width; /* width of output field */ s16 precision; /* # of digits/chars */ }; static noinline_for_stack char *number(char *buf, char *end, unsigned long long num, struct printf_spec spec) { /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ char tmp[66]; char sign; char locase; int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); int i; /* locase = 0 or 0x20. ORing digits or letters with 'locase' * produces same digits or (maybe lowercased) letters */ locase = (spec.flags & SMALL); if (spec.flags & LEFT) spec.flags &= ~ZEROPAD; sign = 0; if (spec.flags & SIGN) { if ((signed long long)num < 0) { sign = '-'; num = -(signed long long)num; spec.field_width--; } else if (spec.flags & PLUS) { sign = '+'; spec.field_width--; } else if (spec.flags & SPACE) { sign = ' '; spec.field_width--; } } if (need_pfx) { spec.field_width--; if (spec.base == 16) spec.field_width--; } /* generate full string in tmp[], in reverse order */ i = 0; if (num < 8) tmp[i++] = '0' + num; /* Generic code, for any base: else do { tmp[i++] = (digits[do_div(num,base)] | locase); } while (num != 0); */ else if (spec.base != 10) { /* 8 or 16 */ int mask = spec.base - 1; int shift = 3; if (spec.base == 16) shift = 4; do { tmp[i++] = (digits[((unsigned char)num) & mask] | locase); num >>= shift; } while (num); } else { /* base 10 */ i = put_dec(tmp, num) - tmp; } /* printing 100 using %2d gives "100", not "00" */ if (i > spec.precision) spec.precision = i; /* leading space padding */ spec.field_width -= spec.precision; if (!(spec.flags & (ZEROPAD+LEFT))) { while (--spec.field_width >= 0) { if (buf < end) *buf = ' '; ++buf; } } /* sign */ if (sign) { if (buf < end) *buf = sign; ++buf; } /* "0x" / "0" prefix */ if (need_pfx) { if (buf < end) *buf = '0'; ++buf; if (spec.base == 16) { if (buf < end) *buf = ('X' | locase); ++buf; } } /* zero or space padding */ if (!(spec.flags & LEFT)) { char c = (spec.flags & ZEROPAD) ? '0' : ' '; while (--spec.field_width >= 0) { if (buf < end) *buf = c; ++buf; } } /* hmm even more zero padding? */ while (i <= --spec.precision) { if (buf < end) *buf = '0'; ++buf; } /* actual digits of result */ while (--i >= 0) { if (buf < end) *buf = tmp[i]; ++buf; } /* trailing space padding */ while (--spec.field_width >= 0) { if (buf < end) *buf = ' '; ++buf; } return buf; } #include "test_main.c"