On Fri, 16 Dec 2005 06:21:02 +1100 Anton Blanchard <[EMAIL PROTECTED]> wrote:
> > Hi Stephen, > > > Replace cube root algorithim with a faster version using Newton-Raphson. > > Surprisingly, doing the scaled div64_64 is faster than a true 64 bit > > division on 64 bit CPU's. > > Interesting, what cpu was this on? Was there much difference between the > two methods? > > Anton Trivial little test program does old (original CUBIC), bisect (optimized version of same algorithm) math (use glibc cbrt ie floating point) newton (Newton's method in the patch) $ ./croot 10000000 Opteron 2.0 Ghz math 9671505106 1.230814 old 9676503359 1.287217 bisect 9666502089 1.074139 newton 9671507303 1.168087 Xeon (32bit) 2.4 Ghz math 9671505106 3.047887 old 9676503359 5.917374 bisect 9666502089 5.337120 newton 9671507303 2.433769 Pentium III 700 Mhz math 9671505106 7.038867 old 9676503359 9.055317 bisect 9666502089 7.904939 newton 9671507303 4.867410 -- Stephen Hemminger <[EMAIL PROTECTED]> OSDL http://developer.osdl.org/~shemminger
#include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <math.h> #include <sys/time.h> #define unlikely(x) __builtin_expect(!!(x), 0) #define fls64(x) generic_fls64(x) #ifdef i386 /* * do_div() is NOT a C function. It wants to return * two values (the quotient and the remainder), but * since that doesn't work very well in C, what it * does is: * * - modifies the 64-bit dividend _in_place_ * - returns the 32-bit remainder * * This ends up being the most efficient "calling * convention" on x86. */ #define do_div(n,base) ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ asm("":"=a" (__low), "=d" (__high):"A" (n)); \ __upper = __high; \ if (__high) { \ __upper = __high % (__base); \ __high = __high / (__base); \ } \ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ asm("":"=A" (n):"a" (__low),"d" (__high)); \ __mod; \ }) /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs. */ static inline int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return r+1; } #endif #ifdef __x86_64__ # define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ __rem = ((uint64_t)(n)) % __base; \ (n) = ((uint64_t)(n)) / __base; \ __rem; \ }) static __inline__ int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=r" (r) : "rm" (x), "r" (-1)); return r+1; } #undef fls64 static inline int fls64(uint64_t x) { if (x == 0) return 0; __asm__("bsrq %1,%0\n" : "=r" (x) : "rm" (x)); return x+1; } #endif /* * Find most significant bit in 64 bit word. * Similar to ffs(), bits are numbered so lsb is 1; and fls64(0) = 0 */ static inline int generic_fls64(uint64_t x) { uint32_t h = x >> 32; if (h) return fls(x) + 32; return fls(x); } /* 64bit divisor, dividend and result. dynamic precision */ static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) { u_int32_t d = divisor; if (divisor > 0xffffffffULL) { unsigned int shift = fls(divisor >> 32); d = divisor >> shift; dividend >>= shift; } if (dividend >> 32) do_div(dividend, d); else dividend = (uint32_t) dividend / d; return dividend; } static const uint64_t cubic_table[8] = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367}; /* * calculate the cubic root of x * the basic idea is that x can be expressed as i*8^j * so cubic_root(x) = cubic_root(i)*2^j * in the following code, x is i, and y is 2^j * because of integer calculation, there are errors in calculation * so finally use binary search to find out the exact solution */ static uint32_t old(uint64_t x) { uint64_t y, app, target, start, end, mid, start_diff, end_diff; if (x == 0) return 0; target = x; /* first estimate lower and upper bound */ y = 1; while (x >= 8){ x = (x >> 3); y = (y << 1); } start = (y*cubic_table[x])>>16; if (x==7) end = (y<<1); else end = (y*cubic_table[x+1]+65535)>>16; /* binary search for more accurate one */ while (start < end-1) { mid = (start+end) >> 1; app = mid*mid*mid; if (app < target) start = mid; else if (app > target) end = mid; else return mid; } /* find the most accurate one from start and end */ app = start*start*start; if (app < target) start_diff = target - app; else start_diff = app - target; app = end*end*end; if (app < target) end_diff = target - app; else end_diff = app - target; if (start_diff < end_diff) return start; else return end; } /* * calculate the cubic root of x based on bisection. */ static uint32_t bisect(uint64_t x) { int l; uint32_t lo, hi; if (unlikely(x == 0)) return 0; /* Initial estimate basede on: * cbrt(x) = exp(log(x)/3) * using log base 2 */ l = (fls64(x)-1)/3; lo = 1u << l; hi = 1u << (l+1); /* use bisection to choose more accurate point. */ do { uint64_t mid = (lo + hi) / 2; uint64_t app = mid * mid * mid;; if (app == x) break; if (app < x) lo = mid; if (app > x) hi = mid; } while (hi - lo > 2); return lo; } /* Compute cube root of a number, using newton-raphson method. */ static uint32_t newton(uint64_t a) { uint32_t x, x1; /* Initial estimate is based on: * cbrt(x) = exp(log(x) * 1/3) */ x = 1u << (fls64(a)/3); /* * Iteration based on: * 2 * x = ( 2 * x + a / x ) / 3 * k+1 k k */ do { x1 = x; x = ( 2 * x + (uint32_t) div64_64(a, x*x)) / 3; } while ( abs(x1 - x) > 1); return x; } int main(int argc, char **argv) { unsigned N = 100000; int i; uint64_t msum, osum, nsum; struct timeval t0, t1, to, tm, tn; if (argc > 1) N = strtoul(argv[1], NULL, 10); srandom(0); msum = 0; gettimeofday(&t0, NULL); for (i = 0; i < N; i++) { uint64_t r = random(); msum += cbrt((double) r); } gettimeofday(&t1, NULL); timersub(&t1, &t0, &tm); printf("math %llu %d.%06d\n", (unsigned long long) msum, (int) tm.tv_sec, (int)tm.tv_usec); srandom(0); osum = 0; gettimeofday(&t0, NULL); for (i = 0; i < N; i++) { uint64_t r = random(); osum += old(r); } gettimeofday(&t1, NULL); timersub(&t1, &t0, &to); printf("old %llu %d.%06d\n", (unsigned long long) osum, (int) to.tv_sec, (int) to.tv_usec); srandom(0); osum = 0; gettimeofday(&t0, NULL); for (i = 0; i < N; i++) { uint64_t r = random(); osum += bisect(r); } gettimeofday(&t1, NULL); timersub(&t1, &t0, &to); printf("bisect %llu %d.%06d\n", (unsigned long long) osum, (int) to.tv_sec, (int) to.tv_usec); nsum = 0; srandom(0); gettimeofday(&t0, NULL); for (i = 0; i < N; i++) { uint64_t r = random(); nsum += newton(r); } gettimeofday(&t1, NULL); timersub(&t1, &t0, &tn); printf("newton %llu %d.%06d\n", (unsigned long long) nsum, (int) tn.tv_sec, (int) tn.tv_usec); return 0; }