Here is a better version of the benchmark code. It has the original code used in 2.4 version of Cubic for comparison
----------------------------------------------------------- /* Test and measure perf of cube root algorithms. */ #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <math.h> #include <unistd.h> #ifdef __x86_64 #define rdtscll(val) do { \ unsigned int __a,__d; \ asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ } while(0) # define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ __rem = ((uint64_t)(n)) % __base; \ (n) = ((uint64_t)(n)) / __base; \ __rem; \ }) /** * __ffs - find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static __inline__ unsigned long __ffs(unsigned long word) { __asm__("bsfq %1,%0" :"=r" (word) :"rm" (word)); return word; } /* * __fls: find last bit set. * @word: The word to search * * Undefined if no zero exists, so code should check against ~0UL first. */ static inline unsigned long __fls(unsigned long word) { __asm__("bsrq %1,%0" :"=r" (word) :"rm" (word)); return word; } /** * ffs - find first bit set * @x: the word to search * * This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ static __inline__ int ffs(int x) { int r; __asm__("bsfl %1,%0\n\t" "cmovzl %2,%0" : "=r" (r) : "rm" (x), "r" (-1)); return r+1; } /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs. */ static inline int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); return r+1; } /** * fls64 - find last bit set in 64 bit word * @x: the word to search * * This is defined the same way as fls. */ static inline int fls64(uint64_t x) { if (x == 0) return 0; return __fls(x) + 1; } static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor) { return dividend / divisor; } #elif __i386 #define rdtscll(val) \ __asm__ __volatile__("rdtsc" : "=A" (val)) /** * ffs - find first bit set * @x: the word to search * * This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz() (man ffs). */ static inline int ffs(int x) { int r; __asm__("bsfl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return r+1; } /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs(). */ static inline int fls(int x) { int r; __asm__("bsrl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return r+1; } static inline int fls64(uint64_t x) { uint32_t h = x >> 32; if (h) return fls(h) + 32; return fls(x); } #define do_div(n,base) ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ asm("":"=a" (__low), "=d" (__high):"A" (n)); \ __upper = __high; \ if (__high) { \ __upper = __high % (__base); \ __high = __high / (__base); \ } \ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ asm("":"=A" (n):"a" (__low),"d" (__high)); \ __mod; \ }) /* 64bit divisor, dividend and result. dynamic precision */ static uint64_t div64_64(uint64_t dividend, uint64_t divisor) { uint32_t d = divisor; if (divisor > 0xffffffffULL) { unsigned int shift = fls(divisor >> 32); d = divisor >> shift; dividend >>= shift; } /* avoid 64 bit division if possible */ if (dividend >> 32) do_div(dividend, d); else dividend = (uint32_t) dividend / d; return dividend; } #endif /* Andi Kleen's version */ uint32_t acbrt(uint64_t x) { uint32_t y = 0; int s; for (s = 63; s >= 0; s -= 3) { uint64_t b, bs; y = 2 * y; b = 3 * y * (y+1) + 1; bs = b << s; if (x >= bs && (b == (bs>>s))) { /* avoid overflow */ x -= bs; y++; } } return y; } /* My version of hacker's delight */ uint32_t hcbrt(uint64_t x) { int s = 60; uint32_t y = 0; do { uint64_t b; y = 2*y; b = (uint64_t)(3*y*(y + 1) + 1) << s; s = s - 3; if (x >= b) { x = x - b; y = y + 1; } } while(s >= 0); return y; } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ocubic(uint64_t a) { uint32_t x, x1; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* * Iteration based on: * 2 * x = ( 2 * x + a / x ) / 3 * k+1 k k */ do { x1 = x; x = (2 * x + div64_64(a, (uint64_t)x * x)) / 3; } while (abs(x1 - x) > 1); return x; } /* calculate the cubic root of x using Newton-Raphson */ static uint32_t ncubic(uint64_t a) { uint64_t x; /* Initial estimate is based on: * cbrt(x) = exp(log(x) / 3) */ x = 1u << (fls64(a)/3); /* Converges in 3 iterations to > 32 bits */ x = (2 * x + div64_64(a, x*x)) / 3; x = (2 * x + div64_64(a, x*x)) / 3; x = (2 * x + div64_64(a, x*x)) / 3; return x; } /* 65536 times the cubic root of 0, 1, 2, 3, 4, 5, 6, 7*/ static uint64_t bictcp_table[8] = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367}; /* calculate the cubic root of x the basic idea is that x can be expressed as i*8^j so cubic_root(x) = cubic_root(i)*2^j in the following code, x is i, and y is 2^j because of integer calculation, there are errors in calculation so finally use binary search to find out the exact solution*/ static uint32_t bictcp(uint64_t x) { uint64_t y, app, target, start, end, mid, start_diff, end_diff; if (x == 0) return 0; target = x; /*first estimate lower and upper bound*/ y = 1; while (x >= 8){ x = (x >> 3); y = (y << 1); } start = (y*bictcp_table[x])>>16; if (x==7) end = (y<<1); else end = (y*bictcp_table[x+1]+65535)>>16; /*binary search for more accurate one*/ while (start < end-1) { mid = (start+end) >> 1; app = mid*mid*mid; if (app < target) start = mid; else if (app > target) end = mid; else return mid; } /*find the most accurate one from start and end*/ app = start*start*start; if (app < target) start_diff = target - app; else start_diff = app - target; app = end*end*end; if (app < target) end_diff = target - app; else end_diff = app - target; return (start_diff < end_diff) ? start : end; } #define NCASES 1000 static uint64_t cases[NCASES]; static double results[NCASES]; static double ticks_per_usec; static unsigned long long start, end; static void dotest(const char *name, uint32_t (*func)(uint64_t)) { int i; unsigned long long t, mx = 0, sum = 0, sum_sq = 0; double mean, std, err = 0; for (i = 0; i < NCASES; i++) { uint64_t x = cases[i]; uint32_t v; rdtscll(start); v = (*func)(x); rdtscll(end); t = end - start; if (t > mx) mx = t; sum += t; sum_sq += t*t; err += fabs(((double) v - results[i]) / results[i]); } mean = (double) sum / ticks_per_usec / NCASES ; std = sqrtl( (double) sum_sq / ticks_per_usec / NCASES - mean * mean); printf("%-10s %8llu %8.2f %8.2f %8.2f %.03f%%\n", name, (unsigned long long) sum / NCASES, mean, std, (double) mx / ticks_per_usec, err * 100./ NCASES); } int main(int argc, char **argv) { uint64_t x; int i; printf("Calibrating\n"); rdtscll(start); sleep(2); rdtscll(end); ticks_per_usec = (double) (end - start) / 2000000.; for (i = 0; i < 63; i++) cases[i] = 1ull << i; x = ~0; while (x != 0) { cases[i++] = x; x >>= 1; } x = ~0; while (x != 0) { cases[i++] = x; x <<= 1; } while (i < NCASES) cases[i++] = (uint64_t) random() * (uint64_t) random(); for (i = 0; i < NCASES; i++) results[i] = cbrt((double)cases[i]); printf("Function clocks mean(us) max(us) std(us) Avg error\n"); #define DOTEST(x) dotest(#x, x) DOTEST(bictcp); DOTEST(ocubic); DOTEST(ncubic); DOTEST(acbrt); DOTEST(hcbrt); return 0; } - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html