cube root benchmark code

Stephen Hemminger Tue, 06 Mar 2007 15:03:22 -0800

Here is a better version of the benchmark code.
It has the original code used in 2.4 version of Cubic for comparison


-----------------------------------------------------------
/* Test and measure perf of cube root algorithms.  */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <unistd.h>

#ifdef __x86_64

#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

# define do_div(n,base) ({                                      \
        uint32_t __base = (base);                               \
        uint32_t __rem;                                         \
        __rem = ((uint64_t)(n)) % __base;                       \
        (n) = ((uint64_t)(n)) / __base;                         \
        __rem;                                                  \
 })


/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static __inline__ unsigned long __ffs(unsigned long word)
{
        __asm__("bsfq %1,%0"
                :"=r" (word)
                :"rm" (word));
        return word;
}

/*
 * __fls: find last bit set.
 * @word: The word to search
 *
 * Undefined if no zero exists, so code should check against ~0UL first.
 */
static inline unsigned long __fls(unsigned long word)
{
        __asm__("bsrq %1,%0"
                :"=r" (word)
                :"rm" (word));
        return word;
}

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz (man ffs).
 */
static __inline__ int ffs(int x)
{
        int r;

        __asm__("bsfl %1,%0\n\t"
                "cmovzl %2,%0" 
                : "=r" (r) : "rm" (x), "r" (-1));
        return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs.
 */
static inline int fls(int x)
{
        int r;

        __asm__("bsrl %1,%0\n\t"
                "cmovzl %2,%0"
                : "=&r" (r) : "rm" (x), "rm" (-1));
        return r+1;
}

/**
 * fls64 - find last bit set in 64 bit word
 * @x: the word to search
 *
 * This is defined the same way as fls.
 */
static inline int fls64(uint64_t x)
{
        if (x == 0)
                return 0;
        return __fls(x) + 1;
}

static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
        return dividend / divisor;
}

#elif __i386

#define rdtscll(val) \
     __asm__ __volatile__("rdtsc" : "=A" (val))

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz() (man ffs).
 */
static inline int ffs(int x)
{
        int r;

        __asm__("bsfl %1,%0\n\t"
                "jnz 1f\n\t"
                "movl $-1,%0\n"
                "1:" : "=r" (r) : "rm" (x));
        return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs().
 */
static inline int fls(int x)
{
        int r;

        __asm__("bsrl %1,%0\n\t"
                "jnz 1f\n\t"
                "movl $-1,%0\n"
                "1:" : "=r" (r) : "rm" (x));
        return r+1;
}

static inline int fls64(uint64_t x)
{
        uint32_t h = x >> 32;
        if (h)
                return fls(h) + 32;
        return fls(x);
}


#define do_div(n,base) ({ \
        unsigned long __upper, __low, __high, __mod, __base; \
        __base = (base); \
        asm("":"=a" (__low), "=d" (__high):"A" (n)); \
        __upper = __high; \
        if (__high) { \
                __upper = __high % (__base); \
                __high = __high / (__base); \
        } \
        asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), 
"1" (__upper)); \
        asm("":"=A" (n):"a" (__low),"d" (__high)); \
        __mod; \
})


/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
        uint32_t d = divisor;

        if (divisor > 0xffffffffULL) {
                unsigned int shift = fls(divisor >> 32);

                d = divisor >> shift;
                dividend >>= shift;
        }

        /* avoid 64 bit division if possible */
        if (dividend >> 32)
                do_div(dividend, d);
        else
                dividend = (uint32_t) dividend / d;

        return dividend;
}
#endif

/* Andi Kleen's version */
uint32_t acbrt(uint64_t x)
{
        uint32_t y = 0;
        int s;

        for (s = 63; s >= 0; s -= 3) {
                uint64_t b, bs;

                y = 2 * y;
                b = 3 * y * (y+1) + 1;
                bs = b << s;
                if (x >= bs && (b == (bs>>s))) {  /* avoid overflow */
                        x -= bs;
                        y++;
                }
        }
        return y;
}

/* My version of hacker's delight */
uint32_t hcbrt(uint64_t x)
{
        int s = 60;
        uint32_t y = 0;

        do {
                uint64_t b;
                y = 2*y;
                b = (uint64_t)(3*y*(y + 1) + 1) << s;
                s = s - 3;
                if (x >= b) {
                        x = x - b;
                        y = y + 1;
                }
        } while(s >= 0);

        return y;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ocubic(uint64_t a)
{
        uint32_t x, x1;

        /* Initial estimate is based on:
         * cbrt(x) = exp(log(x) / 3)
         */
        x = 1u << (fls64(a)/3);

        /*
         * Iteration based on:
         *                         2
         * x    = ( 2 * x  +  a / x  ) / 3
         *  k+1          k         k
         */
        do {
                x1 = x;

                x = (2 * x + div64_64(a, (uint64_t)x * x)) / 3;
        } while (abs(x1 - x) > 1);

        return x;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic(uint64_t a)
{
        uint64_t x;

        /* Initial estimate is based on:
         * cbrt(x) = exp(log(x) / 3)
         */
        x = 1u << (fls64(a)/3);

        /* Converges in 3 iterations to > 32 bits */
        x = (2 * x + div64_64(a, x*x)) / 3;
        x = (2 * x + div64_64(a, x*x)) / 3;
        x = (2 * x + div64_64(a, x*x)) / 3;

        return x;
}

/* 65536 times the cubic root of 0,    1,     2,     3,      4,      5,      6, 
     7*/
static uint64_t bictcp_table[8] = {0, 65536, 82570, 94519, 104030, 112063, 
119087, 125367};

/* calculate the cubic root of x
   the basic idea is that x can be expressed as i*8^j
   so cubic_root(x) = cubic_root(i)*2^j
   in the following code, x is i, and y is 2^j
   because of integer calculation, there are errors in calculation
   so finally use binary search to find out the exact solution*/
static uint32_t bictcp(uint64_t x)
{
        uint64_t y, app, target, start, end, mid, start_diff, end_diff;

        if (x == 0)
                return 0;

        target = x;

        /*first estimate lower and upper bound*/
        y = 1;
        while (x >= 8){
                x = (x >> 3);
                y = (y << 1);
        }
        start = (y*bictcp_table[x])>>16;
        if (x==7)
                end = (y<<1);
        else
                end = (y*bictcp_table[x+1]+65535)>>16;

        /*binary search for more accurate one*/
        while (start < end-1) {
                mid = (start+end) >> 1;
                app = mid*mid*mid;
                if (app < target)
                        start = mid;
                else if (app > target)
                        end = mid;
                else
                        return mid;
        }

        /*find the most accurate one from start and end*/
        app = start*start*start;
        if (app < target)
                start_diff = target - app;
        else
                start_diff = app - target;
        app = end*end*end;
        if (app < target)
                end_diff = target - app;
        else
                end_diff = app - target;

        return (start_diff < end_diff) ? start : end;
}


#define NCASES 1000
static uint64_t cases[NCASES];
static double results[NCASES];

static double ticks_per_usec;
static unsigned long long start, end;

static void dotest(const char *name, uint32_t (*func)(uint64_t))
{
        int i;
        unsigned long long t, mx = 0, sum = 0, sum_sq = 0;
        double mean, std, err = 0;

        for (i = 0; i < NCASES; i++) {
                uint64_t x = cases[i];
                uint32_t v;

                rdtscll(start);
                v = (*func)(x);
                rdtscll(end);

                t = end - start;
                if (t > mx) mx = t;
                sum += t; sum_sq += t*t;

                err += fabs(((double) v - results[i]) / results[i]);
        }

        mean = (double) sum / ticks_per_usec / NCASES ;
        std = sqrtl( (double) sum_sq / ticks_per_usec / NCASES - mean * mean);

        printf("%-10s %8llu %8.2f %8.2f %8.2f %.03f%%\n", name, 
               (unsigned long long) sum / NCASES, mean, std, 
               (double) mx / ticks_per_usec, err * 100./ NCASES);
}


int main(int argc, char **argv)
{
        uint64_t x;
        int i;

        printf("Calibrating\n");
        rdtscll(start);
        sleep(2);
        rdtscll(end);
        ticks_per_usec = (double) (end - start) / 2000000.;

        for (i = 0; i < 63; i++) 
                cases[i] = 1ull << i;
        x = ~0;
        while (x != 0) {
                cases[i++] = x;
                x >>= 1;
        }
        x = ~0;
        while (x != 0) {
                cases[i++] = x;
                x <<= 1;
        }

        while (i < NCASES)
                cases[i++] = (uint64_t) random()  * (uint64_t) random();

        for (i = 0; i < NCASES; i++) 
                results[i] = cbrt((double)cases[i]);

        printf("Function     clocks  mean(us) max(us)  std(us)  Avg error\n");
        
#define DOTEST(x)       dotest(#x, x)
        DOTEST(bictcp);
        DOTEST(ocubic);
        DOTEST(ncubic);
        DOTEST(acbrt);
        DOTEST(hcbrt);
        return 0;
}
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

cube root benchmark code

Reply via email to