On Fri, 16 Dec 2005 06:21:02 +1100
Anton Blanchard <[EMAIL PROTECTED]> wrote:

> 
> Hi Stephen,
> 
> > Replace cube root algorithim with a faster version using Newton-Raphson.
> > Surprisingly, doing the scaled div64_64 is faster than a true 64 bit
> > division on 64 bit CPU's.
> 
> Interesting, what cpu was this on? Was there much difference between the
> two methods?
> 
> Anton

Trivial little test program does 
old (original CUBIC),
bisect (optimized version of same algorithm)
math (use glibc cbrt ie floating point)
newton (Newton's method in the patch)

$ ./croot 10000000

Opteron 2.0 Ghz
math 9671505106 1.230814
old 9676503359 1.287217
bisect 9666502089 1.074139
newton 9671507303 1.168087

Xeon (32bit) 2.4 Ghz
math 9671505106 3.047887
old 9676503359 5.917374
bisect 9666502089 5.337120
newton 9671507303 2.433769

Pentium III 700 Mhz
math 9671505106 7.038867
old 9676503359 9.055317
bisect 9666502089 7.904939
newton 9671507303 4.867410


-- 
Stephen Hemminger <[EMAIL PROTECTED]>
OSDL http://developer.osdl.org/~shemminger
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <sys/time.h>

#define unlikely(x)	__builtin_expect(!!(x), 0)
#define fls64(x)	generic_fls64(x)

#ifdef i386
 /*
 * do_div() is NOT a C function. It wants to return
 * two values (the quotient and the remainder), but
 * since that doesn't work very well in C, what it
 * does is:
 *
 * - modifies the 64-bit dividend _in_place_
 * - returns the 32-bit remainder
 *
 * This ends up being the most efficient "calling
 * convention" on x86.
 */
#define do_div(n,base) ({ \
	unsigned long __upper, __low, __high, __mod, __base; \
	__base = (base); \
	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
	__upper = __high; \
	if (__high) { \
		__upper = __high % (__base); \
		__high = __high / (__base); \
	} \
	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
	asm("":"=A" (n):"a" (__low),"d" (__high)); \
	__mod; \
})

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs.
 */
static inline int fls(int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $-1,%0\n"
		"1:" : "=r" (r) : "rm" (x));
	return r+1;
}

#endif

#ifdef __x86_64__
# define do_div(n,base) ({					\
	uint32_t __base = (base);				\
	uint32_t __rem;						\
	__rem = ((uint64_t)(n)) % __base;			\
	(n) = ((uint64_t)(n)) / __base;				\
	__rem;							\
 })

static __inline__ int fls(int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"cmovzl %2,%0"
		: "=r" (r) : "rm" (x), "r" (-1));
	return r+1;
}
#undef fls64

static inline int fls64(uint64_t x)
{
	if (x == 0)
		return 0;

	__asm__("bsrq %1,%0\n"
		: "=r" (x) : "rm" (x));
	return x+1;
}

#endif


/*
 * Find most significant bit in 64 bit word.
 * Similar to ffs(), bits are numbered so lsb is 1; and fls64(0) = 0
 */
static inline int generic_fls64(uint64_t x)
{
	uint32_t h = x >> 32;
	if (h)
		return fls(x) + 32;
	return fls(x);
}

/* 64bit divisor, dividend and result. dynamic precision */
static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
{
	u_int32_t d = divisor;

	if (divisor > 0xffffffffULL) {
		unsigned int shift = fls(divisor >> 32);

		d = divisor >> shift;
		dividend >>= shift;
	}
	
	if (dividend >> 32) 
		do_div(dividend, d);
	else
		dividend = (uint32_t) dividend / d;

	return dividend;
}

static const uint64_t cubic_table[8]
	= {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367};

/*
 * calculate the cubic root of x
 * the basic idea is that x can be expressed as i*8^j
 * so cubic_root(x) = cubic_root(i)*2^j
 *  in the following code, x is i, and y is 2^j
 *  because of integer calculation, there are errors in calculation
 *  so finally use binary search to find out the exact solution
 */
static uint32_t old(uint64_t x)
{
        uint64_t y, app, target, start, end, mid, start_diff, end_diff;

        if (x == 0)
                return 0;

        target = x;

        /* first estimate lower and upper bound */
        y = 1;
        while (x >= 8){
                x = (x >> 3);
                y = (y << 1);
        }
        start = (y*cubic_table[x])>>16;
        if (x==7)
                end = (y<<1);
        else
                end = (y*cubic_table[x+1]+65535)>>16;

        /* binary search for more accurate one */
        while (start < end-1) {
                mid = (start+end) >> 1;
                app = mid*mid*mid;
                if (app < target)
                        start = mid;
                else if (app > target)
                        end = mid;
                else
                        return mid;
        }

        /* find the most accurate one from start and end */
        app = start*start*start;
        if (app < target)
                start_diff = target - app;
        else
                start_diff = app - target;
        app = end*end*end;
        if (app < target)
                end_diff = target - app;
        else
                end_diff = app - target;

        if (start_diff < end_diff)
                return start;
        else
                return end;
}

/*
 * calculate the cubic root of x based on bisection.
 */
static uint32_t bisect(uint64_t x)
{
	int l;
	uint32_t lo, hi;

        if (unlikely(x == 0))
                return 0;

	/* Initial estimate basede on:
	 * 	cbrt(x) = exp(log(x)/3)
	 * using log base 2
	 */
	l = (fls64(x)-1)/3;
	lo = 1u << l;
	hi = 1u << (l+1);

        /* use bisection to choose more accurate point. */
	do {
                uint64_t mid = (lo + hi) / 2;
		uint64_t app = mid * mid * mid;;

		if (app == x)
			break;
                if (app < x)
                        lo = mid;
                if (app > x)
                        hi = mid;
        } while (hi - lo > 2);

	return lo;
}

/* Compute cube root of a number, using newton-raphson method. */
static uint32_t newton(uint64_t a)
{
	uint32_t x, x1;

	/* Initial estimate is based on:
	 * cbrt(x) = exp(log(x) * 1/3)
	 */
	x = 1u << (fls64(a)/3);

	/*
	 * Iteration based on:
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	do {
		x1 = x;
		x = ( 2 * x + (uint32_t) div64_64(a, x*x)) / 3;
	} while ( abs(x1 - x) > 1);

	return x;
}

int main(int argc, char **argv)
{
	unsigned N = 100000;
	int i;
	uint64_t msum, osum, nsum;
	struct timeval t0, t1, to, tm, tn;

	if (argc > 1)
		N = strtoul(argv[1], NULL, 10);

	srandom(0);
	msum = 0;
	gettimeofday(&t0, NULL);
	for (i = 0; i < N; i++) {
		uint64_t r = random();
		msum += cbrt((double) r);
	}
	gettimeofday(&t1, NULL);
	timersub(&t1, &t0, &tm);
	printf("math %llu %d.%06d\n", (unsigned long long) msum, 
	       (int) tm.tv_sec, (int)tm.tv_usec);


	srandom(0);
	osum = 0;
	gettimeofday(&t0, NULL);
	for (i = 0; i < N; i++) {
		uint64_t r = random();
		osum += old(r);
	}
	gettimeofday(&t1, NULL);
	timersub(&t1, &t0, &to);
	printf("old %llu %d.%06d\n", (unsigned long long) osum, 
	       (int) to.tv_sec, (int) to.tv_usec);

	srandom(0);
	osum = 0;
	gettimeofday(&t0, NULL);
	for (i = 0; i < N; i++) {
		uint64_t r = random();
		osum += bisect(r);
	}
	gettimeofday(&t1, NULL);
	timersub(&t1, &t0, &to);
	printf("bisect %llu %d.%06d\n", (unsigned long long) osum, 
	       (int) to.tv_sec, (int) to.tv_usec);

	nsum = 0;
	srandom(0);
	gettimeofday(&t0, NULL);
	for (i = 0; i < N; i++) {
		uint64_t r = random();
		nsum += newton(r);
	}
	gettimeofday(&t1, NULL);
	timersub(&t1, &t0, &tn);
	printf("newton %llu %d.%06d\n", (unsigned long long) nsum, 
	       (int) tn.tv_sec, (int) tn.tv_usec);

	return 0;
}

Reply via email to