------- Comment #6 from ajrobb at bigfoot dot com 2008-09-01 05:50 ------- OK - it's me again. I have formulated a general case optimization - I should keep quiet now ;-) :
/* Demonstrate a general fast multiply and divide by rational number. * Both methods agree over the entire domain: 0..0xffffffffu. * (even when they overflow the 32-bit result) */ #include <stdint.h> /* 32-bit numerator */ #define Y 47 /* 32-bit denominator */ #define Z 40 /* bit shift (at least 32) to normalize 'fact' below (with MSB set) */ #define B (32 + 2) /* this is what we might write - but it calls a 64-bit divide function */ uint32_t slow(uint32_t x) { return x * (uint64_t) Y / Z; } /* this is how it can be implemented without a 64-bit divide call */ uint32_t fast(uint32_t x) { static const uint64_t fact = ((((uint64_t)(Y % Z)) << B) + (Z - 1)) / Z; /* * While this is constructed as 2 multiplies, the value of (Y / Z) will * often be small (e.g. 0 or 1) and can be implemented without an imul * instruction. This can overwrite the value of %eax after the mul * instruction is issued as we only need the value from %edx. * A superscalar x86 will assign a new %eax register to allow the mul * instruction to continue in parallel. * * Where the routine is not inline, it can be implemented as: * movl $-1288490188, %eax * mull 4(%esp) * movl 4(%esp), %eax * # small multiplication by (Y/Z) could go here * shrl $2, %edx * addl %edx, %eax * ret * * Where this routine is inlined and x is already in a register other than * %eax or %edx, the final multiply and addition might be achieved with a * single lea instruction into %eax. Example: assume that x is in %ebx with * the above ratio (Y/Z is 1): * movl $-1288490188, %eax * mull %ebx * shrl $2, %edx * leal (%edx, %ebx), %eax */ return (uint32_t)((x * fact) >> B) + x * (Y / Z); } NOTE: the general case 64-bit result can be calculated with at most 2 mul instructions and no divides. uint64_t fast64(uint32_t x) { static const uint64_t fact = ((((uint64_t)(Y % Z)) << B) + (Z - 1)) / Z; return ((x * fact) >> B) + x * (uint64_t)(Y / Z); } With the above ratio of 47 / 40, this could be compiled to: movl $-1288490188, %eax mull 4(%esp) movl 4(%esp), %eax shrl $2, %edx addl %edx, %eax xorl %edx, %edx adcl $0, %edx ret Where the ratio is less than 1, the 32-bit and 64-bit versions are very similar - both using a single mul instruction. (The 64-bit version must zero %edx.) -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=20283