http://gcc.gnu.org/bugzilla/show_bug.cgi?id=28417
Right now Bugzilla internal problem prevents me from creating an attachement there. So it goes here. Not nice enough to go into release. I still cannot figure out what precision is, so I restricted new code to (n == HOST_BITS_PER_WIDE_INT && precision == HOST_BITS_PER_WIDE_INT) case. Need help here. Patched versus stock gcc-4.1.1 on this code: unsigned v; void f9188_mul365384439_shift27(unsigned A) { v = A/(unsigned)1577682821; } # diff -u suboptimal-411-O2.s suboptimal-411-O2-fixed.s --- suboptimal-411-O2.s 2006-07-29 19:34:09.000000000 +0200 +++ suboptimal-411-O2-fixed.s 2006-07-29 19:37:14.000000000 +0200 @@ -19,21 +19,10 @@ f9188_mul365384439_shift27: pushl %ebp movl %esp, %ebp - pushl %edi - pushl %esi - pushl %ebx - movl 8(%ebp), %ebx - movl $1551183727, %ecx - movl %ebx, %eax - mull %ecx - subl %edx, %ebx - shrl %ebx - leal (%ebx,%edx), %eax - shrl $30, %eax - movl %eax, v - popl %ebx - popl %esi - popl %edi + movl $365384439, %eax + mull 8(%ebp) + shrl $27, %edx + movl %edx, v leave ret ... suboptimal.c is a test program. -O2 -S on it will give you the optimized assembly, running it (compiled with -Os) will check the correctness of the optimized code. -- vda
New algorithm lives in separate function gcc_fastdiv_params() which is called from choose_multiplier(). Then we run old algorithm. Then we compare results and if new algorithm gives different one (new result is always better or same than old), we use it. I still cannot figure out what precision is, so I restricted new code to (n == HOST_BITS_PER_WIDE_INT && precision == HOST_BITS_PER_WIDE_INT) case. Need help here. diff -urpN gcc-4.1.1.orig/gcc/expmed.c gcc-4.1.1.new/gcc/expmed.c --- gcc-4.1.1.orig/gcc/expmed.c 2006-05-15 15:33:32.000000000 +0200 +++ gcc-4.1.1.new/gcc/expmed.c 2006-07-29 19:58:55.000000000 +0200 @@ -3224,6 +3224,128 @@ ceil_log2 (unsigned HOST_WIDE_INT x) return floor_log2 (x - 1) + 1; } + +/* +[below: 'div' is unsigned integer division] +['/' is real division with infinite precision] +[A,B,C... - integers, a,b,c... - reals] + +Theory: we want to calculate A div B, fast. +B is const > 2 and is not a power of 2. + +In fp: A/B = A*(L/B)/L. (If L is a large power of 2, +say 2^32, then it could be done really fast). +Let k := L/B, K := L div B + 1. +Then A/B = A*k/L. + +Then this is true: + +A div B <= A * K div L. + +For small enough A: A div B = A * K div L. +Lets find first A for which it is not true. + +Lets compare k/L and K/L. K/L is larger by a small value d: + +d := K/L - k/L = (L div B + 1) / L - L/B/L = += (L div B * B + B) / (L*B) - L/(L*B) = += (L div B * B + B - L) / (L*B) + +A*K/L is larger than A*k/L by A*d. + +A*k/L is closest to 'overflowing into next integer' +when A = N*B-1. The 'deficit' with such A is exactly 1/B. +If A*d >= 1/B, then A*K/L will 'overflow'. + +Thus bad_A >= 1/B / d = (1/B) * (L*B)/(L div B * B + B - L) = += L/(L div B * B + B - L). Then you need to 'walk up' to next +A representable as N*B-1: bad_A2 = (bad_A div B) * B + B-1 +This is the first A which will have wrong result +(i.e. for which (A*K div L) = (A div B)+1, not (A div B). + +Practical use. + +Suppose that A and B are 32-bit unsigned integers. + +Unfortunately, there exist only two B values in 3..2^32-1 range +for which _any_ 32-bit unsigned A can be fast divided using L=2^32 +(because bad_A=2^32 and any A is less than that): + +B=641 K=6700417 d=1/2753074036736 bad_A=4294967296 A=unlimited +B=6700417 K=641 d=1/28778071884562432 bad_A=4294967296 A=unlimited + +We need to use larger powers of 2 for L if we need to handle +many more B's. */ + +#define max_HOST_WIDE_INT ((unsigned HOST_WIDE_INT) (~(HOST_WIDE_INT)0)) +int gcc_fastdiv_params(unsigned HOST_WIDE_INT B, unsigned HOST_WIDE_INT max_A, + unsigned HOST_WIDE_INT *multiplier_ptr, int *shift_ptr); +int gcc_fastdiv_params(unsigned HOST_WIDE_INT B, unsigned HOST_WIDE_INT max_A, + unsigned HOST_WIDE_INT *multiplier_ptr, int *shift_ptr) +{ + unsigned HOST_WIDE_INT K, last_K, d_LB, bad_A, udummy; + HOST_WIDE_INT high_A, dummy; + int lgdn; + + if (!B || !((B-1)&B)) return 0; /* we won't work with powers of 2 */ + + /* L/B should fit into uint, so max L is floor_log2(B) << HOST_BITS_PER_WIDE_INT + ** L is impicitly = 1 << (lgdn+HOST_BITS_PER_WIDE_INT) */ + lgdn = floor_log2(B); + + /* K = L/B + 1: */ + div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, B,(HOST_WIDE_INT)0, + &K,&dummy, &udummy,&dummy); + K++; + + /* There is not much point in multiplying by even number + ** and then shifting right. Reduce K & L to avoid it: */ + while (!(K & 1) && lgdn) + lgdn--, K >>= 1; + + /* d_LB = ((L/B) * B + B - L) */ + d_LB = K * B; /* since (HOST_WIDE_INT)L == 0, subtracting it can be optimized out */ + /* bad_A = L / d_LB */ + div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, d_LB,(HOST_WIDE_INT)0, + &bad_A,&high_A, &udummy,&dummy); + + bad_A = (bad_A / B) * B; /* ... + B-1 later */ + if(high_A || bad_A > max_HOST_WIDE_INT - (B-1) ) { + high_A = 1; /* we aren't interested much in true value if high_A!=0 */ + } + else bad_A += B-1; + + if (!high_A && bad_A <= max_A) + return 0; /* no suitable params exist :( */ + + /* K is good, but maybe we can reduce it? */ + while(1) { + last_K = K; + if (--lgdn < 0) + break; + /* K = L/B + 1: */ + div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, B,(HOST_WIDE_INT)0, + &K,&dummy, &udummy,&dummy); + K++; + /* d_LB = ((L/B) * B + B - L) */ + d_LB = K * B; /* since (HOST_WIDE_INT)L == 0, subtracting it can be optimized out */ + /* bad_A = L / d_LB */ + div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, d_LB,(HOST_WIDE_INT)0, + &bad_A,&high_A, &udummy,&dummy); + bad_A = (bad_A / B) * B; + if(high_A || bad_A > max_HOST_WIDE_INT - (B-1) ) + high_A = 1; /* we aren't interested much in true value if high_A!=0 */ + else bad_A += B-1; + if (!high_A && bad_A <= max_A) + break; + } + /* new params are bad, report last ones */ + *multiplier_ptr = last_K; + *shift_ptr = lgdn+1; + return 1; +} + + /* Choose a minimal N + 1 bit approximation to 1/D that can be used to replace division by D, and put the least significant N bits of the result in *MULTIPLIER_PTR and return the most significant bit. @@ -3252,6 +3374,14 @@ choose_multiplier (unsigned HOST_WIDE_IN unsigned HOST_WIDE_INT nl, dummy1; HOST_WIDE_INT nh, dummy2; + int fastdiv_params; + unsigned HOST_WIDE_INT K; + int bits; + + fastdiv_params = 0; + if(n == HOST_BITS_PER_WIDE_INT && precision == HOST_BITS_PER_WIDE_INT) + fastdiv_params = gcc_fastdiv_params(d, max_HOST_WIDE_INT, &K, &bits); + /* lgup = ceil(log2(divisor)); */ lgup = ceil_log2 (d); @@ -3266,7 +3396,7 @@ choose_multiplier (unsigned HOST_WIDE_IN gcc_assert (pow != 2 * HOST_BITS_PER_WIDE_INT); /* mlow = 2^(N + lgup)/d */ - if (pow >= HOST_BITS_PER_WIDE_INT) + if (pow >= HOST_BITS_PER_WIDE_INT) { nh = (HOST_WIDE_INT) 1 << (pow - HOST_BITS_PER_WIDE_INT); nl = 0; @@ -3321,6 +3451,12 @@ choose_multiplier (unsigned HOST_WIDE_IN else { *multiplier_ptr = GEN_INT (mhigh_lo); + if(fastdiv_params && (mhigh_hi || K!=mhigh_lo)) + { + *post_shift_ptr = bits; + *multiplier_ptr = GEN_INT (K); + return 0; + } return mhigh_hi; } }
#include <stdint.h> #include <stdio.h> unsigned v; void f9952_mul764333263_shift27(unsigned A) { v=A/754200792; } void f9188_mul365384439_shift27(unsigned A) { v = A/(unsigned)1577682821; } void f9188_mul365384439_shift27_prime(unsigned A) { v = (((uint64_t)A)*(unsigned)365384439) >> (27+32); } void f8399_mul2283243215_shift29(unsigned A) { v = A/(unsigned)1009898111; } void f8399_mul2283243215_shift29_prime(unsigned A) { v = (((uint64_t)A)*(unsigned)2283243215) >> (29+32); } void f8267_mul2482476753_shift30(unsigned A) { v = A/(unsigned)1857695551; } void f8267_mul2482476753_shift30_prime(unsigned A) { v = (((uint64_t)A)*(unsigned)2482476753) >> (30+32); } /* Generated code is suboptimal (gcc 4.1.1): void f9188_mul365384439_shift27(unsigned A) { v = A/1577682821; } f9188_mul365384439_shift27: pushl %edi pushl %esi pushl %ebx movl 16(%esp), %ebx movl $1551183727, %ecx movl %ebx, %eax mull %ecx subl %edx, %ebx shrl %ebx leal (%ebx,%edx), %eax shrl $30, %eax movl %eax, v popl %ebx popl %esi popl %edi ret void f8399_mul2283243215_shift29(unsigned A) { v = A/1009898111; } f8399_mul2283243215_shift29: pushl %edi pushl %esi pushl %ebx movl 16(%esp), %ebx movl $271519133, %ecx movl %ebx, %eax mull %ecx subl %edx, %ebx shrl %ebx leal (%ebx,%edx), %eax shrl $29, %eax movl %eax, v popl %ebx popl %esi popl %edi ret void f8267_mul2482476753_shift30(unsigned A) { v = A/1857695551; } f8267_mul2482476753_shift30: pushl %edi pushl %esi pushl %ebx movl 16(%esp), %ebx movl $669986209, %ecx movl %ebx, %eax mull %ecx subl %edx, %ebx shrl %ebx leal (%ebx,%edx), %eax shrl $30, %eax movl %eax, v popl %ebx popl %esi popl %edi ret These operations can be done like this: f9188_mul365384439_shift27_prime: movl $365384439, %eax mull 4(%esp) movl %edx, %eax shrl $27, %eax movl %eax, v ret f8399_mul2283243215_shift29_prime: movl $-2011724081, %eax mull 4(%esp) movl %edx, %eax shrl $29, %eax movl %eax, v ret f8267_mul2482476753_shift30_prime: movl $-1812490543, %eax mull 4(%esp) movl %edx, %eax shrl $30, %eax movl %eax, v ret (Why there is that silly movl %edx, %eax - that's another good question...) The verification program is below. Was compiled with -Os (in order to force gcc to use div insn - we want to do true division for the purpose of correctness check!) and didn't report a failure. */ int main() { unsigned A=0,u,v; while(++A) { (A & 0xffff) || printf("A=%u\r",A); u = (((uint64_t)A)*(unsigned)365384439) >> (27+32); v = A / (unsigned)1577682821; if(u!=v) { printf("1: A=%u u=%u v=%u\n", A,u,v); return 0; } u = (((uint64_t)A)*(unsigned)2283243215) >> (29+32); v = A / (unsigned)1009898111; if(u!=v) { printf("2: A=%u u=%u v=%u\n", A,u,v); return 0; } u = (((uint64_t)A)*(unsigned)2482476753) >> (30+32); v =A / (unsigned)1857695551; if(u!=v) { printf("3: A=%u u=%u v=%u\n", A,u,v); return 0; } } puts(""); return 0; }
suboptimal-411-O2-fixed.s
Description: Binary data
suboptimal-411-O2.s
Description: Binary data