consider the code fragment: uint64_t slow(uint64_t x) { return x / 1220703125u; }
This can be replaced by: uint64_t fast(uint64_t x) { uint32_t a = ((x >> 32) * 1270091284u) >> 32; uint32_t b = ((x & 0xffffffffu) * 3777893186u) >> 32; return ((x >> 32) * 3777893186ull + a + b) >> 30; } The 'fast' code runs 50% faster than 'slow'. However, removing the redundant multiplies (see my earlier bug - fixed in 4.4 trunk) and tidying up storage, I can use the following assembler to run nearly 100% faster than 'slow': .p2align 4,,15 .globl _fast .def _fast; .scl 2; .type 32; .endef _fast: pushl %ebx movl $1270091284, %eax mull 12(%esp) movl $-517074110, %eax movl %edx, %ebx mull 8(%esp) movl $-517074110, %eax movl %edx, %ecx mull 12(%esp) addl %ebx, %ecx popl %ebx adcl $0, %edx addl %ecx, %eax adcl $0, %edx shrdl $30, %edx, %eax shrl $30, %edx ret NOTE: the 2 multipliers are derived using 96-bit arithmetic: d = 1220703125u -517074110 = 0xffe12e1342u = (((1u << 94) + d - 1) / d) >> 32 1270091284 = (((1u << 94) + d - 1) / d) & 0xffffffffu -- Summary: fast 64-bit divide by constant 32-bit Product: gcc Version: 4.2.3 Status: UNCONFIRMED Severity: enhancement Priority: P3 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: ajrobb at bigfoot dot com GCC build triplet: i686-pc-cygwin GCC host triplet: i686-pc-cygwin GCC target triplet: i686-pc-cygwin http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443