------- Comment #5 from ajrobb at bigfoot dot com 2008-08-31 13:40 ------- For the case where the overall scale is between 1 and 2 (exclusive), it is straightforward to implement a solution with a single multiply and no shifts:
/* Demostrate fast multiply and divide by rational number, r, where 1 < r < 2 * Both methods work over entire domain: 0 < x < (1<<32)/r. */ #include <stdint.h> #define Y 47 #define Z 40 uint32_t slow(uint32_t x) { return x * (uint64_t) Y / Z; } uint32_t fast(uint32_t x) { static const uint64_t fact = ((((uint64_t)(Y-Z)) << 32) + (Z-1)) / Z; return x + (uint32_t)((x * fact) >> 32); } This compiles to the following with gcc 4.2.1 on i586-suse-linux gcc -O3 -fomit-frame-pointer .file "vat.c" .globl __udivdi3 .text .p2align 4,,15 .globl slow .type slow, @function slow: subl $28, %esp movl $47, %eax mull 32(%esp) movl $40, 8(%esp) movl $0, 12(%esp) movl %eax, (%esp) movl %edx, 4(%esp) call __udivdi3 addl $28, %esp ret .size slow, .-slow .p2align 4,,15 .globl fast .type fast, @function fast: subl $12, %esp movl $751619277, %ecx movl %ebx, (%esp) movl 16(%esp), %ebx movl %edi, 8(%esp) movl %esi, 4(%esp) movl %ebx, %eax mull %ecx movl %edx, %edi movl %eax, %esi movl %edi, %esi xorl %edi, %edi movl 8(%esp), %edi leal (%ebx,%esi), %eax movl (%esp), %ebx movl 4(%esp), %esi addl $12, %esp ret .size fast, .-fast # additional hand-coded assembler equivalent to fast above: .p2align 4,,15 .globl faster .type faster, @function faster: movl $751619277, %eax mull 4(%esp) movl 4(%esp), %eax addl %edx, %eax ret .size faster, .-faster .ident "GCC: (GNU) 4.2.1 (SUSE Linux)" .section .note.GNU-stack,"",@progbits -- ajrobb at bigfoot dot com changed: What |Removed |Added ---------------------------------------------------------------------------- Version|3.3.4 |4.2.1 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=20283