------- Comment #2 from ajrobb at bigfoot dot com 2008-08-30 07:21 -------
Thanks for that.
I notice that the 64-bit value is now being accumulated in memory rather than
in registers. To be fair, a full 64-bit accumulator need not be maintained -
only the high 32 bits.

Could you compile the following version with the fixed 4.4, please?

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

uint64_t mul2(uint32_t a, uint32_t b)
{
  return a * (uint64_t)b;
}

void mul32(uint32_t * a, const size_t n, const uint32_t factor)
{
  ssize_t i = n;
  uint32_t hi = 0;
  while (--i >= 0) {
    uint64_t p = mul2(*a, factor) + hi;
    *a++ = p;
    hi = p >> 32;
  }
  *a = p;
}

For what it's worth, I tweaked the assembler by hand to avoid local stack
variables:

.globl _mul32
        .def    _mul32; .scl    2;      .type   32;     .endef
_mul32:
        pushl   %ebp
        pushl   %esi
        xorl    %esi, %esi
        pushl   %ebx
        movl    20(%esp), %ecx
        movl    16(%esp), %ebp
        testl   %ecx, %ecx
        movl    24(%esp), %ebx
        je      L6
        .p2align 4,,7
L7:
        movl    (%ebp), %eax
        mull    %ebx
        addl    %esi, %eax
        movl    %eax, (%ebp)
        adcl    $0, %edx
        decl    %ecx
        leal    4(%ebp), %ebp
        movl    %edx, %esi
        jne     L7
L6:
        movl    %esi, (%ebp)
        popl    %ebx
        popl    %esi
        popl    %ebp
        ret

--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37233