Richard Biener <richard.guent...@gmail.com> wrote: > On Sun, Oct 25, 2020 at 8:37 PM Stefan Kanthak <stefan.kant...@nexgo.de> > wrote: >> >> Hi, >> >> for the AMD64 alias x86_64 platform and the __int128_t [DW]type, >> the first few lines of the __mulvDI3() function from libgcc2.c >> >> | DWtype >> | __mulvDI3 (DWtype u, DWtype v) >> | { >> | /* The unchecked multiplication needs 3 Wtype x Wtype multiplications, >> | but the checked multiplication needs only two. */ >> | const DWunion uu = {.ll = u}; >> | const DWunion vv = {.ll = v}; >> | >> | if (__builtin_expect (uu.s.high == uu.s.low >> (W_TYPE_SIZE - 1), 1)) >> | { >> | /* u fits in a single Wtype. */ >> | if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 1)) >> | { >> | /* v fits in a single Wtype as well. */ >> | /* A single multiplication. No overflow risk. */ >> | return (DWtype) uu.s.low * (DWtype) vv.s.low; >> | } >> >> are compiled to this braindead code (obtained from libgcc.a of >> GCC 10.2.0 installed on Debian): >> >> 0000000000000000 <__mulvti3>: >> 0: 41 55 push %r13 >> 2: 49 89 cb mov %rcx,%r11 >> 5: 48 89 d0 mov %rdx,%rax >> 8: 49 89 d2 mov %rdx,%r10 >> b: 41 54 push %r12 >> d: 49 89 fc mov %rdi,%r12 >> 10: 48 89 d1 mov %rdx,%rcx >> 13: 49 89 f0 mov %rsi,%r8 >> 16: 4c 89 e2 mov %r12,%rdx >> 19: 49 89 f5 mov %rsi,%r13 >> 1c: 53 push %rbx >> 1d: 48 89 fe mov %rdi,%rsi >> 20: 48 c1 fa 3f sar $0x3f,%rdx >> 24: 48 c1 f8 3f sar $0x3f,%rax >> 28: 4c 89 df mov %r11,%rdi >> 2b: 4c 39 c2 cmp %r8,%rdx >> 2e: 75 18 jne 48 <__mulvti3+0x48> >> 30: 4c 39 d8 cmp %r11,%rax >> 33: 75 6b jne a0 <__mulvti3+0xa0> >> 35: 4c 89 e0 mov %r12,%rax >> 38: 49 f7 ea imul %r10 >> 3b: 5b pop %rbx >> 3c: 41 5c pop %r12 >> 3e: 41 5d pop %r13 >> 40: c3 retq >> ... >> >> There are EIGHT superfluous MOV instructions here, clobbering the >> non-volatile registers RBX, R12 and R13, plus THREE superfluous >> PUSH/POP pairs. >> >> What stops GCC from generating the following straightforward code >> (11 instructions in 31 bytes instead of 25 instructions in 65 bytes)? >> >> .intel_syntax noprefix >> __mulvti3: >> mov r8, rdi >> mov r9, rdx >> sra r8, 63 >> sra r9, 63 >> cmp r8, rsi >> jne __mulvti3+0x48+65-31 >> cmp r9, rcx >> jne __mulvti3+0xa0+65-31 >> mov rax, rdi >> imul rdx >> ret >> ... >> >> >> not amused > > can you open a bugreport please?
I'd like to discuss alternative implementations of __mulvti3() first: the very first lines of libgcc2.c reads | /* More subroutines needed by GCC output code on some machines. */ | /* Compile this one with gcc. */ Why don't you take advantage of that and implement __mulvDI3() as follows? DWtype __mulvDI3 (DWtype multiplicand, DWtype multiplier) { DWtype product; if (__builtin_mul_overflow(multiplicand, multiplier, &product)) abort(); return product; } For the AMD64 platform, instead of the 131 instructions in 439 bytes (partially cited above) GCC now generates 107 instructions in 324 bytes ... (un)fortunately showing this bug again, now clobbering RBP and RBX, pushing/popping RBP, RBX, R12, R14 and R15, and still moving them around without necessity: __mulvti3: movq %rdi, %r8 movq %rdx, %rdi pushq %r15 movq %rcx, %r9 movq %r8, %rdx movq %rdi, %rax pushq %r14 sarq $63, %rdx pushq %r12 sarq $63, %rax pushq %rbp xorl %ebp, %ebp pushq %rbx cmpq %rsi, %rdx jne .L4 cmpq %rcx, %rax jne .L5 movq %r8, %rax imulq %rdi movq %rax, %rbx .L2: testq %rbp, %rbp jne __mulvti3.cold movq %rbx, %rax popq %rbx popq %rbp popq %r12 popq %r14 popq %r15 ret ... .L4: cmpq %rcx, %rax jne .L7 ... jns .L8 xorl %r10d, %r10d subq %r10, %rax sbbq %r12, %rdx .L8: testq %r12, %r12 jns .L9 ... jne .L10 movq %rcx, %rbx movq %r10, %rdx jmp .L2 ... jmp .L6 .L7: ... ja .L3 ... ja .L3 ... jne .L11 ... jl .L2 .L3: movl $1, %ebp jmp .L2 .L11: testq %r10, %r10 js .L2 jmp .L3 .L10: ... jmp .L3 __mulvti3.cold: .L13: call abort Besides the poor register allocation, this code shows 12 conditional plus 5 unconditional branches. Let's try a second alternative implementation: DWtype __mulvDI3 (DWtype multiplicand, DWtype multiplier) { UDWtype product; UDWtype sign = multiplicand >> (DW_TYPE_SIZE - 1); UDWtype tmp = multiplier >> (DW_TYPE_SIZE - 1); if (__builtin_mul_overflow((multiplicand ^ sign) - sign, (multiplier ^ tmp) - tmp, &product)) abort(); sign ^= tmp; product = (product ^ sign) - sign; if ((__int128_t) (product ^ sign) < 0) abort(); return product; } This lets GCC generate 98 instructions in 293 bytes, with 6 conditional plus 3 unconditional branches: __mulvti3: movq %rsi, %rax pushq %r15 sarq $63, %rax pushq %r14 movq %rdx, %r14 movq %rax, %r10 pushq %r13 movq %rax, %r11 movq %rcx, %rax pushq %r12 sarq $63, %rax pushq %rbp movq %rdi, %rbp movq %rsi, %rdi movq %rax, %r8 xorq %r10, %rbp xorq %r10, %rdi movq %rax, %r9 pushq %rbx movq %rbp, %rax movq %rdi, %rdx subq %r10, %rax sbbq %r10, %rdx xorq %r8, %r14 xorq %r8, %rcx movq %rax, %rsi movq %r14, %r12 movq %rcx, %r13 movq %rdx, %rdi subq %r8, %r12 sbbq %r8, %r13 xorl %ebp, %ebp movq %r13, %rcx testq %rdx, %rdx jne .L4 testq %r13, %r13 jne .L5 mulq %r12 movq %rax, %r14 movq %rdx, %rcx .L2: testq %rbp, %rbp jne .L10 movq %r10, %rax xorq %r8, %rax movq %rax, %r12 movq %r11, %rax xorq %r9, %rax xorq %r12, %r14 movq %rax, %r13 movq %r14, %rax xorq %r13, %rcx subq %r12, %rax movq %r13, %rbx movq %rcx, %rdx sbbq %r13, %rdx xorq %rdx, %rbx js .L10 popq %rbx popq %rbp popq %r12 popq %r13 popq %r14 popq %r15 ret .p2align 4,,10 .p2align 3 .L4: testq %r13, %r13 jne .L8 movq %rdx, %rcx movq %r12, %rbx .L6: movq %rsi, %rax mulq %r12 movq %rax, %r14 movq %rbx, %rax movq %rdx, %r15 xorl %ebx, %ebx mulq %rcx addq %r15, %rax adcq %rbx, %rdx testq %rdx, %rdx jne .L8 movq %rax, %rcx jmp .L2 .p2align 4,,10 .p2align 3 .L5: movq %rax, %rbx jmp .L6 .p2align 4,,10 .p2align 3 .L8: movq %rdi, %rcx movq %r13, %rax movl $1, %ebp imulq %rsi, %rax imulq %r12, %rcx addq %rax, %rcx movq %rsi, %rax mulq %r12 addq %rdx, %rcx movq %rax, %r14 jmp .L2 __mulvti3.cold: .L10: call abort PROPER (assembly) code but uses only 44 instructions in just 121 bytes, with ONE conditional branch! --- mulvti3.s --- # Copyright (C) 2014-2020, Stefan Kanthak .arch generic64 .code64 .intel_syntax noprefix .global abort .global __mulvti3 .type __mulvti3, @function .text # rsi:rdi = multiplicand # rcx:rdx = multiplier __mulvti3: mov r10, rdx mov r11, rcx mov rax, rcx cqo # rdx = tmp mov r9, rdx xor r10, rdx xor r11, rdx sub r10, rdx sbb r11, rdx # r11:r10 = |multiplier| mov rax, rsi cqo # rdx = sign xor r9, rdx # r9 = sign ^ tmp xor rdi, rdx xor rsi, rdx sub rdi, rdx sbb rsi, rdx # rsi:rdi = |multiplicand| xor ecx, ecx cmp rcx, rsi sbb edx, edx # edx = (|multiplicand| < 2**64) ? 0 : -1 cmp rcx, r11 sbb ecx, ecx # ecx = (|multiplier| < 2**64) ? 0 : -1 and ecx, edx # ecx = (|product| < 2**128) ? 0 : -1 mov rax, rsi mul r10 adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : * mov rsi, rax mov rax, rdi mul r11 adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : * add rsi, rax # adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : * mov rax, rdi mul r10 add rdx, rsi adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : * add rax, r9 adc rdx, r9 # rdx:rax = (product < 0) ? ~product % 2**128 : product % 2**128 mov rsi, rdx # rsi = (product % 2**128 < -2**127) # | (product % 2**128 >= 2**127) ? negative : positive xor rax, r9 xor rdx, r9 # rdx:rax = product % 2**128 add rsi, rsi adc ecx, ecx # ecx = (product >= -2**127) # & (product < 2**127) ? 0 : * jnz .overflow ret .overflow: call abort .end --- EOF ---