Hi, for the AMD64 alias x86_64 platform and the __int128_t [DW]type, the first few lines of the __mulvDI3() function from libgcc2.c
| DWtype | __mulvDI3 (DWtype u, DWtype v) | { | /* The unchecked multiplication needs 3 Wtype x Wtype multiplications, | but the checked multiplication needs only two. */ | const DWunion uu = {.ll = u}; | const DWunion vv = {.ll = v}; | | if (__builtin_expect (uu.s.high == uu.s.low >> (W_TYPE_SIZE - 1), 1)) | { | /* u fits in a single Wtype. */ | if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 1)) | { | /* v fits in a single Wtype as well. */ | /* A single multiplication. No overflow risk. */ | return (DWtype) uu.s.low * (DWtype) vv.s.low; | } are compiled to this braindead code (obtained from libgcc.a of GCC 10.2.0 installed on Debian): 0000000000000000 <__mulvti3>: 0: 41 55 push %r13 2: 49 89 cb mov %rcx,%r11 5: 48 89 d0 mov %rdx,%rax 8: 49 89 d2 mov %rdx,%r10 b: 41 54 push %r12 d: 49 89 fc mov %rdi,%r12 10: 48 89 d1 mov %rdx,%rcx 13: 49 89 f0 mov %rsi,%r8 16: 4c 89 e2 mov %r12,%rdx 19: 49 89 f5 mov %rsi,%r13 1c: 53 push %rbx 1d: 48 89 fe mov %rdi,%rsi 20: 48 c1 fa 3f sar $0x3f,%rdx 24: 48 c1 f8 3f sar $0x3f,%rax 28: 4c 89 df mov %r11,%rdi 2b: 4c 39 c2 cmp %r8,%rdx 2e: 75 18 jne 48 <__mulvti3+0x48> 30: 4c 39 d8 cmp %r11,%rax 33: 75 6b jne a0 <__mulvti3+0xa0> 35: 4c 89 e0 mov %r12,%rax 38: 49 f7 ea imul %r10 3b: 5b pop %rbx 3c: 41 5c pop %r12 3e: 41 5d pop %r13 40: c3 retq ... There are EIGHT superfluous MOV instructions here, clobbering the non-volatile registers RBX, R12 and R13, plus THREE superfluous PUSH/POP pairs. What stops GCC from generating the following straightforward code (11 instructions in 31 bytes instead of 25 instructions in 65 bytes)? .intel_syntax noprefix __mulvti3: mov r8, rdi mov r9, rdx sra r8, 63 sra r9, 63 cmp r8, rsi jne __mulvti3+0x48+65-31 cmp r9, rcx jne __mulvti3+0xa0+65-31 mov rax, rdi imul rdx ret ... not amused Stefan Kanthak