[__mulvti3] register allocator plays shell game

Stefan Kanthak Sun, 25 Oct 2020 12:37:01 -0700

Hi,

for the AMD64 alias x86_64 platform and the __int128_t [DW]type,
the first few lines of the __mulvDI3() function from libgcc2.c


| DWtype
| __mulvDI3 (DWtype u, DWtype v)
| {
|   /* The unchecked multiplication needs 3 Wtype x Wtype multiplications,
|      but the checked multiplication needs only two.  */
|   const DWunion uu = {.ll = u};
|   const DWunion vv = {.ll = v};
|
|   if (__builtin_expect (uu.s.high == uu.s.low >> (W_TYPE_SIZE - 1), 1))
|     {
|       /* u fits in a single Wtype.  */
|       if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 1))
|  {
|    /* v fits in a single Wtype as well.  */
|    /* A single multiplication.  No overflow risk.  */
|    return (DWtype) uu.s.low * (DWtype) vv.s.low;
|  }

are compiled to this braindead code (obtained from libgcc.a of
GCC 10.2.0 installed on Debian):

0000000000000000 <__mulvti3>:
   0: 41 55                 push   %r13
   2: 49 89 cb              mov    %rcx,%r11
   5: 48 89 d0              mov    %rdx,%rax
   8: 49 89 d2              mov    %rdx,%r10
   b: 41 54                 push   %r12
   d: 49 89 fc              mov    %rdi,%r12
  10: 48 89 d1              mov    %rdx,%rcx
  13: 49 89 f0              mov    %rsi,%r8
  16: 4c 89 e2              mov    %r12,%rdx
  19: 49 89 f5              mov    %rsi,%r13
  1c: 53                    push   %rbx
  1d: 48 89 fe              mov    %rdi,%rsi
  20: 48 c1 fa 3f           sar    $0x3f,%rdx
  24: 48 c1 f8 3f           sar    $0x3f,%rax
  28: 4c 89 df              mov    %r11,%rdi
  2b: 4c 39 c2              cmp    %r8,%rdx
  2e: 75 18                 jne    48 <__mulvti3+0x48>
  30: 4c 39 d8              cmp    %r11,%rax
  33: 75 6b                 jne    a0 <__mulvti3+0xa0>
  35: 4c 89 e0              mov    %r12,%rax
  38: 49 f7 ea              imul   %r10
  3b: 5b                    pop    %rbx
  3c: 41 5c                 pop    %r12
  3e: 41 5d                 pop    %r13
  40: c3                    retq   
...

There are EIGHT superfluous MOV instructions here, clobbering the
non-volatile registers RBX, R12 and R13, plus THREE superfluous
PUSH/POP pairs.

What stops GCC from generating the following straightforward code
(11 instructions in 31 bytes instead of 25 instructions in 65 bytes)?

.intel_syntax noprefix
# Fast path of __mulvti3: both 128-bit operands fit in a single 64-bit word.
# In:  u = rsi:rdi (high:low), v = rcx:rdx (high:low)
# Out: product in rdx:rax
# Uses only r8, r9, rax, rdx as scratch; no stack, no saved registers.
__mulvti3:
    mov   r8, rdi
    mov   r9, rdx
    sar   r8, 63                    # r8 = u.low >> 63 (arithmetic): 0 or -1
    sar   r9, 63                    # r9 = v.low >> 63 (arithmetic): 0 or -1
    cmp   r8, rsi                   # u.high == sign extension of u.low?
    jne   __mulvti3+0x48+65-31      # no: u needs both words (path not shown)
    cmp   r9, rcx                   # v.high == sign extension of v.low?
    jne   __mulvti3+0xa0+65-31      # no: v needs both words (path not shown)
    mov   rax, rdi
    imul  rdx                       # rdx:rax = u.low * v.low, signed
    ret
...


not amused
Stefan Kanthak

[__mulvti3] register allocator plays shell game

Reply via email to