On Sun, Oct 25, 2020 at 8:37 PM Stefan Kanthak <stefan.kant...@nexgo.de> wrote:
>
> Hi,
>
> for the AMD64 alias x86_64 platform and the __int128_t [DW]type,
> the first few lines of the __mulvDI3() function from libgcc2.c
>
> | DWtype
> | __mulvDI3 (DWtype u, DWtype v)
> | {
> |   /* The unchecked multiplication needs 3 Wtype x Wtype multiplications,
> |      but the checked multiplication needs only two.  */
> |   const DWunion uu = {.ll = u};
> |   const DWunion vv = {.ll = v};
> |
> |   if (__builtin_expect (uu.s.high == uu.s.low >> (W_TYPE_SIZE - 1), 1))
> |     {
> |       /* u fits in a single Wtype.  */
> |       if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 1))
> |  {
> |    /* v fits in a single Wtype as well.  */
> |    /* A single multiplication.  No overflow risk.  */
> |    return (DWtype) uu.s.low * (DWtype) vv.s.low;
> |  }
>
> are compiled to this braindead code (obtained from libgcc.a of
> GCC 10.2.0 installed on Debian):
>
> 0000000000000000 <__mulvti3>:
>    0: 41 55                 push   %r13
>    2: 49 89 cb              mov    %rcx,%r11
>    5: 48 89 d0              mov    %rdx,%rax
>    8: 49 89 d2              mov    %rdx,%r10
>    b: 41 54                 push   %r12
>    d: 49 89 fc              mov    %rdi,%r12
>   10: 48 89 d1              mov    %rdx,%rcx
>   13: 49 89 f0              mov    %rsi,%r8
>   16: 4c 89 e2              mov    %r12,%rdx
>   19: 49 89 f5              mov    %rsi,%r13
>   1c: 53                    push   %rbx
>   1d: 48 89 fe              mov    %rdi,%rsi
>   20: 48 c1 fa 3f           sar    $0x3f,%rdx
>   24: 48 c1 f8 3f           sar    $0x3f,%rax
>   28: 4c 89 df              mov    %r11,%rdi
>   2b: 4c 39 c2              cmp    %r8,%rdx
>   2e: 75 18                 jne    48 <__mulvti3+0x48>
>   30: 4c 39 d8              cmp    %r11,%rax
>   33: 75 6b                 jne    a0 <__mulvti3+0xa0>
>   35: 4c 89 e0              mov    %r12,%rax
>   38: 49 f7 ea              imul   %r10
>   3b: 5b                    pop    %rbx
>   3c: 41 5c                 pop    %r12
>   3e: 41 5d                 pop    %r13
>   40: c3                    retq
> ...
>
> There are EIGHT superfluous MOV instructions here, clobbering the
> non-volatile registers RBX, R12 and R13, plus THREE superfluous
> PUSH/POP pairs.
>
> What stops GCC from generating the following straightforward code
> (11 instructions in 31 bytes instead of 25 instructions in 65 bytes)?
>
> .intel_syntax noprefix
> __mulvti3:
>     mov   r8, rdi
>     mov   r9, rdx
>     sar   r8, 63
>     sar   r9, 63
>     cmp   r8, rsi
>     jne   __mulvti3+0x48+65-31
>     cmp   r9, rcx
>     jne   __mulvti3+0xa0+65-31
>     mov   rax, rdi
>     imul  rdx
>     ret
> ...
>
>
> not amused

Can you open a bug report, please?

Richard.

> Stefan Kanthak

Reply via email to