https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51838
--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Andrew Pinski from comment #1)
> We do get slightly better now:
>         xorl    %eax, %eax
>         movq    %rdi, %r8
>         xorl    %edi, %edi
>         addq    %rsi, %rax
>         adcq    %rdi, %rdx
>         addq    %rax, (%r8)
>         adcq    %rdx, 8(%r8)
>         ret
>
> Note on aarch64 we do get good code:
>         ldp     x3, x4, [x0]
>         adds    x3, x3, x1
>         adc     x4, x4, x2
>         stp     x3, x4, [x0]
>         ret

The interesting thing is that when I remove addti3 and ashlti3 from i386.md,
GCC generates optimal code.

void foo(__uint128_t *x, unsigned long long y, unsigned long long z)
{
  *x += y + ((__uint128_t) z << 64);
}

void foo1(__uint128_t *x, unsigned long long y, unsigned long long z)
{
  *x += (__uint128_t) z << 64;
}

void foo2(__uint128_t *x, unsigned long long y, unsigned long long z)
{
  *x += (__uint128_t) z << 3;
}

void foo3(__uint128_t *x, __uint128_t *y)
{
  *x += *y;
}

diff --git a/origin.s b/test.s
index 08274ba..764241a 100644
--- a/origin.s
+++ b/test.s
@@ -6,13 +6,8 @@
 foo:
 .LFB0:
         .cfi_startproc
-        xorl    %eax, %eax
-        movq    %rdi, %r8
-        xorl    %edi, %edi
-        addq    %rsi, %rax
-        adcq    %rdi, %rdx
-        addq    %rax, (%r8)
-        adcq    %rdx, 8(%r8)
+        addq    %rsi, (%rdi)
+        adcq    %rdx, 8(%rdi)
         ret
         .cfi_endproc
 .LFE0:
@@ -23,9 +18,7 @@ foo:
 foo1:
 .LFB1:
         .cfi_startproc
-        xorl    %eax, %eax
-        addq    %rax, (%rdi)
-        adcq    %rdx, 8(%rdi)
+        addq    %rdx, 8(%rdi)
         ret
         .cfi_endproc
 .LFE1:
@@ -36,13 +29,13 @@ foo1:
 foo2:
 .LFB2:
         .cfi_startproc
-        movq    %rdx, %rax
-        movq    %rdx, %r8
-        salq    $3, %rax
-        xorl    %edx, %edx
-        shldq   $3, %r8, %rdx
-        addq    %rax, (%rdi)
-        adcq    %rdx, 8(%rdi)
+        movq    (%rdi), %rcx
+        movq    %rdx, %rsi
+        shrq    $61, %rsi
+        leaq    (%rcx,%rdx,8), %rax
+        cmpq    %rcx, %rax
+        movq    %rax, (%rdi)
+        adcq    %rsi, 8(%rdi)
         ret
         .cfi_endproc
 .LFE2:
@@ -53,9 +46,10 @@ foo2:
 foo3:
 .LFB3:
         .cfi_startproc
-        movq    (%rsi), %rax
+        movq    (%rdi), %rax
+        addq    (%rsi), %rax
         movq    8(%rsi), %rdx
-        addq    %rax, (%rdi)
+        movq    %rax, (%rdi)
         adcq    %rdx, 8(%rdi)
         ret
         .cfi_endproc
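For reference, here is a hand-written two-limb C sketch of what foo() should
reduce to (the helper name foo_limbs is mine and not part of the testcase, and
it assumes x[0] is the low limb, as on x86-64).  It performs the same update
as the addq/adcq pair in the patched test.s above:

#include <stdint.h>

/* Illustrative only, not part of the testcase: foo() written limb by limb.
   y is added to the low word, and z plus the carry out of the low addition
   is added to the high word -- the add/adc pattern we want the compiler to
   emit.  */
void foo_limbs (uint64_t x[2], uint64_t y, uint64_t z)
{
  uint64_t lo = x[0] + y;
  x[1] += z + (lo < y);   /* carry out of the low 64-bit addition */
  x[0] = lo;
}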