https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51838
--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Andrew Pinski from comment #1)
> We do get slightly better now:
>         xorl    %eax, %eax
>         movq    %rdi, %r8
>         xorl    %edi, %edi
>         addq    %rsi, %rax
>         adcq    %rdi, %rdx
>         addq    %rax, (%r8)
>         adcq    %rdx, 8(%r8)
>         ret
>
> Note on aarch64 we do get good code:
>         ldp     x3, x4, [x0]
>         adds    x3, x3, x1
>         adc     x4, x4, x2
>         stp     x3, x4, [x0]
>         ret

The interesting thing is that when I remove addti3 and ashlti3 from i386.md,
GCC generates optimal code.

void foo(__uint128_t *x, unsigned long long y, unsigned long long z)
{
  *x += y + ((__uint128_t) z << 64);
}

void foo1(__uint128_t *x, unsigned long long y, unsigned long long z)
{
  *x += (__uint128_t) z << 64;
}

void foo2(__uint128_t *x, unsigned long long y, unsigned long long z)
{
  *x += (__uint128_t) z << 3;
}

void foo3(__uint128_t *x, __uint128_t *y)
{
  *x += *y;
}

diff --git a/origin.s b/test.s
index 08274ba..764241a 100644
--- a/origin.s
+++ b/test.s
@@ -6,13 +6,8 @@
 foo:
 .LFB0:
         .cfi_startproc
-        xorl    %eax, %eax
-        movq    %rdi, %r8
-        xorl    %edi, %edi
-        addq    %rsi, %rax
-        adcq    %rdi, %rdx
-        addq    %rax, (%r8)
-        adcq    %rdx, 8(%r8)
+        addq    %rsi, (%rdi)
+        adcq    %rdx, 8(%rdi)
         ret
         .cfi_endproc
 .LFE0:
@@ -23,9 +18,7 @@ foo:
 foo1:
 .LFB1:
         .cfi_startproc
-        xorl    %eax, %eax
-        addq    %rax, (%rdi)
-        adcq    %rdx, 8(%rdi)
+        addq    %rdx, 8(%rdi)
         ret
         .cfi_endproc
 .LFE1:
@@ -36,13 +29,13 @@ foo1:
 foo2:
 .LFB2:
         .cfi_startproc
-        movq    %rdx, %rax
-        movq    %rdx, %r8
-        salq    $3, %rax
-        xorl    %edx, %edx
-        shldq   $3, %r8, %rdx
-        addq    %rax, (%rdi)
-        adcq    %rdx, 8(%rdi)
+        movq    (%rdi), %rcx
+        movq    %rdx, %rsi
+        shrq    $61, %rsi
+        leaq    (%rcx,%rdx,8), %rax
+        cmpq    %rcx, %rax
+        movq    %rax, (%rdi)
+        adcq    %rsi, 8(%rdi)
         ret
         .cfi_endproc
 .LFE2:
@@ -53,9 +46,10 @@ foo2:
 foo3:
 .LFB3:
         .cfi_startproc
-        movq    (%rsi), %rax
+        movq    (%rdi), %rax
+        addq    (%rsi), %rax
         movq    8(%rsi), %rdx
-        addq    %rax, (%rdi)
+        movq    %rax, (%rdi)
         adcq    %rdx, 8(%rdi)
         ret
         .cfi_endproc
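For reference, here is a hand-written two-limb C sketch of what foo() should
reduce to (the helper name foo_limbs is mine and not part of the testcase, and
it assumes x[0] is the low limb, as on x86-64).  It performs the same update
as the addq/adcq pair in the patched test.s above:

#include <stdint.h>

/* Illustrative only, not part of the testcase: foo() written limb by limb.
   y is added to the low word, and z plus the carry out of the low addition
   is added to the high word -- the add/adc pattern we want the compiler to
   emit.  */
void foo_limbs (uint64_t x[2], uint64_t y, uint64_t z)
{
  uint64_t lo = x[0] + y;
  x[1] += z + (lo < y);   /* carry out of the low 64-bit addition */
  x[0] = lo;
}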