| Issue | 83030 |
|---|---|
| Summary | Multi-precision addition using `__int128`/`i128` doesn't produce optimal code |
| Labels | new issue |
| Assignees | |
| Reporter | Eisenwave |
Compiler Explorer reproduction: https://godbolt.org/z/vovWPxexE
## Expected output
First, the expected "canonical" output, which we get both for `_BitInt(512)` additions and for `__builtin_addcll`, looks as follows:
```cpp
using u128 = unsigned __int128;
using u64 = unsigned long long;
void add_adc(u64* __restrict dest, const u64* a, const u64* b)
{
u64 carry = 0;
for (int i = 0; i < 8; ++i) {
dest[i] = __builtin_addcll(a[i], b[i], carry, &carry);
}
}
```
```asm
add_adc(unsigned long long*, unsigned long long const*, unsigned long long const*): # @add_adc(unsigned long long*, unsigned long long const*, unsigned long long const*)
mov rax, qword ptr [rsi]
add rax, qword ptr [rdx]
mov qword ptr [rdi], rax
mov rax, qword ptr [rsi + 8]
adc rax, qword ptr [rdx + 8]
mov qword ptr [rdi + 8], rax
mov rax, qword ptr [rsi + 16]
adc rax, qword ptr [rdx + 16]
mov qword ptr [rdi + 16], rax
mov rax, qword ptr [rsi + 24]
adc rax, qword ptr [rdx + 24]
mov qword ptr [rdi + 24], rax
mov rax, qword ptr [rsi + 32]
adc rax, qword ptr [rdx + 32]
mov qword ptr [rdi + 32], rax
mov rax, qword ptr [rsi + 40]
adc rax, qword ptr [rdx + 40]
mov qword ptr [rdi + 40], rax
mov rax, qword ptr [rsi + 48]
adc rax, qword ptr [rdx + 48]
mov qword ptr [rdi + 48], rax
mov rax, qword ptr [rsi + 56]
adc rax, qword ptr [rdx + 56]
mov qword ptr [rdi + 56], rax
ret
```
We get one `adc` per loop iteration, which is the theoretical optimum here.
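For reference, here is a minimal sketch of the `_BitInt(512)` formulation mentioned above. The report only states that `_BitInt(512)` additions produce this same canonical chain; the `u512` alias and the `add_512` name below are illustrative assumptions, not code taken from the Compiler Explorer link.
```cpp
// Minimal sketch, assuming Clang's _BitInt extension is available in C++ mode.
using u512 = unsigned _BitInt(512);

void add_512(u512* __restrict dest, const u512* a, const u512* b)
{
    // A single whole-width addition; this is the kind of 512-bit add that,
    // per the report, already lowers to one adc per 64-bit limb.
    *dest = *a + *b;
}
```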
## Actual output for 128-bit integers
When we attempt to get the same codegen with 128-bit addition, we don't reach that optimum:
```cpp
void add_128(u64* __restrict dest, const u64* a, const u64* b)
{
u64 carry = 0;
for (int i = 0; i < 8; ++i) {
auto sum = u128(carry) + a[i] + b[i];
dest[i] = sum;
carry = sum >> 64;
}
}
```
This compiles to:
```asm
add_128(unsigned long long*, unsigned long long const*, unsigned long long const*):
mov rax, qword ptr [rsi]
add rax, qword ptr [rdx]
mov qword ptr [rdi], rax
mov rax, qword ptr [rsi + 8]
adc rax, 0
setb cl
movzx ecx, cl
add rax, qword ptr [rdx + 8]
mov qword ptr [rdi + 8], rax
adc rcx, qword ptr [rsi + 16]
setb al
movzx eax, al
add rcx, qword ptr [rdx + 16]
mov qword ptr [rdi + 16], rcx
adc rax, qword ptr [rsi + 24]
setb cl
movzx ecx, cl
add rax, qword ptr [rdx + 24]
mov qword ptr [rdi + 24], rax
adc rcx, qword ptr [rsi + 32]
setb al
movzx eax, al
add rcx, qword ptr [rdx + 32]
mov qword ptr [rdi + 32], rcx
adc rax, qword ptr [rsi + 40]
setb cl
movzx ecx, cl
add rax, qword ptr [rdx + 40]
mov qword ptr [rdi + 40], rax
adc rcx, qword ptr [rsi + 48]
setb al
movzx eax, al
add rcx, qword ptr [rdx + 48]
mov qword ptr [rdi + 48], rcx
adc rax, qword ptr [rsi + 56]
add rax, qword ptr [rdx + 56]
mov qword ptr [rdi + 56], rax
ret
```
LLVM does emit `adc` here, but it emits two additions per loop iteration.
Basically, `u128(carry) + a[i] + b[i]` turns into:
```asm
adc rcx, qword ptr [rsi + 16] ; TMP = u128(carry) + a[i]
setb al ; carry = CARRY_FLAG
movzx eax, al ; carry = ZERO_EXTEND(carry)
add rcx, qword ptr [rdx + 16] ; SUM = TMP + b[i]
mov qword ptr [rdi + 16], rcx ; dest[i] = SUM
adc rax, qword ptr [rsi + 24] ; TMP = u128(carry) + a[i]
; ...
```
The correspondence isn't exact and the instructions are somewhat interleaved, but overall we end up with two additions per iteration, and instead of keeping the carry in the CPU's carry flag at all times, it is "spilled" into a register via `setb`/`movzx`. That spill is avoidable in principle: `carry` is at most 1, so `u128(carry) + a[i] + b[i]` is at most 2^65 - 1, the carry out of each limb is a single bit, and one add-with-carry per limb would suffice.
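To make that concrete, here is a small, self-contained check (not part of the report; names are illustrative) that the 128-bit carry out equals the OR of the two partial carries, and that those two partial carries can never both be set:
```cpp
#include <cassert>
#include <cstdint>

int main()
{
    using u64  = std::uint64_t;
    using u128 = unsigned __int128;
    // Boundary values that exercise both partial carries.
    const u64 vals[] = {0, 1, 0x8000000000000000ull, ~u64(0)};
    for (u64 carry = 0; carry <= 1; ++carry)
        for (u64 a : vals)
            for (u64 b : vals) {
                u64  t  = a + carry;
                bool c1 = t < carry;               // carry out of a + carry
                u64  s  = t + b;
                bool c2 = s < b;                   // carry out of (a + carry) + b
                u64  hi = u64((u128(carry) + a + b) >> 64);
                assert(hi == u64(c1 | c2));        // 128-bit carry out equals the OR'd partial carries
                assert(!(c1 && c2));               // the two partial carries are never both set
            }
    return 0;
}
```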
## Relevant passes
Prior to x86 instruction selection, the `adc` variant still uses two additions per iteration:
```llvm
%8 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %6, i64 %7)
%9 = extractvalue { i64, i1 } %8, 1
%10 = extractvalue { i64, i1 } %8, 0
%11 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %10, i64 %5)
%12 = extractvalue { i64, i1 } %11, 1
%13 = extractvalue { i64, i1 } %11, 0
```
x86-isel combines these into:
```llvm
%4:gr64 = ADD64rm %3:gr64(tied-def 0), %2:gr64, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %ir.b); example.cpp:18:19
```
The 128-bit variant prior to instruction selection looks like this, per iteration:
```llvm
%conv.1 = zext i1 %ov to i128
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 8
%3 = load i64, ptr %arrayidx.1, align 8
%conv1.1 = zext i64 %3 to i128
%add.1 = add nuw nsw i128 %conv1.1, %conv.1
%arrayidx3.1 = getelementptr inbounds i8, ptr %b, i64 8
%4 = load i64, ptr %arrayidx3.1, align 8
%conv4.1 = zext i64 %4 to i128
```
Presumably, the `i128` addition would already need to be broken up into separate `uadd.with.overflow` calls prior to instruction selection to get the optimal codegen.
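For illustration, that break-up can be rendered at the source level with `__builtin_add_overflow` (a sketch only; the `add_128_split` name is made up, the `u64` alias is the one from the snippets above, and no claim is made that current Clang turns this particular formulation into the single-`adc` chain):
```cpp
// Sketch: the i128 addition split into two overflowing 64-bit adds whose
// carries are OR'd. Since the two partial carries can never both be set,
// the combined carry is a single bit and the chain could, in principle,
// map onto one adc per limb.
void add_128_split(u64* __restrict dest, const u64* a, const u64* b)
{
    u64 carry = 0;
    for (int i = 0; i < 8; ++i) {
        u64 t, s;
        bool c1 = __builtin_add_overflow(a[i], carry, &t); // maps to uadd.with.overflow
        bool c2 = __builtin_add_overflow(t, b[i], &s);     // maps to uadd.with.overflow
        dest[i] = s;
        carry   = c1 | c2;                                 // single-bit carry into the next limb
    }
}
```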