https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98438

            Bug ID: 98438
           Summary: Rather bad optimization of midpoint implementation for
                    __int128 (and other types)
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: gabravier at gmail dot com
  Target Milestone: ---

_Tp midpoint(_Tp __a, _Tp __b) noexcept
{
    using _Up = std::make_unsigned_t<_Tp>;
    constexpr _Up __bitshift = std::numeric_limits<_Up>::digits - 1;

    _Up __diff = _Up(__b) - _Up(__a);
    _Up __sign_bit = __b < __a;

    _Up __half_diff = (__diff / 2) + (__sign_bit << __bitshift) + (__sign_bit &
__diff);

    return __a + __half_diff;
}

For `_Tp` = `int`, this results in rather poor code generation on x86,
presumably because GCC does not seem to know that `sub` already sets the flags
that the subsequent `cmp` recomputes:

midpoint(int, int):
  mov edx, esi
  xor ecx, ecx
  sub edx, edi
  cmp esi, edi
  setl cl
  mov eax, edx
  mov esi, ecx
  shr eax
  and edx, ecx
  sal esi, 31
  add edx, edi
  add eax, esi
  add eax, edx
  ret

LLVM, by contrast, generates better code:

midpoint(int, int): # @midpoint(int, int)
  xor eax, eax
  sub esi, edi
  setl al
  mov ecx, esi
  and esi, eax
  shl eax, 31
  shr ecx
  add eax, edi
  add eax, ecx
  add eax, esi
  ret

The problem appears to be even worse for types such as `__int128`, for which
GCC generates the following code:

midpoint(__int128, __int128):
  mov r10, rdx
  mov r11, rcx
  mov rax, rcx
  push r14
  sub r10, rdi
  push rbx
  mov r8, rdi
  mov r9, rsi
  sbb r11, rsi
  xor ebx, ebx
  cmp rdx, rdi
  mov ecx, 1
  sbb rax, rsi
  jl .L2
  xor ecx, ecx
.L2:
  mov rax, r10
  xor esi, esi
  mov r14, rcx
  mov rdx, r11
  shrd rax, r11, 1
  sal r14, 63
  shr rdx
  add rax, rsi
  adc rdx, r14
  and rcx, r10
  mov rsi, rcx
  mov rcx, r11
  and rcx, rbx
  add rsi, r8
  pop rbx
  pop r14
  mov rdi, rcx
  adc rdi, r9
  add rax, rsi
  adc rdx, rdi
  ret

And this is the code generated by LLVM:

midpoint(__int128, __int128): # @midpoint(__int128, __int128)
  mov rax, rdx
  sub rax, rdi
  sbb rcx, rsi
  mov r8, rax
  setl dl
  mov r9, rcx
  shr r8
  shr rcx
  shl r9, 63
  movzx edx, dl
  and eax, edx
  shl rdx, 63
  or r9, r8
  add rdx, rsi
  add r9, rdi
  adc rdx, rcx
  add rax, r9
  adc rdx, 0
  ret

The GCC version requires so many registers that it even has to use some
callee-saved registers (`rbx` and `r14`, which it must push and pop).

Reply via email to