https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117860

            Bug ID: 117860
           Summary: GCC emits an unnecessary mov for x86
                    _addcarry/_subborrow intrinsic calls where the second
                    operand is a constant that is within the range of a
                    32-bit integer
           Product: gcc
           Version: 14.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: john_platts at hotmail dot com
  Target Milestone: ---

Here is a snippet of C99/C++ code for which GCC emits a mov followed by an
adc/sbb instead of a single adc/sbb with an immediate operand, even though the
second adc/sbb operand (the third argument of the
_addcarry_u32/_addcarry_u64/_subborrow_u32/_subborrow_u64 intrinsics) is a
compile-time constant within the range of a signed 32-bit integer (and is
therefore encodable as a sign-extended 32-bit immediate):
#include <stdint.h>

#if (defined(__GNUC__) || defined(__clang__))
#include <immintrin.h>
#elif defined(_MSC_VER)
#include <intrin.h>
#endif

/*
 * 192-bit unsigned value held as three 64-bit limbs.  The functions below
 * chain the carry/borrow from lo64 into mid64 and then into hi64, so lo64 is
 * the least significant limb and hi64 the most significant one.
 */
typedef struct {
  uint64_t lo64;  /* least significant 64 bits */
  uint64_t mid64; /* middle 64 bits */
  uint64_t hi64;  /* most significant 64 bits */
} UInt192;

/*
 * Reproducer for the add-with-carry case.
 *
 * Adds b to a_lo, then adds the constant 5 plus the resulting carry to a_hi:
 *   result.lo64  = low 64 bits of (a_lo + b)
 *   result.mid64 = low 64 bits of (a_hi + 5 + carry from the first add)
 *   result.hi64  = carry out of the second add (0 or 1)
 *
 * The exact form of this function matters: the assembly quoted in this report
 * was generated from it, so do not restructure it.
 */
UInt192 SomeAddFunc(uint64_t a_lo, uint64_t a_hi, uint64_t b) {
  UInt192 result;
  unsigned char cf;       /* carry flag passed between the two additions */
  unsigned long long sum; /* output slot reused by both intrinsic calls */

  /* Low limb: no carry-in; cf receives the carry-out. */
  cf = _addcarry_u64(0, a_lo, b, &sum);
  result.lo64 = sum;

  /* Middle limb: the third argument is the small constant 5.  This is the
   * call for which GCC 14.2.0 emits `mov ecx, 5` + `adc rdx, rcx` rather than
   * `adc rdx, 5`. */
  cf = _addcarry_u64(cf, a_hi, 5, &sum);
  result.mid64 = sum;
  /* High limb is just the final carry, zero-extended to 64 bits. */
  result.hi64 = cf;

  return result;
}


/*
 * Reproducer for the subtract-with-borrow case.
 *
 * Subtracts b from a_lo, then subtracts the constant 17 plus the resulting
 * borrow from a_hi:
 *   result.lo64  = low 64 bits of (a_lo - b)
 *   result.mid64 = low 64 bits of (a_hi - 17 - borrow from the first subtract)
 *   result.hi64  = 0 - 0 - borrow from the second subtract, i.e. 0 when there
 *                  was no borrow and all ones when there was
 *
 * The exact form of this function matters: the assembly quoted in this report
 * was generated from it, so do not restructure it.
 */
UInt192 SomeSubFunc(uint64_t a_lo, uint64_t a_hi, uint64_t b) {
  UInt192 result;
  unsigned char cf;        /* borrow flag passed between the subtractions */
  unsigned long long diff; /* output slot reused by all intrinsic calls */

  /* Low limb: no borrow-in; cf receives the borrow-out. */
  cf = _subborrow_u64(0, a_lo, b, &diff);
  result.lo64 = diff;

  /* Middle limb: the third argument is the small constant 17.  This is the
   * call for which GCC 14.2.0 emits `mov ecx, 17` + `sbb rdx, rcx` rather
   * than `sbb rdx, 17`. */
  cf = _subborrow_u64(cf, a_hi, 17, &diff);
  result.mid64 = diff;
  /* High limb: propagate the final borrow into a full 64-bit word; the
   * returned borrow flag is intentionally discarded. */
  (void)_subborrow_u64(cf, 0, 0, &diff);
  result.hi64 = diff;

  return result;
}

Here is the code that GCC 14.2.0 generates for the above snippet with the -O2
option:
SomeAddFunc:
        add     rsi, rcx
        mov     ecx, 5
        mov     rax, rdi
        adc     rdx, rcx
        movq    xmm0, rsi
        movq    xmm1, rdx
        setc    dl
        punpcklqdq      xmm0, xmm1
        movzx   edx, dl
        mov     QWORD PTR [rdi+16], rdx
        movups  XMMWORD PTR [rdi], xmm0
        ret
SomeSubFunc:
        sub     rsi, rcx
        mov     ecx, 17
        mov     rax, rdi
        sbb     rdx, rcx
        movq    xmm0, rsi
        movq    xmm1, rdx
        sbb     rdx, rdx
        punpcklqdq      xmm0, xmm1
        mov     QWORD PTR [rdi+16], rdx
        movups  XMMWORD PTR [rdi], xmm0
        ret

In the SomeAddFunc code that is generated by GCC 14.2.0, GCC fails to optimize
the following instructions down to adc rdx, 5 when optimizations are enabled:
        mov     ecx, 5
        adc     rdx, rcx

Likewise, in the SomeSubFunc code that is generated by GCC 14.2.0, GCC fails to
optimize the following instructions down to sbb rdx, 17 when optimizations are
enabled:
        mov     ecx, 17
        sbb     rdx, rcx

A demonstration of the above snippet being compiled with GCC 14.2.0, Clang
19.1.0, and MSVC v19.40 can be found at https://godbolt.org/z/zW8WToP5G.

Reply via email to