https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116014

            Bug ID: 116014
           Summary: Missed optimization opportunity: inverted shift count
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: bisqwit at iki dot fi
  Target Milestone: ---

Below are six short functions that perform bit-shifts by a non-constant,
inverted amount. GCC fails to generate optimal code for them. Further
explanation is given below the assembler code.

    #include <stdint.h>
    uint64_t shl_m64(uint64_t value, uint8_t k)
    {
        return value << (64-k);
    }
    uint64_t shl_m63(uint64_t value, uint8_t k)
    {
        return value << (63-k);
    }
    uint64_t shr_m64(uint64_t value, uint8_t k)
    {
        return value >> (64-k);
    }
    uint64_t shr_m63(uint64_t value, uint8_t k)
    {
        return value >> (63-k);
    }
    int64_t asr_m64(int64_t value, uint8_t k)
    {
        return value >> (64-k);
    }
    int64_t asr_m63(int64_t value, uint8_t k)
    {
        return value >> (63-k);
    }

Below is the code generated by GCC with -Ofast -mbmi2 -masm=intel. BMI2 is
used only to make the assembler code more succinct; it is not relevant to the
report.

    shl_m64:
        mov     eax, 64
        sub     eax, esi
        shlx    rax, rdi, rax
        ret
    shl_m63:
        mov     eax, 63
        sub     eax, esi
        shlx    rax, rdi, rax
        ret
    shr_m64:
        mov     eax, 64
        sub     eax, esi
        shrx    rax, rdi, rax
        ret
    shr_m63:
        mov     eax, 63
        sub     eax, esi
        shrx    rax, rdi, rax
        ret
    asr_m64:
        mov     eax, 64
        sub     eax, esi
        sarx    rax, rdi, rax
        ret
    asr_m63:
        mov     eax, 63
        sub     eax, esi
        sarx    rax, rdi, rax
        ret

GCC fails to utilize the fact that on x86, the shift instructions
automatically mask the shift count to the operand width. That is, a shift of
a 64-bit operand by 68 is the same as a shift by 68 % 64 = 4, and a shift of
a 32-bit operand by 100 is the same as a shift by 100 % 32 = 4. Under that
masking, 64-k is congruent to -k (mod 64) and 63-k is congruent to ~k
(mod 64), so the two-insn mov/sub sequence can be replaced by a single-insn
neg or not.
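
The underlying identities can be checked exhaustively with a small standalone
program (not part of the report, just a minimal sketch of the arithmetic):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        for (unsigned k = 0; k < 256; ++k)
        {
            /* 64 == 0 (mod 64), so (64 - k) and (0 - k) agree mod 64. */
            assert(((64u - k) & 63) == ((0u - k) & 63));
            /* ~k == -k - 1 in two's complement, so (63 - k) and ~k
               agree mod 64. */
            assert(((63u - k) & 63) == (~k & 63));
        }
        puts("inverted-count identities hold for every k in 0..255");
        return 0;
    }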

In comparison, Clang (version 16) produces this (optimal) code:

    shl_m64:
        neg     sil
        shlx    rax, rdi, rsi
        ret
    shl_m63:
        not     sil
        shlx    rax, rdi, rsi
        ret
    shr_m64:
        neg     sil
        shrx    rax, rdi, rsi
        ret
    shr_m63:
        not     sil
        shrx    rax, rdi, rsi
        ret
    asr_m64:
        neg     sil
        sarx    rax, rdi, rsi
        ret
    asr_m63:
        not     sil
        sarx    rax, rdi, rsi
        ret
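
Expressed back at the source level, the rewrite Clang performs corresponds
roughly to the sketch below (hypothetical function names; the explicit & 63
spells out the modular reduction, and is redundant on targets that mask the
shift count, which is what leaves only a lone neg/not before the shift). The
rewrite differs from the original only where the original shift count would
be out of range, i.e. undefined behaviour anyway:

    #include <stdint.h>
    /* Sketch only: shl_m64 rewritten with an explicit negation, shl_m63
       with an explicit complement.  The other four functions follow the
       same pattern with >> instead of <<. */
    uint64_t shl_m64_rewritten(uint64_t value, uint8_t k)
    {
        return value << ((0u - k) & 63);        /* 64 - k == -k (mod 64) */
    }
    uint64_t shl_m63_rewritten(uint64_t value, uint8_t k)
    {
        return value << ((~(unsigned)k) & 63);  /* 63 - k == ~k (mod 64) */
    }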

Tested GCC version: GCC: (Debian 14-20240330-1) 14.0.1 20240330 (experimental)
[master r14-9728-g6fc84f680d0]
