[Bug target/82667] New: SSE2 redundant pcmpgtd for sign-extension of values known to be >= 0

peter at cordes dot ca Sun, 22 Oct 2017 18:30:52 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82667


            Bug ID: 82667
           Summary: SSE2 redundant pcmpgtd for sign-extension of values
                    known to be >= 0
           Product: gcc
           Version: 8.0
            Status: UNCONFIRMED
          Keywords: missed-optimization, ssemmx
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: peter at cordes dot ca
  Target Milestone: ---

/*
 * Reproducer: sum the elements of data[0..32767] that are >= 128,
 * accumulating into a 64-bit total.  Each addend is either data[c]
 * (when data[c] >= 128) or 0, so no addend is ever negative.
 */
long long sumarray(const int *data)
{
    // Tell the compiler it may assume data is 64-byte aligned.
    data = (const int*)__builtin_assume_aligned(data, 64);
    long long sum = 0;
    for (int c=0 ; c<32768 ; c++)
        sum += (data[c] >= 128 ? data[c] : 0);   // int addend is widened to long long here

    return sum;
}

// Same function as pr 82666, see that for scalar cmov choices.

Same result with

        if (data[c] >= 128)
            sum += data[c];

https://godbolt.org/g/NwcPmh

gcc 8.0.0 20171022 -O3

        movdqa  .LC0(%rip), %xmm5          # set1(127)
        leaq    131072(%rdi), %rax
        pxor    %xmm2, %xmm2         # accumulator
        pxor    %xmm4, %xmm4         # for Intel CPUs we should re-materialize
                                     # with pxor inside the loop instead of movdqa.  But not AMD
.L2:
        movdqa  (%rdi), %xmm0
        addq    $16, %rdi
        movdqa  %xmm0, %xmm1
        pcmpgtd %xmm5, %xmm1
        pand    %xmm1, %xmm0
        # so far so good: we have conditionally zeroed xmm0

        movdqa  %xmm4, %xmm1
        pcmpgtd %xmm0, %xmm1    # 0 > x to generate high-half for sign-extension
        movdqa  %xmm0, %xmm3

        punpckldq       %xmm1, %xmm3   # unpack with compare result
        punpckhdq       %xmm1, %xmm0   # (instead of just zero)
        paddq   %xmm3, %xmm2
        paddq   %xmm0, %xmm2
        cmpq    %rdi, %rax
        jne     .L2
        movdqa  %xmm2, %xmm0
        psrldq  $8, %xmm0       # requires a wasted movdqa vs. pshufd or movhlps
        paddq   %xmm2, %xmm0
        movq    %xmm0, %rax
        ret

There are multiple inefficiencies that I pointed out in comments, but this bug
report is about doing sign extension when we can prove that simple zero
extension is sufficient.  Negative numbers are impossible from (x>=128 ? x :
0).

Changing the source to do zero-extension but still a signed compare stops it
from auto-vectorizing.

        int to_add = (data[c] >= 128 ? data[c] : 0);
        unsigned tmp = to_add;
        sum += (unsigned long long)tmp;  // zero-extension

Making everything unsigned does zero-extension as expected, but if the
comparison is signed, it either fails to auto-vectorize or it still uses
sign-extension.

e.g. this auto-vectorizes with sign-extension, but if you change the constant
to -128, it won't auto-vectorize at all (because then sign and zero extension
are no longer equivalent).

        int to_add = (data[c] >= 128 ? data[c] : 0);
        unsigned tmp = to_add;
        unsigned long long tmp_ull = tmp;  // zero-extension
        long long tmp_ll = tmp_ull;
        sum += tmp_ll;

[Bug target/82667] New: SSE2 redundant pcmpgtd for sign-extension of values known to be >= 0

Reply via email to