https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91154

--- Comment #16 from Richard Biener <rguenth at gcc dot gnu.org> ---
Ah, because x86_64_general_operand allows memory but the v alternative does not,
and reloading that is apparently more expensive than not doing that and
reloading the general reg later.  Fun.  Changing that to
x86_64_nonmemory_operand
makes the whole thing work nearly fully (for this testcase, breaking everything
else of course); there's one gpr op remaining again because we get memory,
this time in the first operand, which I kept as nonimmediate_operand.
Not sure how we make RA happier to reload a memory operand for the v,v,v
alternative without doing that elsewhere.

        movl    $-987654321, %r10d
        vmovd   (%rdi), %xmm0
        leal    -1(%r8), %r9d
        xorl    %eax, %eax
        vmovd   %r10d, %xmm1
        .p2align 4,,10
        .p2align 3
.L3:
        vmovd   (%rdx,%rax,4), %xmm2
        vpaddd  %xmm2, %xmm0, %xmm0
        vmovd   %xmm0, 4(%rdi,%rax,4)
        movl    (%rcx,%rax,4), %r8d
        addl    (%rsi,%rax,4), %r8d
        vmovd   %r8d, %xmm3
        movq    %rax, %r8
        vpmaxsd %xmm0, %xmm3, %xmm0
        vpmaxsd %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, 4(%rdi,%rax,4)
        addq    $1, %rax
        cmpq    %r9, %r8
        jne     .L3

Reply via email to