https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88834

--- Comment #8 from kugan at gcc dot gnu.org ---
(In reply to rsand...@gcc.gnu.org from comment #7)
> Thanks for looking at this.
> 
> (In reply to kugan from comment #6)
> >     cmp     w3, 0
> >     ble     .L1
> >     sub     w3, w3, #1
> >     mov     x4, 0
> >     cntw    x5
> >     ptrue   p1.s, all
> >     lsr     w3, w3, 1
> >     add     w3, w3, 1
> >     whilelo p0.s, xzr, x3
> >     .p2align 3,,7
> > .L3:
> >     ld2w    {z4.s - z5.s}, p0/z, [x1, x4, lsl 2]
> >     ld2w    {z2.s - z3.s}, p0/z, [x2, x4, lsl 2]
> >     add     z0.s, z4.s, z2.s
> >     sub     z1.s, z5.s, z3.s
> >     st2w    {z0.s - z1.s}, p0, [x0, x4, lsl 2]
> >     whilelo p0.s, x5, x3
> >     incb    x4, all, mul #2
> >     incw    x5
> >     ptest   p1, p0.b
> >     bne     .L3
> > .L1:
> >     ret
> >     .cfi_endproc
> 
> This doesn't look right.  x4 is an index, so it should be
> incremented by the number of words in two vectors, rather than
> the number of bytes in two vectors.

Thanks for the comments. I have fixed it; with the attached patch, it now generates:

// f -- compiler-generated AArch64 SVE loop (GNU as syntax), annotated.
//
// Register usage visible in this listing:
//   x0      base address for the interleaved store (st2w)
//   x1, x2  base addresses for the two interleaved loads (ld2w)
//   w3      signed 32-bit count on entry; the function returns at once if <= 0
//   x3      (reused after entry) index in 32-bit words, scaled by "lsl 2"
//   x4      lane counter fed to whilelo; advances by one vector of words
//   x5      loop bound in pairs: ((w3 - 1) >> 1) + 1
//   p0      governing predicate for the current iteration
//   p1      all-true predicate used only by ptest
// NOTE(review): the C source is not shown here; presumably x0 is the
// destination array, x1/x2 the source arrays and w3 the element count --
// confirm against the test case in the bug report.
f:
.LFB0:
        .cfi_startproc
        cmp     w3, 0
        ble     .L1                     // nothing to do when the count is <= 0
        sub     w5, w3, #1
        cntw    x4                      // x4 = number of 32-bit lanes per vector
        mov     x3, 0                   // word index starts at 0
        ptrue   p1.s, all
        lsr     w5, w5, 1
        add     w5, w5, 1               // w5 = ((count - 1) >> 1) + 1 pairs; W write zero-extends into x5
        whilelo p0.s, xzr, x5           // lane i active while 0 + i < x5 (unsigned)
        .p2align 3,,7
.L3:
        // Each ld2w reads two-word structures and de-interleaves them:
        // first members go to the lower-numbered register, second members
        // to the higher one. Inactive lanes are zeroed (/z).
        ld2w    {z4.s - z5.s}, p0/z, [x1, x3, lsl 2]
        ld2w    {z2.s - z3.s}, p0/z, [x2, x3, lsl 2]
        add     z0.s, z4.s, z2.s        // first members: sum
        sub     z1.s, z5.s, z3.s        // second members: difference
        st2w    {z0.s - z1.s}, p0, [x0, x3, lsl 2]  // re-interleave and store active pairs
        whilelo p0.s, x4, x5            // predicate for the next iteration
        // inch adds the number of 16-bit lanes per vector, i.e. twice the
        // number of 32-bit lanes -- the count of words in two vectors, which
        // is what one ld2w/st2w consumes. x3 is a word index, not a byte offset.
        inch    x3
        incw    x4                      // advance the pair counter by one vector of words
        ptest   p1, p0.b
        bne     .L3                     // loop while any lane of the new p0 is active
.L1:
        ret
        .cfi_endproc

Reply via email to