With r15-1618-g9f168b412f4, I get the following asm generated for the test case:
.align 1
.align 2
.global test5
.syntax unified
.thumb
.thumb_func
.type test5, %function
test5:
@ args = 4, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, r5, r6, lr}
ldr r4, [sp, #16]
cmp r4, #0
ble .L37
sub ip, r4, #16
adds r6, r2, r4
adds r5, r1, r4
add r0, r0, r4
dlstp.8 lr, r4
.L39:
subs r2, r5, r4
subs r1, r0, r4
vldrb.8 q3, [r1]
vldrb.8 q2, [r2]
subs r2, r6, r4
mov r4, ip
sub ip, ip, #16
vadd.i8 q3, q3, q2
vstrb.8 q3, [r2]
vstrb.8 q3, [r3]
letp lr, .L39
.L37:
pop {r4, r5, r6, pc}
.size test5, .-test5
...
.align 1
.align 2
.global test8
.syntax unified
.thumb
.thumb_func
.type test8, %function
test8:
@ args = 4, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, lr}
ldr r4, [sp, #8]
cmp r3, #0
ble .L59
dlstp.32 lr, r3
.L61:
vldrw.32 q3, [r0], #16
vctp.32 r4
vpst
vldrwt.32 q2, [r1], #16
adds r4, r4, #1
vadd.i32 q3, q3, q2
vstrw.32 q3, [r2], #16
letp lr, .L61
.L59:
pop {r4, pc}
.size test8, .-test8
With r15-1619-g3b9b8d6cfdf, I instead get:
.align 1
.align 2
.global test5
.syntax unified
.thumb
.thumb_func
.type test5, %function
test5:
@ args = 4, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, r5, r6, lr}
ldr ip, [sp, #16]
cmp ip, #0
ble .L37
mov r6, r3
sub r3, ip, #16
add r5, r2, ip
add r4, r1, ip
add r0, r0, ip
dlstp.8 lr, ip
.L39:
sub r2, r4, ip
sub r1, r0, ip
vldrb.8 q3, [r1]
vldrb.8 q2, [r2]
sub r2, r5, ip
mov ip, r3
subs r3, r3, #16
vadd.i8 q3, q3, q2
vstrb.8 q3, [r2]
vstrb.8 q3, [r6]
letp lr, .L39
.L37:
pop {r4, r5, r6, pc}
.size test5, .-test5
...
.align 1
.align 2
.global test8
.syntax unified
.thumb
.thumb_func
.type test8, %function
test8:
@ args = 4, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {lr}
ldr ip, [sp, #4]
cmp r3, #0
ble .L59
dlstp.32 lr, r3
.L61:
vldrw.32 q3, [r0], #16
vctp.32 ip
vpst
vldrwt.32 q2, [r1], #16
add ip, ip, #1
vadd.i32 q3, q3, q2
vstrw.32 q3, [r2], #16
letp lr, .L61
.L59:
ldr pc, [sp], #4
.size test8, .-test8
As can be seen, with r15-1619-g3b9b8d6cfdf, the compiler now uses ip where it
previously used r4 (in test8 this also means r4 is no longer pushed and popped).
I think this part is fine.
It also, for some reason, decides to move r3 into r6 in test5 and then uses r6
later as the address for the second vstrb.8. While I suppose it does work, it
costs one extra mov, so the code is slightly bigger.
With the patch below, I no longer see any failures reported for arm-none-eabi.
Even with the slight size increase for test5, is it ok for trunk?
--
Since r15-1619-g3b9b8d6cfdf, test5 and test8 fail because "ip"
might be used and r3 might be moved to another register for later
dereference.
gcc/testsuite/ChangeLog:
PR testsuite/116623
* gcc.target/arm/mve/dlstp-compile-asm-2.c: Align test5 and
test8 with changes in r15-1619-g3b9b8d6cfdf.