https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97875
Bug ID: 97875
Summary: suboptimal loop vectorization
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: clyon at gcc dot gnu.org
Target Milestone: ---
Looking at the code generated for gcc.target/arm/simd/mve-vsub_1.c:
#include <stdint.h>
void test_vsub_i32 (int32_t * dest, int32_t * a, int32_t * b) {
int i;
for (i=0; i<4; i++) {
dest[i] = a[i] - b[i];
}
}
Compiled with -mfloat-abi=hard -mfpu=auto -march=armv8.1-m.main+mve -mthumb
-O3, we get:
test_vsub_i32:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
add ip, r1, #4
adds r3, r2, #4
sub ip, r0, ip
subs r3, r0, r3
cmp ip, #8
it hi
cmphi r3, #8
bls .L2
orr r3, r2, r0
orrs r3, r3, r1
lsls r3, r3, #28
bne .L2
vldrw.32 q3, [r1]
vldrw.32 q2, [r2]
vsub.i32 q3, q3, q2
vstrw.32 q3, [r0]
bx lr
.L2:
ldr r3, [r1]
push {r4}
ldr r4, [r2]
subs r3, r3, r4
str r3, [r0]
ldr r4, [r2, #4]
ldr r3, [r1, #4]
subs r3, r3, r4
str r3, [r0, #4]
ldr r4, [r2, #8]
ldr r3, [r1, #8]
subs r3, r3, r4
str r3, [r0, #8]
ldr r3, [r1, #12]
ldr r2, [r2, #12]
ldr r4, [sp], #4
subs r3, r3, r2
str r3, [r0, #12]
bx lr
but only the short vectorized sequence should be necessary:
vldrw.32 q3, [r1]
vldrw.32 q2, [r2]
vsub.i32 q3, q3, q2
vstrw.32 q3, [r0]
bx lr
Since the loop trip count is constant (=4), why isn't this better optimized?
If I declare 'dest' as __restrict__, the generated code is better, but still not
ideal — the runtime alignment check and scalar fallback remain:
test_vsub_i32:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
orr r3, r2, r0
orrs r3, r3, r1
lsls r3, r3, #28
bne .L2
vldrw.32 q3, [r1]
vldrw.32 q2, [r2]
vsub.i32 q3, q3, q2
vstrw.32 q3, [r0]
bx lr
.L2:
push {r4, r5}
ldr r3, [r1]
ldr r4, [r2]
subs r4, r3, r4
str r4, [r0]
ldr r3, [r1, #4]
ldr r4, [r2, #4]
subs r5, r3, r4
str r5, [r0, #4]
ldrd r4, r3, [r1, #8]
ldrd r5, r1, [r2, #8]
subs r4, r4, r5
subs r3, r3, r1
strd r4, r3, [r0, #8]
pop {r4, r5}
bx lr
Compiling for cortex-a9 and Neon:
-mfloat-abi=hard -mcpu=cortex-a9 -mfpu=neon -O3
test_vsub_i32:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
add ip, r2, #4
adds r3, r1, #4
sub ip, r0, ip
subs r3, r0, r3
cmp ip, #8
it hi
cmphi r3, #8
bls .L2
vld1.32 {q8}, [r1]
vld1.32 {q9}, [r2]
vsub.i32 q8, q8, q9
vst1.32 {q8}, [r0]
bx lr
.L2:
ldr r3, [r1]
push {r4}
ldr r4, [r2]
subs r3, r3, r4
str r3, [r0]
ldr r4, [r2, #4]
ldr r3, [r1, #4]
subs r3, r3, r4
str r3, [r0, #4]
ldr r4, [r2, #8]
ldr r3, [r1, #8]
subs r3, r3, r4
ldr r4, [sp], #4
str r3, [r0, #8]
ldr r3, [r1, #12]
ldr r2, [r2, #12]
subs r3, r3, r2
str r3, [r0, #12]
bx lr
But in this (Neon) case adding __restrict__ works well — only the vectorized sequence is emitted:
test_vsub_i32:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
vld1.32 {q8}, [r1]
vld1.32 {q9}, [r2]
vsub.i32 q8, q8, q9
vst1.32 {q8}, [r0]
bx lr