Issue |
139625
|
Summary |
Non-optimal code generation for NEON.
|
Labels |
new issue
|
Assignees |
|
Reporter |
TheBlackPlague
|
**Clang v20.1.3** / **Apple-M3**
See the barebones example below:
```cpp
// Shorthand for a fixed-size array of N elements of type T.
template<typename T, size_t N>
using A = std::array<T, N>;

// Clamped multiply-accumulate over two input vectors.
//
// For every output i in [0, M):
//   y[i] = b[i] + sum over j in [0, N) of
//            clamp(x0[j], 0, 255) * w[j]  +  clamp(x1[j], 0, 255) * w[j + N]
//
// T is the element type of the inputs, weights and bias; U is the (wider)
// accumulator and output type. The listings in this report use signed 16-bit
// loads and 32-bit accumulation, which corresponds to T = int16_t, U = int32_t.
//
// x0, x1 : the two input vectors (N elements each)
// w      : weights; w[0..N) pairs with x0 and w[N..2N) pairs with x1
// b      : per-output bias
// y      : receives the M results. The original snippet assigned to an
//          undeclared `y`; it is made an explicit output parameter here so
//          the example compiles.
//
// NOTE(review): the weight index does not depend on i, so for M > 1 every
// output reuses w[0..2N) even though w holds N * 2 * M elements. This is kept
// exactly as in the report (the listings appear to be for M == 1) — confirm
// the intended layout before relying on M > 1.
template<typename T, typename U, size_t N, size_t M>
void AFAF(A<T, N> x0, A<T, N> x1, A<T, N * 2 * M> w, A<T, M> b, A<U, M>& y)
{
    for (size_t i = 0; i < M; i++) {
        U a = 0;
        for (size_t j = 0; j < N; j++) {
            // Clamp to [0, 255]: lower bound first, then upper bound, which is
            // the smax-then-smin order the generated code uses. The explicit
            // <T> is needed because the literals are int while the elements
            // are T, so plain std::min/std::max cannot deduce one type.
            a += std::min<T>(255, std::max<T>(0, x0[j])) * w[j    ];
            a += std::min<T>(255, std::max<T>(0, x1[j])) * w[j + N];
        }
        y[i] = a + b[i];
    }
}
```
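Given N == 256 (written as "N == 2" in the original report, but the listing below loops over 256 elements and reads the second half of `w` at byte offset 0x200), the following assembly is generated on `apple-m3`: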
```
// Compiler output for the example above (the "bad" case). The loop below
// handles 32 16-bit elements per iteration and runs until 256 elements are done.
// NOTE(review): x0, x1, x2 and x3 presumably point at the x0, x1, w and b
// arguments of the C++ example — the listing only shows them used as base
// addresses; confirm against the calling convention.
AFAF:
// Prologue: reserve 128 bytes of stack and save d8/d9, x19-x28 and x29/x30.
// All of these are overwritten below (v8/v9 as load targets, x19-x28 and x30
// as offset constants).
sub sp, sp, #128
stp d9, d8, [sp, #16]
stp x28, x27, [sp, #32]
stp x26, x25, [sp, #48]
stp x24, x23, [sp, #64]
stp x22, x21, [sp, #80]
stp x20, x19, [sp, #96]
stp x29, x30, [sp, #112]
// x3 is spilled because the register is reused for an offset constant below;
// it is reloaded after the loop.
str x3, [sp, #8]
// x8 = element counter. x9/x10/x11 = x1/x2/x0 biased by 32 bytes so each
// iteration can use one ldp at #-32 and one post-incrementing ldp.
mov x8, #0
add x9, x1, #32
add x10, x2, #32
add x11, x0, #32
// v0 = 0 and v1 = 0x00ff in every 16-bit lane: the clamp bounds applied with
// smax/smin inside the loop.
movi.2d v0, #0000000000000000
movi.2d v1, #0xff00ff00ff00ff
// Byte offsets 532..574 are parked in general registers for the per-lane loads
// in the loop. The movi instructions interleaved with them zero the eight
// 4 x 32-bit accumulators (v2-v7, v16, v17).
mov w5, #532
mov w6, #534
mov w7, #536
mov w19, #538
mov w20, #540
mov w21, #542
mov w22, #544
mov w23, #546
mov w24, #548
mov w25, #550
mov w26, #552
mov w27, #554
movi.2d v2, #0000000000000000
mov w28, #556
movi.2d v3, #0000000000000000
mov w30, #558
mov w12, #560
movi.2d v4, #0000000000000000
mov w3, #562
movi.2d v6, #0000000000000000
movi.2d v7, #0000000000000000
movi.2d v5, #0000000000000000
movi.2d v17, #0000000000000000
movi.2d v16, #0000000000000000
mov w13, #564
mov w14, #566
mov w15, #568
mov w16, #570
mov w17, #572
mov w0, #574
LBB25_1:
// x1 = byte offset of the current 32-element block (counter * 2).
lsl x1, x8, #1
// Build v18..v25 (four halfwords each) one lane at a time from
// x2 + (x1 | k), k = 512, 514, ..., 574. x1 is a multiple of 64 and stays
// below 512, so the OR equals an add: these are 32 consecutive halfwords
// located 512 bytes after the block loaded through x10 further down.
// Offsets 512..530 are rebuilt in w4 every iteration; 532..574 come from the
// registers initialised before the loop.
orr x4, x1, #0x200
ldr h18, [x2, x4]
mov w4, #514
orr x4, x1, x4
add x4, x2, x4
ld1.h { v18 }[1], [x4]
mov w4, #516
orr x4, x1, x4
add x4, x2, x4
ld1.h { v18 }[2], [x4]
mov w4, #518
orr x4, x1, x4
add x4, x2, x4
ld1.h { v18 }[3], [x4]
mov w4, #520
orr x4, x1, x4
ldr h19, [x2, x4]
mov w4, #522
orr x4, x1, x4
add x4, x2, x4
ld1.h { v19 }[1], [x4]
mov w4, #524
orr x4, x1, x4
add x4, x2, x4
ld1.h { v19 }[2], [x4]
mov w4, #526
orr x4, x1, x4
add x4, x2, x4
ld1.h { v19 }[3], [x4]
mov w4, #528
orr x4, x1, x4
ldr h20, [x2, x4]
mov w4, #530
orr x4, x1, x4
add x4, x2, x4
ld1.h { v20 }[1], [x4]
orr x4, x1, x5
add x4, x2, x4
ld1.h { v20 }[2], [x4]
orr x4, x1, x6
add x4, x2, x4
ld1.h { v20 }[3], [x4]
orr x4, x1, x7
ldr h21, [x2, x4]
orr x4, x1, x19
add x4, x2, x4
ld1.h { v21 }[1], [x4]
orr x4, x1, x20
add x4, x2, x4
ld1.h { v21 }[2], [x4]
orr x4, x1, x21
add x4, x2, x4
ld1.h { v21 }[3], [x4]
orr x4, x1, x22
ldr h22, [x2, x4]
orr x4, x1, x23
add x4, x2, x4
ld1.h { v22 }[1], [x4]
orr x4, x1, x24
add x4, x2, x4
ld1.h { v22 }[2], [x4]
orr x4, x1, x25
add x4, x2, x4
ld1.h { v22 }[3], [x4]
orr x4, x1, x26
ldr h23, [x2, x4]
orr x4, x1, x27
add x4, x2, x4
ld1.h { v23 }[1], [x4]
orr x4, x1, x28
add x4, x2, x4
ld1.h { v23 }[2], [x4]
orr x4, x1, x30
add x4, x2, x4
ld1.h { v23 }[3], [x4]
orr x4, x1, x12
ldr h24, [x2, x4]
orr x4, x1, x3
add x4, x2, x4
ld1.h { v24 }[1], [x4]
orr x4, x1, x13
add x4, x2, x4
ld1.h { v24 }[2], [x4]
orr x4, x1, x14
add x4, x2, x4
ld1.h { v24 }[3], [x4]
orr x4, x1, x15
ldr h25, [x2, x4]
orr x4, x1, x16
add x4, x2, x4
ld1.h { v25 }[1], [x4]
orr x4, x1, x17
add x4, x2, x4
ld1.h { v25 }[2], [x4]
orr x1, x1, x0
add x1, x2, x1
ld1.h { v25 }[3], [x1]
// First input stream: load 32 halfwords through x11 (advancing it by 64
// bytes) and clamp each lane to [0, 255].
ldp q26, q27, [x11, #-32]
ldp q28, q29, [x11], #64
smax.8h v26, v26, v0
smax.8h v27, v27, v0
smax.8h v28, v28, v0
smax.8h v29, v29, v0
smin.8h v26, v26, v1
smin.8h v27, v27, v1
smin.8h v28, v28, v1
smin.8h v29, v29, v1
// Matching 32 halfwords through x10, fetched with plain contiguous vector
// loads, then widening multiply-accumulate of low (smlal) and high (smlal2)
// halves into the eight accumulators.
ldp q30, q31, [x10, #-32]
ldp q8, q9, [x10], #64
smlal.4s v2, v26, v30
smlal2.4s v3, v26, v30
smlal.4s v4, v27, v31
smlal2.4s v6, v27, v31
smlal.4s v7, v28, v8
smlal2.4s v5, v28, v8
smlal.4s v17, v29, v9
smlal2.4s v16, v29, v9
// Second input stream: load 32 halfwords through x9 and clamp to [0, 255].
ldp q26, q27, [x9, #-32]
ldp q28, q29, [x9], #64
smax.8h v26, v26, v0
smax.8h v27, v27, v0
smax.8h v28, v28, v0
smax.8h v29, v29, v0
smin.8h v26, v26, v1
smin.8h v27, v27, v1
smin.8h v28, v28, v1
smin.8h v29, v29, v1
// The lane-built vectors v18..v25 only hold four halfwords each, so the upper
// half of every clamped input is rotated into the lower half (ext #8) and
// both halves go through the low-half smlal form.
ext.16b v30, v26, v26, #8
ext.16b v31, v27, v27, #8
ext.16b v8, v28, v28, #8
ext.16b v9, v29, v29, #8
smlal.4s v3, v30, v19
smlal.4s v2, v26, v18
smlal.4s v6, v31, v21
smlal.4s v4, v27, v20
smlal.4s v5, v8, v23
smlal.4s v7, v28, v22
smlal.4s v16, v9, v25
// Advance the element counter by 32 and loop until it reaches 256.
add x8, x8, #32
smlal.4s v17, v29, v24
cmp x8, #256
b.ne LBB25_1
// Fold the eight accumulators into one vector, sum its four lanes, and move
// the scalar result to w8.
add.4s v0, v4, v2
add.4s v1, v6, v3
add.4s v2, v17, v7
add.4s v0, v2, v0
add.4s v2, v16, v5
add.4s v1, v2, v1
add.4s v0, v0, v1
addv.4s s0, v0
fmov w8, s0
// Reload the spilled x3, add the sign-extended halfword it points to, and
// return the sum in w0.
ldr x9, [sp, #8]
ldrsh w9, [x9]
add w0, w8, w9
// Epilogue: restore the saved registers and release the stack frame.
ldp x29, x30, [sp, #112]
ldp x20, x19, [sp, #96]
ldp x22, x21, [sp, #80]
ldp x24, x23, [sp, #64]
ldp x26, x25, [sp, #48]
ldp x28, x27, [sp, #32]
ldp d9, d8, [sp, #16]
add sp, sp, #128
ret
```
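This isn't good. It is doing unnecessary stuff: the second block of weights is assembled one 16-bit lane at a time (`ldr h` / `ld1.h`), which costs extra offset registers and callee-saved spills, even though those 32 values are contiguous in memory.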
However, when I increase N to 384, the generated code is suddenly much better (same high-level C++ code):
```
// Compiler output for the same source with N = 384 (the "good" case): no
// stack frame, no spills, and every load in the loop is a contiguous ldp.
// v0 = 0 and v1 = 0x00ff per 16-bit lane are the clamp bounds; v2-v7, v16 and
// v17 are the eight zeroed 4 x 32-bit accumulators.
movi.2d v0, #0000000000000000
movi.2d v1, #0xff00ff00ff00ff
movi.2d v2, #0000000000000000
movi.2d v3, #0000000000000000
// x8/x9 = x1/x0 biased by 32 bytes for the paired loads; w10 = remaining
// element count, decremented by 32 per iteration.
add x8, x1, #32
add x9, x0, #32
mov w10, #384
movi.2d v5, #0000000000000000
movi.2d v6, #0000000000000000
movi.2d v7, #0000000000000000
movi.2d v4, #0000000000000000
movi.2d v17, #0000000000000000
movi.2d v16, #0000000000000000
LBB26_1:
// First input stream: load 32 halfwords through x9 and clamp to [0, 255].
ldp q18, q19, [x9, #-32]
ldp q20, q21, [x9], #64
smax.8h v18, v18, v0
smax.8h v19, v19, v0
smax.8h v20, v20, v0
smax.8h v21, v21, v0
smin.8h v18, v18, v1
smin.8h v19, v19, v1
smin.8h v20, v20, v1
smin.8h v21, v21, v1
// 32 halfwords at x2, then widening multiply-accumulate of low (smlal) and
// high (smlal2) halves.
ldp q22, q23, [x2]
ldp q24, q25, [x2, #32]
smlal.4s v2, v18, v22
smlal2.4s v3, v18, v22
smlal.4s v5, v19, v23
smlal2.4s v6, v19, v23
smlal.4s v7, v20, v24
smlal2.4s v4, v20, v24
smlal.4s v17, v21, v25
smlal2.4s v16, v21, v25
// Second input stream: load 32 halfwords through x8 and clamp to [0, 255].
ldp q18, q19, [x8, #-32]
ldp q20, q21, [x8], #64
smax.8h v18, v18, v0
smax.8h v19, v19, v0
smax.8h v20, v20, v0
smax.8h v21, v21, v0
smin.8h v18, v18, v1
smin.8h v19, v19, v1
smin.8h v20, v20, v1
smin.8h v21, v21, v1
// 32 halfwords starting 768 bytes past x2, again as two contiguous paired
// loads, so smlal/smlal2 can be used directly without any lane shuffling.
ldp q22, q23, [x2, #768]
ldp q24, q25, [x2, #800]
smlal2.4s v3, v18, v22
smlal.4s v2, v18, v22
smlal2.4s v6, v19, v23
smlal.4s v5, v19, v23
smlal2.4s v4, v20, v24
smlal.4s v7, v20, v24
smlal2.4s v16, v21, v25
smlal.4s v17, v21, v25
// Step x2 by 64 bytes, count down 32 elements, loop while any remain.
add x2, x2, #64
subs x10, x10, #32
b.ne LBB26_1
// Fold the eight accumulators into one vector, sum its four lanes, add the
// sign-extended halfword at x3, and return the result in w0.
add.4s v0, v5, v2
add.4s v1, v6, v3
add.4s v2, v17, v7
add.4s v0, v2, v0
add.4s v2, v16, v4
add.4s v1, v2, v1
add.4s v0, v0, v1
addv.4s s0, v0
fmov w8, s0
ldrsh w9, [x3]
add w0, w8, w9
ret
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs