Issue     | 148422
Summary   | RVV intrinsics not the best register allocation
Labels    | new issue
Assignees |
Reporter  | camel-cdr
While already much better than GCC, LLVM currently produces a fair number of redundant register moves for the following RVV function: https://godbolt.org/z/hnYKedzM9 (43 vs. 32 instructions).
```c
#include <riscv_vector.h>

// Transpose 8x8 blocks of 16-bit elements using masked vslide1up/vslide1down.
vuint16m8_t
trans8x8_vslide(vuint16m8_t v)
{
	size_t VL = __riscv_vsetvlmax_e64m4();

	// Masks selecting the odd and even element positions.
	vbool16_t modd = __riscv_vreinterpret_b16(
		__riscv_vmv_v_x_u8m1(0b10101010, __riscv_vsetvlmax_e8m1()));
	vbool16_t meven = __riscv_vmnot(modd, VL);
	vbool16_t m;

	// 64-bit stage: masked slides between the two m4 halves.
	vuint64m4_t v4l = __riscv_vreinterpret_u64m4(__riscv_vget_u16m4(v, 0));
	vuint64m4_t v4h = __riscv_vreinterpret_u64m4(__riscv_vget_u16m4(v, 1));
	vuint64m4_t v4lt = v4l;
	m = modd;
	v4l = __riscv_vslide1up_mu(m, v4l, v4h, 0, VL);
	m = meven;
	v4h = __riscv_vslide1down_mu(m, v4h, v4lt, 0, VL);

	// 32-bit stage: masked slides between m2 pairs.
	vuint32m2_t v2ll = __riscv_vreinterpret_u32m2(__riscv_vget_u64m2(v4l, 0));
	vuint32m2_t v2lh = __riscv_vreinterpret_u32m2(__riscv_vget_u64m2(v4l, 1));
	vuint32m2_t v2hl = __riscv_vreinterpret_u32m2(__riscv_vget_u64m2(v4h, 0));
	vuint32m2_t v2hh = __riscv_vreinterpret_u32m2(__riscv_vget_u64m2(v4h, 1));
	vuint32m2_t v2llt = v2lh, v2hlt = v2hh;
	v2lh = __riscv_vslide1down_mu(m, v2lh, v2ll, 0, VL);
	v2hh = __riscv_vslide1down_mu(m, v2hh, v2hl, 0, VL);
	m = modd;
	v2ll = __riscv_vslide1up_mu(m, v2ll, v2llt, 0, VL);
	v2hl = __riscv_vslide1up_mu(m, v2hl, v2hlt, 0, VL);

	// 16-bit stage: masked slides between m1 pairs.
	vuint16m1_t v1lll = __riscv_vreinterpret_u16m1(__riscv_vget_u32m1(v2ll, 0));
	vuint16m1_t v1llh = __riscv_vreinterpret_u16m1(__riscv_vget_u32m1(v2ll, 1));
	vuint16m1_t v1lhl = __riscv_vreinterpret_u16m1(__riscv_vget_u32m1(v2lh, 0));
	vuint16m1_t v1lhh = __riscv_vreinterpret_u16m1(__riscv_vget_u32m1(v2lh, 1));
	vuint16m1_t v1hll = __riscv_vreinterpret_u16m1(__riscv_vget_u32m1(v2hl, 0));
	vuint16m1_t v1hlh = __riscv_vreinterpret_u16m1(__riscv_vget_u32m1(v2hl, 1));
	vuint16m1_t v1hhl = __riscv_vreinterpret_u16m1(__riscv_vget_u32m1(v2hh, 0));
	vuint16m1_t v1hhh = __riscv_vreinterpret_u16m1(__riscv_vget_u32m1(v2hh, 1));
	vuint16m1_t v1lllt = v1lll, v1lhlt = v1lhl, v1hllt = v1hll, v1hhlt = v1hhl;
	v1lll = __riscv_vslide1up_mu(m, v1lll, v1llh, 0, VL);
	v1lhl = __riscv_vslide1up_mu(m, v1lhl, v1lhh, 0, VL);
	v1hll = __riscv_vslide1up_mu(m, v1hll, v1hlh, 0, VL);
	v1hhl = __riscv_vslide1up_mu(m, v1hhl, v1hhh, 0, VL);
	m = meven;
	v1llh = __riscv_vslide1down_mu(m, v1llh, v1lllt, 0, VL);
	v1lhh = __riscv_vslide1down_mu(m, v1lhh, v1lhlt, 0, VL);
	v1hlh = __riscv_vslide1down_mu(m, v1hlh, v1hllt, 0, VL);
	v1hhh = __riscv_vslide1down_mu(m, v1hhh, v1hhlt, 0, VL);

	// Reassemble the eight m1 registers into one m8 group.
	return __riscv_vcreate_v_u16m1_u16m8(
		v1lll, v1llh, v1lhl, v1lhh,
		v1hll, v1hlh, v1hhl, v1hhh);
}
```
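For context, a minimal usage sketch (not part of the reproducer above): it assumes VLEN = 128, so that a single u16 LMUL=8 register group holds exactly one row-major 8x8 block of 64 elements; the wrapper name `trans8x8_block` is hypothetical.
```c
#include <stdint.h>
#include <riscv_vector.h>

/* Hypothetical wrapper, assuming VLEN = 128 so one u16 LMUL=8 register
 * group holds exactly one row-major 8x8 block (64 elements). */
void
trans8x8_block(uint16_t dst[64], const uint16_t src[64])
{
	size_t vl = __riscv_vsetvl_e16m8(64);           /* request all 64 elements */
	vuint16m8_t v = __riscv_vle16_v_u16m8(src, vl); /* load the 8x8 block */
	__riscv_vse16_v_u16m8(dst, trans8x8_vslide(v), vl); /* transpose and store */
}
```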
This is probably quite a hard problem, but I thought I'd share it anyway. Maybe even slight improvements on this example could have a compounding effect on mask register allocation.