LGTM

<juzhe.zh...@rivai.ai> wrote on Mon, May 15, 2023 at 11:16:
> From: Juzhe-Zhong <juzhe.zh...@rivai.ai>
>
> This patch optimizes both RVV VLA && VLS vectorization.
>
> Consider the following case:
> void __attribute__((noinline, noclone))
> f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
> {
>   for (int i = 0; i < count; ++i)
>     dst[i] = op1[i] + op2[i];
> }
>
> VLA:
> Before this patch:
>         ble a3,zero,.L1
>         srli a4,a1,2
>         negw a4,a4
>         andi a5,a4,3
>         sext.w a3,a3
>         beq a5,zero,.L3
>         lw a7,0(a1)
>         lw a6,0(a2)
>         andi a4,a4,2
>         addw a6,a6,a7
>         sw a6,0(a0)
>         beq a4,zero,.L3
>         lw a7,4(a1)
>         lw a4,4(a2)
>         li a6,3
>         addw a4,a4,a7
>         sw a4,4(a0)
>         bne a5,a6,.L3
>         lw a6,8(a2)
>         lw a4,8(a1)
>         addw a4,a4,a6
>         sw a4,8(a0)
> .L3:
>         subw a3,a3,a5
>         slli a4,a3,32
>         csrr a6,vlenb
>         srli a4,a4,32
>         srli a6,a6,2
>         slli a3,a5,2
>         mv a5,a4
>         bgtu a4,a6,.L17
> .L5:
>         csrr a6,vlenb
>         add a1,a1,a3
>         add a2,a2,a3
>         add a0,a0,a3
>         srli a7,a6,2
>         li a3,0
> .L8:
>         vsetvli zero,a5,e32,m1,ta,ma
>         vle32.v v1,0(a1)
>         vle32.v v2,0(a2)
>         vsetvli t1,zero,e32,m1,ta,ma
>         add a3,a3,a7
>         vadd.vv v1,v1,v2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vse32.v v1,0(a0)
>         mv a5,a4
>         bleu a4,a3,.L6
>         mv a5,a3
> .L6:
>         sub a5,a4,a5
>         bleu a5,a7,.L7
>         mv a5,a7
> .L7:
>         add a1,a1,a6
>         add a2,a2,a6
>         add a0,a0,a6
>         bne a5,zero,.L8
> .L1:
>         ret
> .L17:
>         mv a5,a6
>         j .L5
>
> After this patch:
> f:
>         ble a3,zero,.L1
>         csrr a4,vlenb
>         srli a4,a4,2
>         mv a5,a3
>         bgtu a3,a4,.L9
> .L3:
>         csrr a6,vlenb
>         li a4,0
>         srli a7,a6,2
> .L6:
>         vsetvli zero,a5,e32,m1,ta,ma
>         vle32.v v2,0(a1)
>         vle32.v v1,0(a2)
>         vsetvli t1,zero,e32,m1,ta,ma
>         add a4,a4,a7
>         vadd.vv v1,v1,v2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vse32.v v1,0(a0)
>         mv a5,a3
>         bleu a3,a4,.L4
>         mv a5,a4
> .L4:
>         sub a5,a3,a5
>         bleu a5,a7,.L5
>         mv a5,a7
> .L5:
>         add a0,a0,a6
>         add a2,a2,a6
>         add a1,a1,a6
>         bne a5,zero,.L6
> .L1:
>         ret
> .L9:
>         mv a5,a4
>         j .L3
>
> VLS:
> Before this patch:
> f3:
>         ble a3,zero,.L1
>         srli a5,a1,2
>         negw a5,a5
>         andi a4,a5,3
>         sext.w a3,a3
>         beq a4,zero,.L3
>         lw a7,0(a1)
>         lw a6,0(a2)
>         andi a5,a5,2
>         addw a6,a6,a7
>         sw a6,0(a0)
>         beq a5,zero,.L3
>         lw a7,4(a1)
>         lw a5,4(a2)
>         li a6,3
>         addw a5,a5,a7
>         sw a5,4(a0)
>         bne a4,a6,.L3
>         lw a6,8(a2)
>         lw a5,8(a1)
>         addw a5,a5,a6
>         sw a5,8(a0)
> .L3:
>         subw a3,a3,a4
>         slli a6,a4,2
>         slli a5,a3,32
>         srli a5,a5,32
>         add a1,a1,a6
>         add a2,a2,a6
>         add a0,a0,a6
>         li a3,4
> .L6:
>         mv a4,a5
>         bleu a5,a3,.L5
>         li a4,4
> .L5:
>         vsetvli zero,a4,e32,m1,ta,ma
>         vle32.v v1,0(a1)
>         vle32.v v2,0(a2)
>         vsetivli zero,4,e32,m1,ta,ma
>         sub a5,a5,a4
>         vadd.vv v1,v1,v2
>         vsetvli zero,a4,e32,m1,ta,ma
>         vse32.v v1,0(a0)
>         addi a1,a1,16
>         addi a2,a2,16
>         addi a0,a0,16
>         bne a5,zero,.L6
> .L1:
>         ret
>
> After this patch:
> f3:
>         ble a3,zero,.L1
>         li a4,4
> .L4:
>         mv a5,a3
>         bleu a3,a4,.L3
>         li a5,4
> .L3:
>         vsetvli zero,a5,e32,m1,ta,ma
>         vle32.v v2,0(a1)
>         vle32.v v1,0(a2)
>         vsetivli zero,4,e32,m1,ta,ma
>         sub a3,a3,a5
>         vadd.vv v1,v1,v2
>         vsetvli zero,a5,e32,m1,ta,ma
>         vse32.v v1,0(a0)
>         addi a2,a2,16
>         addi a0,a0,16
>         addi a1,a1,16
>         bne a3,zero,.L4
> .L1:
>         ret
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv.cc
>         (riscv_vectorize_preferred_vector_alignment): New function.
>         (TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT): New target hook.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c: Adapt testcase.
>         * gcc.target/riscv/rvv/autovec/align-1.c: New test.
>         * gcc.target/riscv/rvv/autovec/align-2.c: New test.
>
> ---
>  gcc/config/riscv/riscv.cc                   | 14 ++++++++++++++
>  .../gcc.target/riscv/rvv/autovec/align-1.c  | 12 ++++++++++++
>  .../gcc.target/riscv/rvv/autovec/align-2.c  | 12 ++++++++++++
>  .../riscv/rvv/autovec/binop/shift-rv32gcv.c | 10 ++++++----
>  4 files changed, 44 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index de578b5b899..a770fdfaa0e 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -7499,6 +7499,16 @@ riscv_preferred_simd_mode (scalar_mode mode)
>    return word_mode;
>  }
>
> +/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
> +
> +static poly_uint64
> +riscv_vectorize_preferred_vector_alignment (const_tree type)
> +{
> +  if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
> +    return TYPE_ALIGN (TREE_TYPE (type));
> +  return TYPE_ALIGN (type);
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_ASM_ALIGNED_HI_OP
>  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> @@ -7771,6 +7781,10 @@ riscv_preferred_simd_mode (scalar_mode mode)
>  #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
>  #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
>
> +#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
> +#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
> +  riscv_vectorize_preferred_vector_alignment
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>
>  #include "gt-riscv.h"
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
> new file mode 100644
> index 00000000000..14201e1f7e0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=scalable" } */
> +
> +void __attribute__((noinline, noclone))
> +f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
> +{
> +  for (int i = 0; i < count; ++i)
> +    dst[i] = op1[i] + op2[i];
> +}
> +
> +/* { dg-final { scan-assembler-not "lw" } } */
> +/* { dg-final { scan-assembler-not "sw" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
> new file mode 100644
> index 00000000000..812584e9d25
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=fixed-vlmax" } */
> +
> +void __attribute__((noinline, noclone))
> +f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
> +{
> +  for (int i = 0; i < count; ++i)
> +    dst[i] = op1[i] + op2[i];
> +}
> +
> +/* { dg-final { scan-assembler-not "lw" } } */
> +/* { dg-final { scan-assembler-not "sw" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> index da0f79a1cf0..d98100b3276 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> @@ -4,8 +4,10 @@
>  #include "shift-template.h"
>
>  /* TODO: For int16_t and uint16_t we need widening/promotion patterns.
> -   Therefore, expect only 4 vsll.vv instead of 6 for now.  */
> +   We don't check the assembler number since lacking patterns make
> +   auto-vectorization inconsistent in LMUL = 1/2/4/8.  */
> +
> +/* { dg-final { scan-assembler {\tvsll\.vv} } } */
> +/* { dg-final { scan-assembler {\tvsrl\.vv} } } */
> +/* { dg-final { scan-assembler {\tvsra\.vv} } } */
>
> -/* { dg-final { scan-assembler-times {\tvsll\.vv} 4 } } */
> -/* { dg-final { scan-assembler-times {\tvsrl\.vv} 3 } } */
> -/* { dg-final { scan-assembler-times {\tvsra\.vv} 3 } } */
> --
> 2.36.1
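
For the archives, a short note on why the scalar peel loop disappears (reviewer
commentary, not part of the patch): the vectorizer peels prologue iterations
until the accesses reach the target's preferred vector alignment, and the
default hook simply requests the alignment of the whole vector type.  Quoting
the default from gcc/targhooks.cc from memory, so treat the exact comment
wording as approximate:

  /* Default TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT: request the
     natural alignment of the vector type itself, which for RVV is what
     forces the lw/sw peel loop shown in the "before" assembly above.  */
  poly_uint64
  default_preferred_vector_alignment (const_tree type)
  {
    return TYPE_ALIGN (type);
  }

Since RVV unit-stride loads/stores only require element (SEW) alignment,
returning TYPE_ALIGN (TREE_TYPE (type)) for RVV modes tells the vectorizer that
no alignment peeling is needed, which is exactly what the
scan-assembler-not "lw"/"sw" checks in align-1.c and align-2.c verify.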