committed, thanks.

On Thu, Dec 29, 2022 at 11:34 PM <juzhe.zh...@rivai.ai> wrote:
> From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>
>
> Currently we use pred_mov to do the codegen for vse intrinsics.  However,
> it generates inferior codegen when testing the AVL model of the VSETVL
> PASS with vse intrinsics.
>
> Consider the following code:
>
> void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
> {
>   vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
>   __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
>   vbool64_t mask = *(vbool64_t*)mask_in;
>   for (int i = 0; i < n; i++)
>     {
>       vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
>       __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
>
>       vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
>       __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
>
>       vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
>       __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);
>
>       vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
>       __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
>
>       vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
>       __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
>
>       vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
>       __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
>
>       vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
>       __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
>     }
> }
>
> Before this patch:
>
>         csrr    t2,vlenb
>         srli    t2,t2,1
>         slli    s0,t2,2
>         vsetvli zero,19,e16,mf2,ta,ma
>         sub     s0,s0,t2
>         csrr    t2,vlenb
>         vle16.v v24,0(a3)
>         mv      a4,a3
>         vse16.v v24,0(a1)
>         srli    t2,t2,1
>         add     a2,a3,t6
>         add     s0,s0,sp
>         vsetvli zero,19,e32,mf2,ta,ma
>         addi    a3,a3,4
>         vle32.v v24,0(a3)
>         vsetvli zero,t0,e32,mf2,ta,ma
>         vse32.v v24,0(s0)
>         slli    s0,t2,2
>         sub     s0,s0,t2
>         add     s0,s0,sp
>         vsetvli t0,zero,e32,mf2,ta,ma
>         vle32.v v24,0(s0)
>         mv      s0,t2
>         slli    t2,t2,2
>         mv      a5,a1
>         vsetvli zero,19,e32,mf2,ta,ma
>         addi    a1,a1,4
>         sub     t2,t2,s0
>         vse32.v v24,0(a1)
>         add     t2,t2,sp
>         vsetvli t0,zero,e32,mf2,ta,ma
>         addi    t1,a5,796
>         vle32.v v24,0(t2)
>         addi    t5,a4,1196
>         addi    a7,a5,1196
>         addi    t4,a4,1996
>         addi    a6,a5,1996
>         vsetvli zero,13,e32,mf2,ta,ma
>         add     a4,a4,t3
>         vse32.v v24,0(t1)
>         add     a5,a5,t3
>         vsetvli zero,11,e64,m1,tu,mu
>         vle64.v v24,0(t5),v0.t
>         vse64.v v24,0(a7)
>         vle64.v v24,0(t4),v0.t
>         vse64.v v24,0(a6)
>         vle64.v v24,0(a4),v0.t
>         vse64.v v24,0(a5),v0.t
>         vsetvli zero,11,e8,mf4,ta,ma
>         vle8.v  v24,0(a2)
>         vse8.v  v24,0(a2)
>         bne     a0,a3,.L8
>         csrr    t0,vlenb
>         slli    t1,t0,1
>         add     sp,sp,t1
>         lw      s0,12(sp)
>         addi    sp,sp,16
>         jr      ra
>
> We generate redundant spill code here.  This patch therefore introduces a
> dedicated pred_store pattern for vse intrinsics, like maskstore in ARM SVE.
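>
> To make the operand mapping concrete (an illustrative sketch only, not
> part of the patch or of the testcase below; the helper names are made up
> for the example), these are the two basic forms of vse store the new
> pattern has to handle.  The expander puts the destination memory in
> operand 0 (merged through (match_dup 0)), the mask in operand 1 (an
> all-ones mask is supplied for the unmasked form), the stored value in
> operand 2 and the AVL in operand 3:
>
> #include "riscv_vector.h"
>
> /* Unmasked store: the expander adds an all-ones mask operand.  */
> void
> store_unmasked (int32_t *out, vint32mf2_t v)
> {
>   __riscv_vse32_v_i32mf2 (out, v, 13);
> }
>
> /* Masked store: the user mask becomes operand 1 and controls which
>    elements of the destination are written.  */
> void
> store_masked (double *out, vbool64_t mask, vfloat64m1_t v)
> {
>   __riscv_vse64_v_f64m1_m (mask, out, v, 11);
> }
>
> Both forms should end up as a single vse<sew>.v (the masked one with the
> ,v0.t suffix via %p1), without the tied memory input the pred_mov path
> needed.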
>
> After this patch:
>
>         vsetvli zero,19,e16,mf2,ta,ma
>         mv      a5,a4
>         vle16.v v24,0(a0)
>         mv      a3,a0
>         vse16.v 19,0(a4)
>         addi    t1,a4,796
>         vsetvli zero,19,e32,mf2,ta,ma
>         addi    a0,a0,4
>         addi    a4,a4,4
>         vle32.v v24,0(a0)
>         addi    t0,a3,1196
>         vse32.v 19,0(a4)
>         addi    a7,a5,1196
>         addi    t6,a3,1996
>         addi    a6,a5,1996
>         add     t5,a3,t4
>         vsetvli zero,13,e32,mf2,ta,ma
>         add     a2,a5,t4
>         vse32.v 13,0(t1)
>         add     a3,a3,t3
>         vsetvli zero,11,e64,m1,tu,mu
>         add     a5,a5,t3
>         vle64.v v24,0(t0),v0.t
>         vse64.v 11,0(a7)
>         vle64.v v24,0(t6),v0.t
>         vse64.v 11,0(a6)
>         vle64.v v24,0(t5),v0.t
>         vse64.v 11,0(a2),v0.t
>         vsetvli zero,11,e8,mf4,ta,ma
>         vle8.v  v24,0(a3)
>         vse8.v  11,0(a5)
>         bne     a1,a4,.L8
> .L6:
>         ret
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-vector-builtins-bases.cc (class loadstore):
>         Use pred_store for vse.
>         * config/riscv/riscv-vector-builtins.cc
>         (function_expander::add_mem_operand): Refine function.
>         (function_expander::use_contiguous_load_insn): Adjust to the new
>         implementation.
>         (function_expander::use_contiguous_store_insn): Ditto.
>         * config/riscv/riscv-vector-builtins.h: Refine function.
>         * config/riscv/vector.md (@pred_store<mode>): New pattern.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/base/vse-constraint-1.c: New test.
>
> ---
>  .../riscv/riscv-vector-builtins-bases.cc       |  2 +-
>  gcc/config/riscv/riscv-vector-builtins.cc      | 22 +----
>  gcc/config/riscv/riscv-vector-builtins.h       |  8 +-
>  gcc/config/riscv/vector.md                     | 23 ++++-
>  .../riscv/rvv/base/vse-constraint-1.c          | 97 +++++++++++++++++++
>  5 files changed, 128 insertions(+), 24 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
>
> diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc
> index 10373e5ccf2..af66b016b49 100644
> --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
> +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
> @@ -106,7 +106,7 @@ class loadstore : public function_base
>    rtx expand (function_expander &e) const override
>    {
>      if (STORE_P)
> -      return e.use_contiguous_store_insn (code_for_pred_mov (e.vector_mode ()));
> +      return e.use_contiguous_store_insn (code_for_pred_store (e.vector_mode ()));
>      else
>        return e.use_contiguous_load_insn (code_for_pred_mov (e.vector_mode ()));
>    }
> diff --git a/gcc/config/riscv/riscv-vector-builtins.cc b/gcc/config/riscv/riscv-vector-builtins.cc
> index e39bfea9636..47e01b647f8 100644
> --- a/gcc/config/riscv/riscv-vector-builtins.cc
> +++ b/gcc/config/riscv/riscv-vector-builtins.cc
> @@ -845,15 +845,15 @@ function_expander::add_vundef_operand (machine_mode mode)
>  }
>
>  /* Add a memory operand with mode MODE and address ADDR.  */
> -rtx
> -function_expander::add_mem_operand (machine_mode mode, rtx addr)
> +void
> +function_expander::add_mem_operand (machine_mode mode, unsigned argno)
>  {
>    gcc_assert (VECTOR_MODE_P (mode));
> +  rtx addr = expand_normal (CALL_EXPR_ARG (exp, argno));
>    rtx mem = gen_rtx_MEM (mode, memory_address (mode, addr));
>    /* The memory is only guaranteed to be element-aligned.  */
>    set_mem_align (mem, GET_MODE_ALIGNMENT (GET_MODE_INNER (mode)));
>    add_fixed_operand (mem);
> -  return mem;
>  }
>
>  /* Use contiguous load INSN.  */
> @@ -878,9 +878,7 @@ function_expander::use_contiguous_load_insn (insn_code icode)
>    else
>      add_vundef_operand (mode);
>
> -  tree addr_arg = CALL_EXPR_ARG (exp, arg_offset++);
> -  rtx addr = expand_normal (addr_arg);
> -  add_mem_operand (mode, addr);
> +  add_mem_operand (mode, arg_offset++);
>
>    for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
>      add_input_operand (argno);
> @@ -904,27 +902,17 @@ function_expander::use_contiguous_store_insn (insn_code icode)
>    /* Record the offset to get the argument.  */
>    int arg_offset = 0;
>
> -  int addr_loc = use_real_mask_p (pred) ? 1 : 0;
> -  tree addr_arg = CALL_EXPR_ARG (exp, addr_loc);
> -  rtx addr = expand_normal (addr_arg);
> -  rtx mem = add_mem_operand (mode, addr);
> +  add_mem_operand (mode, use_real_mask_p (pred) ? 1 : 0);
>
>    if (use_real_mask_p (pred))
>      add_input_operand (arg_offset++);
>    else
>      add_all_one_mask_operand (mask_mode);
>
> -  /* To model "+m" constraint, we include memory operand into input.  */
> -  add_input_operand (mode, mem);
> -
>    arg_offset++;
>    for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
>      add_input_operand (argno);
>
> -  add_input_operand (Pmode, get_tail_policy_for_pred (pred));
> -  add_input_operand (Pmode, get_mask_policy_for_pred (pred));
> -  add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
> -
>    return generate_insn (icode);
>  }
>
> diff --git a/gcc/config/riscv/riscv-vector-builtins.h b/gcc/config/riscv/riscv-vector-builtins.h
> index c13df99cb5b..58d8d78043c 100644
> --- a/gcc/config/riscv/riscv-vector-builtins.h
> +++ b/gcc/config/riscv/riscv-vector-builtins.h
> @@ -317,12 +317,12 @@ public:
>    rtx expand ();
>
>    void add_input_operand (machine_mode, rtx);
> -  void add_input_operand (unsigned argno);
> +  void add_input_operand (unsigned);
>    void add_output_operand (machine_mode, rtx);
> -  void add_all_one_mask_operand (machine_mode mode);
> -  void add_vundef_operand (machine_mode mode);
> +  void add_all_one_mask_operand (machine_mode);
> +  void add_vundef_operand (machine_mode);
>    void add_fixed_operand (rtx);
> -  rtx add_mem_operand (machine_mode, rtx);
> +  void add_mem_operand (machine_mode, unsigned);
>
>    machine_mode vector_mode (void) const;
>
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index 89810b183fc..3d0174f98a2 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -209,7 +209,7 @@
>
>  ;; The index of operand[] to get the merge op.
>  (define_attr "merge_op_idx" ""
> -        (cond [(eq_attr "type" "vlde,vste,vimov,vfmov,vldm,vstm,vlds,vmalu")
> +        (cond [(eq_attr "type" "vlde,vimov,vfmov,vldm,vstm,vlds,vmalu")
>           (const_int 2)]
>          (const_int INVALID_ATTRIBUTE)))
>
> @@ -647,7 +647,7 @@
>              (reg:SI VL_REGNUM)
>              (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
>           (match_operand:V 3 "vector_move_operand"  "    m,     m,    vr,    vr, viWc0")
> -         (match_operand:V 2 "vector_merge_operand" "    0,    vu,   vu0,   vu0,   vu0")))]
> +         (match_operand:V 2 "vector_merge_operand" "    0,    vu,    vu,   vu0,   vu0")))]
>    "TARGET_VECTOR"
>    "@
>     vle<sew>.v\t%0,%3%p1
> @@ -663,6 +663,25 @@
>    [(set_attr "type" "vlde,vlde,vste,vimov,vimov")
>     (set_attr "mode" "<MODE>")])
>
> +;; Dedicated pattern for vse.v instruction since we can't reuse pred_mov pattern to include
> +;; memory operand as input which will produce inferior codegen.
> +(define_insn "@pred_store<mode>"
> +  [(set (match_operand:V 0 "memory_operand"                "+m")
> +        (if_then_else:V
> +          (unspec:<VM>
> +            [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1")
> +             (match_operand 3 "vector_length_operand"    "   rK")
> +             (reg:SI VL_REGNUM)
> +             (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
> +          (match_operand:V 2 "register_operand"          "   vr")
> +          (match_dup 0)))]
> +  "TARGET_VECTOR"
> +  "vse<sew>.v\t%2,%0%p1"
> +  [(set_attr "type" "vste")
> +   (set_attr "mode" "<MODE>")
> +   (set (attr "avl_type") (symbol_ref "riscv_vector::NONVLMAX"))
> +   (set_attr "vl_op_idx" "3")])
> +
>  ;; vlm.v/vsm.v/vmclr.m/vmset.m.
>  ;; constraint alternative 0 match vlm.v.
>  ;; constraint alternative 1 match vsm.v.
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
> new file mode 100644
> index 00000000000..5b8b9b41c7b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
> @@ -0,0 +1,97 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +void f (int * restrict in, int * restrict out, void * restrict mask_in, int n)
> +{
> +  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
> +  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
> +  vbool64_t mask = *(vbool64_t*)mask_in;
> +  for (int i = 0; i < n; i++)
> +    {
> +      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
> +      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
> +
> +      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
> +      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
> +
> +      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
> +      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v3, 13);
> +
> +      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
> +
> +      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
> +
> +      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
> +      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
> +
> +      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
> +      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
> +    }
> +}
> +
> +void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
> +{
> +  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
> +  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
> +  vbool64_t mask = *(vbool64_t*)mask_in;
> +  for (int i = 0; i < n; i++)
> +    {
> +      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
> +      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
> +
> +      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
> +      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
> +
> +      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
> +      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);
> +
> +      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
> +
> +      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
> +
> +      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
> +      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
> +
> +      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
> +      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
> +    }
> +}
> +
> +void f3 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
> +{
> +  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
> +  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
> +  vbool64_t mask = *(vbool64_t*)mask_in;
> +  for (int i = 0; i < n; i++)
> +    {
> +      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
> +      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
> +
> +      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
> +      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
> +
> +      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
> +      *(vint32mf2_t*)(out + i + 200) = v3;
> +
> +      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
> +
> +      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
> +      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
> +
> +      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
> +      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
> +
> +      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
> +      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
> +    }
> +}
> +
> +/* It should not have redundant vector register spills which produce csrr vlenb instructions allocate stack. */
> +/* { dg-final { scan-assembler-not {csrr\s+[a-x0-9]+,\s*vlenb} } } */
> --
> 2.36.1