Committed, thanks Jeff.

Pan

-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel....@gcc.gnu.org> On Behalf 
Of Jeff Law via Gcc-patches
Sent: Wednesday, July 12, 2023 11:40 PM
To: Juzhe-Zhong <juzhe.zh...@rivai.ai>; gcc-patches@gcc.gnu.org
Cc: kito.ch...@sifive.com; kito.ch...@gmail.com; rdapp....@gmail.com
Subject: Re: [PATCH V2] RISC-V: Support COND_LEN_* patterns



On 7/12/23 09:24, Juzhe-Zhong wrote:
> The middle-end support has been merged:
> https://github.com/gcc-mirror/gcc/commit/0d4dd7e07a879d6c07a33edb2799710faa95651e
> 
> With this patch, we can handle operations that may trap on elements outside
> the loop.
>   
> The following 2 cases will be addressed by this patch:
>   
> 1. integer division:
>   
>    #define TEST_TYPE(TYPE) \
>    __attribute__((noipa)) \
>    void vrem_##TYPE (TYPE * __restrict dst, TYPE * __restrict a, TYPE * 
> __restrict b, int n) \
>    { \
>      for (int i = 0; i < n; i++) \
>        dst[i] = a[i] % b[i]; \
>    }
>    #define TEST_ALL() \
>     TEST_TYPE(int8_t) \
>   
>    TEST_ALL()
>   
>    Before this patch:
>   
>     vrem_int8_t:
>          ble     a3,zero,.L14
>          csrr    t4,vlenb
>          addiw   a5,a3,-1
>          addiw   a4,t4,-1
>          sext.w  t5,a3
>          bltu    a5,a4,.L10
>          csrr    t3,vlenb
>          subw    t3,t5,t3
>          li      a5,0
>          vsetvli t6,zero,e8,m1,ta,ma
> .L4:
>          add     a6,a2,a5
>          add     a7,a0,a5
>          add     t1,a1,a5
>          mv      a4,a5
>          add     a5,a5,t4
>          vl1re8.v        v2,0(a6)
>          vl1re8.v        v1,0(t1)
>          sext.w  a6,a5
>          vrem.vv v1,v1,v2
>          vs1r.v  v1,0(a7)
>          bleu    a6,t3,.L4
>          csrr    a5,vlenb
>          addw    a4,a4,a5
>          sext.w  a5,a4
>          beq     t5,a4,.L16
> .L3:
>          csrr    a6,vlenb
>          subw    t5,t5,a4
>          srli    a6,a6,1
>          addiw   t1,t5,-1
>          addiw   a7,a6,-1
>          bltu    t1,a7,.L9
>          slli    a4,a4,32
>          srli    a4,a4,32
>          add     t0,a1,a4
>          add     t6,a2,a4
>          add     a4,a0,a4
>          vsetvli a7,zero,e8,mf2,ta,ma
>          sext.w  t3,a6
>          vle8.v  v1,0(t0)
>          vle8.v  v2,0(t6)
>          subw    t4,t5,a6
>          vrem.vv v1,v1,v2
>          vse8.v  v1,0(a4)
>          mv      t1,t3
>          bltu    t4,t3,.L7
>          csrr    t1,vlenb
>          add     a4,a4,a6
>          add     t0,t0,a6
>          add     t6,t6,a6
>          sext.w  t1,t1
>          vle8.v  v1,0(t0)
>          vle8.v  v2,0(t6)
>          vrem.vv v1,v1,v2
>          vse8.v  v1,0(a4)
> .L7:
>          addw    a5,t1,a5
>          beq     t5,t1,.L14
> .L9:
>          add     a4,a1,a5
>          add     a6,a2,a5
>          lb      a6,0(a6)
>          lb      a4,0(a4)
>          add     a7,a0,a5
>          addi    a5,a5,1
>          remw    a4,a4,a6
>          sext.w  a6,a5
>          sb      a4,0(a7)
>          bgt     a3,a6,.L9
> .L14:
>          ret
> .L10:
>          li      a4,0
>          li      a5,0
>          j       .L3
> .L16:
>          ret
>   
> After this patch:
>   
>     vrem_int8_t:
> ble a3,zero,.L5
> .L3:
> vsetvli a5,a3,e8,m1,tu,ma
> vle8.v v1,0(a1)
> vle8.v v2,0(a2)
> sub a3,a3,a5
> vrem.vv v1,v1,v2
> vse8.v v1,0(a0)
> add a1,a1,a5
> add a2,a2,a5
> add a0,a0,a5
> bne a3,zero,.L3
> .L5:
> ret
>   
> 2. Floating-point operation **WITHOUT** -ffast-math:
>       
>      #define TEST_TYPE(TYPE) \
>      __attribute__((noipa)) \
>      void vadd_##TYPE (TYPE * __restrict dst, TYPE *__restrict a, TYPE 
> *__restrict b, int n) \
>      { \
>        for (int i = 0; i < n; i++) \
>          dst[i] = a[i] + b[i]; \
>      }
>   
>      #define TEST_ALL() \
>       TEST_TYPE(float) \
>   
>      TEST_ALL()
>     
> Before this patch:
>     
>     vadd_float:
>          ble     a3,zero,.L10
>          csrr    a4,vlenb
>          srli    t3,a4,2
>          addiw   a5,a3,-1
>          addiw   a6,t3,-1
>          sext.w  t6,a3
>          bltu    a5,a6,.L7
>          subw    t5,t6,t3
>          mv      t1,a1
>          mv      a7,a2
>          mv      a6,a0
>          li      a5,0
>          vsetvli t4,zero,e32,m1,ta,ma
> .L4:
>          vl1re32.v       v1,0(t1)
>          vl1re32.v       v2,0(a7)
>          addw    a5,a5,t3
>          vfadd.vv        v1,v1,v2
>          vs1r.v  v1,0(a6)
>          add     t1,t1,a4
>          add     a7,a7,a4
>          add     a6,a6,a4
>          bgeu    t5,a5,.L4
>          beq     t6,a5,.L10
>          sext.w  a5,a5
> .L3:
>          slli    a4,a5,2
> .L6:
>          add     a6,a1,a4
>          add     a7,a2,a4
>          flw     fa4,0(a6)
>          flw     fa5,0(a7)
>          add     a6,a0,a4
>          addiw   a5,a5,1
>          fadd.s  fa5,fa5,fa4
>          addi    a4,a4,4
>          fsw     fa5,0(a6)
>          bgt     a3,a5,.L6
> .L10:
>          ret
> .L7:
>          li      a5,0
>          j       .L3
>   
> After this patch:
>   
>     vadd_float:
> ble a3,zero,.L5
> .L3:
> vsetvli a5,a3,e32,m1,tu,ma
> slli a4,a5,2
> vle32.v v1,0(a1)
> vle32.v v2,0(a2)
> sub a3,a3,a5
> vfadd.vv v1,v1,v2
> vse32.v v1,0(a0)
> add a1,a1,a4
> add a2,a2,a4
> add a0,a0,a4
> bne a3,zero,.L3
> .L5:
> ret
>    
> gcc/ChangeLog:
>   
>          * config/riscv/autovec.md (cond_len_<optab><mode>): New pattern.
>          * config/riscv/riscv-protos.h (enum insn_type): New enum.
>          (expand_cond_len_binop): New function.
>          * config/riscv/riscv-v.cc (emit_nonvlmax_tu_insn): Ditto.
>          (emit_nonvlmax_fp_tu_insn): Ditto.
>          (need_fp_rounding_p): Ditto.
>          (expand_cond_len_binop): Ditto.
>          * config/riscv/riscv.cc (riscv_preferred_else_value): Ditto.
>          (TARGET_PREFERRED_ELSE_VALUE): New target hook.
>   
> gcc/testsuite/ChangeLog:
>   
>          * gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c: Adapt testcase.
>          * gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c: Ditto.
>          * gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c: Ditto.
>          * gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c: Ditto.
>          * gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c: New test.
>          * gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c: New test.
OK once the change to call needs_fp_rounding and the minor comment fix
for FMLA/FMACC are addressed.  No need to wait for another review round.
Just post and commit it as pre-approved.

jeff

Reply via email to