On 7/12/23 09:24, Juzhe-Zhong wrote:
This middle-end patch has been merged:
https://github.com/gcc-mirror/gcc/commit/0d4dd7e07a879d6c07a33edb2799710faa95651e

With this patch, we can handle operations that may trap on elements outside the loop.
This patch addresses the following two cases:

1. Integer division:

#define TEST_TYPE(TYPE) \
  __attribute__((noipa)) \
  void vrem_##TYPE (TYPE * __restrict dst, TYPE * __restrict a, TYPE * __restrict b, int n) \
  { \
    for (int i = 0; i < n; i++) \
      dst[i] = a[i] % b[i]; \
  }

#define TEST_ALL() \
  TEST_TYPE(int8_t)

TEST_ALL()
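
For illustration only (not part of the posted patch), a hypothetical driver for this test, assuming the TEST_TYPE/TEST_ALL definitions above are in the same file, could pick a trip count that is not a multiple of the vector length and leave zero divisors in the padding past n, so any speculative division on tail elements would trap:

/* Hypothetical driver, illustration only: exercises vrem_int8_t with a
   trip count that is not a multiple of the vector length.  The padding
   past n holds zero divisors, so a vectorized loop that divided past the
   loop bound would trap, while the length-controlled loop must not.  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  enum { N = 77 };                       /* deliberately "odd" trip count */
  static int8_t a[N + 64], b[N + 64], dst[N + 64];
  memset (b, 0, sizeof b);               /* zero divisors in the padding  */
  for (int i = 0; i < N; i++)
    {
      a[i] = (int8_t) (3 * i - 40);
      b[i] = (int8_t) (i % 5 + 1);       /* never zero inside the loop    */
    }
  vrem_int8_t (dst, a, b, N);
  for (int i = 0; i < N; i++)
    if (dst[i] != (int8_t) (a[i] % b[i]))
      {
        printf ("mismatch at %d\n", i);
        return 1;
      }
  printf ("ok\n");
  return 0;
}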
Before this patch:

vrem_int8_t:
         ble     a3,zero,.L14
         csrr    t4,vlenb
         addiw   a5,a3,-1
         addiw   a4,t4,-1
         sext.w  t5,a3
         bltu    a5,a4,.L10
         csrr    t3,vlenb
         subw    t3,t5,t3
         li      a5,0
         vsetvli t6,zero,e8,m1,ta,ma
.L4:
         add     a6,a2,a5
         add     a7,a0,a5
         add     t1,a1,a5
         mv      a4,a5
         add     a5,a5,t4
         vl1re8.v        v2,0(a6)
         vl1re8.v        v1,0(t1)
         sext.w  a6,a5
         vrem.vv v1,v1,v2
         vs1r.v  v1,0(a7)
         bleu    a6,t3,.L4
         csrr    a5,vlenb
         addw    a4,a4,a5
         sext.w  a5,a4
         beq     t5,a4,.L16
.L3:
         csrr    a6,vlenb
         subw    t5,t5,a4
         srli    a6,a6,1
         addiw   t1,t5,-1
         addiw   a7,a6,-1
         bltu    t1,a7,.L9
         slli    a4,a4,32
         srli    a4,a4,32
         add     t0,a1,a4
         add     t6,a2,a4
         add     a4,a0,a4
         vsetvli a7,zero,e8,mf2,ta,ma
         sext.w  t3,a6
         vle8.v  v1,0(t0)
         vle8.v  v2,0(t6)
         subw    t4,t5,a6
         vrem.vv v1,v1,v2
         vse8.v  v1,0(a4)
         mv      t1,t3
         bltu    t4,t3,.L7
         csrr    t1,vlenb
         add     a4,a4,a6
         add     t0,t0,a6
         add     t6,t6,a6
         sext.w  t1,t1
         vle8.v  v1,0(t0)
         vle8.v  v2,0(t6)
         vrem.vv v1,v1,v2
         vse8.v  v1,0(a4)
.L7:
         addw    a5,t1,a5
         beq     t5,t1,.L14
.L9:
         add     a4,a1,a5
         add     a6,a2,a5
         lb      a6,0(a6)
         lb      a4,0(a4)
         add     a7,a0,a5
         addi    a5,a5,1
         remw    a4,a4,a6
         sext.w  a6,a5
         sb      a4,0(a7)
         bgt     a3,a6,.L9
.L14:
         ret
.L10:
         li      a4,0
         li      a5,0
         j       .L3
.L16:
         ret
After this patch:

vrem_int8_t:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e8,m1,tu,ma
vle8.v v1,0(a1)
vle8.v v2,0(a2)
sub a3,a3,a5
vrem.vv v1,v1,v2
vse8.v v1,0(a0)
add a1,a1,a5
add a2,a2,a5
add a0,a0,a5
bne a3,zero,.L3
.L5:
ret
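
The key difference is the length-controlled, tail-undisturbed ("tu") form: each iteration processes only vl = min (remaining, VLMAX) elements. A rough scalar model of one strip-mined step (illustration only, not code from the patch):

#include <stddef.h>
#include <stdint.h>

/* Rough scalar model of one iteration of the new loop: vsetvli returns
   vl = min (n, VLMAX); vrem.vv under a tail-undisturbed policy computes
   lanes 0 .. vl-1 only, so elements past the loop bound are never loaded
   or divided and a zero divisor there cannot trap.  */
static size_t
strip_mine_step (int8_t *dst, const int8_t *a, const int8_t *b,
                 size_t n, size_t vlmax)
{
  size_t vl = n < vlmax ? n : vlmax;   /* what "vsetvli a5,a3,..." yields */
  for (size_t i = 0; i < vl; i++)      /* active lanes only */
    dst[i] = a[i] % b[i];
  return vl;                           /* caller advances pointers, n -= vl */
}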
2. Floating-point operation **WITHOUT** -ffast-math:

#define TEST_TYPE(TYPE) \
  __attribute__((noipa)) \
  void vadd_##TYPE (TYPE * __restrict dst, TYPE * __restrict a, TYPE * __restrict b, int n) \
  { \
    for (int i = 0; i < n; i++) \
      dst[i] = a[i] + b[i]; \
  }

#define TEST_ALL() \
  TEST_TYPE(float)

TEST_ALL()

Before this patch:

vadd_float:
         ble     a3,zero,.L10
         csrr    a4,vlenb
         srli    t3,a4,2
         addiw   a5,a3,-1
         addiw   a6,t3,-1
         sext.w  t6,a3
         bltu    a5,a6,.L7
         subw    t5,t6,t3
         mv      t1,a1
         mv      a7,a2
         mv      a6,a0
         li      a5,0
         vsetvli t4,zero,e32,m1,ta,ma
.L4:
         vl1re32.v       v1,0(t1)
         vl1re32.v       v2,0(a7)
         addw    a5,a5,t3
         vfadd.vv        v1,v1,v2
         vs1r.v  v1,0(a6)
         add     t1,t1,a4
         add     a7,a7,a4
         add     a6,a6,a4
         bgeu    t5,a5,.L4
         beq     t6,a5,.L10
         sext.w  a5,a5
.L3:
         slli    a4,a5,2
.L6:
         add     a6,a1,a4
         add     a7,a2,a4
         flw     fa4,0(a6)
         flw     fa5,0(a7)
         add     a6,a0,a4
         addiw   a5,a5,1
         fadd.s  fa5,fa5,fa4
         addi    a4,a4,4
         fsw     fa5,0(a6)
         bgt     a3,a5,.L6
.L10:
         ret
.L7:
         li      a5,0
         j       .L3
After this patch:

vadd_float:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e32,m1,tu,ma
slli a4,a5,2
vle32.v v1,0(a1)
vle32.v v2,0(a2)
sub a3,a3,a5
vfadd.vv v1,v1,v2
vse32.v v1,0(a0)
add a1,a1,a4
add a2,a2,a4
add a0,a0,a4
bne a3,zero,.L3
.L5:
ret
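
Without -ffast-math the compiler must preserve IEEE exception behaviour, so it cannot speculatively add lanes beyond the loop bound: garbage values there could raise flags the scalar loop never would. A small illustration (hypothetical example, not from the testsuite):

/* Illustration only: an FP add on "don't care" data can raise IEEE
   exception flags.  inf + (-inf) is an invalid operation and sets
   FE_INVALID, which a correctly bounded loop would never do.  */
#include <fenv.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  feclearexcept (FE_ALL_EXCEPT);
  volatile float x = INFINITY, y = -INFINITY;
  volatile float z = x + y;        /* result is NaN, FE_INVALID is set */
  printf ("FE_INVALID raised: %d (z = %f)\n",
          fetestexcept (FE_INVALID) != 0, (double) z);
  return 0;
}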
gcc/ChangeLog:

        * config/riscv/autovec.md (cond_len_<optab><mode>): New pattern.
         * config/riscv/riscv-protos.h (enum insn_type): New enum.
         (expand_cond_len_binop): New function.
         * config/riscv/riscv-v.cc (emit_nonvlmax_tu_insn): Ditto.
         (emit_nonvlmax_fp_tu_insn): Ditto.
         (need_fp_rounding_p): Ditto.
         (expand_cond_len_binop): Ditto.
         * config/riscv/riscv.cc (riscv_preferred_else_value): Ditto.
         (TARGET_PREFERRED_ELSE_VALUE): New target hook.
gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv.c: Adapt testcase.
         * gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv.c: Ditto.
         * gcc.target/riscv/rvv/autovec/binop/vrem-rv32gcv.c: Ditto.
         * gcc.target/riscv/rvv/autovec/binop/vrem-rv64gcv.c: Ditto.
         * gcc.target/riscv/rvv/autovec/binop/vadd-run-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vadd-rv32gcv-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vadd-rv64gcv-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vdiv-run-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vdiv-rv32gcv-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vdiv-rv64gcv-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vmul-run-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vmul-rv32gcv-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vmul-rv64gcv-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vsub-run-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vsub-rv32gcv-nofm.c: New test.
         * gcc.target/riscv/rvv/autovec/binop/vsub-rv64gcv-nofm.c: New test.
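
For context on the TARGET_PREFERRED_ELSE_VALUE entry: the point of the hook is to tell the vectorizer that inactive lanes are a "don't care" value, which is what lets the backend use a tail-undisturbed policy instead of materializing an explicit else operand. A plausible shape of such a hook (sketch only; the authoritative code is the patch's riscv.cc change):

/* Sketch only -- see the patch itself for the real implementation.
   Returning an undefined SSA name for RVV modes means "any value is
   fine" for inactive lanes, so no merge operand has to be emitted.  */
static tree
riscv_preferred_else_value (unsigned ifn, tree vectype, unsigned int nops,
                            tree *ops)
{
  if (riscv_v_ext_mode_p (TYPE_MODE (vectype)))
    return get_or_create_ssa_default_def (cfun, create_tmp_var (vectype));

  return default_preferred_else_value (ifn, vectype, nops, ops);
}

#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE riscv_preferred_else_value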
OK once the change to call needs_fp_rounding and the minor comment fix for FMLA/FMACC are addressed. No need to wait for another review round. Just post and commit it as pre-approved.

jeff
