I have addressed the comments and fixed them in V2: https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625870.html OK for trunk?
juzhe.zh...@rivai.ai From: Kito Cheng Date: 2023-07-31 21:38 To: Juzhe-Zhong CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc Subject: Re: [PATCH] RISC-V: Support POPCOUNT auto-vectorization On Mon, Jul 31, 2023 at 8:03 PM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote: > > This patch is inspired by "lowerCTPOP" in LLVM. > Support popcount auto-vectorization by following LLVM approach. > https://godbolt.org/z/3K3GzvY7f > > Before this patch: > > <source>:7:21: missed: couldn't vectorize loop > <source>:8:14: missed: not vectorized: relevant stmt not supported: _5 = > __builtin_popcount (_4); > > After this patch: > > popcount_32: > ble a2,zero,.L5 > li t3,1431654400 > li a7,858992640 > li t1,252645376 > li a6,16711680 > li a3,65536 > addiw t3,t3,1365 > addiw a7,a7,819 > addiw t1,t1,-241 > addiw a6,a6,255 > addiw a3,a3,-1 > .L3: > vsetvli a5,a2,e8,mf4,ta,ma > vle32.v v1,0(a1) > vsetivli zero,4,e32,m1,ta,ma > vsrl.vi v2,v1,1 > vand.vx v2,v2,t3 > vsub.vv v1,v1,v2 > vsrl.vi v2,v1,2 > vand.vx v2,v2,a7 > vand.vx v1,v1,a7 > vadd.vv v1,v1,v2 > vsrl.vi v2,v1,4 > vadd.vv v1,v1,v2 > vand.vx v1,v1,t1 > vsrl.vi v2,v1,8 > vand.vx v2,v2,a6 > slli a4,a5,2 > vand.vx v1,v1,a6 > vadd.vv v1,v1,v2 > vsrl.vi v2,v1,16 > vand.vx v1,v1,a3 > vand.vx v2,v2,a3 > vadd.vv v1,v1,v2 > vmv.v.v v1,v1 > vsetvli zero,a2,e32,m1,ta,ma > sub a2,a2,a5 > vse32.v v1,0(a0) > add a1,a1,a4 > add a0,a0,a4 > bne a2,zero,.L3 > .L5: > ret > > gcc/ChangeLog: > > * config/riscv/autovec.md (popcount<mode>2): New pattern. > * config/riscv/riscv-protos.h (expand_popcount): New function. > * config/riscv/riscv-v.cc (expand_popcount): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/widen/popcount-1.c: New test. > * gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c: New test. 
> > --- > gcc/config/riscv/autovec.md | 13 +++ > gcc/config/riscv/riscv-protos.h | 1 + > gcc/config/riscv/riscv-v.cc | 95 +++++++++++++++++++ > .../riscv/rvv/autovec/widen/popcount-1.c | 23 +++++ > .../riscv/rvv/autovec/widen/popcount_run-1.c | 50 ++++++++++ > 5 files changed, 182 insertions(+) > create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c > create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c > > diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md > index b5152bc91fd..9d32b91bdca 100644 > --- a/gcc/config/riscv/autovec.md > +++ b/gcc/config/riscv/autovec.md > @@ -922,6 +922,19 @@ > DONE; > }) > > +;; > ------------------------------------------------------------------------------- > +;; - [INT] POPCOUNT. > +;; > ------------------------------------------------------------------------------- > + > +(define_expand "popcount<mode>2" > + [(match_operand:VI 0 "register_operand") > + (match_operand:VI 1 "register_operand")] > + "TARGET_VECTOR" > +{ > + riscv_vector::expand_popcount (operands); > + DONE; > +}) > + > ;; > ------------------------------------------------------------------------------- > ;; ---- [FP] Unary operations > ;; > ------------------------------------------------------------------------------- > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h > index a729db44c32..ae40fbb4b53 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -321,6 +321,7 @@ void expand_select_vl (rtx *); > void expand_load_store (rtx *, bool); > void expand_gather_scatter (rtx *, bool); > void expand_cond_len_ternop (unsigned, rtx *); > +void expand_popcount (rtx *); > > /* Rounding mode bitfield for fixed point VXRM. 
*/ > enum fixed_point_rounding_mode > diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc > index c10e51b362e..b3caa4b188d 100644 > --- a/gcc/config/riscv/riscv-v.cc > +++ b/gcc/config/riscv/riscv-v.cc > @@ -3614,4 +3614,99 @@ expand_reduction (rtx_code code, rtx *ops, rtx init, > reduction_type type) > emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2)); > } > > +/* Expand Vector POPCOUNT by parallel popcnt: > + > + int parallel_popcnt(uint32_t n) { > + #define POW2(c) (1U << (c)) > + #define MASK(c) (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U)) > + #define COUNT(x, c) ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c)) > + n = COUNT(n, 0); > + n = COUNT(n, 1); > + n = COUNT(n, 2); > + n = COUNT(n, 3); > + n = COUNT(n, 4); > + // n = COUNT(n, 5); // uncomment this line for 64-bit integers > + return n; > + #undef COUNT > + #undef MASK > + #undef POW2 > + } > +*/ > +void > +expand_popcount (rtx *ops) > +{ > + rtx dst = ops[0]; > + rtx src = ops[1]; > + machine_mode mode = GET_MODE (dst); > + scalar_mode smode = GET_MODE_INNER (mode); > + static const uint64_t mask_values[6] > + = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL, > + 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL}; > + > + unsigned bit_size = GET_MODE_BITSIZE (smode); bit_size is always less than or equal to 64, since we don't have TI mode for vector modes. > + unsigned word_size > + = (bit_size + LONG_LONG_TYPE_SIZE - 1) / LONG_LONG_TYPE_SIZE; So this is always 1, or were you trying to handle rv32 with ELEN64 here? > + rtx count = CONST0_RTX (mode); > + > + for (unsigned n = 0; n < word_size; ++n) Drop the outer loop if word_size is never larger than 1? > + { > + rtx part_value = src; > + for (unsigned i = 1, ct = 0; > + i > + < (bit_size > LONG_LONG_TYPE_SIZE ? LONG_LONG_TYPE_SIZE : > bit_size); Just bit_size should be fine, since LONG_LONG_TYPE_SIZE is the constant 64 and bit_size <= 64? 
> + i <<= 1, ++ct) > + { > + rtx mask_cst = gen_int_mode (mask_values[ct], smode); > + > + rtx vshift = expand_binop (mode, lshr_optab, part_value, > + gen_int_mode (i, smode), NULL_RTX, true, > + OPTAB_DIRECT); > + > + if (i == 4) > + { > + /* Optimize ((X & MASK) + ((X >> 4) & MASK)) > + > + -> (X + (X >> 4)) & MASK */ > + rtx rhs = expand_binop (mode, add_optab, part_value, vshift, > + NULL_RTX, false, OPTAB_DIRECT); > + part_value = gen_reg_rtx (mode); > + rtx part_ops[] = {part_value, rhs, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + part_ops); > + } > + else > + { > + rtx rhs = gen_reg_rtx (mode); > + rtx rhs_ops[] = {rhs, vshift, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + rhs_ops); > + if (i == 1) > + part_value = expand_binop (mode, sub_optab, part_value, rhs, > + NULL_RTX, false, OPTAB_DIRECT); > + else > + { > + rtx lhs = gen_reg_rtx (mode); > + rtx lhs_ops[] = {lhs, part_value, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), > RVV_BINOP, > + lhs_ops); > + > + part_value = expand_binop (mode, add_optab, lhs, rhs, > + NULL_RTX, false, OPTAB_DIRECT); > + } > + } > + } > + > + count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, > false, > + OPTAB_DIRECT); > + if (bit_size > LONG_LONG_TYPE_SIZE) No need for this if word_size is always 1. > + { > + src = expand_binop (mode, lshr_optab, src, > + gen_int_mode (LONG_LONG_TYPE_SIZE, smode), > + NULL_RTX, true, OPTAB_DIRECT); > + bit_size -= LONG_LONG_TYPE_SIZE; > + } > + } > + emit_move_insn (dst, count); > +} > + > } // namespace riscv_vector >