Re: Re: [PATCH] RISC-V: Support POPCOUNT auto-vectorization

钟居哲 Mon, 31 Jul 2023 07:32:12 -0700

Address comment and fix on V2:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625870.html 
Ok for trunk?




juzhe.zh...@rivai.ai
 
From: Kito Cheng
Date: 2023-07-31 21:38
To: Juzhe-Zhong
CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Support POPCOUNT auto-vectorization
On Mon, Jul 31, 2023 at 8:03 PM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote:
>
> This patch is inspired by "lowerCTPOP" in LLVM.
> Support popcount auto-vectorization by following LLVM approach.
> https://godbolt.org/z/3K3GzvY7f
>
> Before this patch:
>
> <source>:7:21: missed: couldn't vectorize loop
> <source>:8:14: missed: not vectorized: relevant stmt not supported: _5 = 
> __builtin_popcount (_4);
>
> After this patch:
>
> popcount_32:
>         ble     a2,zero,.L5
>         li      t3,1431654400
>         li      a7,858992640
>         li      t1,252645376
>         li      a6,16711680
>         li      a3,65536
>         addiw   t3,t3,1365
>         addiw   a7,a7,819
>         addiw   t1,t1,-241
>         addiw   a6,a6,255
>         addiw   a3,a3,-1
> .L3:
>         vsetvli a5,a2,e8,mf4,ta,ma
>         vle32.v v1,0(a1)
>         vsetivli        zero,4,e32,m1,ta,ma
>         vsrl.vi v2,v1,1
>         vand.vx v2,v2,t3
>         vsub.vv v1,v1,v2
>         vsrl.vi v2,v1,2
>         vand.vx v2,v2,a7
>         vand.vx v1,v1,a7
>         vadd.vv v1,v1,v2
>         vsrl.vi v2,v1,4
>         vadd.vv v1,v1,v2
>         vand.vx v1,v1,t1
>         vsrl.vi v2,v1,8
>         vand.vx v2,v2,a6
>         slli    a4,a5,2
>         vand.vx v1,v1,a6
>         vadd.vv v1,v1,v2
>         vsrl.vi v2,v1,16
>         vand.vx v1,v1,a3
>         vand.vx v2,v2,a3
>         vadd.vv v1,v1,v2
>         vmv.v.v v1,v1
>         vsetvli zero,a2,e32,m1,ta,ma
>         sub     a2,a2,a5
>         vse32.v v1,0(a0)
>         add     a1,a1,a4
>         add     a0,a0,a4
>         bne     a2,zero,.L3
> .L5:
>         ret
>
> gcc/ChangeLog:
>
>         * config/riscv/autovec.md (popcount<mode>2): New pattern.
>         * config/riscv/riscv-protos.h (expand_popcount): New function.
>         * config/riscv/riscv-v.cc (expand_popcount): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/widen/popcount-1.c: New test.
>         * gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c: New test.
>
> ---
>  gcc/config/riscv/autovec.md                   | 13 +++
>  gcc/config/riscv/riscv-protos.h               |  1 +
>  gcc/config/riscv/riscv-v.cc                   | 95 +++++++++++++++++++
>  .../riscv/rvv/autovec/widen/popcount-1.c      | 23 +++++
>  .../riscv/rvv/autovec/widen/popcount_run-1.c  | 50 ++++++++++
>  5 files changed, 182 insertions(+)
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index b5152bc91fd..9d32b91bdca 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -922,6 +922,19 @@
>    DONE;
>  })
>
> +;; 
> -------------------------------------------------------------------------------
> +;; - [INT] POPCOUNT.
> +;; 
> -------------------------------------------------------------------------------
> +
> +(define_expand "popcount<mode>2"
> +  [(match_operand:VI 0 "register_operand")
> +   (match_operand:VI 1 "register_operand")]
> +  "TARGET_VECTOR"
> +{
> +  riscv_vector::expand_popcount (operands);
> +  DONE;
> +})
> +
>  ;; 
> -------------------------------------------------------------------------------
>  ;; ---- [FP] Unary operations
>  ;; 
> -------------------------------------------------------------------------------
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index a729db44c32..ae40fbb4b53 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -321,6 +321,7 @@ void expand_select_vl (rtx *);
>  void expand_load_store (rtx *, bool);
>  void expand_gather_scatter (rtx *, bool);
>  void expand_cond_len_ternop (unsigned, rtx *);
> +void expand_popcount (rtx *);
>
>  /* Rounding mode bitfield for fixed point VXRM.  */
>  enum fixed_point_rounding_mode
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index c10e51b362e..b3caa4b188d 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -3614,4 +3614,99 @@ expand_reduction (rtx_code code, rtx *ops, rtx init, 
> reduction_type type)
>    emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2));
>  }
>
> +/* Expand Vector POPCOUNT by parallel popcnt:
> +
> +   int parallel_popcnt(uint32_t n) {
> +   #define POW2(c)      (1U << (c))
> +   #define MASK(c)      (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U))
> +   #define COUNT(x, c)  ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c))
> +       n = COUNT(n, 0);
> +       n = COUNT(n, 1);
> +       n = COUNT(n, 2);
> +       n = COUNT(n, 3);
> +       n = COUNT(n, 4);
> +   //  n = COUNT(n, 5);  // uncomment this line for 64-bit integers
> +       return n;
> +   #undef COUNT
> +   #undef MASK
> +   #undef POW2
> +   }
> +*/
> +void
> +expand_popcount (rtx *ops)
> +{
> +  rtx dst = ops[0];
> +  rtx src = ops[1];
> +  machine_mode mode = GET_MODE (dst);
> +  scalar_mode smode = GET_MODE_INNER (mode);
> +  static const uint64_t mask_values[6]
> +    = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
> +       0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
> +
> +  unsigned bit_size = GET_MODE_BITSIZE (smode);
 
bit_size is always euqal or less than 64 since we don't have TI mode
for vector modes.
 
> +  unsigned word_size
> +    = (bit_size + LONG_LONG_TYPE_SIZE - 1) / LONG_LONG_TYPE_SIZE;
 
So this is always 1, or you were trying to handle rv32 with ELEN64 here?
 
> +  rtx count = CONST0_RTX (mode);
> +
> +  for (unsigned n = 0; n < word_size; ++n)
 
Drop outer loop if word_size never larger than 1?
 
> +    {
> +      rtx part_value = src;
> +      for (unsigned i = 1, ct = 0;
> +          i
> +          < (bit_size > LONG_LONG_TYPE_SIZE ? LONG_LONG_TYPE_SIZE : 
> bit_size);
 
Just bit_size should be fine since LONG_LONG_TYPE_SIZE is constant 64
and bit_size <=64?
 
> +          i <<= 1, ++ct)
> +       {
> +         rtx mask_cst = gen_int_mode (mask_values[ct], smode);
> +
> +         rtx vshift = expand_binop (mode, lshr_optab, part_value,
> +                                    gen_int_mode (i, smode), NULL_RTX, true,
> +                                    OPTAB_DIRECT);
> +
> +         if (i == 4)
> +           {
> +             /* Optimize ((X & MASK) + ((X >> 4) & MASK))
> +
> +                -> (X + (X >> 4)) & MASK  */
> +             rtx rhs = expand_binop (mode, add_optab, part_value, vshift,
> +                                     NULL_RTX, false, OPTAB_DIRECT);
> +             part_value = gen_reg_rtx (mode);
> +             rtx part_ops[] = {part_value, rhs, mask_cst};
> +             emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                              part_ops);
> +           }
> +         else
> +           {
> +             rtx rhs = gen_reg_rtx (mode);
> +             rtx rhs_ops[] = {rhs, vshift, mask_cst};
> +             emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                              rhs_ops);
> +             if (i == 1)
> +               part_value = expand_binop (mode, sub_optab, part_value, rhs,
> +                                          NULL_RTX, false, OPTAB_DIRECT);
> +             else
> +               {
> +                 rtx lhs = gen_reg_rtx (mode);
> +                 rtx lhs_ops[] = {lhs, part_value, mask_cst};
> +                 emit_vlmax_insn (code_for_pred_scalar (AND, mode), 
> RVV_BINOP,
> +                                  lhs_ops);
> +
> +                 part_value = expand_binop (mode, add_optab, lhs, rhs,
> +                                            NULL_RTX, false, OPTAB_DIRECT);
> +               }
> +           }
> +       }
> +
> +      count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, 
> false,
> +                           OPTAB_DIRECT);
> +      if (bit_size > LONG_LONG_TYPE_SIZE)
 
No need this if word_size is constant 1.
 
> +       {
> +         src = expand_binop (mode, lshr_optab, src,
> +                             gen_int_mode (LONG_LONG_TYPE_SIZE, smode),
> +                             NULL_RTX, true, OPTAB_DIRECT);
> +         bit_size -= LONG_LONG_TYPE_SIZE;
> +       }
> +    }
> +  emit_move_insn (dst, count);
> +}
> +
>  } // namespace riscv_vector
>

Re: Re: [PATCH] RISC-V: Support POPCOUNT auto-vectorization

Reply via email to