> -----Original Message-----
> From: Andrew Pinski (QUIC) <quic_apin...@quicinc.com>
> Sent: Monday, June 10, 2024 12:23 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Andrew Pinski (QUIC) <quic_apin...@quicinc.com>
> Subject: [PATCH v2] aarch64: Improve popcount for bytes
> [PR113042]
> 
> For popcount for bytes, we don't need the reduction addition
> after the vector cnt instruction as we are only counting one
> byte's popcount.
> This changes the popcount extend to cover all ALLI rather than
> GPI.

Ping? 
https://patchwork.sourceware.org/project/gcc/patch/20240610192255.402779-1-quic_apin...@quicinc.com/

Thanks,
Andrew Pinski

> 
> Changes since v1:
> * v2 - Use ALLI iterator and combine all into one pattern.
>        Add new testcases popcnt[6-8].c.
> 
> Bootstrapped and tested on aarch64-linux-gnu with no
> regressions.
> 
>       PR target/113042
> 
> gcc/ChangeLog:
> 
>       * config/aarch64/aarch64.md (popcount<mode>2):
> Update pattern
>       to support ALLI modes.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/aarch64/popcnt5.c: New test.
>       * gcc.target/aarch64/popcnt6.c: New test.
> 
> Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64.md              | 52
> +++++++++++++++++++---
>  gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++
> gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 ++++++++
> gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 ++++++++
> gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 ++++++++
>  5 files changed, 119 insertions(+), 7 deletions(-)  create mode
> 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c
>  create mode 100644
> gcc/testsuite/gcc.target/aarch64/popcnt6.c
>  create mode 100644
> gcc/testsuite/gcc.target/aarch64/popcnt7.c
>  create mode 100644
> gcc/testsuite/gcc.target/aarch64/popcnt8.c
> 
> diff --git a/gcc/config/aarch64/aarch64.md
> b/gcc/config/aarch64/aarch64.md index
> 389a1906e23..dd88fd891b5 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5332,28 +5332,66 @@ (define_insn
> "*aarch64_popcount<mode>2_cssc_insn"
>  ;; MOV       w0, v2.b[0]
> 
>  (define_expand "popcount<mode>2"
> -  [(set (match_operand:GPI 0 "register_operand")
> -     (popcount:GPI (match_operand:GPI 1
> "register_operand")))]
> +  [(set (match_operand:ALLI 0 "register_operand")
> +     (popcount:ALLI (match_operand:ALLI 1
> "register_operand")))]
>    "TARGET_CSSC || TARGET_SIMD"
>  {
> +  rtx in = operands[1];
> +  rtx out = operands[0];
> +  if (TARGET_CSSC
> +      && (<MODE>mode == HImode
> +          || <MODE>mode == QImode))
> +    {
> +      rtx tmp = gen_reg_rtx (SImode);
> +      rtx out1 = gen_reg_rtx (SImode);
> +      if (<MODE>mode == HImode)
> +        emit_insn (gen_zero_extendhisi2 (tmp, in));
> +      else
> +        emit_insn (gen_zero_extendqisi2 (tmp, in));
> +      emit_insn (gen_popcountsi2 (out1, tmp));
> +      emit_move_insn (out, gen_lowpart (<MODE>mode,
> out1));
> +      DONE;
> +    }
>    if (!TARGET_CSSC)
>      {
>        rtx v = gen_reg_rtx (V8QImode);
>        rtx v1 = gen_reg_rtx (V8QImode);
>        rtx in = operands[1];
>        rtx out = operands[0];
> -      if(<MODE>mode == SImode)
> +      /* SImode and HImode should be zero extended to
> DImode. */
> +      if (<MODE>mode == SImode || <MODE>mode ==
> HImode)
>       {
>         rtx tmp;
>         tmp = gen_reg_rtx (DImode);
> -       /* If we have SImode, zero extend to DImode, pop count
> does
> -          not change if we have extra zeros. */
> -       emit_insn (gen_zero_extendsidi2 (tmp, in));
> +       /* If we have SImode, zero extend to DImode,
> +          pop count does not change if we have extra zeros. */
> +       if (<MODE>mode == SImode)
> +         emit_insn (gen_zero_extendsidi2 (tmp, in));
> +       else
> +         emit_insn (gen_zero_extendhidi2 (tmp, in));
>         in = tmp;
>       }
>        emit_move_insn (v, gen_lowpart (V8QImode, in));
>        emit_insn (gen_popcountv8qi2 (v1, v));
> -      emit_insn
> (gen_aarch64_zero_extend<mode>_reduc_plus_v8qi (out,
> v1));
> +      /* QImode, just extract from the v8qi vector.  */
> +      if (<MODE>mode == QImode)
> +     {
> +       emit_move_insn (out, gen_lowpart (QImode, v1));
> +     }
> +      /* HI and SI, reduction is zero extended to SImode. */
> +      else if (<MODE>mode == SImode || <MODE>mode ==
> HImode)
> +     {
> +       rtx out1;
> +       out1 = gen_reg_rtx (SImode);
> +       emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi
> (out1, v1));
> +       emit_move_insn (out, gen_lowpart (<MODE>mode,
> out1));
> +     }
> +      /* DImode, reduction is zero extended to DImode. */
> +      else
> +     {
> +       gcc_assert (<MODE>mode == DImode);
> +       emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi
> (out, v1));
> +     }
>        DONE;
>      }
>  })
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> new file mode 100644
> index 00000000000..406369d9b29
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** h8:
> +**   ldr     b[0-9]+, \[x0\]
> +**   cnt     v[0-9]+.8b, v[0-9]+.8b
> +**   smov    w0, v[0-9]+.b\[0\]
> +**   ret
> +*/
> +/* We should not need the addv here since we only need a
> byte popcount.
> +*/
> +
> +unsigned h8 (const unsigned char *a) {
> +       return __builtin_popcountg (a[0]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
> new file mode 100644
> index 00000000000..e882cb24126
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+nocssc"
> +
> +/*
> +** h8:
> +**   ldr     h[0-9]+, \[x0\]
> +**   cnt     v[0-9]+.8b, v[0-9]+.8b
> +**   addv    b[0-9]+, v[0-9]+.8b
> +**   fmov    w0, s[0-9]+
> +**   ret
> +*/
> +
> +unsigned h8 (const unsigned short *a) {
> +       return __builtin_popcountg (a[0]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt7.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
> new file mode 100644
> index 00000000000..8dfff211ae4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+cssc"
> +
> +/*
> +** h8:
> +**   ldrb    w[0-9]+, \[x0\]
> +**   cnt     w[0-9]+, w[0-9]+
> +**   ret
> +*/
> +/* We should not produce any extra zero extend for this code
> */
> +
> +unsigned h8 (const unsigned char *a) {
> +       return __builtin_popcountg (a[0]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt8.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
> new file mode 100644
> index 00000000000..66a88b6a929
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* PR target/113042 */
> +
> +#pragma GCC target "+cssc"
> +
> +/*
> +** h8:
> +**   ldrh    w[0-9]+, \[x0\]
> +**   cnt     w[0-9]+, w[0-9]+
> +**   ret
> +*/
> +/* We should not produce any extra zero extend for this code
> */
> +
> +unsigned h8 (const unsigned short *a) {
> +       return __builtin_popcountg (a[0]);
> +}
> --
> 2.42.0

Reply via email to