Pushed as r15-2659-ge4b8db26de352.

Pengxuan
> This patch improves the Advanced SIMD popcount expansion by using SVE if
> available.
> 
> For example, GCC currently generates the following code sequence for V2DI:
>   cnt     v31.16b, v31.16b
>   uaddlp  v31.8h, v31.16b
>   uaddlp  v31.4s, v31.8h
>   uaddlp  v31.2d, v31.4s
> 
> However, by using SVE, we can generate the following sequence instead:
>   ptrue   p7.b, all
>   cnt     z31.d, p7/m, z31.d
> 
> Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.
> 
> The scalar popcount expansion can also be improved similarly by using SVE
> and those changes will be included in a separate patch.
> 
>       PR target/113860
> 
> gcc/ChangeLog:
> 
>       * config/aarch64/aarch64-simd.md (popcount<mode>2): Add
> TARGET_SVE
>       support.
>       * config/aarch64/aarch64-sve.md
> (@aarch64_pred_<optab><mode>): Use new
>       iterator SVE_VDQ_I.
>       * config/aarch64/iterators.md (SVE_VDQ_I): New mode iterator.
>       (VPRED): Add V8QI, V16QI, V4HI, V8HI and V2SI.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/aarch64/popcnt-sve.c: New test.
> 
> Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64-simd.md            |  9 ++
>  gcc/config/aarch64/aarch64-sve.md             | 13 +--
>  gcc/config/aarch64/iterators.md               |  5 ++
>  gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++++++++++++++++++
>  4 files changed, 109 insertions(+), 6 deletions(-)  create mode 100644
> gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index bbeee221f37..895d6e5eab5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3508,6 +3508,15 @@ (define_expand "popcount<mode>2"
>       (popcount:VDQHSD (match_operand:VDQHSD 1
> "register_operand")))]
>    "TARGET_SIMD"
>    {
> +    if (TARGET_SVE)
> +      {
> +     rtx p = aarch64_ptrue_reg (<VPRED>mode);
> +     emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
> +                                                 p,
> +                                                 operands[1]));
> +     DONE;
> +      }
> +
>      /* Generate a byte popcount.  */
>      machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
>      rtx tmp = gen_reg_rtx (mode);
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-
> sve.md
> index 5331e7121d5..eb3705ae515 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -3104,16 +3104,16 @@ (define_expand "<optab><mode>2"
> 
>  ;; Integer unary arithmetic predicated with a PTRUE.
>  (define_insn "@aarch64_pred_<optab><mode>"
> -  [(set (match_operand:SVE_I 0 "register_operand")
> -     (unspec:SVE_I
> +  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
> +     (unspec:SVE_VDQ_I
>         [(match_operand:<VPRED> 1 "register_operand")
> -        (SVE_INT_UNARY:SVE_I
> -          (match_operand:SVE_I 2 "register_operand"))]
> +        (SVE_INT_UNARY:SVE_VDQ_I
> +          (match_operand:SVE_VDQ_I 2 "register_operand"))]
>         UNSPEC_PRED_X))]
>    "TARGET_SVE"
>    {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
> -     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m,
> %2.<Vetype>
> -     [ ?&w      , Upl , w ; yes            ] movprfx\t%0,
> %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
> +     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%Z0.<Vetype>, 
> %1/m,
> %Z2.<Vetype>
> +     [ ?&w      , Upl , w ; yes            ] movprfx\t%Z0,
> %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
>    }
>  )
> 
> @@ -3168,6 +3168,7 @@ (define_insn "*cond_<optab><mode>_any"
>    }
>  )
> 
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [INT] General unary arithmetic corresponding to unspecs  ;; 
> ---------------
> ----------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index f527b2cfeb8..ee3d1fb98fd 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -559,6 +559,9 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI
> VNx4QI VNx2QI  ;; element modes  (define_mode_iterator SVE_I_SIMD_DI
> [SVE_I V2DI])
> 
> +;; All SVE and Advanced SIMD integer vector modes.
> +(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
> +
>  ;; SVE integer vector modes whose elements are 16 bits or wider.
>  (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
>                               VNx4SI VNx2SI
> @@ -2278,6 +2281,8 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI")
> (VNx8QI "VNx8BI")
>                        (VNx32BF "VNx8BI")
>                        (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
>                        (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
> +                      (V8QI "VNx8BI") (V16QI "VNx16BI")
> +                      (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
>                        (V4SI "VNx4BI") (V2DI "VNx2BI")])
> 
>  ;; ...and again in lower case.
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> new file mode 100644
> index 00000000000..8e349efe390
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> @@ -0,0 +1,88 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8.2-a+sve -fno-vect-cost-model
> +-fno-schedule-insns -fno-schedule-insns2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** f_v4hi:
> +**   ptrue   (p[0-7]).b, all
> +**   ldr     d([0-9]+), \[x0\]
> +**   cnt     z\2.h, \1/m, z\2.h
> +**   str     d\2, \[x1\]
> +**   ret
> +*/
> +void
> +f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d) {
> +  d[0] = __builtin_popcount (b[0]);
> +  d[1] = __builtin_popcount (b[1]);
> +  d[2] = __builtin_popcount (b[2]);
> +  d[3] = __builtin_popcount (b[3]);
> +}
> +
> +/*
> +** f_v8hi:
> +**   ptrue   (p[0-7]).b, all
> +**   ldr     q([0-9]+), \[x0\]
> +**   cnt     z\2.h, \1/m, z\2.h
> +**   str     q\2, \[x1\]
> +**   ret
> +*/
> +void
> +f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d) {
> +  d[0] = __builtin_popcount (b[0]);
> +  d[1] = __builtin_popcount (b[1]);
> +  d[2] = __builtin_popcount (b[2]);
> +  d[3] = __builtin_popcount (b[3]);
> +  d[4] = __builtin_popcount (b[4]);
> +  d[5] = __builtin_popcount (b[5]);
> +  d[6] = __builtin_popcount (b[6]);
> +  d[7] = __builtin_popcount (b[7]);
> +}
> +
> +/*
> +** f_v2si:
> +**   ptrue   (p[0-7]).b, all
> +**   ldr     d([0-9]+), \[x0\]
> +**   cnt     z\2.s, \1/m, z\2.s
> +**   str     d\2, \[x1\]
> +**   ret
> +*/
> +void
> +f_v2si (unsigned int *__restrict b, unsigned int *__restrict d) {
> +  d[0] = __builtin_popcount (b[0]);
> +  d[1] = __builtin_popcount (b[1]);
> +}
> +
> +/*
> +** f_v4si:
> +**   ptrue   (p[0-7]).b, all
> +**   ldr     q([0-9]+), \[x0\]
> +**   cnt     z\2.s, \1/m, z\2.s
> +**   str     q\2, \[x1\]
> +**   ret
> +*/
> +void
> +f_v4si (unsigned int *__restrict b, unsigned int *__restrict d) {
> +  d[0] = __builtin_popcount (b[0]);
> +  d[1] = __builtin_popcount (b[1]);
> +  d[2] = __builtin_popcount (b[2]);
> +  d[3] = __builtin_popcount (b[3]);
> +}
> +
> +/*
> +** f_v2di:
> +**   ptrue   (p[0-7]).b, all
> +**   ldr     q([0-9]+), \[x0\]
> +**   cnt     z\2.d, \1/m, z\2.d
> +**   str     q\2, \[x1\]
> +**   ret
> +*/
> +void
> +f_v2di (unsigned long *__restrict b, unsigned long *__restrict d) {
> +  d[0] = __builtin_popcountll (b[0]);
> +  d[1] = __builtin_popcountll (b[1]);
> +}
> --
> 2.17.1

Reply via email to