> Pengxuan Zheng <quic_pzh...@quicinc.com> writes: > > This is similar to the recent improvements to the Advanced SIMD > > popcount expansion by using SVE. We can utilize SVE to generate more > > efficient code for scalar mode popcount too. > > > > Changes since v1: > > * v2: Add a new VNx1BI mode and a new test case for V1DI. > > Sorry for the delay in reviewing this, and for the run-around, > but: following the later discussion in the FLOGB thread about using SVE for > Advanced SIMD modes, the agreement was to use the full SVE predicate > mode, but with predicate restricted to the leading 64 bits or 128 bits (for > 64- > bit and 128-bit Advanced SIMD modes respectively). > I think we should do that even when it isn't strictly necessary, partly so > that all > Advanced SIMD code uses the same predicate, and partly to avoid bugs that > only show up on VL>128 targets. > > I'm afraid that means going back to VNx2BI, as in your original patch. > But we should use: > > ptrue pN.b, vl8 > > rather than: > > ptrue pN.b, all > > to set the predicate. We could do this by adding; > > rtx > aarch64_ptrue_reg (machine_mode mode, unsigned int vl) > > where "vl" is 8 for 64-bit modes and 16 for 128-bit modes. Like with the > current aarch64_ptrue_reg, the predicate would always be constructed in > VNx16BImode and then cast to the right mode.
Thanks for the feedback, Richard! I've abandoned the VNx1BI changes and added a new variant of aarch64_ptrue_reg as you suggested. Please let me know if I misunderstood your feedback or if you have any other comments. https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665338.html Thanks, Pengxuan > > Thanks, > Richard > > > > > PR target/113860 > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-modes.def (VECTOR_BOOL_MODE): Add > VNx1BI. > > (ADJUST_NUNITS): Likewise. > > (ADJUST_ALIGNMENT): Likewise. > > * config/aarch64/aarch64-simd.md (popcount<mode>2): Update > pattern to > > also support V1DI mode. > > * config/aarch64/aarch64.cc (aarch64_sve_pred_mode_p): Add > VNx1BImode. > > * config/aarch64/aarch64.md (popcount<mode>2): Add TARGET_SVE > support. > > * config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator. > > (SVE_VDQ_I): Add V1DI. > > (bitsize): Likewise. > > (VPRED): Likewise. > > (VEC_POP_MODE): New mode attribute. > > (vec_pop_mode): Likewise. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/popcnt11.c: New test. > > * gcc.target/aarch64/popcnt12.c: New test. > > > > Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com> > > --- > > gcc/config/aarch64/aarch64-modes.def | 3 ++ > > gcc/config/aarch64/aarch64-simd.md | 13 ++++- > > gcc/config/aarch64/aarch64.cc | 3 +- > > gcc/config/aarch64/aarch64.md | 9 ++++ > > gcc/config/aarch64/iterators.md | 16 ++++-- > > gcc/testsuite/gcc.target/aarch64/popcnt11.c | 58 > > +++++++++++++++++++++ gcc/testsuite/gcc.target/aarch64/popcnt12.c | > > 18 +++++++ > > 7 files changed, 114 insertions(+), 6 deletions(-) create mode > > 100644 gcc/testsuite/gcc.target/aarch64/popcnt11.c > > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt12.c > > > > diff --git a/gcc/config/aarch64/aarch64-modes.def > > b/gcc/config/aarch64/aarch64-modes.def > > index 25a22c1195e..d822d4dfc13 100644 > > --- a/gcc/config/aarch64/aarch64-modes.def > > +++ b/gcc/config/aarch64/aarch64-modes.def > > @@ -53,18 +53,21 @@ VECTOR_BOOL_MODE (VNx16BI, 16, BI, 2); > > VECTOR_BOOL_MODE (VNx8BI, 8, BI, 2); VECTOR_BOOL_MODE (VNx4BI, > 4, BI, > > 2); VECTOR_BOOL_MODE (VNx2BI, 2, BI, 2); > > +VECTOR_BOOL_MODE (VNx1BI, 1, BI, 2); > > > > ADJUST_NUNITS (VNx32BI, aarch64_sve_vg * 16); ADJUST_NUNITS > > (VNx16BI, aarch64_sve_vg * 8); ADJUST_NUNITS (VNx8BI, aarch64_sve_vg > > * 4); ADJUST_NUNITS (VNx4BI, aarch64_sve_vg * 2); ADJUST_NUNITS > > (VNx2BI, aarch64_sve_vg); > > +ADJUST_NUNITS (VNx1BI, exact_div (aarch64_sve_vg, 2)); > > > > ADJUST_ALIGNMENT (VNx32BI, 2); > > ADJUST_ALIGNMENT (VNx16BI, 2); > > ADJUST_ALIGNMENT (VNx8BI, 2); > > ADJUST_ALIGNMENT (VNx4BI, 2); > > ADJUST_ALIGNMENT (VNx2BI, 2); > > +ADJUST_ALIGNMENT (VNx1BI, 2); > > > > /* Bfloat16 modes. */ > > FLOAT_MODE (BF, 2, 0); > > diff --git a/gcc/config/aarch64/aarch64-simd.md > > b/gcc/config/aarch64/aarch64-simd.md > > index 23c03a96371..386b1fa1f4b 100644 > > --- a/gcc/config/aarch64/aarch64-simd.md > > +++ b/gcc/config/aarch64/aarch64-simd.md > > @@ -3515,8 +3515,9 @@ (define_insn > "popcount<mode>2<vczle><vczbe>" > > ) > > > > (define_expand "popcount<mode>2" > > - [(set (match_operand:VDQHSD 0 "register_operand") > > - (popcount:VDQHSD (match_operand:VDQHSD 1 > "register_operand")))] > > + [(set (match_operand:VDQHSD_V1DI 0 "register_operand") > > + (popcount:VDQHSD_V1DI > > + (match_operand:VDQHSD_V1DI 1 "register_operand")))] > > "TARGET_SIMD" > > { > > if (TARGET_SVE) > > @@ -3528,6 +3529,14 @@ (define_expand "popcount<mode>2" > > DONE; > > } > > > > + if (<MODE>mode == V1DImode) > > + { > > + rtx out = gen_reg_rtx (DImode); > > + emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, > operands[1]))); > > + emit_move_insn (operands[0], gen_lowpart (<MODE>mode, out)); > > + DONE; > > + } > > + > > /* Generate a byte popcount. */ > > machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode; > > rtx tmp = gen_reg_rtx (mode); > > diff --git a/gcc/config/aarch64/aarch64.cc > > b/gcc/config/aarch64/aarch64.cc index 92763d403c7..78f65f886b7 > 100644 > > --- a/gcc/config/aarch64/aarch64.cc > > +++ b/gcc/config/aarch64/aarch64.cc > > @@ -1476,7 +1476,8 @@ aarch64_sve_pred_mode_p (machine_mode > mode) > > && (mode == VNx16BImode > > || mode == VNx8BImode > > || mode == VNx4BImode > > - || mode == VNx2BImode)); > > + || mode == VNx2BImode > > + || mode == VNx1BImode)); > > } > > > > /* Three mutually-exclusive flags describing a vector or predicate > > type. */ diff --git a/gcc/config/aarch64/aarch64.md > > b/gcc/config/aarch64/aarch64.md index c54b29cd64b..ef52770f1cb > 100644 > > --- a/gcc/config/aarch64/aarch64.md > > +++ b/gcc/config/aarch64/aarch64.md > > @@ -5345,6 +5345,15 @@ (define_expand "popcount<mode>2" > > (popcount:ALLI (match_operand:ALLI 1 "register_operand")))] > > "TARGET_CSSC ? GET_MODE_BITSIZE (<MODE>mode) >= 32 : > TARGET_SIMD" > > { > > + if (!TARGET_CSSC && TARGET_SVE && <MODE>mode != QImode) > > + { > > + rtx tmp = gen_reg_rtx (<VEC_POP_MODE>mode); > > + rtx op1 = gen_lowpart (<VEC_POP_MODE>mode, operands[1]); > > + emit_insn (gen_popcount<vec_pop_mode>2 (tmp, op1)); > > + emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp)); > > + DONE; > > + } > > + > > if (!TARGET_CSSC) > > { > > rtx v = gen_reg_rtx (V8QImode); diff --git > > a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > > index 20a318e023b..6c46e4fec3b 100644 > > --- a/gcc/config/aarch64/iterators.md > > +++ b/gcc/config/aarch64/iterators.md > > @@ -290,6 +290,8 @@ (define_mode_iterator VDQHS [V4HI V8HI V2SI > V4SI]) > > ;; Advanced SIMD modes for H, S and D types. > > (define_mode_iterator VDQHSD [V4HI V8HI V2SI V4SI V2DI]) > > > > +(define_mode_iterator VDQHSD_V1DI [VDQHSD V1DI]) > > + > > ;; Advanced SIMD and scalar integer modes for H and S. > > (define_mode_iterator VSDQ_HSI [V4HI V8HI V2SI V4SI HI SI]) > > > > @@ -560,7 +562,7 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI > VNx4QI > > VNx2QI (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI]) > > > > ;; All SVE and Advanced SIMD integer vector modes. > > -(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I]) > > +(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I V1DI]) > > > > ;; SVE integer vector modes whose elements are 16 bits or wider. > > (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI @@ -1230,7 > > +1232,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16") > > (define_mode_attr bitsize [(V8QI "64") (V16QI "128") > > (V4HI "64") (V8HI "128") > > (V2SI "64") (V4SI "128") > > - (V2DI "128")]) > > + (V1DI "64") (V2DI "128")]) > > > > ;; Map a floating point or integer mode to the appropriate register > > name prefix (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") > > (DI "d")]) @@ -2284,7 +2286,7 @@ (define_mode_attr VPRED [(VNx16QI > "VNx16BI") (VNx8QI "VNx8BI") > > (VNx8DI "VNx2BI") (VNx8DF "VNx2BI") > > (V8QI "VNx8BI") (V16QI "VNx16BI") > > (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI") > > - (V4SI "VNx4BI") (V2DI "VNx2BI")]) > > + (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx1BI")]) > > > > ;; ...and again in lower case. > > (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi") @@ > > -2318,6 +2320,14 @@ (define_mode_attr VDOUBLE [(VNx16QI > "VNx32QI") > > (VNx4SI "VNx8SI") (VNx4SF "VNx8SF") > > (VNx2DI "VNx4DI") (VNx2DF "VNx4DF")]) > > > > +;; The Advanced SIMD modes of popcount corresponding to scalar modes. > > +(define_mode_attr VEC_POP_MODE [(QI "V8QI") (HI "V4HI") > > + (SI "V2SI") (DI "V1DI")]) > > + > > +;; ...and again in lower case. > > +(define_mode_attr vec_pop_mode [(QI "v8qi") (HI "v4hi") > > + (SI "v2si") (DI "v1di")]) > > + > > ;; On AArch64 the By element instruction doesn't have a 2S variant. > > ;; However because the instruction always selects a pair of values > > ;; The normal 3SAME instruction can be used here instead. > > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt11.c > > b/gcc/testsuite/gcc.target/aarch64/popcnt11.c > > new file mode 100644 > > index 00000000000..595b2f9eb93 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt11.c > > @@ -0,0 +1,58 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -march=armv8.2-a+sve" } */ > > +/* { dg-final { check-function-bodies "**" "" "" } } */ > > + > > +/* > > +** f_qi: > > +** ldr b([0-9]+), \[x0\] > > +** cnt v\1.8b, v\1.8b > > +** smov w0, v\1.b\[0\] > > +** ret > > +*/ > > +unsigned > > +f_qi (unsigned char *a) > > +{ > > + return __builtin_popcountg (a[0]); > > +} > > + > > +/* > > +** f_hi: > > +** ldr h([0-9]+), \[x0\] > > +** ptrue (p[0-7]).b, all > > +** cnt z\1.h, \2/m, z\1.h > > +** smov w0, v\1.h\[0\] > > +** ret > > +*/ > > +unsigned > > +f_hi (unsigned short *a) > > +{ > > + return __builtin_popcountg (a[0]); > > +} > > + > > +/* > > +** f_si: > > +** ldr s([0-9]+), \[x0\] > > +** ptrue (p[0-7]).b, all > > +** cnt z\1.s, \2/m, z\1.s > > +** umov x0, v\1.d\[0\] > > +** ret > > +*/ > > +unsigned > > +f_si (unsigned int *a) > > +{ > > + return __builtin_popcountg (a[0]); > > +} > > + > > +/* > > +** f_di: > > +** ldr d([0-9]+), \[x0\] > > +** ptrue (p[0-7])\.b, all > > +** cnt z\1\.d, \2/m, z\1\.d > > +** fmov x0, d\1 > > +** ret > > +*/ > > +unsigned > > +f_di (unsigned long *a) > > +{ > > + return __builtin_popcountg (a[0]); > > +} > > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt12.c > > b/gcc/testsuite/gcc.target/aarch64/popcnt12.c > > new file mode 100644 > > index 00000000000..f086cae55a2 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt12.c > > @@ -0,0 +1,18 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -fgimple" } */ > > +/* { dg-final { check-function-bodies "**" "" "" } } */ > > + > > +/* > > +** foo: > > +** cnt v0.8b, v0.8b > > +** addv b0, v0.8b > > +** ret > > +*/ > > +__Uint64x1_t __GIMPLE > > +foo (__Uint64x1_t x) > > +{ > > + __Uint64x1_t z; > > + > > + z = .POPCOUNT (x); > > + return z; > > +}