For a byte-sized popcount, we don't need the reduction addition (addv) after the vector cnt instruction, because only a single byte's popcount is being computed. This adds a new define_expand, popcountqi2, to handle that case.
Bootstrapped and tested on aarch64-linux-gnu with no regressions. PR target/113042 gcc/ChangeLog: * config/aarch64/aarch64.md (popcountqi2): New pattern. gcc/testsuite/ChangeLog: * gcc.target/aarch64/popcnt5.c: New test. Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com> --- gcc/config/aarch64/aarch64.md | 26 ++++++++++++++++++++++ gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 389a1906e23..ebaf7ec9970 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -5358,6 +5358,32 @@ (define_expand "popcount<mode>2" } }) +/* Popcount for byte can remove the reduction part after the popcount. + For optimization reasons, enabling this for CSSC. */ +(define_expand "popcountqi2" + [(set (match_operand:QI 0 "register_operand" "=w") + (popcount:QI (match_operand:QI 1 "register_operand" "w")))] + "TARGET_CSSC || TARGET_SIMD" +{ + rtx in = operands[1]; + rtx out = operands[0]; + if (TARGET_CSSC) + { + rtx tmp = gen_reg_rtx (SImode); + rtx out1 = gen_reg_rtx (SImode); + emit_insn (gen_zero_extendqisi2 (tmp, in)); + emit_insn (gen_popcountsi2 (out1, tmp)); + emit_move_insn (out, gen_lowpart (QImode, out1)); + DONE; + } + rtx v = gen_reg_rtx (V8QImode); + rtx v1 = gen_reg_rtx (V8QImode); + emit_move_insn (v, gen_lowpart (V8QImode, in)); + emit_insn (gen_popcountv8qi2 (v1, v)); + emit_move_insn (out, gen_lowpart (QImode, v1)); + DONE; +}) + (define_insn "clrsb<mode>2" [(set (match_operand:GPI 0 "register_operand" "=r") (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))] diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c new file mode 100644 index 00000000000..406369d9b29 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* 
{ dg-final { check-function-bodies "**" "" } } */ +/* PR target/113042 */ + +#pragma GCC target "+nocssc" + +/* +** h8: +** ldr b[0-9]+, \[x0\] +** cnt v[0-9]+.8b, v[0-9]+.8b +** smov w0, v[0-9]+.b\[0\] +** ret +*/ +/* We should not need the addv here since we only need a byte popcount. */ + +unsigned h8 (const unsigned char *a) { + return __builtin_popcountg (a[0]); +} -- 2.42.0