A popcount of a single byte does not need the reduction addition (addv)
after the vector cnt instruction, because only one byte's bit count is
wanted.
This adds a new define_expand, popcountqi2, to handle that case.

Bootstrapped and tested on aarch64-linux-gnu with no regressions.

        PR target/113042

gcc/ChangeLog:

        * config/aarch64/aarch64.md (popcountqi2): New pattern.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/popcnt5.c: New test.

Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com>
---
 gcc/config/aarch64/aarch64.md              | 26 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++++++++++
 2 files changed, 45 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 389a1906e23..ebaf7ec9970 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5358,6 +5358,32 @@ (define_expand "popcount<mode>2"
     }
 })
 
+/* A QImode popcount needs no reduction after the vector popcount: only the
+   low byte of the V8QI result is kept.  With CSSC, popcountsi2 is used.  */
+(define_expand "popcountqi2"
+  [(set (match_operand:QI 0 "register_operand" "=w")
+       (popcount:QI (match_operand:QI 1 "register_operand" "w")))]
+  "TARGET_CSSC || TARGET_SIMD"
+{
+  rtx in = operands[1];
+  rtx out = operands[0];
+  if (TARGET_CSSC)  /* SImode path.  */
+    {
+      rtx tmp = gen_reg_rtx (SImode);
+      rtx out1 = gen_reg_rtx (SImode);
+      emit_insn (gen_zero_extendqisi2 (tmp, in));  /* Widen QI to SI.  */
+      emit_insn (gen_popcountsi2 (out1, tmp));
+      emit_move_insn (out, gen_lowpart (QImode, out1));  /* Keep low byte.  */
+      DONE;
+    }
+  rtx v = gen_reg_rtx (V8QImode);
+  rtx v1 = gen_reg_rtx (V8QImode);
+  emit_move_insn (v, gen_lowpart (V8QImode, in));  /* View IN as V8QI.  */
+  emit_insn (gen_popcountv8qi2 (v1, v));  /* No reduction follows.  */
+  emit_move_insn (out, gen_lowpart (QImode, v1));  /* Low byte of V1.  */
+  DONE;
+})
+
 (define_insn "clrsb<mode>2"
   [(set (match_operand:GPI 0 "register_operand" "=r")
         (clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
new file mode 100644
index 00000000000..406369d9b29
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h8:
+**     ldr     b[0-9]+, \[x0\]
+**     cnt     v[0-9]+.8b, v[0-9]+.8b
+**     smov    w0, v[0-9]+.b\[0\]
+**     ret
+*/
+/* No addv is expected: a single byte's popcount needs no lane reduction.  */
+
+unsigned h8 (const unsigned char *a) {
+         return __builtin_popcountg (a[0]);
+}
-- 
2.42.0

Reply via email to