From a0977e694017cd9e7e09dae418c7f6a6013f5257 Mon Sep 17 00:00:00 2001
From: Soumya AR <soumyaa@nvidia.com>
Date: Mon, 23 Sep 2024 16:07:34 +0530
Subject: [PATCH] aarch64: Expand CTZ to RBIT + CLZ for SVE [PR109498]

Currently, we vectorize CTZ for SVE by using the following operation:
.CTZ (X) = (PREC - 1) - .CLZ (X & -X)

Instead, this patch expands CTZ to RBIT + CLZ for SVE, as suggested in PR109498.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Soumya AR <soumyaa@nvidia.com>

gcc/ChangeLog:
	PR target/109498
	* config/aarch64/aarch64-sve.md (ctz<mode>2): Added pattern to expand
        CTZ to RBIT + CLZ for SVE.

gcc/testsuite/ChangeLog:
	PR target/109498
	* gcc.target/aarch64/sve/ctz.c: New test.
---
 gcc/config/aarch64/aarch64-sve.md          | 16 +++++++
 gcc/testsuite/gcc.target/aarch64/sve/ctz.c | 49 ++++++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/ctz.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bfa28849adf..10094f156b3 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3088,6 +3088,22 @@
 ;; - NOT
 ;; -------------------------------------------------------------------------
 
+(define_expand "ctz<mode>2"
+  [(set (match_operand:SVE_I 0 "register_operand")
+	(unspec:SVE_I
+	  [(match_dup 2)
+	   (ctz:SVE_I
+	     (match_operand:SVE_I 1 "register_operand"))]
+	  UNSPEC_PRED_X))]
+  "TARGET_SVE"
+  {
+     operands[2] = aarch64_ptrue_reg (<VPRED>mode);
+     emit_insn (gen_aarch64_pred_rbit<mode> (operands[0], operands[2],operands[1]));
+     emit_insn (gen_aarch64_pred_clz<mode> (operands[0], operands[2], operands[0]));
+     DONE;
+  }
+)
+
 ;; Unpredicated integer unary arithmetic.
 (define_expand "<optab><mode>2"
   [(set (match_operand:SVE_I 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ctz.c b/gcc/testsuite/gcc.target/aarch64/sve/ctz.c
new file mode 100644
index 00000000000..ef0cb8c0c96
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/ctz.c
@@ -0,0 +1,49 @@
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O3 --param aarch64-autovec-preference=sve-only" } */
+
+#include <stdint.h>
+
+#define FUNC(FUNCTION, NAME, DTYPE)                 \
+void                                                     \
+NAME (DTYPE *__restrict x, DTYPE *__restrict y, int n) { \
+  for (int i = 0; i < n; i++)                            \
+    x[i] = FUNCTION (y[i]);                         \
+}                                                        \
+
+
+/*
+** ctz_uint8:
+**	...
+**	rbit	z31.b, p6/m, z31.b
+**	clz	z31.b, p6/m, z31.b
+**	...
+*/
+FUNC (__builtin_ctzg, ctz_uint8, uint8_t)
+
+/*
+** ctz_uint16:
+**	...
+**	rbit	z31.h, p6/m, z31.h
+**	clz	z31.h, p6/m, z31.h
+**	...
+*/
+FUNC (__builtin_ctzg, ctz_uint16, uint16_t)
+
+/*
+** ctz_uint32:
+**	...
+**	rbit	z31.s, p6/m, z31.s
+**	clz	z31.s, p6/m, z31.s
+**	...
+*/
+FUNC (__builtin_ctz, ctz_uint32, uint32_t)
+
+/*
+** ctz_uint64:
+**	...
+**	rbit	z31.d, p6/m, z31.d
+**	clz	z31.d, p6/m, z31.d
+**	...
+*/
+FUNC (__builtin_ctzll, ctz_uint64, uint64_t)
+
-- 
2.43.2

