This patch add vector pattern for __builtin_ctz.

like __builtin_clz, only 32bit version of ctz supported.

for scalar version ctz, we expand it into:

  rbit
  clz

reverse bits first, then turn cout tailing zero into count leading zero.

while for vector version, rbit only support byte granularity .8B and .16B.
no half-word, and word. so we need to first reverse byte within word,
then reverse bits within byte. thus the generated instruction sequences are:

void
count_tz_v4si (unsigned *__restrict a, int *__restrict b)
{
  int i;
  for (i = 0; i < 4; i++)
    b[i] = __builtin_ctz (a[i]);
}

void
count_tz_v2si (unsigned *__restrict a, int *__restrict b)
{
  int i;
  for (i = 0; i < 2; i++)
    b[i] = __builtin_ctz (a[i]);
}

count_tz_v4si:
        ldr     q0, [x0]
        rev32   v0.16b, v0.16b
        rbit    v0.16b, v0.16b
        clz     v0.4s, v0.4s
        str     q0, [x1]
        ret

count_tz_v2si:
        ldr     d0, [x0]
        rev32   v0.8b, v0.8b
        rbit    v0.8b, v0.8b
        clz     v0.2s, v0.2s
        str     d0, [x1]
        ret

no regression on aarch64-none-gnu-linux qemu test.

ok for trunk?

thanks.

gcc/
  * config/aarch64/iterators.md (VS): New mode iterator.
  (vsi2qi): New mode attribute.
  (VSI2QI): Likewise.
  * config/aarch64/aarch64-simd-builtins.def: New entry for ctz.
  * config/aarch64/aarch64-simd.md (ctz<mode>2): New pattern for ctz.
  * config/aarch64/aarch64-builtins.c
  (aarch64_builtin_vectorized_function): Support BUILT_IN_CTZ.

gcc/testsuite/
  * gcc.target/aarch64/vect_ctz_1.c: New testcase.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 527445c..3250f3c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -1097,6 +1097,14 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
               return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_clzv4si];
             return NULL_TREE;
           }
+	case BUILT_IN_CTZ:
+          {
+	    if (AARCH64_CHECK_BUILTIN_MODE (2, S))
+	      return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_ctzv2si];
+	    else if (AARCH64_CHECK_BUILTIN_MODE (4, S))
+	      return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_ctzv4si];
+	    return NULL_TREE;
+          }
 #undef AARCH64_CHECK_BUILTIN_MODE
 #define AARCH64_CHECK_BUILTIN_MODE(C, N) \
   (out_mode == N##Imode && out_n == C \
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 62b7f33..c611b5c 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -46,6 +46,7 @@
   BUILTIN_VD_BHSI (BINOP, addp, 0)
   VAR1 (UNOP, addp, 0, di)
   BUILTIN_VDQ_BHSI (UNOP, clz, 2)
+  BUILTIN_VS (UNOP, ctz, 2)

   BUILTIN_VALL (GETLANE, be_checked_get_lane, 0)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ef196e4..5ee960f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -303,6 +303,20 @@
   [(set_attr "type" "neon_rbit")]
 )
 
+(define_expand "ctz<mode>2"
+  [(set (match_operand:VS 0 "register_operand")
+        (ctz:VS (match_operand:VS 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+     emit_insn (gen_bswap<mode> (operands[0], operands[1]));
+     rtx op0_castsi2qi = simplify_gen_subreg(<VS:VSI2QI>mode, operands[0],
+					     <MODE>mode, 0);
+     emit_insn (gen_aarch64_rbit<VS:vsi2qi> (op0_castsi2qi, op0_castsi2qi));
+     emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
+     DONE;
+  }
+)
+
 (define_insn "*aarch64_mul3_elt<mode>"
  [(set (match_operand:VMUL 0 "register_operand" "=w")
     (mult:VMUL
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 9935167..b416e6a 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -183,6 +183,9 @@
 ;; All byte modes.
 (define_mode_iterator VB [V8QI V16QI])
 
+;; 2 and 4 lane SI modes.
+(define_mode_iterator VS [V2SI V4SI])
+
 (define_mode_iterator TX [TI TF])
 
 ;; Opaque structure modes.
@@ -670,6 +673,9 @@
 		      (V2DI  "p") (V2DF  "p")
 		      (V2SF "p") (V4SF  "v")])
 
+(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
+(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+
 ;; -------------------------------------------------------------------
 ;; Code Iterators
 ;; -------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c b/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c
new file mode 100644
index 0000000..40823b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fno-inline" } */
+
+extern void abort ();
+
+#define TEST(name, subname, count) \
+void \
+count_tz_##name (unsigned *__restrict a, int *__restrict b) \
+{ \
+  int i; \
+  for (i = 0; i < count; i++) \
+    b[i] = __builtin_##subname (a[i]); \
+}
+
+#define CHECK(name, count, input, output) \
+  count_tz_##name (input, output); \
+  for (i = 0; i < count; i++) \
+    { \
+      if (output[i] != r[i]) \
+	abort (); \
+    }
+
+TEST (v4si, ctz, 4)
+TEST (v2si, ctz, 2)
+/* { dg-final { scan-assembler "clz\tv\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "clz\tv\[0-9\]+\.2s" } } */
+
+int
+main ()
+{
+  unsigned int x4[4] = { 0x0, 0xFF80, 0x1FFFF, 0xFF000000 };
+  int r[4] = { 32, 7, 0, 24 };
+  int d[4], i;
+
+  CHECK (v4si, 4, x4, d);
+  CHECK (v2si, 2, x4, d);
+
+  return 0;
+}
+
+/* { dg-final { cleanup-saved-temps } } */

Reply via email to