From 43913cfa47b31d055a0456c863a30e3e44acc2f0 Mon Sep 17 00:00:00 2001
From: Jennifer Schmitz <jschmitz@nvidia.com>
Date: Fri, 2 Aug 2024 06:41:09 -0700
Subject: [PATCH] SVE intrinsics: Fold svdiv (svptrue, x, x) to ones

This patch folds calls to the SVE intrinsic svdiv to a vector of ones if
1) the predicate is svptrue and
2) the dividend and divisor are equal.
The fold is implemented in the gimple_folder for signed and unsigned
integer types. Corresponding test cases were added to the existing test
files.
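
For example, when compiled with SVE enabled (e.g. -march=armv8.2-a+sve),
a function like the following (an illustrative example of mine, not part
of the patch) now compiles to a single "mov z0.s, #1" instead of a
predicated SDIV:

  #include <arm_sve.h>

  svint32_t
  fold_to_ones (svint32_t x)
  {
    /* The predicate is all-true and both operands are equal, so the
       division folds to a vector of ones.  */
    return svdiv_s32_x (svptrue_b32 (), x, x);
  }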

The patch was bootstrapped and regtested on aarch64-linux-gnu with no
regressions.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
	Add optimization.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/acle/asm/div_s32.c: Add tests.
	* gcc.target/aarch64/sve/acle/asm/div_s64.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/div_u32.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/div_u64.c: Likewise.
---
 .../aarch64/aarch64-sve-builtins-base.cc      | 21 +++++++++++---
 .../gcc.target/aarch64/sve/acle/asm/div_s32.c | 27 +++++++++++++++++++
 .../gcc.target/aarch64/sve/acle/asm/div_s64.c | 27 +++++++++++++++++++
 .../gcc.target/aarch64/sve/acle/asm/div_u32.c | 27 +++++++++++++++++++
 .../gcc.target/aarch64/sve/acle/asm/div_u64.c | 27 +++++++++++++++++++
 5 files changed, 126 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index d55bee0b72f..e347d29c725 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -755,8 +755,23 @@ public:
   gimple *
   fold (gimple_folder &f) const override
   {
-    tree divisor = gimple_call_arg (f.call, 2);
-    tree divisor_cst = uniform_integer_cst_p (divisor);
+    tree pg = gimple_call_arg (f.call, 0);
+    tree op1 = gimple_call_arg (f.call, 1);
+    tree op2 = gimple_call_arg (f.call, 2);
+
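+    /* If the predicate is all-true and the dividend equals the divisor,
+       fold the call to a vector of ones.  */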
+    if (f.type_suffix (0).integer_p
+	&& is_ptrue (pg, f.type_suffix (0).element_bytes)
+	&& operand_equal_p (op1, op2, 0))
+      {
+	tree lhs_type = TREE_TYPE (f.lhs);
+	tree_vector_builder builder (lhs_type, 1, 1);
+	builder.quick_push (build_each_one_cst (TREE_TYPE (lhs_type)));
+	return gimple_build_assign (f.lhs, builder.build ());
+      }
+
+    tree divisor_cst = uniform_integer_cst_p (op2);
 
     if (!divisor_cst || !integer_pow2p (divisor_cst))
       return NULL;
@@ -770,7 +785,7 @@ public:
 				    shapes::binary_uint_opt_n, MODE_n,
 				    f.type_suffix_ids, GROUP_none, f.pred);
 	call = f.redirect_call (instance);
-	tree d = INTEGRAL_TYPE_P (TREE_TYPE (divisor)) ? divisor : divisor_cst;
+	tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : divisor_cst;
 	new_divisor = wide_int_to_tree (TREE_TYPE (d), tree_log2 (d));
       }
     else
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
index d5a23bf0726..09d0419f3ef 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
@@ -55,6 +55,15 @@ TEST_UNIFORM_ZX (div_w0_s32_m_untied, svint32_t, int32_t,
 		 z0 = svdiv_n_s32_m (p0, z1, x0),
 		 z0 = svdiv_m (p0, z1, x0))
 
+/*
+** div_same_s32_m_tied1:
+**	mov	z0\.s, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_s32_m_tied1, svint32_t,
+		z0 = svdiv_s32_m (svptrue_b32 (), z0, z0),
+		z0 = svdiv_m (svptrue_b32 (), z0, z0))
+
 /*
 ** div_1_s32_m_tied1:
 **	sel	z0\.s, p0, z0\.s, z0\.s
@@ -215,6 +224,15 @@ TEST_UNIFORM_ZX (div_w0_s32_z_untied, svint32_t, int32_t,
 		 z0 = svdiv_n_s32_z (p0, z1, x0),
 		 z0 = svdiv_z (p0, z1, x0))
 
+/*
+** div_same_s32_z_tied1:
+**	mov	z0\.s, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_s32_z_tied1, svint32_t,
+		z0 = svdiv_s32_z (svptrue_b32 (), z0, z0),
+		z0 = svdiv_z (svptrue_b32 (), z0, z0))
+
 /*
 ** div_1_s32_z_tied1:
 **	mov	(z[0-9]+\.s), #1
@@ -384,6 +402,15 @@ TEST_UNIFORM_ZX (div_w0_s32_x_untied, svint32_t, int32_t,
 		 z0 = svdiv_n_s32_x (p0, z1, x0),
 		 z0 = svdiv_x (p0, z1, x0))
 
+/*
+** div_same_s32_x_tied1:
+**	mov	z0\.s, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_s32_x_tied1, svint32_t,
+		z0 = svdiv_s32_x (svptrue_b32 (), z0, z0),
+		z0 = svdiv_x (svptrue_b32 (), z0, z0))
+
 /*
 ** div_1_s32_x_tied1:
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c
index cfed6f9c1b3..a9b3e1b0b89 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c
@@ -55,6 +55,15 @@ TEST_UNIFORM_ZX (div_x0_s64_m_untied, svint64_t, int64_t,
 		 z0 = svdiv_n_s64_m (p0, z1, x0),
 		 z0 = svdiv_m (p0, z1, x0))
 
+/*
+** div_same_s64_m_tied1:
+**	mov	z0\.d, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_s64_m_tied1, svint64_t,
+		z0 = svdiv_s64_m (svptrue_b64 (), z0, z0),
+		z0 = svdiv_m (svptrue_b64 (), z0, z0))
+
 /*
 ** div_1_s64_m_tied1:
 **	sel	z0\.d, p0, z0\.d, z0\.d
@@ -215,6 +224,15 @@ TEST_UNIFORM_ZX (div_x0_s64_z_untied, svint64_t, int64_t,
 		 z0 = svdiv_n_s64_z (p0, z1, x0),
 		 z0 = svdiv_z (p0, z1, x0))
 
+/*
+** div_same_s64_z_tied1:
+**	mov	z0\.d, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_s64_z_tied1, svint64_t,
+		z0 = svdiv_s64_z (svptrue_b64 (), z0, z0),
+		z0 = svdiv_z (svptrue_b64 (), z0, z0))
+
 /*
 ** div_1_s64_z_tied1:
 **	mov	(z[0-9]+\.d), #1
@@ -384,6 +402,15 @@ TEST_UNIFORM_ZX (div_x0_s64_x_untied, svint64_t, int64_t,
 		 z0 = svdiv_n_s64_x (p0, z1, x0),
 		 z0 = svdiv_x (p0, z1, x0))
 
+/*
+** div_same_s64_x_tied1:
+**	mov	z0\.d, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_s64_x_tied1, svint64_t,
+		z0 = svdiv_s64_x (svptrue_b64 (), z0, z0),
+		z0 = svdiv_x (svptrue_b64 (), z0, z0))
+
 /*
 ** div_1_s64_x_tied1:
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c
index 9707664caf4..edd3d03dfce 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c
@@ -55,6 +55,15 @@ TEST_UNIFORM_ZX (div_w0_u32_m_untied, svuint32_t, uint32_t,
 		 z0 = svdiv_n_u32_m (p0, z1, x0),
 		 z0 = svdiv_m (p0, z1, x0))
 
+/*
+** div_same_u32_m_tied1:
+**	mov	z0\.s, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_u32_m_tied1, svuint32_t,
+		z0 = svdiv_u32_m (svptrue_b32 (), z0, z0),
+		z0 = svdiv_m (svptrue_b32 (), z0, z0))
+
 /*
 ** div_1_u32_m_tied1:
 **	sel	z0\.s, p0, z0\.s, z0\.s
@@ -194,6 +203,15 @@ TEST_UNIFORM_ZX (div_w0_u32_z_untied, svuint32_t, uint32_t,
 		 z0 = svdiv_n_u32_z (p0, z1, x0),
 		 z0 = svdiv_z (p0, z1, x0))
 
+/*
+** div_same_u32_z_tied1:
+**	mov	z0\.s, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_u32_z_tied1, svuint32_t,
+		z0 = svdiv_u32_z (svptrue_b32 (), z0, z0),
+		z0 = svdiv_z (svptrue_b32 (), z0, z0))
+
 /*
 ** div_1_u32_z_tied1:
 **	mov	(z[0-9]+\.s), #1
@@ -336,6 +354,15 @@ TEST_UNIFORM_ZX (div_w0_u32_x_untied, svuint32_t, uint32_t,
 		 z0 = svdiv_n_u32_x (p0, z1, x0),
 		 z0 = svdiv_x (p0, z1, x0))
 
+/*
+** div_same_u32_x_tied1:
+**	mov	z0\.s, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_u32_x_tied1, svuint32_t,
+		z0 = svdiv_u32_x (svptrue_b32 (), z0, z0),
+		z0 = svdiv_x (svptrue_b32 (), z0, z0))
+
 /*
 ** div_1_u32_x_tied1:
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c
index 5247ebdac7a..3f6e581f122 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c
@@ -55,6 +55,15 @@ TEST_UNIFORM_ZX (div_x0_u64_m_untied, svuint64_t, uint64_t,
 		 z0 = svdiv_n_u64_m (p0, z1, x0),
 		 z0 = svdiv_m (p0, z1, x0))
 
+/*
+** div_same_u64_m_tied1:
+**	mov	z0\.d, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_u64_m_tied1, svuint64_t,
+		z0 = svdiv_u64_m (svptrue_b64 (), z0, z0),
+		z0 = svdiv_m (svptrue_b64 (), z0, z0))
+
 /*
 ** div_1_u64_m_tied1:
 **	sel	z0\.d, p0, z0\.d, z0\.d
@@ -194,6 +203,15 @@ TEST_UNIFORM_ZX (div_x0_u64_z_untied, svuint64_t, uint64_t,
 		 z0 = svdiv_n_u64_z (p0, z1, x0),
 		 z0 = svdiv_z (p0, z1, x0))
 
+/*
+** div_same_u64_z_tied1:
+**	mov	z0\.d, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_u64_z_tied1, svuint64_t,
+		z0 = svdiv_u64_z (svptrue_b64 (), z0, z0),
+		z0 = svdiv_z (svptrue_b64 (), z0, z0))
+
 /*
 ** div_1_u64_z_tied1:
 **	mov	(z[0-9]+\.d), #1
@@ -336,6 +354,15 @@ TEST_UNIFORM_ZX (div_x0_u64_x_untied, svuint64_t, uint64_t,
 		 z0 = svdiv_n_u64_x (p0, z1, x0),
 		 z0 = svdiv_x (p0, z1, x0))
 
+/*
+** div_same_u64_x_tied1:
+**	mov	z0\.d, #1
+**	ret
+*/
+TEST_UNIFORM_Z (div_same_u64_x_tied1, svuint64_t,
+		z0 = svdiv_u64_x (svptrue_b64 (), z0, z0),
+		z0 = svdiv_x (svptrue_b64 (), z0, z0))
+
 /*
 ** div_1_u64_x_tied1:
 **	ret
-- 
2.44.0

