From f69833ee5a6927a5eff46a866d768bbb44fde3f0 Mon Sep 17 00:00:00 2001
From: Soumya AR <soumyaa@nvidia.com>
Date: Tue, 29 Oct 2024 10:35:42 +0530
Subject: [PATCH] aarch64: Optimise calls to ldexp with SVE FSCALE instruction
 [PR111733]

This patch uses the FSCALE instruction provided by SVE to implement the
standard ldexp family of functions.

Currently, with '-Ofast -mcpu=neoverse-v2', GCC generates libcalls for the
following code:

float
test_ldexpf (float x, int i)
{
	return __builtin_ldexpf (x, i);
}

double
test_ldexp (double x, int i)
{
	return __builtin_ldexp(x, i);
}

GCC Output:

test_ldexpf:
	b ldexpf

test_ldexp:
	b ldexp

Since SVE provides an FSCALE instruction, we can use it to handle scalar
floats by moving them to a vector register and emitting an FSCALE, similar
to how LLVM lowers the ldexp builtin.

New Output:

test_ldexpf:
	fmov	s31, w0
	ptrue	p7.b, vl4
	fscale	z0.s, p7/m, z0.s, z31.s
	ret

test_ldexp:
	sxtw	x0, w0
	ptrue	p7.b, vl8
	fmov	d31, x0
	fscale	z0.d, p7/m, z0.d, z31.d
	ret

The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Soumya AR <soumyaa@nvidia.com>

gcc/ChangeLog:

	PR target/111733
	* config/aarch64/aarch64-sve.md
	(ldexp<mode>3): New expand pattern to match ldexp calls with
	scalar floating-point modes and expand them to the existing
	FSCALE pattern.
	(@aarch64_pred_<optab><mode>): Extend the pattern to accept
	scalar floating-point modes in addition to the SVE modes.

	* config/aarch64/iterators.md
	(SVE_FULL_F_SCALAR): New iterator matching all fully-packed SVE
	FP vector modes as well as HF, SF, and DF.
	(VPRED): Extend the attribute to handle the GPF_HF modes.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/fscale.c: New test.
---
 gcc/config/aarch64/aarch64-sve.md             | 25 +++++++++++++++----
 gcc/config/aarch64/iterators.md               |  6 ++++-
 gcc/testsuite/gcc.target/aarch64/sve/fscale.c | 16 ++++++++++++
 3 files changed, 41 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/fscale.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 06bd3e4bb2c..119a0e53853 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5088,6 +5088,21 @@
 ;; - FTSSEL
 ;; -------------------------------------------------------------------------
 
+(define_expand "ldexp<mode>3"
+  [(set (match_operand:GPF_HF 0 "register_operand")
+	(unspec:GPF_HF
+	  [(match_dup 3)
+	   (const_int SVE_RELAXED_GP)
+	   (match_operand:GPF_HF 1 "register_operand")
+	   (match_operand:<V_INT_EQUIV> 2 "register_operand")]
+	  UNSPEC_COND_FSCALE))]
+  "TARGET_SVE"
+  {
+    operands[3] = aarch64_ptrue_reg (<VPRED>mode,
+				     GET_MODE_UNIT_SIZE (<MODE>mode));
+  }
+)
+
 ;; Unpredicated floating-point binary operations that take an integer as
 ;; their second operand.
 (define_insn "@aarch64_sve_<optab><mode>"
@@ -5103,17 +5118,17 @@
 ;; Predicated floating-point binary operations that take an integer
 ;; as their second operand.
 (define_insn "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-	(unspec:SVE_FULL_F
+  [(set (match_operand:SVE_FULL_F_SCALAR 0 "register_operand")
+	(unspec:SVE_FULL_F_SCALAR
 	  [(match_operand:<VPRED> 1 "register_operand")
 	   (match_operand:SI 4 "aarch64_sve_gp_strictness")
-	   (match_operand:SVE_FULL_F 2 "register_operand")
+	   (match_operand:SVE_FULL_F_SCALAR 2 "register_operand")
 	   (match_operand:<V_INT_EQUIV> 3 "register_operand")]
 	  SVE_COND_FP_BINARY_INT))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 , 3 ; attrs: movprfx ]
-     [ w        , Upl , 0 , w ; *              ] <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
-     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+     [ w        , Upl , 0 , w ; *              ] <sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
+     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
   }
 )
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0bc98315bb6..7f708ea14f9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -449,6 +449,9 @@
 ;; All fully-packed SVE floating-point vector modes.
 (define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF])
 
+;; Fully-packed SVE floating-point vector modes and their scalar equivalents.
+(define_mode_iterator SVE_FULL_F_SCALAR [VNx8HF VNx4SF VNx2DF HF SF DF])
+
 ;; Fully-packed SVE integer vector modes that have 8-bit or 16-bit elements.
 (define_mode_iterator SVE_FULL_BHI [VNx16QI VNx8HI])
 
@@ -2299,7 +2302,8 @@
 			 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
 			 (V8QI "VNx8BI") (V16QI "VNx16BI")
 			 (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
-			 (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
+			 (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")
+			 (HF "VNx8BI") (SF "VNx4BI") (DF "VNx2BI")])
 
 ;; ...and again in lower case.
 (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fscale.c b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
new file mode 100644
index 00000000000..251b4ef9188
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast" } */
+
+float
+test_ldexpf (float x, int i)
+{
+  return __builtin_ldexpf (x, i);
+}
+/* { dg-final { scan-assembler-times {\tfscale\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+
+double
+test_ldexp (double x, int i)
+{
+  return __builtin_ldexp (x, i);
+}
+/* { dg-final { scan-assembler-times {\tfscale\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
-- 
2.43.2

