> On 12 Nov 2024, at 4:27 PM, Richard Sandiford <richard.sandif...@arm.com> wrote:
>
> External email: Use caution opening links or attachments
>
>
> Soumya AR <soum...@nvidia.com> writes:
>> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
>> index 06bd3e4bb2c..119a0e53853 100644
>> --- a/gcc/config/aarch64/aarch64-sve.md
>> +++ b/gcc/config/aarch64/aarch64-sve.md
>> @@ -5088,6 +5088,21 @@
>>  ;; - FTSSEL
>>  ;; -------------------------------------------------------------------------
>>
>> +(define_expand "ldexp<mode>3"
>> +  [(set (match_operand:GPF_HF 0 "register_operand")
>> +	(unspec:GPF_HF
>> +	  [(match_dup 3)
>> +	   (const_int SVE_RELAXED_GP)
>
> Sorry for only noticing now, but: this should be SVE_STRICT_GP instead of
> SVE_RELAXED_GP, since we don't want to allow other lanes to be made
> active later.
>
>> +	   (match_operand:GPF_HF 1 "register_operand")
>> +	   (match_operand:<V_INT_EQUIV> 2 "register_operand")]
>> +	  UNSPEC_COND_FSCALE))]
>> +  "TARGET_SVE"
>> +  {
>> +    operands[3] = aarch64_ptrue_reg (<VPRED>mode,
>> +				     GET_MODE_UNIT_SIZE (<MODE>mode));
>> +  }
>> +)
>> +
>>  ;; Unpredicated floating-point binary operations that take an integer as
>>  ;; their second operand.
>>  (define_insn "@aarch64_sve_<optab><mode>"
>> @@ -5103,17 +5118,17 @@
>>  ;; Predicated floating-point binary operations that take an integer
>>  ;; as their second operand.
>>  (define_insn "@aarch64_pred_<optab><mode>"
>> -  [(set (match_operand:SVE_FULL_F 0 "register_operand")
>> -	(unspec:SVE_FULL_F
>> +  [(set (match_operand:SVE_FULL_F_SCALAR 0 "register_operand")
>> +	(unspec:SVE_FULL_F_SCALAR
>>  	  [(match_operand:<VPRED> 1 "register_operand")
>>  	   (match_operand:SI 4 "aarch64_sve_gp_strictness")
>> -	   (match_operand:SVE_FULL_F 2 "register_operand")
>> +	   (match_operand:SVE_FULL_F_SCALAR 2 "register_operand")
>>  	   (match_operand:<V_INT_EQUIV> 3 "register_operand")]
>>  	  SVE_COND_FP_BINARY_INT))]
>>    "TARGET_SVE"
>>    {@ [ cons: =0 , 1   , 2 , 3 ; attrs: movprfx ]
>> -     [ w        , Upl , 0 , w ; *              ] <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
>> -     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
>> +     [ w        , Upl , 0 , w ; *              ] <sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
>> +     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
>>    }
>>  )
>>
>> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> index 8269b0cdcd9..4153c72954e 100644
>> --- a/gcc/config/aarch64/iterators.md
>> +++ b/gcc/config/aarch64/iterators.md
>> @@ -452,6 +452,9 @@
>>  ;; All fully-packed SVE floating-point vector modes.
>>  (define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF])
>>
>> +;; Fully-packed SVE floating-point vector modes and their scalar equivalents.
>> +(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF])
>> +
>>  ;; Fully-packed SVE integer vector modes that have 8-bit or 16-bit elements.
>>  (define_mode_iterator SVE_FULL_BHI [VNx16QI VNx8HI])
>>
>> @@ -2302,7 +2305,8 @@
>>  			      (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
>>  			      (V8QI "VNx8BI") (V16QI "VNx16BI")
>>  			      (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
>> -			      (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
>> +			      (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")
>> +			      (HF "VNx8BI") (SF "VNx4BI") (DF "VNx2BI")])
>>
>>  ;; ...and again in lower case.
>>  (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
>> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
>> index c3d0efc0f2c..09b7844d094 100644
>> --- a/gcc/internal-fn.def
>> +++ b/gcc/internal-fn.def
>> @@ -441,7 +441,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_FMADDSUB, ECF_CONST, vec_fmaddsub, ternary)
>>  DEF_INTERNAL_OPTAB_FN (VEC_FMSUBADD, ECF_CONST, vec_fmsubadd, ternary)
>>
>>  /* FP scales. */
>> -DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
>> +DEF_INTERNAL_FLT_FLOATN_FN (LDEXP, ECF_CONST, ldexp, binary)
>>
>>  /* Ternary math functions. */
>>  DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fscale.c b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
>> new file mode 100644
>> index 00000000000..2c32d410f6b
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
>> @@ -0,0 +1,46 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-Ofast" } */
>> +/* { dg-final { check-function-bodies "**" "" } } */
>> +
>> +/*
>> +** test_ldexpf16:
>> +**	...
>> +**	ptrue	p[0-7]\.b, vl2
>
> It would be more robust to capture the register using:
>
>   **	ptrue	(p[0-7])\.b, vl2
>
>> +**	...
>> +**	fscale	z[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h
>
> and then match it here using:
>
>   **	fscale	z[0-9]+\.h, \1/m, z[0-9]+\.h, z[0-9]+\.h
>
> Same for the other tests.
>
> OK with those changes if they work (no need for another review unless
> you'd prefer one).
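For reference, with the capture and backreference applied, the f16 check
becomes the following (a sketch; whitespace may differ in the committed
version, and the float and double tests follow the same pattern with
vl4/vl8 and the .s/.d suffixes):

  /*
  ** test_ldexpf16:
  **	...
  **	ptrue	(p[0-7])\.b, vl2
  **	...
  **	fscale	z[0-9]+\.h, \1/m, z[0-9]+\.h, z[0-9]+\.h
  **	ret
  */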
Made the changes and committed: 9b2915d95d855333d4d8f66b71a75f653ee0d076

Thanks a lot!

Best,
Soumya

>
> Thanks,
> Richard
>
>> +**	ret
>> +*/
>> +_Float16
>> +test_ldexpf16 (_Float16 x, int i)
>> +{
>> +  return __builtin_ldexpf16 (x, i);
>> +}
>> +
>> +/*
>> +** test_ldexpf:
>> +**	...
>> +**	ptrue	p[0-7]\.b, vl4
>> +**	...
>> +**	fscale	z[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s
>> +**	ret
>> +*/
>> +float
>> +test_ldexpf (float x, int i)
>> +{
>> +  return __builtin_ldexpf (x, i);
>> +}
>> +
>> +/*
>> +** test_ldexp:
>> +**	...
>> +**	ptrue	p[0-7]\.b, vl8
>> +**	...
>> +**	fscale	z[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d
>> +**	ret
>> +*/
>> +double
>> +test_ldexp (double x, int i)
>> +{
>> +  return __builtin_ldexp (x, i);
>> +}
>> +
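For anyone reading along, the code these checks expect looks roughly like
this for the float case (a sketch only: the register numbers are
illustrative, and the leading "..." in each check covers moving the
exponent into a vector register):

  test_ldexpf:
  	mov	z31.s, w0		// illustrative: copy the int exponent from w0 into a vector register
  	ptrue	p7.b, vl4		// vl4 = first four bytes active, i.e. exactly one 32-bit lane
  	fscale	z0.s, p7/m, z0.s, z31.s	// x * 2^i computed in that single active lane
  	ret

The single-lane ptrue is also why the governing predicate has to be strict:
with a relaxed predicate, later passes would be free to make the other
(undefined) lanes active as well.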