> On 10 Dec 2024, at 7:03 PM, Richard Sandiford <richard.sandif...@arm.com> 
> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> Soumya AR <soum...@nvidia.com> writes:
>> Hi Richard,
>> 
>> Thanks for reviewing this!
>> 
>> I’ve made the suggested changes and added the aarch64_ptrue_reg overload.
> 
> The updated patch is OK, thanks.

Thanks, committed: e5569a20cf3791553ac324269001a7c7c0e56242
> 
>> Thank you for providing the implementation for this, I have added you
>> as a co-author for the patch, hope that works :)
> 
> Sure, that's ok with me FWIW.  IMO it isn't really necessary though,
> so it's ok without too :)
> 
> Sometimes it just feels easier to write review comments in the form
> of code snippets, since that's more direct & precise than describing
> something in English.  It is still fundamentally a code review though.
> 
Yes, I understand, but I would prefer having it there rather than not :)

Best,
Soumya
> Richard
> 
>> 
>> Best,
>> Soumya
>> 
>> 
>>> On 5 Dec 2024, at 10:08 PM, Richard Sandiford <richard.sandif...@arm.com> 
>>> wrote:
>>> 
>>> External email: Use caution opening links or attachments
>>> 
>>> 
>>> Soumya AR <soum...@nvidia.com> writes:
>>>> The ASRD instruction on SVE performs an arithmetic shift right by an 
>>>> immediate
>>>> for divide.
>>>> 
>>>> This patch enables the use of ASRD with Neon modes.
>>>> 
>>>> For example:
>>>> 
>>>> int in[N], out[N];
>>>> 
>>>> void
>>>> foo (void)
>>>> {
>>>> for (int i = 0; i < N; i++)
>>>>   out[i] = in[i] / 4;
>>>> }
>>>> 
>>>> compiles to:
>>>> 
>>>>     ldr     q31, [x1, x0]
>>>>     cmlt    v30.16b, v31.16b, #0
>>>>     and     z30.b, z30.b, 3
>>>>     add     v30.16b, v30.16b, v31.16b
>>>>     sshr    v30.16b, v30.16b, 2
>>>>     str     q30, [x0, x2]
>>>>     add     x0, x0, 16
>>>>     cmp     x0, 1024
>>>> 
>>>> but can just be:
>>>> 
>>>>     ldp     q30, q31, [x0], 32
>>>>     asrd    z31.b, p7/m, z31.b, #2
>>>>     asrd    z30.b, p7/m, z30.b, #2
>>>>     stp     q30, q31, [x1], 32
>>>>     cmp     x0, x2
>>>> 
>>>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
>>>> regression.
>>>> OK for mainline?
>>>> 
>>>> Signed-off-by: Soumya AR <soum...@nvidia.com>
>>>> 
>>>> gcc/ChangeLog:
>>>> 
>>>>     * config/aarch64/aarch64-sve.md: Extended sdiv_pow2<mode>3 and
>>>>     *sdiv_pow2<mode>3 to support Neon modes.
>>>> 
>>>> gcc/testsuite/ChangeLog:
>>>> 
>>>>     * gcc.target/aarch64/sve/sve-asrd.c: New test.
>>>> ---
>>>> gcc/config/aarch64/aarch64-sve.md             | 25 ++++-----
>>>> .../gcc.target/aarch64/sve/sve-asrd.c         | 54 +++++++++++++++++++
>>>> 2 files changed, 67 insertions(+), 12 deletions(-)
>>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
>>> 
>>> Rearranging to put the testcase first:
>>> 
>>>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c 
>>>> b/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
>>>> new file mode 100644
>>>> index 00000000000..00aa8b2380d
>>>> --- /dev/null
>>>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
>>>> @@ -0,0 +1,54 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-options "-Ofast --param aarch64-autovec-preference=asimd-only" } 
>>>> */
>>>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>>>> +
>>>> +#include <stdint.h>
>>>> +#define N 1024
>>>> +
>>>> +#define FUNC(M)                     \
>>>> +M in_##M[N];                        \
>>>> +M out_##M[N];                       \
>>>> +void asrd_##M() {                   \
>>>> +  for (int i = 0; i < N; i++)       \
>>>> +    out_##M[i] = in_##M[i] / 4;     \
>>>> +}
>>>> +
>>>> +/*
>>>> +** asrd_int8_t:
>>>> +**   ...
>>>> +**   ptrue   (p[0-7]).b, vl1
>>>> +**   ...
>>>> +**   asrd    z[0-9]+\.b, \1/m, z[0-9]+\.b, #2
>>>> +**   ...
>>>> +*/
>>>> +FUNC(int8_t)
>>> 
>>> ptrue ..., vl1 will only do the shift/division on the first element.
>>> It should instead be vl8 for 64-bit modes and vl16 for 128-bit modes.
>>> 
>>>> +
>>>> +/*
>>>> +** asrd_int16_t:
>>>> +**   ...
>>>> +**   ptrue   (p[0-7]).b, vl2
>>>> +**   ...
>>>> +**   asrd    z[0-9]+\.h, \1/m, z[0-9]+\.h, #2
>>>> +**   ...
>>>> +*/
>>>> +FUNC(int16_t)
>>>> +
>>>> +/*
>>>> +** asrd_int32_t:
>>>> +**   ...
>>>> +**   ptrue   (p[0-7]).b, vl4
>>>> +**   ...
>>>> +**   asrd    z[0-9]+\.s, \1/m, z[0-9]+\.s, #2
>>>> +**   ...
>>>> +*/
>>>> +FUNC(int32_t)
>>>> +
>>>> +/*
>>>> +** asrd_int64_t:
>>>> +**   ...
>>>> +**   ptrue   (p[0-7]).b, vl8
>>>> +**   ...
>>>> +**   asrd    z[0-9]+\.d, \1/m, z[0-9]+\.d, #2
>>>> +**   ...
>>>> +*/
>>>> +FUNC(int64_t)
>>>> diff --git a/gcc/config/aarch64/aarch64-sve.md 
>>>> b/gcc/config/aarch64/aarch64-sve.md
>>>> index affdb24a93d..96effe4abed 100644
>>>> --- a/gcc/config/aarch64/aarch64-sve.md
>>>> +++ b/gcc/config/aarch64/aarch64-sve.md
>>>> @@ -4972,34 +4972,35 @@
>>>> 
>>>> ;; Unpredicated ASRD.
>>>> (define_expand "sdiv_pow2<mode>3"
>>>> -  [(set (match_operand:SVE_I 0 "register_operand")
>>>> -     (unspec:SVE_I
>>>> +  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
>>>> +     (unspec:SVE_VDQ_I
>>>>       [(match_dup 3)
>>>> -        (unspec:SVE_I
>>>> -          [(match_operand:SVE_I 1 "register_operand")
>>>> +        (unspec:SVE_VDQ_I
>>>> +          [(match_operand:SVE_VDQ_I 1 "register_operand")
>>>>           (match_operand 2 "aarch64_simd_rshift_imm")]
>>>>          UNSPEC_ASRD)]
>>>>      UNSPEC_PRED_X))]
>>>>  "TARGET_SVE"
>>>>  {
>>>> -    operands[3] = aarch64_ptrue_reg (<VPRED>mode);
>>>> +    operands[3] = aarch64_ptrue_reg (<VPRED>mode,
>>>> +                                 GET_MODE_UNIT_SIZE (<MODE>mode));
>>> 
>>> Perhaps we should add yet another overload of aarch64_ptrue_reg that
>>> takes the data mode as a second argument.  The implementation could
>>> be something like:
>>> 
>>> /* Return a register of mode PRED_MODE for controlling data of mode 
>>> DATA_MODE.
>>> 
>>>  DATA_MODE can be a scalar, an Advanced SIMD vector, or an SVE vector.
>>>  If it's an N-byte scalar or an Advanced SIMD vector, the first N bits
>>>  of the predicate will be active and the rest will be inactive.
>>>  If DATA_MODE is an SVE mode, every bit of the predicate will be active.  */
>>> static rtx
>>> aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)
>>> {
>>> if (aarch64_sve_mode_p (data_mode))
>>>   return aarch64_ptrue_reg (pred_mode);
>>> 
>>> auto size = GET_MODE_SIZE (data_mode).to_constant ();
>>> return aarch64_ptrue_reg (pred_mode, size);
>>> }
>>> 
>>> Thanks,
>>> Richard
>>> 
>>>>  }
>>>> )
>>>> 
>>>> ;; Predicated ASRD.
>>>> (define_insn "*sdiv_pow2<mode>3"
>>>> -  [(set (match_operand:SVE_I 0 "register_operand")
>>>> -     (unspec:SVE_I
>>>> +  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
>>>> +     (unspec:SVE_VDQ_I
>>>>       [(match_operand:<VPRED> 1 "register_operand")
>>>> -        (unspec:SVE_I
>>>> -          [(match_operand:SVE_I 2 "register_operand")
>>>> -           (match_operand:SVE_I 3 "aarch64_simd_rshift_imm")]
>>>> +        (unspec:SVE_VDQ_I
>>>> +          [(match_operand:SVE_VDQ_I 2 "register_operand")
>>>> +           (match_operand:SVE_VDQ_I 3 "aarch64_simd_rshift_imm")]
>>>>          UNSPEC_ASRD)]
>>>>       UNSPEC_PRED_X))]
>>>>  "TARGET_SVE"
>>>>  {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
>>>> -     [ w        , Upl , 0 ; *              ] asrd\t%0.<Vetype>, %1/m, 
>>>> %0.<Vetype>, #%3
>>>> -     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, 
>>>> %2\;asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
>>>> +     [ w        , Upl , 0 ; *              ] asrd\t%Z0.<Vetype>, %1/m, 
>>>> %Z0.<Vetype>, #%3
>>>> +     [ ?&w      , Upl , w ; yes            ] movprfx\t%Z0, 
>>>> %Z2\;asrd\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, #%3
>>>>  }
>>>> )
>> 
>> 
>> From 2f5df569bd71f87cf7ec580f8511eff3b771dcfb Mon Sep 17 00:00:00 2001
>> From: Soumya AR <soum...@nvidia.com>
>> Date: Tue, 10 Dec 2024 10:37:11 +0530
>> Subject: [PATCH] aarch64: Use SVE ASRD instruction with Neon modes.
>> 
>> The ASRD instruction on SVE performs an arithmetic shift right by an 
>> immediate
>> for divide.
>> 
>> This patch enables the use of ASRD with Neon modes.
>> 
>> For example:
>> 
>> int in[N], out[N];
>> 
>> void
>> foo (void)
>> {
>>  for (int i = 0; i < N; i++)
>>    out[i] = in[i] / 4;
>> }
>> 
>> compiles to:
>> 
>>      ldr     q31, [x1, x0]
>>      cmlt    v30.16b, v31.16b, #0
>>      and     z30.b, z30.b, 3
>>      add     v30.16b, v30.16b, v31.16b
>>      sshr    v30.16b, v30.16b, 2
>>      str     q30, [x0, x2]
>>      add     x0, x0, 16
>>      cmp     x0, 1024
>> 
>> but can just be:
>> 
>>      ldp     q30, q31, [x0], 32
>>      asrd    z31.b, p7/m, z31.b, #2
>>      asrd    z30.b, p7/m, z30.b, #2
>>      stp     q30, q31, [x1], 32
>>      cmp     x0, x2
>> 
>> This patch also adds the following overload:
>>      aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)
>> Depending on the data mode, the function returns a predicate with the
>> appropriate bits set.
>> 
>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
>> OK for mainline?
>> 
>> gcc/ChangeLog:
>> 
>>      * config/aarch64/aarch64.cc (aarch64_ptrue_reg): New overload.
>>      * config/aarch64/aarch64-protos.h (aarch64_ptrue_reg): Likewise.
>>      * config/aarch64/aarch64-sve.md: Extended sdiv_pow2<mode>3 and
>>      *sdiv_pow2<mode>3 to support Neon modes.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>      * gcc.target/aarch64/sve/sve-asrd.c: New test.
>> 
>> Co-authored-by: Richard Sandiford <richard.sandif...@arm.com>
>> Signed-off-by: Soumya AR <soum...@nvidia.com>
>> ---
>> gcc/config/aarch64/aarch64-protos.h           |  1 +
>> gcc/config/aarch64/aarch64-sve.md             | 24 +++---
>> gcc/config/aarch64/aarch64.cc                 | 16 ++++
>> .../gcc.target/aarch64/sve/sve-asrd.c         | 86 +++++++++++++++++++
>> 4 files changed, 115 insertions(+), 12 deletions(-)
>> create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
>> 
>> diff --git a/gcc/config/aarch64/aarch64-protos.h 
>> b/gcc/config/aarch64/aarch64-protos.h
>> index 72df4a575b3..554066d4cda 100644
>> --- a/gcc/config/aarch64/aarch64-protos.h
>> +++ b/gcc/config/aarch64/aarch64-protos.h
>> @@ -1017,6 +1017,7 @@ void aarch64_expand_mov_immediate (rtx, rtx);
>> rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type);
>> rtx aarch64_ptrue_reg (machine_mode);
>> rtx aarch64_ptrue_reg (machine_mode, unsigned int);
>> +rtx aarch64_ptrue_reg (machine_mode, machine_mode);
>> rtx aarch64_pfalse_reg (machine_mode);
>> bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
>> void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
>> diff --git a/gcc/config/aarch64/aarch64-sve.md 
>> b/gcc/config/aarch64/aarch64-sve.md
>> index affdb24a93d..494c2208ffc 100644
>> --- a/gcc/config/aarch64/aarch64-sve.md
>> +++ b/gcc/config/aarch64/aarch64-sve.md
>> @@ -4972,34 +4972,34 @@
>> 
>> ;; Unpredicated ASRD.
>> (define_expand "sdiv_pow2<mode>3"
>> -  [(set (match_operand:SVE_I 0 "register_operand")
>> -     (unspec:SVE_I
>> +  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
>> +     (unspec:SVE_VDQ_I
>>        [(match_dup 3)
>> -        (unspec:SVE_I
>> -          [(match_operand:SVE_I 1 "register_operand")
>> +        (unspec:SVE_VDQ_I
>> +          [(match_operand:SVE_VDQ_I 1 "register_operand")
>>            (match_operand 2 "aarch64_simd_rshift_imm")]
>>           UNSPEC_ASRD)]
>>       UNSPEC_PRED_X))]
>>   "TARGET_SVE"
>>   {
>> -    operands[3] = aarch64_ptrue_reg (<VPRED>mode);
>> +    operands[3] = aarch64_ptrue_reg (<VPRED>mode, <MODE>mode);
>>   }
>> )
>> 
>> ;; Predicated ASRD.
>> (define_insn "*sdiv_pow2<mode>3"
>> -  [(set (match_operand:SVE_I 0 "register_operand")
>> -     (unspec:SVE_I
>> +  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
>> +     (unspec:SVE_VDQ_I
>>        [(match_operand:<VPRED> 1 "register_operand")
>> -        (unspec:SVE_I
>> -          [(match_operand:SVE_I 2 "register_operand")
>> -           (match_operand:SVE_I 3 "aarch64_simd_rshift_imm")]
>> +        (unspec:SVE_VDQ_I
>> +          [(match_operand:SVE_VDQ_I 2 "register_operand")
>> +           (match_operand:SVE_VDQ_I 3 "aarch64_simd_rshift_imm")]
>>           UNSPEC_ASRD)]
>>        UNSPEC_PRED_X))]
>>   "TARGET_SVE"
>>   {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
>> -     [ w        , Upl , 0 ; *              ] asrd\t%0.<Vetype>, %1/m, 
>> %0.<Vetype>, #%3
>> -     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, 
>> %2\;asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
>> +     [ w        , Upl , 0 ; *              ] asrd\t%Z0.<Vetype>, %1/m, 
>> %Z0.<Vetype>, #%3
>> +     [ ?&w      , Upl , w ; yes            ] movprfx\t%Z0, 
>> %Z2\;asrd\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, #%3
>>   }
>> )
>> 
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index 00bcf18ae97..68321368da1 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -3682,6 +3682,22 @@ aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
>>   return gen_lowpart (mode, reg);
>> }
>> 
>> +/* Return a register of mode PRED_MODE for controlling data of mode 
>> DATA_MODE.
>> +
>> +   DATA_MODE can be a scalar, an Advanced SIMD vector, or an SVE vector.
>> +   If it's an N-byte scalar or an Advanced SIMD vector, the first N bits
>> +   of the predicate will be active and the rest will be inactive.
>> +   If DATA_MODE is an SVE mode, every bit of the predicate will be active.  
>> */
>> +rtx
>> +aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)
>> +{
>> +  if (aarch64_sve_mode_p (data_mode))
>> +    return aarch64_ptrue_reg (pred_mode);
>> +
>> +  auto size = GET_MODE_SIZE (data_mode).to_constant ();
>> +  return aarch64_ptrue_reg (pred_mode, size);
>> +}
>> +
>> /* Return an all-false predicate register of mode MODE.  */
>> 
>> rtx
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c 
>> b/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
>> new file mode 100644
>> index 00000000000..341baae505c
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
>> @@ -0,0 +1,86 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-Ofast --param aarch64-autovec-preference=asimd-only" } */
>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>> +
>> +#include <stdint.h>
>> +
>> +#define FUNC(TYPE, I)                                                       
>>    \
>> +  TYPE M_##TYPE##_##I[I];                                                   
>>    \
>> +  void asrd_##TYPE##_##I ()                                                 
>>    \
>> +  {                                                                         
>>    \
>> +    for (int i = 0; i < I; i++)                                             
>>    \
>> +      {                                                                     
>>    \
>> +     M_##TYPE##_##I[i] /= 4;                                                
>> \
>> +      }                                                                     
>>    \
>> +  }
>> +
>> +/*
>> +** asrd_int8_t_8:
>> +**   ...
>> +**   ptrue   (p[0-7]).b, vl8
>> +**   ...
>> +**   asrd    z[0-9]+\.b, \1/m, z[0-9]+\.b, #2
>> +**   ...
>> +*/
>> +FUNC(int8_t, 8);
>> +
>> +/*
>> +** asrd_int8_t_16:
>> +**   ...
>> +**   ptrue   (p[0-7]).b, vl16
>> +**   ...
>> +**   asrd    z[0-9]+\.b, \1/m, z[0-9]+\.b, #2
>> +**   ...
>> +*/
>> +FUNC(int8_t, 16);
>> +
>> +/*
>> +** asrd_int16_t_4:
>> +**   ...
>> +**   ptrue   (p[0-7]).b, vl8
>> +**   ...
>> +**   asrd    z[0-9]+\.h, \1/m, z[0-9]+\.h, #2
>> +**   ...
>> +*/
>> +FUNC(int16_t, 4);
>> +
>> +/*
>> +** asrd_int16_t_8:
>> +**   ...
>> +**   ptrue   (p[0-7]).b, vl16
>> +**   ...
>> +**   asrd    z[0-9]+\.h, \1/m, z[0-9]+\.h, #2
>> +**   ...
>> +*/
>> +FUNC(int16_t, 8);
>> +
>> +/*
>> +** asrd_int32_t_2:
>> +**   ...
>> +**   ptrue   (p[0-7]).b, vl8
>> +**   ...
>> +**   asrd    z[0-9]+\.s, \1/m, z[0-9]+\.s, #2
>> +**   ...
>> +*/
>> +FUNC(int32_t, 2);
>> +
>> +/*
>> +** asrd_int32_t_4:
>> +**   ...
>> +**   ptrue   (p[0-7]).b, vl16
>> +**   ...
>> +**   asrd    z[0-9]+\.s, \1/m, z[0-9]+\.s, #2
>> +**   ...
>> +*/
>> +FUNC(int32_t, 4);
>> +
>> +/*
>> +** asrd_int64_t_2:
>> +**   ...
>> +**   ptrue   (p[0-7]).b, vl16
>> +**   ...
>> +**   asrd    z[0-9]+\.d, \1/m, z[0-9]+\.d, #2
>> +**   ...
>> +*/
>> +FUNC(int64_t, 2);
>> +


Reply via email to