> On 23 Oct 2024, at 16:40, Richard Sandiford <richard.sandif...@arm.com> wrote:
>
> External email: Use caution opening links or attachments
>
>
> Jennifer Schmitz <jschm...@nvidia.com> writes:
>> Because a neg instruction has lower latency and higher throughput than
>> sdiv and mul, svdiv and svmul by -1 can be folded to svneg. For svdiv,
>> this is already implemented on the RTL level; for svmul, the
>> optimization was still missing.
>> This patch implements folding to svneg for both operations using the
>> gimple_folder. For svdiv, the transform is applied if the divisor is -1.
>> Svmul is folded if either of the operands is -1. A case distinction of
>> the predication is made to account for the fact that svneg_m has 3 arguments
>> (argument 0 holds the values for the inactive lanes), while svneg_x and
>> svneg_z have only 2 arguments.
>> Tests were added or adjusted to check the produced assembly and runtime
>> tests were added to check correctness.
>>
>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
>> OK for mainline?
>>
>> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> Sorry for the slow review.
>
>> [...]
>> @@ -2033,12 +2054,37 @@ public:
>> if (integer_zerop (op1) || integer_zerop (op2))
>> return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs)));
>>
>> + /* If one of the operands is all integer -1, fold to svneg. */
>> + tree pg = gimple_call_arg (f.call, 0);
>> + tree negated_op = NULL;
>> + if (integer_minus_onep (op2))
>> + negated_op = op1;
>> + else if (integer_minus_onep (op1))
>> + negated_op = op2;
>> + if (!f.type_suffix (0).unsigned_p && negated_op)
>
> This is definitely ok, but it would be nice to handle the unsigned_p
> case too at some point. This would mean bit-casting to the equivalent
> signed type, doing the negation, and casting back. It would be good
> to have a helper for doing that (maybe with a lambda callback that
> emits the actual call) since I can imagine it will be needed elsewhere
> too.
>
>> [...]
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
>> index 13009d88619..1d605dbdd8d 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
>> @@ -183,14 +183,25 @@ TEST_UNIFORM_Z (mul_3_s32_m_untied, svint32_t,
>>
>> /*
>> ** mul_m1_s32_m:
>> -** mov (z[0-9]+)\.b, #-1
>> -** mul z0\.s, p0/m, z0\.s, \1\.s
>> +** neg z0\.s, p0/m, z0\.s
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s32_m, svint32_t,
>> z0 = svmul_n_s32_m (p0, z0, -1),
>> z0 = svmul_m (p0, z0, -1))
>>
>> +/*
>> +** mul_m1r_s32_m:
>> +** mov (z[0-9]+)\.b, #-1
>> +** mov (z[0-9]+)\.d, z0\.d
>> +** movprfx z0, \1
>> +** neg z0\.s, p0/m, \2\.s
>> +** ret
>> +*/
>> +TEST_UNIFORM_Z (mul_m1r_s32_m, svint32_t,
>> + z0 = svmul_s32_m (p0, svdup_s32 (-1), z0),
>> + z0 = svmul_m (p0, svdup_s32 (-1), z0))
>
> Maybe it would be better to test the untied case instead, by passing
> z1 rather than z0 as the final argument. Hopefully that would leave
> us with just the first and last instructions. (I think the existing
> tests already cover the awkward tied2 case well enough.)
>
> Same for the later similar tests.
>
> OK with that change, thanks.

Thanks for the review. I adjusted the 3 test cases (mul_m1r_s32_m/_x/_z) to
cover the untied version. For mul_m1r_s32_m this indeed results in MOV + NEG.
I also made a note to address the unsigned_p case in a separate patch.
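
For reference, a rough source-level sketch of the reinterpret/negate/reinterpret
idea for the unsigned case (an illustration with ACLE intrinsics and a made-up
function name; the actual fold would build the equivalent gimple rather than
call these intrinsics):

#include <arm_sve.h>

/* Illustration only: negate the active lanes of an unsigned vector by
   bit-casting to the equivalent signed type, negating, and casting back,
   which is what a fold of svmul_n_u32_x (pg, x, -1) would have to do.  */
svuint32_t
neg_u32_via_s32 (svbool_t pg, svuint32_t x)
{
  svint32_t sx = svreinterpret_s32_u32 (x);   /* bit-cast to signed */
  svint32_t nx = svneg_s32_x (pg, sx);        /* negate active lanes */
  return svreinterpret_u32_s32 (nx);          /* bit-cast back to unsigned */
}
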
The patch with the updated test cases was pushed to trunk: fc40202c1ac5d585bb236cdaf3a3968927e970a0

Thanks, Jennifer

>
> Richard
>
>> +
>> /*
>> ** mul_s32_z_tied1:
>> ** movprfx z0\.s, p0/z, z0\.s
>> @@ -597,13 +608,44 @@ TEST_UNIFORM_Z (mul_255_s32_x, svint32_t,
>>
>> /*
>> ** mul_m1_s32_x:
>> -** mul z0\.s, z0\.s, #-1
>> +** neg z0\.s, p0/m, z0\.s
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s32_x, svint32_t,
>> z0 = svmul_n_s32_x (p0, z0, -1),
>> z0 = svmul_x (p0, z0, -1))
>>
>> +/*
>> +** mul_m1r_s32_x:
>> +** neg z0\.s, p0/m, z0\.s
>> +** ret
>> +*/
>> +TEST_UNIFORM_Z (mul_m1r_s32_x, svint32_t,
>> + z0 = svmul_s32_x (p0, svdup_s32 (-1), z0),
>> + z0 = svmul_x (p0, svdup_s32 (-1), z0))
>> +
>> +/*
>> +** mul_m1_s32_z:
>> +** mov (z[0-9]+)\.d, z0\.d
>> +** movprfx z0\.s, p0/z, \1\.s
>> +** neg z0\.s, p0/m, \1\.s
>> +** ret
>> +*/
>> +TEST_UNIFORM_Z (mul_m1_s32_z, svint32_t,
>> + z0 = svmul_n_s32_z (p0, z0, -1),
>> + z0 = svmul_z (p0, z0, -1))
>> +
>> +/*
>> +** mul_m1r_s32_z:
>> +** mov (z[0-9]+)\.d, z0\.d
>> +** movprfx z0\.s, p0/z, \1\.s
>> +** neg z0\.s, p0/m, \1\.s
>> +** ret
>> +*/
>> +TEST_UNIFORM_Z (mul_m1r_s32_z, svint32_t,
>> + z0 = svmul_s32_z (p0, svdup_s32 (-1), z0),
>> + z0 = svmul_z (p0, svdup_s32 (-1), z0))
>> +
>> /*
>> ** mul_m127_s32_x:
>> ** mul z0\.s, z0\.s, #-127
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
>> index 530d9fc84a5..c05d184f2fe 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
>> @@ -192,8 +192,7 @@ TEST_UNIFORM_Z (mul_3_s64_m_untied, svint64_t,
>>
>> /*
>> ** mul_m1_s64_m:
>> -** mov (z[0-9]+)\.b, #-1
>> -** mul z0\.d, p0/m, z0\.d, \1\.d
>> +** neg z0\.d, p0/m, z0\.d
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s64_m, svint64_t,
>> @@ -625,7 +624,7 @@ TEST_UNIFORM_Z (mul_255_s64_x, svint64_t,
>>
>> /*
>> ** mul_m1_s64_x:
>> -** mul z0\.d, z0\.d, #-1
>> +** neg z0\.d, p0/m, z0\.d
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s64_x, svint64_t,
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
>> index 0c90a8bb832..efc952e3a52 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
>> @@ -183,8 +183,7 @@ TEST_UNIFORM_Z (mul_3_s8_m_untied, svint8_t,
>>
>> /*
>> ** mul_m1_s8_m:
>> -** mov (z[0-9]+)\.b, #-1
>> -** mul z0\.b, p0/m, z0\.b, \1\.b
>> +** neg z0\.b, p0/m, z0\.b
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s8_m, svint8_t,
>> @@ -587,7 +586,7 @@ TEST_UNIFORM_Z (mul_128_s8_x, svint8_t,
>>
>> /*
>> ** mul_255_s8_x:
>> -** mul z0\.b, z0\.b, #-1
>> +** neg z0\.b, p0/m, z0\.b
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_255_s8_x, svint8_t,
>> @@ -596,7 +595,7 @@ TEST_UNIFORM_Z (mul_255_s8_x, svint8_t,
>>
>> /*
>> ** mul_m1_s8_x:
>> -** mul z0\.b, z0\.b, #-1
>> +** neg z0\.b, p0/m, z0\.b
>> ** ret
>> */
>> TEST_UNIFORM_Z (mul_m1_s8_x, svint8_t,
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
>> index c96bb2763dc..60cf8345d6a 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
>> @@ -42,7 +42,9 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>> TEST_TYPES_1 (uint64, u64)
>>
>> #define TEST_VALUES_S_1(B, OP1, OP2) \
>> - F (int##B, s##B, x, OP1, OP2)
>> + F (int##B, s##B, x, OP1, OP2) \
>> + F (int##B, s##B, z, OP1, OP2) \
>> + F (int##B, s##B, m, OP1, OP2)
>>
>> #define TEST_VALUES_S \
>> TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN) \
>> @@ -60,7 +62,11 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>> TEST_VALUES_S_1 (32, INT32_MAX, -5) \
>> TEST_VALUES_S_1 (64, INT64_MAX, -5) \
>> TEST_VALUES_S_1 (32, INT32_MIN, -4) \
>> - TEST_VALUES_S_1 (64, INT64_MIN, -4)
>> + TEST_VALUES_S_1 (64, INT64_MIN, -4) \
>> + TEST_VALUES_S_1 (32, INT32_MAX, -1) \
>> + TEST_VALUES_S_1 (32, -7, -1) \
>> + TEST_VALUES_S_1 (64, INT64_MIN, -1) \
>> + TEST_VALUES_S_1 (64, 16, -1)
>>
>> #define TEST_VALUES_U_1(B, OP1, OP2) \
>> F (uint##B, u##B, x, OP1, OP2)
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
>> index c369d5be167..eb897d622fc 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
>> @@ -44,7 +44,9 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>> TEST_TYPES_1 (uint64, u64)
>>
>> #define TEST_VALUES_S_1(B, OP1, OP2) \
>> - F (int##B, s##B, x, OP1, OP2)
>> + F (int##B, s##B, x, OP1, OP2) \
>> + F (int##B, s##B, m, OP1, OP2) \
>> + F (int##B, s##B, z, OP1, OP2)
>>
>> #define TEST_VALUES_S \
>> TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN) \
>> @@ -70,7 +72,11 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>> TEST_VALUES_S_1 (32, INT32_MAX, -5) \
>> TEST_VALUES_S_1 (64, INT64_MAX, -5) \
>> TEST_VALUES_S_1 (32, INT32_MIN, -4) \
>> - TEST_VALUES_S_1 (64, INT64_MIN, -4)
>> + TEST_VALUES_S_1 (64, INT64_MIN, -4) \
>> + TEST_VALUES_S_1 (32, INT32_MAX, -1) \
>> + TEST_VALUES_S_1 (32, -7, -1) \
>> + TEST_VALUES_S_1 (64, INT64_MIN, -1) \
>> + TEST_VALUES_S_1 (64, 16, -1)
>>
>> #define TEST_VALUES_U_1(B, OP1, OP2) \
>> F (uint##B, u##B, x, OP1, OP2)
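
As a side note, not part of the patch: at the source level the adjusted test
expectations simply reflect that a multiplication by -1 is now treated like a
negation. For example, the two functions below (made-up names, shown only for
illustration) should compile to the same single predicated NEG under the fold.

#include <arm_sve.h>

/* Hypothetical example: with the fold, mul_by_m1 is expected to generate the
   same "neg z0.s, p0/m, z0.s" as explicit_neg.  */
svint32_t
mul_by_m1 (svbool_t pg, svint32_t x)
{
  return svmul_n_s32_x (pg, x, -1);
}

svint32_t
explicit_neg (svbool_t pg, svint32_t x)
{
  return svneg_s32_x (pg, x);
}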