Re: [PATCH 1/2] SVE intrinsics: Add strength reduction for division by constant.

Jennifer Schmitz Tue, 30 Jul 2024 00:47:28 -0700

Dear Richard,
Thanks for the feedback. Great to see this patch approved! I made the changes 
as suggested.
Best,
Jennifer

0001-SVE-intrinsics-Add-strength-reduction-for-division-b.patch
Description: Binary data


> On 29 Jul 2024, at 22:55, Richard Sandiford <richard.sandif...@arm.com> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> Thanks for doing this.
> 
> Jennifer Schmitz <jschm...@nvidia.com> writes:
>> [...]
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c 
>> b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
>> index c49ca1aa524..6500b64c41b 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
>> @@ -1,6 +1,9 @@
>> /* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>> 
>> #include "test_sve_acle.h"
>> +#include <stdint.h>
>> +
> 
> I think it'd better to drop the explicit include of stdint.h.  arm_sve.h
> is defined to include stdint.h itself, and we rely on that elsewhere.
> 
> Same for div_s64.c.
Done.
> 
>> +#define MAXPOW 1<<30
>> 
>> /*
>> ** div_s32_m_tied1:
>> @@ -53,10 +56,27 @@ TEST_UNIFORM_ZX (div_w0_s32_m_untied, svint32_t, int32_t,
>>               z0 = svdiv_n_s32_m (p0, z1, x0),
>>               z0 = svdiv_m (p0, z1, x0))
>> 
>> +/*
>> +** div_1_s32_m_tied1:
>> +**   sel     z0\.s, p0, z0\.s, z0\.s
>> +**   ret
>> +*/
>> +TEST_UNIFORM_Z (div_1_s32_m_tied1, svint32_t,
>> +             z0 = svdiv_n_s32_m (p0, z0, 1),
>> +             z0 = svdiv_m (p0, z0, 1))
>> +
>> +/*
>> +** div_1_s32_m_untied:
>> +**   sel     z0\.s, p0, z1\.s, z1\.s
>> +**   ret
>> +*/
>> +TEST_UNIFORM_Z (div_1_s32_m_untied, svint32_t,
>> +             z0 = svdiv_n_s32_m (p0, z1, 1),
>> +             z0 = svdiv_m (p0, z1, 1))
>> +
> 
> [ Thanks for adding the tests (which look good to me).  If the output
>  ever improves in future, we can "defend" the improvement by changing
>  the test.  But in the meantime, the above defends something that is
>  known to work. ]
> 
>> [...]
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c 
>> b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
>> new file mode 100644
>> index 00000000000..1a3c25b817d
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
>> @@ -0,0 +1,91 @@
>> +/* { dg-do run { target aarch64_sve128_hw } } */
>> +/* { dg-options "-O2 -msve-vector-bits=128" } */
>> +
>> +#include <arm_sve.h>
>> +#include <stdint.h>
>> +
>> +typedef svbool_t pred __attribute__((arm_sve_vector_bits(128)));
>> +typedef svfloat16_t svfloat16_ __attribute__((arm_sve_vector_bits(128)));
>> +typedef svfloat32_t svfloat32_ __attribute__((arm_sve_vector_bits(128)));
>> +typedef svfloat64_t svfloat64_ __attribute__((arm_sve_vector_bits(128)));
>> +typedef svint32_t svint32_ __attribute__((arm_sve_vector_bits(128)));
>> +typedef svint64_t svint64_ __attribute__((arm_sve_vector_bits(128)));
>> +typedef svuint32_t svuint32_ __attribute__((arm_sve_vector_bits(128)));
>> +typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>> +
>> +#define F(T, TS, P, OP1, OP2)                                               
>>  \
>> +{                                                                    \
>> +  T##_t op1 = (T##_t) OP1;                                           \
>> +  T##_t op2 = (T##_t) OP2;                                           \
>> +  sv##T##_ res = svdiv_##P (pg, svdup_##TS (op1), svdup_##TS (op2)); \
>> +  sv##T##_ exp = svdup_##TS (op1 / op2);                             \
>> +  if (svptest_any (pg, svcmpne (pg, exp, res)))                             
>>  \
>> +    __builtin_abort ();                                                     
>>  \
>> +                                                                     \
>> +  sv##T##_ res_n = svdiv_##P (pg, svdup_##TS (op1), op2);            \
>> +  if (svptest_any (pg, svcmpne (pg, exp, res_n)))                    \
>> +    __builtin_abort ();                                                     
>>  \
>> +}
>> +
>> +#define TEST_TYPES_1(T, TS)                                          \
>> +  F (T, TS, m, 79, 16)                                                      
>>  \
>> +  F (T, TS, z, 79, 16)                                                      
>>  \
>> +  F (T, TS, x, 79, 16)
>> +
>> +#define TEST_TYPES                                                   \
>> +  TEST_TYPES_1 (float16, f16)                                               
>>  \
>> +  TEST_TYPES_1 (float32, f32)                                               
>>  \
>> +  TEST_TYPES_1 (float64, f64)                                               
>>  \
>> +  TEST_TYPES_1 (int32, s32)                                          \
>> +  TEST_TYPES_1 (int64, s64)                                          \
>> +  TEST_TYPES_1 (uint32, u32)                                         \
>> +  TEST_TYPES_1 (uint64, u64)
>> +
>> +#define TEST_VALUES_S_1(B, OP1, OP2)                                 \
>> +  F (int##B, s##B, x, OP1, OP2)
>> +
>> +#define TEST_VALUES_S                                                       
>>  \
>> +  TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN)                         \
>> +  TEST_VALUES_S_1 (64, INT64_MIN, INT64_MIN)                         \
>> +  TEST_VALUES_S_1 (32, -7, 4)                                               
>>  \
>> +  TEST_VALUES_S_1 (64, -7, 4)                                               
>>  \
>> +  TEST_VALUES_S_1 (32, INT32_MAX, (1 << 30))                         \
>> +  TEST_VALUES_S_1 (64, INT64_MAX, (1ULL << 62))                             
>>  \
>> +  TEST_VALUES_S_1 (32, INT32_MIN, (1 << 30))                         \
>> +  TEST_VALUES_S_1 (64, INT64_MIN, (1ULL << 62))                             
>>  \
>> +  TEST_VALUES_S_1 (32, INT32_MAX, 1)                                 \
>> +  TEST_VALUES_S_1 (64, INT64_MAX, 1)                                 \
>> +  TEST_VALUES_S_1 (32, INT32_MIN, 16)                                       
>>  \
>> +  TEST_VALUES_S_1 (64, INT64_MIN, 16)                                       
>>  \
>> +  TEST_VALUES_S_1 (32, INT32_MAX, -5)                                       
>>  \
>> +  TEST_VALUES_S_1 (64, INT64_MAX, -5)                                       
>>  \
>> +  TEST_VALUES_S_1 (32, INT32_MIN, -4)                                       
>>  \
>> +  TEST_VALUES_S_1 (64, INT64_MIN, -4)
>> +
>> +#define TEST_VALUES_U_1(B, OP1, OP2)                                 \
>> +  F (uint##B, u##B, x, OP1, OP2)
>> +
>> +#define TEST_VALUES_U                                                       
>>  \
>> +  TEST_VALUES_U_1 (32, UINT32_MAX, UINT32_MAX)                              
>>  \
>> +  TEST_VALUES_U_1 (64, UINT64_MAX, UINT64_MAX)                              
>>  \
>> +  TEST_VALUES_U_1 (32, UINT32_MAX, (1 << 31))                               
>>  \
>> +  TEST_VALUES_U_1 (64, UINT64_MAX, (1ULL << 63))                     \
>> +  TEST_VALUES_U_1 (32, 7, 4)                                         \
>> +  TEST_VALUES_U_1 (64, 7, 4)                                         \
>> +  TEST_VALUES_U_1 (32, 7, 3)                                         \
>> +  TEST_VALUES_U_1 (64, 7, 3)                                         \
>> +  TEST_VALUES_U_1 (32, 11, 1)                                               
>>  \
>> +  TEST_VALUES_U_1 (64, 11, 1)
>> +
>> +#define TEST_VALUES                                                  \
>> +  TEST_VALUES_S                                                             
>>  \
>> +  TEST_VALUES_U
>> +
>> +int
>> +main (void)
>> +{
>> +  const pred pg = svptrue_b64 ();
> 
> I think this should svptrue_b8 instead.  As it stands, the:
> 
>  if (svptest_any (pg, svcmpne (pg, ...)))
>    __builtin_abort ();
> 
> tests will only check the first element in each 64-bit chunk.
Done.
> 
> OK with those changes from my POV, but please give others 24 hours
> to comment.
> 
> Thanks,
> Richard
> 
>> +  TEST_TYPES
>> +  TEST_VALUES
>> +  return 0;
>> +}

smime.p7s
Description: S/MIME cryptographic signature

Re: [PATCH 1/2] SVE intrinsics: Add strength reduction for division by constant.

Reply via email to