Re: [PATCH 1/2] SVE intrinsics: Add strength reduction for division by constant.

Richard Sandiford Mon, 29 Jul 2024 13:55:23 -0700

Thanks for doing this.

Jennifer Schmitz <jschm...@nvidia.com> writes:
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
> index c49ca1aa524..6500b64c41b 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
> @@ -1,6 +1,9 @@
>  /* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>  
>  #include "test_sve_acle.h"
> +#include <stdint.h>
> +


I think it'd be better to drop the explicit include of stdint.h.  arm_sve.h
is defined to include stdint.h itself, and we rely on that elsewhere.

Same for div_s64.c.

> +#define MAXPOW 1<<30
>  
>  /*
>  ** div_s32_m_tied1:
> @@ -53,10 +56,27 @@ TEST_UNIFORM_ZX (div_w0_s32_m_untied, svint32_t, int32_t,
>                z0 = svdiv_n_s32_m (p0, z1, x0),
>                z0 = svdiv_m (p0, z1, x0))
>  
> +/*
> +** div_1_s32_m_tied1:
> +**   sel     z0\.s, p0, z0\.s, z0\.s
> +**   ret
> +*/
> +TEST_UNIFORM_Z (div_1_s32_m_tied1, svint32_t,
> +             z0 = svdiv_n_s32_m (p0, z0, 1),
> +             z0 = svdiv_m (p0, z0, 1))
> +
> +/*
> +** div_1_s32_m_untied:
> +**   sel     z0\.s, p0, z1\.s, z1\.s
> +**   ret
> +*/
> +TEST_UNIFORM_Z (div_1_s32_m_untied, svint32_t,
> +             z0 = svdiv_n_s32_m (p0, z1, 1),
> +             z0 = svdiv_m (p0, z1, 1))
> +

[ Thanks for adding the tests (which look good to me).  If the output
  ever improves in future, we can "defend" the improvement by changing
  the test.  But in the meantime, the above defends something that is
  known to work. ]

> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
> new file mode 100644
> index 00000000000..1a3c25b817d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
> @@ -0,0 +1,91 @@
> +/* { dg-do run { target aarch64_sve128_hw } } */
> +/* { dg-options "-O2 -msve-vector-bits=128" } */
> +
> +#include <arm_sve.h>
> +#include <stdint.h>
> +
> +typedef svbool_t pred __attribute__((arm_sve_vector_bits(128)));
> +typedef svfloat16_t svfloat16_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svfloat32_t svfloat32_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svfloat64_t svfloat64_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svint32_t svint32_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svint64_t svint64_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svuint32_t svuint32_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
> +
> +#define F(T, TS, P, OP1, OP2)                                                
> \
> +{                                                                    \
> +  T##_t op1 = (T##_t) OP1;                                           \
> +  T##_t op2 = (T##_t) OP2;                                           \
> +  sv##T##_ res = svdiv_##P (pg, svdup_##TS (op1), svdup_##TS (op2)); \
> +  sv##T##_ exp = svdup_##TS (op1 / op2);                             \
> +  if (svptest_any (pg, svcmpne (pg, exp, res)))                              
> \
> +    __builtin_abort ();                                                      
> \
> +                                                                     \
> +  sv##T##_ res_n = svdiv_##P (pg, svdup_##TS (op1), op2);            \
> +  if (svptest_any (pg, svcmpne (pg, exp, res_n)))                    \
> +    __builtin_abort ();                                                      
> \
> +}
> +
> +#define TEST_TYPES_1(T, TS)                                          \
> +  F (T, TS, m, 79, 16)                                                       
> \
> +  F (T, TS, z, 79, 16)                                                       
> \
> +  F (T, TS, x, 79, 16)
> +
> +#define TEST_TYPES                                                   \
> +  TEST_TYPES_1 (float16, f16)                                                
> \
> +  TEST_TYPES_1 (float32, f32)                                                
> \
> +  TEST_TYPES_1 (float64, f64)                                                
> \
> +  TEST_TYPES_1 (int32, s32)                                          \
> +  TEST_TYPES_1 (int64, s64)                                          \
> +  TEST_TYPES_1 (uint32, u32)                                         \
> +  TEST_TYPES_1 (uint64, u64)
> +
> +#define TEST_VALUES_S_1(B, OP1, OP2)                                 \
> +  F (int##B, s##B, x, OP1, OP2)
> +
> +#define TEST_VALUES_S                                                        
> \
> +  TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN)                         \
> +  TEST_VALUES_S_1 (64, INT64_MIN, INT64_MIN)                         \
> +  TEST_VALUES_S_1 (32, -7, 4)                                                
> \
> +  TEST_VALUES_S_1 (64, -7, 4)                                                
> \
> +  TEST_VALUES_S_1 (32, INT32_MAX, (1 << 30))                         \
> +  TEST_VALUES_S_1 (64, INT64_MAX, (1ULL << 62))                              
> \
> +  TEST_VALUES_S_1 (32, INT32_MIN, (1 << 30))                         \
> +  TEST_VALUES_S_1 (64, INT64_MIN, (1ULL << 62))                              
> \
> +  TEST_VALUES_S_1 (32, INT32_MAX, 1)                                 \
> +  TEST_VALUES_S_1 (64, INT64_MAX, 1)                                 \
> +  TEST_VALUES_S_1 (32, INT32_MIN, 16)                                        
> \
> +  TEST_VALUES_S_1 (64, INT64_MIN, 16)                                        
> \
> +  TEST_VALUES_S_1 (32, INT32_MAX, -5)                                        
> \
> +  TEST_VALUES_S_1 (64, INT64_MAX, -5)                                        
> \
> +  TEST_VALUES_S_1 (32, INT32_MIN, -4)                                        
> \
> +  TEST_VALUES_S_1 (64, INT64_MIN, -4)
> +
> +#define TEST_VALUES_U_1(B, OP1, OP2)                                 \
> +  F (uint##B, u##B, x, OP1, OP2)
> +
> +#define TEST_VALUES_U                                                        
> \
> +  TEST_VALUES_U_1 (32, UINT32_MAX, UINT32_MAX)                               
> \
> +  TEST_VALUES_U_1 (64, UINT64_MAX, UINT64_MAX)                               
> \
> +  TEST_VALUES_U_1 (32, UINT32_MAX, (1 << 31))                                
> \
> +  TEST_VALUES_U_1 (64, UINT64_MAX, (1ULL << 63))                     \
> +  TEST_VALUES_U_1 (32, 7, 4)                                         \
> +  TEST_VALUES_U_1 (64, 7, 4)                                         \
> +  TEST_VALUES_U_1 (32, 7, 3)                                         \
> +  TEST_VALUES_U_1 (64, 7, 3)                                         \
> +  TEST_VALUES_U_1 (32, 11, 1)                                                
> \
> +  TEST_VALUES_U_1 (64, 11, 1)
> +
> +#define TEST_VALUES                                                  \
> +  TEST_VALUES_S                                                              
> \
> +  TEST_VALUES_U
> +
> +int
> +main (void)
> +{
> +  const pred pg = svptrue_b64 ();

I think this should be svptrue_b8 instead.  As it stands, the:

  if (svptest_any (pg, svcmpne (pg, ...)))
    __builtin_abort ();

tests will only check the first element in each 64-bit chunk.

OK with those changes from my POV, but please give others 24 hours
to comment.

Thanks,
Richard

> +  TEST_TYPES
> +  TEST_VALUES
> +  return 0;
> +}

Re: [PATCH 1/2] SVE intrinsics: Add strength reduction for division by constant.

Reply via email to