Soumya AR <[email protected]> writes:
> The ASRD instruction on SVE performs an arithmetic shift right by an immediate
> for divide.
>
> This patch enables the use of ASRD with Neon modes.
>
> For example:
>
> int in[N], out[N];
>
> void
> foo (void)
> {
> for (int i = 0; i < N; i++)
> out[i] = in[i] / 4;
> }
>
> compiles to:
>
> ldr q31, [x1, x0]
> cmlt v30.16b, v31.16b, #0
> and z30.b, z30.b, 3
> add v30.16b, v30.16b, v31.16b
> sshr v30.16b, v30.16b, 2
> str q30, [x0, x2]
> add x0, x0, 16
> cmp x0, 1024
>
> but can just be:
>
> ldp q30, q31, [x0], 32
> asrd z31.b, p7/m, z31.b, #2
> asrd z30.b, p7/m, z30.b, #2
> stp q30, q31, [x1], 32
> cmp x0, x2
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Soumya AR <[email protected]>
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-sve.md: Extended sdiv_pow2<mode>3 and
> *sdiv_pow2<mode>3 to support Neon modes.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/sve/sve-asrd.c: New test.
> ---
> gcc/config/aarch64/aarch64-sve.md | 25 ++++-----
> .../gcc.target/aarch64/sve/sve-asrd.c | 54 +++++++++++++++++++
> 2 files changed, 67 insertions(+), 12 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
Rearranging to put the testcase first:
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
> b/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
> new file mode 100644
> index 00000000000..00aa8b2380d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
> @@ -0,0 +1,54 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=asimd-only" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <stdint.h>
> +#define N 1024
> +
> +#define FUNC(M) \
> +M in_##M[N]; \
> +M out_##M[N]; \
> +void asrd_##M() { \
> + for (int i = 0; i < N; i++) \
> + out_##M[i] = in_##M[i] / 4; \
> +}
> +
> +/*
> +** asrd_int8_t:
> +** ...
> +** ptrue (p[0-7]).b, vl1
> +** ...
> +** asrd z[0-9]+\.b, \1/m, z[0-9]+\.b, #2
> +** ...
> +*/
> +FUNC(int8_t)
ptrue ..., vl1 will only do the shift/division on the first element.
It should instead be vl8 for 64-bit modes and vl16 for 128-bit modes.
> +
> +/*
> +** asrd_int16_t:
> +** ...
> +** ptrue (p[0-7]).b, vl2
> +** ...
> +** asrd z[0-9]+\.h, \1/m, z[0-9]+\.h, #2
> +** ...
> +*/
> +FUNC(int16_t)
> +
> +/*
> +** asrd_int32_t:
> +** ...
> +** ptrue (p[0-7]).b, vl4
> +** ...
> +** asrd z[0-9]+\.s, \1/m, z[0-9]+\.s, #2
> +** ...
> +*/
> +FUNC(int32_t)
> +
> +/*
> +** asrd_int64_t:
> +** ...
> +** ptrue (p[0-7]).b, vl8
> +** ...
> +** asrd z[0-9]+\.d, \1/m, z[0-9]+\.d, #2
> +** ...
> +*/
> +FUNC(int64_t)
> diff --git a/gcc/config/aarch64/aarch64-sve.md
> b/gcc/config/aarch64/aarch64-sve.md
> index affdb24a93d..96effe4abed 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -4972,34 +4972,35 @@
>
> ;; Unpredicated ASRD.
> (define_expand "sdiv_pow2<mode>3"
> - [(set (match_operand:SVE_I 0 "register_operand")
> - (unspec:SVE_I
> + [(set (match_operand:SVE_VDQ_I 0 "register_operand")
> + (unspec:SVE_VDQ_I
> [(match_dup 3)
> - (unspec:SVE_I
> - [(match_operand:SVE_I 1 "register_operand")
> + (unspec:SVE_VDQ_I
> + [(match_operand:SVE_VDQ_I 1 "register_operand")
> (match_operand 2 "aarch64_simd_rshift_imm")]
> UNSPEC_ASRD)]
> UNSPEC_PRED_X))]
> "TARGET_SVE"
> {
> - operands[3] = aarch64_ptrue_reg (<VPRED>mode);
> + operands[3] = aarch64_ptrue_reg (<VPRED>mode,
> + GET_MODE_UNIT_SIZE (<MODE>mode));
Perhaps we should add yet another overload of aarch64_ptrue_reg that
takes the data mode as a second argument. The implementation could
be something like:
/* Return a register of mode PRED_MODE for controlling data of mode DATA_MODE.
DATA_MODE can be a scalar, an Advanced SIMD vector, or an SVE vector.
If it's an N-byte scalar or an Advanced SIMD vector, the first N bits
of the predicate will be active and the rest will be inactive.
If DATA_MODE is an SVE mode, every bit of the predicate will be active. */
static rtx
aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)
{
if (aarch64_sve_mode_p (data_mode))
return aarch64_ptrue_reg (pred_mode);
auto size = GET_MODE_SIZE (data_mode).to_constant ();
return aarch64_ptrue_reg (pred_mode, size);
}
Thanks,
Richard
> }
> )
>
> ;; Predicated ASRD.
> (define_insn "*sdiv_pow2<mode>3"
> - [(set (match_operand:SVE_I 0 "register_operand")
> - (unspec:SVE_I
> + [(set (match_operand:SVE_VDQ_I 0 "register_operand")
> + (unspec:SVE_VDQ_I
> [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_I
> - [(match_operand:SVE_I 2 "register_operand")
> - (match_operand:SVE_I 3 "aarch64_simd_rshift_imm")]
> + (unspec:SVE_VDQ_I
> + [(match_operand:SVE_VDQ_I 2 "register_operand")
> + (match_operand:SVE_VDQ_I 3 "aarch64_simd_rshift_imm")]
> UNSPEC_ASRD)]
> UNSPEC_PRED_X))]
> "TARGET_SVE"
> {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
> - [ w , Upl , 0 ; * ] asrd\t%0.<Vetype>, %1/m,
> %0.<Vetype>, #%3
> - [ ?&w , Upl , w ; yes ] movprfx\t%0,
> %2\;asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
> + [ w , Upl , 0 ; * ] asrd\t%Z0.<Vetype>, %1/m,
> %Z0.<Vetype>, #%3
> + [ ?&w , Upl , w ; yes ] movprfx\t%Z0,
> %Z2\;asrd\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, #%3
> }
> )
>