On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng <quic_pzh...@quicinc.com> wrote:
>
> SVE's INDEX instruction can be used to populate vectors by values starting from
> "base" and incremented by "step" for each subsequent value. We can take
> advantage of it to generate vector constants if TARGET_SVE is available and the
> base and step values are within [-16, 15].

Are there multiply-by-immediate or add-immediate instructions that could be
combined with INDEX to extend this to two-instruction sequences (e.g. for
series whose base or step falls outside the INDEX immediate range)?

> For example, with the following function:
>
> typedef int v4si __attribute__ ((vector_size (16)));
> v4si
> f_v4si (void)
> {
>   return (v4si){ 0, 1, 2, 3 };
> }
>
> GCC currently generates:
>
> f_v4si:
>         adrp    x0, .LC4
>         ldr     q0, [x0, #:lo12:.LC4]
>         ret
>
> .LC4:
>         .word   0
>         .word   1
>         .word   2
>         .word   3
>
> With this patch, we generate an INDEX instruction instead if TARGET_SVE is
> available.
>
> f_v4si:
>         index   z0.s, #0, #1
>         ret
>
>         PR target/113328
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64.cc (aarch64_simd_valid_immediate): Improve
>         handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE is
>         available.
>         (aarch64_output_simd_mov_immediate): Likewise.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
>         SVE's INDEX instruction.
>         * gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
>         * gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
>         * gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
>         * gcc.target/aarch64/sve/vec_init_3.c: New test.
>
> Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64.cc                 | 12 ++-
>  .../aarch64/sve/acle/general/dupq_1.c         |  3 +-
>  .../aarch64/sve/acle/general/dupq_2.c         |  3 +-
>  .../aarch64/sve/acle/general/dupq_3.c         |  3 +-
>  .../aarch64/sve/acle/general/dupq_4.c         |  3 +-
>  .../gcc.target/aarch64/sve/vec_init_3.c       | 99 +++++++++++++++++++
>  6 files changed, 114 insertions(+), 9 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 27e24ba70ab..6b3ca57d0eb 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
>    if (CONST_VECTOR_P (op)
>        && CONST_VECTOR_DUPLICATE_P (op))
>      n_elts = CONST_VECTOR_NPATTERNS (op);
> -  else if ((vec_flags & VEC_SVE_DATA)
> +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
>            && const_vec_series_p (op, &base, &step))
>      {
>        gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
> @@ -25249,6 +25249,16 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
>
>    if (which == AARCH64_CHECK_MOV)
>      {
> +      if (info.insn == simd_immediate_info::INDEX)
> +       {
> +         gcc_assert (TARGET_SVE);
> +         snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
> +                   HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
> +                   element_char, INTVAL (info.u.index.base),
> +                   INTVAL (info.u.index.step));
> +         return templ;
> +       }
> +
>        mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
>        shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
>                   ? "msl" : "lsl");
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> index 216699b0536..0940bedd0dd 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> @@ -10,7 +10,6 @@ dupq (int x)
>    return svdupq_s32 (x, 1, 2, 3);
>  }
>
> -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
>  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
>  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
> -/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
> index d494943a275..218a6601337 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
> @@ -10,7 +10,6 @@ dupq (int x)
>    return svdupq_s32 (x, 1, 2, 3);
>  }
>
> -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
>  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
>  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
> -/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t2\n\t\.word\t1\n} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
> index 4bc8259df07..245d43b75b5 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
> @@ -10,7 +10,6 @@ dupq (int x)
>    return svdupq_s32 (0, 1, x, 3);
>  }
>
> -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
>  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */
>  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
> -/* { dg-final { scan-assembler {\t\.word\t0\n\t\.word\t1\n\t\.word\t[^\n]*\n\t\.word\t3\n} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
> index 6f9f9f2f22f..cbee6f27b62 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
> @@ -10,7 +10,6 @@ dupq (int x)
>    return svdupq_s32 (0, 1, x, 3);
>  }
>
> -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
>  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */
>  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
> -/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t[^\n]*\n\t\.word\t1\n\t\.word\t0\n} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
> new file mode 100644
> index 00000000000..25910dbfa1f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
> @@ -0,0 +1,99 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef int v2si __attribute__ ((vector_size (8)));
> +typedef long v2di __attribute__ ((vector_size (16)));
> +
> +/*
> +** f_v16qi:
> +**     index   z0\.b, #0, #1
> +**     ret
> +*/
> +v16qi
> +f_v16qi (void)
> +{
> +  return (v16qi){ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
> +}
> +
> +/*
> +** f_v8qi:
> +**     index   z0\.b, #0, #1
> +**     ret
> +*/
> +v8qi
> +f_v8qi (void)
> +{
> +  return (v8qi){ 0, 1, 2, 3, 4, 5, 6, 7 };
> +}
> +
> +/*
> +** f_v8hi:
> +**     index   z0\.h, #0, #1
> +**     ret
> +*/
> +v8hi
> +f_v8hi (void)
> +{
> +  return (v8hi){ 0, 1, 2, 3, 4, 5, 6, 7 };
> +}
> +
> +/*
> +** f_v4hi:
> +**     index   z0\.h, #0, #1
> +**     ret
> +*/
> +v4hi
> +f_v4hi (void)
> +{
> +  return (v4hi){ 0, 1, 2, 3 };
> +}
> +
> +/*
> +** f_v4si:
> +**     index   z0\.s, #0, #1
> +**     ret
> +*/
> +v4si
> +f_v4si (void)
> +{
> +  return (v4si){ 0, 1, 2, 3 };
> +}
> +
> +/*
> +** f_v2si:
> +**     index   z0\.s, #0, #1
> +**     ret
> +*/
> +v2si
> +f_v2si (void)
> +{
> +  return (v2si){ 0, 1 };
> +}
> +
> +/*
> +** f_v2di:
> +**     index   z0\.d, #0, #1
> +**     ret
> +*/
> +v2di
> +f_v2di (void)
> +{
> +  return (v2di){ 0, 1 };
> +}
> +
> +/*
> +** g_v4si:
> +**     index   z0\.s, #3, #-4
> +**     ret
> +*/
> +v4si
> +g_v4si (void)
> +{
> +  return (v4si){ 3, -1, -5, -9 };
> +}
> --
> 2.17.1
>

Reply via email to