97875)

Kyrylo Tkachov via Gcc-patches Fri, 08 Jan 2021 01:51:01 -0800

Hi Christophe,

> -----Original Message-----
> From: Gcc-patches <gcc-patches-boun...@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 17 December 2020 17:48
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR
> target/97875)
> 
> This patch adds new movmisalign<mode>_mve_load and store patterns for
> MVE to help vectorization. They are very similar to their Neon
> counterparts, but use different iterators and instructions.
> 
> Indeed MVE supports less vectors modes than Neon, so we use
> the MVE_VLD_ST iterator where Neon uses VQX.
> 
> Since the supported modes are different from the ones valid for
> arithmetic operators, we introduce two new sets of macros:
> 
> ARM_HAVE_NEON_<MODE>_LDST
>   true if Neon has vector load/store instructions for <MODE>
> 
> ARM_HAVE_<MODE>_LDST
>   true if any vector extension has vector load/store instructions for <MODE>
>


I'm not a big fan of the big number of these macros ☹ but I understand why 
they're used, so I won't object.

> We move the movmisalign<mode> expander from neon.md to vec-
> commond.md, and
> replace the TARGET_NEON enabler with ARM_HAVE_<MODE>_LDST.
> 
> The patch also updates the mve-vneg.c test to scan for the better code
> generation when loading and storing the vectors involved: it checks
> that no 'orr' instruction is generated to cope with misalignment at
> runtime.
> This test was chosen among the other mve tests, but any other should
> be OK. Using a plain vector copy loop (dest[i] = a[i]) is not a good
> test because the compiler chooses to use memcpy.
> 
> For instance we now generate:
> test_vneg_s32x4:
>       vldrw.32       q3, [r1]
>       vneg.s32  q3, q3
>       vstrw.32       q3, [r0]
>       bx      lr
> 
> instead of:
> test_vneg_s32x4:
>       orr     r3, r1, r0
>       lsls    r3, r3, #28
>       bne     .L15
>       vldrw.32        q3, [r1]
>       vneg.s32  q3, q3
>       vstrw.32        q3, [r0]
>       bx      lr
>       .L15:
>       push    {r4, r5}
>       ldrd    r2, r3, [r1, #8]
>       ldrd    r5, r4, [r1]
>       rsbs    r2, r2, #0
>       rsbs    r5, r5, #0
>       rsbs    r4, r4, #0
>       rsbs    r3, r3, #0
>       strd    r5, r4, [r0]
>       pop     {r4, r5}
>       strd    r2, r3, [r0, #8]
>       bx      lr
> 
> 2020-12-15  Christophe Lyon  <christophe.l...@linaro.org>
> 
>       PR target/97875
>       gcc/
>       * config/arm/arm.h (ARM_HAVE_NEON_V8QI_LDST): New macro.
>       (ARM_HAVE_NEON_V16QI_LDST, ARM_HAVE_NEON_V4HI_LDST):
> Likewise.
>       (ARM_HAVE_NEON_V8HI_LDST, ARM_HAVE_NEON_V2SI_LDST):
> Likewise.
>       (ARM_HAVE_NEON_V4SI_LDST, ARM_HAVE_NEON_V4HF_LDST):
> Likewise.
>       (ARM_HAVE_NEON_V8HF_LDST, ARM_HAVE_NEON_V4BF_LDST):
> Likewise.
>       (ARM_HAVE_NEON_V8BF_LDST, ARM_HAVE_NEON_V2SF_LDST):
> Likewise.
>       (ARM_HAVE_NEON_V4SF_LDST, ARM_HAVE_NEON_DI_LDST):
> Likewise.
>       (ARM_HAVE_NEON_V2DI_LDST): Likewise.
>       (ARM_HAVE_V8QI_LDST, ARM_HAVE_V16QI_LDST): Likewise.
>       (ARM_HAVE_V4HI_LDST, ARM_HAVE_V8HI_LDST): Likewise.
>       (ARM_HAVE_V2SI_LDST, ARM_HAVE_V4SI_LDST,
> ARM_HAVE_V4HF_LDST): Likewise.
>       (ARM_HAVE_V8HF_LDST, ARM_HAVE_V4BF_LDST,
> ARM_HAVE_V8BF_LDST): Likewise.
>       (ARM_HAVE_V2SF_LDST, ARM_HAVE_V4SF_LDST,
> ARM_HAVE_DI_LDST): Likewise.
>       (ARM_HAVE_V2DI_LDST): Likewise.
>       * config/arm/mve.md (*movmisalign<mode>_mve_store): New
> pattern.
>       (*movmisalign<mode>_mve_load): New pattern.
>       * config/arm/neon.md (movmisalign<mode>): Move to ...
>       * config/arm/vec-common.md: ... here.
> 
>       PR target/97875
>       gcc/testsuite/
>       * gcc.target/arm/simd/mve-vneg.c: Update test.
> ---
>  gcc/config/arm/arm.h                         | 40 
> ++++++++++++++++++++++++++++
>  gcc/config/arm/mve.md                        | 25 +++++++++++++++++
>  gcc/config/arm/neon.md                       | 25 -----------------
>  gcc/config/arm/vec-common.md                 | 24 +++++++++++++++++
>  gcc/testsuite/gcc.target/arm/simd/mve-vneg.c |  3 +++
>  5 files changed, 92 insertions(+), 25 deletions(-)
> 
> diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
> index 4a63d33..d44e0c6 100644
> --- a/gcc/config/arm/arm.h
> +++ b/gcc/config/arm/arm.h
> @@ -1151,6 +1151,46 @@ extern const int arm_arch_cde_coproc_bits[];
>  #define ARM_HAVE_V8HF_ARITH (ARM_HAVE_NEON_V8HF_ARITH ||
> TARGET_HAVE_MVE_FLOAT)
>  #define ARM_HAVE_V4SF_ARITH (ARM_HAVE_NEON_V4SF_ARITH ||
> TARGET_HAVE_MVE_FLOAT)
> 
> +/* The conditions under which vector modes are supported by load/store
> +   instructions using Neon.  */
> +
> +#define ARM_HAVE_NEON_V8QI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V16QI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4HI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V8HI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V2SI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4SI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4HF_LDST TARGET_NEON_FP16INST
> +#define ARM_HAVE_NEON_V8HF_LDST TARGET_NEON_FP16INST
> +#define ARM_HAVE_NEON_V4BF_LDST TARGET_BF16_SIMD
> +#define ARM_HAVE_NEON_V8BF_LDST TARGET_BF16_SIMD
> +#define ARM_HAVE_NEON_V2SF_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V4SF_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_DI_LDST TARGET_NEON
> +#define ARM_HAVE_NEON_V2DI_LDST TARGET_NEON
> +
> +/* The conditions under which vector modes are supported by load/store
> +   instructions by any vector extension.  */
> +
> +#define ARM_HAVE_V8QI_LDST (ARM_HAVE_NEON_V8QI_LDST ||
> TARGET_REALLY_IWMMXT)
> +#define ARM_HAVE_V4HI_LDST (ARM_HAVE_NEON_V4HI_LDST ||
> TARGET_REALLY_IWMMXT)
> +#define ARM_HAVE_V2SI_LDST (ARM_HAVE_NEON_V2SI_LDST ||
> TARGET_REALLY_IWMMXT)
> +
> +#define ARM_HAVE_V16QI_LDST (ARM_HAVE_NEON_V16QI_LDST ||
> TARGET_HAVE_MVE)
> +#define ARM_HAVE_V8HI_LDST (ARM_HAVE_NEON_V8HI_LDST ||
> TARGET_HAVE_MVE)
> +#define ARM_HAVE_V4SI_LDST (ARM_HAVE_NEON_V4SI_LDST ||
> TARGET_HAVE_MVE)
> +#define ARM_HAVE_DI_LDST ARM_HAVE_NEON_DI_LDST
> +#define ARM_HAVE_V2DI_LDST ARM_HAVE_NEON_V2DI_LDST
> +
> +#define ARM_HAVE_V4HF_LDST ARM_HAVE_NEON_V4HF_LDST
> +#define ARM_HAVE_V2SF_LDST ARM_HAVE_NEON_V2SF_LDST
> +
> +#define ARM_HAVE_V4BF_LDST ARM_HAVE_NEON_V4BF_LDST
> +#define ARM_HAVE_V8BF_LDST ARM_HAVE_NEON_V8BF_LDST
> +
> +#define ARM_HAVE_V8HF_LDST (ARM_HAVE_NEON_V8HF_LDST ||
> TARGET_HAVE_MVE_FLOAT)
> +#define ARM_HAVE_V4SF_LDST (ARM_HAVE_NEON_V4SF_LDST ||
> TARGET_HAVE_MVE_FLOAT)
> +
>  /* The register numbers in sequence, for passing to arm_gen_load_multiple.
> */
>  extern int arm_regs_in_sequence[];
> 
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index b4c5a1e2..673a83c 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -10937,3 +10937,28 @@ (define_insn "arm_vcx3q<a>_p_v16qi"
>    [(set_attr "type" "coproc")
>     (set_attr "length" "8")]
>  )
> +
> +(define_insn "*movmisalign<mode>_mve_store"
> +  [(set (match_operand:MVE_VLD_ST 0 "neon_permissive_struct_operand"
>            "=Um")
> +     (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1
> "s_register_operand" " w")]
> +      UNSPEC_MISALIGNED_ACCESS))]
> +  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
> +   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE
> (<MODE>mode))
> +   && !BYTES_BIG_ENDIAN && unaligned_access"
> +  "vstr<V_sz_elem1>.<V_sz_elem>\t%q1, %E0"
> +  [(set_attr "type" "mve_store")
> +   (set_attr "length" "4")]

No need to specify the length here and in the other pattern. It's 4 by default.
Ok for master as long as bootstrap and testing on an arm-none-linux-gnueabihf 
target is clean (to ensure the Neon-related refactoring didn't hurt anything)
Thanks,
Kyrill


> +)
> +
> +
> +(define_insn "*movmisalign<mode>_mve_load"
> +  [(set (match_operand:MVE_VLD_ST 0 "s_register_operand"
>                "=w")
> +     (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1
> "neon_permissive_struct_operand" " Um")]
> +      UNSPEC_MISALIGNED_ACCESS))]
> +  "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))
> +   || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE
> (<MODE>mode))
> +   && !BYTES_BIG_ENDIAN && unaligned_access"
> +  "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1"
> +  [(set_attr "type" "mve_load")
> +   (set_attr "length" "4")]
> +)
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index d2e92ba..50220be 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -280,31 +280,6 @@ (define_split
>    neon_disambiguate_copy (operands, dest, src, 4);
>  })
> 
> -(define_expand "movmisalign<mode>"
> -  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> -     (unspec:VDQX [(match_operand:VDQX 1
> "neon_perm_struct_or_reg_operand")]
> -                  UNSPEC_MISALIGNED_ACCESS))]
> -  "TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access"
> -{
> -  rtx adjust_mem;
> -  /* This pattern is not permitted to fail during expansion: if both 
> arguments
> -     are non-registers (e.g. memory := constant, which can be created by the
> -     auto-vectorizer), force operand 1 into a register.  */
> -  if (!s_register_operand (operands[0], <MODE>mode)
> -      && !s_register_operand (operands[1], <MODE>mode))
> -    operands[1] = force_reg (<MODE>mode, operands[1]);
> -
> -  if (s_register_operand (operands[0], <MODE>mode))
> -    adjust_mem = operands[1];
> -  else
> -    adjust_mem = operands[0];
> -
> -  /* Legitimize address.  */
> -  if (!neon_vector_mem_operand (adjust_mem, 2, true))
> -    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> -
> -})
> -
>  (define_insn "*movmisalign<mode>_neon_store"
>    [(set (match_operand:VDX 0 "neon_permissive_struct_operand"
>       "=Um")
>       (unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")]
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-
> common.md
> index 2d0932b..f6a79e2 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -205,3 +205,27 @@ (define_expand "neg<mode>2"
>       (neg:VDQWH (match_operand:VDQWH 1 "s_register_operand" "")))]
>    "ARM_HAVE_<MODE>_ARITH"
>  )
> +
> +(define_expand "movmisalign<mode>"
> +  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
> +     (unspec:VDQX [(match_operand:VDQX 1
> "neon_perm_struct_or_reg_operand")]
> +      UNSPEC_MISALIGNED_ACCESS))]
> +  "ARM_HAVE_<MODE>_LDST && !BYTES_BIG_ENDIAN &&
> unaligned_access"
> +{
> +  rtx adjust_mem;
> +  /* This pattern is not permitted to fail during expansion: if both 
> arguments
> +     are non-registers (e.g. memory := constant, which can be created by the
> +     auto-vectorizer), force operand 1 into a register.  */
> +  if (!s_register_operand (operands[0], <MODE>mode)
> +      && !s_register_operand (operands[1], <MODE>mode))
> +    operands[1] = force_reg (<MODE>mode, operands[1]);
> +
> +  if (s_register_operand (operands[0], <MODE>mode))
> +    adjust_mem = operands[1];
> +  else
> +    adjust_mem = operands[0];
> +
> +  /* Legitimize address.  */
> +  if (!neon_vector_mem_operand (adjust_mem, 2, true))
> +    XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0));
> +})
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> index afd0d60..7945a06 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c
> @@ -47,3 +47,6 @@ FUNC(f, float, 16, 8, -, vneg)
>     functions above.  */
>  /* { dg-final { scan-assembler-times {vneg.s[0-9]+  q[0-9]+, q[0-9]+} 6 } } 
> */
>  /* { dg-final { scan-assembler-times {vneg.f[0-9]+  q[0-9]+, q[0-9]+} 2 } } 
> */
> +/* { dg-final { scan-assembler-times {vldr[bhw].[0-9]+\tq[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vstr[bhw].[0-9]+\tq[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-not {orr\tr[0-9]+, r[0-9]+, r[0-9]+} } } */
> --
> 2.7.4

RE: [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR target/97875)

Reply via email to