Hi Christophe, > -----Original Message----- > From: Gcc-patches <gcc-patches-boun...@gcc.gnu.org> On Behalf Of > Christophe Lyon via Gcc-patches > Sent: 17 December 2020 17:48 > To: gcc-patches@gcc.gnu.org > Subject: [PATCH 1/3] arm: Add movmisalign patterns for MVE (PR > target/97875) > > This patch adds new movmisalign<mode>_mve_load and store patterns for > MVE to help vectorization. They are very similar to their Neon > counterparts, but use different iterators and instructions. > > Indeed MVE supports less vectors modes than Neon, so we use > the MVE_VLD_ST iterator where Neon uses VQX. > > Since the supported modes are different from the ones valid for > arithmetic operators, we introduce two new sets of macros: > > ARM_HAVE_NEON_<MODE>_LDST > true if Neon has vector load/store instructions for <MODE> > > ARM_HAVE_<MODE>_LDST > true if any vector extension has vector load/store instructions for <MODE> >
I'm not a big fan of the big number of these macros ☹ but I understand why they're used, so I won't object. > We move the movmisalign<mode> expander from neon.md to vec- > commond.md, and > replace the TARGET_NEON enabler with ARM_HAVE_<MODE>_LDST. > > The patch also updates the mve-vneg.c test to scan for the better code > generation when loading and storing the vectors involved: it checks > that no 'orr' instruction is generated to cope with misalignment at > runtime. > This test was chosen among the other mve tests, but any other should > be OK. Using a plain vector copy loop (dest[i] = a[i]) is not a good > test because the compiler chooses to use memcpy. > > For instance we now generate: > test_vneg_s32x4: > vldrw.32 q3, [r1] > vneg.s32 q3, q3 > vstrw.32 q3, [r0] > bx lr > > instead of: > test_vneg_s32x4: > orr r3, r1, r0 > lsls r3, r3, #28 > bne .L15 > vldrw.32 q3, [r1] > vneg.s32 q3, q3 > vstrw.32 q3, [r0] > bx lr > .L15: > push {r4, r5} > ldrd r2, r3, [r1, #8] > ldrd r5, r4, [r1] > rsbs r2, r2, #0 > rsbs r5, r5, #0 > rsbs r4, r4, #0 > rsbs r3, r3, #0 > strd r5, r4, [r0] > pop {r4, r5} > strd r2, r3, [r0, #8] > bx lr > > 2020-12-15 Christophe Lyon <christophe.l...@linaro.org> > > PR target/97875 > gcc/ > * config/arm/arm.h (ARM_HAVE_NEON_V8QI_LDST): New macro. > (ARM_HAVE_NEON_V16QI_LDST, ARM_HAVE_NEON_V4HI_LDST): > Likewise. > (ARM_HAVE_NEON_V8HI_LDST, ARM_HAVE_NEON_V2SI_LDST): > Likewise. > (ARM_HAVE_NEON_V4SI_LDST, ARM_HAVE_NEON_V4HF_LDST): > Likewise. > (ARM_HAVE_NEON_V8HF_LDST, ARM_HAVE_NEON_V4BF_LDST): > Likewise. > (ARM_HAVE_NEON_V8BF_LDST, ARM_HAVE_NEON_V2SF_LDST): > Likewise. > (ARM_HAVE_NEON_V4SF_LDST, ARM_HAVE_NEON_DI_LDST): > Likewise. > (ARM_HAVE_NEON_V2DI_LDST): Likewise. > (ARM_HAVE_V8QI_LDST, ARM_HAVE_V16QI_LDST): Likewise. > (ARM_HAVE_V4HI_LDST, ARM_HAVE_V8HI_LDST): Likewise. > (ARM_HAVE_V2SI_LDST, ARM_HAVE_V4SI_LDST, > ARM_HAVE_V4HF_LDST): Likewise. > (ARM_HAVE_V8HF_LDST, ARM_HAVE_V4BF_LDST, > ARM_HAVE_V8BF_LDST): Likewise. > (ARM_HAVE_V2SF_LDST, ARM_HAVE_V4SF_LDST, > ARM_HAVE_DI_LDST): Likewise. > (ARM_HAVE_V2DI_LDST): Likewise. > * config/arm/mve.md (*movmisalign<mode>_mve_store): New > pattern. > (*movmisalign<mode>_mve_load): New pattern. > * config/arm/neon.md (movmisalign<mode>): Move to ... > * config/arm/vec-common.md: ... here. > > PR target/97875 > gcc/testsuite/ > * gcc.target/arm/simd/mve-vneg.c: Update test. > --- > gcc/config/arm/arm.h | 40 > ++++++++++++++++++++++++++++ > gcc/config/arm/mve.md | 25 +++++++++++++++++ > gcc/config/arm/neon.md | 25 ----------------- > gcc/config/arm/vec-common.md | 24 +++++++++++++++++ > gcc/testsuite/gcc.target/arm/simd/mve-vneg.c | 3 +++ > 5 files changed, 92 insertions(+), 25 deletions(-) > > diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h > index 4a63d33..d44e0c6 100644 > --- a/gcc/config/arm/arm.h > +++ b/gcc/config/arm/arm.h > @@ -1151,6 +1151,46 @@ extern const int arm_arch_cde_coproc_bits[]; > #define ARM_HAVE_V8HF_ARITH (ARM_HAVE_NEON_V8HF_ARITH || > TARGET_HAVE_MVE_FLOAT) > #define ARM_HAVE_V4SF_ARITH (ARM_HAVE_NEON_V4SF_ARITH || > TARGET_HAVE_MVE_FLOAT) > > +/* The conditions under which vector modes are supported by load/store > + instructions using Neon. */ > + > +#define ARM_HAVE_NEON_V8QI_LDST TARGET_NEON > +#define ARM_HAVE_NEON_V16QI_LDST TARGET_NEON > +#define ARM_HAVE_NEON_V4HI_LDST TARGET_NEON > +#define ARM_HAVE_NEON_V8HI_LDST TARGET_NEON > +#define ARM_HAVE_NEON_V2SI_LDST TARGET_NEON > +#define ARM_HAVE_NEON_V4SI_LDST TARGET_NEON > +#define ARM_HAVE_NEON_V4HF_LDST TARGET_NEON_FP16INST > +#define ARM_HAVE_NEON_V8HF_LDST TARGET_NEON_FP16INST > +#define ARM_HAVE_NEON_V4BF_LDST TARGET_BF16_SIMD > +#define ARM_HAVE_NEON_V8BF_LDST TARGET_BF16_SIMD > +#define ARM_HAVE_NEON_V2SF_LDST TARGET_NEON > +#define ARM_HAVE_NEON_V4SF_LDST TARGET_NEON > +#define ARM_HAVE_NEON_DI_LDST TARGET_NEON > +#define ARM_HAVE_NEON_V2DI_LDST TARGET_NEON > + > +/* The conditions under which vector modes are supported by load/store > + instructions by any vector extension. */ > + > +#define ARM_HAVE_V8QI_LDST (ARM_HAVE_NEON_V8QI_LDST || > TARGET_REALLY_IWMMXT) > +#define ARM_HAVE_V4HI_LDST (ARM_HAVE_NEON_V4HI_LDST || > TARGET_REALLY_IWMMXT) > +#define ARM_HAVE_V2SI_LDST (ARM_HAVE_NEON_V2SI_LDST || > TARGET_REALLY_IWMMXT) > + > +#define ARM_HAVE_V16QI_LDST (ARM_HAVE_NEON_V16QI_LDST || > TARGET_HAVE_MVE) > +#define ARM_HAVE_V8HI_LDST (ARM_HAVE_NEON_V8HI_LDST || > TARGET_HAVE_MVE) > +#define ARM_HAVE_V4SI_LDST (ARM_HAVE_NEON_V4SI_LDST || > TARGET_HAVE_MVE) > +#define ARM_HAVE_DI_LDST ARM_HAVE_NEON_DI_LDST > +#define ARM_HAVE_V2DI_LDST ARM_HAVE_NEON_V2DI_LDST > + > +#define ARM_HAVE_V4HF_LDST ARM_HAVE_NEON_V4HF_LDST > +#define ARM_HAVE_V2SF_LDST ARM_HAVE_NEON_V2SF_LDST > + > +#define ARM_HAVE_V4BF_LDST ARM_HAVE_NEON_V4BF_LDST > +#define ARM_HAVE_V8BF_LDST ARM_HAVE_NEON_V8BF_LDST > + > +#define ARM_HAVE_V8HF_LDST (ARM_HAVE_NEON_V8HF_LDST || > TARGET_HAVE_MVE_FLOAT) > +#define ARM_HAVE_V4SF_LDST (ARM_HAVE_NEON_V4SF_LDST || > TARGET_HAVE_MVE_FLOAT) > + > /* The register numbers in sequence, for passing to arm_gen_load_multiple. > */ > extern int arm_regs_in_sequence[]; > > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md > index b4c5a1e2..673a83c 100644 > --- a/gcc/config/arm/mve.md > +++ b/gcc/config/arm/mve.md > @@ -10937,3 +10937,28 @@ (define_insn "arm_vcx3q<a>_p_v16qi" > [(set_attr "type" "coproc") > (set_attr "length" "8")] > ) > + > +(define_insn "*movmisalign<mode>_mve_store" > + [(set (match_operand:MVE_VLD_ST 0 "neon_permissive_struct_operand" > "=Um") > + (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 > "s_register_operand" " w")] > + UNSPEC_MISALIGNED_ACCESS))] > + "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) > + || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE > (<MODE>mode)) > + && !BYTES_BIG_ENDIAN && unaligned_access" > + "vstr<V_sz_elem1>.<V_sz_elem>\t%q1, %E0" > + [(set_attr "type" "mve_store") > + (set_attr "length" "4")] No need to specify the length here and in the other pattern. It's 4 by default. Ok for master as long as bootstrap and testing on an arm-none-linux-gnueabihf target is clean (to ensure the Neon-related refactoring didn't hurt anything) Thanks, Kyrill > +) > + > + > +(define_insn "*movmisalign<mode>_mve_load" > + [(set (match_operand:MVE_VLD_ST 0 "s_register_operand" > "=w") > + (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 > "neon_permissive_struct_operand" " Um")] > + UNSPEC_MISALIGNED_ACCESS))] > + "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) > + || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE > (<MODE>mode)) > + && !BYTES_BIG_ENDIAN && unaligned_access" > + "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1" > + [(set_attr "type" "mve_load") > + (set_attr "length" "4")] > +) > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md > index d2e92ba..50220be 100644 > --- a/gcc/config/arm/neon.md > +++ b/gcc/config/arm/neon.md > @@ -280,31 +280,6 @@ (define_split > neon_disambiguate_copy (operands, dest, src, 4); > }) > > -(define_expand "movmisalign<mode>" > - [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand") > - (unspec:VDQX [(match_operand:VDQX 1 > "neon_perm_struct_or_reg_operand")] > - UNSPEC_MISALIGNED_ACCESS))] > - "TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access" > -{ > - rtx adjust_mem; > - /* This pattern is not permitted to fail during expansion: if both > arguments > - are non-registers (e.g. memory := constant, which can be created by the > - auto-vectorizer), force operand 1 into a register. */ > - if (!s_register_operand (operands[0], <MODE>mode) > - && !s_register_operand (operands[1], <MODE>mode)) > - operands[1] = force_reg (<MODE>mode, operands[1]); > - > - if (s_register_operand (operands[0], <MODE>mode)) > - adjust_mem = operands[1]; > - else > - adjust_mem = operands[0]; > - > - /* Legitimize address. */ > - if (!neon_vector_mem_operand (adjust_mem, 2, true)) > - XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0)); > - > -}) > - > (define_insn "*movmisalign<mode>_neon_store" > [(set (match_operand:VDX 0 "neon_permissive_struct_operand" > "=Um") > (unspec:VDX [(match_operand:VDX 1 "s_register_operand" " w")] > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec- > common.md > index 2d0932b..f6a79e2 100644 > --- a/gcc/config/arm/vec-common.md > +++ b/gcc/config/arm/vec-common.md > @@ -205,3 +205,27 @@ (define_expand "neg<mode>2" > (neg:VDQWH (match_operand:VDQWH 1 "s_register_operand" "")))] > "ARM_HAVE_<MODE>_ARITH" > ) > + > +(define_expand "movmisalign<mode>" > + [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand") > + (unspec:VDQX [(match_operand:VDQX 1 > "neon_perm_struct_or_reg_operand")] > + UNSPEC_MISALIGNED_ACCESS))] > + "ARM_HAVE_<MODE>_LDST && !BYTES_BIG_ENDIAN && > unaligned_access" > +{ > + rtx adjust_mem; > + /* This pattern is not permitted to fail during expansion: if both > arguments > + are non-registers (e.g. memory := constant, which can be created by the > + auto-vectorizer), force operand 1 into a register. */ > + if (!s_register_operand (operands[0], <MODE>mode) > + && !s_register_operand (operands[1], <MODE>mode)) > + operands[1] = force_reg (<MODE>mode, operands[1]); > + > + if (s_register_operand (operands[0], <MODE>mode)) > + adjust_mem = operands[1]; > + else > + adjust_mem = operands[0]; > + > + /* Legitimize address. */ > + if (!neon_vector_mem_operand (adjust_mem, 2, true)) > + XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0)); > +}) > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c > b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c > index afd0d60..7945a06 100644 > --- a/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vneg.c > @@ -47,3 +47,6 @@ FUNC(f, float, 16, 8, -, vneg) > functions above. */ > /* { dg-final { scan-assembler-times {vneg.s[0-9]+ q[0-9]+, q[0-9]+} 6 } } > */ > /* { dg-final { scan-assembler-times {vneg.f[0-9]+ q[0-9]+, q[0-9]+} 2 } } > */ > +/* { dg-final { scan-assembler-times {vldr[bhw].[0-9]+\tq[0-9]+} 8 } } */ > +/* { dg-final { scan-assembler-times {vstr[bhw].[0-9]+\tq[0-9]+} 8 } } */ > +/* { dg-final { scan-assembler-not {orr\tr[0-9]+, r[0-9]+, r[0-9]+} } } */ > -- > 2.7.4