Kyrylo Tkachov <ktkac...@nvidia.com> writes:
> Hi all,
>
> The MD pattern for the XAR instruction in SVE2 is currently expressed with
> non-canonical RTL by using a ROTATERT code with a constant rotate amount.
> Fix it by using the left ROTATE code.  This necessitates adjusting the rotate
> amount during expand. 
>
> Additionally, as the SVE2 XAR instruction is unpredicated and can handle all
> element sizes from .b to .d, it is a good fit for implementing the XOR+ROTATE
> operation for Advanced SIMD modes where the TARGET_SHA3 cannot be used
> (that can only handle V2DImode operands).  Therefore let's extend the accepted
> modes of the SVE2 pattern to include the Advanced SIMD integer modes.
>
> This leads to some tests for the svxar* intrinsics to fail because they now
> simplify to a plain EOR when the rotate amount is the width of the element.
> This simplification is desirable (EOR instructions have better or equal
> throughput than XAR, and they are non-destructive of their input) so the
> tests are adjusted.
>
> For V2DImode XAR operations we should prefer the Advanced SIMD version when
> it is available (TARGET_SHA3) because it is non-destructive, so restrict the
> SVE2 pattern accordingly.  Tests are added to confirm this.
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
> Ok for mainline?
> Thanks,
> Kyrill
>
> Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
>
> gcc/
>
>       * config/aarch64/iterators.md (SVE_ASIMD_FULL_I): New mode iterator.
>       * config/aarch64/aarch64-sve2.md (@aarch64_sve2_xar<mode>):
>       Use SVE_ASIMD_FULL_I modes.  Use ROTATE code for the rotate step.
>       Adjust output logic.
>       * config/aarch64/aarch64-sve-builtins-sve2.cc (svxar_impl): Define.
>       (svxar): Use the above.
>
> gcc/testsuite/
>
>       * gcc.target/aarch64/xar_neon_modes.c: New test.
>       * gcc.target/aarch64/xar_v2di_nonsve.c: Likewise.
>       * gcc.target/aarch64/sve2/acle/asm/xar_s16.c: Scan for EOR rather than
>       XAR.
>       * gcc.target/aarch64/sve2/acle/asm/xar_s32.c: Likewise.
>       * gcc.target/aarch64/sve2/acle/asm/xar_s64.c: Likewise.
>       * gcc.target/aarch64/sve2/acle/asm/xar_s8.c: Likewise.
>       * gcc.target/aarch64/sve2/acle/asm/xar_u16.c: Likewise.
>       * gcc.target/aarch64/sve2/acle/asm/xar_u32.c: Likewise.
>       * gcc.target/aarch64/sve2/acle/asm/xar_u64.c: Likewise.
>       * gcc.target/aarch64/sve2/acle/asm/xar_u8.c: Likewise.
Looks great to me.  Just one very minor nit:

> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc 
> b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> index ddd6e466ee3..62c17281ec7 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> @@ -90,6 +90,23 @@ public:
>    }
>  };
>  
> +class svxar_impl : public function_base
> +{
> +public:
> +  rtx
> +  expand (function_expander &e) const override
> +  {
> +    /* aarch64_sve2_xar represents this operation with a left-rotate RTX.
> +       Convert the right-rotate amount from the intrinsic to fit this.  */
> +    machine_mode mode = e.vector_mode (0);
> +    HOST_WIDE_INT rot = GET_MODE_UNIT_BITSIZE (mode)
> +                     - INTVAL (e.args[2]);
> +    e.args[2]
> +      = aarch64_simd_gen_const_vector_dup (mode, rot);

The split line seems unnecessary.

OK with that change as far as I'm concerned.

Thanks,
Richard

> +    return e.use_exact_insn (code_for_aarch64_sve2_xar (mode));
> +  }
> +};
> +
>  class svcdot_impl : public function_base
>  {
>  public:
> @@ -773,6 +790,6 @@ FUNCTION (svwhilege, while_comparison, (UNSPEC_WHILEGE, 
> UNSPEC_WHILEHS))
>  FUNCTION (svwhilegt, while_comparison, (UNSPEC_WHILEGT, UNSPEC_WHILEHI))
>  FUNCTION (svwhilerw, svwhilerw_svwhilewr_impl, (UNSPEC_WHILERW))
>  FUNCTION (svwhilewr, svwhilerw_svwhilewr_impl, (UNSPEC_WHILEWR))
> -FUNCTION (svxar, CODE_FOR_MODE0 (aarch64_sve2_xar),)
> +FUNCTION (svxar, svxar_impl,)
>  
>  } /* end namespace aarch64_sve */
> diff --git a/gcc/config/aarch64/aarch64-sve2.md 
> b/gcc/config/aarch64/aarch64-sve2.md
> index 5f2697c3179..8047f405a17 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -1266,18 +1266,28 @@
>  ;; - XAR
>  ;; -------------------------------------------------------------------------
>  
> +;; Also allow the Advanced SIMD modes as the SVE2 XAR instruction
> +;; can handle more element sizes than the TARGET_SHA3 one from Advanced SIMD.
> +;; Don't allow the V2DImode use here unless !TARGET_SHA3 as the Advanced SIMD
> +;; version should be preferred when available as it is non-destructive on its
> +;; input.
>  (define_insn "@aarch64_sve2_xar<mode>"
> -  [(set (match_operand:SVE_FULL_I 0 "register_operand")
> -     (rotatert:SVE_FULL_I
> -       (xor:SVE_FULL_I
> -         (match_operand:SVE_FULL_I 1 "register_operand")
> -         (match_operand:SVE_FULL_I 2 "register_operand"))
> -       (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")))]
> -  "TARGET_SVE2"
> -  {@ [ cons: =0 , 1  , 2 ; attrs: movprfx ]
> -     [ w        , %0 , w ; *              ] xar\t%0.<Vetype>, %0.<Vetype>, 
> %2.<Vetype>, #%3
> -     [ ?&w      , w  , w ; yes            ] movprfx\t%0, 
> %1\;xar\t%0.<Vetype>, %0.<Vetype>, %2.<Vetype>, #%3
> +  [(set (match_operand:SVE_ASIMD_FULL_I 0 "register_operand" "=w,?&w")
> +     (rotate:SVE_ASIMD_FULL_I
> +       (xor:SVE_ASIMD_FULL_I
> +         (match_operand:SVE_ASIMD_FULL_I 1 "register_operand" "%0,w")
> +         (match_operand:SVE_ASIMD_FULL_I 2 "register_operand" "w,w"))
> +       (match_operand:SVE_ASIMD_FULL_I 3 "aarch64_simd_lshift_imm")))]
> +  "TARGET_SVE2 && !(<MODE>mode == V2DImode && TARGET_SHA3)"
> +  {
> +    operands[3]
> +      = GEN_INT (GET_MODE_UNIT_BITSIZE (<MODE>mode)
> +              - INTVAL (unwrap_const_vec_duplicate (operands[3])));
> +    if (which_alternative == 0)
> +      return "xar\t%Z0.<Vetype>, %Z0.<Vetype>, %Z2.<Vetype>, #%3";
> +    return "movprfx\t%Z0, %Z1\;xar\t%Z0.<Vetype>, %Z0.<Vetype>, 
> %Z2.<Vetype>, #%3";
>    }
> +  [(set_attr "movprfx" "*,yes")]
>  )
>  
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 0bc98315bb6..8269b0cdcd9 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -446,6 +446,9 @@
>  ;; All fully-packed SVE integer vector modes.
>  (define_mode_iterator SVE_FULL_I [VNx16QI VNx8HI VNx4SI VNx2DI])
>  
> +;; All fully-packed SVE integer and Advanced SIMD integer modes.
> +(define_mode_iterator SVE_ASIMD_FULL_I [SVE_FULL_I VDQ_I])
> +
>  ;; All fully-packed SVE floating-point vector modes.
>  (define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF])
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s16.c 
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s16.c
> index 34351d52718..f69ba3f7b06 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s16.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s16.c
> @@ -70,7 +70,11 @@ TEST_UNIFORM_Z (xar_2_s16_untied, svint16_t,
>  
>  /*
>  ** xar_16_s16_tied1:
> -**   xar     z0\.h, z0\.h, z1\.h, #16
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_16_s16_tied1, svint16_t,
> @@ -79,7 +83,11 @@ TEST_UNIFORM_Z (xar_16_s16_tied1, svint16_t,
>  
>  /*
>  ** xar_16_s16_tied2:
> -**   xar     z0\.h, z0\.h, z1\.h, #16
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_16_s16_tied2, svint16_t,
> @@ -89,11 +97,9 @@ TEST_UNIFORM_Z (xar_16_s16_tied2, svint16_t,
>  /*
>  ** xar_16_s16_untied:
>  ** (
> -**   movprfx z0, z1
> -**   xar     z0\.h, z0\.h, z2\.h, #16
> +**   eor     z0\.d, z1\.d, z2\.d
>  ** |
> -**   movprfx z0, z2
> -**   xar     z0\.h, z0\.h, z1\.h, #16
> +**   eor     z0\.d, z2\.d, z1\.d
>  ** )
>  **   ret
>  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s32.c 
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s32.c
> index 366a6172807..540f7b875ec 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s32.c
> @@ -70,7 +70,11 @@ TEST_UNIFORM_Z (xar_2_s32_untied, svint32_t,
>  
>  /*
>  ** xar_32_s32_tied1:
> -**   xar     z0\.s, z0\.s, z1\.s, #32
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_32_s32_tied1, svint32_t,
> @@ -79,7 +83,11 @@ TEST_UNIFORM_Z (xar_32_s32_tied1, svint32_t,
>  
>  /*
>  ** xar_32_s32_tied2:
> -**   xar     z0\.s, z0\.s, z1\.s, #32
> +** (
> +**   eor     z0\.d, z0\.d, z1\.d
> +** |
> +**   eor     z0\.d, z1\.d, z0\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_32_s32_tied2, svint32_t,
> @@ -89,11 +97,9 @@ TEST_UNIFORM_Z (xar_32_s32_tied2, svint32_t,
>  /*
>  ** xar_32_s32_untied:
>  ** (
> -**   movprfx z0, z1
> -**   xar     z0\.s, z0\.s, z2\.s, #32
> +**   eor     z0\.d, z1\.d, z2\.d
>  ** |
> -**   movprfx z0, z2
> -**   xar     z0\.s, z0\.s, z1\.s, #32
> +**   eor     z0\.d, z2\.d, z1\.d
>  ** )
>  **   ret
>  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s64.c 
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s64.c
> index dedda2ed044..9491dbdb848 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s64.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s64.c
> @@ -70,7 +70,11 @@ TEST_UNIFORM_Z (xar_2_s64_untied, svint64_t,
>  
>  /*
>  ** xar_64_s64_tied1:
> -**   xar     z0\.d, z0\.d, z1\.d, #64
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_64_s64_tied1, svint64_t,
> @@ -79,7 +83,11 @@ TEST_UNIFORM_Z (xar_64_s64_tied1, svint64_t,
>  
>  /*
>  ** xar_64_s64_tied2:
> -**   xar     z0\.d, z0\.d, z1\.d, #64
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_64_s64_tied2, svint64_t,
> @@ -89,11 +97,9 @@ TEST_UNIFORM_Z (xar_64_s64_tied2, svint64_t,
>  /*
>  ** xar_64_s64_untied:
>  ** (
> -**   movprfx z0, z1
> -**   xar     z0\.d, z0\.d, z2\.d, #64
> +**   eor     z0\.d, z1\.d, z2\.d
>  ** |
> -**   movprfx z0, z2
> -**   xar     z0\.d, z0\.d, z1\.d, #64
> +**   eor     z0\.d, z2\.d, z1\.d
>  ** )
>  **   ret
>  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s8.c 
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s8.c
> index 904352b93da..e62e5bca5ba 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_s8.c
> @@ -70,7 +70,11 @@ TEST_UNIFORM_Z (xar_2_s8_untied, svint8_t,
>  
>  /*
>  ** xar_8_s8_tied1:
> -**   xar     z0\.b, z0\.b, z1\.b, #8
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_8_s8_tied1, svint8_t,
> @@ -79,7 +83,11 @@ TEST_UNIFORM_Z (xar_8_s8_tied1, svint8_t,
>  
>  /*
>  ** xar_8_s8_tied2:
> -**   xar     z0\.b, z0\.b, z1\.b, #8
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_8_s8_tied2, svint8_t,
> @@ -89,11 +97,9 @@ TEST_UNIFORM_Z (xar_8_s8_tied2, svint8_t,
>  /*
>  ** xar_8_s8_untied:
>  ** (
> -**   movprfx z0, z1
> -**   xar     z0\.b, z0\.b, z2\.b, #8
> +**   eor     z0\.d, z1\.d, z2\.d
>  ** |
> -**   movprfx z0, z2
> -**   xar     z0\.b, z0\.b, z1\.b, #8
> +**   eor     z0\.d, z2\.d, z1\.d
>  ** )
>  **   ret
>  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u16.c 
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u16.c
> index c7b9665aeed..6269145bc6d 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u16.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u16.c
> @@ -70,7 +70,11 @@ TEST_UNIFORM_Z (xar_2_u16_untied, svuint16_t,
>  
>  /*
>  ** xar_16_u16_tied1:
> -**   xar     z0\.h, z0\.h, z1\.h, #16
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_16_u16_tied1, svuint16_t,
> @@ -79,7 +83,11 @@ TEST_UNIFORM_Z (xar_16_u16_tied1, svuint16_t,
>  
>  /*
>  ** xar_16_u16_tied2:
> -**   xar     z0\.h, z0\.h, z1\.h, #16
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_16_u16_tied2, svuint16_t,
> @@ -89,11 +97,9 @@ TEST_UNIFORM_Z (xar_16_u16_tied2, svuint16_t,
>  /*
>  ** xar_16_u16_untied:
>  ** (
> -**   movprfx z0, z1
> -**   xar     z0\.h, z0\.h, z2\.h, #16
> +**   eor     z0\.d, z1\.d, z2\.d
>  ** |
> -**   movprfx z0, z2
> -**   xar     z0\.h, z0\.h, z1\.h, #16
> +**   eor     z0\.d, z2\.d, z1\.d
>  ** )
>  **   ret
>  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u32.c 
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u32.c
> index 115ead7701c..99efd14e1ed 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u32.c
> @@ -70,7 +70,11 @@ TEST_UNIFORM_Z (xar_2_u32_untied, svuint32_t,
>  
>  /*
>  ** xar_32_u32_tied1:
> -**   xar     z0\.s, z0\.s, z1\.s, #32
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_32_u32_tied1, svuint32_t,
> @@ -79,7 +83,11 @@ TEST_UNIFORM_Z (xar_32_u32_tied1, svuint32_t,
>  
>  /*
>  ** xar_32_u32_tied2:
> -**   xar     z0\.s, z0\.s, z1\.s, #32
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_32_u32_tied2, svuint32_t,
> @@ -89,11 +97,9 @@ TEST_UNIFORM_Z (xar_32_u32_tied2, svuint32_t,
>  /*
>  ** xar_32_u32_untied:
>  ** (
> -**   movprfx z0, z1
> -**   xar     z0\.s, z0\.s, z2\.s, #32
> +**   eor     z0\.d, z1\.d, z2\.d
>  ** |
> -**   movprfx z0, z2
> -**   xar     z0\.s, z0\.s, z1\.s, #32
> +**   eor     z0\.d, z2\.d, z1\.d
>  ** )
>  **   ret
>  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u64.c 
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u64.c
> index 1d0d90e90d6..5c770ffdadb 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u64.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u64.c
> @@ -70,7 +70,11 @@ TEST_UNIFORM_Z (xar_2_u64_untied, svuint64_t,
>  
>  /*
>  ** xar_64_u64_tied1:
> -**   xar     z0\.d, z0\.d, z1\.d, #64
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_64_u64_tied1, svuint64_t,
> @@ -79,7 +83,11 @@ TEST_UNIFORM_Z (xar_64_u64_tied1, svuint64_t,
>  
>  /*
>  ** xar_64_u64_tied2:
> -**   xar     z0\.d, z0\.d, z1\.d, #64
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_64_u64_tied2, svuint64_t,
> @@ -89,11 +97,9 @@ TEST_UNIFORM_Z (xar_64_u64_tied2, svuint64_t,
>  /*
>  ** xar_64_u64_untied:
>  ** (
> -**   movprfx z0, z1
> -**   xar     z0\.d, z0\.d, z2\.d, #64
> +**   eor     z0\.d, z1\.d, z2\.d
>  ** |
> -**   movprfx z0, z2
> -**   xar     z0\.d, z0\.d, z1\.d, #64
> +**   eor     z0\.d, z2\.d, z1\.d
>  ** )
>  **   ret
>  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u8.c 
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u8.c
> index 3b6161729cb..5ae5323a08a 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/xar_u8.c
> @@ -70,7 +70,11 @@ TEST_UNIFORM_Z (xar_2_u8_untied, svuint8_t,
>  
>  /*
>  ** xar_8_u8_tied1:
> -**   xar     z0\.b, z0\.b, z1\.b, #8
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_8_u8_tied1, svuint8_t,
> @@ -79,7 +83,11 @@ TEST_UNIFORM_Z (xar_8_u8_tied1, svuint8_t,
>  
>  /*
>  ** xar_8_u8_tied2:
> -**   xar     z0\.b, z0\.b, z1\.b, #8
> +** (
> +**   eor     z0\.d, z1\.d, z0\.d
> +** |
> +**   eor     z0\.d, z0\.d, z1\.d
> +** )
>  **   ret
>  */
>  TEST_UNIFORM_Z (xar_8_u8_tied2, svuint8_t,
> @@ -89,11 +97,9 @@ TEST_UNIFORM_Z (xar_8_u8_tied2, svuint8_t,
>  /*
>  ** xar_8_u8_untied:
>  ** (
> -**   movprfx z0, z1
> -**   xar     z0\.b, z0\.b, z2\.b, #8
> +**   eor     z0\.d, z1\.d, z2\.d
>  ** |
> -**   movprfx z0, z2
> -**   xar     z0\.b, z0\.b, z1\.b, #8
> +**   eor     z0\.d, z2\.d, z1\.d
>  ** )
>  **   ret
>  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/xar_neon_modes.c 
> b/gcc/testsuite/gcc.target/aarch64/xar_neon_modes.c
> new file mode 100644
> index 00000000000..750fbcfc48a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/xar_neon_modes.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#pragma GCC target "+sve2+nosha3"
> +
> +typedef char __attribute__ ((vector_size (16))) v16qi;
> +typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
> +typedef unsigned int __attribute__ ((vector_size (16))) v4si;
> +typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
> +
> +v16qi
> +xar_v16qi (v16qi a, v16qi b) {
> +  v16qi c = a ^ b;
> +  return (c << 2) ^ (c >> 6);
> +}
> +/* { dg-final { scan-assembler {\txar\tz0.b, z[0-9]+.b, z[0-9]+.b, #6} } } */
> +
> +v8hi
> +xar_v8hi (v8hi a, v8hi b) {
> +  v8hi c = a ^ b;
> +  return (c << 13) ^ (c >> 3);
> +}
> +/* { dg-final { scan-assembler {\txar\tz0.h, z[0-9]+.h, z[0-9]+.h, #3} } } */
> +
> +v4si
> +xar_v4si (v4si a, v4si b) {
> +  v4si c = a ^ b;
> +  return (c << 9) ^ (c >> 23);
> +}
> +/* { dg-final { scan-assembler {\txar\tz0.s, z[0-9]+.s, z[0-9]+.s, #23} } } 
> */
> +
> +/* When +sha3 for Advanced SIMD is not available we should still use the
> +   SVE2 form of XAR.  */
> +v2di
> +xar_v2di (v2di a, v2di b) {
> +  v2di c = a ^ b;
> +  return (c << 22) ^ (c >> 42);
> +}
> +/* { dg-final { scan-assembler {\txar\tz0.d, z[0-9]+.d, z[0-9]+.d, #42} } } 
> */
> diff --git a/gcc/testsuite/gcc.target/aarch64/xar_v2di_nonsve.c 
> b/gcc/testsuite/gcc.target/aarch64/xar_v2di_nonsve.c
> new file mode 100644
> index 00000000000..b0f1a97222b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/xar_v2di_nonsve.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#pragma GCC target "+sve2+sha3"
> +
> +typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
> +
> +/* Both +sve2 and +sha3 have V2DImode XAR instructions, but we should
> +   prefer the Advanced SIMD one when both are available.  */
> +v2di
> +xar_v2di (v2di a, v2di b) {
> +  v2di c = a ^ b;
> +  return (c << 22) ^ (c >> 42);
> +}
> +/* { dg-final { scan-assembler {\txar\tv0.2d, v[0-9]+.2d, v[0-9]+.2d, 42} } 
> } */
> +

Reply via email to