Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> Similar to the 1/2 patch but adds additional back-end specific folding for
> cases where the register sequence was created as a result of RTL
> optimizations.
>
> Concretely:
>
> #include <arm_neon.h>
>
> unsigned int foor (uint32x4_t x)
> {
>   return x[1] >> 16;
> }
>
> generates:
>
> foor:
>         umov    w0, v0.h[3]
>         ret
>
> instead of
>
> foor:
>         umov    w0, v0.s[1]
>         lsr     w0, w0, 16
>         ret
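(For context: on little-endian this fold is equivalent to the lane read
below.  This is only an untested sketch to show the intended mapping;
foor_equiv is an illustrative name, not part of the patch.)

  #include <arm_neon.h>

  /* uint32 lane 1 covers uint16 lanes 2 and 3 on little-endian, so the
     ">> 16" selects uint16 lane 3 directly.  */
  unsigned int foor_equiv (uint32x4_t x)
  {
    return vgetq_lane_u16 (vreinterpretq_u16_u32 (x), 3);
  }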
The same thing ought to work for smov, so it would be good to do both.
That would also make the split between the original and new patterns more
obvious: left shift for the old pattern, right shift for the new pattern.

> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
>         left and right ones.
>         * config/aarch64/constraints.md (Usl): New.
>         * config/aarch64/iterators.md (SHIFT_arith, LSHIFTRT): New.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/shift-read.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c333fb1f72725992bb304c560f1245a242d5192d..6aa1fb4be003f2027d63ac69fd314c2bbc876258 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5493,7 +5493,7 @@ (define_insn "*rol<mode>3_insn"
>  ;; zero_extend version of shifts
>  (define_insn "*<optab>si3_insn_uxtw"
>    [(set (match_operand:DI 0 "register_operand" "=r,r")
> -        (zero_extend:DI (SHIFT_no_rotate:SI
> +        (zero_extend:DI (SHIFT_arith:SI
>          (match_operand:SI 1 "register_operand" "r,r")
>          (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
>    ""
> @@ -5528,6 +5528,60 @@ (define_insn "*rolsi3_insn_uxtw"
>    [(set_attr "type" "rotate_imm")]
>  )
>
> +(define_insn "*<optab>si3_insn2_uxtw"
> +  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")

Is the "?" justified?  It seems odd to penalise a native,
single-instruction r->r operation in favour of a w->r operation.

> +        (zero_extend:DI (LSHIFTRT:SI
> +         (match_operand:SI 1 "register_operand" "w,r,r")
> +         (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
> +  ""
> +  {
> +    switch (which_alternative)
> +      {
> +        case 0:
> +          {
> +            machine_mode dest, vec_mode;
> +            int val = INTVAL (operands[2]);
> +            int size = 32 - val;
> +            if (size == 16)
> +              dest = HImode;
> +            else if (size == 8)
> +              dest = QImode;
> +            else
> +              gcc_unreachable ();
> +
> +            /* Get nearest 64-bit vector mode.  */
> +            int nunits = 64 / size;
> +            auto vector_mode
> +              = mode_for_vector (as_a <scalar_mode> (dest), nunits);
> +            if (!vector_mode.exists (&vec_mode))
> +              gcc_unreachable ();
> +            operands[1] = gen_rtx_REG (vec_mode, REGNO (operands[1]));
> +            operands[2] = gen_int_mode (val / size, SImode);
> +
> +            /* Ideally we just call aarch64_get_lane_zero_extend but reload
> +               gets into a weird loop due to a mov of w -> r being present
> +               most of the time this instruction applies.  */
> +            switch (dest)
> +              {
> +              case QImode:
> +                return "umov\\t%w0, %1.b[%2]";
> +              case HImode:
> +                return "umov\\t%w0, %1.h[%2]";
> +              default:
> +                gcc_unreachable ();
> +              }

Doesn't this reduce to something like:

        if (size == 16)
          return "umov\\t%w0, %1.h[1]";
        if (size == 8)
          return "umov\\t%w0, %1.b[3]";
        gcc_unreachable ();

?  We should print %1 correctly as vN even with its original type.
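To illustrate the smov point above: the signed counterpart of foor would be
something like the sketch below (untested; foor_s is just an illustrative
name).  On little-endian I'd expect it to fold to a single
"smov w0, v0.h[3]" in the same way.

  #include <arm_neon.h>

  /* Arithmetic >> 16 of int32 lane 1 reads the sign-extended upper half,
     i.e. int16 lane 3 on little-endian.  */
  int foor_s (int32x4_t x)
  {
    return x[1] >> 16;
  }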
Thanks,
Richard

> +          }
> +        case 1:
> +          return "<shift>\\t%w0, %w1, %2";
> +        case 2:
> +          return "<shift>\\t%w0, %w1, %w2";
> +        default:
> +          gcc_unreachable ();
> +      }
> +  }
> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
> +)
> +
>  (define_insn "*<optab><mode>3_insn"
>    [(set (match_operand:SHORT 0 "register_operand" "=r")
>         (ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -166,6 +166,14 @@ (define_constraint "Uss"
>    (and (match_code "const_int")
>         (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
>
> +(define_constraint "Usl"
> +  "@internal
> +  A constraint that matches an immediate shift constant in SImode that has an
> +  exact mode available to use."
> +  (and (match_code "const_int")
> +       (and (match_test "satisfies_constraint_Uss (op)")
> +            (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
> +
>  (define_constraint "Usn"
>   "A constant that can be used with a CCMN operation (once negated)."
>    (and (match_code "const_int")
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index e904407b2169e589b7007ff966b2d9347a6d0fd2..bf16207225e3a4f1f20ed6f54321bccbbf15d73f 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -2149,8 +2149,11 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y")
>                                                       (VNx4SF "x")])
>  ;; This code iterator allows the various shifts supported on the core
>  (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
>
> -;; This code iterator allows all shifts except for rotates.
> -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
> +;; This code iterator allows arithmetic shifts
> +(define_code_iterator SHIFT_arith [ashift ashiftrt])
> +
> +;; Singleton code iterator for only logical right shift.
> +(define_code_iterator LSHIFTRT [lshiftrt])
>
>  ;; This code iterator allows the shifts supported in arithmetic instructions
>  (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read.c b/gcc/testsuite/gcc.target/aarch64/shift-read.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read.c
> @@ -0,0 +1,85 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor:
> +**      umov    w0, v0.h\[3\]
> +**      ret
> +*/
> +unsigned int foor (uint32x4_t x)
> +{
> +  return x[1] >> 16;
> +}
> +
> +/*
> +** fool:
> +**      umov    w0, v0.s\[1\]
> +**      lsl     w0, w0, 16
> +**      ret
> +*/
> +unsigned int fool (uint32x4_t x)
> +{
> +  return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +**      umov    w0, v0.h\[7\]
> +**      ret
> +*/
> +unsigned short foor2 (uint32x4_t x)
> +{
> +  return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +**      fmov    w0, s0
> +**      lsl     w0, w0, 16
> +**      ret
> +*/
> +unsigned int fool2 (uint32x4_t x)
> +{
> +  return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**      addv    s0, v0.4s
> +**      fmov    w0, s0
> +**      lsr     w1, w0, 16
> +**      add     w0, w1, w0, uxth
> +**      ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +**      lsr     w0, w0, 16
> +**      ret
> +*/
> +unsigned short foo (unsigned x)
> +{
> +  return x >> 16;
> +}
> +
> +/*
> +** foo2:
> +**      ...
> +**      umov    w0, v[0-8]+.h\[1\]
> +**      ret
> +*/
> +unsigned short foo2 (v4si x)
> +{
> +  int y = x[0] + x[1];
> +  return y >> 16;
> +}
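FWIW, if the smov handling goes in as well, a companion test along these
lines might be worth adding.  This is only an untested sketch: bars/bars2
are made-up names and the expected bodies are my guess at the output on
little-endian rather than verified compiler output.

  /*
  ** bars:
  **      smov    w0, v0.h\[3\]
  **      ret
  */
  int bars (int32x4_t x)
  {
    return x[1] >> 16;
  }

  /*
  ** bars2:
  **      smov    w0, v0.b\[7\]
  **      ret
  */
  int bars2 (int32x4_t x)
  {
    return x[1] >> 24;
  }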