Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> Similar to the 1/2 patch but adds additional back-end specific folding for
> cases where the register sequence was created as a result of RTL
> optimizations.
>
> Concretely:
>
> #include <arm_neon.h>
>
> unsigned int foor (uint32x4_t x)
> {
>   return x[1] >> 16;
> }
>
> generates:
>
> foor:
>         umov    w0, v0.h[3]
>         ret
>
> instead of
>
> foor:
>         umov    w0, v0.s[1]
>         lsr     w0, w0, 16
>         ret
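(For context: on little-endian this fold is equivalent to the lane read
below.  This is only an untested sketch to show the intended mapping;
foor_equiv is an illustrative name, not part of the patch.)

  #include <arm_neon.h>

  /* uint32 lane 1 covers uint16 lanes 2 and 3 on little-endian, so the
     ">> 16" selects uint16 lane 3 directly.  */
  unsigned int foor_equiv (uint32x4_t x)
  {
    return vgetq_lane_u16 (vreinterpretq_u16_u32 (x), 3);
  }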
The same thing ought to work for smov, so it would be good to do both.
That would also make the split between the original and new patterns more
obvious: left shift for the old pattern, right shift for the new pattern.

> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
>         left and right ones.
>         * config/aarch64/constraints.md (Usl): New.
>         * config/aarch64/iterators.md (SHIFT_arith, LSHIFTRT): New.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/shift-read.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c333fb1f72725992bb304c560f1245a242d5192d..6aa1fb4be003f2027d63ac69fd314c2bbc876258 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5493,7 +5493,7 @@ (define_insn "*rol<mode>3_insn"
>  ;; zero_extend version of shifts
>  (define_insn "*<optab>si3_insn_uxtw"
>    [(set (match_operand:DI 0 "register_operand" "=r,r")
> -        (zero_extend:DI (SHIFT_no_rotate:SI
> +        (zero_extend:DI (SHIFT_arith:SI
>          (match_operand:SI 1 "register_operand" "r,r")
>          (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
>    ""
> @@ -5528,6 +5528,60 @@ (define_insn "*rolsi3_insn_uxtw"
>    [(set_attr "type" "rotate_imm")]
>  )
>
> +(define_insn "*<optab>si3_insn2_uxtw"
> +  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")

Is the "?" justified?  It seems odd to penalise a native,
single-instruction r->r operation in favour of a w->r operation.

> +        (zero_extend:DI (LSHIFTRT:SI
> +         (match_operand:SI 1 "register_operand" "w,r,r")
> +         (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
> +  ""
> +  {
> +    switch (which_alternative)
> +      {
> +        case 0:
> +          {
> +            machine_mode dest, vec_mode;
> +            int val = INTVAL (operands[2]);
> +            int size = 32 - val;
> +            if (size == 16)
> +              dest = HImode;
> +            else if (size == 8)
> +              dest = QImode;
> +            else
> +              gcc_unreachable ();
> +
> +            /* Get nearest 64-bit vector mode.  */
> +            int nunits = 64 / size;
> +            auto vector_mode
> +              = mode_for_vector (as_a <scalar_mode> (dest), nunits);
> +            if (!vector_mode.exists (&vec_mode))
> +              gcc_unreachable ();
> +            operands[1] = gen_rtx_REG (vec_mode, REGNO (operands[1]));
> +            operands[2] = gen_int_mode (val / size, SImode);
> +
> +            /* Ideally we just call aarch64_get_lane_zero_extend but reload
> +               gets into a weird loop due to a mov of w -> r being present
> +               most of the time this instruction applies.  */
> +            switch (dest)
> +              {
> +              case QImode:
> +                return "umov\\t%w0, %1.b[%2]";
> +              case HImode:
> +                return "umov\\t%w0, %1.h[%2]";
> +              default:
> +                gcc_unreachable ();
> +              }

Doesn't this reduce to something like:

        if (size == 16)
          return "umov\\t%w0, %1.h[1]";
        if (size == 8)
          return "umov\\t%w0, %1.b[3]";
        gcc_unreachable ();

?  We should print %1 correctly as vN even with its original type.
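To illustrate the smov point above: the signed counterpart of foor would be
something like the sketch below (untested; foor_s is just an illustrative
name).  On little-endian I'd expect it to fold to a single
"smov w0, v0.h[3]" in the same way.

  #include <arm_neon.h>

  /* Arithmetic >> 16 of int32 lane 1 reads the sign-extended upper half,
     i.e. int16 lane 3 on little-endian.  */
  int foor_s (int32x4_t x)
  {
    return x[1] >> 16;
  }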
Thanks,
Richard

> +          }
> +        case 1:
> +          return "<shift>\\t%w0, %w1, %2";
> +        case 2:
> +          return "<shift>\\t%w0, %w1, %w2";
> +        default:
> +          gcc_unreachable ();
> +      }
> +  }
> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
> +)
> +
>  (define_insn "*<optab><mode>3_insn"
>    [(set (match_operand:SHORT 0 "register_operand" "=r")
>         (ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -166,6 +166,14 @@ (define_constraint "Uss"
>    (and (match_code "const_int")
>         (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
>
> +(define_constraint "Usl"
> +  "@internal
> +  A constraint that matches an immediate shift constant in SImode that has an
> +  exact mode available to use."
> +  (and (match_code "const_int")
> +       (and (match_test "satisfies_constraint_Uss (op)")
> +            (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
> +
>  (define_constraint "Usn"
>   "A constant that can be used with a CCMN operation (once negated)."
>    (and (match_code "const_int")
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index e904407b2169e589b7007ff966b2d9347a6d0fd2..bf16207225e3a4f1f20ed6f54321bccbbf15d73f 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -2149,8 +2149,11 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y")
>                                                       (VNx4SF "x")])
>  ;; This code iterator allows the various shifts supported on the core
>  (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
>
> -;; This code iterator allows all shifts except for rotates.
> -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
> +;; This code iterator allows arithmetic shifts
> +(define_code_iterator SHIFT_arith [ashift ashiftrt])
> +
> +;; Singleton code iterator for only logical right shift.
> +(define_code_iterator LSHIFTRT [lshiftrt])
>
>  ;; This code iterator allows the shifts supported in arithmetic instructions
>  (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read.c b/gcc/testsuite/gcc.target/aarch64/shift-read.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read.c
> @@ -0,0 +1,85 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor:
> +**      umov    w0, v0.h\[3\]
> +**      ret
> +*/
> +unsigned int foor (uint32x4_t x)
> +{
> +  return x[1] >> 16;
> +}
> +
> +/*
> +** fool:
> +**      umov    w0, v0.s\[1\]
> +**      lsl     w0, w0, 16
> +**      ret
> +*/
> +unsigned int fool (uint32x4_t x)
> +{
> +  return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +**      umov    w0, v0.h\[7\]
> +**      ret
> +*/
> +unsigned short foor2 (uint32x4_t x)
> +{
> +  return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +**      fmov    w0, s0
> +**      lsl     w0, w0, 16
> +**      ret
> +*/
> +unsigned int fool2 (uint32x4_t x)
> +{
> +  return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**      addv    s0, v0.4s
> +**      fmov    w0, s0
> +**      lsr     w1, w0, 16
> +**      add     w0, w1, w0, uxth
> +**      ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +**      lsr     w0, w0, 16
> +**      ret
> +*/
> +unsigned short foo (unsigned x)
> +{
> +  return x >> 16;
> +}
> +
> +/*
> +** foo2:
> +**      ...
> +**      umov    w0, v[0-8]+.h\[1\]
> +**      ret
> +*/
> +unsigned short foo2 (v4si x)
> +{
> +  int y = x[0] + x[1];
> +  return y >> 16;
> +}
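FWIW, if the smov handling goes in as well, a companion test along these
lines might be worth adding.  This is only an untested sketch: bars/bars2
are made-up names and the expected bodies are my guess at the output on
little-endian rather than verified compiler output.

  /*
  ** bars:
  **      smov    w0, v0.h\[3\]
  **      ret
  */
  int bars (int32x4_t x)
  {
    return x[1] >> 16;
  }

  /*
  ** bars2:
  **      smov    w0, v0.b\[7\]
  **      ret
  */
  int bars2 (int32x4_t x)
  {
    return x[1] >> 24;
  }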