Pengxuan Zheng <quic_pzh...@quicinc.com> writes: > This patch optimizes certain vector permute expansion with the FMOV > instruction > when one of the input vectors is a vector of all zeros and the result of the > vector permute is as if the upper lane of the non-zero input vector is set to > zero and the lower lane remains unchanged. > > Note that the patch also propagates zero_op0_p and zero_op1_p during re-encode > now. They will be used by aarch64_evpc_fmov to check if the input vectors are > valid candidates. > > PR target/100165 > > gcc/ChangeLog: > > * config/aarch64/aarch64-simd.md (aarch64_simd_vec_set_zero_fmov<mode>): > New define_insn. > * config/aarch64/aarch64.cc (aarch64_evpc_reencode): Copy zero_op0_p and > zero_op1_p. > (aarch64_evpc_fmov): New function. > (aarch64_expand_vec_perm_const_1): Add call to aarch64_evpc_fmov. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/vec-set-zero.c: Update test accordingly. > * gcc.target/aarch64/fmov.c: New test. > * gcc.target/aarch64/fmov-be.c: New test.
Nice! Thanks for doing this. Some comments on the patch below. > > Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com> > --- > gcc/config/aarch64/aarch64-simd.md | 14 +++ > gcc/config/aarch64/aarch64.cc | 74 +++++++++++- > gcc/testsuite/gcc.target/aarch64/fmov-be.c | 74 ++++++++++++ > gcc/testsuite/gcc.target/aarch64/fmov.c | 110 ++++++++++++++++++ > .../gcc.target/aarch64/vec-set-zero.c | 6 +- > 5 files changed, 275 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov.c > > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index e456f693d2f..543126948e7 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -1190,6 +1190,20 @@ (define_insn "aarch64_simd_vec_set<mode>" > [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")] > ) > > +(define_insn "aarch64_simd_vec_set_zero_fmov<mode>" > + [(set (match_operand:VP_2E 0 "register_operand" "=w") > + (vec_merge:VP_2E > + (match_operand:VP_2E 1 "aarch64_simd_imm_zero" "Dz") > + (match_operand:VP_2E 3 "register_operand" "w") > + (match_operand:SI 2 "immediate_operand" "i")))] > + "TARGET_SIMD > + && (ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 1)" > + { > + return "fmov\\t%<Vetype>0, %<Vetype>3"; > + } > + [(set_attr "type" "fmov")] > +) > + I think this shows that target-independent code is missing some canonicalisation of vec_merge. combine has: unsigned n_elts = 0; if (GET_CODE (x) == VEC_MERGE && CONST_INT_P (XEXP (x, 2)) && GET_MODE_NUNITS (GET_MODE (x)).is_constant (&n_elts) && (swap_commutative_operands_p (XEXP (x, 0), XEXP (x, 1)) /* Two operands have same precedence, then first bit of mask select first operand. */ || (!swap_commutative_operands_p (XEXP (x, 1), XEXP (x, 0)) && !(UINTVAL (XEXP (x, 2)) & 1)))) { rtx temp = XEXP (x, 0); unsigned HOST_WIDE_INT sel = UINTVAL (XEXP (x, 2)); unsigned HOST_WIDE_INT mask = HOST_WIDE_INT_1U; if (n_elts == HOST_BITS_PER_WIDE_INT) mask = -1; else mask = (HOST_WIDE_INT_1U << n_elts) - 1; SUBST (XEXP (x, 0), XEXP (x, 1)); SUBST (XEXP (x, 1), temp); SUBST (XEXP (x, 2), GEN_INT (~sel & mask)); } which AFAICT would prefer to put the immediate second, not first. I think we should be doing the same canonicalisation in simplify_ternary_operation, and possibly elsewhere, so that the .md pattern only needs to match the canonical form (i.e. register, immedate, mask). On: > + && (ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 1)" it seems dangerous to pass exact_log2 to ENDIAN_LANE_N when we haven't checked whether it is a power of 2. (0b00 or 0b11 ought to get simplified, but I don't think we can ignore the possibility.) Rather than restrict the pattern to pairs, could we instead handle VALL_F16 minus the QI elements, with the 16-bit elements restricted to TARGET_F16? E.g. we should be able to handle V4SI using an FMOV of S registers if only the low element is nonzero. Part of me thinks that this should just be described as a plain old AND, but I suppose that doesn't work well for FP modes. Still, handling ANDs might be an interesting follow-up :) Thanks, Richard > (define_insn "aarch64_simd_vec_set_zero<mode>" > [(set (match_operand:VALL_F16 0 "register_operand" "=w") > (vec_merge:VALL_F16 > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index a6cc00e74ab..64756920eda 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -25950,6 +25950,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d) > newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL; > newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL; > newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL; > + newd.zero_op0_p = d->zero_op0_p; > + newd.zero_op1_p = d->zero_op1_p; > newd.testing_p = d->testing_p; > newd.one_vector_p = d->one_vector_p; > > @@ -26434,6 +26436,74 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d) > return true; > } > > +/* Recognize patterns suitable for the FMOV instructions. */ > +static bool > +aarch64_evpc_fmov (struct expand_vec_perm_d *d) > +{ > + machine_mode mode = d->vmode; > + unsigned HOST_WIDE_INT nelt; > + > + if (d->vec_flags != VEC_ADVSIMD) > + return false; > + > + /* to_constant is safe since this routine is specific to Advanced SIMD > + vectors. */ > + nelt = d->perm.length ().to_constant (); > + > + /* Either d->op0 or d->op1 should be a vector of all zeros. */ > + if (nelt != 2 || d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p)) > + return false; > + > + HOST_WIDE_INT elt0, elt1; > + rtx in0 = d->op0; > + rtx in1 = d->op1; > + > + if (!d->perm[0].is_constant (&elt0)) > + return false; > + > + if (!d->perm[1].is_constant (&elt1)) > + return false; > + > + if (!BYTES_BIG_ENDIAN) > + { > + /* Lane 0 of the output vector should come from lane 0 of the non-zero > + vector. */ > + if (elt0 != (d->zero_op0_p ? 2 : 0)) > + return false; > + > + /* Lane 1 of the output vector should come from any lane of the zero > + vector. */ > + if (elt1 != (d->zero_op0_p ? 0 : 2) && elt1 != (d->zero_op0_p ? 1 : 3)) > + return false; > + } > + else > + { > + /* Lane 0 of the output vector should come from any lane of the zero > + vector. */ > + if (elt0 != (d->zero_op0_p ? 0 : 2) && elt0 != (d->zero_op0_p ? 1 : 3)) > + return false; > + > + /* Lane 1 of the output vector should come from lane 1 of the non-zero > + vector. */ > + if (elt1 != (d->zero_op0_p ? 3 : 1)) > + return false; > + } > + > + if (d->testing_p) > + return true; > + > + insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode); > + expand_operand ops[5]; > + create_output_operand (&ops[0], d->target, mode); > + create_input_operand (&ops[1], d->zero_op0_p ? in1 : in0, mode); > + create_integer_operand (&ops[2], BYTES_BIG_ENDIAN ? 1 : 2); > + create_input_operand (&ops[3], d->zero_op0_p ? in0 : in1, mode); > + create_integer_operand (&ops[4], 0); > + expand_insn (icode, 5, ops); > + > + return true; > +} > + > static bool > aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) > { > @@ -26457,7 +26527,9 @@ aarch64_expand_vec_perm_const_1 (struct > expand_vec_perm_d *d) > { > if (d->vmode == d->op_mode) > { > - if (aarch64_evpc_rev_local (d)) > + if (aarch64_evpc_fmov (d)) > + return true; > + else if (aarch64_evpc_rev_local (d)) > return true; > else if (aarch64_evpc_rev_global (d)) > return true; > diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-be.c > b/gcc/testsuite/gcc.target/aarch64/fmov-be.c > new file mode 100644 > index 00000000000..f864d474a7a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/fmov-be.c > @@ -0,0 +1,74 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mbig-endian" } */ > +/* { dg-final { check-function-bodies "**" "" "" } } */ > + > +typedef int v2si __attribute__ ((vector_size (8))); > +typedef long v2di __attribute__ ((vector_size (16))); > +typedef int v4si __attribute__ ((vector_size (16))); > +typedef float v4sf __attribute__ ((vector_size (16))); > + > +/* > +** f_v2si: > +** fmov s0, s0 > +** ret > +*/ > +v2si > +f_v2si (v2si x) > +{ > + return __builtin_shuffle (x, (v2si){ 0, 0 }, (v2si){ 3, 1 }); > +} > + > +/* > +** g_v2si: > +** fmov s0, s0 > +** ret > +*/ > +v2si > +g_v2si (v2si x) > +{ > + return __builtin_shuffle ((v2si){ 0, 0 }, x, (v2si){ 0, 3 }); > +} > + > +/* > +** f_v2di: > +** fmov d0, d0 > +** ret > +*/ > +v2di > +f_v2di (v2di x) > +{ > + return __builtin_shuffle (x, (v2di){ 0, 0 }, (v2di){ 2, 1 }); > +} > + > +/* > +** g_v2di: > +** fmov d0, d0 > +** ret > +*/ > +v2di > +g_v2di (v2di x) > +{ > + return __builtin_shuffle ((v2di){ 0, 0 }, x, (v2di){ 0, 3 }); > +} > + > +/* > +** f_v4si: > +** fmov d0, d0 > +** ret > +*/ > +v4si > +f_v4si (v4si x) > +{ > + return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 6, 7, 2, 3 }); > +} > + > +/* > +** g_v4si: > +** fmov d0, d0 > +** ret > +*/ > +v4si > +g_v4si (v4si x) > +{ > + return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 2, 3, 6, 7 }); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/fmov.c > b/gcc/testsuite/gcc.target/aarch64/fmov.c > new file mode 100644 > index 00000000000..f40ec56c5f8 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/fmov.c > @@ -0,0 +1,110 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { check-function-bodies "**" "" "" } } */ > + > +typedef int v2si __attribute__ ((vector_size (8))); > +typedef float v2sf __attribute__ ((vector_size (8))); > +typedef long v2di __attribute__ ((vector_size (16))); > +typedef double v2df __attribute__ ((vector_size (16))); > +typedef int v4si __attribute__ ((vector_size (16))); > +typedef float v4sf __attribute__ ((vector_size (16))); > + > + > +/* > +** f_v2si: > +** fmov s0, s0 > +** ret > +*/ > +v2si > +f_v2si (v2si x) > +{ > + return __builtin_shuffle (x, (v2si){ 0, 0 }, (v2si){ 0, 3 }); > +} > + > +/* > +** g_v2si: > +** fmov s0, s0 > +** ret > +*/ > +v2si > +g_v2si (v2si x) > +{ > + return __builtin_shuffle ((v2si){ 0, 0 }, x, (v2si){ 2, 0 }); > +} > + > +/* > +** f_v2sf: > +** fmov s0, s0 > +** ret > +*/ > +v2sf > +f_v2sf (v2sf x) > +{ > + return __builtin_shuffle (x, (v2sf){ 0, 0 }, (v2si){ 0, 2 }); > +} > + > +/* > +** f_v2di: > +** fmov d0, d0 > +** ret > +*/ > +v2di > +f_v2di (v2di x) > +{ > + return __builtin_shuffle (x, (v2di){ 0, 0 }, (v2di){ 0, 3 }); > +} > + > +/* > +** g_v2di: > +** fmov d0, d0 > +** ret > +*/ > +v2di > +g_v2di (v2di x) > +{ > + return __builtin_shuffle ((v2di){ 0, 0 }, x, (v2di){ 2, 1 }); > +} > + > +/* > +** f_v2df: > +** fmov d0, d0 > +** ret > +*/ > +v2df > +f_v2df (v2df x) > +{ > + return __builtin_shuffle (x, (v2df){ 0, 0 }, (v2di){ 0, 2 }); > +} > + > +/* > +** f_v4si: > +** fmov d0, d0 > +** ret > +*/ > +v4si > +f_v4si (v4si x) > +{ > + return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 1, 4, 5 }); > +} > + > +/* > +** g_v4si: > +** fmov d0, d0 > +** ret > +*/ > +v4si > +g_v4si (v4si x) > +{ > + return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 4, 5, 2, 3 }); > +} > + > +/* > +** f_v4sf: > +** fmov d0, d0 > +** ret > +*/ > +v4sf > +f_v4sf (v4sf x) > +{ > + return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 1, 6, 7 }); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c > b/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c > index b34b902cf27..9040839931f 100644 > --- a/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c > +++ b/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c > @@ -28,8 +28,10 @@ FOO(float64x2_t) > > /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[1\], wzr} 2 { target > aarch64_little_endian } } } */ > /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.h\[1\], wzr} 4 { target > aarch64_little_endian } } } */ > -/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.s\[1\], wzr} 4 { target > aarch64_little_endian } } } */ > -/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.d\[1\], xzr} 2 { target > aarch64_little_endian } } } */ > +/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.s\[1\], wzr} 2 { target > aarch64_little_endian } } } */ > +/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.d\[1\], xzr} 0 { target > aarch64_little_endian } } } */ > +/* { dg-final { scan-assembler-times {fmov\ts0, s0} 2 { target > aarch64_little_endian } } } */ > +/* { dg-final { scan-assembler-times {fmov\td0, d0} 2 { target > aarch64_little_endian } } } */ > > /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[6\], wzr} 1 { target > aarch64_big_endian } } } */ > /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[14\], wzr} 1 { target > aarch64_big_endian } } } */