Tamar Christina <tamar.christ...@arm.com> writes: > Hi, > > Sending a new version of the patch because I noticed the pattern was > overriding the nor pattern. > > A second pattern is needed to capture the nor case as combine will match the > longest sequence first. So without this pattern we end up de-optimizing nor > and instead emit two nots. I did not find a better way to do this.
Hmm, that's unfortunate. But yeah, I don't know of a better way of avoiding it either. There's a risk we might need a pattern with the operands swapped as well (so that the (not (reg …)) comes first) but it would be better to avoid that using a new canonicalisation rule if necessary. > Note: This patch series is working incrementally towards generating the most > efficient code for this and other loops in small steps. > > Bootstrapped and regtested on aarch64-none-linux-gnu with no issues. > > Ok for master? > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/aarch64/aarch64-sve.md (*fcm<cmp_op><mode>_bic_combine, > *fcm<cmp_op><mode>_nor_combine, *fcmuo<mode>_bic_combine, > *fcmuo<mode>_nor_combine): New. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/sve/pred-not-gen-1.c: New test. > * gcc.target/aarch64/sve/pred-not-gen-2.c: New test. > * gcc.target/aarch64/sve/pred-not-gen-3.c: New test. > * gcc.target/aarch64/sve/pred-not-gen-4.c: New test. OK, thanks. Richard > > --- inline copy of patch --- > > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64-sve.md > index > 359fe0e457096cf4042a774789a5c241420703d3..8fe4c721313e70592d2cf0acbfbe2f07b070b51a > 100644 > --- a/gcc/config/aarch64/aarch64-sve.md > +++ b/gcc/config/aarch64/aarch64-sve.md > @@ -8126,6 +8126,160 @@ (define_insn_and_split "*fcmuo<mode>_and_combine" > UNSPEC_COND_FCMUO))] > ) > > +;; Similar to *fcm<cmp_op><mode>_and_combine, but for BIC rather than AND. > +;; In this case, we still need a separate NOT/BIC operation, but predicating > +;; the comparison on the BIC operand removes the need for a PTRUE.
> +(define_insn_and_split "*fcm<cmp_op><mode>_bic_combine" > + [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") > + (and:<VPRED> > + (and:<VPRED> > + (not:<VPRED> > + (unspec:<VPRED> > + [(match_operand:<VPRED> 1) > + (const_int SVE_KNOWN_PTRUE) > + (match_operand:SVE_FULL_F 2 "register_operand" "w") > + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" > "wDz")] > + SVE_COND_FP_CMP_I0)) > + (match_operand:<VPRED> 4 "register_operand" "Upa")) > + (match_dup:<VPRED> 1))) > + (clobber (match_scratch:<VPRED> 5 "=&Upl"))] > + "TARGET_SVE" > + "#" > + "&& 1" > + [(set (match_dup 5) > + (unspec:<VPRED> > + [(match_dup 4) > + (const_int SVE_MAYBE_NOT_PTRUE) > + (match_dup 2) > + (match_dup 3)] > + SVE_COND_FP_CMP_I0)) > + (set (match_dup 0) > + (and:<VPRED> > + (not:<VPRED> > + (match_dup 5)) > + (match_dup 4)))] > +{ > + if (can_create_pseudo_p ()) > + operands[5] = gen_reg_rtx (<VPRED>mode); > +} > +) > + > +;; Make sure that we expand to a nor when the operand 4 of > +;; *fcm<cmp_op><mode>_bic_combine is a not. 
> +(define_insn_and_split "*fcm<cmp_op><mode>_nor_combine" > + [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") > + (and:<VPRED> > + (and:<VPRED> > + (not:<VPRED> > + (unspec:<VPRED> > + [(match_operand:<VPRED> 1) > + (const_int SVE_KNOWN_PTRUE) > + (match_operand:SVE_FULL_F 2 "register_operand" "w") > + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" > "wDz")] > + SVE_COND_FP_CMP_I0)) > + (not:<VPRED> > + (match_operand:<VPRED> 4 "register_operand" "Upa"))) > + (match_dup:<VPRED> 1))) > + (clobber (match_scratch:<VPRED> 5 "=&Upl"))] > + "TARGET_SVE" > + "#" > + "&& 1" > + [(set (match_dup 5) > + (unspec:<VPRED> > + [(match_dup 1) > + (const_int SVE_KNOWN_PTRUE) > + (match_dup 2) > + (match_dup 3)] > + SVE_COND_FP_CMP_I0)) > + (set (match_dup 0) > + (and:<VPRED> > + (and:<VPRED> > + (not:<VPRED> > + (match_dup 5)) > + (not:<VPRED> > + (match_dup 4))) > + (match_dup 1)))] > +{ > + if (can_create_pseudo_p ()) > + operands[5] = gen_reg_rtx (<VPRED>mode); > +} > +) > + > +(define_insn_and_split "*fcmuo<mode>_bic_combine" > + [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") > + (and:<VPRED> > + (and:<VPRED> > + (not:<VPRED> > + (unspec:<VPRED> > + [(match_operand:<VPRED> 1) > + (const_int SVE_KNOWN_PTRUE) > + (match_operand:SVE_FULL_F 2 "register_operand" "w") > + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" > "wDz")] > + UNSPEC_COND_FCMUO)) > + (match_operand:<VPRED> 4 "register_operand" "Upa")) > + (match_dup:<VPRED> 1))) > + (clobber (match_scratch:<VPRED> 5 "=&Upl"))] > + "TARGET_SVE" > + "#" > + "&& 1" > + [(set (match_dup 5) > + (unspec:<VPRED> > + [(match_dup 4) > + (const_int SVE_MAYBE_NOT_PTRUE) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_COND_FCMUO)) > + (set (match_dup 0) > + (and:<VPRED> > + (not:<VPRED> > + (match_dup 5)) > + (match_dup 4)))] > +{ > + if (can_create_pseudo_p ()) > + operands[5] = gen_reg_rtx (<VPRED>mode); > +} > +) > + > +;; Same for unordered comparisons. 
> +(define_insn_and_split "*fcmuo<mode>_nor_combine" > + [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") > + (and:<VPRED> > + (and:<VPRED> > + (not:<VPRED> > + (unspec:<VPRED> > + [(match_operand:<VPRED> 1) > + (const_int SVE_KNOWN_PTRUE) > + (match_operand:SVE_FULL_F 2 "register_operand" "w") > + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" > "wDz")] > + UNSPEC_COND_FCMUO)) > + (not:<VPRED> > + (match_operand:<VPRED> 4 "register_operand" "Upa"))) > + (match_dup:<VPRED> 1))) > + (clobber (match_scratch:<VPRED> 5 "=&Upl"))] > + "TARGET_SVE" > + "#" > + "&& 1" > + [(set (match_dup 5) > + (unspec:<VPRED> > + [(match_dup 1) > + (const_int SVE_KNOWN_PTRUE) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_COND_FCMUO)) > + (set (match_dup 0) > + (and:<VPRED> > + (and:<VPRED> > + (not:<VPRED> > + (match_dup 5)) > + (not:<VPRED> > + (match_dup 4))) > + (match_dup 1)))] > +{ > + if (can_create_pseudo_p ()) > + operands[5] = gen_reg_rtx (<VPRED>mode); > +} > +) > + > ;; ------------------------------------------------------------------------- > ;; ---- [FP] Absolute comparisons > ;; ------------------------------------------------------------------------- > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c > b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..2c06564186c5a5e7917da475a9c201c81dfeb136 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c > @@ -0,0 +1,23 @@ > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ > +/* { dg-options "-O3 --save-temps" } */ > + > +/* > +** f10: > +** ... > +** ld1d z1.d, p0/z, \[x1, x5, lsl 3\] > +** fcmgt p2.d, p0/z, z1.d, #0.0 > +** ld1d z2.d, p2/z, \[x2, x5, lsl 3\] > +** not p1.b, p0/z, p2.b > +** ld1d z0.d, p1/z, \[x3, x5, lsl 3\] > +** ... 
> +*/ > + > +void f10(double * restrict z, double * restrict w, double * restrict x, > double * restrict y, int n) > +{ > + for (int i = 0; i < n; i++) { > + z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i]; > + } > +} > + > +/* { dg-final { scan-assembler-not {\tbic\t} } } */ > +/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, > p[0-9]+\.b\n} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c > b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..0c3b78d4c67455c971e94fb2ffdd7be2d4884864 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c > @@ -0,0 +1,23 @@ > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ > +/* { dg-options "-O3 --save-temps" } */ > + > +/* > +** f11: > +** ... > +** ld1d z0.d, p0/z, \[x1, x2, lsl 3\] > +** fcmgt p2.d, p3/z, z0.d, #0.0 > +** fcmgt p1.d, p0/z, z0.d, #0.0 > +** not p1.b, p0/z, p1.b > +** ld1d z1.d, p1/z, \[x3, x2, lsl 3\] > +** ... > +*/ > + > +void f11(double * restrict z, double * restrict w, double * restrict x, > double * restrict y, int n) > +{ > + for (int i = 0; i < n; i++) { > + z[i] = (w[i] > 0) ? w[i] : y[i]; > + } > +} > + > +/* { dg-final { scan-assembler-not {\tbic\t} } } */ > +/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, > p[0-9]+\.b\n} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c > b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..248f8ab57191ce8a1d4c334533de8bc76aa07691 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c > @@ -0,0 +1,21 @@ > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ > +/* { dg-options "-O3 --save-temps" } */ > + > +/* > +** f12: > +** ... > +** ld1w z1.s, p0/z, \[x1, x2, lsl 2\] > +** cmple p1.s, p0/z, z1.s, #0 > +** ld1w z0.s, p1/z, \[x3, x2, lsl 2\] > +** ... 
> +*/ > + > +void f12(int * restrict z, int * restrict w, int * restrict x, int * > restrict y, int n) > +{ > + for (int i = 0; i < n; i++) { > + z[i] = (w[i] > 0) ? w[i] : y[i]; > + } > +} > + > +/* { dg-final { scan-assembler-not {\tbic\t} } } */ > +/* { dg-final { scan-assembler-not {\tnot\tp[0-9]+\.b, p[0-9]+/z, > p[0-9]+\.b\n} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c > b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..96200309880a91ad1db5801115c911cfdce06125 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c > @@ -0,0 +1,14 @@ > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ > +/* { dg-options "-O3 --save-temps" } */ > + > +#include <math.h> > + > +void f13(double * restrict z, double * restrict w, double * restrict x, > double * restrict y, int n) > +{ > + for (int i = 0; i < n; i++) { > + z[i] = (isunordered(w[i], 0)) ? x[i] + w[i] : y[i] - w[i]; > + } > +} > + > +/* { dg-final { scan-assembler-not {\tbic\t} } } */ > +/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, > p[0-9]+\.b\n} 1 } } */