Hi All, The following example
void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n) { for (int i = 0; i < n; i++) { float a = x[i]; float b = y[i]; if (a > b) { z0[i] = a + b; if (a > c) { z1[i] = a - b; } } } } currently generates: ptrue p3.b, all ld1w z1.s, p1/z, [x2, x5, lsl 2] ld1w z2.s, p1/z, [x3, x5, lsl 2] fcmgt p0.s, p3/z, z1.s, z0.s fcmgt p2.s, p1/z, z1.s, z2.s fcmgt p0.s, p0/z, z1.s, z2.s The conditions for a > b and a > c become separate comparisons. After this patch, using a 2 -> 2 split, we generate: ld1w z1.s, p0/z, [x2, x5, lsl 2] ld1w z2.s, p0/z, [x3, x5, lsl 2] fcmgt p1.s, p0/z, z1.s, z2.s fcmgt p1.s, p1/z, z1.s, z0.s Here the conditions a > b && a > c are folded by using the predicate result of the previous compare, which allows one of the compares to be removed. Note: This patch series is working incrementally towards generating the most efficient code for this and other loops in small steps. Bootstrapped and regtested on aarch64-none-linux-gnu with no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-sve.md (*mask_cmp_and_combine): New. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/pred-combine-and.c: New test. --- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 2c23c6b12bafb038d82920e7141a418e078a2c65..ee9d32c0a5534209689d9d3abaa560ee5b66347d 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -8162,6 +8162,48 @@ (define_insn_and_split "*mask_inv_combine" } ) +;; Combine multiple masks where the comparisons operators are the same and +;; each comparison has one parameter shared. e.g. 
combine a > b && a > c +(define_insn_and_split "*mask_cmp_and_combine" + [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") + (and:<VPRED> + (and:<VPRED> + (unspec:<VPRED> + [(match_operand:<VPRED> 1) + (const_int SVE_KNOWN_PTRUE) + (match_operand:SVE_FULL_F 2 "register_operand" "w") + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")] + SVE_COND_FP_CMP_I0) + (unspec:<VPRED> + [(match_dup 1) + (const_int SVE_KNOWN_PTRUE) + (match_dup 2) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "wDz")] + SVE_COND_FP_CMP_I0)) + (match_operand:<VPRED> 5 "register_operand" "Upa"))) + (clobber (match_scratch:<VPRED> 6 "=&Upa"))] + "TARGET_SVE" + "#" + "&& 1" + [(set (match_dup 6) + (unspec:<VPRED> + [(match_dup 5) + (const_int SVE_MAYBE_NOT_PTRUE) + (match_dup 2) + (match_dup 3)] + SVE_COND_FP_CMP_I0)) + (set (match_dup 0) + (unspec:<VPRED> + [(match_dup 6) + (const_int SVE_MAYBE_NOT_PTRUE) + (match_dup 2) + (match_dup 4)] + SVE_COND_FP_CMP_I0))] +{ + operands[6] = gen_reg_rtx (<VPRED>mode); +} +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Absolute comparisons ;; ------------------------------------------------------------------------- diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c new file mode 100644 index 0000000000000000000000000000000000000000..d395b7f84bb15b588493611df5a47549726ac24a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c @@ -0,0 +1,18 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O3 --save-temps" } */ + +void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n) +{ + for (int i = 0; i < n; i++) { + float a = x[i]; + float b = y[i]; + if (a > b) { + z0[i] = a + b; + if (a > c) { + z1[i] = a - b; + } + } + } +} + +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 
} } */ --
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 2c23c6b12bafb038d82920e7141a418e078a2c65..ee9d32c0a5534209689d9d3abaa560ee5b66347d 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -8162,6 +8162,48 @@ (define_insn_and_split "*mask_inv_combine" } ) +;; Combine multiple masks where the comparisons operators are the same and +;; each comparison has one parameter shared. e.g. combine a > b && a > c +(define_insn_and_split "*mask_cmp_and_combine" + [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") + (and:<VPRED> + (and:<VPRED> + (unspec:<VPRED> + [(match_operand:<VPRED> 1) + (const_int SVE_KNOWN_PTRUE) + (match_operand:SVE_FULL_F 2 "register_operand" "w") + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")] + SVE_COND_FP_CMP_I0) + (unspec:<VPRED> + [(match_dup 1) + (const_int SVE_KNOWN_PTRUE) + (match_dup 2) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "wDz")] + SVE_COND_FP_CMP_I0)) + (match_operand:<VPRED> 5 "register_operand" "Upa"))) + (clobber (match_scratch:<VPRED> 6 "=&Upa"))] + "TARGET_SVE" + "#" + "&& 1" + [(set (match_dup 6) + (unspec:<VPRED> + [(match_dup 5) + (const_int SVE_MAYBE_NOT_PTRUE) + (match_dup 2) + (match_dup 3)] + SVE_COND_FP_CMP_I0)) + (set (match_dup 0) + (unspec:<VPRED> + [(match_dup 6) + (const_int SVE_MAYBE_NOT_PTRUE) + (match_dup 2) + (match_dup 4)] + SVE_COND_FP_CMP_I0))] +{ + operands[6] = gen_reg_rtx (<VPRED>mode); +} +) + ;; ------------------------------------------------------------------------- ;; ---- [FP] Absolute comparisons ;; ------------------------------------------------------------------------- diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c new file mode 100644 index 0000000000000000000000000000000000000000..d395b7f84bb15b588493611df5a47549726ac24a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c @@ -0,0 +1,18 
@@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O3 --save-temps" } */ + +void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n) +{ + for (int i = 0; i < n; i++) { + float a = x[i]; + float b = y[i]; + if (a > b) { + z0[i] = a + b; + if (a > c) { + z1[i] = a - b; + } + } + } +} + +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */