Hi all,

this patch stops the vectorizer from generating loop predication when the loop is known to operate only on full vectors.
Example:

#+BEGIN_SRC C
/* Vector length is 256 bits.  */
void
f (int *restrict x, int *restrict y, unsigned int n)
{
  for (unsigned int i = 0; i < n * 8; ++i)
    x[i] += y[i];
}
#+END_SRC

Compiling on aarch64 with -O3 -msve-vector-bits=256, current trunk gives:

#+BEGIN_SRC asm
f:
.LFB0:
        .cfi_startproc
        lsl     w2, w2, 3
        cbz     w2, .L1
        mov     x3, 0
        whilelo p0.s, xzr, x2
        .p2align 3,,7
.L3:
        ld1w    z0.s, p0/z, [x0, x3, lsl 2]
        ld1w    z1.s, p0/z, [x1, x3, lsl 2]
        add     z0.s, z0.s, z1.s
        st1w    z0.s, p0, [x0, x3, lsl 2]
        add     x3, x3, 8
        whilelo p0.s, x3, x2
        b.any   .L3
.L1:
        ret
        .cfi_endproc
#+END_SRC

With the patch applied:

#+BEGIN_SRC asm
f:
.LFB0:
        .cfi_startproc
        lsl     w3, w2, 3
        cbz     w3, .L1
        mov     x2, 0
        ptrue   p0.b, vl32
        .p2align 3,,7
.L3:
        ld1w    z0.s, p0/z, [x0, x2, lsl 2]
        ld1w    z1.s, p0/z, [x1, x2, lsl 2]
        add     z0.s, z0.s, z1.s
        st1w    z0.s, p0, [x0, x2, lsl 2]
        add     x2, x2, 8
        cmp     x2, x3
        bne     .L3
.L1:
        ret
        .cfi_endproc
#+END_SRC

To achieve this we check earlier whether the loop needs peeling, and if it does not we do not set LOOP_VINFO_USING_PARTIAL_VECTORS_P to true.  I moved some logic out of 'determine_peel_for_niter' into the new 'vect_need_peeling_or_part_vects_p' so it can be reused for this purpose.

Bootstrapped and regtested on aarch64-linux-gnu.

Feedback is welcome, thanks.

  Andrea
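For context, the divisibility argument behind the example (my own sketch, not part of the patch): with 256-bit vectors and 32-bit elements the vectorization factor is 256 / 32 = 8 lanes, and the trip count n * 8 always has at least three trailing zero bits, so it is a multiple of the VF and the vector loop never needs a predicated tail.  A minimal standalone illustration of that trailing-zero test follows; the helper name is hypothetical, and the patch performs the equivalent check symbolically via tree_ctz and exact_log2:

#+BEGIN_SRC C
#include <stdbool.h>

/* Hypothetical helper: a non-zero trip count NITERS is a multiple of a
   power-of-two VF exactly when it has at least log2 (VF) trailing zero
   bits.  For the example above NITERS = n * 8 and VF = 8, so the test
   always succeeds.  */
static bool
niters_multiple_of_vf_p (unsigned long long niters, unsigned int vf)
{
  if (niters == 0)
    return true; /* __builtin_ctzll is undefined for zero.  */
  return __builtin_ctzll (niters) >= __builtin_ctz (vf);
}
#+END_SRC

When this holds for every possible value of n, predicating the loop buys nothing, which is why the unpredicated ptrue + cmp/bne form above is preferable.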
From fdcceaa420d6c3b03cf22ab50e0f9c393e8e3932 Mon Sep 17 00:00:00 2001
From: Andrea Corallo <andrea.cora...@arm.com>
Date: Fri, 28 Aug 2020 16:01:15 +0100
Subject: [PATCH] vec: don't select partial vectors when unnecessary

gcc/ChangeLog

2020-09-09  Andrea Corallo  <andrea.cora...@arm.com>

	* tree-vect-loop.c (vect_need_peeling_or_part_vects_p): New
	function.
	(determine_peel_for_niter): Move out some logic into
	'vect_need_peeling_or_part_vects_p'.

gcc/testsuite/ChangeLog

2020-09-09  Andrea Corallo  <andrea.cora...@arm.com>

	* gcc.target/aarch64/sve/cost_model_10.c: New test.
	* gcc.target/aarch64/sve/clastb_8.c: Update test for new
	vectorization strategy.
	* gcc.target/aarch64/sve/cost_model_5.c: Likewise.
	* gcc.target/aarch64/sve/struct_vect_14.c: Likewise.
	* gcc.target/aarch64/sve/struct_vect_15.c: Likewise.
	* gcc.target/aarch64/sve/struct_vect_16.c: Likewise.
	* gcc.target/aarch64/sve/struct_vect_17.c: Likewise.
---
 .../gcc.target/aarch64/sve/clastb_8.c         |  5 +-
 .../gcc.target/aarch64/sve/cost_model_10.c    | 12 +++
 .../gcc.target/aarch64/sve/cost_model_5.c     |  4 +-
 .../gcc.target/aarch64/sve/struct_vect_14.c   |  8 +-
 .../gcc.target/aarch64/sve/struct_vect_15.c   |  8 +-
 .../gcc.target/aarch64/sve/struct_vect_16.c   |  8 +-
 .../gcc.target/aarch64/sve/struct_vect_17.c   |  8 +-
 gcc/tree-vect-loop.c                          | 86 +++++++++++--------
 8 files changed, 81 insertions(+), 58 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cost_model_10.c

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c
index 57c42082449..e61ff4ac92d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c
@@ -23,7 +23,4 @@ TEST_TYPE (uint64_t);
 /* { dg-final { scan-assembler {\tclastb\t(h[0-9]+), p[0-7], \1, z[0-9]+\.h\n} } } */
 /* { dg-final { scan-assembler {\tclastb\t(s[0-9]+), p[0-7], \1, z[0-9]+\.s\n} } } */
 /* { dg-final { scan-assembler {\tclastb\t(d[0-9]+), p[0-7], \1, z[0-9]+\.d\n} } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.b,} } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.h,} } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s,} } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.d,} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.b,} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_10.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_10.c
new file mode 100644
index 00000000000..bfac09ed1c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_10.c
@@ -0,0 +1,12 @@
+/* { dg-options "-O3 -msve-vector-bits=256" } */
+
+void
+f (int *restrict x, int *restrict y, unsigned int n)
+{
+  for (unsigned int i = 0; i < n * 8; ++i)
+    x[i] += y[i];
+}
+
+/* { dg-final { scan-assembler-not {\twhilelo\t} } } */
+/* { dg-final { scan-assembler {\tptrue\tp} } } */
+/* { dg-final { scan-assembler {\tcmp\tx[0-9]+, x[0-9]+\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c
index 250ca837324..f3a29fc38a1 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_5.c
@@ -9,5 +9,5 @@ vset (int *restrict dst, int *restrict src, int count)
     *dst++ = 1;
 }
 
-/* { dg-final { scan-assembler-not {\tst1w\tz} } } */
-/* { dg-final { scan-assembler-times {\tstp\tq} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz} 2 } } */
+/* { dg-final { scan-assembler-not {\tstp\tq} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c
index a16a79e51c0..45644b67bda 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c
@@ -43,12 +43,12 @@
 #undef NAME
 #undef TYPE
 
-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 
 /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c
index bc00267c8e7..814dbb3ae41 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c
@@ -3,12 +3,12 @@
 
 #include "struct_vect_14.c"
 
-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 
 /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c
index 9e2a549f5e8..6ecf89b5442 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c
@@ -3,12 +3,12 @@
 
 #include "struct_vect_14.c"
 
-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 
 /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c
index e791e2e12a6..571c6d0d33b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c
@@ -3,12 +3,12 @@
 
 #include "struct_vect_14.c"
 
-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */
 
 /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 80e78f7adf4..730dc0130c6 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -991,6 +991,51 @@ vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
   return wi::min_precision (max_ni * factor, UNSIGNED);
 }
 
+/* True if the loop needs peeling or partial vectors when vectorized.  */
+
+static bool
+vect_need_peeling_or_part_vects_p (loop_vec_info loop_vinfo)
+{
+  unsigned HOST_WIDE_INT const_vf;
+  HOST_WIDE_INT max_niter
+    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
+  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
+					    (loop_vinfo));
+
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+    {
+      /* Work out the (constant) number of iterations that need to be
+	 peeled for reasons other than niters.  */
+      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+	peel_niter += 1;
+      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
+		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+	return true;
+    }
+  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+	   /* ??? When peeling for gaps but not alignment, we could
+	      try to check whether the (variable) niters is known to be
+	      VF * N + 1.  That's something of a niche case though.  */
+	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
+	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+		< (unsigned) exact_log2 (const_vf))
+	       /* In case of versioning, check if the maximum number of
+		  iterations is greater than th.  If they are identical,
+		  the epilogue is unnecessary.  */
+	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+		   || ((unsigned HOST_WIDE_INT) max_niter
+		       > (th / const_vf) * const_vf))))
+    return true;
+
+  return false;
+}
+
 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
    whether we can actually generate the masks required.  Return true if so,
    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
@@ -1967,44 +2012,10 @@ determine_peel_for_niter (loop_vec_info loop_vinfo)
 {
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
 
-  unsigned HOST_WIDE_INT const_vf;
-  HOST_WIDE_INT max_niter
-    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
-
-  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
-  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
-    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
-					    (loop_vinfo));
-
   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     /* The main loop handles all iterations.  */
     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
-  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
-    {
-      /* Work out the (constant) number of iterations that need to be
-	 peeled for reasons other than niters.  */
-      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-	peel_niter += 1;
-      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
-		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
-	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-    }
-  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
-	   /* ??? When peeling for gaps but not alignment, we could
-	      try to check whether the (variable) niters is known to be
-	      VF * N + 1.  That's something of a niche case though.  */
-	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
-	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
-		< (unsigned) exact_log2 (const_vf))
-	       /* In case of versioning, check if the maximum number of
-		  iterations is greater than th.  If they are identical,
-		  the epilogue is unnecessary.  */
-	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
-		   || ((unsigned HOST_WIDE_INT) max_niter
-		       > (th / const_vf) * const_vf))))
+  else if (vect_need_peeling_or_part_vects_p (loop_vinfo))
     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
 }
 
@@ -2267,7 +2278,10 @@ start_over:
     {
       if (param_vect_partial_vector_usage == 0)
 	LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-      else if (vect_verify_full_masking (loop_vinfo)
+      else if ((vect_verify_full_masking (loop_vinfo)
+		&& vect_need_peeling_or_part_vects_p (loop_vinfo))
+	       /* Don't use partial vectors if we don't need to peel the
+		  loop.  */
 	       || vect_verify_loop_lens (loop_vinfo))
 	{
 	  /* The epilogue and other known niters less than VF
-- 
2.17.1