The following ensures we still apply a runtime costmodel check even
when not peeling for niters. This is relevant for fully masked
loop vectorization and in this case triggers versioning of the loop.
Given the condition on whether we apply epilog peeling (not to mention
the condition determining whether to include a skip-vector check there,
which is the place a cost model check is eventually put on) is mightily
complicated, this is an RFC because in the end we should have made
sure LOOP_REQUIRES_VERSIONING is true and added a separate
LOOP_REQUIRES_VERSIONING_FOR_COST.
I think I have seen the same bug even without masking, but vectorization
tends to be a static profitability decision in those cases as we know
no epilog is needed. For x86 the case of main loop masking isn't
too important right now (besides having this bug), for fixed-length
vectorization it's much better to rely on masked epilogs. For the
testcase there is only a single masked vector iteration
and for a low number of scalar iterations that's not profitable,
like in this case with an "emulated" fold-left reduction.
The fact that we set check_profitability but then do not check
profitability seems to be a bug though.
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
Comments welcome, in particular whether it was a conscious choice
to never apply a runtime check for fully masked vector loops
when otherwise no versioning or epilog peeling is required?
Thanks,
Richard.
PR tree-optimization/110979
* tree-vectorizer.h (vect_apply_runtime_profitability_check_p):
Adjust for partial vectors, only require a check for th > 1.
(vect_needs_epilog_peeling): Declare.
* tree-vect-loop.cc (vect_analyze_loop_2): Make sure to set
LOOP_VINFO_VERSIONING_THRESHOLD when we'll version.
	(vect_transform_loop): Also version when we need to
check profitability but are not applying epilog peeling.
* tree-vect-loop-manip.cc (vect_needs_epilog_peeling): New
function, split out from ...
(vect_do_peeling): ... here.
(vect_loop_versioning): Always dump when we apply versioning.
* gcc.dg/vect/costmodel/x86_64/costmodel-pr110979.c: New testcase.
---
.../costmodel/x86_64/costmodel-pr110979.c | 14 +++++
gcc/tree-vect-loop-manip.cc | 63 ++++++++++++-------
gcc/tree-vect-loop.cc | 11 ++--
gcc/tree-vectorizer.h | 5 +-
4 files changed, 67 insertions(+), 26 deletions(-)
create mode 100644
gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr110979.c
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr110979.c
b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr110979.c
new file mode 100644
index 00000000000..99830dcb1dc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr110979.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-additional-options "-O3 -mavx512f --param vect-partial-vector-usage=2"
} */
+
+float
+foo3 (float* __restrict a, int n)
+{
+ float sum = 0.0f;
+ for (int i = 0; i != n; i++)
+ sum += a[i];
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump "applying loop versioning to loop" "vect" } } */
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 5862098803a..1eb2faeb74e 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -3048,6 +3048,38 @@ vect_get_main_loop_result (loop_vec_info loop_vinfo,
tree main_loop_value,
return phi_result;
}
+/* Returns whether we need to perform epilog peeling and initializes
+ *EPILOG_PEELING and *BOUND_EPILOG if so. */
+
+static bool
+vect_needs_epilog_peeling (loop_vec_info loop_vinfo,
+ bool *epilog_peeling, poly_uint64 *bound_epilog)
+{
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ *bound_epilog = 0;
+ if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
+ *bound_epilog += vf - 1;
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ *bound_epilog += 1;
+
+ /* For early breaks the scalar loop needs to execute at most VF times
+ to find the element that caused the break. */
+ if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+ *bound_epilog = vf;
+
+ *epilog_peeling = maybe_ne (*bound_epilog, 0U);
+ return *epilog_peeling;
+}
+
+bool
+vect_needs_epilog_peeling (loop_vec_info loop_vinfo)
+{
+ bool epilog_peeling;
+ poly_uint64 bound_epilog;
+ return vect_needs_epilog_peeling (loop_vinfo, &epilog_peeling,
&bound_epilog);
+}
+
/* Function vect_do_peeling.
Input:
@@ -3153,30 +3185,19 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters,
tree nitersm1,
basic_block guard_bb, guard_to;
profile_probability prob_prolog, prob_vector, prob_epilog;
int estimated_vf;
- int prolog_peeling = 0;
bool vect_epilogues = loop_vinfo->epilogue_vinfo != NULL;
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ int prolog_peeling = 0;
if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
- poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
- poly_uint64 bound_epilog = 0;
- if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
- && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
- bound_epilog += vf - 1;
- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
- bound_epilog += 1;
-
- /* For early breaks the scalar loop needs to execute at most VF times
- to find the element that caused the break. */
- if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
- bound_epilog = vf;
-
- bool epilog_peeling = maybe_ne (bound_epilog, 0U);
- poly_uint64 bound_scalar = bound_epilog;
-
- if (!prolog_peeling && !epilog_peeling)
+ bool epilog_peeling;
+ poly_uint64 bound_epilog;
+ if (!vect_needs_epilog_peeling (loop_vinfo, &epilog_peeling, &bound_epilog)
+ && prolog_peeling == 0)
return NULL;
+ poly_uint64 bound_scalar = bound_epilog;
/* Before doing any peeling make sure to reset debug binds outside of
the loop refering to defs not in LC SSA. */
@@ -4366,10 +4387,10 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
}
else
{
- if (loop_to_version != loop
- && dump_enabled_p ())
+ if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
- "applying loop versioning to outer loop %d\n",
+ "applying loop versioning to %sloop %d\n",
+ loop_to_version != loop ? "outer " : "",
loop_to_version->num);
unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index b27e762c69c..62542d400f4 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2528,7 +2528,9 @@ start_over:
analyzing the epilogue. For instance, checks for alias versioning will be
skipped when dealing with epilogues as we assume we already checked them
for the main loop. So instead we always check the 'orig_loop_vinfo'. */
- if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
+ if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)
+ || (vect_apply_runtime_profitability_check_p (orig_loop_vinfo)
+ && !vect_needs_epilog_peeling (orig_loop_vinfo)))
{
poly_uint64 niters_th = 0;
unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
@@ -2553,8 +2555,8 @@ start_over:
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
niters_th += 1;
- /* Use the same condition as vect_transform_loop to decide when to use
- the cost to determine a versioning threshold. */
+ /* Use the same condition as vect_transform_loop to decide when to use
+ the cost to determine a versioning threshold. */
if (vect_apply_runtime_profitability_check_p (loop_vinfo)
&& ordered_p (th, niters_th))
niters_th = ordered_max (poly_uint64 (th), niters_th);
@@ -11093,7 +11095,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple
*loop_vectorized_call)
/* Version the loop first, if required, so the profitability check
comes first. */
- if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+ if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || (check_profitability && !vect_needs_epilog_peeling (loop_vinfo)))
{
class loop *sloop
= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9c349e8ffde..83689246340 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2396,7 +2396,9 @@ vect_apply_runtime_profitability_check_p (loop_vec_info
loop_vinfo)
{
unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
return (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && th >= vect_vf_for_cost (loop_vinfo));
+ && th > 1
+ && (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+ || th >= vect_vf_for_cost (loop_vinfo)));
}
/* Return true if CODE is a lane-reducing opcode. */
@@ -2468,6 +2470,7 @@ class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class
loop *, edge,
edge, edge *, bool = true,
vec<basic_block> * = NULL);
class loop *vect_loop_versioning (loop_vec_info, gimple *);
+extern bool vect_needs_epilog_peeling (loop_vec_info);
extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
tree *, tree *, tree *, int, bool, bool,
tree *);
--
2.51.0