Made the suggested changes.
Regarding the name change to partial vectors, I agree in the name change
since that is the terminology we are using in the loop_vinfo members
too, but is there an actual difference between predication/masking and
partial vectors that I am missing?
OK for trunk?
gcc/ChangeLog:
* tree-vect-loop.c (vect_better_loop_vinfo_p): Round factors up
for epilogue costing.
(vect_analyze_loop): Re-analyze all modes for epilogues, unless
we are guaranteed that we can't
have partial vectors.
(genopinit.c) (partial_vectors_supported): Generate new function.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/masked_epilogue.c: New test.
diff --git a/gcc/genopinit.c b/gcc/genopinit.c
index
195ddf74fa2b7d89760622073dcec9d5d339a097..2bc7cdbf53337beae181afd7bb05b366ab068c6a
100644
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@@ -321,6 +321,7 @@ main (int argc, const char **argv)
" bool supports_vec_scatter_store_cached;\n"
"};\n"
"extern void init_all_optabs (struct target_optabs *);\n"
+ "extern bool partial_vectors_supported_p (void);\n"
"\n"
"extern struct target_optabs default_target_optabs;\n"
"extern struct target_optabs *this_fn_optabs;\n"
@@ -373,6 +374,33 @@ main (int argc, const char **argv)
fprintf (s_file, " ena[%u] = HAVE_%s;\n", i, p->name);
fprintf (s_file, "}\n\n");
+ fprintf (s_file,
+ "/* Returns TRUE if the target supports any of the partial vector\n"
+ " optabs: while_ult_optab, len_load_optab or len_store_optab,\n"
+ " for any mode. */\n"
+ "bool\npartial_vectors_supported_p (void)\n{\n");
+ bool any_match = false;
+ fprintf (s_file, "\treturn");
+ bool first = true;
+ for (i = 0; patterns.iterate (i, &p); ++i)
+ {
+#define CMP_NAME(N) !strncmp (p->name, (N), strlen ((N)))
+ if (CMP_NAME("while_ult") || CMP_NAME ("len_load")
+ || CMP_NAME ("len_store"))
+ {
+ if (first)
+ fprintf (s_file, " HAVE_%s", p->name);
+ else
+ fprintf (s_file, " || HAVE_%s", p->name);
+ first = false;
+ any_match = true;
+ }
+ }
+ if (!any_match)
+ fprintf (s_file, " false");
+ fprintf (s_file, ";\n}\n");
+
+
/* Perform a binary search on a pre-encoded optab+mode*2. */
/* ??? Perhaps even better to generate a minimal perfect hash.
Using gperf directly is awkward since it's so geared to working
diff --git a/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
new file mode 100644
index
0000000000000000000000000000000000000000..286a7be236f337fee4c4650f42da72000855c5e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details
-march=armv8-a+sve -msve-vector-bits=scalable" } */
+
+void f(unsigned char y[restrict],
+ unsigned char x[restrict], int n) {
+ for (int i = 0; i < n; ++i)
+ y[i] = (y[i] + x[i] + 1) >> 1;
+}
+
+/* { dg-final { scan-tree-dump {LOOP EPILOGUE VECTORIZED \(MODE=VNx} "vect" }
} */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index
a28bb6321d76b8222bc8cfdade151ca9b4dca406..5af98a36678ae61e99f93beb90920e2d0940c53a
100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2824,11 +2824,13 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
{
unsigned HOST_WIDE_INT main_vf_max
= estimated_poly_value (main_poly_vf, POLY_VALUE_MAX);
+ unsigned HOST_WIDE_INT old_vf_max
+ = estimated_poly_value (old_vf, POLY_VALUE_MAX);
+ unsigned HOST_WIDE_INT new_vf_max
+ = estimated_poly_value (new_vf, POLY_VALUE_MAX);
- old_factor = main_vf_max / estimated_poly_value (old_vf,
- POLY_VALUE_MAX);
- new_factor = main_vf_max / estimated_poly_value (new_vf,
- POLY_VALUE_MAX);
+ old_factor = CEIL (main_vf_max, old_vf_max);
+ new_factor = CEIL (main_vf_max, new_vf_max);
/* If the loop is not using partial vectors then it will iterate one
time less than one that does. It is safe to subtract one here,
@@ -3069,8 +3071,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
machine_mode autodetected_vector_mode = VOIDmode;
opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
unsigned int mode_i = 0;
- unsigned int first_loop_i = 0;
- unsigned int first_loop_next_i = 0;
unsigned HOST_WIDE_INT simdlen = loop->simdlen;
/* First determine the main loop vectorization mode, either the first
@@ -3079,7 +3079,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
lowest cost if pick_lowest_cost_p. */
while (1)
{
- unsigned int loop_vinfo_i = mode_i;
bool fatal;
opt_loop_vec_info loop_vinfo
= vect_analyze_loop_1 (loop, shared, &loop_form_info,
@@ -3108,11 +3107,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
first_loop_vinfo = opt_loop_vec_info::success (NULL);
}
if (first_loop_vinfo == NULL)
- {
- first_loop_vinfo = loop_vinfo;
- first_loop_i = loop_vinfo_i;
- first_loop_next_i = mode_i;
- }
+ first_loop_vinfo = loop_vinfo;
else
{
delete loop_vinfo;
@@ -3158,32 +3153,37 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
/* Now analyze first_loop_vinfo for epilogue vectorization. */
poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
- /* Handle the case that the original loop can use partial
- vectorization, but want to only adopt it for the epilogue.
- The retry should be in the same mode as original. */
- if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
+ /* For epilogues start the analysis from the first mode. The motivation
+ behind starting from the beginning comes from cases where the VECTOR_MODES
+ array may contain length-agnostic and length-specific modes. Their
+ ordering is not guaranteed, so we could end up picking a mode for the main
+ loop that is after the epilogue's optimal mode. */
+ mode_i = 1;
+ bool supports_partial_vectors = partial_vectors_supported_p ();
+ poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
+
+ while (1)
{
- gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
- && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
+ /* If the target does not support partial vectors we can shorten the
+ number of modes to analyze for the epilogue as we know we can't pick a
+ mode that has at least as many NUNITS as the main loop's vectorization
+ factor, since that would imply the epilogue's vectorization factor
+ would be at least as high as the main loop's and we would be
+ vectorizing for more scalar iterations than there would be left. */
+ if (!supports_partial_vectors
+ && maybe_ge (GET_MODE_NUNITS (vector_modes[mode_i]), first_vinfo_vf))
+ {
+ mode_i++;
+ if (mode_i == vector_modes.length ())
+ break;
+ continue;
+ }
+
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
- "***** Re-trying analysis with same vector mode"
- " %s for epilogue with partial vectors.\n",
- GET_MODE_NAME (first_loop_vinfo->vector_mode));
- mode_i = first_loop_i;
- }
- else
- {
- mode_i = first_loop_next_i;
- if (mode_i == vector_modes.length ())
- return first_loop_vinfo;
- }
-
- /* ??? If first_loop_vinfo was using VOIDmode then we probably
- want to instead search for the corresponding mode in vector_modes[]. */
+ "***** Re-trying epilogue analysis with vector "
+ "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
- while (1)
- {
bool fatal;
opt_loop_vec_info loop_vinfo
= vect_analyze_loop_1 (loop, shared, &loop_form_info,
@@ -3235,11 +3235,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
if (mode_i == vector_modes.length ())
break;
- /* Try the next biggest vector size. */
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "***** Re-trying epilogue analysis with vector "
- "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
}
if (!first_loop_vinfo->epilogue_vinfos.is_empty ())