gcc/ChangeLog

2020-MM-DD  Kewen Lin  <li...@gcc.gnu.org>
	* doc/invoke.texi (vect-with-length-scope): Document new option.
	* params.opt (vect-with-length-scope): New.
	* tree-vect-loop-manip.c (vect_set_loop_lens_directly): New function.
	(vect_set_loop_condition_len): Likewise.
	(vect_set_loop_condition): Call vect_set_loop_condition_len for loops
	with length.
	(vect_gen_vector_loop_niters): Use VF as the step for loops with
	length.
	(vect_do_peeling): Adjust for loops with length.
	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
	can_with_length_p and fully_with_length_p.
	(release_vec_loop_lens): New function.
	(_loop_vec_info::~_loop_vec_info): Use it to free the loop lens.
	(vect_verify_loop_lens): New function.
	(vect_analyze_loop_costing): Adjust for loops fully with length.
	(determine_peel_for_niter): Don't peel if the loop is fully with
	length.
	(vect_analyze_loop_2): Save LOOP_VINFO_CAN_WITH_LENGTH_P around
	retries, and free the length rgroups before retrying.  Check
	loop-wide reasons for disabling loops with length.  Make the final
	decision about whether to use vector access with length.
	(vect_analyze_loop): Handle the epilogue of a loop that can use
	vector access with length but does not.
	(vect_estimate_min_profitable_iters): Adjust for loops with length.
	(vectorizable_reduction): Disable loops with length.
	(vectorizable_live_operation): Likewise.
	(vect_record_loop_len): New function.
	(vect_get_loop_len): Likewise.
	(vect_transform_loop): Flag that the final loop iteration could be a
	partial vector for loops with length.
	* tree-vect-stmts.c (check_load_store_with_len): New function.
	(vectorizable_store): Handle vector loops with length.
	(vectorizable_load): Likewise.
	(vect_gen_len): New function.
	* tree-vectorizer.h (struct rgroup_lens): New structure.
	(vec_loop_lens): New typedef.
	(_loop_vec_info): Add lens, can_with_length_p and fully_with_length_p.
	(LOOP_VINFO_CAN_WITH_LENGTH_P): New macro.
	(LOOP_VINFO_FULLY_WITH_LENGTH_P): Likewise.
	(LOOP_VINFO_LENS): Likewise.
	(vect_record_loop_len): New declaration.
	(vect_get_loop_len): Likewise.
	(vect_gen_len): Likewise.
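As a review aid, here is a rough scalar model of the loop shape this patch
produces when vector access with length is in effect.  It is only a sketch:
the real transformation works on gimple, emits IFN_LEN_LOAD/IFN_LEN_STORE
calls keyed off the lenload/lenstore optabs, and computes the per-iteration
length with vect_gen_len; the 16-byte vector size and the memcpy stand-in
below are assumptions chosen purely for illustration.

  #include <stdio.h>
  #include <string.h>

  #define VEC_BYTES 16 /* assumed vector size, for illustration only */

  int
  main (void)
  {
    char a[37], b[37];
    for (unsigned int i = 0; i < sizeof a; i++)
      a[i] = (char) i;

    unsigned int nbytes = sizeof a;
    for (unsigned int i = 0; i < nbytes; i += VEC_BYTES)
      {
        /* Mirrors vect_gen_len: clamp the remaining bytes to the vector
           size, so the last iteration handles a partial vector instead
           of needing a scalar epilogue.  */
        unsigned int len = nbytes - i < VEC_BYTES ? nbytes - i : VEC_BYTES;
        /* Stands in for an IFN_LEN_LOAD + IFN_LEN_STORE of LEN bytes.  */
        memcpy (&b[i], &a[i], len);
      }

    printf ("%d\n", memcmp (a, b, sizeof a)); /* prints 0 */
    return 0;
  }

With --param vect-with-length-scope=2 this shape is used for any loop where
possible; with scope 1 it is restricted to loops known to iterate fewer than
VF times, such as epilogues.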
---
 gcc/doc/invoke.texi        |   7 +
 gcc/params.opt             |   4 +
 gcc/tree-vect-loop-manip.c | 268 ++++++++++++++++++++++++++++++++++++-
 gcc/tree-vect-loop.c       | 241 ++++++++++++++++++++++++++++++++-
 gcc/tree-vect-stmts.c      | 152 +++++++++++++++++++++
 gcc/tree-vectorizer.h      |  32 +++++
 6 files changed, 697 insertions(+), 7 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 8b9935dfe65..ac765feab13 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -13079,6 +13079,13 @@ by the copy loop headers pass.
 @item vect-epilogues-nomask
 Enable loop epilogue vectorization using smaller vector size.
 
+@item vect-with-length-scope
+Control the scope of vector memory access with length exploitation.  0 means
+we don't exploit any vector memory access with length, 1 means we only
+exploit vector memory access with length for loops whose number of iterations
+is less than VF, such as a very small loop or an epilogue, and 2 means we
+exploit vector memory access with length for any loop where possible.
+
 @item slp-max-insns-in-bb
 Maximum number of instructions in basic block to be considered for
 SLP vectorization.

diff --git a/gcc/params.opt b/gcc/params.opt
index 4aec480798b..d4309101067 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -964,4 +964,8 @@ Bound on number of runtime checks inserted by the vectorizer's loop versioning f
 Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
 Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
 
+-param=vect-with-length-scope=
+Common Joined UInteger Var(param_vect_with_length_scope) Init(0) IntegerRange(0, 2) Param Optimization
+Control the scope of vector memory access with length exploitation.
+
 ; This comment is to ensure we retain the blank line above.

diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 8c5e696b995..3d5dec6f65c 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -747,6 +747,263 @@ vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
   return cond_stmt;
 }
 
+/* Helper for vect_set_loop_condition_len.  Like vect_set_loop_masks_directly,
+   generate definitions for all the lengths in RGL and return a length that
+   is nonzero when the loop needs to iterate.  Add any new preheader
+   statements to PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the
+   exit gcond.
+
+   RGL belongs to loop LOOP.  The loop originally iterated NITERS
+   times and has been vectorized according to LOOP_VINFO.  Each iteration
+   of the vectorized loop handles VF iterations of the scalar loop.
+
+   IV_LIMIT is the limit that the induction variable can reach; it is used
+   to check whether the induction variable can wrap before reaching
+   NITERS.  */
+
+static tree
+vect_set_loop_lens_directly (class loop *loop, loop_vec_info loop_vinfo,
+			     gimple_seq *preheader_seq,
+			     gimple_stmt_iterator loop_cond_gsi,
+			     rgroup_lens *rgl, tree niters, widest_int iv_limit)
+{
+  scalar_int_mode len_mode = targetm.vectorize.length_mode;
+  unsigned int len_prec = GET_MODE_PRECISION (len_mode);
+  tree len_type = build_nonstandard_integer_type (len_prec, true);
+
+  tree vec_type = rgl->vec_type;
+  unsigned int nbytes_per_iter = rgl->nbytes_per_iter;
+  poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vec_type));
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  tree vec_size = build_int_cst (len_type, vector_size);
+
+  /* See whether the zero-based IV would ever generate a zero length before
+     wrapping around.  */
+  bool might_wrap_p = (iv_limit == -1);
+  if (!might_wrap_p)
+    {
+      widest_int iv_limit_max = iv_limit * nbytes_per_iter;
+      might_wrap_p = wi::min_precision (iv_limit_max, UNSIGNED) > len_prec;
+    }
+
+  /* Calculate the maximum number of bytes of scalars that the rgroup
+     handles in total, and the number that it handles for each iteration
+     of the vector loop.  */
+  tree nbytes_total = niters;
+  tree nbytes_step = build_int_cst (len_type, vf);
+  if (nbytes_per_iter != 1)
+    {
+      tree factor = build_int_cst (len_type, nbytes_per_iter);
+      nbytes_total = gimple_build (preheader_seq, MULT_EXPR, len_type,
+				   nbytes_total, factor);
+      nbytes_step = gimple_build (preheader_seq, MULT_EXPR, len_type,
+				  nbytes_step, factor);
+    }
+
+  /* Create an induction variable that counts the processed bytes of
+     scalars.  */
+  tree index_before_incr, index_after_incr;
+  gimple_stmt_iterator incr_gsi;
+  bool insert_after;
+  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+  create_iv (build_int_cst (len_type, 0), nbytes_step, NULL_TREE, loop,
+	     &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
+
+  tree zero_index = build_int_cst (len_type, 0);
+  tree test_index, test_limit, first_limit;
+  gimple_stmt_iterator *test_gsi;
+
+  /* For the first iteration it doesn't matter whether the IV hits
+     a value above NBYTES_TOTAL.  That only matters for the latch
+     condition.  */
+  first_limit = nbytes_total;
+
+  if (might_wrap_p)
+    {
+      test_index = index_before_incr;
+      tree adjust = gimple_convert (preheader_seq, len_type, nbytes_step);
+      test_limit = gimple_build (preheader_seq, MAX_EXPR, len_type,
+				 nbytes_total, adjust);
+      test_limit = gimple_build (preheader_seq, MINUS_EXPR, len_type,
+				 test_limit, adjust);
+      test_gsi = &incr_gsi;
+    }
+  else
+    {
+      /* Test the incremented IV, which will always hit a value above
+	 the bound before wrapping.  */
+      test_index = index_after_incr;
+      test_limit = nbytes_total;
+      test_gsi = &loop_cond_gsi;
+    }
+
+  /* Provide a definition of each length in the group.  */
+  tree next_len = NULL_TREE;
+  tree len;
+  unsigned int i;
+  FOR_EACH_VEC_ELT_REVERSE (rgl->lens, i, len)
+    {
+      /* Previous lengths will cover BIAS bytes of scalars.  This length
+	 covers the next batch; each batch's length should be the vector
+	 size in bytes.  */
+      poly_uint64 bias = vector_size * i;
+      tree bias_tree = build_int_cst (len_type, bias);
+
+      /* See whether the first iteration of the vector loop is known
+	 to have a full vector size.  */
+      poly_uint64 const_limit;
+      bool first_iteration_full
+	= (poly_int_tree_p (first_limit, &const_limit)
+	   && known_ge (const_limit, (i + 1) * vector_size));
+
+      /* Rather than have a new IV that starts at BIAS and goes up to
+	 TEST_LIMIT, prefer to use the same 0-based IV for each length
+	 and adjust the bound down by BIAS.  */
+      tree this_test_limit = test_limit;
+      if (i != 0)
+	{
+	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR, len_type,
+					  this_test_limit, bias_tree);
+	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, len_type,
+					  this_test_limit, bias_tree);
+	}
+
+      /* Create the initial length.  First include all the scalar bytes
+	 that are within the loop limit.  */
+      tree init_len = NULL_TREE;
+      if (!first_iteration_full)
+	{
+	  tree start, end;
+	  if (first_limit == test_limit)
+	    {
+	      /* Use a natural test between zero (the initial IV value)
+		 and the loop limit.  The "else" block would be valid too,
+		 but this choice can avoid the need to load BIAS_TREE into
+		 a register.  */
+	      start = zero_index;
+	      end = this_test_limit;
+	    }
+	  else
+	    {
+	      /* FIRST_LIMIT is the maximum number of scalar bytes handled
+		 by the first iteration of the vector loop.  Test the
+		 portion associated with this length.  */
+	      start = bias_tree;
+	      end = first_limit;
+	    }
+
+	  init_len = make_temp_ssa_name (len_type, NULL, "max_len");
+	  gimple_seq seq = vect_gen_len (init_len, start, end, vec_size);
+	  gimple_seq_add_seq (preheader_seq, seq);
+	}
+
+      /* First iteration is full.  */
+      if (!init_len)
+	init_len = vec_size;
+
+      /* Get the length value for the next iteration of the loop.  */
+      next_len = make_temp_ssa_name (len_type, NULL, "next_len");
+      tree end = this_test_limit;
+      gimple_seq seq = vect_gen_len (next_len, test_index, end, vec_size);
+      gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
+
+      /* Lengths are set up like masks, so reuse the mask routine.  */
+      vect_set_loop_mask (loop, len, init_len, next_len);
+    }
+
+  return next_len;
+}
+
+/* Like vect_set_loop_condition_masked, but handle the case of vector
+   access with length.  */
+
+static gcond *
+vect_set_loop_condition_len (class loop *loop, loop_vec_info loop_vinfo,
+			     tree niters, tree final_iv,
+			     bool niters_maybe_zero,
+			     gimple_stmt_iterator loop_cond_gsi)
+{
+  gimple_seq preheader_seq = NULL;
+  gimple_seq header_seq = NULL;
+  tree orig_niters = niters;
+
+  /* Type of the initial value of NITERS.  */
+  tree ni_actual_type = TREE_TYPE (niters);
+  unsigned int ni_actual_prec = TYPE_PRECISION (ni_actual_type);
+
+  /* Obtain the target-supported length type.  */
+  scalar_int_mode len_mode = targetm.vectorize.length_mode;
+  unsigned int len_prec = GET_MODE_PRECISION (len_mode);
+  tree len_type = build_nonstandard_integer_type (len_prec, true);
+
+  /* Calculate the value that the induction variable must be able to hit
+     in order to ensure that we end the loop with a zero length.  */
+  widest_int iv_limit = -1;
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+  if (max_loop_iterations (loop, &iv_limit))
+    {
+      /* Round this value down to the previous vector alignment boundary
+	 and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+
+  /* Convert NITERS to the same size as the length.  */
+  if (niters_maybe_zero || (len_prec > ni_actual_prec))
+    {
+      /* We know that there is always at least one iteration, so if the
+	 count is zero then it must have wrapped.  Cope with this by
+	 subtracting 1 before the conversion and adding 1 to the result.  */
+      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
+      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
+			     niters, build_minus_one_cst (ni_actual_type));
+      niters = gimple_convert (&preheader_seq, len_type, niters);
+      niters = gimple_build (&preheader_seq, PLUS_EXPR, len_type, niters,
+			     build_one_cst (len_type));
+    }
+  else
+    niters = gimple_convert (&preheader_seq, len_type, niters);
+
+  /* Iterate over all the rgroups and fill in their lengths.  We could use
+     the first length from any rgroup for the loop condition; here we
+     arbitrarily pick the last.  */
+  tree test_len = NULL_TREE;
+  rgroup_lens *rgl;
+  unsigned int i;
+  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+
+  FOR_EACH_VEC_ELT (*lens, i, rgl)
+    if (!rgl->lens.is_empty ())
+      /* Set up all the lengths for this group.  */
+      test_len
+	= vect_set_loop_lens_directly (loop, loop_vinfo, &preheader_seq,
+				       loop_cond_gsi, rgl, niters, iv_limit);
+
+  /* Emit all accumulated statements.  */
+  add_preheader_seq (loop, preheader_seq);
+  add_header_seq (loop, header_seq);
+
+  /* Get a boolean result that tells us whether to iterate.  */
+  edge exit_edge = single_exit (loop);
+  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
+  tree zero_len = build_zero_cst (TREE_TYPE (test_len));
+  gcond *cond_stmt
+    = gimple_build_cond (code, test_len, zero_len, NULL_TREE, NULL_TREE);
+  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
+
+  /* The loop iterates (NITERS - 1) / VF + 1 times.
+     Subtract one from this to get the latch count.  */
+  tree step = build_int_cst (len_type, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+  tree niters_minus_one
+    = fold_build2 (PLUS_EXPR, len_type, niters,
+		   build_minus_one_cst (len_type));
+  loop->nb_iterations
+    = fold_build2 (TRUNC_DIV_EXPR, len_type, niters_minus_one, step);
+
+  if (final_iv)
+    {
+      gassign *assign = gimple_build_assign (final_iv, orig_niters);
+      gsi_insert_on_edge_immediate (single_exit (loop), assign);
+    }
+
+  return cond_stmt;
+}
+
 /* Like vect_set_loop_condition, but handle the case in which there
    are no loop masks.  */
@@ -916,6 +1173,10 @@ vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
     cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
 						final_iv, niters_maybe_zero,
 						loop_cond_gsi);
+  else if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+    cond_stmt = vect_set_loop_condition_len (loop, loop_vinfo, niters,
+					     final_iv, niters_maybe_zero,
+					     loop_cond_gsi);
   else
     cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
 						  final_iv, niters_maybe_zero,
@@ -1939,7 +2200,8 @@ vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
 
   unsigned HOST_WIDE_INT const_vf;
   if (vf.is_constant (&const_vf)
-      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
     {
       /* Create: niters >> log2(vf) */
       /* If it's known that niters == number of latch executions + 1 doesn't
@@ -2472,6 +2734,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   poly_uint64 bound_epilog = 0;
   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
     bound_epilog += vf - 1;
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
@@ -2567,7 +2830,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
   if (vect_epilogues
       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       && prolog_peeling >= 0
-      && known_eq (vf, lowest_vf))
+      && known_eq (vf, lowest_vf)
+      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (epilogue_vinfo))
     {
       unsigned HOST_WIDE_INT eiters
 	= (LOOP_VINFO_INT_NITERS (loop_vinfo)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 80e33b61be7..d61f46becfd 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -815,6 +815,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vectorizable (false),
     can_fully_mask_p (true),
     fully_masked_p (false),
+    can_with_length_p (param_vect_with_length_scope != 0),
+    fully_with_length_p (false),
     peeling_for_gaps (false),
     peeling_for_niter (false),
     no_data_dependencies (false),
@@ -887,6 +889,18 @@ release_vec_loop_masks (vec_loop_masks *masks)
   masks->release ();
 }
 
+/* Free all levels of LENS.  */
+
+void
+release_vec_loop_lens (vec_loop_lens *lens)
+{
+  rgroup_lens *rgl;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (*lens, i, rgl)
+    rgl->lens.release ();
+  lens->release ();
+}
+
 /* Free all memory used by the _loop_vec_info, as well as all the
    stmt_vec_info structs of all the stmts in the loop.  */
 
@@ -895,6 +909,7 @@ _loop_vec_info::~_loop_vec_info ()
 {
   free (bbs);
 
   release_vec_loop_masks (&masks);
+  release_vec_loop_lens (&lens);
   delete ivexpr_map;
   delete scan_map;
   epilogue_vinfos.release ();
@@ -1056,6 +1071,44 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   return true;
 }
 
+/* Check whether we can use vector access with length based on precision
+   comparison.  So far, to keep it simple, we only allow the case in which
+   the precision of the target-supported length is no less than the
+   precision required by the loop niters.  */
+
+static bool
+vect_verify_loop_lens (loop_vec_info loop_vinfo)
+{
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+
+  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+    return false;
+
+  /* Get the maximum number of iterations that is representable
+     in the counter type.  */
+  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
+  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
+
+  /* Get a more refined estimate for the number of iterations.  */
+  widest_int max_back_edges;
+  if (max_loop_iterations (loop, &max_back_edges))
+    max_ni = wi::smin (max_ni, max_back_edges + 1);
+
+  /* Account for the number of bytes that the largest rgroup handles
+     per scalar iteration.  */
+  rgroup_lens *rgl = &(*lens)[lens->length () - 1];
+  max_ni *= rgl->nbytes_per_iter;
+
+  /* Work out how many bits we need to represent the limit.  */
+  unsigned int min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+
+  unsigned len_bits = GET_MODE_PRECISION (targetm.vectorize.length_mode);
+  if (len_bits < min_ni_width)
+    return false;
+
+  return true;
+}
+
 /* Calculate the cost of one scalar iteration of the loop.  */
 static void
 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
@@ -1630,7 +1683,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
 
   /* Only fully-masked loops can have iteration counts less than the
      vectorization factor.  */
-  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
     {
       if (known_niters_smaller_than_vf (loop_vinfo))
 	{
@@ -1858,7 +1912,8 @@ determine_peel_for_niter (loop_vec_info loop_vinfo)
     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
 					  (loop_vinfo));
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
     /* The main loop handles all iterations.  */
     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
@@ -2048,6 +2103,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
     }
 
   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
+  bool saved_can_with_length_p = LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo);
 
   /* We don't expect to have to roll back to anything other than an empty
      set of rgroups.  */
@@ -2144,6 +2200,71 @@ start_over:
 			 "not using a fully-masked loop.\n");
     }
 
+  /* Decide whether we can use vector access with length.  */
+
+  if ((LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+       || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+      && LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't use vector access with length because"
+			 " peeling for alignment or gaps is required.\n");
+      LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo) = false;
+    }
+
+  if (LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo)
+      && !vect_verify_loop_lens (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't use vector access with length because"
+			 " the length precision verification failed.\n");
+      LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo) = false;
+    }
+
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't use vector access with length because"
+			 " the loop will be fully-masked.\n");
+      LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo) = false;
+    }
+
+  if (LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo))
+    {
+      /* As a special case, a loop whose maximum number of iterations is
+	 less than VF can simply use vector access with length for its
+	 whole body.  */
+      if (param_vect_with_length_scope == 1)
+	{
+	  /* This is the epilogue; its iteration count should be less
+	     than VF.  */
+	  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+	    LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) = true;
+	  /* Otherwise, ensure the loop iteration count is less than VF.  */
+	  else if (known_niters_smaller_than_vf (loop_vinfo))
+	    LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) = true;
+	}
+      else
+	{
+	  gcc_assert (param_vect_with_length_scope == 2);
+	  LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) = true;
+	}
+    }
+  else
+    /* Always set it to false in case previous tries set it.  */
+    LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) = false;
+
+  if (dump_enabled_p ())
+    {
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "using vector access with length for the whole"
+			 " loop.\n");
+      else
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "not using vector access with length for the"
+			 " whole loop.\n");
+    }
+
   /* If epilog loop is required because of data accesses with gaps,
      one additional iteration needs to be peeled.  Check if there is
      enough iterations for vectorization.  */
@@ -2164,6 +2285,7 @@ start_over:
      loop or a loop that has a lower VF than the main loop.  */
   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
      && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
		   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
     return opt_result::failure_at (vect_location,
@@ -2362,12 +2484,14 @@ again:
     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
   /* Reset accumulated rgroup information.  */
   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
+  release_vec_loop_lens (&LOOP_VINFO_LENS (loop_vinfo));
   /* Reset assorted flags.  */
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
+  LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo) = saved_can_with_length_p;
 
   goto start_over;
 }
@@ -2646,8 +2770,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 	      if (ordered_p (lowest_th, th))
 		lowest_th = ordered_min (lowest_th, th);
 	    }
 	  else
-	    delete loop_vinfo;
+	    {
+	      delete loop_vinfo;
+	      loop_vinfo = opt_loop_vec_info::success (NULL);
+	    }
 
 	  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK
 	     is enabled, SIMDUID is not set, it is the innermost loop and we have
@@ -2672,6 +2798,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
       else
 	{
 	  delete loop_vinfo;
+	  loop_vinfo = opt_loop_vec_info::success (NULL);
 	  if (fatal)
 	    {
 	      gcc_checking_assert (first_loop_vinfo == NULL);
 	    }
 	}
 
+      /* If the original loop can use vector access with length but is not
+	 fully with length, and we still see a true vect_epilogues here,
+	 retry the analysis of its epilogue with vector access with length,
+	 using the same vector mode.  */
+      if (vect_epilogues && loop_vinfo
+	  && LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo))
+	{
+	  gcc_assert (!LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo));
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "***** Re-trying analysis with same vector"
+			     " mode %s for epilogue with length.\n",
+			     GET_MODE_NAME (loop_vinfo->vector_mode));
+	  continue;
+	}
+
       if (mode_i < vector_modes.length ()
 	  && VECTOR_MODE_P (autodetected_vector_mode)
 	  && (related_vector_mode (vector_modes[mode_i],
@@ -3519,6 +3661,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 			    target_cost_data, num_masks - 1, vector_stmt,
 			    NULL, NULL_TREE, 0, vect_body);
     }
+  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+    {
+      peel_iters_prologue = 0;
+      peel_iters_epilogue = 0;
+    }
   else if (npeel < 0)
     {
       peel_iters_prologue = assumed_vf / 2;
@@ -3809,6 +3956,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 			 min_profitable_iters);
 
   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
     /* We want the vectorized loop to execute at least once.  */
     min_profitable_iters = assumed_vf + peel_iters_prologue;
@@ -6761,6 +6909,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "using an in-order (fold-left) reduction.\n");
   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
+
+  if (loop_vinfo && LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't use vector access with length due to"
+			 " reduction operation.\n");
+      LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo) = false;
+    }
+
   /* All but single defuse-cycle optimized, lane-reducing and fold-left
      reductions go through their own vectorizable_* routines.  */
   if (!single_defuse_cycle
@@ -8041,6 +8199,16 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
 					   1, vectype, NULL);
 	}
     }
+
+  if (loop_vinfo && LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo))
+    {
+      LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo) = false;
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't use vector access with length due to"
+			 " live operation.\n");
+    }
+
   return true;
 }
 
@@ -8354,6 +8522,66 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
   return mask;
 }
 
+/* Record that LOOP_VINFO needs LENS to contain a sequence of NVECTORS
+   lengths for vector access with length, each of which controls a vector
+   of type VECTYPE.  */
+
+void
+vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+		      unsigned int nvectors, tree vectype)
+{
+  gcc_assert (nvectors != 0);
+  if (lens->length () < nvectors)
+    lens->safe_grow_cleared (nvectors);
+  rgroup_lens *rgl = &(*lens)[nvectors - 1];
+
+  /* The number of scalars per iteration, their total number of bytes and
+     the number of vectors are all compile-time constants.  */
+  poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  unsigned int nbytes_per_iter
+    = exact_div (nvectors * vector_size, vf).to_constant ();
+
+  /* All rgroups associated with the same NVECTORS should have the same
+     number of bytes per iteration.  */
+  if (!rgl->vec_type)
+    {
+      rgl->vec_type = vectype;
+      rgl->nbytes_per_iter = nbytes_per_iter;
+    }
+  else
+    gcc_assert (rgl->nbytes_per_iter == nbytes_per_iter);
+}
+
+/* Given a complete set of lengths LENS, extract length number INDEX for an
+   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
+
+tree
+vect_get_loop_len (vec_loop_lens *lens, unsigned int nvectors,
+		   unsigned int index)
+{
+  rgroup_lens *rgl = &(*lens)[nvectors - 1];
+
+  /* Populate the rgroup's length array, if this is the first time we've
+     used it.  */
+  if (rgl->lens.is_empty ())
+    {
+      rgl->lens.safe_grow_cleared (nvectors);
+      for (unsigned int i = 0; i < nvectors; ++i)
+	{
+	  scalar_int_mode len_mode = targetm.vectorize.length_mode;
+	  unsigned int len_prec = GET_MODE_PRECISION (len_mode);
+	  tree len_type = build_nonstandard_integer_type (len_prec, true);
+	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
+
+	  /* Provide a dummy definition until the real one is available.  */
+	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
+	  rgl->lens[i] = len;
+	}
+    }
+
+  return rgl->lens[index];
+}
+
 /* Scale profiling counters by estimation for LOOP which is vectorized
    by factor VF.  */
 
@@ -8714,6 +8942,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
     {
       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
 	  && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
 	  && known_eq (lowest_vf, vf))
 	{
 	  niters_vector
@@ -8881,7 +9110,9 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
   /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
-  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool final_iter_may_be_partial
+    = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
   /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index e7822c44951..d6be39e1831 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1879,6 +1879,66 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
   gcc_unreachable ();
 }
 
+/* Check whether a load or store statement in the loop described by
+   LOOP_VINFO can be implemented using vector access with length.  This
+   tests whether the vectorizer pass has the appropriate support, as well
+   as whether the target does.
+
+   VLS_TYPE says whether the statement is a load or store and VECTYPE
+   is the type of the vector being loaded or stored.  MEMORY_ACCESS_TYPE
+   says how the load or store is going to be implemented and GROUP_SIZE
+   is the number of load or store statements in the containing group.
+
+   Clear LOOP_VINFO_CAN_WITH_LENGTH_P if the access can't go with length;
+   otherwise record the required length types.  */
+
+static void
+check_load_store_with_len (loop_vec_info loop_vinfo, tree vectype,
+			   vec_load_store_type vls_type, int group_size,
+			   vect_memory_access_type memory_access_type)
+{
+  /* Invariant loads need no special support.  */
+  if (memory_access_type == VMAT_INVARIANT)
+    return;
+
+  if (memory_access_type != VMAT_CONTIGUOUS
+      && memory_access_type != VMAT_CONTIGUOUS_PERMUTE)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't use vector access with length"
+			 " because an access isn't contiguous.\n");
+      LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo) = false;
+      return;
+    }
+
+  machine_mode vecmode = TYPE_MODE (vectype);
+  bool is_load = (vls_type == VLS_LOAD);
+  optab op = is_load ? lenload_optab : lenstore_optab;
+
+  if (!VECTOR_MODE_P (vecmode)
+      || !convert_optab_handler (op, vecmode, targetm.vectorize.length_mode))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "can't use vector access with length because"
+			 " the target doesn't have the appropriate"
+			 " load or store with length.\n");
+      LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo) = false;
+      return;
+    }
+
+  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  unsigned int nvectors;
+
+  if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
+    vect_record_loop_len (loop_vinfo, lens, nvectors, vectype);
+  else
+    gcc_unreachable ();
+}
+
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
    form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
    that needs to be applied to all loads and stores in a vectorized loop.
@@ -7532,6 +7592,10 @@ vectorizable_store (vec_info *vinfo,
 	check_load_store_masking (loop_vinfo, vectype, vls_type, group_size,
 				  memory_access_type, &gs_info, mask);
 
+      if (loop_vinfo && LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo))
+	check_load_store_with_len (loop_vinfo, vectype, vls_type, group_size,
+				   memory_access_type);
+
       if (slp_node
 	  && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
 						vectype))
@@ -8068,6 +8132,15 @@ vectorizable_store (vec_info *vinfo,
     = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
       ? &LOOP_VINFO_MASKS (loop_vinfo)
       : NULL);
+
+  vec_loop_lens *loop_lens
+    = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
+       ? &LOOP_VINFO_LENS (loop_vinfo)
+       : NULL);
+
+  /* Shouldn't go with length if fully masked.  */
+  gcc_assert (!loop_lens || !loop_masks);
+
   /* Targets with store-lane instructions must not require explicit
     realignment.
     vect_supportable_dr_alignment always returns either dr_aligned or
     dr_unaligned_supported for masked operations.  */
@@ -8320,10 +8393,15 @@ vectorizable_store (vec_info *vinfo,
 	      unsigned HOST_WIDE_INT align;
 
 	      tree final_mask = NULL_TREE;
+	      tree final_len = NULL_TREE;
 	      if (loop_masks)
 		final_mask = vect_get_loop_mask (gsi, loop_masks,
 						 vec_num * ncopies,
 						 vectype, vec_num * j + i);
+	      else if (loop_lens)
+		final_len = vect_get_loop_len (loop_lens, vec_num * ncopies,
+					       vec_num * j + i);
+
 	      if (vec_mask)
 		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
 						      vec_mask, gsi);
@@ -8403,6 +8481,17 @@ vectorizable_store (vec_info *vinfo,
 		  new_stmt_info
 		    = vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
 		}
+	      else if (final_len)
+		{
+		  align = least_bit_hwi (misalign | align);
+		  tree ptr = build_int_cst (ref_type, align);
+		  gcall *call
+		    = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
+						  ptr, final_len, vec_oprnd);
+		  gimple_call_set_nothrow (call, true);
+		  new_stmt_info
+		    = vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+		}
 	      else
 		{
 		  data_ref = fold_build2 (MEM_REF, vectype,
@@ -8839,6 +8928,10 @@ vectorizable_load (vec_info *vinfo,
 	check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size,
 				  memory_access_type, &gs_info, mask);
 
+      if (loop_vinfo && LOOP_VINFO_CAN_WITH_LENGTH_P (loop_vinfo))
+	check_load_store_with_len (loop_vinfo, vectype, VLS_LOAD, group_size,
+				   memory_access_type);
+
       STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
       vect_model_load_cost (vinfo, stmt_info, ncopies, vf, memory_access_type,
 			    slp_node, cost_vec);
@@ -8937,6 +9030,7 @@ vectorizable_load (vec_info *vinfo,
 
       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
       gcc_assert (!nested_in_vect_loop);
+      gcc_assert (!LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo));
 
       if (grouped_load)
 	{
@@ -9234,6 +9328,15 @@ vectorizable_load (vec_info *vinfo,
     = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
       ? &LOOP_VINFO_MASKS (loop_vinfo)
      : NULL);
+
+  vec_loop_lens *loop_lens
+    = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
+       ? &LOOP_VINFO_LENS (loop_vinfo)
+       : NULL);
+
+  /* Shouldn't go with length if fully masked.  */
+  gcc_assert (!loop_lens || !loop_masks);
+
   /* Targets with store-lane instructions must not require explicit
     realignment.  vect_supportable_dr_alignment always returns either
    dr_aligned or dr_unaligned_supported for masked operations.  */
@@ -9555,15 +9658,20 @@ vectorizable_load (vec_info *vinfo,
 	  for (i = 0; i < vec_num; i++)
 	    {
 	      tree final_mask = NULL_TREE;
+	      tree final_len = NULL_TREE;
 	      if (loop_masks
 		  && memory_access_type != VMAT_INVARIANT)
 		final_mask = vect_get_loop_mask (gsi, loop_masks,
 						 vec_num * ncopies,
 						 vectype, vec_num * j + i);
+	      else if (loop_lens
+		       && memory_access_type != VMAT_INVARIANT)
+		final_len = vect_get_loop_len (loop_lens, vec_num * ncopies,
+					       vec_num * j + i);
 	      if (vec_mask)
 		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
 						      vec_mask, gsi);
+
 	      if (i > 0)
 		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
 					       gsi, stmt_info, bump);
@@ -9629,6 +9737,18 @@ vectorizable_load (vec_info *vinfo,
 			new_stmt = call;
 			data_ref = NULL_TREE;
 		      }
+		    else if (final_len)
+		      {
+			align = least_bit_hwi (misalign | align);
+			tree ptr = build_int_cst (ref_type, align);
+			gcall *call
+			  = gimple_build_call_internal (IFN_LEN_LOAD, 3,
+							dataref_ptr, ptr,
+							final_len);
+			gimple_call_set_nothrow (call, true);
+			new_stmt = call;
+			data_ref = NULL_TREE;
+		      }
 		    else
 		      {
 			tree ltype = vectype;
@@ -12480,3 +12600,35 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
   *nunits_vectype_out = nunits_vectype;
   return opt_result::success ();
 }
+
+/* Generate and return a statement sequence that sets vector length LEN:
+
+     min_of_start_and_end = min (START_INDEX, END_INDEX);
+     left_bytes = END_INDEX - min_of_start_and_end;
+     rhs = min (left_bytes, VECTOR_SIZE);
+     LEN = rhs;
+
+   TODO: for now, the rs6000 vector-with-length support only honors the
+   low 8 bits of the length, which means a LEFT_BYTES value larger than
+   255 cannot be saturated to the vector size.  A target hook can be
+   provided if other ports don't have this restriction.  */
+
+gimple_seq
+vect_gen_len (tree len, tree start_index, tree end_index, tree vector_size)
+{
+  gimple_seq stmts = NULL;
+  tree len_type = TREE_TYPE (len);
+  gcc_assert (TREE_TYPE (start_index) == len_type);
+
+  tree min = fold_build2 (MIN_EXPR, len_type, start_index, end_index);
+  tree left_bytes = fold_build2 (MINUS_EXPR, len_type, end_index, min);
+  left_bytes = fold_build2 (MIN_EXPR, len_type, left_bytes, vector_size);
+
+  tree rhs = force_gimple_operand (left_bytes, &stmts, true, NULL_TREE);
+  gimple *new_stmt = gimple_build_assign (len, rhs);
+  gimple_stmt_iterator i = gsi_last (stmts);
+  gsi_insert_after_without_update (&i, new_stmt, GSI_CONTINUE_LINKING);
+
+  return stmts;
+}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 2eb3ab5d280..774d5025639 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -476,6 +476,21 @@ struct rgroup_masks {
 
 typedef auto_vec<rgroup_masks> vec_loop_masks;
 
+/* Similar to masks above, the lengths needed by rgroups with nV vectors.  */
+struct rgroup_lens
+{
+  /* The number of bytes handled in total per scalar iteration.  */
+  unsigned int nbytes_per_iter;
+
+  /* A vector type associated with these lengths.  */
+  tree vec_type;
+
+  /* A vector of nV lengths, in iteration order.  */
+  vec<tree> lens;
+};
+
+typedef auto_vec<rgroup_lens> vec_loop_lens;
+
 typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec;
 
 /*-----------------------------------------------------------------*/
@@ -523,6 +538,10 @@ public:
      on inactive scalars.  */
   vec_loop_masks masks;
 
+  /* The lengths that a loop with length should use to avoid operating
+     on inactive scalars.  */
+  vec_loop_lens lens;
+
   /* Set of scalar conditions that have loop mask applied.  */
   scalar_cond_masked_set_type scalar_cond_masked_set;
 
@@ -626,6 +645,12 @@ public:
   /* True if have decided to use a fully-masked loop.  */
   bool fully_masked_p;
 
+  /* Records whether we still have the option of using vector access with
+     length for this loop.  */
+  bool can_with_length_p;
+
+  /* True if we have decided to use vector access with length for the
+     whole loop.  */
+  bool fully_with_length_p;
+
   /* When we have grouped data accesses with gaps, we may introduce invalid
      memory accesses.  We peel the last iteration of the loop to prevent
      this.  */
@@ -689,6 +714,9 @@ public:
 #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
 #define LOOP_VINFO_CAN_FULLY_MASK_P(L)     (L)->can_fully_mask_p
 #define LOOP_VINFO_FULLY_MASKED_P(L)       (L)->fully_masked_p
+#define LOOP_VINFO_CAN_WITH_LENGTH_P(L)    (L)->can_with_length_p
+#define LOOP_VINFO_FULLY_WITH_LENGTH_P(L)  (L)->fully_with_length_p
+#define LOOP_VINFO_LENS(L)                 (L)->lens
 #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
 #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
 #define LOOP_VINFO_MASKS(L)                (L)->masks
@@ -1842,6 +1870,10 @@ extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
 				   unsigned int, tree, tree);
 extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
 				unsigned int, tree, unsigned int);
+extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *,
+				  unsigned int, tree);
+extern tree vect_get_loop_len (vec_loop_lens *, unsigned int, unsigned int);
+extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 
 extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
 
 /* Drive for loop transformation stage.  */
--
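P.S. For reviewers, a scalar model of the sequence vect_gen_len emits may
help; gen_len below is a hypothetical stand-in for the generated gimple
and is not part of the patch:

  #include <assert.h>

  /* Scalar model of vect_gen_len:
     LEN = MIN (END - MIN (START, END), VSIZE), i.e. the bytes remaining
     past START, clamped below at zero (the MIN/MINUS pair avoids wrapping
     when START has passed END) and above at the vector size.  */
  static unsigned int
  gen_len (unsigned int start, unsigned int end, unsigned int vsize)
  {
    unsigned int m = start < end ? start : end;     /* MIN_EXPR */
    unsigned int left_bytes = end - m;              /* MINUS_EXPR */
    return left_bytes < vsize ? left_bytes : vsize; /* MIN_EXPR */
  }

  int
  main (void)
  {
    assert (gen_len (0, 37, 16) == 16); /* full first vector */
    assert (gen_len (32, 37, 16) == 5); /* partial last vector */
    assert (gen_len (48, 37, 16) == 0); /* IV past limit => zero length */
    return 0;
  }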