Hi Richard,

On Fri, 4 Sep 2020 at 15:42, Richard Biener <rguent...@suse.de> wrote:
>
> The following adds the capability to code-generate live lanes in
> basic-block vectorization using lane extracts from vector stmts
> rather than keeping the original scalar code around for those.
> This eventually makes previously not profitable vectorizations
> profitable (the live scalar code was appropriately costed so
> are the lane extracts now), without considering the cost model
> this patch doesn't add or remove any basic-block vectorization
> capabilities.
>
> The patch re/ab-uses STMT_VINFO_LIVE_P in basic-block vectorization
> mode to tell whether a live lane is vectorized or whether it is
> provided by means of keeping the scalar code live.
>
> The patch is a first step towards vectorizing sequences of
> stmts that do not end up in stores or vector constructors though.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> Any comments?
>

Yes: this is causing an ICE on arm:

FAIL: gcc.dg/vect/bb-slp-pr92596.c (internal compiler error)
FAIL: gcc.dg/vect/bb-slp-pr92596.c (test for excess errors)
Excess errors:
during GIMPLE pass: slp
dump file: bb-slp-pr92596.c.173t.slp2
/gcc/testsuite/gcc.dg/vect/bb-slp-pr92596.c:11:6: internal compiler error: in vect_transform_stmt, at tree-vect-stmts.c:10870
0xfa16cc vect_transform_stmt(vec_info*, _stmt_vec_info*, gimple_stmt_iterator*, _slp_tree*, _slp_instance*)
        /gcc/tree-vect-stmts.c:10870
0xfd6954 vect_schedule_slp_instance
        /gcc/tree-vect-slp.c:4570
0xfd684f vect_schedule_slp_instance
        /gcc/tree-vect-slp.c:4436
0xfd684f vect_schedule_slp_instance
        /gcc/tree-vect-slp.c:4436
0xfdeace vect_schedule_slp(vec_info*)
        /gcc/tree-vect-slp.c:4695
0xfe2529 vect_slp_region
        /gcc/tree-vect-slp.c:3529
0xfe33d7 vect_slp_bb(basic_block_def*)
        /gcc/tree-vect-slp.c:3647
0xfe503c execute
        /gcc/tree-vectorizer.c:1429
Christophe > Thanks, > Richard. > > 2020-09-04 Richard Biener <rguent...@suse.de> > > * tree-vectorizer.h (vectorizable_live_operation): Adjust. > * tree-vect-loop.c (vectorizable_live_operation): Vectorize > live lanes out of basic-block vectorization nodes. > * tree-vect-slp.c (vect_bb_slp_mark_live_stmts): New function. > (vect_slp_analyze_operations): Analyze live lanes and their > vectorization possibility after the whole SLP graph is final. > (vect_bb_slp_scalar_cost): Adjust for vectorized live lanes. > * tree-vect-stmts.c (can_vectorize_live_stmts): Adjust. > (vect_transform_stmt): Call can_vectorize_live_stmts also for > basic-block vectorization. > > * gcc.dg/vect/bb-slp-46.c: New testcase. > * gcc.dg/vect/bb-slp-47.c: Likewise. > * gcc.dg/vect/bb-slp-32.c: Adjust. > --- > gcc/testsuite/gcc.dg/vect/bb-slp-32.c | 7 +- > gcc/testsuite/gcc.dg/vect/bb-slp-46.c | 28 +++ > gcc/testsuite/gcc.dg/vect/bb-slp-47.c | 14 ++ > gcc/tree-vect-loop.c | 243 ++++++++++++++++---------- > gcc/tree-vect-slp.c | 145 +++++++++++++-- > gcc/tree-vect-stmts.c | 12 +- > gcc/tree-vectorizer.h | 2 +- > 7 files changed, 332 insertions(+), 119 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-46.c > create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-47.c > > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c > b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c > index 41bbf352156..020b6365e02 100644 > --- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c > @@ -7,16 +7,21 @@ int foo (int *p, int a, int b) > { > int x[4]; > int tem0, tem1, tem2, tem3; > + int sum = 0; > tem0 = p[0] + 1 + a; > + sum += tem0; > x[0] = tem0; > tem1 = p[1] + 2 + b; > + sum += tem1; > x[1] = tem1; > tem2 = p[2] + 3 + b; > + sum += tem2; > x[2] = tem2; > tem3 = p[3] + 4 + a; > + sum += tem3; > x[3] = tem3; > bar (x); > - return tem0 + tem1 + tem2 + tem3; > + return sum; > } > > /* { dg-final { scan-tree-dump "vectorization is not profitable" "slp2" { > xfail { 
vect_no_align && { ! vect_hw_misalign } } } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-46.c > b/gcc/testsuite/gcc.dg/vect/bb-slp-46.c > new file mode 100644 > index 00000000000..4e4571ef640 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-46.c > @@ -0,0 +1,28 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-additional-options "-fdump-tree-optimized" } */ > + > +int a[4], b[4]; > +int foo () > +{ > + int tem0 = a[0] + b[0]; > + int temx = tem0 * 17; /* this fails without a real need */ > + int tem1 = a[1] + b[1]; > + int tem2 = a[2] + b[2]; > + int tem3 = a[3] + b[3]; > + int temy = tem3 * 13; > + a[0] = tem0; > + a[1] = tem1; > + a[2] = tem2; > + a[3] = tem3; > + return temx + temy; > +} > + > +/* We should extract the live lane from the vectorized add rather than > + keeping the original scalar add. > + ??? Because of a too conservative check we fail for temx here. */ > +/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" } } */ > +/* { dg-final { scan-tree-dump "extracting lane for live stmt" "slp2" } } */ > +/* { dg-final { scan-tree-dump-times "extracting lane for live stmt" 2 > "slp2" { xfail *-*-* } } } */ > +/* { dg-final { scan-tree-dump-times " \\+ " 3 "optimized" } } */ > +/* { dg-final { scan-tree-dump-times " \\+ " 2 "optimized" { xfail *-*-* } } > } */ > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-47.c > b/gcc/testsuite/gcc.dg/vect/bb-slp-47.c > new file mode 100644 > index 00000000000..9583b09cfbd > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-47.c > @@ -0,0 +1,14 @@ > +/* { dg-do compile } */ > + > +int bar(); > +int foo (int *a, int b, int c) > +{ > + int tem0 = bar (); > + int tem1 = tem0 + b; > + int tem3 = tem1 + c; > + a[0] = tem3; > + a[1] = tem3 + 1; > + a[2] = tem3 + 2; > + a[3] = tem3 + 3; > + return tem1; > +} > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c > index 362cdc4f1cb..2e4ef37d956 100644 > --- a/gcc/tree-vect-loop.c > +++ 
b/gcc/tree-vect-loop.c > @@ -8019,14 +8019,14 @@ vectorizable_induction (loop_vec_info loop_vinfo, > it can be supported. */ > > bool > -vectorizable_live_operation (loop_vec_info loop_vinfo, > +vectorizable_live_operation (vec_info *vinfo, > stmt_vec_info stmt_info, > gimple_stmt_iterator *gsi, > slp_tree slp_node, slp_instance > slp_node_instance, > int slp_index, bool vec_stmt_p, > - stmt_vector_for_cost *) > + stmt_vector_for_cost *cost_vec) > { > - class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); > imm_use_iterator imm_iter; > tree lhs, lhs_type, bitsize, vec_bitsize; > tree vectype = STMT_VINFO_VECTYPE (stmt_info); > @@ -8071,10 +8071,6 @@ vectorizable_live_operation (loop_vec_info loop_vinfo, > return true; > } > > - /* FORNOW. CHECKME. */ > - if (nested_in_vect_loop_p (loop, stmt_info)) > - return false; > - > /* If STMT is not relevant and it is a simple assignment and its inputs are > invariant then it can remain in place, unvectorized. The original last > scalar value that it computes will be used. */ > @@ -8097,12 +8093,11 @@ vectorizable_live_operation (loop_vec_info loop_vinfo, > { > gcc_assert (slp_index >= 0); > > - int num_scalar = SLP_TREE_LANES (slp_node); > - int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); > - > /* Get the last occurrence of the scalar index from the concatenation > of > all the slp vectors. Calculate which slp vector it is and the index > within. */ > + int num_scalar = SLP_TREE_LANES (slp_node); > + int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); > poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index; > > /* Calculate which vector contains the result, and which lane of > @@ -8120,7 +8115,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo, > if (!vec_stmt_p) > { > /* No transformation required. 
*/ > - if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > + if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > { > if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > OPTIMIZE_FOR_SPEED)) > @@ -8157,14 +8152,20 @@ vectorizable_live_operation (loop_vec_info loop_vinfo, > 1, vectype, NULL); > } > } > + /* ??? Enable for loop costing as well. */ > + if (!loop_vinfo) > + record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE, > + 0, vect_epilogue); > return true; > } > > /* Use the lhs of the original scalar statement. */ > gimple *stmt = vect_orig_stmt (stmt_info)->stmt; > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live " > + "stmt %G", stmt); > > - lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt) > - : gimple_get_lhs (stmt); > + lhs = gimple_get_lhs (stmt); > lhs_type = TREE_TYPE (lhs); > > bitsize = vector_element_bits_tree (vectype); > @@ -8172,16 +8173,14 @@ vectorizable_live_operation (loop_vec_info loop_vinfo, > > /* Get the vectorized lhs of STMT and the lane to use (counted in bits). > */ > tree vec_lhs, bitstart; > + gimple *vec_stmt; > if (slp_node) > { > - gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); > + gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); > > /* Get the correct slp vectorized stmt. */ > - gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]; > - if (gphi *phi = dyn_cast <gphi *> (vec_stmt)) > - vec_lhs = gimple_phi_result (phi); > - else > - vec_lhs = gimple_get_lhs (vec_stmt); > + vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]; > + vec_lhs = gimple_get_lhs (vec_stmt); > > /* Get entry to use. */ > bitstart = bitsize_int (vec_index); > @@ -8190,102 +8189,158 @@ vectorizable_live_operation (loop_vec_info > loop_vinfo, > else > { > /* For multiple copies, get the last copy. 
*/ > - vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ()); > + vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last (); > + vec_lhs = gimple_get_lhs (vec_stmt); > > /* Get the last lane in the vector. */ > bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize); > } > > - /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI > - requirement, insert one phi node for it. It looks like: > - loop; > - BB: > - # lhs' = PHI <lhs> > - ==> > - loop; > - BB: > - # vec_lhs' = PHI <vec_lhs> > - new_tree = lane_extract <vec_lhs', ...>; > - lhs' = new_tree; */ > + if (loop_vinfo) > + { > + /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI > + requirement, insert one phi node for it. It looks like: > + loop; > + BB: > + # lhs' = PHI <lhs> > + ==> > + loop; > + BB: > + # vec_lhs' = PHI <vec_lhs> > + new_tree = lane_extract <vec_lhs', ...>; > + lhs' = new_tree; */ > + > + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + basic_block exit_bb = single_exit (loop)->dest; > + gcc_assert (single_pred_p (exit_bb)); > + > + tree vec_lhs_phi = copy_ssa_name (vec_lhs); > + gimple *phi = create_phi_node (vec_lhs_phi, exit_bb); > + SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs); > + > + gimple_seq stmts = NULL; > + tree new_tree; > + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > + { > + /* Emit: > > - basic_block exit_bb = single_exit (loop)->dest; > - gcc_assert (single_pred_p (exit_bb)); > + SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> > > - tree vec_lhs_phi = copy_ssa_name (vec_lhs); > - gimple *phi = create_phi_node (vec_lhs_phi, exit_bb); > - SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs); > + where VEC_LHS is the vectorized live-out result and MASK is > + the loop mask for the final iteration. 
*/ > + gcc_assert (ncopies == 1 && !slp_node); > + tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); > + tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), > + 1, vectype, 0); > + tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, > scalar_type, > + mask, vec_lhs_phi); > > - gimple_seq stmts = NULL; > - tree new_tree; > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > - { > - /* Emit: > + /* Convert the extracted vector element to the scalar type. */ > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); > + } > + else > + { > + tree bftype = TREE_TYPE (vectype); > + if (VECTOR_BOOLEAN_TYPE_P (vectype)) > + bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), > 1); > + new_tree = build3 (BIT_FIELD_REF, bftype, > + vec_lhs_phi, bitsize, bitstart); > + new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), > + &stmts, true, NULL_TREE); > + } > > - SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> > + if (stmts) > + { > + gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb); > + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); > > - where VEC_LHS is the vectorized live-out result and MASK is > - the loop mask for the final iteration. */ > - gcc_assert (ncopies == 1 && !slp_node); > - tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); > - tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1, > - vectype, 0); > - tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type, > - mask, vec_lhs_phi); > + /* Remove existing phi from lhs and create one copy from new_tree. 
> */ > + tree lhs_phi = NULL_TREE; > + gimple_stmt_iterator gsi; > + for (gsi = gsi_start_phis (exit_bb); > + !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gimple *phi = gsi_stmt (gsi); > + if ((gimple_phi_arg_def (phi, 0) == lhs)) > + { > + remove_phi_node (&gsi, false); > + lhs_phi = gimple_phi_result (phi); > + gimple *copy = gimple_build_assign (lhs_phi, new_tree); > + gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT); > + break; > + } > + } > + } > > - /* Convert the extracted vector element to the required scalar type. > */ > - new_tree = gimple_convert (&stmts, lhs_type, scalar_res); > + /* Replace use of lhs with newly computed result. If the use stmt is a > + single arg PHI, just replace all uses of PHI result. It's necessary > + because lcssa PHI defining lhs may be before newly inserted stmt. */ > + use_operand_p use_p; > + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) > + if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) > + && !is_gimple_debug (use_stmt)) > + { > + if (gimple_code (use_stmt) == GIMPLE_PHI > + && gimple_phi_num_args (use_stmt) == 1) > + { > + replace_uses_by (gimple_phi_result (use_stmt), new_tree); > + } > + else > + { > + FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) > + SET_USE (use_p, new_tree); > + } > + update_stmt (use_stmt); > + } > } > else > { > + /* For basic-block vectorization simply insert the lane-extraction. 
*/ > tree bftype = TREE_TYPE (vectype); > if (VECTOR_BOOLEAN_TYPE_P (vectype)) > bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); > - new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, > bitstart); > + tree new_tree = build3 (BIT_FIELD_REF, bftype, > + vec_lhs, bitsize, bitstart); > + gimple_seq stmts = NULL; > new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), > &stmts, true, NULL_TREE); > - } > > - if (stmts) > - { > - gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb); > - gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); > + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); > > - /* Remove existing phi from lhs and create one copy from new_tree. */ > - tree lhs_phi = NULL_TREE; > - gimple_stmt_iterator gsi; > - for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > - { > - gimple *phi = gsi_stmt (gsi); > - if ((gimple_phi_arg_def (phi, 0) == lhs)) > - { > - remove_phi_node (&gsi, false); > - lhs_phi = gimple_phi_result (phi); > - gimple *copy = gimple_build_assign (lhs_phi, new_tree); > - gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT); > - break; > - } > - } > - } > - > - /* Replace use of lhs with newly computed result. If the use stmt is a > - single arg PHI, just replace all uses of PHI result. It's necessary > - because lcssa PHI defining lhs may be before newly inserted stmt. */ > - use_operand_p use_p; > - FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) > - if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) > - && !is_gimple_debug (use_stmt)) > - { > - if (gimple_code (use_stmt) == GIMPLE_PHI > - && gimple_phi_num_args (use_stmt) == 1) > - { > - replace_uses_by (gimple_phi_result (use_stmt), new_tree); > - } > - else > - { > - FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) > - SET_USE (use_p, new_tree); > - } > - update_stmt (use_stmt); > + /* Replace use of lhs with newly computed result. If the use stmt is a > + single arg PHI, just replace all uses of PHI result. 
It's necessary > + because lcssa PHI defining lhs may be before newly inserted stmt. */ > + use_operand_p use_p; > + stmt_vec_info use_stmt_info; > + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) > + if (!is_gimple_debug (use_stmt) > + && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt)) > + || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))) > + { > + /* ??? This can happen when the live lane ends up being > + used in a vector construction code-generated by an > + external SLP node (and code-generation for that already > + happened). See gcc.dg/vect/bb-slp-47.c. > + Doing this is what would happen if that vector CTOR > + were not code-generated yet so it is not too bad. > + ??? In fact we'd likely want to avoid this situation > + in the first place. */ > + if (gimple_code (use_stmt) != GIMPLE_PHI > + && !vect_stmt_dominates_stmt_p (gsi_stmt (*gsi), use_stmt)) > + { > + gcc_assert (is_gimple_assign (use_stmt) > + && gimple_assign_rhs_code (use_stmt) == > CONSTRUCTOR); > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "Using original scalar computation for " > + "live lane because use preceeds vector " > + "def\n"); > + continue; > + } > + FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) > + SET_USE (use_p, new_tree); > + update_stmt (use_stmt); > + } > } > > return true; > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c > index 15e5f277eac..520a23692f1 100644 > --- a/gcc/tree-vect-slp.c > +++ b/gcc/tree-vect-slp.c > @@ -2969,6 +2969,101 @@ vect_slp_analyze_node_operations (vec_info *vinfo, > slp_tree node, > } > > > +/* Mark lanes of NODE that are live outside of the basic-block vectorized > + region and that can be vectorized using vectorizable_live_operation > + with STMT_VINFO_LIVE_P. Not handled live operations will cause the > + scalar code computing it to be retained. 
*/ > + > +static void > +vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node, > + slp_instance instance, > + stmt_vector_for_cost *cost_vec, > + hash_set<stmt_vec_info> &svisited) > +{ > + unsigned i; > + stmt_vec_info stmt_info; > + stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node); > + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) > + { > + stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); > + if (svisited.contains (orig_stmt_info)) > + continue; > + bool mark_visited = true; > + gimple *orig_stmt = orig_stmt_info->stmt; > + ssa_op_iter op_iter; > + def_operand_p def_p; > + FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF) > + { > + imm_use_iterator use_iter; > + gimple *use_stmt; > + stmt_vec_info use_stmt_info; > + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) > + if (!is_gimple_debug (use_stmt)) > + { > + use_stmt_info = bb_vinfo->lookup_stmt (use_stmt); > + if (!use_stmt_info > + || !PURE_SLP_STMT (vect_stmt_to_vectorize > (use_stmt_info))) > + { > + STMT_VINFO_LIVE_P (stmt_info) = true; > + if (vectorizable_live_operation (bb_vinfo, stmt_info, > + NULL, node, instance, i, > + false, cost_vec)) > + /* ??? So we know we can vectorize the live stmt > + from one SLP node. If we cannot do so from all > + or none consistently we'd have to record which > + SLP node (and lane) we want to use for the live > + operation. So make sure we can code-generate > + from all nodes. */ > + mark_visited = false; > + else > + STMT_VINFO_LIVE_P (stmt_info) = false; > + BREAK_FROM_IMM_USE_STMT (use_iter); > + } > + } > + /* We have to verify whether we can insert the lane extract > + before all uses. The following is a conservative approximation. > + We cannot put this into vectorizable_live_operation because > + iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT > + doesn't work. 
> + Note that while the fact that we emit code for loads at the > + first load should make this a non-problem leafs we construct > + from scalars are vectorized after the last scalar def. > + ??? If we'd actually compute the insert location during > + analysis we could use sth less conservative than the last > + scalar stmt in the node for the dominance check. */ > + /* ??? What remains is "live" uses in vector CTORs in the same > + SLP graph which is where those uses can end up code-generated > + right after their definition instead of close to their original > + use. But that would restrict us to code-generate lane-extracts > + from the latest stmt in a node. So we compensate for this > + during code-generation, simply not replacing uses for those > + hopefully rare cases. */ > + if (STMT_VINFO_LIVE_P (stmt_info)) > + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) > + if (!is_gimple_debug (use_stmt) > + && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt)) > + || !PURE_SLP_STMT (vect_stmt_to_vectorize > (use_stmt_info))) > + && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "Cannot determine insertion place for " > + "lane extract\n"); > + STMT_VINFO_LIVE_P (stmt_info) = false; > + mark_visited = true; > + } > + } > + if (mark_visited) > + svisited.add (orig_stmt_info); > + } > + > + slp_tree child; > + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) > + if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) > + vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, > + cost_vec, svisited); > +} > + > /* Analyze statements in SLP instances of VINFO. Return true if the > operations are supported. */ > > @@ -3018,6 +3113,19 @@ vect_slp_analyze_operations (vec_info *vinfo) > } > } > > + /* Compute vectorizable live stmts. 
*/ > + if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo)) > + { > + hash_set<stmt_vec_info> svisited; > + stmt_vector_for_cost cost_vec; > + cost_vec.create (2); > + for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i) > + vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance), > + instance, &cost_vec, svisited); > + add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec); > + cost_vec.release (); > + } > + > return !vinfo->slp_instances.is_empty (); > } > > @@ -3047,31 +3155,36 @@ vect_bb_slp_scalar_cost (vec_info *vinfo, > if ((*life)[i]) > continue; > > + stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); > + gimple *orig_stmt = orig_stmt_info->stmt; > + > /* If there is a non-vectorized use of the defs then the scalar > stmt is kept live in which case we do not account it or any > required defs in the SLP children in the scalar cost. This > way we make the vectorization more costly when compared to > the scalar cost. */ > - stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); > - gimple *orig_stmt = orig_stmt_info->stmt; > - FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF) > + if (!STMT_VINFO_LIVE_P (stmt_info)) > { > - imm_use_iterator use_iter; > - gimple *use_stmt; > - FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) > - if (!is_gimple_debug (use_stmt)) > - { > - stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt); > - if (!use_stmt_info > - || !PURE_SLP_STMT (vect_stmt_to_vectorize > (use_stmt_info))) > + FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF) > + { > + imm_use_iterator use_iter; > + gimple *use_stmt; > + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) > + if (!is_gimple_debug (use_stmt)) > { > - (*life)[i] = true; > - BREAK_FROM_IMM_USE_STMT (use_iter); > + stmt_vec_info use_stmt_info = vinfo->lookup_stmt > (use_stmt); > + if (!use_stmt_info > + || !PURE_SLP_STMT > + (vect_stmt_to_vectorize (use_stmt_info))) > + { > + (*life)[i] 
= true; > + BREAK_FROM_IMM_USE_STMT (use_iter); > + } > } > - } > + } > + if ((*life)[i]) > + continue; > } > - if ((*life)[i]) > - continue; > > /* Count scalar stmts only once. */ > if (gimple_visited_p (orig_stmt)) > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c > index 7e072a2e636..ea09cc08780 100644 > --- a/gcc/tree-vect-stmts.c > +++ b/gcc/tree-vect-stmts.c > @@ -10532,7 +10532,7 @@ vectorizable_comparison (vec_info *vinfo, > GSI and VEC_STMT_P are as for vectorizable_live_operation. */ > > static bool > -can_vectorize_live_stmts (loop_vec_info loop_vinfo, > +can_vectorize_live_stmts (vec_info *vinfo, > stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, > slp_tree slp_node, slp_instance slp_node_instance, > bool vec_stmt_p, > @@ -10545,7 +10545,7 @@ can_vectorize_live_stmts (loop_vec_info loop_vinfo, > FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info) > { > if (STMT_VINFO_LIVE_P (slp_stmt_info) > - && !vectorizable_live_operation (loop_vinfo, > + && !vectorizable_live_operation (vinfo, > slp_stmt_info, gsi, slp_node, > slp_node_instance, i, > vec_stmt_p, cost_vec)) > @@ -10553,7 +10553,7 @@ can_vectorize_live_stmts (loop_vec_info loop_vinfo, > } > } > else if (STMT_VINFO_LIVE_P (stmt_info) > - && !vectorizable_live_operation (loop_vinfo, stmt_info, gsi, > + && !vectorizable_live_operation (vinfo, stmt_info, gsi, > slp_node, slp_node_instance, -1, > vec_stmt_p, cost_vec)) > return false; > @@ -10950,10 +10950,8 @@ vect_transform_stmt (vec_info *vinfo, > > /* Handle stmts whose DEF is used outside the loop-nest that is > being vectorized. 
*/ > - if (is_a <loop_vec_info> (vinfo)) > - done = can_vectorize_live_stmts (as_a <loop_vec_info> (vinfo), > - stmt_info, gsi, slp_node, > - slp_node_instance, true, NULL); > + done = can_vectorize_live_stmts (vinfo, stmt_info, gsi, slp_node, > + slp_node_instance, true, NULL); > gcc_assert (done); > > return false; > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index f36e2ad9626..6a560ae8d19 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -1995,7 +1995,7 @@ extern stmt_vec_info info_for_reduction (vec_info *, > stmt_vec_info); > extern class loop *vect_transform_loop (loop_vec_info, gimple *); > extern opt_loop_vec_info vect_analyze_loop_form (class loop *, > vec_info_shared *); > -extern bool vectorizable_live_operation (loop_vec_info, > +extern bool vectorizable_live_operation (vec_info *, > stmt_vec_info, gimple_stmt_iterator > *, > slp_tree, slp_instance, int, > bool, stmt_vector_for_cost *); > -- > 2.26.2