On Tue, 8 Sep 2020, Christophe Lyon wrote:

> Hi Richard,
>
> On Fri, 4 Sep 2020 at 15:42, Richard Biener <rguent...@suse.de> wrote:
> >
> > The following adds the capability to code-generate live lanes in
> > basic-block vectorization using lane extracts from vector stmts
> > rather than keeping the original scalar code around for those.
> > This eventually makes previously unprofitable vectorizations
> > profitable (the live scalar code was appropriately costed before,
> > as the lane extracts are now); leaving the cost model aside, this
> > patch doesn't add or remove any basic-block vectorization
> > capabilities.
> >
> > The patch re/ab-uses STMT_VINFO_LIVE_P in basic-block vectorization
> > mode to tell whether a live lane is vectorized or whether it is
> > provided by means of keeping the scalar code live.
> >
> > That said, the patch is a first step towards vectorizing sequences
> > of stmts that do not end up in stores or vector constructors.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> >
> > Any comments?
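For illustration, a minimal sketch of the situation the patch targets, in
the spirit of the new bb-slp-46.c testcase below (the GIMPLE in the comment
is paraphrased, not actual vectorizer output, and the lane numbering
assumes a 4 x 32-bit int vector):

    int a[4], b[4];

    int foo ()
    {
      int tem0 = a[0] + b[0];
      int tem1 = a[1] + b[1];
      int tem2 = a[2] + b[2];
      int tem3 = a[3] + b[3];
      a[0] = tem0;
      a[1] = tem1;
      a[2] = tem2;
      a[3] = tem3;
      return tem3;   /* tem3 is "live": used outside the store group.  */
    }

    /* Previously the SLP vectorizer kept the scalar add for tem3
       around just to feed the return value:

         vect = va + vb;     // vectorized adds
         MEM <a> = vect;     // vectorized stores
         tem3 = a3 + b3;     // scalar code retained for the live use

       With the patch the live lane is instead extracted from the
       vector stmt, roughly:

         vect = va + vb;
         MEM <a> = vect;
         tem3 = BIT_FIELD_REF <vect, 32, 96>;   // extract lane 3

       which lets the cost model drop the retained scalar code.  */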
>
> Yes: this is causing an ICE on arm:

As usual this isn't enough for me to reproduce with a cross.  Can you
open a bugreport with the cc1 command & the configury pasted?

Thanks,
Richard.

> FAIL: gcc.dg/vect/bb-slp-pr92596.c (internal compiler error)
> FAIL: gcc.dg/vect/bb-slp-pr92596.c (test for excess errors)
> Excess errors:
> during GIMPLE pass: slp
> dump file: bb-slp-pr92596.c.173t.slp2
> /gcc/testsuite/gcc.dg/vect/bb-slp-pr92596.c:11:6: internal compiler error: in vect_transform_stmt, at tree-vect-stmts.c:10870
> 0xfa16cc vect_transform_stmt(vec_info*, _stmt_vec_info*, gimple_stmt_iterator*, _slp_tree*, _slp_instance*)
>         /gcc/tree-vect-stmts.c:10870
> 0xfd6954 vect_schedule_slp_instance
>         /gcc/tree-vect-slp.c:4570
> 0xfd684f vect_schedule_slp_instance
>         /gcc/tree-vect-slp.c:4436
> 0xfd684f vect_schedule_slp_instance
>         /gcc/tree-vect-slp.c:4436
> 0xfdeace vect_schedule_slp(vec_info*)
>         /gcc/tree-vect-slp.c:4695
> 0xfe2529 vect_slp_region
>         /gcc/tree-vect-slp.c:3529
> 0xfe33d7 vect_slp_bb(basic_block_def*)
>         /gcc/tree-vect-slp.c:3647
> 0xfe503c execute
>         /gcc/tree-vectorizer.c:1429
>
> Christophe
>
> > Thanks,
> > Richard.
> >
> > 2020-09-04  Richard Biener  <rguent...@suse.de>
> >
> >         * tree-vectorizer.h (vectorizable_live_operation): Adjust.
> >         * tree-vect-loop.c (vectorizable_live_operation): Vectorize
> >         live lanes out of basic-block vectorization nodes.
> >         * tree-vect-slp.c (vect_bb_slp_mark_live_stmts): New function.
> >         (vect_slp_analyze_operations): Analyze live lanes and their
> >         vectorization possibility after the whole SLP graph is final.
> >         (vect_bb_slp_scalar_cost): Adjust for vectorized live lanes.
> >         * tree-vect-stmts.c (can_vectorize_live_stmts): Adjust.
> >         (vect_transform_stmt): Call can_vectorize_live_stmts also for
> >         basic-block vectorization.
> >
> >         * gcc.dg/vect/bb-slp-46.c: New testcase.
> >         * gcc.dg/vect/bb-slp-47.c: Likewise.
> >         * gcc.dg/vect/bb-slp-32.c: Adjust.
> > ---
> >  gcc/testsuite/gcc.dg/vect/bb-slp-32.c |   7 +-
> >  gcc/testsuite/gcc.dg/vect/bb-slp-46.c |  28 +++
> >  gcc/testsuite/gcc.dg/vect/bb-slp-47.c |  14 ++
> >  gcc/tree-vect-loop.c                  | 243 ++++++++++++++++----------
> >  gcc/tree-vect-slp.c                   | 145 +++++++++++++--
> >  gcc/tree-vect-stmts.c                 |  12 +-
> >  gcc/tree-vectorizer.h                 |   2 +-
> >  7 files changed, 332 insertions(+), 119 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-46.c
> >  create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-47.c
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> > index 41bbf352156..020b6365e02 100644
> > --- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> > @@ -7,16 +7,21 @@ int foo (int *p, int a, int b)
> >  {
> >    int x[4];
> >    int tem0, tem1, tem2, tem3;
> > +  int sum = 0;
> >    tem0 = p[0] + 1 + a;
> > +  sum += tem0;
> >    x[0] = tem0;
> >    tem1 = p[1] + 2 + b;
> > +  sum += tem1;
> >    x[1] = tem1;
> >    tem2 = p[2] + 3 + b;
> > +  sum += tem2;
> >    x[2] = tem2;
> >    tem3 = p[3] + 4 + a;
> > +  sum += tem3;
> >    x[3] = tem3;
> >    bar (x);
> > -  return tem0 + tem1 + tem2 + tem3;
> > +  return sum;
> >  }
> >
> >  /* { dg-final { scan-tree-dump "vectorization is not profitable" "slp2" { xfail { vect_no_align && { ! vect_hw_misalign } } } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-46.c b/gcc/testsuite/gcc.dg/vect/bb-slp-46.c
> > new file mode 100644
> > index 00000000000..4e4571ef640
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-46.c
> > @@ -0,0 +1,28 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target vect_int } */
> > +/* { dg-additional-options "-fdump-tree-optimized" } */
> > +
> > +int a[4], b[4];
> > +int foo ()
> > +{
> > +  int tem0 = a[0] + b[0];
> > +  int temx = tem0 * 17;  /* this fails without a real need */
> > +  int tem1 = a[1] + b[1];
> > +  int tem2 = a[2] + b[2];
> > +  int tem3 = a[3] + b[3];
> > +  int temy = tem3 * 13;
> > +  a[0] = tem0;
> > +  a[1] = tem1;
> > +  a[2] = tem2;
> > +  a[3] = tem3;
> > +  return temx + temy;
> > +}
> > +
> > +/* We should extract the live lane from the vectorized add rather than
> > +   keeping the original scalar add.
> > +   ???  Because of a too conservative check we fail for temx here.  */
> > +/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" } } */
> > +/* { dg-final { scan-tree-dump "extracting lane for live stmt" "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times "extracting lane for live stmt" 2 "slp2" { xfail *-*-* } } } */
> > +/* { dg-final { scan-tree-dump-times " \\+ " 3 "optimized" } } */
> > +/* { dg-final { scan-tree-dump-times " \\+ " 2 "optimized" { xfail *-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-47.c b/gcc/testsuite/gcc.dg/vect/bb-slp-47.c
> > new file mode 100644
> > index 00000000000..9583b09cfbd
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-47.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile } */
> > +
> > +int bar();
> > +int foo (int *a, int b, int c)
> > +{
> > +  int tem0 = bar ();
> > +  int tem1 = tem0 + b;
> > +  int tem3 = tem1 + c;
> > +  a[0] = tem3;
> > +  a[1] = tem3 + 1;
> > +  a[2] = tem3 + 2;
> > +  a[3] = tem3 + 3;
> > +  return tem1;
> > +}
> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> > index 362cdc4f1cb..2e4ef37d956 100644
> > --- a/gcc/tree-vect-loop.c
> > +++ b/gcc/tree-vect-loop.c
> > @@ -8019,14 +8019,14 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> >     it can be supported.  */
> >
> >  bool
> > -vectorizable_live_operation (loop_vec_info loop_vinfo,
> > +vectorizable_live_operation (vec_info *vinfo,
> >                              stmt_vec_info stmt_info,
> >                              gimple_stmt_iterator *gsi,
> >                              slp_tree slp_node, slp_instance slp_node_instance,
> >                              int slp_index, bool vec_stmt_p,
> > -                            stmt_vector_for_cost *)
> > +                            stmt_vector_for_cost *cost_vec)
> >  {
> > -  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > +  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
> >    imm_use_iterator imm_iter;
> >    tree lhs, lhs_type, bitsize, vec_bitsize;
> >    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > @@ -8071,10 +8071,6 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
> >        return true;
> >      }
> >
> > -  /* FORNOW. CHECKME. */
> > -  if (nested_in_vect_loop_p (loop, stmt_info))
> > -    return false;
> > -
> >    /* If STMT is not relevant and it is a simple assignment and its inputs are
> >       invariant then it can remain in place, unvectorized.  The original last
> >       scalar value that it computes will be used.  */
> > @@ -8097,12 +8093,11 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
> >      {
> >        gcc_assert (slp_index >= 0);
> >
> > -      int num_scalar = SLP_TREE_LANES (slp_node);
> > -      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> > -
> >        /* Get the last occurrence of the scalar index from the concatenation of
> >          all the slp vectors. Calculate which slp vector it is and the index
> >          within.  */
> > +      int num_scalar = SLP_TREE_LANES (slp_node);
> > +      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> >        poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
> >
> >        /* Calculate which vector contains the result, and which lane of
> > @@ -8120,7 +8115,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
> >    if (!vec_stmt_p)
> >      {
> >        /* No transformation required.  */
> > -      if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> > +      if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> >         {
> >           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> >                                                OPTIMIZE_FOR_SPEED))
> > @@ -8157,14 +8152,20 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
> >                                     1, vectype, NULL);
> >             }
> >         }
> > +      /* ???  Enable for loop costing as well.  */
> > +      if (!loop_vinfo)
> > +       record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
> > +                         0, vect_epilogue);
> >        return true;
> >      }
> >
> >    /* Use the lhs of the original scalar statement.  */
> >    gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
> > +  if (dump_enabled_p ())
> > +    dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
> > +                    "stmt %G", stmt);
> >
> > -  lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
> > -                              : gimple_get_lhs (stmt);
> > +  lhs = gimple_get_lhs (stmt);
> >    lhs_type = TREE_TYPE (lhs);
> >
> >    bitsize = vector_element_bits_tree (vectype);
> > @@ -8172,16 +8173,14 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
> >
> >    /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
> >    tree vec_lhs, bitstart;
> > +  gimple *vec_stmt;
> >    if (slp_node)
> >      {
> > -      gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > +      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> >
> >        /* Get the correct slp vectorized stmt.  */
> > -      gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
> > -      if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
> > -       vec_lhs = gimple_phi_result (phi);
> > -      else
> > -       vec_lhs = gimple_get_lhs (vec_stmt);
> > +      vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
> > +      vec_lhs = gimple_get_lhs (vec_stmt);
> >
> >        /* Get entry to use.  */
> >        bitstart = bitsize_int (vec_index);
> > @@ -8190,102 +8189,158 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
> >    else
> >      {
> >        /* For multiple copies, get the last copy.  */
> > -      vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
> > +      vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
> > +      vec_lhs = gimple_get_lhs (vec_stmt);
> >
> >        /* Get the last lane in the vector.  */
> >        bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
> >      }
> >
> > -  /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
> > -     requirement, insert one phi node for it.  It looks like:
> > -     loop;
> > -     BB:
> > -     # lhs' = PHI <lhs>
> > -     ==>
> > -     loop;
> > -     BB:
> > -     # vec_lhs' = PHI <vec_lhs>
> > -     new_tree = lane_extract <vec_lhs', ...>;
> > -     lhs' = new_tree;  */
> > +  if (loop_vinfo)
> > +    {
> > +      /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
> > +        requirement, insert one phi node for it.  It looks like:
> > +          loop;
> > +          BB:
> > +          # lhs' = PHI <lhs>
> > +        ==>
> > +          loop;
> > +          BB:
> > +          # vec_lhs' = PHI <vec_lhs>
> > +          new_tree = lane_extract <vec_lhs', ...>;
> > +          lhs' = new_tree;  */
> > +
> > +      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > +      basic_block exit_bb = single_exit (loop)->dest;
> > +      gcc_assert (single_pred_p (exit_bb));
> > +
> > +      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> > +      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> > +      SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
> > +
> > +      gimple_seq stmts = NULL;
> > +      tree new_tree;
> > +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > +       {
> > +         /* Emit:
> >
> > -  basic_block exit_bb = single_exit (loop)->dest;
> > -  gcc_assert (single_pred_p (exit_bb));
> > +              SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> >
> > -  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> > -  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> > -  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
> > +            where VEC_LHS is the vectorized live-out result and MASK is
> > +            the loop mask for the final iteration.  */
> > +         gcc_assert (ncopies == 1 && !slp_node);
> > +         tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> > +         tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
> > +                                         1, vectype, 0);
> > +         tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > +                                         mask, vec_lhs_phi);
> >
> > -  gimple_seq stmts = NULL;
> > -  tree new_tree;
> > -  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > -    {
> > -      /* Emit:
> > +         /* Convert the extracted vector element to the scalar type.  */
> > +         new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > +       }
> > +      else
> > +       {
> > +         tree bftype = TREE_TYPE (vectype);
> > +         if (VECTOR_BOOLEAN_TYPE_P (vectype))
> > +           bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
> > +         new_tree = build3 (BIT_FIELD_REF, bftype,
> > +                            vec_lhs_phi, bitsize, bitstart);
> > +         new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
> > +                                          &stmts, true, NULL_TREE);
> > +       }
> >
> > -         SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> > +      if (stmts)
> > +       {
> > +         gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
> > +         gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> >
> > -        where VEC_LHS is the vectorized live-out result and MASK is
> > -        the loop mask for the final iteration.  */
> > -      gcc_assert (ncopies == 1 && !slp_node);
> > -      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> > -      tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
> > -                                     vectype, 0);
> > -      tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > -                                     mask, vec_lhs_phi);
> > +         /* Remove existing phi from lhs and create one copy from new_tree.  */
> > +         tree lhs_phi = NULL_TREE;
> > +         gimple_stmt_iterator gsi;
> > +         for (gsi = gsi_start_phis (exit_bb);
> > +              !gsi_end_p (gsi); gsi_next (&gsi))
> > +           {
> > +             gimple *phi = gsi_stmt (gsi);
> > +             if ((gimple_phi_arg_def (phi, 0) == lhs))
> > +               {
> > +                 remove_phi_node (&gsi, false);
> > +                 lhs_phi = gimple_phi_result (phi);
> > +                 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> > +                 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> > +                 break;
> > +               }
> > +           }
> > +       }
> >
> > -      /* Convert the extracted vector element to the required scalar type.  */
> > -      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > +      /* Replace use of lhs with newly computed result.  If the use stmt is a
> > +        single arg PHI, just replace all uses of PHI result.  It's necessary
> > +        because lcssa PHI defining lhs may be before newly inserted stmt.  */
> > +      use_operand_p use_p;
> > +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> > +       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
> > +           && !is_gimple_debug (use_stmt))
> > +         {
> > +           if (gimple_code (use_stmt) == GIMPLE_PHI
> > +               && gimple_phi_num_args (use_stmt) == 1)
> > +             {
> > +               replace_uses_by (gimple_phi_result (use_stmt), new_tree);
> > +             }
> > +           else
> > +             {
> > +               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> > +                 SET_USE (use_p, new_tree);
> > +             }
> > +           update_stmt (use_stmt);
> > +         }
> >      }
> >    else
> >      {
> > +      /* For basic-block vectorization simply insert the lane-extraction.  */
> >        tree bftype = TREE_TYPE (vectype);
> >        if (VECTOR_BOOLEAN_TYPE_P (vectype))
> >         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
> > -      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
> > +      tree new_tree = build3 (BIT_FIELD_REF, bftype,
> > +                              vec_lhs, bitsize, bitstart);
> > +      gimple_seq stmts = NULL;
> >        new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
> >                                        &stmts, true, NULL_TREE);
> > -    }
> >
> > -  if (stmts)
> > -    {
> > -      gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
> > -      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> > +      gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> >
> > -      /* Remove existing phi from lhs and create one copy from new_tree.  */
> > -      tree lhs_phi = NULL_TREE;
> > -      gimple_stmt_iterator gsi;
> > -      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
> > -       {
> > -         gimple *phi = gsi_stmt (gsi);
> > -         if ((gimple_phi_arg_def (phi, 0) == lhs))
> > -           {
> > -             remove_phi_node (&gsi, false);
> > -             lhs_phi = gimple_phi_result (phi);
> > -             gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> > -             gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> > -             break;
> > -           }
> > -       }
> > -    }
> > -
> > -  /* Replace use of lhs with newly computed result.  If the use stmt is a
> > -     single arg PHI, just replace all uses of PHI result.  It's necessary
> > -     because lcssa PHI defining lhs may be before newly inserted stmt.  */
> > -  use_operand_p use_p;
> > -  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> > -    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
> > -       && !is_gimple_debug (use_stmt))
> > -      {
> > -       if (gimple_code (use_stmt) == GIMPLE_PHI
> > -           && gimple_phi_num_args (use_stmt) == 1)
> > -         {
> > -           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
> > -         }
> > -       else
> > -         {
> > -           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> > -             SET_USE (use_p, new_tree);
> > -         }
> > -       update_stmt (use_stmt);
> > +      /* Replace use of lhs with newly computed result.  If the use stmt is a
> > +        single arg PHI, just replace all uses of PHI result.  It's necessary
> > +        because lcssa PHI defining lhs may be before newly inserted stmt.  */
> > +      use_operand_p use_p;
> > +      stmt_vec_info use_stmt_info;
> > +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> > +       if (!is_gimple_debug (use_stmt)
> > +           && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
> > +               || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
> > +         {
> > +           /* ???  This can happen when the live lane ends up being
> > +              used in a vector construction code-generated by an
> > +              external SLP node (and code-generation for that already
> > +              happened).  See gcc.dg/vect/bb-slp-47.c.
> > +              Doing this is what would happen if that vector CTOR
> > +              were not code-generated yet so it is not too bad.
> > +              ???  In fact we'd likely want to avoid this situation
> > +              in the first place.  */
> > +           if (gimple_code (use_stmt) != GIMPLE_PHI
> > +               && !vect_stmt_dominates_stmt_p (gsi_stmt (*gsi), use_stmt))
> > +             {
> > +               gcc_assert (is_gimple_assign (use_stmt)
> > +                           && gimple_assign_rhs_code (use_stmt) == CONSTRUCTOR);
> > +               if (dump_enabled_p ())
> > +                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                                  "Using original scalar computation for "
> > +                                  "live lane because use precedes vector "
> > +                                  "def\n");
> > +               continue;
> > +             }
> > +           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> > +             SET_USE (use_p, new_tree);
> > +           update_stmt (use_stmt);
> > +         }
> >      }
> >
> >    return true;
> > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> > index 15e5f277eac..520a23692f1 100644
> > --- a/gcc/tree-vect-slp.c
> > +++ b/gcc/tree-vect-slp.c
> > @@ -2969,6 +2969,101 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> >  }
> >
> >
> > +/* Mark lanes of NODE that are live outside of the basic-block vectorized
> > +   region and that can be vectorized using vectorizable_live_operation
> > +   with STMT_VINFO_LIVE_P.  Live operations that are not handled will
> > +   cause the scalar code computing them to be retained.  */
> > +
> > +static void
> > +vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
> > +                            slp_instance instance,
> > +                            stmt_vector_for_cost *cost_vec,
> > +                            hash_set<stmt_vec_info> &svisited)
> > +{
> > +  unsigned i;
> > +  stmt_vec_info stmt_info;
> > +  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
> > +  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
> > +    {
> > +      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
> > +      if (svisited.contains (orig_stmt_info))
> > +       continue;
> > +      bool mark_visited = true;
> > +      gimple *orig_stmt = orig_stmt_info->stmt;
> > +      ssa_op_iter op_iter;
> > +      def_operand_p def_p;
> > +      FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
> > +       {
> > +         imm_use_iterator use_iter;
> > +         gimple *use_stmt;
> > +         stmt_vec_info use_stmt_info;
> > +         FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
> > +           if (!is_gimple_debug (use_stmt))
> > +             {
> > +               use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
> > +               if (!use_stmt_info
> > +                   || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
> > +                 {
> > +                   STMT_VINFO_LIVE_P (stmt_info) = true;
> > +                   if (vectorizable_live_operation (bb_vinfo, stmt_info,
> > +                                                    NULL, node, instance, i,
> > +                                                    false, cost_vec))
> > +                     /* ???  So we know we can vectorize the live stmt
> > +                        from one SLP node.  If we cannot do so from all
> > +                        or none consistently we'd have to record which
> > +                        SLP node (and lane) we want to use for the live
> > +                        operation.  So make sure we can code-generate
> > +                        from all nodes.  */
> > +                     mark_visited = false;
> > +                   else
> > +                     STMT_VINFO_LIVE_P (stmt_info) = false;
> > +                   BREAK_FROM_IMM_USE_STMT (use_iter);
> > +                 }
> > +             }
> > +         /* We have to verify whether we can insert the lane extract
> > +            before all uses.  The following is a conservative approximation.
> > +            We cannot put this into vectorizable_live_operation because
> > +            iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
> > +            doesn't work.
> > +            Note that while the fact that we emit code for loads at the
> > +            first load should make this a non-problem, leafs we construct
> > +            from scalars are vectorized after the last scalar def.
> > +            ???  If we'd actually compute the insert location during
> > +            analysis we could use sth less conservative than the last
> > +            scalar stmt in the node for the dominance check.  */
> > +         /* ???  What remains is "live" uses in vector CTORs in the same
> > +            SLP graph which is where those uses can end up code-generated
> > +            right after their definition instead of close to their original
> > +            use.  But that would restrict us to code-generate lane-extracts
> > +            from the latest stmt in a node.  So we compensate for this
> > +            during code-generation, simply not replacing uses for those
> > +            hopefully rare cases.  */
> > +         if (STMT_VINFO_LIVE_P (stmt_info))
> > +           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
> > +             if (!is_gimple_debug (use_stmt)
> > +                 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
> > +                     || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
> > +                 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
> > +               {
> > +                 if (dump_enabled_p ())
> > +                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                                    "Cannot determine insertion place for "
> > +                                    "lane extract\n");
> > +                 STMT_VINFO_LIVE_P (stmt_info) = false;
> > +                 mark_visited = true;
> > +               }
> > +       }
> > +      if (mark_visited)
> > +       svisited.add (orig_stmt_info);
> > +    }
> > +
> > +  slp_tree child;
> > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
> > +    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
> > +      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
> > +                                  cost_vec, svisited);
> > +}
> > +
> >  /* Analyze statements in SLP instances of VINFO.  Return true if the
> >     operations are supported.  */
> >
> > @@ -3018,6 +3113,19 @@ vect_slp_analyze_operations (vec_info *vinfo)
> >         }
> >      }
> >
> > +  /* Compute vectorizable live stmts.  */
> > +  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
> > +    {
> > +      hash_set<stmt_vec_info> svisited;
> > +      stmt_vector_for_cost cost_vec;
> > +      cost_vec.create (2);
> > +      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
> > +       vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
> > +                                    instance, &cost_vec, svisited);
> > +      add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
> > +      cost_vec.release ();
> > +    }
> > +
> >    return !vinfo->slp_instances.is_empty ();
> >  }
> >
> > @@ -3047,31 +3155,36 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
> >        if ((*life)[i])
> >         continue;
> >
> > +      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
> > +      gimple *orig_stmt = orig_stmt_info->stmt;
> > +
> >        /* If there is a non-vectorized use of the defs then the scalar
> >          stmt is kept live in which case we do not account it or any
> >          required defs in the SLP children in the scalar cost.  This
> >          way we make the vectorization more costly when compared to
> >          the scalar cost.  */
> > -      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
> > -      gimple *orig_stmt = orig_stmt_info->stmt;
> > -      FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
> > +      if (!STMT_VINFO_LIVE_P (stmt_info))
> >         {
> > -         imm_use_iterator use_iter;
> > -         gimple *use_stmt;
> > -         FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
> > -           if (!is_gimple_debug (use_stmt))
> > -             {
> > -               stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
> > -               if (!use_stmt_info
> > -                   || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))
> > +         FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
> > +           {
> > +             imm_use_iterator use_iter;
> > +             gimple *use_stmt;
> > +             FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
> > +               if (!is_gimple_debug (use_stmt))
> >                 {
> > -                 (*life)[i] = true;
> > -                 BREAK_FROM_IMM_USE_STMT (use_iter);
> > +                 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
> > +                 if (!use_stmt_info
> > +                     || !PURE_SLP_STMT
> > +                           (vect_stmt_to_vectorize (use_stmt_info)))
> > +                   {
> > +                     (*life)[i] = true;
> > +                     BREAK_FROM_IMM_USE_STMT (use_iter);
> > +                   }
> >                 }
> > -         }
> > +           }
> > +         if ((*life)[i])
> > +           continue;
> >         }
> > -      if ((*life)[i])
> > -       continue;
> >
> >        /* Count scalar stmts only once.  */
> >        if (gimple_visited_p (orig_stmt))
> > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> > index 7e072a2e636..ea09cc08780 100644
> > --- a/gcc/tree-vect-stmts.c
> > +++ b/gcc/tree-vect-stmts.c
> > @@ -10532,7 +10532,7 @@ vectorizable_comparison (vec_info *vinfo,
> >     GSI and VEC_STMT_P are as for vectorizable_live_operation.  */
> >
> >  static bool
> > -can_vectorize_live_stmts (loop_vec_info loop_vinfo,
> > +can_vectorize_live_stmts (vec_info *vinfo,
> >                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> >                           slp_tree slp_node, slp_instance slp_node_instance,
> >                           bool vec_stmt_p,
> > @@ -10545,7 +10545,7 @@ can_vectorize_live_stmts (loop_vec_info loop_vinfo,
> >        FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info)
> >         {
> >           if (STMT_VINFO_LIVE_P (slp_stmt_info)
> > -             && !vectorizable_live_operation (loop_vinfo,
> > +             && !vectorizable_live_operation (vinfo,
> >                                                slp_stmt_info, gsi, slp_node,
> >                                                slp_node_instance, i,
> >                                                vec_stmt_p, cost_vec))
> > @@ -10553,7 +10553,7 @@ can_vectorize_live_stmts (loop_vec_info loop_vinfo,
> >         }
> >      }
> >    else if (STMT_VINFO_LIVE_P (stmt_info)
> > -          && !vectorizable_live_operation (loop_vinfo, stmt_info, gsi,
> > +          && !vectorizable_live_operation (vinfo, stmt_info, gsi,
> >                                             slp_node, slp_node_instance, -1,
> >                                             vec_stmt_p, cost_vec))
> >      return false;
> > @@ -10950,10 +10950,8 @@ vect_transform_stmt (vec_info *vinfo,
> >
> >    /* Handle stmts whose DEF is used outside the loop-nest that is
> >       being vectorized.  */
> > -  if (is_a <loop_vec_info> (vinfo))
> > -    done = can_vectorize_live_stmts (as_a <loop_vec_info> (vinfo),
> > -                                    stmt_info, gsi, slp_node,
> > -                                    slp_node_instance, true, NULL);
> > +  done = can_vectorize_live_stmts (vinfo, stmt_info, gsi, slp_node,
> > +                                  slp_node_instance, true, NULL);
> >    gcc_assert (done);
> >
> >    return false;
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index f36e2ad9626..6a560ae8d19 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -1995,7 +1995,7 @@ extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> >  extern class loop *vect_transform_loop (loop_vec_info, gimple *);
> >  extern opt_loop_vec_info vect_analyze_loop_form (class loop *,
> >                                                  vec_info_shared *);
> > -extern bool vectorizable_live_operation (loop_vec_info,
> > +extern bool vectorizable_live_operation (vec_info *,
> >                                          stmt_vec_info,
> >                                          gimple_stmt_iterator *,
> >                                          slp_tree, slp_instance, int,
> >                                          bool, stmt_vector_for_cost *);
> > --
> > 2.26.2
>

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)