On Wed, Jun 26, 2024 at 4:48 PM Feng Xue OS <f...@os.amperecomputing.com> wrote: > > Updated the patches based on comments. > > The input vectype of reduction PHI statement must be determined before > vect cost computation for the reduction. Since lance-reducing operation has > different input vectype from normal one, so we need to traverse all reduction > statements to find out the input vectype with the least lanes, and set that to > the PHI statement.
OK > --- > gcc/tree-vect-loop.cc | 79 ++++++++++++++++++++++++++++++------------- > 1 file changed, 56 insertions(+), 23 deletions(-) > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index 347dac97e49..419f4b08d2b 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > { > stmt_vec_info def = loop_vinfo->lookup_def (reduc_def); > stmt_vec_info vdef = vect_stmt_to_vectorize (def); > - if (STMT_VINFO_REDUC_IDX (vdef) == -1) > + int reduc_idx = STMT_VINFO_REDUC_IDX (vdef); > + > + if (reduc_idx == -1) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > @@ -7686,10 +7688,57 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > return false; > } > } > - else if (!stmt_info) > - /* First non-conversion stmt. */ > - stmt_info = vdef; > - reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)]; > + else > + { > + /* First non-conversion stmt. */ > + if (!stmt_info) > + stmt_info = vdef; > + > + if (lane_reducing_op_p (op.code)) > + { > + enum vect_def_type dt; > + tree vectype_op; > + > + /* The last operand of lane-reducing operation is for > + reduction. */ > + gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1); > + > + if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, > &vectype_op)) > + return false; > + > + tree type_op = TREE_TYPE (op.ops[0]); > + > + if (!vectype_op) > + { > + vectype_op = get_vectype_for_scalar_type (loop_vinfo, > + type_op); > + if (!vectype_op) > + return false; > + } > + > + /* For lane-reducing operation vectorizable analysis needs the > + reduction PHI information */ > + STMT_VINFO_REDUC_DEF (def) = phi_info; > + > + /* Each lane-reducing operation has its own input vectype, while > + reduction PHI will record the input vectype with the least > + lanes. */ > + STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op; > + > + /* To accommodate lane-reducing operations of mixed input > + vectypes, choose input vectype with the least lanes for the > + reduction PHI statement, which would result in the most > + ncopies for vectorized reduction results. */ > + if (!vectype_in > + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE > (vectype_in))) > + < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op)))) > + vectype_in = vectype_op; > + } > + else > + vectype_in = STMT_VINFO_VECTYPE (phi_info); > + } > + > + reduc_def = op.ops[reduc_idx]; > reduc_chain_length++; > if (!stmt_info && slp_node) > slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; > @@ -7747,6 +7796,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > > tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); > STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; > + STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; > + > gimple_match_op op; > if (!gimple_extract_op (stmt_info->stmt, &op)) > gcc_unreachable (); > @@ -7831,16 +7882,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > = get_vectype_for_scalar_type (loop_vinfo, > TREE_TYPE (op.ops[i]), slp_op[i]); > > - /* To properly compute ncopies we are interested in the widest > - non-reduction input type in case we're looking at a widening > - accumulation that we later handle in vect_transform_reduction. */ > - if (lane_reducing > - && vectype_op[i] > - && (!vectype_in > - || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) > - < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE > (vectype_op[i])))))) > - vectype_in = vectype_op[i]; > - > /* Record how the non-reduction-def value of COND_EXPR is defined. > ??? For a chain of multiple CONDs we'd have to match them up all. > */ > if (op.code == COND_EXPR && reduc_chain_length == 1) > @@ -7859,14 +7900,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > } > } > } > - if (!vectype_in) > - vectype_in = STMT_VINFO_VECTYPE (phi_info); > - STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; > - > - /* Each lane-reducing operation has its own input vectype, while reduction > - PHI records the input vectype with least lanes. */ > - if (lane_reducing) > - STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in; > > enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info); > STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type; > -- > 2.17.1 > > > ________________________________________ > From: Feng Xue OS <f...@os.amperecomputing.com> > Sent: Thursday, June 20, 2024 1:47 PM > To: Richard Biener > Cc: gcc-patches@gcc.gnu.org > Subject: Re: [PATCH 4/8] vect: Determine input vectype for multiple > lane-reducing > > >> + if (lane_reducing_op_p (op.code)) > >> + { > >> + unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : > >> 0; > >> + tree op_type = TREE_TYPE (op.ops[0]); > >> + tree new_vectype_in = get_vectype_for_scalar_type > >> (loop_vinfo, > >> + op_type, > >> + > >> group_size); > > > > I think doing it this way does not adhere to the vector type size constraint > > with loop vectorization. You should use vect_is_simple_use like the > > original code did as the actual vector definition determines the vector type > > used. > > OK, though this might be wordy. > > Actually, STMT_VINFO_REDUC_VECTYPE_IN is logically equivalent to > nunits_vectype > that is determined in vect_determine_vf_for_stmt_1(). So how about setting > the type > in this function? > > > > > You are always using op.ops[0] here - I think that works because > > reduc_idx is the last operand of all lane-reducing ops. But then > > we should assert reduc_idx != 0 here and add a comment. > > Already added in the following assertion. > > >> + > >> + /* The last operand of lane-reducing operation is for > >> + reduction. */ > >> + gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - > >> 1); > > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > >> + > >> + /* For lane-reducing operation vectorizable analysis needs > >> the > >> + reduction PHI information */ > >> + STMT_VINFO_REDUC_DEF (def) = phi_info; > >> + > >> + if (!new_vectype_in) > >> + return false; > >> + > >> + /* Each lane-reducing operation has its own input vectype, > >> while > >> + reduction PHI will record the input vectype with the least > >> + lanes. */ > >> + STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in; > >> + > >> + /* To accommodate lane-reducing operations of mixed input > >> + vectypes, choose input vectype with the least lanes for > >> the > >> + reduction PHI statement, which would result in the most > >> + ncopies for vectorized reduction results. */ > >> + if (!vectype_in > >> + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE > >> (vectype_in))) > >> + < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type)))) > >> + vectype_in = new_vectype_in; > > > > I know this is a fragile area but I always wonder since the accumulating > > operand > > is the largest (all lane-reducing ops are widening), and that will be > > equal to the > > type of the PHI node, how this condition can be ever true. > > In the original code, accumulating operand is skipped! While it is correctly, > we > should not count the operand, this is why we call operation lane-reducing. > > > > > ncopies is determined by the VF, so the comment is at least misleading. > > > >> + } > >> + else > >> + vectype_in = STMT_VINFO_VECTYPE (phi_info); > > > > Please initialize vectype_in from phi_info before the loop (that > > should never be NULL). > > > > May not, as the below explanation. > > > I'll note that with your patch it seems we'd initialize vectype_in to > > the biggest > > non-accumulation vector type involved in lane-reducing ops but the > > accumulating > > type might still be larger. Why, when we have multiple lane-reducing > > ops, would > > we chose the largest input here? I see we eventually do > > > > if (slp_node) > > ncopies = 1; > > else > > ncopies = vect_get_num_copies (loop_vinfo, vectype_in); > > > > but then IIRC we always force a single cycle def for lane-reducing ops(?). > > > > In particular for vect_transform_reduction and SLP we rely on > > SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses > > STMT_VINFO_REDUC_VECTYPE_IN. > > > > So I wonder what breaks when we set vectype_in = vector type of PHI? > > > > Yes. It is right, nothing is broken. Suppose that a loop contains three > dot_prods, > two are <16 * char>, one is <8 * short>, and choose <4 * int> as vectype_in: > > With the patch #7, we get: > > vector<4> int sum_v0 = { 0, 0, 0, 0 }; > vector<4> int sum_v1 = { 0, 0, 0, 0 }; > vector<4> int sum_v2 = { 0, 0, 0, 0 }; > vector<4> int sum_v3 = { 0, 0, 0, 0 }; > > loop () { > sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0); > > sum_v0 = dot_prod<16 * char>(char_b0, char_b1, sum_v0); > > sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0); > sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1); > > sum_v2 = sum_v2; > sum_v3 = sum_v3; > } > > The def/use cycles (sum_v2 and sum_v3> would be optimized away finally. > Then this gets same result as setting vectype_in to <8 * short>. > > With the patch #8, we get: > > vector<4> int sum_v0 = { 0, 0, 0, 0 }; > vector<4> int sum_v1 = { 0, 0, 0, 0 }; > vector<4> int sum_v2 = { 0, 0, 0, 0 }; > vector<4> int sum_v3 = { 0, 0, 0, 0 }; > > loop () { > sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0); > > sum_v1 = dot_prod<16 * char>(char_b0, char_b1, sum_v1); > > sum_v2 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v2); > sum_v3 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v3); > } > > All dot_prods are assigned to separate def/use cycles, and no > dependency. More def/use cycles, higher instruction parallelism, > but there need extra cost in epilogue to combine the result. > > So we consider a somewhat compact def/use layout similar to > single-defuse-cycle, in which two <16 * char> dot_prods are independent, > and cycle 2 and 3 are not used, and this is better than the 1st scheme. > > vector<4> int sum_v0 = { 0, 0, 0, 0 }; > vector<4> int sum_v1 = { 0, 0, 0, 0 }; > > loop () { > sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0); > > sum_v1 = dot_prod<16 * char>(char_b0, char_b1, sum_v1); > > sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0); > sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1); > } > > For this purpose, we need to track the vectype_in that results in > the most ncopies, for this case, the type is <8 * short>. > > BTW: would you please also take a look at patch #7 and #8? > > Thanks, > Feng > > ________________________________________ > From: Richard Biener <richard.guent...@gmail.com> > Sent: Wednesday, June 19, 2024 9:01 PM > To: Feng Xue OS > Cc: gcc-patches@gcc.gnu.org > Subject: Re: [PATCH 4/8] vect: Determine input vectype for multiple > lane-reducing > > On Sun, Jun 16, 2024 at 9:25 AM Feng Xue OS <f...@os.amperecomputing.com> > wrote: > > > > The input vectype of reduction PHI statement must be determined before > > vect cost computation for the reduction. Since lance-reducing operation has > > different input vectype from normal one, so we need to traverse all > > reduction > > statements to find out the input vectype with the least lanes, and set that > > to > > the PHI statement. > > > > Thanks, > > Feng > > > > --- > > gcc/ > > * tree-vect-loop.cc (vectorizable_reduction): Determine input > > vectype > > during traversal of reduction statements. > > --- > > gcc/tree-vect-loop.cc | 72 +++++++++++++++++++++++++++++-------------- > > 1 file changed, 49 insertions(+), 23 deletions(-) > > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > > index 0f7b125e72d..39aa5cb1197 100644 > > --- a/gcc/tree-vect-loop.cc > > +++ b/gcc/tree-vect-loop.cc > > @@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > > { > > stmt_vec_info def = loop_vinfo->lookup_def (reduc_def); > > stmt_vec_info vdef = vect_stmt_to_vectorize (def); > > - if (STMT_VINFO_REDUC_IDX (vdef) == -1) > > + int reduc_idx = STMT_VINFO_REDUC_IDX (vdef); > > + > > + if (reduc_idx == -1) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > @@ -7686,10 +7688,50 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > > return false; > > } > > } > > - else if (!stmt_info) > > - /* First non-conversion stmt. */ > > - stmt_info = vdef; > > - reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)]; > > + else > > + { > > + /* First non-conversion stmt. */ > > + if (!stmt_info) > > + stmt_info = vdef; > > + > > + if (lane_reducing_op_p (op.code)) > > + { > > + unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : > > 0; > > + tree op_type = TREE_TYPE (op.ops[0]); > > + tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo, > > + op_type, > > + > > group_size); > > I think doing it this way does not adhere to the vector type size constraint > with loop vectorization. You should use vect_is_simple_use like the > original code did as the actual vector definition determines the vector type > used. > > You are always using op.ops[0] here - I think that works because > reduc_idx is the last operand of all lane-reducing ops. But then > we should assert reduc_idx != 0 here and add a comment. > > > + > > + /* The last operand of lane-reducing operation is for > > + reduction. */ > > + gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - > > 1); > > + > > + /* For lane-reducing operation vectorizable analysis needs the > > + reduction PHI information */ > > + STMT_VINFO_REDUC_DEF (def) = phi_info; > > + > > + if (!new_vectype_in) > > + return false; > > + > > + /* Each lane-reducing operation has its own input vectype, > > while > > + reduction PHI will record the input vectype with the least > > + lanes. */ > > + STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in; > > + > > + /* To accommodate lane-reducing operations of mixed input > > + vectypes, choose input vectype with the least lanes for the > > + reduction PHI statement, which would result in the most > > + ncopies for vectorized reduction results. */ > > + if (!vectype_in > > + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE > > (vectype_in))) > > + < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type)))) > > + vectype_in = new_vectype_in; > > I know this is a fragile area but I always wonder since the accumulating > operand > is the largest (all lane-reducing ops are widening), and that will be > equal to the > type of the PHI node, how this condition can be ever true. > > ncopies is determined by the VF, so the comment is at least misleading. > > > + } > > + else > > + vectype_in = STMT_VINFO_VECTYPE (phi_info); > > Please initialize vectype_in from phi_info before the loop (that > should never be NULL). > > I'll note that with your patch it seems we'd initialize vectype_in to > the biggest > non-accumulation vector type involved in lane-reducing ops but the > accumulating > type might still be larger. Why, when we have multiple lane-reducing > ops, would > we chose the largest input here? I see we eventually do > > if (slp_node) > ncopies = 1; > else > ncopies = vect_get_num_copies (loop_vinfo, vectype_in); > > but then IIRC we always force a single cycle def for lane-reducing ops(?). > In particular for vect_transform_reduction and SLP we rely on > SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses > STMT_VINFO_REDUC_VECTYPE_IN. > > So I wonder what breaks when we set vectype_in = vector type of PHI? > > Richard. > > > + } > > + > > + reduc_def = op.ops[reduc_idx]; > > reduc_chain_length++; > > if (!stmt_info && slp_node) > > slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; > > @@ -7747,6 +7789,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > > > > tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); > > STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; > > + STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; > > + > > gimple_match_op op; > > if (!gimple_extract_op (stmt_info->stmt, &op)) > > gcc_unreachable (); > > @@ -7831,16 +7875,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > > = get_vectype_for_scalar_type (loop_vinfo, > > TREE_TYPE (op.ops[i]), slp_op[i]); > > > > - /* To properly compute ncopies we are interested in the widest > > - non-reduction input type in case we're looking at a widening > > - accumulation that we later handle in vect_transform_reduction. */ > > - if (lane_reducing > > - && vectype_op[i] > > - && (!vectype_in > > - || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) > > - < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE > > (vectype_op[i])))))) > > - vectype_in = vectype_op[i]; > > - > > /* Record how the non-reduction-def value of COND_EXPR is defined. > > ??? For a chain of multiple CONDs we'd have to match them up all. > > */ > > if (op.code == COND_EXPR && reduc_chain_length == 1) > > @@ -7859,14 +7893,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > > } > > } > > } > > - if (!vectype_in) > > - vectype_in = STMT_VINFO_VECTYPE (phi_info); > > - STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; > > - > > - /* Each lane-reducing operation has its own input vectype, while > > reduction > > - PHI records the input vectype with least lanes. */ > > - if (lane_reducing) > > - STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in; > > > > enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE > > (phi_info); > > STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type; > > -- > > 2.17.1