Re: [PATCH 4/8] vect: Determine input vectype for multiple lane-reducing

Richard Biener Fri, 28 Jun 2024 06:00:22 -0700

On Wed, Jun 26, 2024 at 4:48 PM Feng Xue OS <f...@os.amperecomputing.com> wrote:
>
> Updated the patches based on comments.
>
> The input vectype of reduction PHI statement must be determined before
> vect cost computation for the reduction. Since lance-reducing operation has
> different input vectype from normal one, so we need to traverse all reduction
> statements to find out the input vectype with the least lanes, and set that to
> the PHI statement.


OK

> ---
>  gcc/tree-vect-loop.cc | 79 ++++++++++++++++++++++++++++++-------------
>  1 file changed, 56 insertions(+), 23 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 347dac97e49..419f4b08d2b 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>      {
>        stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
>        stmt_vec_info vdef = vect_stmt_to_vectorize (def);
> -      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
> +      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
> +
> +      if (reduc_idx == -1)
>         {
>           if (dump_enabled_p ())
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -7686,10 +7688,57 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>               return false;
>             }
>         }
> -      else if (!stmt_info)
> -       /* First non-conversion stmt.  */
> -       stmt_info = vdef;
> -      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
> +      else
> +       {
> +         /* First non-conversion stmt.  */
> +         if (!stmt_info)
> +           stmt_info = vdef;
> +
> +         if (lane_reducing_op_p (op.code))
> +           {
> +             enum vect_def_type dt;
> +             tree vectype_op;
> +
> +             /* The last operand of lane-reducing operation is for
> +                reduction.  */
> +             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
> +
> +             if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, 
> &vectype_op))
> +               return false;
> +
> +             tree type_op = TREE_TYPE (op.ops[0]);
> +
> +             if (!vectype_op)
> +               {
> +                 vectype_op = get_vectype_for_scalar_type (loop_vinfo,
> +                                                           type_op);
> +                 if (!vectype_op)
> +                   return false;
> +               }
> +
> +             /* For lane-reducing operation vectorizable analysis needs the
> +                reduction PHI information */
> +             STMT_VINFO_REDUC_DEF (def) = phi_info;
> +
> +             /* Each lane-reducing operation has its own input vectype, while
> +                reduction PHI will record the input vectype with the least
> +                lanes.  */
> +             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op;
> +
> +             /* To accommodate lane-reducing operations of mixed input
> +                vectypes, choose input vectype with the least lanes for the
> +                reduction PHI statement, which would result in the most
> +                ncopies for vectorized reduction results.  */
> +             if (!vectype_in
> +                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE 
> (vectype_in)))
> +                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
> +               vectype_in = vectype_op;
> +           }
> +         else
> +           vectype_in = STMT_VINFO_VECTYPE (phi_info);
> +       }
> +
> +      reduc_def = op.ops[reduc_idx];
>        reduc_chain_length++;
>        if (!stmt_info && slp_node)
>         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
> @@ -7747,6 +7796,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>
>    tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
>    STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
> +  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> +
>    gimple_match_op op;
>    if (!gimple_extract_op (stmt_info->stmt, &op))
>      gcc_unreachable ();
> @@ -7831,16 +7882,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>           = get_vectype_for_scalar_type (loop_vinfo,
>                                          TREE_TYPE (op.ops[i]), slp_op[i]);
>
> -      /* To properly compute ncopies we are interested in the widest
> -        non-reduction input type in case we're looking at a widening
> -        accumulation that we later handle in vect_transform_reduction.  */
> -      if (lane_reducing
> -         && vectype_op[i]
> -         && (!vectype_in
> -             || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> -                 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE 
> (vectype_op[i]))))))
> -       vectype_in = vectype_op[i];
> -
>        /* Record how the non-reduction-def value of COND_EXPR is defined.
>          ???  For a chain of multiple CONDs we'd have to match them up all.  
> */
>        if (op.code == COND_EXPR && reduc_chain_length == 1)
> @@ -7859,14 +7900,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>             }
>         }
>      }
> -  if (!vectype_in)
> -    vectype_in = STMT_VINFO_VECTYPE (phi_info);
> -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> -
> -  /* Each lane-reducing operation has its own input vectype, while reduction
> -     PHI records the input vectype with least lanes.  */
> -  if (lane_reducing)
> -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
>
>    enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
>    STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
> --
> 2.17.1
>
>
> ________________________________________
> From: Feng Xue OS <f...@os.amperecomputing.com>
> Sent: Thursday, June 20, 2024 1:47 PM
> To: Richard Biener
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH 4/8] vect: Determine input vectype for multiple 
> lane-reducing
>
> >> +         if (lane_reducing_op_p (op.code))
> >> +           {
> >> +             unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 
> >> 0;
> >> +             tree op_type = TREE_TYPE (op.ops[0]);
> >> +             tree new_vectype_in = get_vectype_for_scalar_type 
> >> (loop_vinfo,
> >> +                                                                op_type,
> >> +                                                                
> >> group_size);
> >
> > I think doing it this way does not adhere to the vector type size constraint
> > with loop vectorization.  You should use vect_is_simple_use like the
> > original code did as the actual vector definition determines the vector type
> > used.
>
> OK, though this might be wordy.
>
> Actually, STMT_VINFO_REDUC_VECTYPE_IN is logically equivalent to 
> nunits_vectype
> that is determined in vect_determine_vf_for_stmt_1(). So how about setting 
> the type
> in this function?
>
> >
> > You are always using op.ops[0] here - I think that works because
> > reduc_idx is the last operand of all lane-reducing ops.  But then
> > we should assert reduc_idx != 0 here and add a comment.
>
> Already added in the following assertion.
>
> >> +
> >> +             /* The last operand of lane-reducing operation is for
> >> +                reduction.  */
> >> +             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 
> >> 1);
>
>                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >> +
> >> +             /* For lane-reducing operation vectorizable analysis needs 
> >> the
> >> +                reduction PHI information */
> >> +             STMT_VINFO_REDUC_DEF (def) = phi_info;
> >> +
> >> +             if (!new_vectype_in)
> >> +               return false;
> >> +
> >> +             /* Each lane-reducing operation has its own input vectype, 
> >> while
> >> +                reduction PHI will record the input vectype with the least
> >> +                lanes.  */
> >> +             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
> >> +
> >> +             /* To accommodate lane-reducing operations of mixed input
> >> +                vectypes, choose input vectype with the least lanes for 
> >> the
> >> +                reduction PHI statement, which would result in the most
> >> +                ncopies for vectorized reduction results.  */
> >> +             if (!vectype_in
> >> +                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE 
> >> (vectype_in)))
> >> +                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
> >> +               vectype_in = new_vectype_in;
> >
> > I know this is a fragile area but I always wonder since the accumulating 
> > operand
> > is the largest (all lane-reducing ops are widening), and that will be
> > equal to the
> > type of the PHI node, how this condition can be ever true.
>
> In the original code, accumulating operand is skipped! While it is correctly, 
> we
> should not count the operand, this is why we call operation lane-reducing.
>
> >
> > ncopies is determined by the VF, so the comment is at least misleading.
> >
> >> +           }
> >> +         else
> >> +           vectype_in = STMT_VINFO_VECTYPE (phi_info);
> >
> > Please initialize vectype_in from phi_info before the loop (that
> > should never be NULL).
> >
>
> May not, as the below explanation.
>
> > I'll note that with your patch it seems we'd initialize vectype_in to
> > the biggest
> > non-accumulation vector type involved in lane-reducing ops but the 
> > accumulating
> > type might still be larger.   Why, when we have multiple lane-reducing
> > ops, would
> > we chose the largest input here?  I see we eventually do
> >
> >   if (slp_node)
> >     ncopies = 1;
> >   else
> >     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
> >
> > but then IIRC we always force a single cycle def for lane-reducing ops(?).
>
>
> > In particular for vect_transform_reduction and SLP we rely on
> > SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses
> > STMT_VINFO_REDUC_VECTYPE_IN.
> >
> > So I wonder what breaks when we set vectype_in = vector type of PHI?
> >
>
> Yes. It is right, nothing is broken. Suppose that a loop contains three 
> dot_prods,
> two are <16 * char>, one is <8 * short>, and choose <4 * int> as vectype_in:
>
> With the patch #7, we get:
>
>       vector<4> int sum_v0 = { 0, 0, 0, 0 };
>       vector<4> int sum_v1 = { 0, 0, 0, 0 };
>       vector<4> int sum_v2 = { 0, 0, 0, 0 };
>       vector<4> int sum_v3 = { 0, 0, 0, 0 };
>
>       loop () {
>          sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);
>
>          sum_v0 = dot_prod<16 * char>(char_b0, char_b1, sum_v0);
>
>          sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0);
>          sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1);
>
>          sum_v2 = sum_v2;
>          sum_v3 = sum_v3;
>       }
>
> The def/use cycles (sum_v2 and sum_v3> would be optimized away finally.
> Then this gets same result as setting vectype_in to <8 * short>.
>
> With the patch #8, we get:
>
>       vector<4> int sum_v0 = { 0, 0, 0, 0 };
>       vector<4> int sum_v1 = { 0, 0, 0, 0 };
>       vector<4> int sum_v2 = { 0, 0, 0, 0 };
>       vector<4> int sum_v3 = { 0, 0, 0, 0 };
>
>       loop () {
>          sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);
>
>          sum_v1 = dot_prod<16 * char>(char_b0, char_b1, sum_v1);
>
>          sum_v2 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v2);
>          sum_v3 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v3);
>       }
>
> All dot_prods are assigned to separate def/use cycles, and no
> dependency. More def/use cycles, higher instruction parallelism,
> but there need extra cost in epilogue to combine the result.
>
> So we consider a somewhat compact def/use layout similar to
> single-defuse-cycle, in which two <16 * char> dot_prods are independent,
> and cycle 2 and 3 are not used, and this is better than the 1st scheme.
>
>       vector<4> int sum_v0 = { 0, 0, 0, 0 };
>       vector<4> int sum_v1 = { 0, 0, 0, 0 };
>
>       loop () {
>          sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);
>
>          sum_v1 = dot_prod<16 * char>(char_b0, char_b1, sum_v1);
>
>          sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0);
>          sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1);
>       }
>
> For this purpose, we need to track the vectype_in that results in
> the most ncopies, for this case, the type is <8 * short>.
>
> BTW: would you please also take a look at patch #7 and #8?
>
> Thanks,
> Feng
>
> ________________________________________
> From: Richard Biener <richard.guent...@gmail.com>
> Sent: Wednesday, June 19, 2024 9:01 PM
> To: Feng Xue OS
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH 4/8] vect: Determine input vectype for multiple 
> lane-reducing
>
> On Sun, Jun 16, 2024 at 9:25 AM Feng Xue OS <f...@os.amperecomputing.com> 
> wrote:
> >
> > The input vectype of reduction PHI statement must be determined before
> > vect cost computation for the reduction. Since lance-reducing operation has
> > different input vectype from normal one, so we need to traverse all 
> > reduction
> > statements to find out the input vectype with the least lanes, and set that 
> > to
> > the PHI statement.
> >
> > Thanks,
> > Feng
> >
> > ---
> > gcc/
> >         * tree-vect-loop.cc (vectorizable_reduction): Determine input 
> > vectype
> >         during traversal of reduction statements.
> > ---
> >  gcc/tree-vect-loop.cc | 72 +++++++++++++++++++++++++++++--------------
> >  1 file changed, 49 insertions(+), 23 deletions(-)
> >
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 0f7b125e72d..39aa5cb1197 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >      {
> >        stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
> >        stmt_vec_info vdef = vect_stmt_to_vectorize (def);
> > -      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
> > +      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
> > +
> > +      if (reduc_idx == -1)
> >         {
> >           if (dump_enabled_p ())
> >             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -7686,10 +7688,50 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >               return false;
> >             }
> >         }
> > -      else if (!stmt_info)
> > -       /* First non-conversion stmt.  */
> > -       stmt_info = vdef;
> > -      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
> > +      else
> > +       {
> > +         /* First non-conversion stmt.  */
> > +         if (!stmt_info)
> > +           stmt_info = vdef;
> > +
> > +         if (lane_reducing_op_p (op.code))
> > +           {
> > +             unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 
> > 0;
> > +             tree op_type = TREE_TYPE (op.ops[0]);
> > +             tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
> > +                                                                op_type,
> > +                                                                
> > group_size);
>
> I think doing it this way does not adhere to the vector type size constraint
> with loop vectorization.  You should use vect_is_simple_use like the
> original code did as the actual vector definition determines the vector type
> used.
>
> You are always using op.ops[0] here - I think that works because
> reduc_idx is the last operand of all lane-reducing ops.  But then
> we should assert reduc_idx != 0 here and add a comment.
>
> > +
> > +             /* The last operand of lane-reducing operation is for
> > +                reduction.  */
> > +             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 
> > 1);
> > +
> > +             /* For lane-reducing operation vectorizable analysis needs the
> > +                reduction PHI information */
> > +             STMT_VINFO_REDUC_DEF (def) = phi_info;
> > +
> > +             if (!new_vectype_in)
> > +               return false;
> > +
> > +             /* Each lane-reducing operation has its own input vectype, 
> > while
> > +                reduction PHI will record the input vectype with the least
> > +                lanes.  */
> > +             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
> > +
> > +             /* To accommodate lane-reducing operations of mixed input
> > +                vectypes, choose input vectype with the least lanes for the
> > +                reduction PHI statement, which would result in the most
> > +                ncopies for vectorized reduction results.  */
> > +             if (!vectype_in
> > +                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE 
> > (vectype_in)))
> > +                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
> > +               vectype_in = new_vectype_in;
>
> I know this is a fragile area but I always wonder since the accumulating 
> operand
> is the largest (all lane-reducing ops are widening), and that will be
> equal to the
> type of the PHI node, how this condition can be ever true.
>
> ncopies is determined by the VF, so the comment is at least misleading.
>
> > +           }
> > +         else
> > +           vectype_in = STMT_VINFO_VECTYPE (phi_info);
>
> Please initialize vectype_in from phi_info before the loop (that
> should never be NULL).
>
> I'll note that with your patch it seems we'd initialize vectype_in to
> the biggest
> non-accumulation vector type involved in lane-reducing ops but the 
> accumulating
> type might still be larger.   Why, when we have multiple lane-reducing
> ops, would
> we chose the largest input here?  I see we eventually do
>
>   if (slp_node)
>     ncopies = 1;
>   else
>     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
>
> but then IIRC we always force a single cycle def for lane-reducing ops(?).
> In particular for vect_transform_reduction and SLP we rely on
> SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses
> STMT_VINFO_REDUC_VECTYPE_IN.
>
> So I wonder what breaks when we set vectype_in = vector type of PHI?
>
> Richard.
>
> > +       }
> > +
> > +      reduc_def = op.ops[reduc_idx];
> >        reduc_chain_length++;
> >        if (!stmt_info && slp_node)
> >         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
> > @@ -7747,6 +7789,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >
> >    tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
> >    STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
> > +  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> > +
> >    gimple_match_op op;
> >    if (!gimple_extract_op (stmt_info->stmt, &op))
> >      gcc_unreachable ();
> > @@ -7831,16 +7875,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >           = get_vectype_for_scalar_type (loop_vinfo,
> >                                          TREE_TYPE (op.ops[i]), slp_op[i]);
> >
> > -      /* To properly compute ncopies we are interested in the widest
> > -        non-reduction input type in case we're looking at a widening
> > -        accumulation that we later handle in vect_transform_reduction.  */
> > -      if (lane_reducing
> > -         && vectype_op[i]
> > -         && (!vectype_in
> > -             || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> > -                 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE 
> > (vectype_op[i]))))))
> > -       vectype_in = vectype_op[i];
> > -
> >        /* Record how the non-reduction-def value of COND_EXPR is defined.
> >          ???  For a chain of multiple CONDs we'd have to match them up all. 
> >  */
> >        if (op.code == COND_EXPR && reduc_chain_length == 1)
> > @@ -7859,14 +7893,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >             }
> >         }
> >      }
> > -  if (!vectype_in)
> > -    vectype_in = STMT_VINFO_VECTYPE (phi_info);
> > -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> > -
> > -  /* Each lane-reducing operation has its own input vectype, while 
> > reduction
> > -     PHI records the input vectype with least lanes.  */
> > -  if (lane_reducing)
> > -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> >
> >    enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE 
> > (phi_info);
> >    STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
> > --
> > 2.17.1

Re: [PATCH 4/8] vect: Determine input vectype for multiple lane-reducing

Reply via email to