On Mon, May 20, 2019 at 2:19 PM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> writes:
> > Hi Richards,
> >
> > This is the new version of the patch, addressing your comments.
>
> You forgot the changelog :-)
>
> The patch LGTM otherwise.  Richard, any objections?
Nope.

Richard.

> >
> > Alejandro
> >
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandif...@arm.com>
> >> Sent: 08 May 2019 14:36
> >> To: Richard Biener <richard.guent...@gmail.com>
> >> Cc: Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com>; GCC
> >> Patches <gcc-patches@gcc.gnu.org>; nd <n...@arm.com>
> >> Subject: Re: [Vectorizer] Add SLP support for masked loads
> >>
> >> Richard Biener <richard.guent...@gmail.com> writes:
> >> > On Fri, Apr 26, 2019 at 3:14 PM Richard Sandiford
> >> > <richard.sandif...@arm.com> wrote:
> >> >>
> >> >> Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> writes:
> >> >> > Hi,
> >> >> >
> >> >> > The current vectorizer doesn't support masked loads for SLP.  We should
> >> >> > add that, to allow things like:
> >> >> >
> >> >> > void
> >> >> > f (int *restrict x, int *restrict y, int *restrict z, int n)
> >> >> > {
> >> >> >   for (int i = 0; i < n; i += 2)
> >> >> >     {
> >> >> >       x[i] = y[i] ? z[i] : 1;
> >> >> >       x[i + 1] = y[i + 1] ? z[i + 1] : 2;
> >> >> >     }
> >> >> > }
> >> >> >
> >> >> > to be vectorized using contiguous loads rather than LD2 and ST2.
> >> >> >
> >> >> > This patch was motivated by SVE, but it is completely generic and
> >> >> > should apply to any architecture with masked loads.
> >> >> >
> >> >> > After the patch is applied, the above code generates this output
> >> >> > (-march=armv8.2-a+sve -O2 -ftree-vectorize):
> >> >> >
> >> >> > 0000000000000000 <f>:
> >> >> >    0:   7100007f        cmp     w3, #0x0
> >> >> >    4:   540002cd        b.le    5c <f+0x5c>
> >> >> >    8:   51000464        sub     w4, w3, #0x1
> >> >> >    c:   d2800003        mov     x3, #0x0        // #0
> >> >> >   10:   90000005        adrp    x5, 0 <f>
> >> >> >   14:   25d8e3e0        ptrue   p0.d
> >> >> >   18:   53017c84        lsr     w4, w4, #1
> >> >> >   1c:   910000a5        add     x5, x5, #0x0
> >> >> >   20:   11000484        add     w4, w4, #0x1
> >> >> >   24:   85c0e0a1        ld1rd   {z1.d}, p0/z, [x5]
> >> >> >   28:   2598e3e3        ptrue   p3.s
> >> >> >   2c:   d37ff884        lsl     x4, x4, #1
> >> >> >   30:   25a41fe2        whilelo p2.s, xzr, x4
> >> >> >   34:   d503201f        nop
> >> >> >   38:   a5434820        ld1w    {z0.s}, p2/z, [x1, x3, lsl #2]
> >> >> >   3c:   25808c11        cmpne   p1.s, p3/z, z0.s, #0
> >> >> >   40:   25808810        cmpne   p0.s, p2/z, z0.s, #0
> >> >> >   44:   a5434040        ld1w    {z0.s}, p0/z, [x2, x3, lsl #2]
> >> >> >   48:   05a1c400        sel     z0.s, p1, z0.s, z1.s
> >> >> >   4c:   e5434800        st1w    {z0.s}, p2, [x0, x3, lsl #2]
> >> >> >   50:   04b0e3e3        incw    x3
> >> >> >   54:   25a41c62        whilelo p2.s, x3, x4
> >> >> >   58:   54ffff01        b.ne    38 <f+0x38>  // b.any
> >> >> >   5c:   d65f03c0        ret
> >> >> >
> >> >> >
> >> >> > I tested this patch on an aarch64 machine, bootstrapping the
> >> >> > compiler and running the checks.
> >> >> >
> >> >> > Alejandro
> >> >> >
> >> >> > gcc/Changelog:
> >> >> >
> >> >> > 2019-01-16  Alejandro Martinez  <alejandro.martinezvice...@arm.com>
> >> >> >
> >> >> >         * config/aarch64/aarch64-sve.md (copysign<mode>3): New define_expand.
> >> >> >         (xorsign<mode>3): Likewise.
> >> >> >         internal-fn.c: Marked mask_load_direct and mask_store_direct as
> >> >> >         vectorizable.
> >> >> >         tree-data-ref.c (data_ref_compare_tree): Fixed comment typo.
> >> >> >         tree-vect-data-refs.c (can_group_stmts_p): Allow masked loads to
> >> >> >         be combined even if the masks are different.
> >> >> >         (slp_vect_only_p): New function to detect masked loads that are
> >> >> >         only vectorizable using SLP.
> >> >> >         (vect_analyze_data_ref_accesses): Mark SLP only vectorizable groups.
> >> >> >         tree-vect-loop.c (vect_dissolve_slp_only_groups): New function to
> >> >> >         dissolve SLP-only vectorizable groups when SLP has been discarded.
> >> >> >         (vect_analyze_loop_2): Call vect_dissolve_slp_only_groups when needed.
> >> >> >         tree-vect-slp.c (vect_get_and_check_slp_defs): Check masked loads
> >> >> >         masks.
> >> >> >         (vect_build_slp_tree_1): Fixed comment typo.
> >> >> >         (vect_build_slp_tree_2): Include masks from masked loads in SLP tree.
> >> >> >         tree-vect-stmts.c (vect_get_vec_defs_for_operand): New function to
> >> >> >         get vec_defs for operand with optional SLP and vectype.
> >> >> >         (vectorizable_load): Allow vectorization of masked loads for SLP
> >> >> >         only.
> >> >> >         tree-vectorizer.h (_stmt_vec_info): Added flag for SLP-only
> >> >> >         vectorizable.
> >> >> >         tree-vectorizer.c (vec_info::new_stmt_vec_info): Likewise.
> >> >> >
> >> >> > gcc/testsuite/Changelog:
> >> >> >
> >> >> > 2019-01-16  Alejandro Martinez  <alejandro.martinezvice...@arm.com>
> >> >> >
> >> >> >         * gcc.target/aarch64/sve/mask_load_slp_1.c: New test for SLP
> >> >> >         vectorized masked loads.
> >> >> >
> >> >> > diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> >> >> > index 4f2ef45..67eee59 100644
> >> >> > --- a/gcc/internal-fn.c
> >> >> > +++ b/gcc/internal-fn.c
> >> >> > @@ -100,11 +100,11 @@ init_internal_fns ()
> >> >> >  /* Create static initializers for the information returned by
> >> >> >     direct_internal_fn.  */
> >> >> >  #define not_direct { -2, -2, false }
> >> >> > -#define mask_load_direct { -1, 2, false }
> >> >> > +#define mask_load_direct { -1, 2, true }
> >> >> >  #define load_lanes_direct { -1, -1, false }
> >> >> >  #define mask_load_lanes_direct { -1, -1, false }
> >> >> >  #define gather_load_direct { -1, -1, false }
> >> >> > -#define mask_store_direct { 3, 2, false }
> >> >> > +#define mask_store_direct { 3, 2, true }
> >> >> >  #define store_lanes_direct { 0, 0, false }
> >> >> >  #define mask_store_lanes_direct { 0, 0, false }
> >> >> >  #define scatter_store_direct { 3, 3, false }
> >> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
> >> >> > new file mode 100644
> >> >> > index 0000000..b106cae
> >> >> > --- /dev/null
> >> >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
> >> >> > @@ -0,0 +1,74 @@
> >> >> > +/* { dg-do compile } */
> >> >> > +/* { dg-options "-O2 -ftree-vectorize" } */
> >> >> > +
> >> >> > +#include <stdint.h>
> >> >> > +
> >> >> > +#define MASK_SLP_2(TYPE_COND, ALT_VAL)                                 \
> >> >> > +void __attribute__ ((noinline, noclone))                              \
> >> >> > +mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y, \
> >> >> > +                                    TYPE_COND *restrict z, int n)     \
> >> >> > +{                                                                     \
> >> >> > +  for (int i = 0; i < n; i += 2)                                      \
> >> >> > +    {                                                                 \
> >> >> > +      x[i] = y[i] ? z[i] : 1;                                         \
> >> >> > +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                       \
> >> >> > +    }                                                                 \
> >> >> > +}
> >> >> > +
> >> >> > +#define MASK_SLP_4(TYPE_COND, ALT_VAL)                                 \
> >> >> > +void __attribute__ ((noinline, noclone))                              \
> >> >> > +mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y, \
> >> >> > +                                    TYPE_COND *restrict z, int n)     \
> >> >> > +{                                                                     \
> >> >> > +  for (int i = 0; i < n; i += 4)                                      \
> >> >> > +    {                                                                 \
> >> >> > +      x[i] = y[i] ? z[i] : 1;                                         \
> >> >> > +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                       \
> >> >> > +      x[i + 2] = y[i + 2] ? z[i + 2] : 1;                             \
> >> >> > +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;                       \
> >> >> > +    }                                                                 \
> >> >> > +}
> >> >> > +
> >> >> > +#define MASK_SLP_8(TYPE_COND, ALT_VAL)                                 \
> >> >> > +void __attribute__ ((noinline, noclone))                              \
> >> >> > +mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y, \
> >> >> > +                                    TYPE_COND *restrict z, int n)     \
> >> >> > +{                                                                     \
> >> >> > +  for (int i = 0; i < n; i += 8)                                      \
> >> >> > +    {                                                                 \
> >> >> > +      x[i] = y[i] ? z[i] : 1;                                         \
> >> >> > +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                       \
> >> >> > +      x[i + 2] = y[i + 2] ? z[i + 2] : 1;                             \
> >> >> > +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;                       \
> >> >> > +      x[i + 4] = y[i + 4] ? z[i + 4] : 1;                             \
> >> >> > +      x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL;                       \
> >> >> > +      x[i + 6] = y[i + 6] ? z[i + 6] : 1;                             \
> >> >> > +      x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL;                       \
> >> >> > +    }                                                                 \
> >> >> > +}
> >> >> > +
> >> >> > +MASK_SLP_2(int8_t, 1)
> >> >> > +MASK_SLP_2(int8_t, 2)
> >> >> > +MASK_SLP_2(int, 1)
> >> >> > +MASK_SLP_2(int, 2)
> >> >> > +MASK_SLP_2(int64_t, 1)
> >> >> > +MASK_SLP_2(int64_t, 2)
> >> >> > +
> >> >> > +MASK_SLP_4(int8_t, 1)
> >> >> > +MASK_SLP_4(int8_t, 2)
> >> >> > +MASK_SLP_4(int, 1)
> >> >> > +MASK_SLP_4(int, 2)
> >> >> > +MASK_SLP_4(int64_t, 1)
> >> >> > +MASK_SLP_4(int64_t, 2)
> >> >> > +
> >> >> > +MASK_SLP_8(int8_t, 1)
> >> >> > +MASK_SLP_8(int8_t, 2)
> >> >> > +MASK_SLP_8(int, 1)
> >> >> > +MASK_SLP_8(int, 2)
> >> >> > +MASK_SLP_8(int64_t, 1)
> >> >> > +MASK_SLP_8(int64_t, 2)
> >> >> > +
> >> >> > +/* { dg-final { scan-assembler-not {\tld2w\t} } } */
> >> >> > +/* { dg-final { scan-assembler-not {\tst2w\t} } } */
> >> >> > +/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */
> >> >> > +/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */
> >> >> > diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
> >> >> > index 7d1f03c..1833a5f 100644
> >> >> > --- a/gcc/tree-data-ref.c
> >> >> > +++ b/gcc/tree-data-ref.c
> >> >> > @@ -1272,7 +1272,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, gimple *stmt,
> >> >> >    return dr;
> >> >> >  }
> >> >> >
> >> >> > -/* A helper function computes order between two tree epxressions T1 and T2.
> >> >> > +/* A helper function computes order between two tree expressions T1 and T2.
> >> >> >     This is used in comparator functions sorting objects based on the order
> >> >> >     of tree expressions.  The function returns -1, 0, or 1.  */
> >> >> >
> >> >> > diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> >> >> > index 7bbd47f..8a82147 100644
> >> >> > --- a/gcc/tree-vect-data-refs.c
> >> >> > +++ b/gcc/tree-vect-data-refs.c
> >> >> > @@ -2837,22 +2837,72 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
> >> >> >        if (ifn != gimple_call_internal_fn (call2))
> >> >> >          return false;
> >> >> >
> >> >> > -      /* Check that the masks are the same.  Cope with casts of masks,
> >> >> > +      /* Check that the masks can be combined.  */
> >> >> > +      tree mask1 = gimple_call_arg (call1, 2);
> >> >> > +      tree mask2 = gimple_call_arg (call2, 2);
> >> >> > +      if (!operand_equal_p (mask1, mask2, 0))
> >> >> > +        {
> >> >> > +          /* Stores need identical masks.  */
> >> >> > +          if (ifn == IFN_MASK_STORE)
> >> >> > +            {
> >> >> > +              mask1 = strip_conversion (mask1);
> >> >> > +              if (!mask1)
> >> >> > +                return false;
> >> >> > +              mask2 = strip_conversion (mask2);
> >> >> > +              if (!mask2)
> >> >> > +                return false;
> >> >> > +              if (!operand_equal_p (mask1, mask2, 0))
> >> >> > +                return false;
> >> >> > +            }
> >> >> > +          /* Loads are allowed different masks under SLP only.
> >> >> > +             (See slp_vect_only_p () below).  */
> >> >> > +        }
> >> >> > +      return true;
> >> >> > +    }
> >> >> > +
> >> >> > +  return false;
> >> >> > +}
> >> >> > +
> >> >> > +/* Return true if vectorizable_* routines can handle statements STMT1_INFO
> >> >> > +   and STMT2_INFO being in a single group for SLP only.  */
> >> >> > +
> >> >> > +static bool
> >> >> > +slp_vect_only_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
> >> >> > +{
> >> >> > +  if (gimple_assign_single_p (stmt1_info->stmt))
> >> >> > +    {
> >> >> > +      gcc_assert (gimple_assign_single_p (stmt2_info->stmt));
> >> >> > +      return false;
> >> >> > +    }
> >> >> > +
> >> >> > +  gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
> >> >> > +  if (call1 && gimple_call_internal_p (call1))
> >> >> > +    {
> >> >> > +      /* Check for two masked loads or two masked stores.  */
> >> >> > +      gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
> >> >> > +      gcc_assert (call2 && gimple_call_internal_p (call2));
> >> >> > +      internal_fn ifn = gimple_call_internal_fn (call1);
> >> >> > +      if (ifn != IFN_MASK_LOAD)
> >> >> > +        return false;
> >> >> > +      gcc_assert (ifn == gimple_call_internal_fn (call2));
> >> >> > +
> >> >> > +      /* Check if the masks are the same.  Cope with casts of masks,
> >> >> >           like those created by build_mask_conversion.  */
> >> >> >        tree mask1 = gimple_call_arg (call1, 2);
> >> >> >        tree mask2 = gimple_call_arg (call2, 2);
> >> >> >        if (!operand_equal_p (mask1, mask2, 0))
> >> >> >          {
> >> >> > +          /* This is the only case that is just for SLP: non-identical but
> >> >> > +             otherwise slp-compatible masks.  */
> >> >> >            mask1 = strip_conversion (mask1);
> >> >> >            if (!mask1)
> >> >> > -            return false;
> >> >> > +            return true;
> >> >> >            mask2 = strip_conversion (mask2);
> >> >> >            if (!mask2)
> >> >> > -            return false;
> >> >> > +            return true;
> >> >> >            if (!operand_equal_p (mask1, mask2, 0))
> >> >> > -            return false;
> >> >> > +            return true;
> >> >> >          }
> >> >> > -      return true;
> >> >> >      }
> >> >> >
> >> >> >    return false;
> >> >>
> >> >> Normally I'd say it would be better to add a bool argument to
> >> >> can_group_stmts_p that says whether we want non-SLP-only rules, or
> >> >> perhaps convert the return type to an enum.  But given that the
> >> >> non-SLP path is going away soon anyway, I guess separate functions
> >> >> are better despite the cut-&-paste.
> >> >>
> >> >> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> >> >> > index afbf9a9..754a2e4 100644
> >> >> > --- a/gcc/tree-vect-loop.c
> >> >> > +++ b/gcc/tree-vect-loop.c
> >> >> > @@ -1755,6 +1755,49 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
> >> >> >    return opt_result::success ();
> >> >> >  }
> >> >> >
> >> >> > +/* Look for SLP-only access groups and turn each individual access into its own
> >> >> > +   group.  */
> >> >> > +static void
> >> >> > +vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
> >> >> > +{
> >> >> > +  unsigned int i;
> >> >> > +  struct data_reference *dr;
> >> >> > +
> >> >> > +  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
> >> >> > +
> >> >> > +  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
> >> >> > +  FOR_EACH_VEC_ELT (datarefs, i, dr)
> >> >> > +    {
> >> >> > +      gcc_assert (DR_REF (dr));
> >> >> > +      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
> >> >> > +
> >> >> > +      /* Check if the load is a part of an interleaving chain.  */
> >> >> > +      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> >> >> > +        {
> >> >> > +          stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
> >> >> > +          unsigned int group_size = DR_GROUP_SIZE (first_element);
> >> >> > +
> >> >> > +          /* Check if SLP-only groups.  */
> >> >> > +          if (STMT_VINFO_SLP_VECT_ONLY (first_element))
> >> >> > +            {
> >> >> > +              /* Dissolve the group.  */
> >> >> > +              STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
> >> >> > +
> >> >> > +              stmt_vec_info vinfo = first_element;
> >> >> > +              while (vinfo)
> >> >> > +                {
> >> >> > +                  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
> >> >> > +                  DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
> >> >> > +                  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
> >> >> > +                  DR_GROUP_SIZE (vinfo) = 1;
> >> >> > +                  DR_GROUP_GAP (vinfo) = group_size - 1;
> >> >> > +                  vinfo = next;
> >> >>
> >> >> I think DR_GROUP_FIRST_ELEMENT should be vinfo here, so that it
> >> >> remains a grouped access with only one element.
> >> >
> >> > Then the above looks like single-element interleaving?  Do we handle
> >> > interleaving at all for masked loads/stores?
> >>
> >> Not yet, but it's on the wishlist.
> >>
> >> > Generally a no longer grouped access would have DR_GROUP_FIRST_ELEMENT
> >> > NULL and "no" size/gap (well, nobody looks at those fields then).  It
> >> > would need vectorization with strided accesses then though, thus you
> >> > need to set the strided flag.
> >>
> >> But with the way get_load_store_type is structured, single-element groups
> >> give strictly more information than a strided access.  We still fall back
> >> on gather/scatter or elementwise accesses if necessary.
> >>
> >> (One of the reasons for adding get_load_store_type was to avoid the
> >> group/stride choice dictating a particular implementation.)
> >>
> >> So I think single element groups are better here.
> >>
> >> > I think you want to have a testcase exercising this path.
> >>
> >> No argument with this of course. :-)
> >>
> >> Richard
> >
> > diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> > index 04081f3..3051a7a 100644
> > --- a/gcc/internal-fn.c
> > +++ b/gcc/internal-fn.c
> > @@ -100,7 +100,7 @@ init_internal_fns ()
> >  /* Create static initializers for the information returned by
> >     direct_internal_fn.  */
> >  #define not_direct { -2, -2, false }
> > -#define mask_load_direct { -1, 2, false }
> > +#define mask_load_direct { -1, 2, true }
> >  #define load_lanes_direct { -1, -1, false }
> >  #define mask_load_lanes_direct { -1, -1, false }
> >  #define gather_load_direct { -1, -1, false }
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
> > new file mode 100644
> > index 0000000..78c70b2
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
> > @@ -0,0 +1,90 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -ftree-vectorize" } */
> > +
> > +#include <stdint.h>
> > +
> > +#define MASK_SLP_2(TYPE_COND, ALT_VAL)                                 \
> > +void __attribute__ ((noinline, noclone))                              \
> > +mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y, \
> > +                                    TYPE_COND *restrict z, int n)     \
> > +{                                                                     \
> > +  for (int i = 0; i < n; i += 2)                                      \
> > +    {                                                                 \
> > +      x[i] = y[i] ? z[i] : 1;                                         \
> > +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                       \
> > +    }                                                                 \
> > +}
> > +
> > +#define MASK_SLP_4(TYPE_COND, ALT_VAL)                                 \
> > +void __attribute__ ((noinline, noclone))                              \
> > +mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y, \
> > +                                    TYPE_COND *restrict z, int n)     \
> > +{                                                                     \
> > +  for (int i = 0; i < n; i += 4)                                      \
> > +    {                                                                 \
> > +      x[i] = y[i] ? z[i] : 1;                                         \
> > +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                       \
> > +      x[i + 2] = y[i + 2] ? z[i + 2] : 1;                             \
> > +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;                       \
> > +    }                                                                 \
> > +}
> > +
> > +#define MASK_SLP_8(TYPE_COND, ALT_VAL)                                 \
> > +void __attribute__ ((noinline, noclone))                              \
> > +mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y, \
> > +                                    TYPE_COND *restrict z, int n)     \
> > +{                                                                     \
> > +  for (int i = 0; i < n; i += 8)                                      \
> > +    {                                                                 \
> > +      x[i] = y[i] ? z[i] : 1;                                         \
> > +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                       \
> > +      x[i + 2] = y[i + 2] ? z[i + 2] : 1;                             \
> > +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;                       \
> > +      x[i + 4] = y[i + 4] ? z[i + 4] : 1;                             \
> > +      x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL;                       \
> > +      x[i + 6] = y[i + 6] ? z[i + 6] : 1;                             \
> > +      x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL;                       \
> > +    }                                                                 \
> > +}
> > +
> > +#define MASK_SLP_FAIL(TYPE_COND)                                       \
> > +void __attribute__ ((noinline, noclone))                              \
> > +mask_slp_##TYPE_COND##_FAIL (int *restrict x, int *restrict y,        \
> > +                             TYPE_COND *restrict z, int n)            \
> > +{                                                                     \
> > +  for (int i = 0; i < n; i += 2)                                      \
> > +    {                                                                 \
> > +      x[i] = y[i] ? z[i] : 1;                                         \
> > +      x[i + 1] = y[i + 1] ? z[i + 1] : x[z[i + 1]];                   \
> > +    }                                                                 \
> > +}
> > +
> > +MASK_SLP_2(int8_t, 1)
> > +MASK_SLP_2(int8_t, 2)
> > +MASK_SLP_2(int, 1)
> > +MASK_SLP_2(int, 2)
> > +MASK_SLP_2(int64_t, 1)
> > +MASK_SLP_2(int64_t, 2)
> > +
> > +MASK_SLP_4(int8_t, 1)
> > +MASK_SLP_4(int8_t, 2)
> > +MASK_SLP_4(int, 1)
> > +MASK_SLP_4(int, 2)
> > +MASK_SLP_4(int64_t, 1)
> > +MASK_SLP_4(int64_t, 2)
> > +
> > +MASK_SLP_8(int8_t, 1)
> > +MASK_SLP_8(int8_t, 2)
> > +MASK_SLP_8(int, 1)
> > +MASK_SLP_8(int, 2)
> > +MASK_SLP_8(int64_t, 1)
> > +MASK_SLP_8(int64_t, 2)
> > +
> > +MASK_SLP_FAIL(int8_t)
> > +MASK_SLP_FAIL(int)
> > +MASK_SLP_FAIL(int64_t)
> > +
> > +/* { dg-final { scan-assembler-not {\tld2w\t} } } */
> > +/* { dg-final { scan-assembler-not {\tst2w\t} } } */
> > +/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */
> > +/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */
> > diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
> > index 67b960d..4dc03ef 100644
> > --- a/gcc/tree-data-ref.c
> > +++ b/gcc/tree-data-ref.c
> > @@ -1271,7 +1271,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, gimple *stmt,
> >    return dr;
> >  }
> >
> > -/* A helper function computes order between two tree epxressions T1 and T2.
> > +/* A helper function computes order between two tree expressions T1 and T2.
> >     This is used in comparator functions sorting objects based on the order
> >     of tree expressions.  The function returns -1, 0, or 1.  */
> >
> > diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> > index d71a39f..55d87f8 100644
> > --- a/gcc/tree-vect-data-refs.c
> > +++ b/gcc/tree-vect-data-refs.c
> > @@ -2863,10 +2863,12 @@ strip_conversion (tree op)
> >  }
> >
> >  /* Return true if vectorizable_* routines can handle statements STMT1_INFO
> > -   and STMT2_INFO being in a single group.  */
> > +   and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
> > +   be grouped in SLP mode.  */
> >
> >  static bool
> > -can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
> > +can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
> > +                   bool allow_slp_p)
> >  {
> >    if (gimple_assign_single_p (stmt1_info->stmt))
> >      return gimple_assign_single_p (stmt2_info->stmt);
> > @@ -2888,7 +2890,8 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
> >           like those created by build_mask_conversion.  */
> >        tree mask1 = gimple_call_arg (call1, 2);
> >        tree mask2 = gimple_call_arg (call2, 2);
> > -      if (!operand_equal_p (mask1, mask2, 0))
> > +      if (!operand_equal_p (mask1, mask2, 0)
> > +          && (ifn == IFN_MASK_STORE || !allow_slp_p))
> >          {
> >            mask1 = strip_conversion (mask1);
> >            if (!mask1)
> > @@ -2974,7 +2977,7 @@ vect_analyze_data_ref_accesses (vec_info *vinfo)
> >                || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
> >                                          DR_BASE_ADDRESS (drb)) != 0
> >                || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
> > -              || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
> > +              || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
> >              break;
> >
> >            /* Check that the data-refs have the same constant size.  */
> > @@ -3059,6 +3062,13 @@ vect_analyze_data_ref_accesses (vec_info *vinfo)
> >            DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
> >            lastinfo = stmtinfo_b;
> >
> > +          STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
> > +            = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
> > +
> > +          if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
> > +            dump_printf_loc (MSG_NOTE, vect_location,
> > +                             "Load suitable for SLP vectorization only.\n");
> > +
> >            if (init_b == init_prev
> >                && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
> >                && dump_enabled_p ())
> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> > index 057a874..5166b42 100644
> > --- a/gcc/tree-vect-loop.c
> > +++ b/gcc/tree-vect-loop.c
> > @@ -1753,6 +1753,50 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
> >    return opt_result::success ();
> >  }
> >
> > +/* Look for SLP-only access groups and turn each individual access into its own
> > +   group.  */
> > +static void
> > +vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
> > +{
> > +  unsigned int i;
> > +  struct data_reference *dr;
> > +
> > +  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
> > +
> > +  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
> > +  FOR_EACH_VEC_ELT (datarefs, i, dr)
> > +    {
> > +      gcc_assert (DR_REF (dr));
> > +      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
> > +
> > +      /* Check if the load is a part of an interleaving chain.  */
> > +      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > +        {
> > +          stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
> > +          unsigned int group_size = DR_GROUP_SIZE (first_element);
> > +
> > +          /* Check if SLP-only groups.  */
> > +          if (!STMT_SLP_TYPE (stmt_info)
> > +              && STMT_VINFO_SLP_VECT_ONLY (first_element))
> > +            {
> > +              /* Dissolve the group.  */
> > +              STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
> > +
> > +              stmt_vec_info vinfo = first_element;
> > +              while (vinfo)
> > +                {
> > +                  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
> > +                  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
> > +                  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
> > +                  DR_GROUP_SIZE (vinfo) = 1;
> > +                  DR_GROUP_GAP (vinfo) = group_size - 1;
> > +                  vinfo = next;
> > +                }
> > +            }
> > +        }
> > +    }
> > +}
> > +
> >  /* Function vect_analyze_loop_2.
> >
> >     Apply a set of analyses on LOOP, and create a loop_vec_info struct
> > @@ -1964,6 +2008,9 @@ start_over:
> >          }
> >      }
> >
> > +  /* Dissolve SLP-only groups.  */
> > +  vect_dissolve_slp_only_groups (loop_vinfo);
> > +
> >    /* Scan all the remaining operations in the loop that are not subject
> >       to SLP and make sure they are vectorizable.  */
> >    ok = vect_analyze_loop_operations (loop_vinfo);
> > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> > index 2a1e5b8..0d2784e 100644
> > --- a/gcc/tree-vect-slp.c
> > +++ b/gcc/tree-vect-slp.c
> > @@ -325,6 +325,14 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
> >          {
> >            internal_fn ifn = gimple_call_internal_fn (stmt);
> >            commutative_op = first_commutative_argument (ifn);
> > +
> > +          /* Masked load, only look at mask.  */
> > +          if (ifn == IFN_MASK_LOAD)
> > +            {
> > +              number_of_oprnds = 1;
> > +              /* Mask operand index.  */
> > +              first_op_idx = 5;
> > +            }
> >          }
> >      }
> >    else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
> > @@ -624,7 +632,7 @@ vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
> >     is false then this indicates the comparison could not be
> >     carried out or the stmts will never be vectorized by SLP.
> >
> > -   Note COND_EXPR is possibly ismorphic to another one after swapping its
> > +   Note COND_EXPR is possibly isomorphic to another one after swapping its
> >     operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
> >     the first stmt by swapping the two operands of comparison; set SWAP[i]
> >     to 2 if stmt I is isormorphic to the first stmt by inverting the code
> > @@ -1143,14 +1151,23 @@ vect_build_slp_tree_2 (vec_info *vinfo,
> >                         &this_max_nunits, matches, &two_operators))
> >      return NULL;
> >
> > -  /* If the SLP node is a load, terminate the recursion.  */
> > +  /* If the SLP node is a load, terminate the recursion unless masked.  */
> >    if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
> >        && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
> >      {
> > -      *max_nunits = this_max_nunits;
> > -      (*tree_size)++;
> > -      node = vect_create_new_slp_node (stmts);
> > -      return node;
> > +      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
> > +        {
> > +          /* Masked load.  */
> > +          gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
> > +          nops = 1;
> > +        }
> > +      else
> > +        {
> > +          *max_nunits = this_max_nunits;
> > +          (*tree_size)++;
> > +          node = vect_create_new_slp_node (stmts);
> > +          return node;
> > +        }
> >      }
> >
> >    /* Get at the operands, verifying they are compatible.  */
> > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> > index ced4264..7122ce9 100644
> > --- a/gcc/tree-vect-stmts.c
> > +++ b/gcc/tree-vect-stmts.c
> > @@ -7622,14 +7622,6 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> >    if (!scalar_dest)
> >      return false;
> >
> > -  if (slp_node != NULL)
> > -    {
> > -      if (dump_enabled_p ())
> > -        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > -                         "SLP of masked loads not supported.\n");
> > -      return false;
> > -    }
> > -
> >    int mask_index = internal_fn_mask_index (ifn);
> >    if (mask_index >= 0)
> >      {
> > @@ -7712,6 +7704,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> >        first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
> >        group_size = DR_GROUP_SIZE (first_stmt_info);
> >
> > +      /* Refuse non-SLP vectorization of SLP-only groups.  */
> > +      if (!slp && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))
> > +        {
> > +          if (dump_enabled_p ())
> > +            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                             "cannot vectorize load in non-SLP mode.\n");
> > +          return false;
> > +        }
> > +
> >        if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> >          slp_perm = true;
> >
> > @@ -8389,8 +8390,19 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
> >                                            simd_lane_access_p,
> >                                            byte_offset, bump);
> >        if (mask)
> > -        vec_mask = vect_get_vec_def_for_operand (mask, stmt_info,
> > -                                                 mask_vectype);
> > +        {
> > +          if (slp_node)
> > +            {
> > +              auto_vec<tree> ops (1);
> > +              auto_vec<vec<tree> > vec_defs (1);
> > +              ops.quick_push (mask);
> > +              vect_get_slp_defs (ops, slp_node, &vec_defs);
> > +              vec_mask = vec_defs[0][0];
> > +            }
> > +          else
> > +            vec_mask = vect_get_vec_def_for_operand (mask, stmt_info,
> > +                                                     mask_vectype);
> > +        }
> >      }
> >    else
> >      {
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index 6f59af6..62d9341 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -396,7 +396,7 @@ typedef struct _loop_vec_info : public vec_info {
> >    /* Condition under which this loop is analyzed and versioned.  */
> >    tree num_iters_assumptions;
> >
> > -  /* Threshold of number of iterations below which vectorzation will not be
> > +  /* Threshold of number of iterations below which vectorization will not be
> >       performed.  It is calculated from MIN_PROFITABLE_ITERS and
> >       PARAM_MIN_VECT_LOOP_BOUND.  */
> >    unsigned int th;
> > @@ -935,6 +935,9 @@ struct _stmt_vec_info {
> >       and OPERATION_BITS without changing the result.  */
> >    unsigned int operation_precision;
> >    signop operation_sign;
> > +
> > +  /* True if this is only suitable for SLP vectorization.  */
> > +  bool slp_vect_only_p;
> >  };
> >
> >  /* Information about a gather/scatter call.  */
> > @@ -1030,6 +1033,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
> >  #define STMT_VINFO_NUM_SLP_USES(S)  (S)->num_slp_uses
> >  #define STMT_VINFO_REDUC_TYPE(S)    (S)->reduc_type
> >  #define STMT_VINFO_REDUC_DEF(S)     (S)->reduc_def
> > +#define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p
> >
> >  #define DR_GROUP_FIRST_ELEMENT(S) \
> >    (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element)
> > diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> > index d271049..4f6c65f 100644
> > --- a/gcc/tree-vectorizer.c
> > +++ b/gcc/tree-vectorizer.c
> > @@ -641,6 +641,7 @@ vec_info::new_stmt_vec_info (gimple *stmt)
> >    STMT_VINFO_VECTORIZABLE (res) = true;
> >    STMT_VINFO_VEC_REDUCTION_TYPE (res) = TREE_CODE_REDUCTION;
> >    STMT_VINFO_VEC_CONST_COND_REDUC_CODE (res) = ERROR_MARK;
> > +  STMT_VINFO_SLP_VECT_ONLY (res) = false;
> >
> >    if (gimple_code (stmt) == GIMPLE_PHI
> >        && is_loop_header_bb_p (gimple_bb (stmt)))
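For readers reproducing this, the loop shape the patch targets can be checked with a small standalone file.  The following is a minimal sketch, not part of the patch itself (the file name and compile command below are illustrative); each lane of the unrolled pair has its own condition, so the two IFN_MASK_LOADs carry different masks and can only be grouped under SLP:

/* slp_mask_example.c: minimal sketch of the SLP masked-load pattern.  */
void
f (int *restrict x, int *restrict y, int *restrict z, int n)
{
  for (int i = 0; i < n; i += 2)
    {
      /* Different condition per lane -> different mask per lane.  */
      x[i] = y[i] ? z[i] : 1;
      x[i + 1] = y[i + 1] ? z[i + 1] : 2;
    }
}

Compiling with an SVE-enabled compiler, e.g.

  gcc -O2 -ftree-vectorize -march=armv8.2-a+sve -fdump-tree-vect-details -S slp_mask_example.c

should show the "Load suitable for SLP vectorization only." note added by this patch in the vect dump, and contiguous ld1w/st1w instructions rather than ld2w/st2w in the generated assembly.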