Richard Biener <rguent...@suse.de> writes:
> This adds a gather vectorization capability to the vectorizer
> without target support by decomposing the offset vector, doing
> scalar loads and then building a vector from the result.  This
> is aimed mainly at cases where vectorizing the rest of the loop
> offsets the cost of vectorizing the gather.
>
> Note it's difficult to avoid vectorizing the offset load, but in
> some cases later passes can turn the vector load + extract into
> scalar loads, see the followup patch.
>
> On SPEC CPU 2017 510.parest_r this improves runtime from 250s
> to 219s on a Zen2 CPU which has its native gather instructions
> disabled (using those the runtime instead increases to 254s)
> using -Ofast -march=znver2 [-flto].  It turns out the critical
> loops in this benchmark all perform gather operations.

Nice!
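
For anyone reading along: as I understand it the emulation replaces
the native gather with per-lane extracts, scalar loads and a vector
CONSTRUCTOR.  A hand-written C picture of the testcase's inner loop,
assuming two lanes (V2DF with plain SSE2), with names of my own:

  /* Emulated 2-lane gather: one scalar load per offset lane,
     the results then feed a vector CONSTRUCTOR.  */
  static void
  emulated_gather (const int *col, const double *dst, double out[2])
  {
    for (int k = 0; k < 2; ++k)
      {
        int idx = col[k];   /* extract lane k of the offset vector  */
        out[k] = dst[idx];  /* scalar load at the indexed address   */
      }
  }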

> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> Any comments?  I still plan to run this over full SPEC and
> I have to apply TLC to the followup patch before I can post it.
>
> I think neither power nor z has gather so I'm curious if the
> patch helps 510.parest_r there, I'm unsure about neon/advsimd.
> Both might need the followup patch - I was surprised about
> the speedup without it on Zen (the followup improves runtime
> to 198s there).
>
> Thanks,
> Richard.
>
> 2021-07-30  Richard Biener  <rguent...@suse.de>
>
>       * tree-vect-data-refs.c (vect_check_gather_scatter):
>       Include widening conversions only when the result is
>       still handled by native gather or the current offset
>       size does not already match the data size.
>       Also succeed analysis in case there's no native support,
>       noted by an IFN_LAST ifn and a NULL decl.
>       * tree-vect-patterns.c (vect_recog_gather_scatter_pattern):
>       Test for no IFN gather rather than decl gather.
>       * tree-vect-stmts.c (vect_model_load_cost): Pass in the
>       gather-scatter info and cost emulated gathers accordingly.
>       (vect_truncate_gather_scatter_offset): Properly test for
>       no IFN gather.
>       (vect_use_strided_gather_scatters_p): Likewise.
>       (get_load_store_type): Handle emulated gathers and their
>       restrictions.
>       (vectorizable_load): Likewise.  Emulate them by extracting
>       scalar offsets, doing scalar loads and constructing a vector.
>
>       * gcc.target/i386/vect-gather-1.c: New testcase.
>       * gfortran.dg/vect/vect-8.f90: Adjust.
> ---
>  gcc/testsuite/gcc.target/i386/vect-gather-1.c |  18 ++++
>  gcc/testsuite/gfortran.dg/vect/vect-8.f90     |   2 +-
>  gcc/tree-vect-data-refs.c                     |  29 +++--
>  gcc/tree-vect-patterns.c                      |   2 +-
>  gcc/tree-vect-stmts.c                         | 100 ++++++++++++++++--
>  5 files changed, 136 insertions(+), 15 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect-gather-1.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/vect-gather-1.c b/gcc/testsuite/gcc.target/i386/vect-gather-1.c
> new file mode 100644
> index 00000000000..134aef39666
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-gather-1.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msse2 -fdump-tree-vect-details" } */
> +
> +#ifndef INDEXTYPE
> +#define INDEXTYPE int
> +#endif
> +double vmul(INDEXTYPE *rowstart, INDEXTYPE *rowend,
> +         double *luval, double *dst)
> +{
> +  double res = 0;
> +  for (const INDEXTYPE * col = rowstart; col != rowend; ++col, ++luval)
> +        res += *luval * dst[*col];
> +  return res;
> +}
> +
> +/* With gather emulation this should be profitable to vectorize
> +   even with plain SSE2.  */
> +/* { dg-final { scan-tree-dump "loop vectorized" "vect" } } */
> diff --git a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 b/gcc/testsuite/gfortran.dg/vect/vect-8.f90
> index 9994805d77f..cc1aebfbd84 100644
> --- a/gcc/testsuite/gfortran.dg/vect/vect-8.f90
> +++ b/gcc/testsuite/gfortran.dg/vect/vect-8.f90
> @@ -706,5 +706,5 @@ END SUBROUTINE kernel
>  
>  ! { dg-final { scan-tree-dump-times "vectorized 24 loops" 1 "vect" { target aarch64_sve } } }
>  ! { dg-final { scan-tree-dump-times "vectorized 23 loops" 1 "vect" { target { aarch64*-*-* && { ! aarch64_sve } } } } }
> -! { dg-final { scan-tree-dump-times "vectorized 2\[23\] loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } }
> +! { dg-final { scan-tree-dump-times "vectorized 2\[234\] loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } }
>  ! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { { ! vect_intdouble_cvt } && { ! aarch64*-*-* } } } } }
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index 6995efba899..0279e75fa8e 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -4007,8 +4007,26 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
>             continue;
>           }
>  
> -       if (TYPE_PRECISION (TREE_TYPE (op0))
> -           < TYPE_PRECISION (TREE_TYPE (off)))
> +       /* Include the conversion if it is widening and we're using
> +          the IFN path or the target can handle the converted from
> +          offset or the current size is not already the same as the
> +          data vector element size.  */
> +       if ((TYPE_PRECISION (TREE_TYPE (op0))
> +            < TYPE_PRECISION (TREE_TYPE (off)))
> +           && ((!use_ifn_p
> +                && (DR_IS_READ (dr)
> +                    ? (targetm.vectorize.builtin_gather
> +                       && targetm.vectorize.builtin_gather (vectype,
> +                                                            TREE_TYPE (op0),
> +                                                            scale))
> +                    : (targetm.vectorize.builtin_scatter
> +                       && targetm.vectorize.builtin_scatter (vectype,
> +                                                             TREE_TYPE (op0),
> +                                                             scale))))
> +               || (!use_ifn_p
> +                   && !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
> +                                        TYPE_SIZE (TREE_TYPE (vectype)),
> +                                        0))))

I don't understand this bit: it seems to disable the path for
use_ifn_p altogether.
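
Spelling out how I read it, with the target-hook details elided:

  (TYPE_PRECISION (TREE_TYPE (op0)) < TYPE_PRECISION (TREE_TYPE (off)))
  && ((!use_ifn_p && <builtin gather/scatter handles TREE_TYPE (op0)>)
      || (!use_ifn_p && <offset size != vector element size>))

Both arms of the inner || require !use_ifn_p, so the conversion is
never included on the IFN path.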

>           {
>             off = op0;
>             offtype = TREE_TYPE (off);
> @@ -4036,7 +4054,8 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
>        if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
>                                    vectype, memory_type, offtype, scale,
>                                    &ifn, &offset_vectype))
> -     return false;
> +     ifn = IFN_LAST;
> +      decl = NULL_TREE;
>      }
>    else
>      {
> @@ -4050,10 +4069,6 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
>         if (targetm.vectorize.builtin_scatter)
>           decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
>       }
> -
> -      if (!decl)
> -     return false;
> -
>        ifn = IFN_LAST;
>        /* The offset vector type will be read from DECL when needed.  */
>        offset_vectype = NULL_TREE;
> diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
> index 743fd3f5414..25de97bd9b0 100644
> --- a/gcc/tree-vect-patterns.c
> +++ b/gcc/tree-vect-patterns.c
> @@ -4811,7 +4811,7 @@ vect_recog_gather_scatter_pattern (vec_info *vinfo,
>       function for the gather/scatter operation.  */
>    gather_scatter_info gs_info;
>    if (!vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info)
> -      || gs_info.decl)
> +      || gs_info.ifn == IFN_LAST)
>      return NULL;
>  
>    /* Convert the mask to the right form.  */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 05085e1b110..9d51b476db6 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1084,6 +1084,7 @@ static void
>  vect_model_load_cost (vec_info *vinfo,
>                     stmt_vec_info stmt_info, unsigned ncopies, poly_uint64 vf,
>                     vect_memory_access_type memory_access_type,
> +                   gather_scatter_info *gs_info,
>                     slp_tree slp_node,
>                     stmt_vector_for_cost *cost_vec)
>  {
> @@ -1172,9 +1173,17 @@ vect_model_load_cost (vec_info *vinfo,
>    if (memory_access_type == VMAT_ELEMENTWISE
>        || memory_access_type == VMAT_GATHER_SCATTER)
>      {
> -      /* N scalar loads plus gathering them into a vector.  */
>        tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>        unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
> +      if (memory_access_type == VMAT_GATHER_SCATTER
> +       && gs_info->ifn == IFN_LAST && !gs_info->decl)
> +     /* For emulated gathers N offset vector element extracts
> +        (we assume the scalar scaling and ptr + offset add is consumed by
> +        the load).  */
> +     inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits,
> +                                      vec_to_scalar, stmt_info, 0,
> +                                      vect_body);
> +      /* N scalar loads plus gathering them into a vector.  */
>        inside_cost += record_stmt_cost (cost_vec,
>                                      ncopies * assumed_nunits,
>                                      scalar_load, stmt_info, 0, vect_body);
> @@ -1184,7 +1193,9 @@ vect_model_load_cost (vec_info *vinfo,
>                       &inside_cost, &prologue_cost, 
>                       cost_vec, cost_vec, true);
>    if (memory_access_type == VMAT_ELEMENTWISE
> -      || memory_access_type == VMAT_STRIDED_SLP)
> +      || memory_access_type == VMAT_STRIDED_SLP
> +      || (memory_access_type == VMAT_GATHER_SCATTER
> +       && gs_info->ifn == IFN_LAST && !gs_info->decl))
>      inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct,
>                                    stmt_info, 0, vect_body);
>  
> @@ -1866,7 +1877,8 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
>        tree memory_type = TREE_TYPE (DR_REF (dr));
>        if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
>                                    vectype, memory_type, offset_type, scale,
> -                                  &gs_info->ifn, &gs_info->offset_vectype))
> +                                  &gs_info->ifn, &gs_info->offset_vectype)
> +       || gs_info->ifn == IFN_LAST)
>       continue;
>  
>        gs_info->decl = NULL_TREE;
> @@ -1901,7 +1913,7 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
>                                   gather_scatter_info *gs_info)
>  {
>    if (!vect_check_gather_scatter (stmt_info, loop_vinfo, gs_info)
> -      || gs_info->decl)
> +      || gs_info->ifn == IFN_LAST)
>      return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
>                                               masked_p, gs_info);
>  
> @@ -2355,6 +2367,27 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>                            vls_type == VLS_LOAD ? "gather" : "scatter");
>         return false;
>       }
> +      else if (gs_info->ifn == IFN_LAST && !gs_info->decl)
> +     {
> +       if (vls_type != VLS_LOAD)
> +         {
> +           if (dump_enabled_p ())
> +             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                              "unsupported emulated scatter.\n");
> +           return false;
> +         }
> +       else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()
> +                || !known_eq (TYPE_VECTOR_SUBPARTS (vectype),
> +                              TYPE_VECTOR_SUBPARTS
> +                                (gs_info->offset_vectype)))
> +         {
> +           if (dump_enabled_p ())
> +             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                              "unsupported vector types for emulated "
> +                              "gather.\n");
> +           return false;
> +         }
> +     }
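
FWIW the equal-lanes requirement makes sense to me given the
expansion loop: each data lane consumes exactly one lane of the
offset vector, so e.g. V4DF data with V4SI offsets (4 lanes each)
passes, while V2DF data paired with a V4SI offset vector would be
rejected here.
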
>        /* Gather-scatter accesses perform only component accesses, alignment
>        is irrelevant for them.  */
>        *alignment_support_scheme = dr_unaligned_supported;
> @@ -8692,6 +8725,15 @@ vectorizable_load (vec_info *vinfo,
>                            "unsupported access type for masked load.\n");
>         return false;
>       }
> +      else if (memory_access_type == VMAT_GATHER_SCATTER
> +            && gs_info.ifn == IFN_LAST
> +            && !gs_info.decl)
> +     {
> +       if (dump_enabled_p ())
> +         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                          "unsupported masked emulated gather.\n");
> +       return false;
> +     }
>      }
>  
>    if (!vec_stmt) /* transformation not required.  */
> @@ -8725,7 +8767,7 @@ vectorizable_load (vec_info *vinfo,
>  
>        STMT_VINFO_TYPE (orig_stmt_info) = load_vec_info_type;
>        vect_model_load_cost (vinfo, stmt_info, ncopies, vf, memory_access_type,
> -                         slp_node, cost_vec);
> +                         &gs_info, slp_node, cost_vec);
>        return true;
>      }
>  
> @@ -9438,7 +9480,8 @@ vectorizable_load (vec_info *vinfo,
>                   unsigned int misalign;
>                   unsigned HOST_WIDE_INT align;
>  
> -                 if (memory_access_type == VMAT_GATHER_SCATTER)
> +                 if (memory_access_type == VMAT_GATHER_SCATTER
> +                     && gs_info.ifn != IFN_LAST)
>                     {
>                       tree zero = build_zero_cst (vectype);
>                       tree scale = size_int (gs_info.scale);
> @@ -9456,6 +9499,51 @@ vectorizable_load (vec_info *vinfo,
>                       data_ref = NULL_TREE;
>                       break;
>                     }
> +                 else if (memory_access_type == VMAT_GATHER_SCATTER)
> +                   {
> +                     /* Emulated gather-scatter.  */
> +                     gcc_assert (!final_mask);

For this to be safe, we need to make sure that
check_load_store_for_partial_vectors clears
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P.  Is that already guaranteed?
(Maybe yes, I didn't look closely. :-))

LGTM otherwise FWIW.

Thanks,
Richard

> +                     unsigned HOST_WIDE_INT const_nunits
> +                       = nunits.to_constant ();
> +                     vec<constructor_elt, va_gc> *ctor_elts;
> +                     vec_alloc (ctor_elts, const_nunits);
> +                     gimple_seq stmts = NULL;
> +                     tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> +                     tree scale = size_int (gs_info.scale);
> +                     align
> +                       = get_object_alignment (DR_REF (first_dr_info->dr));
> +                     tree ltype = build_aligned_type (TREE_TYPE (vectype),
> +                                                      align);
> +                     for (unsigned k = 0; k < const_nunits; ++k)
> +                       {
> +                         tree boff = size_binop (MULT_EXPR,
> +                                                 TYPE_SIZE (idx_type),
> +                                                 bitsize_int (k));
> +                         tree idx = gimple_build (&stmts, BIT_FIELD_REF,
> +                                                  idx_type, vec_offset,
> +                                                  TYPE_SIZE (idx_type),
> +                                                  boff);
> +                         idx = gimple_convert (&stmts, sizetype, idx);
> +                         idx = gimple_build (&stmts, MULT_EXPR,
> +                                             sizetype, idx, scale);
> +                         tree ptr = gimple_build (&stmts, PLUS_EXPR,
> +                                                  TREE_TYPE (dataref_ptr),
> +                                                  dataref_ptr, idx);
> +                         ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> +                         tree elt = make_ssa_name (TREE_TYPE (vectype));
> +                         tree ref = build2 (MEM_REF, ltype, ptr,
> +                                            build_int_cst (ref_type, 0));
> +                         new_stmt = gimple_build_assign (elt, ref);
> +                         gimple_seq_add_stmt (&stmts, new_stmt);
> +                         CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
> +                       }
> +                     gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> +                     new_stmt = gimple_build_assign (NULL_TREE,
> +                                                     build_constructor
> +                                                       (vectype, ctor_elts));
> +                     data_ref = NULL_TREE;
> +                     break;
> +                   }
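
For readers skimming: per lane k the sequence above amounts to, in C
terms (my paraphrase, using the testcase's types and made-up names):

  int off = ((int *) &vec_offset)[k];        /* BIT_FIELD_REF extract */
  size_t idx = (size_t) off * scale;         /* gs_info.scale         */
  double *ptr = (double *) ((char *) dataref_ptr + idx);
  elts[k] = *ptr;                            /* scalar MEM_REF load   */

with the elts collected into a CONSTRUCTOR of vectype afterwards.
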
>  
>                   align =
>                     known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
