On Fri, Nov 12, 2021 at 7:01 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This patch adds SLP support for IFN_GATHER_LOAD.  Like the SLP
> support for IFN_MASK_LOAD, it works by treating only some of the
> arguments as child nodes.  Unlike IFN_MASK_LOAD, it requires the
> other arguments (base, scale, and extension type) to be the same
> for all calls in the group.  It does not require/expect the loads
> to be in a group (which probably wouldn't make sense for gathers).
>
> I was worried about the possible alias effect of moving gathers
> around to be part of the same SLP group.  The patch therefore
> makes vect_analyze_data_ref_dependence treat gathers and scatters
> as a top-level concern, punting if the accesses aren't completely
> independent and if the user hasn't told us that a particular
> VF is safe.  I think in practice we already punted in the same
> circumstances; the idea is just to make it more explicit.
>
> Regstrapped on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
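To make the shape concrete: the kernel this enables SLP for is the one
in the new vect-gather-1.c test below -- two gathers from the same base
array with the same scale, differing only in their offset operands, so
they can become the two lanes of a single SLP node.  Condensed from the
test (which defines N as 16):

  #define N 16

  void
  f (int *restrict y, int *restrict x, int *restrict indices)
  {
    for (int i = 0; i < N; ++i)
      {
        y[i * 2] = x[indices[i * 2]] + 1;          /* gather lane 0 */
        y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;  /* gather lane 1 */
      }
  }
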
Btw, I filed PR102467 for this a while ago, you might want to mention
that in the ChangeLog.

OK.

Thanks,
Richard.

> Richard
>
>
> gcc/
>         * doc/sourcebuild.texi (vect_gather_load_ifn): Document.
>         * tree-vect-data-refs.c (vect_analyze_data_ref_dependence):
>         Commonize safelen handling.  Punt for anything involving
>         gathers and scatters unless safelen says otherwise.
>         * tree-vect-slp.c (arg1_map): New variable.
>         (vect_get_operand_map): Handle IFN_GATHER_LOAD.
>         (vect_build_slp_tree_1): Likewise.
>         (vect_build_slp_tree_2): Likewise.
>         (compatible_calls_p): If vect_get_operand_map returns nonnull,
>         check that any skipped arguments are equal.
>         (vect_slp_analyze_node_operations_1): Tighten reduction check.
>         * tree-vect-stmts.c (check_load_store_for_partial_vectors): Take
>         an ncopies argument.
>         (vect_get_gather_scatter_ops): Take slp_node and ncopies arguments.
>         Handle SLP nodes.
>         (vectorizable_store, vectorizable_load): Adjust accordingly.
>
> gcc/testsuite/
>         * lib/target-supports.exp
>         (check_effective_target_vect_gather_load_ifn): New target test.
>         * gcc.dg/vect/vect-gather-1.c: New test.
>         * gcc.dg/vect/vect-gather-2.c: Likewise.
>         * gcc.target/aarch64/sve/gather_load_11.c: Likewise.
> ---
>  gcc/doc/sourcebuild.texi                      |  4 ++
>  gcc/testsuite/gcc.dg/vect/vect-gather-1.c     | 60 +++++++++++++++++
>  gcc/testsuite/gcc.dg/vect/vect-gather-2.c     | 36 +++++++++++
>  .../gcc.target/aarch64/sve/gather_load_11.c   | 49 ++++++++++++++
>  gcc/testsuite/lib/target-supports.exp         |  6 ++
>  gcc/tree-vect-data-refs.c                     | 64 +++++++++----------
>  gcc/tree-vect-slp.c                           | 29 +++++++--
>  gcc/tree-vect-stmts.c                         | 26 ++++----
>  8 files changed, 223 insertions(+), 51 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/gather_load_11.c
>
> diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
> index 40b1e0d8167..702cd0c53e4 100644
> --- a/gcc/doc/sourcebuild.texi
> +++ b/gcc/doc/sourcebuild.texi
> @@ -1639,6 +1639,10 @@ Target supports vector masked loads.
>  @item vect_masked_store
>  Target supports vector masked stores.
>
> +@item vect_gather_load_ifn
> +Target supports vector gather loads using internal functions
> +(rather than via built-in functions or emulation).
> +
>  @item vect_scatter_store
>  Target supports vector scatter stores.
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-1.c b/gcc/testsuite/gcc.dg/vect/vect-gather-1.c
> new file mode 100644
> index 00000000000..4cee73fc775
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-1.c
> @@ -0,0 +1,60 @@
> +#include "tree-vect.h"
> +
> +#define N 16
> +
> +void __attribute__((noipa))
> +f (int *restrict y, int *restrict x, int *restrict indices)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      y[i * 2] = x[indices[i * 2]] + 1;
> +      y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
> +    }
> +}
> +
> +int y[N * 2];
> +int x[N * 2] = {
> +  72704, 52152, 51301, 96681,
> +  57937, 60490, 34504, 60944,
> +  42225, 28333, 88336, 74300,
> +  29250, 20484, 38852, 91536,
> +  86917, 63941, 31590, 21998,
> +  22419, 26974, 28668, 13968,
> +  3451, 20247, 44089, 85521,
> +  22871, 87362, 50555, 85939
> +};
> +int indices[N * 2] = {
> +  15, 16, 9, 19,
> +  7, 22, 19, 1,
> +  22, 13, 15, 30,
> +  5, 12, 11, 11,
> +  10, 25, 5, 20,
> +  22, 24, 24, 28,
> +  30, 19, 6, 4,
> +  7, 12, 8, 21
> +};
> +int expected[N * 2] = {
> +  91537, 86919, 28334, 22000,
> +  60945, 28670, 21999, 52154,
> +  28669, 20486, 91537, 50557,
> +  60491, 29252, 74301, 74302,
> +  88337, 20249, 60491, 22421,
> +  28669, 3453, 3452, 22873,
> +  50556, 22000, 34505, 57939,
> +  60945, 29252, 42226, 26976
> +};
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  f (y, x, indices);
> +  for (int i = 0; i < 32; ++i)
> +    if (y[i] != expected[i])
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target vect_gather_load_ifn } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-2.c b/gcc/testsuite/gcc.dg/vect/vect-gather-2.c
> new file mode 100644
> index 00000000000..a1f6ba458a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-2.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +
> +#define N 16
> +
> +void
> +f1 (int *restrict y, int *restrict x1, int *restrict x2,
> +    int *restrict indices)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      y[i * 2] = x1[indices[i * 2]] + 1;
> +      y[i * 2 + 1] = x2[indices[i * 2 + 1]] + 2;
> +    }
> +}
> +
> +void
> +f2 (int *restrict y, int *restrict x, int *restrict indices)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      y[i * 2] = x[indices[i * 2]] + 1;
> +      y[i * 2 + 1] = x[indices[i * 2 + 1] * 2] + 2;
> +    }
> +}
> +
> +void
> +f3 (int *restrict y, int *restrict x, int *restrict indices)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      y[i * 2] = x[indices[i * 2]] + 1;
> +      y[i * 2 + 1] = x[(unsigned int) indices[i * 2 + 1]] + 2;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect { target vect_gather_load_ifn } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_11.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_11.c
> new file mode 100644
> index 00000000000..f6f78c1c8d9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_11.c
> @@ -0,0 +1,49 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-vect-cost-model" } */
> +
> +#include <stdint.h>
> +
> +void
> +f1 (int32_t *restrict y, int32_t *restrict x, int32_t *restrict index)
> +{
> +  for (int i = 0; i < 100; ++i)
> +    {
> +      y[i * 2] = x[index[i * 2]] + 1;
> +      y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;
> +    }
> +}
> +
> +void
> +f2 (int32_t *restrict y, int32_t *restrict x, uint32_t *restrict index)
> +{
> +  for (int i = 0; i < 100; ++i)
> +    {
> +      y[i * 2] = x[index[i * 2]] + 1;
> +      y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;
> +    }
> +}
> +
> +void
> +f3 (int32_t *restrict y, int32_t *restrict x, uint64_t *restrict index)
> +{
> +  for (int i = 0; i < 100; ++i)
> +    {
> +      y[i * 2] = x[index[i * 2]] + 1;
> +      y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;
> +    }
> +}
> +
> +void
> +f4 (int64_t *restrict y, int64_t *restrict x, uint64_t *restrict index)
> +{
> +  for (int i = 0; i < 100; ++i)
> +    {
> +      y[i * 2] = x[index[i * 2]] + 1;
> +      y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw #?2\]} 1 } } */
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw #?2\]} 1 } } */
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl #?2\]} 1 } } */
> +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl #?3\]} 1 } } */
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index 8cbda192fe0..e3cada910ca 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -7985,6 +7985,12 @@ proc check_effective_target_vect_masked_store { } {
>             || [istarget amdgcn*-*-*] }]
>  }
>
> +# Return 1 if the target supports vector gather loads via internal functions.
> +
> +proc check_effective_target_vect_gather_load_ifn { } {
> +    return [expr { [check_effective_target_aarch64_sve] }]
> +}
> +
>  # Return 1 if the target supports vector scatter stores.
>
>  proc check_effective_target_vect_scatter_store { } {
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index 888ad72f3a9..12a82cd694a 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -359,6 +359,20 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
>    lambda_vector dist_v;
>    unsigned int loop_depth;
>
> +  /* If user asserted safelen consecutive iterations can be
> +     executed concurrently, assume independence.  */
> +  auto apply_safelen = [&]()
> +    {
> +      if (loop->safelen >= 2)
> +        {
> +          if ((unsigned int) loop->safelen < *max_vf)
> +            *max_vf = loop->safelen;
> +          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
> +          return true;
> +        }
> +      return false;
> +    };
> +
>    /* In loop analysis all data references should be vectorizable.  */
>    if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
>        || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
> @@ -393,26 +407,23 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
>                                  get_alias_set (DR_REF (drb))))
>      return opt_result::success ();
>
> +  if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
> +      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
> +    {
> +      if (apply_safelen ())
> +        return opt_result::success ();
> +
> +      return opt_result::failure_at
> +        (stmtinfo_a->stmt,
> +         "possible alias involving gather/scatter between %T and %T\n",
> +         DR_REF (dra), DR_REF (drb));
> +    }
> +
>    /* Unknown data dependence.  */
>    if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
>      {
> -      /* If user asserted safelen consecutive iterations can be
> -         executed concurrently, assume independence.  */
> -      if (loop->safelen >= 2)
> -        {
> -          if ((unsigned int) loop->safelen < *max_vf)
> -            *max_vf = loop->safelen;
> -          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
> -          return opt_result::success ();
> -        }
> -
> -      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
> -          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
> -        return opt_result::failure_at
> -          (stmtinfo_a->stmt,
> -           "versioning for alias not supported for: "
> -           "can't determine dependence between %T and %T\n",
> -           DR_REF (dra), DR_REF (drb));
> +      if (apply_safelen ())
> +        return opt_result::success ();
>
>        if (dump_enabled_p ())
>          dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
> @@ -427,23 +438,8 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
>    /* Known data dependence.  */
>    if (DDR_NUM_DIST_VECTS (ddr) == 0)
>      {
> -      /* If user asserted safelen consecutive iterations can be
> -         executed concurrently, assume independence.  */
> -      if (loop->safelen >= 2)
> -        {
> -          if ((unsigned int) loop->safelen < *max_vf)
> -            *max_vf = loop->safelen;
> -          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
> -          return opt_result::success ();
> -        }
> -
> -      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
> -          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
> -        return opt_result::failure_at
> -          (stmtinfo_a->stmt,
> -           "versioning for alias not supported for: "
> -           "bad dist vector for %T and %T\n",
> -           DR_REF (dra), DR_REF (drb));
> +      if (apply_safelen ())
> +        return opt_result::success ();
>
>        if (dump_enabled_p ())
>          dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 2594ab7607f..0f09fc1fda8 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -459,6 +459,7 @@ static const int cond_expr_maps[3][5] = {
>    { 4, -2, -1, 1, 2 },
>    { 4, -1, -2, 2, 1 }
>  };
> +static const int arg1_map[] = { 1, 1 };
>  static const int arg2_map[] = { 1, 2 };
>
>  /* For most SLP statements, there is a one-to-one mapping between
> @@ -490,6 +491,9 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
>      case IFN_MASK_LOAD:
>        return arg2_map;
>
> +    case IFN_GATHER_LOAD:
> +      return arg1_map;
> +
>      default:
>        break;
>      }
> @@ -825,6 +829,20 @@ compatible_calls_p (gcall *call1, gcall *call2)
>        if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
>          return false;
>      }
> +
> +  /* Check that any unvectorized arguments are equal.  */
> +  if (const int *map = vect_get_operand_map (call1))
> +    {
> +      unsigned int nkept = *map++;
> +      unsigned int mapi = 0;
> +      for (unsigned int i = 0; i < nargs; ++i)
> +        if (mapi < nkept && map[mapi] == int (i))
> +          mapi += 1;
> +        else if (!operand_equal_p (gimple_call_arg (call1, i),
> +                                   gimple_call_arg (call2, i)))
> +          return false;
> +    }
> +
>    return true;
>  }
>
> @@ -982,7 +1000,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
>           else
>             rhs_code = CALL_EXPR;
>
> -         if (cfn == CFN_MASK_LOAD)
> +         if (cfn == CFN_MASK_LOAD || cfn == CFN_GATHER_LOAD)
>             load_p = true;
>           else if ((internal_fn_p (cfn)
>                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
> @@ -1126,7 +1144,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
>               continue;
>             }
>
> -         if (!load_p && call_stmt)
> +         if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
>             {
>               if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
>                                        call_stmt))
> @@ -1211,7 +1229,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
>         } /* Grouped access.  */
>        else
>         {
> -         if (load_p)
> +         if (load_p && rhs_code != CFN_GATHER_LOAD)
>             {
>               /* Not grouped load.  */
>               if (dump_enabled_p ())
> @@ -1692,7 +1710,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>           && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
>         {
>           if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
> -           gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
> +           gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
> +                       || gimple_call_internal_p (stmt, IFN_GATHER_LOAD));
>           else
>             {
>               *max_nunits = this_max_nunits;
> @@ -4408,7 +4427,7 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
>       calculated by the recursive call).  Otherwise it is the number of
>       scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
>       VF divided by the number of elements in a vector.  */
> -  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
> +  if (!STMT_VINFO_DATA_REF (stmt_info)
>        && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>      {
>        for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 101f61feff6..06da5a9bc13 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1674,6 +1674,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>                                       int group_size,
>                                       vect_memory_access_type
>                                       memory_access_type,
> +                                     unsigned int ncopies,
>                                       gather_scatter_info *gs_info,
>                                       tree scalar_mask)
>  {
> @@ -1698,7 +1699,6 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>           return;
>         }
> -      unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
>        vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
>        return;
>      }
> @@ -1721,7 +1721,6 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>           return;
>         }
> -      unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
>        vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
>        return;
>      }
> @@ -2963,6 +2962,7 @@ vect_build_gather_load_calls (vec_info *vinfo, stmt_vec_info stmt_info,
>  static void
>  vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
>                              class loop *loop, stmt_vec_info stmt_info,
> +                            slp_tree slp_node, unsigned int ncopies,
>                              gather_scatter_info *gs_info,
>                              tree *dataref_ptr, vec<tree> *vec_offset)
>  {
> @@ -2975,10 +2975,12 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
>        new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
>        gcc_assert (!new_bb);
>      }
> -  unsigned ncopies = vect_get_num_copies (loop_vinfo, gs_info->offset_vectype);
> -  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, ncopies,
> -                                gs_info->offset, vec_offset,
> -                                gs_info->offset_vectype);
> +  if (slp_node)
> +    vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], vec_offset);
> +  else
> +    vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, ncopies,
> +                                  gs_info->offset, vec_offset,
> +                                  gs_info->offset_vectype);
>  }
>
>  /* Prepare to implement a grouped or strided load or store using
> @@ -7484,7 +7486,7 @@ vectorizable_store (vec_info *vinfo,
>        && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>      check_load_store_for_partial_vectors (loop_vinfo, vectype, vls_type,
>                                           group_size, memory_access_type,
> -                                         &gs_info, mask);
> +                                         ncopies, &gs_info, mask);
>
>    if (slp_node
>        && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
> @@ -8147,8 +8149,8 @@ vectorizable_store (vec_info *vinfo,
>           else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>             {
>               vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> -                                          &gs_info, &dataref_ptr,
> -                                          &vec_offsets);
> +                                          slp_node, ncopies, &gs_info,
> +                                          &dataref_ptr, &vec_offsets);
>               vec_offset = vec_offsets[0];
>             }
>           else
> @@ -8827,7 +8829,7 @@ vectorizable_load (vec_info *vinfo,
>        && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>      check_load_store_for_partial_vectors (loop_vinfo, vectype, VLS_LOAD,
>                                           group_size, memory_access_type,
> -                                         &gs_info, mask);
> +                                         ncopies, &gs_info, mask);
>
>    if (dump_enabled_p ()
>        && memory_access_type != VMAT_ELEMENTWISE
> @@ -9445,8 +9447,8 @@ vectorizable_load (vec_info *vinfo,
>           else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>             {
>               vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> -                                          &gs_info, &dataref_ptr,
> -                                          &vec_offsets);
> +                                          slp_node, ncopies, &gs_info,
> +                                          &dataref_ptr, &vec_offsets);
>             }
>           else
>             dataref_ptr
> --
> 2.25.1
>
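
A footnote on the safelen handling, since the patch makes it the only
escape hatch for possibly-aliasing gathers: a sketch (not part of the
patch; the function name g and the safelen value are illustrative) of
how a user can assert independence.  With -fopenmp-simd, the pragma
sets loop->safelen, so apply_safelen caps max_vf at 8 rather than
failing with "possible alias involving gather/scatter":

  /* Without the pragma, the store to y may alias the gather from
     x[indices[...]], so the dependence check punts; safelen(8)
     promises that any 8 consecutive iterations are independent.  */
  void
  g (int *y, int *x, int *indices, int n)
  {
  #pragma omp simd safelen(8)
    for (int i = 0; i < n; ++i)
      y[i] = x[indices[i]] + 1;
  }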