It turns out the target costing code looks at STMT_VINFO_MEMORY_ACCESS_TYPE
to identify operations from (emulated) gathers, for example.  This doesn't
work for SLP loads since we do not set STMT_VINFO_MEMORY_ACCESS_TYPE there,
as the vectorization strategy might differ between different stmt uses.
It seems we got away with setting it for stores though.

The following adds a memory_access_type field to slp_tree and sets it from
the load and store vectorization code.  Not all of the costing records the
SLP node (that was only done selectively for some corner cases).  The
costing is really in need of a big overhaul; the following just massages
the two relevant ops to fix gcc.target/i386/pr88531-2[bc].c FAILs when
switching on SLP for non-grouped stores.  In particular, we currently have
either an SLP node or a stmt_info in the cost hook, but not both.
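For illustration only (this is a sketch of the access pattern, not the
testcase from the PR, and the function and variable names are made up),
a loop like the following is vectorized as an emulated gather when the
target has no usable native gather instruction; that is the case the x86
costing wants to catch, since the offset vector is decomposed
(vec_to_scalar per lane), N scalar loads are issued and the result vector
is re-assembled with a vec_construct:

  void
  foo (float *restrict dst, float *restrict a, int *restrict idx, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = a[idx[i]];	/* indexed load -> (emulated) gather */
  }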
So the following mitigates this, postponing a rewrite of the costing to
the next stage1.  Other targets are possibly affected as well but are left
to the respective maintainers to update.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
(This is a re-post and re-test of a patch from June.)

	PR tree-optimization/117080
	* tree-vectorizer.h (_slp_tree::memory_access_type): Add.
	(SLP_TREE_MEMORY_ACCESS_TYPE): New.
	(record_stmt_cost): Add another overload.
	* tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize
	memory_access_type.
	* tree-vect-stmts.cc (vectorizable_store): Set
	SLP_TREE_MEMORY_ACCESS_TYPE.
	(vectorizable_load): Likewise.  Also record the SLP node when
	costing emulated gather offset decompose and vector composition.
	* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Also
	recognize SLP emulated gather/scatter.
---
 gcc/config/i386/i386.cc |  22 ++++++---
 gcc/tree-vect-slp.cc    |   1 +
 gcc/tree-vect-stmts.cc  |  16 +++++--
 gcc/tree-vectorizer.h   | 102 ++++++++++++++++++++++++----------------
 4 files changed, 91 insertions(+), 50 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index ab0ade3790f..a1f0ae7a7e1 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25201,13 +25201,21 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
      (AGU and load ports).  Try to account for this by scaling the
      construction cost by the number of elements involved.  */
   if ((kind == vec_construct || kind == vec_to_scalar)
-      && stmt_info
-      && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
-          || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
-      && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
-           && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
-               != INTEGER_CST))
-          || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER))
+      && ((stmt_info
+           && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
+               || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
+           && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
+                && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+                    != INTEGER_CST))
+               || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
+                   == VMAT_GATHER_SCATTER)))
+          || (node
+              && ((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+                   && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
+                                             (SLP_TREE_REPRESENTATIVE (node))))
+                       != INTEGER_CST))
+                  || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
+                      == VMAT_GATHER_SCATTER)))))
     {
       stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
       stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 914b0b61b4d..83cb39fc214 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -124,6 +124,7 @@ _slp_tree::_slp_tree ()
   this->ldst_lanes = false;
   SLP_TREE_VECTYPE (this) = NULL_TREE;
   SLP_TREE_REPRESENTATIVE (this) = NULL;
+  SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
   SLP_TREE_REF_COUNT (this) = 1;
   this->failed = NULL;
   this->max_nunits = 1;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index ad4a3141ab8..4f6905f1541 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8350,6 +8350,8 @@ vectorizable_store (vec_info *vinfo,
   if (costing_p) /* transformation not required.  */
     {
       STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
+      if (slp_node)
+        SLP_TREE_MEMORY_ACCESS_TYPE (slp_node) = memory_access_type;
 
       if (loop_vinfo
           && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
@@ -8390,7 +8392,10 @@ vectorizable_store (vec_info *vinfo,
           && first_stmt_info != stmt_info)
         return true;
     }
-  gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
+  if (slp_node)
+    gcc_assert (memory_access_type == SLP_TREE_MEMORY_ACCESS_TYPE (stmt_info));
+  else
+    gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
 
   /* Transform.  */
 
@@ -10232,6 +10237,8 @@ vectorizable_load (vec_info *vinfo,
       if (!slp)
         STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
+      else
+        SLP_TREE_MEMORY_ACCESS_TYPE (slp_node) = memory_access_type;
 
       if (loop_vinfo
           && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
@@ -10256,6 +10263,9 @@ vectorizable_load (vec_info *vinfo,
   if (!slp)
     gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
+  else
+    gcc_assert (memory_access_type
+                == SLP_TREE_MEMORY_ACCESS_TYPE (slp_node));
 
   if (dump_enabled_p () && !costing_p)
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -11337,7 +11347,7 @@ vectorizable_load (vec_info *vinfo,
                  offset add is consumed by the load).  */
               inside_cost = record_stmt_cost (cost_vec, const_nunits,
                                               vec_to_scalar, stmt_info,
-                                              0, vect_body);
+                                              slp_node, 0, vect_body);
               /* N scalar loads plus gathering them into a vector.  */
               inside_cost
                 = record_stmt_cost (cost_vec, const_nunits, scalar_load,
                                     stmt_info, 0, vect_body);
               inside_cost
                 = record_stmt_cost (cost_vec, 1, vec_construct,
-                                    stmt_info, 0, vect_body);
+                                    stmt_info, slp_node, 0, vect_body);
               continue;
             }
           unsigned HOST_WIDE_INT const_offset_nunits
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 73bccb5a40a..2775d873ca4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -161,6 +161,46 @@ struct vect_scalar_ops_slice_hash : typed_noop_remove<vect_scalar_ops_slice>
   static bool equal (const value_type &, const compare_type &);
 };
 
+/* Describes how we're going to vectorize an individual load or store,
+   or a group of loads or stores.  */
+enum vect_memory_access_type {
+  /* An access to an invariant address.  This is used only for loads.  */
+  VMAT_INVARIANT,
+
+  /* A simple contiguous access.  */
+  VMAT_CONTIGUOUS,
+
+  /* A contiguous access that goes down in memory rather than up,
+     with no additional permutation.  This is used only for stores
+     of invariants.  */
+  VMAT_CONTIGUOUS_DOWN,
+
+  /* A simple contiguous access in which the elements need to be permuted
+     after loading or before storing.  Only used for loop vectorization;
+     SLP uses separate permutes.  */
+  VMAT_CONTIGUOUS_PERMUTE,
+
+  /* A simple contiguous access in which the elements need to be reversed
+     after loading or before storing.  */
+  VMAT_CONTIGUOUS_REVERSE,
+
+  /* An access that uses IFN_LOAD_LANES or IFN_STORE_LANES.  */
+  VMAT_LOAD_STORE_LANES,
+
+  /* An access in which each scalar element is loaded or stored
+     individually.  */
+  VMAT_ELEMENTWISE,
+
+  /* A hybrid of VMAT_CONTIGUOUS and VMAT_ELEMENTWISE, used for grouped
+     SLP accesses.  Each unrolled iteration uses a contiguous load
+     or store for the whole group, but the groups from separate iterations
+     are combined in the same way as for VMAT_ELEMENTWISE.  */
+  VMAT_STRIDED_SLP,
+
+  /* The access uses gather loads or scatter stores.  */
+  VMAT_GATHER_SCATTER
+};
+
 /************************************************************************
   SLP
  ************************************************************************/
@@ -228,6 +268,10 @@ struct _slp_tree {
 
   int vertex;
 
+  /* Classifies how the load or store is going to be implemented
+     for loop vectorization.  */
+  vect_memory_access_type memory_access_type;
+
   /* If not NULL this is a cached failed SLP discovery attempt with
      the lanes that failed during SLP discovery as 'false'.  This is
      a copy of the matches array.  */
@@ -315,6 +359,7 @@ public:
 #define SLP_TREE_REPRESENTATIVE(S)       (S)->representative
 #define SLP_TREE_LANES(S)                (S)->lanes
 #define SLP_TREE_CODE(S)                 (S)->code
+#define SLP_TREE_MEMORY_ACCESS_TYPE(S)   (S)->memory_access_type
 
 enum vect_partial_vector_style {
     vect_partial_vectors_none,
@@ -1203,46 +1248,6 @@ enum vec_load_store_type {
   VLS_STORE_INVARIANT
 };
 
-/* Describes how we're going to vectorize an individual load or store,
-   or a group of loads or stores.  */
-enum vect_memory_access_type {
-  /* An access to an invariant address.  This is used only for loads.  */
-  VMAT_INVARIANT,
-
-  /* A simple contiguous access.  */
-  VMAT_CONTIGUOUS,
-
-  /* A contiguous access that goes down in memory rather than up,
-     with no additional permutation.  This is used only for stores
-     of invariants.  */
-  VMAT_CONTIGUOUS_DOWN,
-
-  /* A simple contiguous access in which the elements need to be permuted
-     after loading or before storing.  Only used for loop vectorization;
-     SLP uses separate permutes.  */
-  VMAT_CONTIGUOUS_PERMUTE,
-
-  /* A simple contiguous access in which the elements need to be reversed
-     after loading or before storing.  */
-  VMAT_CONTIGUOUS_REVERSE,
-
-  /* An access that uses IFN_LOAD_LANES or IFN_STORE_LANES.  */
-  VMAT_LOAD_STORE_LANES,
-
-  /* An access in which each scalar element is loaded or stored
-     individually.  */
-  VMAT_ELEMENTWISE,
-
-  /* A hybrid of VMAT_CONTIGUOUS and VMAT_ELEMENTWISE, used for grouped
-     SLP accesses.  Each unrolled iteration uses a contiguous load
-     or store for the whole group, but the groups from separate iterations
-     are combined in the same way as for VMAT_ELEMENTWISE.  */
-  VMAT_STRIDED_SLP,
-
-  /* The access uses gather loads or scatter stores.  */
-  VMAT_GATHER_SCATTER
-};
-
 class dr_vec_info {
 public:
   /* The data reference itself.  */
@@ -2346,6 +2351,23 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
                            STMT_VINFO_VECTYPE (stmt_info), misalign, where);
 }
 
+/* Overload of record_stmt_cost with VECTYPE derived from STMT_INFO and
+   SLP node specified.  */
+
+inline unsigned
+record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
+                  enum vect_cost_for_stmt kind, stmt_vec_info stmt_info,
+                  slp_tree node,
+                  int misalign, enum vect_cost_model_location where)
+{
+  if (node)
+    return record_stmt_cost (body_cost_vec, count, kind, node,
+                             STMT_VINFO_VECTYPE (stmt_info), misalign, where);
+  else
+    return record_stmt_cost (body_cost_vec, count, kind, stmt_info,
+                             STMT_VINFO_VECTYPE (stmt_info), misalign, where);
+}
+
 extern void vect_finish_replace_stmt (vec_info *, stmt_vec_info, gimple *);
 extern void vect_finish_stmt_generation (vec_info *, stmt_vec_info, gimple *,
                                          gimple_stmt_iterator *);
-- 
2.43.0