This RFC patch proposes to address PR61338, a long-standing issue where
GCC emits redundant reverse permutations for negative-step contiguous
data references (DRs). Consider the following loop:
for (int i = 100; i > 0; i--) {
a[i] *= 2;
}
On AArch64, GCC currently emits the following loop body:
ldr q30, [x1]
tbl v30.16b, {v30.16b}, v31.16b
add v30.4s, v30.4s, v30.4s
tbl v30.16b, {v30.16b}, v31.16b
str q30, [x1], -16
* The problem
The two tbl instructions perform reverse permutations that cancel each
other out. Since the arithmetic operation between them is lane-wise, it
is insensitive to lane order, so both permutations are actually
redundant.
The current GCC vectorizer classifies negative-step contiguous DRs as
VMAT_CONTIGUOUS_REVERSE, and generates reverse permutations in function
vectorizable_load/store(). At that point, the vectorizer does not know
how the vectors are defined or used, and therefore cannot decide whether
the reverse permutations are redundant.
* Proposed fix
This RFC fixes the issue at the SLP graph level, in vect_optimize_slp(),
where we have a full view of how vectors are produced and consumed in
each SLP instance. The implementation performs the following two steps:
1) Count per-node references across SLP instances and conservatively
reject nodes shared by multiple instances.
2) For each SLP instance (starting from its root), recursively verify
that the instance is suitable for skipping reverse permutations. This
is done through a series of conservative checks on all nodes in the
instance. If all checks succeed, the corresponding DR nodes are
marked so that vectorizable_load/store() can skip generating reverse
permutations.
* Future work
This patch is an RFC, so the current implementation is conservative. It
can potentially be extended to support more lane-wise operations and
more loop patterns such as reductions.
This patch has been bootstrapped and regression-tested on x86_64-linux-gnu,
arm-linux-gnueabihf and aarch64-linux-gnu. Additional test cases will
be added later in a formal patch if this overall direction is agreed.
Comments and suggestions on this would be appreciated.
gcc/ChangeLog:
* tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize
skip_rev_perm to false.
(vect_print_slp_tree): Dump skip_rev_perm flag.
(vect_count_slp_node_refs): New function to count SLP node
references across SLP instances.
(vect_can_skip_reverse_perm_p): New function to check whether
reverse permutations in each SLP instance can be skipped.
(vect_mark_skip_reverse_perm): New function to mark the nodes
whose reverse permutations can be skipped.
(vect_optimize_slp): Apply reverse permutation elimination.
* tree-vect-stmts.cc (vect_simple_negative_step_p): New function
to check negative-step contiguous DRs.
(vect_lanewise_code_p): New function to check lane-wise code.
(vectorizable_store): Skip generating reverse permutations when
skip_rev_perm is set.
(vectorizable_load): Likewise.
* tree-vectorizer.h (struct _slp_tree): Add skip_rev_perm field.
(SLP_TREE_SKIP_REV_PERM): New macro.
(vect_simple_negative_step_p): Declare.
(vect_lanewise_code_p): Declare.
---
gcc/tree-vect-slp.cc | 118 +++++++++++++++++++++++++++++++++++++++++
gcc/tree-vect-stmts.cc | 64 +++++++++++++++++++++-
gcc/tree-vectorizer.h | 6 +++
3 files changed, 186 insertions(+), 2 deletions(-)
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index c932e8d5cba..fd87b83f516 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -119,6 +119,7 @@ _slp_tree::_slp_tree ()
SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
SLP_TREE_LANE_PERMUTATION (this) = vNULL;
SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
+ SLP_TREE_SKIP_REV_PERM (this) = false;
SLP_TREE_CODE (this) = ERROR_MARK;
SLP_TREE_GS_SCALE (this) = 0;
SLP_TREE_GS_BASE (this) = NULL_TREE;
@@ -3320,6 +3321,8 @@ vect_print_slp_tree (dump_flags_t dump_kind,
dump_location_t loc,
dump_printf (dump_kind, " }%s\n",
node->ldst_lanes ? " (load-lanes)" : "");
}
+ if (SLP_TREE_SKIP_REV_PERM (node))
+ dump_printf_loc (metadata, user_loc, "\treverse permutation skipped\n");
if (SLP_TREE_CHILDREN (node).is_empty ())
return;
dump_printf_loc (metadata, user_loc, "\tchildren");
@@ -8310,6 +8313,115 @@ vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t
*bst_map, slp_tree& node)
*bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
}
+/* Traverse the SLP tree rooted at NODE and increment the reference count of
+ each node in REFCNTS. When called once per SLP instance, REFCNTS records
+ how many distinct instances reference each node. VISITED ensures nodes are
+ counted at most once per instance. */
+
+static void
+vect_count_slp_node_refs (slp_tree node, hash_set<slp_tree> &visited,
+ hash_map<slp_tree, unsigned> &refcnts)
+{
+ if (!node || visited.add (node))
+ return;
+
+ unsigned *cntp = refcnts.get (node);
+ refcnts.put (node, cntp ? (*cntp + 1) : 1);
+
+ for (slp_tree child : SLP_TREE_CHILDREN (node))
+ vect_count_slp_node_refs (child, visited, refcnts);
+}
+
+/* Return true if the SLP instance rooted at NODE can safely skip reverse
+ permutations for its negative-step contiguous DRs. REFCNTS is used to
+ reject nodes shared across SLP instances. CANDIDATES is populated with
+ corresponding DR nodes. */
+
+static bool
+vect_can_skip_reverse_perm_p (loop_vec_info loop_vinfo, slp_tree node,
+ hash_map<slp_tree, unsigned> &refcnts,
+ auto_vec<slp_tree> &candidates)
+{
+ if (!node)
+ return false;
+
+ /* Constant or external defs are acceptable if they are uniform vectors,
+ since uniform vectors stay unchanged after reversal. */
+ vect_def_type def_type = SLP_TREE_DEF_TYPE (node);
+ if (def_type == vect_constant_def
+ || def_type == vect_external_def)
+ return vect_slp_tree_uniform_p (node);
+
+ /* Require internal defs and reject nodes shared across SLP instances. */
+ if (def_type != vect_internal_def
+ || refcnts.get_or_insert (node, NULL) > 1)
+ return false;
+
+ /* Only handle gimple assign statements for now. */
+ stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
+ if (!stmt_info || !is_gimple_assign (stmt_info->stmt))
+ return false;
+
+ /* Any use outside the loop means the vector value escapes, so skipping the
+ reverse permutation could change the lane order observed by that use. */
+ tree stmt_def = gimple_assign_lhs (stmt_info->stmt);
+ if (TREE_CODE (stmt_def) == SSA_NAME)
+ {
+ imm_use_iterator imm_iter;
+ use_operand_p use_p;
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, stmt_def)
+ if (!flow_bb_inside_loop_p (loop_vinfo->loop,
+ gimple_bb (USE_STMT (use_p))))
+ return false;
+ }
+
+ if (STMT_VINFO_DATA_REF (stmt_info))
+ {
+ if (vect_simple_negative_step_p (loop_vinfo, stmt_info))
+ candidates.safe_push (node);
+ else
+ return false;
+ }
+ else if (!vect_lanewise_code_p (gimple_assign_rhs_code (stmt_info->stmt)))
+ return false;
+
+ for (slp_tree child : SLP_TREE_CHILDREN (node))
+ if (!vect_can_skip_reverse_perm_p (loop_vinfo, child, refcnts, candidates))
+ return false;
+
+ return true;
+}
+
+/* Mark SLP nodes whose reverse permutations can be skipped. */
+
+static void
+vect_mark_skip_reverse_perm (loop_vec_info loop_vinfo)
+{
+ /* Count how many instances reference each SLP tree node. */
+ hash_map<slp_tree, unsigned> refcnts;
+ for (auto inst : loop_vinfo->slp_instances)
+ {
+ hash_set<slp_tree> visited;
+ vect_count_slp_node_refs (SLP_INSTANCE_TREE (inst), visited, refcnts);
+ }
+
+ /* Iterate again to find SLP instances where reverse permutations inside can
+ be safely skipped and mark the corresponding nodes. */
+ for (auto inst : loop_vinfo->slp_instances)
+ {
+ slp_tree root = SLP_INSTANCE_TREE (inst);
+ stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (root);
+ if (!stmt_info
+ || !STMT_VINFO_DATA_REF (stmt_info))
+ continue;
+
+ auto_vec<slp_tree> candidates;
+ if (vect_can_skip_reverse_perm_p (loop_vinfo, root, refcnts, candidates))
+ for (slp_tree node : candidates)
+ SLP_TREE_SKIP_REV_PERM (node) = true;
+ }
+}
+
/* Optimize the SLP graph of VINFO. */
void
@@ -8327,6 +8439,12 @@ vect_optimize_slp (vec_info *vinfo)
vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
release_scalar_stmts_to_slp_tree_map (bst_map);
+
+ /* Apply reverse permutation elimination. This is restricted to loop SLP as
+ it's required that no vector value escapes from the SLP graph. */
+ loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
+ if (loop_vinfo)
+ vect_mark_skip_reverse_perm (loop_vinfo);
}
/* Gather loads reachable from the individual SLP graph entries. */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 22285250aa8..db3a550c301 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1859,6 +1859,64 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info
stmt_info)
size_zero_node);
}
+/* STMT_INFO is a load or store. Return true if it is a contiguous DR with a
+ negative step in an innermost loop. Strided, gather/scatter, grouped, and
+ partial-vector accesses are excluded. */
+
+bool
+vect_simple_negative_step_p (loop_vec_info loop_vinfo, stmt_vec_info stmt_info)
+{
+ if (loop_vinfo->loop->inner
+ || LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+ return false;
+
+ if (STMT_VINFO_STRIDED_P (stmt_info)
+ || STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+ || STMT_VINFO_GROUPED_ACCESS (stmt_info))
+ return false;
+
+ return compare_step_with_zero (loop_vinfo, stmt_info) < 0;
+}
+
+/* Return true for a conservative subset of tree codes that are lane-wise. */
+
+bool
+vect_lanewise_code_p (tree_code code)
+{
+ switch (code)
+ {
+ case PLUS_EXPR:
+ case MINUS_EXPR:
+ case MULT_EXPR:
+ case NEGATE_EXPR:
+ case MIN_EXPR:
+ case MAX_EXPR:
+ case ABS_EXPR:
+ case LSHIFT_EXPR:
+ case RSHIFT_EXPR:
+ case LROTATE_EXPR:
+ case RROTATE_EXPR:
+ case BIT_IOR_EXPR:
+ case BIT_XOR_EXPR:
+ case BIT_AND_EXPR:
+ case BIT_NOT_EXPR:
+ case LT_EXPR:
+ case LE_EXPR:
+ case GT_EXPR:
+ case GE_EXPR:
+ case EQ_EXPR:
+ case NE_EXPR:
+ case UNLT_EXPR:
+ case UNLE_EXPR:
+ case UNGT_EXPR:
+ case UNGE_EXPR:
+ case UNEQ_EXPR:
+ return true;
+ default:
+ return false;
+ }
+}
+
/* If the target supports a permute mask that reverses the elements in
a vector of type VECTYPE, return that mask, otherwise return null. */
@@ -9323,7 +9381,8 @@ vectorizable_store (vec_info *vinfo,
if (!costing_p)
vec_oprnd = vec_oprnds[i];
- if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+ if (memory_access_type == VMAT_CONTIGUOUS_REVERSE
+ && !SLP_TREE_SKIP_REV_PERM (slp_node))
{
if (costing_p)
inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
@@ -11844,7 +11903,8 @@ vectorizable_load (vec_info *vinfo,
}
}
- if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+ if (memory_access_type == VMAT_CONTIGUOUS_REVERSE
+ && !SLP_TREE_SKIP_REV_PERM (slp_node))
{
if (costing_p)
inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index bde164301a8..d18a296748a 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -358,6 +358,9 @@ struct _slp_tree {
poly_uint64 max_nunits;
/* The DEF type of this node. */
enum vect_def_type def_type;
+ /* Whether the reverse permutation associated with a contiguous negative-step
+ DR can be eliminated. */
+ bool skip_rev_perm;
/* The number of scalar lanes produced by this node. */
unsigned int lanes;
/* The operation of this node. */
@@ -462,6 +465,7 @@ public:
#define SLP_TREE_LOAD_PERMUTATION(S) (S)->load_permutation
#define SLP_TREE_LANE_PERMUTATION(S) (S)->lane_permutation
#define SLP_TREE_DEF_TYPE(S) (S)->def_type
+#define SLP_TREE_SKIP_REV_PERM(S) (S)->skip_rev_perm
#define SLP_TREE_VECTYPE(S) (S)->vectype
#define SLP_TREE_REPRESENTATIVE(S) (S)->representative
#define SLP_TREE_LANES(S) (S)->lanes
@@ -2522,6 +2526,8 @@ extern bool supportable_indirect_convert_operation
(code_helper,
tree = NULL_TREE,
slp_tree = NULL);
extern int compare_step_with_zero (vec_info *, stmt_vec_info);
+extern bool vect_simple_negative_step_p (loop_vec_info, stmt_vec_info);
+extern bool vect_lanewise_code_p (tree_code);
extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
enum vect_cost_for_stmt, stmt_vec_info,
--
2.43.0