When transforming multiple lane-reducing operations in a loop reduction
chain, the corresponding vectorized statements were originally all
generated into the def-use cycles starting from index 0. Consequently, a
def-use cycle with a smaller index would contain more statements, which
means a longer instruction dependency chain, while the cycles with larger
indices stay mostly idle. For example:
int sum = 1;
for (i)
{
sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
sum += w[i]; // widen-sum <vector(16) char>
sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
sum += n[i]; // normal <vector(4) int>
}
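The vector size is 128-bit and the vectorization factor is 16, so each
operation needs VF / lanes vector statements per iteration, and the plain
int add forces four reduction def-use cycles (sum_v0 ~ sum_v3). Below is a
minimal standalone model of that arithmetic; it is illustrative only, not
vectorizer code:

#include <cstdio>

int main ()
{
  // With 128-bit vectors and VF = 16, each operation needs
  // VF / lanes vector statements ("copies") per vectorized iteration.
  const unsigned vf = 16;
  const struct { const char *op; unsigned lanes; } ops[] = {
    { "dot-prod  <vector(16) char>", 16 },   // 16 chars  -> 1 copy
    { "widen-sum <vector(16) char>", 16 },   // 16 chars  -> 1 copy
    { "sad       <vector(8) short>",  8 },   //  8 shorts -> 2 copies
    { "normal    <vector(4) int>",    4 },   //  4 ints   -> 4 copies
  };
  for (const auto &o : ops)
    std::printf ("%s needs %u vector stmt(s)\n", o.op, vf / o.lanes);
  // The widest requirement (4 copies for the plain int add) fixes the
  // number of def-use cycles: accumulators sum_v0 .. sum_v3, which are
  // summed into the scalar result after the loop.
  return 0;
}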
Original transformation result:
for (i / 16)
{
sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
sum_v1 = sum_v1; // copy
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
sum_v1 = sum_v1; // copy
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
...
}
In the above layout, sum_v0 serially accumulates the DOT_PROD, WIDEN_SUM
and SAD results, while sum_v2 and sum_v3 receive only copies. For higher
instruction parallelism in the final vectorized loop, a better approach is
to distribute the effective vector lane-reducing ops evenly among all
def-use cycles. After the transformation below, DOT_PROD, WIDEN_SUM and
the SADs are generated into separate cycles, so the instruction
dependencies among them are eliminated (a standalone sketch of the
placement logic follows the loop).
for (i / 16)
{
sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
sum_v1 = sum_v1; // copy
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
sum_v0 = sum_v0; // copy
sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
sum_v0 = sum_v0; // copy
sum_v1 = sum_v1; // copy
sum_v2 = SAD (s0_v2[i: 0 ~ 7 ], s1_v2[i: 0 ~ 7 ], sum_v2);
sum_v3 = SAD (s0_v3[i: 8 ~ 15], s1_v3[i: 8 ~ 15], sum_v3);
...
}
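To obtain this layout, each reduction tracks a rotating start index (the
new reduc_result_pos field): a lane-reducing op takes the next
effec_ncopies def-use cycles, modulo the number of PHI copies. Below is a
minimal standalone sketch of that round-robin placement, with hypothetical
names; the real logic lives in vect_transform_reduction:

#include <cstdio>

int main ()
{
  const unsigned phis_ncopies = 4;            // def-use cycles sum_v0 ~ sum_v3
  const struct { const char *op; unsigned copies; } ops[] = {
    { "DOT_PROD", 1 },                        // effec_ncopies of each op
    { "WIDEN_SUM", 1 },
    { "SAD", 2 },
  };

  unsigned pos = 0;                           // models reduc_result_pos
  for (const auto &o : ops)
    {
      for (unsigned j = 0; j < o.copies; j++) // place each effective copy
        std::printf ("%s copy %u -> sum_v%u\n", o.op, j,
                     (pos + j) % phis_ncopies);
      pos = (pos + o.copies) % phis_ncopies;  // advance, like next_pos
    }
  return 0;
}

With these inputs the sketch prints exactly the assignment shown above:
DOT_PROD into sum_v0, WIDEN_SUM into sum_v1, and the two SAD copies into
sum_v2 and sum_v3.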
Thanks,
Feng
---
2024-03-22 Feng Xue <f...@os.amperecomputing.com>

gcc/
PR tree-optimization/114440
* tree-vectorizer.h (struct _stmt_vec_info): Add a new field
reduc_result_pos.
(vect_transform_reduction): Add a new parameter of slp_instance type.
* tree-vect-stmts.cc (vect_transform_stmt): Add a new argument
slp_node_instance to vect_transform_reduction.
* tree-vect-loop.cc (vect_transform_reduction): Add a new parameter
slp_node_instance. Generate lane-reducing statements in an optimized
order.
---
gcc/tree-vect-loop.cc | 73 +++++++++++++++++++++++++++++++++++-------
gcc/tree-vect-stmts.cc | 3 +-
gcc/tree-vectorizer.h | 8 ++++-
3 files changed, 71 insertions(+), 13 deletions(-)
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index a3374fb2d1a..841ef4c9120 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8673,7 +8673,8 @@ vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
bool
vect_transform_reduction (loop_vec_info loop_vinfo,
stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
- gimple **vec_stmt, slp_tree slp_node)
+ gimple **vec_stmt, slp_tree slp_node,
+ slp_instance slp_node_instance)
{
tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -8863,6 +8864,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
sum += w[i]; // widen-sum <vector(16) char>
sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+ sum += n[i]; // normal <vector(4) int>
}
The vector size is 128-bit, vectorization factor is 16. Reduction
@@ -8880,25 +8882,30 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
- sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
- sum_v1 = sum_v1; // copy
+ sum_v0 = sum_v0; // copy
+ sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
- sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
- sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
- sum_v2 = sum_v2; // copy
+ sum_v0 = sum_v0; // copy
+ sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
+ sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
sum_v3 = sum_v3; // copy
+
+ sum_v0 += n_v0[i: 0 ~ 3 ];
+ sum_v1 += n_v1[i: 4 ~ 7 ];
+ sum_v2 += n_v2[i: 8 ~ 11];
+ sum_v3 += n_v3[i: 12 ~ 15];
}
- sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1
- */
+ Moreover, for higher instruction parallelism in the final vectorized
+ loop, the effective vector lane-reducing ops are distributed evenly
+ among all def-use cycles. In the above example, DOT_PROD, WIDEN_SUM
+ and SADs are generated into disparate cycles, so the instruction
+ dependencies among them are eliminated. */
unsigned effec_ncopies = vec_oprnds[0].length ();
unsigned total_ncopies = vec_oprnds[reduc_index].length ();
- if (slp_node)
- gcc_assert (effec_ncopies == SLP_TREE_VEC_STMTS_EFFEC_NUM (slp_node));
-
gcc_assert (effec_ncopies <= total_ncopies);
if (effec_ncopies < total_ncopies)
@@ -8909,6 +8916,50 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
vec_oprnds[i].safe_grow_cleared (total_ncopies);
}
}
+
+ unsigned effec_phis_ncopies = total_ncopies;
+
+ if (slp_node)
+ {
+ slp_tree reduc_phis = slp_node_instance->reduc_phis;
+
+ effec_phis_ncopies = SLP_TREE_VEC_STMTS_EFFEC_NUM (reduc_phis);
+ gcc_assert (effec_ncopies <= effec_phis_ncopies);
+ gcc_assert (effec_ncopies == SLP_TREE_VEC_STMTS_EFFEC_NUM (slp_node));
+ }
+
+ if (effec_ncopies < effec_phis_ncopies)
+ {
+ /* Find suitable def-use cycles to generate vectorized statements
+ into, and reorder operands based on the selection. */
+ unsigned curr_pos = reduc_info->reduc_result_pos;
+ unsigned next_pos = (curr_pos + effec_ncopies) % effec_phis_ncopies;
+
+ gcc_assert (curr_pos < effec_phis_ncopies);
+ reduc_info->reduc_result_pos = next_pos;
+
+ if (curr_pos)
+ {
+ unsigned count = effec_phis_ncopies - effec_ncopies;
+ unsigned start = curr_pos - count;
+
+ if ((int) start < 0)
+ {
+ count = curr_pos;
+ start = 0;
+ }
+
+ for (unsigned i = 0; i < op.num_ops - 1; i++)
+ {
+ for (unsigned j = effec_ncopies; j > start; j--)
+ {
+ unsigned k = j - 1;
+ std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
+ gcc_assert (!vec_oprnds[i][k]);
+ }
+ }
+ }
+ }
}
bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 8b9659d221c..3a5c1f7e6cc 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13458,7 +13458,8 @@ vect_transform_stmt (vec_info *vinfo,
case reduc_vec_info_type:
done = vect_transform_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
- gsi, &vec_stmt, slp_node);
+ gsi, &vec_stmt, slp_node,
+ slp_node_instance);
gcc_assert (done);
break;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 0914f8064c6..8e6b494be90 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1412,6 +1412,12 @@ public:
/* The vector type for performing the actual reduction. */
tree reduc_vectype;
+ /* For a loop reduction with multiple vectorized results (ncopies > 1), a
+ lane-reducing operation participating in it may not use all of those
+ results; this field specifies the result index starting from which the
+ next lane-reducing operation would be generated. */
+ unsigned int reduc_result_pos;
+
/* If IS_REDUC_INFO is true and if the vector code is performing
N scalar reductions in parallel, this variable gives the initial
scalar values of those N reductions. */
@@ -2514,7 +2520,7 @@ extern bool vectorizable_induction (loop_vec_info,
stmt_vec_info,
stmt_vector_for_cost *);
extern bool vect_transform_reduction (loop_vec_info, stmt_vec_info,
gimple_stmt_iterator *,
- gimple **, slp_tree);
+ gimple **, slp_tree, slp_instance);
extern bool vect_transform_cycle_phi (loop_vec_info, stmt_vec_info,
gimple **,
slp_tree, slp_instance);
--
2.17.1