From: Pan Li <[email protected]>
To get better vectorized code for .SAT_SUB, we would like to avoid the
truncation operation on the assignment. For example, as below:
unsigned int _1;
unsigned int _2;
_9 = (unsigned short int).SAT_SUB (_1, _2);
If we can make sure that _1 is within the range of unsigned short int,
for example because there is a def similar to:
_1 = (unsigned short int)_4;
then we can distribute the truncation operation to:
_3 = MIN_EXPR (_2, 65535);
_9 = .SAT_SUB ((unsigned short int)_1, (unsigned short int)_3);
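To see why the distribution is valid, here is a minimal scalar sketch
(illustration only; the helper names are mine and not part of the patch).
Assuming _1 is known to fit in unsigned short int, both functions below
return the same value for every input:

/* Scalar model of the original form: truncate the wide .SAT_SUB result.  */
unsigned short
sat_sub_then_trunc (unsigned int a, unsigned int b)
{
  return (unsigned short) (a >= b ? a - b : 0);
}

/* Scalar model of the distributed form: clamp b, truncate both operands,
   then do the saturating subtraction in the narrow type.  */
unsigned short
sat_sub_distributed (unsigned int a, unsigned int b)
{
  unsigned int c = b <= 65535 ? b : 65535;  /* _3 = MIN_EXPR (_2, 65535)  */
  unsigned short na = (unsigned short) a;
  unsigned short nb = (unsigned short) c;
  return (unsigned short) (na >= nb ? na - nb : 0);
}

Once b is capped at 65535, the narrow compare-and-subtract saturates to zero
exactly when the wide form would, so no separate narrowing statement is left
after the .SAT_SUB.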
Let's take RISC-V vector as an example to illustrate the changes, with
the sample code below:
__attribute__((noinline))
void test (uint16_t *x, unsigned b, unsigned n)
{
  unsigned a = 0;
  uint16_t *p = x;

  do {
    a = *--p;
    *p = (uint16_t)(a >= b ? a - b : 0);
  } while (--n);
}
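The before/after assembly below shows the vectorized inner loop. It can
presumably be reproduced with an RVV-enabled compiler at -O3, for example
(the exact invocation is my assumption and is not stated in the original):

riscv64-unknown-linux-gnu-gcc -O3 -march=rv64gcv -mabi=lp64d -S test.c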
Before this patch:
...
.L3:
vle16.v v1,0(a3)
vrsub.vx v5,v2,t1
mv t3,a4
addw a4,a4,t5
vrgather.vv v3,v1,v5
vsetvli zero,zero,e32,m1,ta,ma
vzext.vf2 v1,v3
vssubu.vx v1,v1,a1
vsetvli zero,zero,e16,mf2,ta,ma
vncvt.x.x.w v1,v1
vrgather.vv v3,v1,v5
vse16.v v3,0(a3)
sub a3,a3,t4
bgtu t6,a4,.L3
...
After this patch (the vsetvli toggles, the e32 widening vzext.vf2 and the
narrowing vncvt.x.x.w are gone; the saturating subtraction now operates
directly on the e16 elements):
test:
...
.L3:
vle16.v v3,0(a3)
vrsub.vx v5,v2,a6
mv a7,a4
addw a4,a4,t3
vrgather.vv v1,v3,v5
vssubu.vv v1,v1,v6
vrgather.vv v3,v1,v5
vse16.v v3,0(a3)
sub a3,a3,t1
bgtu t4,a4,.L3
...
The following test suites are passed for this patch:
1. The rv64gcv full regression tests.
2. The rv64gcv build with glibc.
3. The x86 bootstrap tests.
4. The x86 full regression tests.
gcc/ChangeLog:

	* tree-vect-patterns.cc (vect_recog_sat_sub_pattern_distribute):
	New function to perform the truncation distribution.
	(vect_recog_sat_sub_pattern): Perform the above optimization
	before generating the .SAT_SUB call.
Signed-off-by: Pan Li <[email protected]>
---
gcc/tree-vect-patterns.cc | 75 +++++++++++++++++++++++++++++++++++++++
1 file changed, 75 insertions(+)
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 86e893a1c43..90449bd0ddd 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -4566,6 +4566,79 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
return NULL;
}
+/*
+ * Try to distribute the truncation for the .SAT_SUB pattern, which mostly
+ * occurs in the zip benchmark.  Aka:
+ *
+ *   unsigned int _1;
+ *   unsigned int _2;
+ *   _9 = (unsigned short int).SAT_SUB (_1, _2);
+ *
+ * If _1 is known to be in the range of unsigned short int, for example
+ * because there is a def _1 = (unsigned short int)_4, then we can
+ * distribute the truncation to:
+ *
+ *   _3 = MIN (65535, _2);
+ *   _9 = .SAT_SUB (_4, (unsigned short int)_3);
+ *
+ * Then we get better vectorized code and avoid the unnecessary narrowing
+ * stmt during vectorization.
+ */
+static void
+vect_recog_sat_sub_pattern_distribute (vec_info *vinfo,
+                                       stmt_vec_info stmt_vinfo,
+                                       gimple *stmt, tree lhs, tree *ops)
+{
+  tree otype = TREE_TYPE (lhs);
+  tree itype = TREE_TYPE (ops[0]);
+  unsigned itype_prec = TYPE_PRECISION (itype);
+  unsigned otype_prec = TYPE_PRECISION (otype);
+
+  if (types_compatible_p (otype, itype) || otype_prec >= itype_prec)
+    return;
+
+  int_range_max r;
+  gimple_ranger granger;
+
+  if (granger.range_of_expr (r, ops[0], stmt) && !r.undefined_p ())
+    {
+      wide_int bound = r.upper_bound ();
+      wide_int otype_max = wi::mask (otype_prec, /* negate */false, itype_prec);
+      tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
+      tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
+
+      if (!wi::leu_p (bound, otype_max) || v_otype == NULL || v_itype == NULL
+          || !target_supports_op_p (v_itype, MIN_EXPR, optab_vector))
+        return;
+
+      /* 1. Build truncated op_0.  */
+      vect_unpromoted_value unprom;
+      tree tmp = vect_look_through_possible_promotion (vinfo, ops[0], &unprom);
+
+      if (tmp == NULL_TREE || TYPE_PRECISION (unprom.type) != otype_prec)
+        {
+          tmp = vect_recog_temp_ssa_var (otype, NULL);
+          gimple *op_0_cast = gimple_build_assign (tmp, NOP_EXPR, ops[0]);
+          append_pattern_def_seq (vinfo, stmt_vinfo, op_0_cast, v_otype);
+        }
+
+      ops[0] = tmp;
+
+      /* 2. Build MIN_EXPR (op_1, 65535).  */
+      tree max = wide_int_to_tree (itype, otype_max);
+      tree op_1_in = vect_recog_temp_ssa_var (itype, NULL);
+      gimple *op_1_min = gimple_build_assign (op_1_in, MIN_EXPR, ops[1], max);
+      append_pattern_def_seq (vinfo, stmt_vinfo, op_1_min, v_itype);
+
+      /* 3. Build truncated op_1.  */
+      tmp = vect_recog_temp_ssa_var (otype, NULL);
+      gimple *op_1_cast = gimple_build_assign (tmp, NOP_EXPR, op_1_in);
+      append_pattern_def_seq (vinfo, stmt_vinfo, op_1_cast, v_otype);
+
+      ops[1] = tmp;
+    }
+}
+
/*
* Try to detect saturation sub pattern (SAT_ADD), aka below gimple:
* _7 = _1 >= _2;
@@ -4591,6 +4664,8 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
{
+      vect_recog_sat_sub_pattern_distribute (vinfo, stmt_vinfo, last_stmt,
+                                             lhs, ops);
gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
IFN_SAT_SUB, type_out,
lhs, ops[0], ops[1]);
--
2.34.1