Hi,
For below simplified case:
#define LEN (32000)
__attribute__((aligned(16))) float a[LEN],b[LEN];
int foo (int M)
{
for (int i = 0; i < M; i++)
a[i+M] = a[i] + b[i];
}
Compiling it with command like:
$ aarch64-elf-gcc -O3 -S foo.c -o foo.S -std=c99
The assembly code of vectorized loop is in below form:
mov x1, 0
mov w2, 0
.L4:
ldr q0, [x1, x3]
add w2, w2, 1
ldr q1, [x1, x4]
cmp w2, w5
fadd v0.4s, v0.4s, v1.4s
str q0, [x6, x1]
add x1, x1, 16
bcc .L4
Induction variable w2 is unnecessary and can be eliminated with x1. This is
safe because X1 will never overflow during all iterations of loop. The
optimal assembly should be like:
mov x1, 0
.L4:
ldr q0, [x1, x2]
ldr q1, [x1, x4]
fadd v0.4s, v0.4s, v1.4s
str q0, [x5, x1]
add x1, x1, 16
cmp x1, x3
bcc .L4
This case can be handled if we do more complex overflow check on unsigned
type in function iv_elimination_compare_lt.
Also there is another blocker for the transformation, function
number_of_iterations_lt calls fold_build2 to build folded form of
may_be_zero, while iv_elimination_compare_lt only handles simple form tree
expressions. It's possible to have iv_elimination_compare_lt to do some
undo transformation on may_be_zero, but I found it's difficult for cases
involving signed/unsigned conversion like case loop-41.c. Since I think
there is no obvious benefit to fold may_be_zero here (somehow because the
two operands are already in folded forms), this patch just calls build2_loc
instead.
This optimization is picked up by patch B, but patch A is necessary since
the check in iv_elimination_compare_lt of two aff_trees isn't enough when
two different types (one in signed type, the other in unsigned) involved. I
have to use tree comparison here instead. Considering below simple case:
Analyzing # of iterations of loop 5
exit condition [1, + , 1](no_overflow) <= i_1
bounds on difference of bases: -3 ... 1
result:
zero if i_1 + 1 < 1
# of iterations (unsigned int) i_1, bounded by 2
number of iterations (unsigned int) i_1; zero if i_1 + 1 < 1
use 0
compare
in statement if (S.7_9 > i_1)
at position
type integer(kind=4)
base 1
step 1
is a biv
related candidates
candidate 0 (important)
var_before ivtmp.28
var_after ivtmp.28
incremented at end
type unsigned int
base 0
step 1
When GCC trying to eliminate use 0 with cand 0, the miscellaneous trees in
iv_elimination_compare_lt are like below with i_1 of signed type:
B: i_1 + 1
A: 0
niter->niter: (unsigned int)i_1
Apparently, (B-A-1) is "i_1", which doesn't equal to "(unsigned int)i_1".
Without this patch, it is considered equal to each other.
Note that the induction variable IS necessary on 32 bit systems since
otherwise there is type overflow.
These two patch fix the mentioned problem.
They pass bootstrap and regression test on x86_64/x86/aarch64/arm, so any
comments?
Thanks,
bin
PATCH A)
2014-06-23 Bin Cheng <bin.ch...@arm.com>
* tree-ssa-loop-ivopts.c (iv_elimination_compare_lt): Check number
of iteration using tree comparison.
PATCH B)
2014-06-23 Bin Cheng <bin.ch...@arm.com>
* tree-ssa-loop-niter.c (number_of_iterations_lt): Build unfolded
form of may_be_zero.
* tree-ssa-loop-ivopts.c (iv_nowrap_period)
(nowrap_cand_for_loop_niter_p): New functions.
(period_greater_niter_exit): New function refactored from
may_eliminate_iv.
(iv_elimination_compare_lt): New parameter. Relax overflow check.
Handle special forms may_be_zero expression.
(may_eliminate_iv): Call period_greater_niter_exit. Pass new
argument for iv_elimination_compare_lt.
gcc/testsuite/ChangeLog
2014-06-23 Bin Cheng <bin.ch...@arm.com>
* gcc.dg/tree-ssa/loop-40.c: New test.
* gcc.dg/tree-ssa/loop-41.c: New test.
Index: gcc/testsuite/gcc.dg/tree-ssa/loop-41.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/loop-41.c (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/loop-41.c (revision 0)
@@ -0,0 +1,38 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+typedef long unsigned int size_t;
+extern float a[100], b[100];
+
+int foo (int M, int l)
+{
+ unsigned ivtmp = 0, niters, _37, _38, bnd;
+ size_t _67, _1;
+ float *vectp_a, *vectp_b, *vectp_a2;
+ float vect__6, vect__7, vect__8;
+
+ _38 = (unsigned int)l;
+ bnd = _38 + 1;
+
+ _1 = (size_t) M;
+ _67 = _1 * 4;
+ vectp_a = a; vectp_b = b; vectp_a2 = a + _67;
+
+ do
+ {
+ vect__6 = *vectp_a;
+ vect__7 = *vectp_b;
+ vect__8 = vect__6 + vect__7;
+ *vectp_a = vect__8;
+ vectp_a = vectp_a + 4;
+ vectp_b = vectp_b + 4;
+ vectp_a2 = vectp_a2 + 4;
+ ivtmp += 1;
+ }
+ while (ivtmp < bnd);
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "Replacing exit test: if \\(bnd.*\\)" 1
"ivopts" } } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/loop-40.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/loop-40.c (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/loop-40.c (revision 0)
@@ -0,0 +1,38 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O2 -fdump-tree-ivopts-details" } */
+
+typedef long unsigned int size_t;
+extern float a[100], b[100];
+
+int foo (int M, unsigned int l)
+{
+ unsigned ivtmp = 0, niters, _37, _38, bnd;
+ size_t _67, _1;
+ float *vectp_a, *vectp_b, *vectp_a2;
+ float vect__6, vect__7, vect__8;
+
+ _38 = (unsigned int)l;
+ bnd = _38 + 1;
+
+ _1 = (size_t) M;
+ _67 = _1 * 4;
+ vectp_a = a; vectp_b = b; vectp_a2 = a + _67;
+
+ do
+ {
+ vect__6 = *vectp_a;
+ vect__7 = *vectp_b;
+ vect__8 = vect__6 + vect__7;
+ *vectp_a = vect__8;
+ vectp_a = vectp_a + 4;
+ vectp_b = vectp_b + 4;
+ vectp_a2 = vectp_a2 + 4;
+ ivtmp += 1;
+ }
+ while (ivtmp < bnd);
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "Replacing exit test: if \\(bnd.*\\)" 1
"ivopts" } } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: gcc/tree-ssa-loop-niter.c
===================================================================
--- gcc/tree-ssa-loop-niter.c (revision 211488)
+++ gcc/tree-ssa-loop-niter.c (working copy)
@@ -1150,11 +1150,16 @@ number_of_iterations_lt (tree type, affine_iv *iv0
First try to derive a lower bound on the value of
iv1->base - iv0->base, computed in full precision. If the difference
is nonnegative, we are done, otherwise we must record the
- condition. */
+ condition.
+
+ Call build2_loc here rather than fold_build2 to build the unfolded
+ expression because it can be handled more easily by optimizations
+ like IVOPT. */
if (mpz_sgn (bnds->below) < 0)
- niter->may_be_zero = fold_build2 (LT_EXPR, boolean_type_node,
- iv1->base, iv0->base);
+ niter->may_be_zero = build2_loc (UNKNOWN_LOCATION,
+ LT_EXPR, boolean_type_node,
+ iv1->base, iv0->base);
niter->niter = delta;
niter->max = widest_int::from (wi::from_mpz (niter_type, bnds->up,
false),
TYPE_SIGN (niter_type));
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc/tree-ssa-loop-ivopts.c (revision 211488)
+++ gcc/tree-ssa-loop-ivopts.c (working copy)
@@ -4432,6 +4432,44 @@ iv_period (struct iv *iv)
return period;
}
+/* Returns no wrapping period of induction variable IV. For now
+ only unsigned type IV is handled, we could extend it in case
+ of non-overflow for signed ones. Return zero if it can't be
+ decided. */
+
+static tree
+iv_nowrap_period (struct iv *iv)
+{
+ bool overflow;
+ tree type;
+ tree base = iv->base, step = iv->step;
+ widest_int base_val, step_val, max_val, span, period;
+
+ gcc_assert (step && TREE_CODE (step) == INTEGER_CST);
+
+ type = TREE_TYPE (base);
+ if (!TYPE_UNSIGNED (type) || TREE_CODE (base) != INTEGER_CST)
+ return integer_zero_node;
+
+ base_val = wi::to_widest (base);
+ step_val = wi::to_widest (step);
+ if (!POINTER_TYPE_P (type) && TYPE_MAX_VALUE (type)
+ && TREE_CODE (TYPE_MAX_VALUE (type)) == INTEGER_CST)
+ max_val = wi::to_widest (TYPE_MAX_VALUE (type));
+ else
+ {
+ wide_int max_wi = wi::max_value (TYPE_PRECISION (type), UNSIGNED);
+ max_val = wi::to_widest (wide_int_to_tree (type, max_wi));
+ }
+
+ span = max_val - base_val + step_val - 1;
+ period = wi::div_trunc (span, step_val, UNSIGNED, &overflow);
+ if (overflow)
+ return integer_zero_node;
+
+ return wide_int_to_tree (type, period);
+}
+
/* Returns the comparison operator used when eliminating the iv USE. */
static enum tree_code
@@ -4560,7 +4598,84 @@ difference_cannot_overflow_p (tree base, tree offs
}
}
-/* Tries to replace loop exit by one formulated in terms of a LT_EXPR
+/* Check whether PERIOD of CAND is greater than the number of iterations
+ described by DESC for which the exit condition is true. The exit
+ condition is comparison against USE. */
+
+static bool
+period_greater_niter_exit (struct ivopts_data *data,
+ struct iv_use *use, struct iv_cand *cand,
+ tree period, struct tree_niter_desc *desc)
+{
+ struct loop *loop = data->current_loop;
+
+ /* If the number of iterations is constant, compare against it directly. */
+ if (TREE_CODE (desc->niter) == INTEGER_CST)
+ {
+ /* See cand_value_at. */
+ if (stmt_after_increment (loop, cand, use->stmt))
+ {
+ if (!tree_int_cst_lt (desc->niter, period))
+ return false;
+ }
+ else
+ {
+ if (tree_int_cst_lt (period, desc->niter))
+ return false;
+ }
+ }
+
+ /* If not, and if this is the only possible exit of the loop, see whether
+ we can get a conservative estimate on the number of iterations of the
+ entire loop and compare against that instead. */
+ else
+ {
+ widest_int period_value, max_niter;
+
+ max_niter = desc->max;
+ if (stmt_after_increment (loop, cand, use->stmt))
+ max_niter += 1;
+ period_value = wi::to_widest (period);
+ if (wi::gtu_p (max_niter, period_value))
+ {
+ /* See if we can take advantage of inferred loop bound information.
*/
+ if (data->loop_single_exit_p)
+ {
+ if (!max_loop_iterations (loop, &max_niter))
+ return false;
+ /* The loop bound is already adjusted by adding 1. */
+ if (wi::gtu_p (max_niter, period_value))
+ return false;
+ }
+ else
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Determine whether no wrapping period of CAND is greater than
+ the number of iterations described by DESC for which exit
+ conditionis true. The exit condition is comparison against USE. */
+
+static bool
+nowrap_cand_for_loop_niter_p (struct ivopts_data *data,
+ struct iv_use *use,
+ struct iv_cand *cand,
+ struct tree_niter_desc *desc)
+{
+ tree period;
+
+ period = iv_nowrap_period (cand->iv);
+ if (period != integer_zero_node
+ && period_greater_niter_exit (data, use, cand, period, desc))
+ return true;
+
+ return false;
+}
+
+/* Tries to replace loop exit in USE by one formulated in terms of a LT_EXPR
comparison with CAND. NITER describes the number of iterations of
the loops. If successful, the comparison in COMP_P is altered accordingly.
@@ -4597,24 +4712,32 @@ difference_cannot_overflow_p (tree base, tree offs
1) if a + 1 <= b, then p_0 - a + b is the final value of p, hence there is
no
overflow in computing it or the values of p.
2) if a + 1 > b, then we need to verify that the expression p_0 - a does not
- overflow. To prove this, we use the fact that p_0 = base + a. */
+ overflow. To prove this, we use the fact that p_0 = base + a.
+
+ Moreover, with more complex overflow analysis for unsigned type, it is
+ possible to eliminate I with P if P is an induction candidate of unsigned
+ integer type other than pointer if we can prove that P doesn't overflow
+ during all iterations of current loop. */
+
static bool
-iv_elimination_compare_lt (struct ivopts_data *data,
- struct iv_cand *cand, enum tree_code *comp_p,
+iv_elimination_compare_lt (struct ivopts_data *data, struct iv_use *use,
+ struct iv_cand *cand, enum tree_code *comp_p,
struct tree_niter_desc *niter)
{
tree cand_type, a, b, mbz, nit_type = TREE_TYPE (niter->niter), offset;
- struct aff_tree nit, tmpa, tmpb;
+ struct aff_tree nit, tmp1, tmpa, tmpb;
enum tree_code comp;
HOST_WIDE_INT step;
/* We need to know that the candidate induction variable does not overflow.
- While more complex analysis may be used to prove this, for now just
- check that the variable appears in the original program and that it
- is computed in a type that guarantees no overflows. */
+ Apart from checking that the variable appears in the original program
+ and that is is computed in a type that guarantees no overflows, we also
+ check no wrapping periord of the variable covers the niter if it has
+ unsigned type. More complex analysis may be used to prove this. */
cand_type = TREE_TYPE (cand->iv->base);
- if (cand->pos != IP_ORIGINAL || !nowrap_type_p (cand_type))
+ if ((cand->pos != IP_ORIGINAL || !nowrap_type_p (cand_type))
+ && !nowrap_cand_for_loop_niter_p (data, use, cand, niter))
return false;
/* Make sure that the loop iterates till the loop bound is hit, as otherwise
@@ -4641,6 +4764,12 @@ static bool
a = TREE_OPERAND (op0, 0);
b = TREE_OPERAND (mbz, 1);
}
+ /* Handle "0 + 1 > b' + 1", in which "a == 0" and "b = b' + 1". */
+ else if (integer_onep (op0))
+ {
+ a = integer_zero_node;
+ b = TREE_OPERAND (mbz, 1);
+ }
else
return false;
}
@@ -4654,6 +4783,12 @@ static bool
a = TREE_OPERAND (op1, 0);
b = TREE_OPERAND (mbz, 0);
}
+ /* Handle "b' + 1 < 0 + 1", in which "a == 0" and "b = b' + 1". */
+ else if (integer_onep (op1))
+ {
+ a = integer_zero_node;
+ b = TREE_OPERAND (mbz, 0);
+ }
else
return false;
}
@@ -4673,12 +4808,15 @@ static bool
return false;
/* Finally, check that CAND->IV->BASE - CAND->IV->STEP * A does not
- overflow. */
- offset = fold_build2 (MULT_EXPR, TREE_TYPE (cand->iv->step),
- cand->iv->step,
- fold_convert (TREE_TYPE (cand->iv->step), a));
- if (!difference_cannot_overflow_p (cand->iv->base, offset))
- return false;
+ overflow. The check is uncessary when A equals to ZERO. */
+ if (a != integer_zero_node)
+ {
+ offset = fold_build2 (MULT_EXPR, TREE_TYPE (cand->iv->step),
+ cand->iv->step,
+ fold_convert (TREE_TYPE (cand->iv->step), a));
+ if (!difference_cannot_overflow_p (cand->iv->base, offset))
+ return false;
+ }
/* Determine the new comparison operator. */
comp = step < 0 ? GT_EXPR : LT_EXPR;
@@ -4733,50 +4871,9 @@ may_eliminate_iv (struct ivopts_data *data,
This is the case iff the period of the induction variable is greater
than the number of iterations for which the exit condition is true. */
period = iv_period (cand->iv);
+ if (!period_greater_niter_exit (data, use, cand, period, desc))
+ return false;
- /* If the number of iterations is constant, compare against it directly. */
- if (TREE_CODE (desc->niter) == INTEGER_CST)
- {
- /* See cand_value_at. */
- if (stmt_after_increment (loop, cand, use->stmt))
- {
- if (!tree_int_cst_lt (desc->niter, period))
- return false;
- }
- else
- {
- if (tree_int_cst_lt (period, desc->niter))
- return false;
- }
- }
-
- /* If not, and if this is the only possible exit of the loop, see whether
- we can get a conservative estimate on the number of iterations of the
- entire loop and compare against that instead. */
- else
- {
- widest_int period_value, max_niter;
-
- max_niter = desc->max;
- if (stmt_after_increment (loop, cand, use->stmt))
- max_niter += 1;
- period_value = wi::to_widest (period);
- if (wi::gtu_p (max_niter, period_value))
- {
- /* See if we can take advantage of inferred loop bound information.
*/
- if (data->loop_single_exit_p)
- {
- if (!max_loop_iterations (loop, &max_niter))
- return false;
- /* The loop bound is already adjusted by adding 1. */
- if (wi::gtu_p (max_niter, period_value))
- return false;
- }
- else
- return false;
- }
- }
-
cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
*bound = fold_convert (TREE_TYPE (cand->iv->base),
@@ -4796,7 +4893,7 @@ may_eliminate_iv (struct ivopts_data *data,
base the exit condition on it. However, that is often too
expensive. */
if (!integer_zerop (desc->may_be_zero))
- return iv_elimination_compare_lt (data, cand, comp, desc);
+ return iv_elimination_compare_lt (data, use, cand, comp, desc);
return true;
}
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc/tree-ssa-loop-ivopts.c (revision 211488)
+++ gcc/tree-ssa-loop-ivopts.c (working copy)
@@ -4605,7 +4605,7 @@ iv_elimination_compare_lt (struct ivopts_data *dat
struct tree_niter_desc *niter)
{
tree cand_type, a, b, mbz, nit_type = TREE_TYPE (niter->niter), offset;
- struct aff_tree nit, tmpa, tmpb;
+ struct aff_tree nit, tmp1, tmpa, tmpb;
enum tree_code comp;
HOST_WIDE_INT step;
@@ -4661,15 +4661,19 @@ iv_elimination_compare_lt (struct ivopts_data *dat
return false;
/* Expected number of iterations is B - A - 1. Check that it matches
- the actual number, i.e., that B - A - NITER = 1. */
+ the actual number, i.e., that B - A = NITER + 1. */
tree_to_aff_combination (niter->niter, nit_type, &nit);
- tree_to_aff_combination (fold_convert (nit_type, a), nit_type, &tmpa);
- tree_to_aff_combination (fold_convert (nit_type, b), nit_type, &tmpb);
- aff_combination_scale (&nit, -1);
- aff_combination_scale (&tmpa, -1);
- aff_combination_add (&tmpb, &tmpa);
- aff_combination_add (&tmpb, &nit);
- if (tmpb.n != 0 || tmpb.offset != 1)
+ aff_combination_const (&tmp1, nit_type, 1);
+ tree_to_aff_combination (b, TREE_TYPE (b), &tmpb);
+ aff_combination_add (&nit, &tmp1);
+ if (a != integer_zero_node)
+ {
+ tree_to_aff_combination (a, TREE_TYPE (b), &tmpa);
+ aff_combination_scale (&tmpa, -1);
+ aff_combination_add (&tmpb, &tmpa);
+ }
+ if (!operand_equal_p (aff_combination_to_tree (&nit),
+ aff_combination_to_tree (&tmpb), 0))
return false;
/* Finally, check that CAND->IV->BASE - CAND->IV->STEP * A does not