Hello,
I'm facing the following problem related to ivopts. The problem is that
GCC generates a lot of induction variables and as a result there is an
unnecessary increase of stack usage and register pressure.
For instance, for the attached test case (tc_ivopts.c) GCC generates 26
induction variables (25, one for each of lhsX[{0-4}][{0-4}][j][k], and one
for rhs[k][j][{0-2}]). You should be able to reproduce this issue on an ARM
target using GCC with "-O2 -mcpu=cortex-a9 -mtune=cortex-a9". For
reference, I'm attaching the assembly I get on current trunk.
The reason might be in the costs of use groups, in particular in the way
they are estimated. Currently, the cost of a tuple (group, candidate) is
the sum of the per-use costs in the group. So, the cost of a group grows
proportionally to the number of uses in the group. This approach has a
negative effect on the algorithm for finding the best set of induction
variables: the part of the total cost related to adding a new candidate is
almost washed out by the cost of the group. In addition, when there are
many groups with many uses inside and the target is out of free
registers, the cost of spilling is washed out too. As a result, GCC
prefers to use a native candidate (a candidate created for a particular
group) for each group of uses instead of considering the real cost of
introducing a new variable into the set.
The summing approach was added as a part of this patch
(https://gcc.gnu.org/ml/gcc-patches/2015-05/msg00641.html) and the
motivation for taking the sum does not seem to be
specifically discussed.
I propose the following patch, which changes the group cost from the sum
of the per-use costs to the largest per-use cost inside the group. For the
given test case, the required stack size decreases by almost a factor of 3
(frame = 216 vs. frame = 80 in the attached assembly) and the number of
ldr/str instructions decreases by slightly less than a factor of 2. I'm
also attaching the assembly generated after applying the patch.
The essential change in the patch is just:
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index f9211ad..a149418 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -5151,7 +5151,8 @@ determine_group_iv_cost_address (struct ivopts_data *data,
offset and where the iv_use is. */
cost = get_computation_cost (data, next, cand, true,
NULL, &can_autoinc, NULL);
- sum_cost += cost;
+ if (sum_cost < cost)
+ sum_cost = cost;
}
set_group_iv_cost (data, group, cand, sum_cost, depends_on,
NULL_TREE, ERROR_MARK, inv_expr);
Any suggestions?
Thanks,
Evgeny.
/* Extent of the first two dimensions of rhs and the initial value of
   gp22.  */
#define SIZE 20
/* Run-time copy of SIZE.  x_solve uses it for its loop bounds and for the
   last two dimensions of its local array, which makes that array
   variable-length.  */
int gp22 = SIZE;
/* Global array updated in place by x_solve.  */
double rhs [SIZE][SIZE][5];
/* Reduced test case for the ivopts behaviour discussed in this message.
   For every (j, k) in [1, gp22) x [1, gp22) it performs 35 in-place
   subtractions on elements of the local variable-length array lhsX and
   two on the global array rhs.  Takes no arguments and returns nothing.

   The statement sequence is deliberately kept exactly as is, since the
   generated code depends on it.

   NOTE(review): lhsX is read without ever being initialized; presumably
   acceptable because only the generated code matters here -- confirm.  */
void x_solve()
{
int j, k;
double lhsX[5][5][gp22][gp22];
for (j = 1; j < gp22; j++) {
for (k = 1; k < gp22; k++) {
/* Subtract plane 1 (first index) from planes 0, 2, 3 and 4.  */
lhsX[0][0][j][k] -= lhsX[1][0][j][k];
lhsX[0][1][j][k] -= lhsX[1][1][j][k];
lhsX[0][2][j][k] -= lhsX[1][2][j][k];
lhsX[0][3][j][k] -= lhsX[1][3][j][k];
lhsX[0][4][j][k] -= lhsX[1][4][j][k];
lhsX[2][0][j][k] -= lhsX[1][0][j][k];
lhsX[2][1][j][k] -= lhsX[1][1][j][k];
lhsX[2][2][j][k] -= lhsX[1][2][j][k];
lhsX[2][3][j][k] -= lhsX[1][3][j][k];
lhsX[2][4][j][k] -= lhsX[1][4][j][k];
lhsX[3][0][j][k] -= lhsX[1][0][j][k];
lhsX[3][1][j][k] -= lhsX[1][1][j][k];
lhsX[3][2][j][k] -= lhsX[1][2][j][k];
lhsX[3][3][j][k] -= lhsX[1][3][j][k];
lhsX[3][4][j][k] -= lhsX[1][4][j][k];
lhsX[4][0][j][k] -= lhsX[1][0][j][k];
lhsX[4][1][j][k] -= lhsX[1][1][j][k];
lhsX[4][2][j][k] -= lhsX[1][2][j][k];
lhsX[4][3][j][k] -= lhsX[1][3][j][k];
lhsX[4][4][j][k] -= lhsX[1][4][j][k];
/* Subtract the just-updated plane 3 from planes 0, 2 and 4.  */
lhsX[0][0][j][k] -= lhsX[3][0][j][k];
lhsX[0][1][j][k] -= lhsX[3][1][j][k];
lhsX[0][2][j][k] -= lhsX[3][2][j][k];
lhsX[0][3][j][k] -= lhsX[3][3][j][k];
lhsX[0][4][j][k] -= lhsX[3][4][j][k];
lhsX[2][0][j][k] -= lhsX[3][0][j][k];
lhsX[2][1][j][k] -= lhsX[3][1][j][k];
lhsX[2][2][j][k] -= lhsX[3][2][j][k];
lhsX[2][3][j][k] -= lhsX[3][3][j][k];
lhsX[2][4][j][k] -= lhsX[3][4][j][k];
lhsX[4][0][j][k] -= lhsX[3][0][j][k];
lhsX[4][1][j][k] -= lhsX[3][1][j][k];
lhsX[4][2][j][k] -= lhsX[3][2][j][k];
lhsX[4][3][j][k] -= lhsX[3][3][j][k];
lhsX[4][4][j][k] -= lhsX[3][4][j][k];
/* Note the swapped index order: rhs is indexed [k][j], lhsX [j][k].  */
rhs[k][j][0] -= rhs[k][j][1];
rhs[k][j][1] -= rhs[k][j][2];
}/*end k*/
}/*end j*/
}
.cpu cortex-a9
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 2
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.file "bt.c"
.global __aeabi_dsub
.text
.align 2
.global x_solve
.syntax unified
.arm
.fpu softvfp
.type x_solve, %function
x_solve:
@ args = 0, pretend = 0, frame = 216
@ frame_needed = 1, uses_anonymous_args = 0
movw r3, #:lower16:.LANCHOR0
push {r4, r5, r6, r7, r8, r9, r10, fp, lr}
movt r3, #:upper16:.LANCHOR0
add fp, sp, #32
sub sp, sp, #220
ldr r2, [r3]
mov r1, #200
cmp r2, #1
str r2, [fp, #-136]
mul r3, r2, r2
lsl r0, r2, #3
mul r1, r1, r3
add r2, r3, r3, lsl #2
add r1, r1, #8
sub sp, sp, r1
ble .L1
lsl r2, r2, #3
lsl r3, r3, #3
lsr r1, r0, #3
str r0, [fp, #-140]
lsr r2, r2, #3
lsr r3, r3, #3
str sp, [fp, #-248]
lsl ip, r2, #1
add r0, r2, r3
lsl r5, r0, #1
add lr, r2, r1
add r6, ip, r2
lsl r10, r3, #1
str r6, [fp, #-72]
lsl r8, r6, #3
add r6, r3, r0
str ip, [fp, #-60]
add r6, r1, r6, lsl #1
str r8, [fp, #-40]
add r8, lr, r3, lsl #2
add ip, r3, r1
str r6, [fp, #-68]
add r6, r5, r0
add r6, r6, r1
add r5, r5, r1
lsl r6, r6, #3
add r4, r10, r3
str r8, [fp, #-56]
add r8, ip, r2, lsl #2
str r6, [fp, #-212]
add r6, r10, lr
lsl r10, r5, #3
lsl r9, r4, #3
str r8, [fp, #-80]
add r8, r1, r0, lsl #2
add r4, r4, lr
str r10, [fp, #-188]
add r7, r2, lr
lsl r10, r4, #3
str r7, [fp, #-52]
add r7, r3, ip
str r7, [fp, #-44]
add r7, r2, r0
str r8, [fp, #-92]
add r8, r9, r2, lsl #4
str r10, [fp, #-168]
lsl r10, r6, #3
ldr r6, [fp, #-52]
add r7, r1, r7, lsl #1
str r8, [fp, #-64]
add r8, r3, ip
ldr r4, [fp, #-72]
add r8, r3, r8
str r7, [fp, #-84]
add r7, r2, lr
add r7, r2, r7
str r8, [fp, #-48]
str r10, [fp, #-160]
lsl r10, r6, #3
ldr r8, [fp, #-40]
add r9, r9, r2, lsl #5
ldr r5, [fp, #-60]
add r2, r2, r7
add r4, r4, ip
str r10, [fp, #-180]
lsl r4, r4, #3
lsl r10, r2, #3
add r8, r8, r3, lsl #4
ldr r6, [fp, #-48]
add r5, r5, ip
str r10, [fp, #-220]
str r4, [fp, #-204]
lsl r10, r7, #3
lsl r4, r5, #3
str r9, [fp, #-88]
str r8, [fp, #-76]
lsl lr, lr, #3
ldr r9, [fp, #-136]
lsl ip, ip, #3
ldr r8, [fp, #-40]
str r4, [fp, #-184]
str r10, [fp, #-200]
add r9, r9, r1
ldr r7, [fp, #-44]
add r1, r1, r0
add r8, r8, r3, lsl #5
add r3, r3, r6
str lr, [fp, #-144]
lsl r9, r9, #3
ldr r0, [fp, #-140]
lsl r4, r7, #3
ldr r7, [fp, #-84]
ldr r2, [fp, #-64]
str r4, [fp, #-156]
lsl r4, r3, #3
ldr r3, [fp, #-56]
str r4, [fp, #-172]
lsl r4, r6, #3
ldr r6, [fp, #-68]
lsl lr, r7, #3
str r4, [fp, #-164]
str lr, [fp, #-228]
str ip, [fp, #-148]
lsl r4, r6, #3
str r4, [fp, #-196]
lsl r4, r3, #3
ldr r3, [fp, #-80]
str r4, [fp, #-176]
lsl lr, r3, #3
ldr r3, [fp, #-92]
str lr, [fp, #-224]
lsl r3, r3, #3
str r3, [fp, #-236]
lsl r3, r1, #3
add r1, r2, r0
ldr r2, [fp, #-88]
str r1, [fp, #-192]
str r3, [fp, #-152]
movw r3, #:lower16:rhs
add r1, r2, r0
ldr r2, [fp, #-76]
movt r3, #:upper16:rhs
str r1, [fp, #-232]
str r3, [fp, #-244]
mov r3, #1
add r2, r2, r0
str r3, [fp, #-128]
str r2, [fp, #-208]
add r2, r8, r0
mov r8, #0
str r2, [fp, #-216]
sub r2, r9, #8
str r2, [fp, #-240]
.L4:
ldr r3, [fp, #-128]
ldr r0, [fp, #-156]
ldr r5, [fp, #-168]
ldr r2, [fp, #-244]
add r0, r8, r0
ldr r1, [fp, #-152]
add r4, r3, r3, lsl #2
str r0, [fp, #-52]
add r0, r8, r5
ldr r5, [fp, #-172]
ldr r3, [fp, #-140]
add r1, r8, r1
add r4, r2, r4, lsl #3
ldr r2, [fp, #-148]
str r1, [fp, #-48]
add r1, r8, r5
ldr r5, [fp, #-176]
add r10, r3, r8
add r2, r8, r2
ldr r3, [fp, #-144]
str r2, [fp, #-44]
add r4, r4, #800
add r2, r8, r5
ldr r5, [fp, #-180]
add r3, r8, r3
ldr r9, [fp, #-196]
str r3, [fp, #-40]
add r3, r8, r5
ldr r5, [fp, #-184]
add r9, r8, r9
ldr r7, [fp, #-192]
str r9, [fp, #-92]
add r5, r8, r5
ldr r6, [fp, #-188]
str r5, [fp, #-80]
add r7, r8, r7
ldr r5, [fp, #-200]
str r7, [fp, #-88]
add r6, r8, r6
str r6, [fp, #-84]
add r9, r8, r5
ldr r5, [fp, #-204]
ldr ip, [fp, #-160]
str r10, [fp, #-132]
add r7, r8, r5
ldr r5, [fp, #-208]
add lr, r8, ip
str r3, [fp, #-76]
ldr ip, [fp, #-164]
add r6, r8, r5
ldr r5, [fp, #-212]
add ip, r8, ip
add r5, r8, r5
str r5, [fp, #-96]
ldr r5, [fp, #-216]
ldr r3, [fp, #-220]
add r5, r8, r5
add r3, r8, r3
str r3, [fp, #-100]
ldr r3, [fp, #-224]
add r3, r8, r3
str r3, [fp, #-104]
ldr r3, [fp, #-228]
add r3, r8, r3
str r3, [fp, #-108]
ldr r3, [fp, #-232]
add r3, r8, r3
str r3, [fp, #-112]
ldr r3, [fp, #-236]
add r3, r8, r3
str r3, [fp, #-124]
ldr r3, [fp, #-240]
add r8, r3, r8
ldr r3, [fp, #-40]
str r8, [fp, #-116]
ldr r8, [fp, #-248]
add r3, r8, r3
add r1, r8, r1
str r3, [fp, #-40]
add lr, r8, lr
ldr r3, [fp, #-44]
add ip, r8, ip
str r1, [fp, #-68]
add r1, r8, r2
ldr r2, [fp, #-76]
add r0, r8, r0
add r3, r8, r3
str r1, [fp, #-72]
str r3, [fp, #-44]
add r10, r8, r10
ldr r3, [fp, #-48]
add r1, r8, r2
ldr r2, [fp, #-80]
add r9, r8, r9
str r1, [fp, #-76]
add r7, r8, r7
add r3, r8, r3
str lr, [fp, #-56]
add r1, r8, r2
str r3, [fp, #-48]
ldr r2, [fp, #-84]
add r6, r8, r6
ldr r3, [fp, #-52]
str r1, [fp, #-80]
add r1, r8, r2
str ip, [fp, #-60]
add r3, r8, r3
str r0, [fp, #-64]
str r3, [fp, #-52]
mov r3, r8
str r1, [fp, #-84]
add r5, r3, r5
ldr r2, [fp, #-88]
add r1, r8, r2
ldr r2, [fp, #-92]
str r1, [fp, #-88]
add r1, r8, r2
ldr r2, [fp, #-96]
str r1, [fp, #-92]
add r8, r8, r2
ldr r2, [fp, #-100]
add r1, r3, r2
ldr r2, [fp, #-104]
str r1, [fp, #-96]
add r1, r3, r2
ldr r2, [fp, #-108]
str r1, [fp, #-100]
add r1, r3, r2
ldr r2, [fp, #-112]
str r1, [fp, #-104]
add r1, r3, r2
ldr r2, [fp, #-124]
str r1, [fp, #-108]
add r1, r3, r2
ldr r2, [fp, #-116]
str r1, [fp, #-112]
add r3, r3, r2
str r3, [fp, #-116]
.L3:
ldr r1, [fp, #-40]
ldrd r2, [r1, #8]!
str r1, [fp, #-40]
ldrd r0, [r10, #8]!
bl __aeabi_dsub
ldr ip, [fp, #-44]
strd r0, [r10]
ldr r0, [fp, #-48]
ldrd r2, [r0, #8]!
str r0, [fp, #-48]
ldrd r0, [ip, #8]!
str ip, [fp, #-44]
bl __aeabi_dsub
ldr ip, [fp, #-44]
ldr lr, [fp, #-56]
strd r0, [ip]
ldrd r2, [lr, #8]!
str lr, [fp, #-56]
ldr lr, [fp, #-52]
ldrd r0, [lr, #8]!
str lr, [fp, #-52]
bl __aeabi_dsub
ldr r3, [fp, #-52]
ldr lr, [fp, #-60]
strd r0, [r3]
ldr r1, [fp, #-64]
ldrd r2, [r1, #8]!
str r1, [fp, #-64]
ldrd r0, [lr, #8]!
str lr, [fp, #-60]
bl __aeabi_dsub
ldr r2, [fp, #-60]
ldr ip, [fp, #-68]
strd r0, [r2]
ldr r0, [fp, #-72]
ldrd r2, [r0, #8]!
str r0, [fp, #-72]
ldrd r0, [ip, #8]!
str ip, [fp, #-68]
bl __aeabi_dsub
ldr ip, [fp, #-68]
ldr lr, [fp, #-76]
strd r0, [ip]
ldr r1, [fp, #-40]
ldrd r2, [r1]
ldrd r0, [lr, #8]!
str lr, [fp, #-76]
bl __aeabi_dsub
ldr r3, [fp, #-76]
ldr lr, [fp, #-80]
strd r0, [r3]
ldr r0, [fp, #-48]
ldrd r2, [r0]
ldrd r0, [lr, #8]!
str lr, [fp, #-80]
bl __aeabi_dsub
ldr r2, [fp, #-80]
ldr lr, [fp, #-56]
strd r0, [r2]
ldrd r2, [lr]
ldr lr, [fp, #-84]
ldrd r0, [lr, #8]!
str lr, [fp, #-84]
bl __aeabi_dsub
ldr lr, [fp, #-84]
ldr ip, [fp, #-88]
ldr r2, [fp, #-64]
strd r0, [lr]
ldrd r0, [ip, #8]!
ldrd r2, [r2]
str ip, [fp, #-88]
bl __aeabi_dsub
ldr r3, [fp, #-88]
ldr ip, [fp, #-92]
strd r0, [r3]
ldr r3, [fp, #-72]
ldrd r0, [ip, #8]!
ldrd r2, [r3]
str ip, [fp, #-92]
bl __aeabi_dsub
ldr ip, [fp, #-92]
strd r0, [ip]
ldr r1, [fp, #-40]
ldrd r2, [r1]
ldrd r0, [r9, #8]!
bl __aeabi_dsub
strd r0, [r9]
ldr r0, [fp, #-48]
ldrd r2, [r0]
ldrd r0, [r7, #8]!
bl __aeabi_dsub
ldr lr, [fp, #-56]
strd r0, [r7]
ldrd r0, [r6, #8]!
ldrd r2, [lr]
bl __aeabi_dsub
ldr r2, [fp, #-64]
strd r0, [r6]
ldrd r0, [r8, #8]!
ldrd r2, [r2]
bl __aeabi_dsub
ldr r3, [fp, #-72]
strd r0, [r8]
ldrd r0, [r5, #8]!
ldrd r2, [r3]
bl __aeabi_dsub
strd r0, [r5]
ldr r1, [fp, #-40]
ldr lr, [fp, #-96]
ldrd r2, [r1]
ldrd r0, [lr, #8]!
str lr, [fp, #-96]
bl __aeabi_dsub
ldr r2, [fp, #-96]
ldr ip, [fp, #-100]
strd r0, [r2]
ldr r0, [fp, #-48]
ldrd r2, [r0]
ldrd r0, [ip, #8]!
str ip, [fp, #-100]
bl __aeabi_dsub
ldr ip, [fp, #-100]
ldr lr, [fp, #-56]
strd r0, [ip]
ldrd r2, [lr]
ldr lr, [fp, #-104]
ldrd r0, [lr, #8]!
str lr, [fp, #-104]
bl __aeabi_dsub
ldr lr, [fp, #-104]
ldr r2, [fp, #-64]
strd r0, [lr]
ldr lr, [fp, #-108]
ldrd r2, [r2]
ldrd r0, [lr, #8]!
str lr, [fp, #-108]
bl __aeabi_dsub
ldr r2, [fp, #-108]
ldr lr, [fp, #-112]
strd r0, [r2]
ldr r0, [fp, #-72]
ldrd r2, [r0]
ldrd r0, [lr, #8]!
str lr, [fp, #-112]
bl __aeabi_dsub
ldr r3, [fp, #-112]
strd r0, [r3]
ldrd r0, [r10]
ldrd r2, [r9]
bl __aeabi_dsub
ldr ip, [fp, #-44]
strd r0, [r10]
ldrd r2, [r7]
ldrd r0, [ip]
bl __aeabi_dsub
ldr r3, [fp, #-52]
ldr ip, [fp, #-44]
strd r0, [ip]
ldrd r0, [r3]
ldrd r2, [r6]
bl __aeabi_dsub
ldr r3, [fp, #-52]
ldr r2, [fp, #-60]
strd r0, [r3]
ldrd r0, [r2]
ldrd r2, [r8]
bl __aeabi_dsub
ldr r2, [fp, #-60]
ldr ip, [fp, #-68]
strd r0, [r2]
ldrd r0, [ip]
ldrd r2, [r5]
bl __aeabi_dsub
ldr r3, [fp, #-76]
ldr ip, [fp, #-68]
strd r0, [ip]
ldrd r0, [r3]
ldrd r2, [r9]
bl __aeabi_dsub
ldr r3, [fp, #-76]
ldr r2, [fp, #-80]
strd r0, [r3]
ldrd r0, [r2]
ldrd r2, [r7]
bl __aeabi_dsub
ldr r2, [fp, #-80]
ldr lr, [fp, #-84]
strd r0, [r2]
ldrd r0, [lr]
ldrd r2, [r6]
bl __aeabi_dsub
ldr r3, [fp, #-88]
ldr lr, [fp, #-84]
strd r0, [lr]
ldrd r0, [r3]
ldrd r2, [r8]
bl __aeabi_dsub
ldr r3, [fp, #-88]
ldr ip, [fp, #-92]
strd r0, [r3]
ldrd r0, [ip]
ldrd r2, [r5]
bl __aeabi_dsub
ldr ip, [fp, #-92]
strd r0, [ip]
ldr r1, [fp, #-96]
ldrd r2, [r9]
ldrd r0, [r1]
bl __aeabi_dsub
ldr r2, [fp, #-96]
ldr ip, [fp, #-100]
strd r0, [r2]
ldrd r0, [ip]
ldrd r2, [r7]
bl __aeabi_dsub
ldr ip, [fp, #-100]
strd r0, [ip]
ldr r1, [fp, #-104]
ldrd r2, [r6]
ldrd r0, [r1]
bl __aeabi_dsub
ldr r2, [fp, #-108]
ldr ip, [fp, #-104]
strd r0, [ip]
ldrd r0, [r2]
ldrd r2, [r8]
bl __aeabi_dsub
ldr r2, [fp, #-108]
ldr r3, [fp, #-112]
strd r0, [r2]
ldrd r0, [r3]
ldrd r2, [r5]
bl __aeabi_dsub
ldr lr, [fp, #-112]
ldrd r2, [r4, #8]
strd r0, [lr]
ldrd r0, [r4]
strd r2, [fp, #-124]
bl __aeabi_dsub
ldrd r2, [r4, #16]
strd r0, [r4]
ldrd r0, [fp, #-124]
bl __aeabi_dsub
ldr r3, [fp, #-116]
strd r0, [r4, #8]
add r4, r4, #800
cmp r10, r3
bne .L3
ldr r3, [fp, #-128]
ldr r2, [fp, #-136]
ldr r8, [fp, #-132]
add r3, r3, #1
cmp r2, r3
str r3, [fp, #-128]
bne .L4
.L1:
sub sp, fp, #32
@ sp needed
pop {r4, r5, r6, r7, r8, r9, r10, fp, pc}
.size x_solve, .-x_solve
.comm rhs,16000,8
.global gp22
.data
.align 2
.set .LANCHOR0,. + 0
.type gp22, %object
.size gp22, 4
gp22:
.word 20
.ident "GCC: (GNU) 7.0.0 20161102 (experimental)"
.section .note.GNU-stack,"",%progbits
.cpu cortex-a9
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 2
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.file "bt.c"
.global __aeabi_dsub
.text
.align 2
.global x_solve
.syntax unified
.arm
.fpu softvfp
.type x_solve, %function
x_solve:
@ args = 0, pretend = 0, frame = 80
@ frame_needed = 1, uses_anonymous_args = 0
movw r3, #:lower16:.LANCHOR0
push {r4, r5, r6, r7, r8, r9, r10, fp, lr}
movt r3, #:upper16:.LANCHOR0
add fp, sp, #32
sub sp, sp, #84
ldr r3, [r3]
mov r2, #200
cmp r3, #1
mul r1, r3, r3
lsl lr, r3, #3
str lr, [fp, #-92]
mul r2, r2, r1
add r0, r1, r1, lsl #2
lsl ip, r1, #3
lsl r0, r0, #3
str ip, [fp, #-96]
add r2, r2, #8
str r0, [fp, #-100]
sub sp, sp, r2
ble .L1
ldr r4, .L9
add r3, r3, r3, lsl #2
lsr r2, r0, #3
lsr r0, ip, #3
add ip, r2, r0
lsl r8, r0, #4
add r3, r4, r3, lsl #3
lsl ip, ip, #3
lsl r0, r0, #5
lsl r10, r2, #4
lsl r9, r2, #5
str r4, [fp, #-88]
sub r3, r3, #40
str ip, [fp, #-108]
add r2, sp, lr
str r3, [fp, #-104]
sub r3, lr, #8
str r0, [fp, #-68]
str r2, [fp, #-84]
str r3, [fp, #-112]
.L4:
ldr r5, [fp, #-84]
ldr r3, [fp, #-100]
add r6, r3, r5
ldr r3, [fp, #-96]
add r7, r3, r5
ldr r3, [fp, #-108]
mov r2, r7
mov r7, r6
mov r6, r2
add r4, r5, r3
ldr r3, [fp, #-112]
str r4, [fp, #-40]
add r3, r3, r5
str r3, [fp, #-80]
ldr r3, [fp, #-88]
str r3, [fp, #-76]
mov r3, r9
mov r9, r5
str r3, [fp, #-44]
.L3:
ldrd r0, [r9, #8]!
ldrd r2, [r7, #8]!
bl __aeabi_dsub
ldr r5, [fp, #-40]
strd r0, [r9]
add r1, r9, r8
add r0, r8, r1
add ip, r7, r8
ldrd r2, [r5, #8]!
add r4, r8, ip
str r1, [fp, #-52]
str r0, [fp, #-48]
ldrd r0, [r6, #8]!
str ip, [fp, #-60]
str r4, [fp, #-72]
bl __aeabi_dsub
add r3, r5, r8
strd r0, [r6]
str r3, [fp, #-64]
add r3, r6, r8
mov r4, r3
ldrd r0, [r9, r8]
ldrd r2, [r7, r8]
bl __aeabi_dsub
strd r0, [r9, r8]
ldrd r0, [r6, r8]
str r5, [fp, #-40]
ldrd r2, [r5, r8]
bl __aeabi_dsub
ldr r5, [fp, #-68]
strd r0, [r6, r8]
ldrd r2, [r7, r5]
ldrd r0, [r9, r5]
bl __aeabi_dsub
strd r0, [r9, r5]
ldrd r0, [r9, r10]
ldrd r2, [r7]
bl __aeabi_dsub
ldr r5, [fp, #-40]
strd r0, [r9, r10]
ldrd r0, [r6, r10]
ldrd r2, [r5]
bl __aeabi_dsub
ldr r5, [fp, #-52]
strd r0, [r6, r10]
ldrd r2, [r7, r8]
ldrd r0, [r5, r10]
bl __aeabi_dsub
strd r0, [r5, r10]
ldr r5, [fp, #-40]
ldrd r0, [r4, r10]
ldrd r2, [r5, r8]
bl __aeabi_dsub
ldr r5, [fp, #-68]
strd r0, [r4, r10]
str r4, [fp, #-56]
ldrd r2, [r7, r5]
ldr r4, [fp, #-48]
ldrd r0, [r4, r10]
bl __aeabi_dsub
strd r0, [r4, r10]
ldrd r2, [r7]
ldrd r0, [r7, r10]
bl __aeabi_dsub
ldr r4, [fp, #-40]
strd r0, [r7, r10]
ldrd r2, [r4]
ldrd r0, [r4, r10]
bl __aeabi_dsub
ldr r5, [fp, #-60]
strd r0, [r4, r10]
ldrd r2, [r7, r8]
ldrd r0, [r5, r10]
bl __aeabi_dsub
strd r0, [r5, r10]
ldr r5, [fp, #-64]
ldrd r2, [r4, r8]
ldrd r0, [r5, r10]
bl __aeabi_dsub
strd r0, [r5, r10]
ldr r5, [fp, #-68]
ldr r4, [fp, #-72]
ldrd r2, [r7, r5]
ldrd r0, [r4, r10]
bl __aeabi_dsub
strd r0, [r4, r10]
ldr r4, [fp, #-44]
ldrd r2, [r7]
ldrd r0, [r9, r4]
bl __aeabi_dsub
strd r0, [r9, r4]
ldrd r0, [r6, r4]
ldr r3, [fp, #-40]
ldrd r2, [r3]
bl __aeabi_dsub
ldr r5, [fp, #-52]
strd r0, [r6, r4]
ldrd r0, [r5, r4]
ldrd r2, [r7, r8]
bl __aeabi_dsub
mov r2, r5
ldr r3, [fp, #-40]
mov r5, r4
strd r0, [r2, r4]
ldr r4, [fp, #-56]
ldrd r2, [r3, r8]
ldrd r0, [r4, r5]
bl __aeabi_dsub
strd r0, [r4, r5]
ldr r4, [fp, #-48]
ldrd r0, [r4, r5]
ldr r5, [fp, #-68]
ldrd r2, [r7, r5]
bl __aeabi_dsub
ldr ip, [fp, #-44]
strd r0, [r4, ip]
ldrd r2, [r7, r10]
ldrd r0, [r9]
bl __aeabi_dsub
ldr r4, [fp, #-40]
strd r0, [r9]
ldrd r0, [r6]
ldrd r2, [r4, r10]
bl __aeabi_dsub
ldr r4, [fp, #-60]
strd r0, [r6]
ldrd r0, [r9, r8]
ldrd r2, [r4, r10]
bl __aeabi_dsub
ldr r3, [fp, #-64]
strd r0, [r9, r8]
ldrd r0, [r6, r8]
ldrd r2, [r3, r10]
bl __aeabi_dsub
ldr r3, [fp, #-72]
strd r0, [r6, r8]
ldrd r0, [r9, r5]
ldrd r2, [r3, r10]
bl __aeabi_dsub
strd r0, [r9, r5]
ldrd r0, [r9, r10]
ldrd r2, [r7, r10]
bl __aeabi_dsub
ldr r3, [fp, #-40]
strd r0, [r9, r10]
ldrd r0, [r6, r10]
ldrd r2, [r3, r10]
bl __aeabi_dsub
ldr r5, [fp, #-52]
strd r0, [r6, r10]
ldrd r2, [r4, r10]
ldrd r0, [r5, r10]
bl __aeabi_dsub
ldr r3, [fp, #-64]
ldr r4, [fp, #-56]
strd r0, [r5, r10]
ldrd r2, [r3, r10]
ldrd r0, [r4, r10]
bl __aeabi_dsub
ldr r5, [fp, #-72]
strd r0, [r4, r10]
ldr r4, [fp, #-48]
ldrd r2, [r5, r10]
ldrd r0, [r4, r10]
bl __aeabi_dsub
strd r0, [r4, r10]
ldr r4, [fp, #-44]
ldrd r2, [r7, r10]
ldrd r0, [r9, r4]
bl __aeabi_dsub
mov ip, r4
strd r0, [r9, r4]
ldr r4, [fp, #-40]
ldrd r0, [r6, ip]
ldrd r2, [r4, r10]
mov r4, ip
bl __aeabi_dsub
ldr ip, [fp, #-60]
mov r3, r4
strd r0, [r6, r4]
ldr r4, [fp, #-52]
ldrd r0, [r4, r3]
ldrd r2, [ip, r10]
bl __aeabi_dsub
ldr r3, [fp, #-44]
strd r0, [r4, r3]
ldr r4, [fp, #-56]
ldrd r0, [r4, r3]
ldr r3, [fp, #-64]
ldrd r2, [r3, r10]
bl __aeabi_dsub
ldr r3, [fp, #-44]
ldr lr, [fp, #-48]
strd r0, [r4, r3]
ldrd r0, [lr, r3]
ldrd r2, [r5, r10]
bl __aeabi_dsub
ldr r3, [fp, #-76]
ldr lr, [fp, #-48]
ldr r2, [fp, #-44]
ldrd r4, [r3, #8]
strd r0, [lr, r2]
ldrd r0, [r3]
mov r2, r4
mov r3, r5
bl __aeabi_dsub
ldr r3, [fp, #-76]
strd r0, [r3]
mov r0, r4
mov r1, r5
mov r4, r3
ldrd r2, [r3, #16]
bl __aeabi_dsub
ldr r2, [fp, #-80]
add r3, r4, #800
strd r0, [r4, #8]
str r3, [fp, #-76]
cmp r9, r2
bne .L3
ldr r2, [fp, #-84]
ldr r1, [fp, #-92]
ldr r3, [fp, #-88]
ldr r9, [fp, #-44]
add r2, r2, r1
str r2, [fp, #-84]
add r3, r3, #40
ldr r2, [fp, #-104]
str r3, [fp, #-88]
cmp r3, r2
bne .L4
.L1:
sub sp, fp, #32
@ sp needed
pop {r4, r5, r6, r7, r8, r9, r10, fp, pc}
.L10:
.align 2
.L9:
.word rhs+840
.size x_solve, .-x_solve
.comm rhs,16000,8
.global gp22
.data
.align 2
.set .LANCHOR0,. + 0
.type gp22, %object
.size gp22, 4
gp22:
.word 20
.ident "GCC: (GNU) 7.0.0 20161102 (experimental)"
.section .note.GNU-stack,"",%progbits
* tree-ssa-loop-ivopts.c (determine_group_iv_cost_address): Rename
'sum_cost' to 'max_cost'. Use maximum of per-use costs instead of sum.
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index f9211ad..7d40ae8 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -5122,41 +5122,42 @@ determine_group_iv_cost_address (struct ivopts_data *data,
bool can_autoinc;
iv_inv_expr_ent *inv_expr = NULL;
struct iv_use *use = group->vuses[0];
- comp_cost sum_cost = no_cost, cost;
+ comp_cost max_cost = no_cost, cost;
cost = get_computation_cost (data, use, cand, true,
&depends_on, &can_autoinc, &inv_expr);
- sum_cost = cost;
- if (!sum_cost.infinite_cost_p () && cand->ainc_use == use)
+ max_cost = cost;
+ if (!max_cost.infinite_cost_p () && cand->ainc_use == use)
{
if (can_autoinc)
- sum_cost -= cand->cost_step;
+ max_cost -= cand->cost_step;
/* If we generated the candidate solely for exploiting autoincrement
opportunities, and it turns out it can't be used, set the cost to
infinity to make sure we ignore it. */
else if (cand->pos == IP_AFTER_USE || cand->pos == IP_BEFORE_USE)
- sum_cost = infinite_cost;
+ max_cost = infinite_cost;
}
/* Uses in a group can share setup code, so only add setup cost once. */
cost -= cost.scratch;
/* Compute and add costs for rest uses of this group. */
- for (i = 1; i < group->vuses.length () && !sum_cost.infinite_cost_p (); i++)
+ for (i = 1; i < group->vuses.length () && !max_cost.infinite_cost_p (); i++)
{
struct iv_use *next = group->vuses[i];
/* TODO: We could skip computing cost for sub iv_use when it has the
same cost as the first iv_use, but the cost really depends on the
offset and where the iv_use is. */
- cost = get_computation_cost (data, next, cand, true,
- NULL, &can_autoinc, NULL);
- sum_cost += cost;
+ cost = get_computation_cost (data, next, cand, true,
+ NULL, &can_autoinc, NULL);
+ if (max_cost < cost)
+ max_cost = cost;
}
- set_group_iv_cost (data, group, cand, sum_cost, depends_on,
+ set_group_iv_cost (data, group, cand, max_cost, depends_on,
NULL_TREE, ERROR_MARK, inv_expr);
- return !sum_cost.infinite_cost_p ();
+ return !max_cost.infinite_cost_p ();
}
/* Computes value of candidate CAND at position AT in iteration NITER, and