Hello,

I'm facing a problem related to ivopts: GCC generates a lot of induction variables, and as a result stack usage and register pressure increase unnecessarily.

For instance, for the attached testcase (tc_ivopts.c) GCC generates 26 induction variables (25 for lhsX, one for each of lhsX[{0-4}][{0-4}][j][k], and one for rhs[k][j][{0-2}]). You should be able to reproduce this issue on an ARM target using GCC with "-O2 -mcpu=cortex-a9 -mtune=cortex-a9". For reference, I'm attaching the assembly I get on current trunk.

The reason might lie in the way the costs of use groups are estimated. Currently, the cost of a tuple (group, candidate) is the sum of the per-use costs in the group. So, the cost of a group grows proportionally to the number of uses in the group. This approach has a negative effect on the algorithm for finding the best set of induction variables: the part of the total cost related to adding a new candidate is almost washed out by the cost of the group. In addition, when there are a lot of groups with many uses inside and the target is out of free registers, the cost of spilling is washed out too. As a result, GCC prefers to use native candidates (candidates created for a particular group) for each group of uses instead of considering the real cost of introducing a new variable into the set.

The summing approach was added as a part of this patch (https://gcc.gnu.org/ml/gcc-patches/2015-05/msg00641.html) and the motivation for taking the sum does not seem to be
specifically discussed.

I propose the following patch, which changes the group cost from the sum of the per-use costs to the largest per-use cost inside the group. For the given test case, the required stack size decreases by almost 3 times (frame = 216 before vs. frame = 80 after in the attached assembly) and the number of ldr/str instructions decreases by less than 2 times. I'm also attaching the assembly generated after applying the patch.

The essential change in the patch is just:

diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index f9211ad..a149418 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -5151,7 +5151,8 @@ determine_group_iv_cost_address (struct ivopts_data *data,
         offset and where the iv_use is.  */
        cost = get_computation_cost (data, next, cand, true,
                                     NULL, &can_autoinc, NULL);
-      sum_cost += cost;
+      if (sum_cost < cost)
+        sum_cost = cost;
     }
   set_group_iv_cost (data, group, cand, sum_cost, depends_on,
                     NULL_TREE, ERROR_MARK, inv_expr);

Any suggestions?


Thanks,
Evgeny.
/* Testcase for GCC ivopts: a doubly nested loop whose body touches many
   distinct array references that differ only in their leading indices.  */
#define SIZE 20
/* Loop bound and array extent.  It is a plain (non-const) global, so the
   extents of lhsX below are not compile-time constants.  */
int gp22 = SIZE;
double rhs [SIZE][SIZE][5];

/* For every (j, k) with 1 <= j, k < gp22, performs 35 in-place subtractions
   on lhsX (rows 0, 2, 3, 4 minus row 1, then rows 0, 2, 4 minus row 3, for
   each of the five columns) followed by two subtractions on rhs.
   Takes no arguments and returns nothing; the only object written outside
   the function is the global rhs.  */
void x_solve()
{
  int j, k;
  /* Variable-length array: the last two extents come from gp22.
     NOTE(review): lhsX is read below without ever being initialized, so the
     values involved are indeterminate; the testcase appears intended for
     inspecting generated code rather than for execution -- confirm.  */
  double lhsX[5][5][gp22][gp22];
  for (j = 1; j < gp22; j++) {
    for (k = 1; k < gp22; k++) {
      /* Subtract row 1 from rows 0, 2, 3 and 4.  */
      lhsX[0][0][j][k] -= lhsX[1][0][j][k];
      lhsX[0][1][j][k] -= lhsX[1][1][j][k];
      lhsX[0][2][j][k] -= lhsX[1][2][j][k];
      lhsX[0][3][j][k] -= lhsX[1][3][j][k];
      lhsX[0][4][j][k] -= lhsX[1][4][j][k];

      lhsX[2][0][j][k] -= lhsX[1][0][j][k];
      lhsX[2][1][j][k] -= lhsX[1][1][j][k];
      lhsX[2][2][j][k] -= lhsX[1][2][j][k];
      lhsX[2][3][j][k] -= lhsX[1][3][j][k];
      lhsX[2][4][j][k] -= lhsX[1][4][j][k];

      lhsX[3][0][j][k] -= lhsX[1][0][j][k];
      lhsX[3][1][j][k] -= lhsX[1][1][j][k];
      lhsX[3][2][j][k] -= lhsX[1][2][j][k];
      lhsX[3][3][j][k] -= lhsX[1][3][j][k];
      lhsX[3][4][j][k] -= lhsX[1][4][j][k];

      lhsX[4][0][j][k] -= lhsX[1][0][j][k];
      lhsX[4][1][j][k] -= lhsX[1][1][j][k];
      lhsX[4][2][j][k] -= lhsX[1][2][j][k];
      lhsX[4][3][j][k] -= lhsX[1][3][j][k];
      lhsX[4][4][j][k] -= lhsX[1][4][j][k];
        
      /* Subtract the (already updated) row 3 from rows 0, 2 and 4.  */
      lhsX[0][0][j][k] -= lhsX[3][0][j][k];
      lhsX[0][1][j][k] -= lhsX[3][1][j][k];
      lhsX[0][2][j][k] -= lhsX[3][2][j][k];
      lhsX[0][3][j][k] -= lhsX[3][3][j][k];
      lhsX[0][4][j][k] -= lhsX[3][4][j][k];

      lhsX[2][0][j][k] -= lhsX[3][0][j][k];
      lhsX[2][1][j][k] -= lhsX[3][1][j][k];
      lhsX[2][2][j][k] -= lhsX[3][2][j][k];
      lhsX[2][3][j][k] -= lhsX[3][3][j][k];
      lhsX[2][4][j][k] -= lhsX[3][4][j][k];

      lhsX[4][0][j][k] -= lhsX[3][0][j][k];
      lhsX[4][1][j][k] -= lhsX[3][1][j][k];
      lhsX[4][2][j][k] -= lhsX[3][2][j][k];
      lhsX[4][3][j][k] -= lhsX[3][3][j][k];
      lhsX[4][4][j][k] -= lhsX[3][4][j][k];

      /* Note the index order: the inner-loop variable k selects the
         outermost dimension of rhs.  */
      rhs[k][j][0] -= rhs[k][j][1];
      rhs[k][j][1] -= rhs[k][j][2];
    }/*end k*/
  }/*end j*/
}
	.cpu cortex-a9
	.eabi_attribute 20, 1
	.eabi_attribute 21, 1
	.eabi_attribute 23, 3
	.eabi_attribute 24, 1
	.eabi_attribute 25, 1
	.eabi_attribute 26, 2
	.eabi_attribute 30, 2
	.eabi_attribute 34, 1
	.eabi_attribute 18, 4
	.file	"bt.c"
	.global	__aeabi_dsub
	.text
	.align	2
	.global	x_solve
	.syntax unified
	.arm
	.fpu softvfp
	.type	x_solve, %function
x_solve:
	@ args = 0, pretend = 0, frame = 216
	@ frame_needed = 1, uses_anonymous_args = 0
	movw	r3, #:lower16:.LANCHOR0
	push	{r4, r5, r6, r7, r8, r9, r10, fp, lr}
	movt	r3, #:upper16:.LANCHOR0
	add	fp, sp, #32
	sub	sp, sp, #220
	ldr	r2, [r3]
	mov	r1, #200
	cmp	r2, #1
	str	r2, [fp, #-136]
	mul	r3, r2, r2
	lsl	r0, r2, #3
	mul	r1, r1, r3
	add	r2, r3, r3, lsl #2
	add	r1, r1, #8
	sub	sp, sp, r1
	ble	.L1
	lsl	r2, r2, #3
	lsl	r3, r3, #3
	lsr	r1, r0, #3
	str	r0, [fp, #-140]
	lsr	r2, r2, #3
	lsr	r3, r3, #3
	str	sp, [fp, #-248]
	lsl	ip, r2, #1
	add	r0, r2, r3
	lsl	r5, r0, #1
	add	lr, r2, r1
	add	r6, ip, r2
	lsl	r10, r3, #1
	str	r6, [fp, #-72]
	lsl	r8, r6, #3
	add	r6, r3, r0
	str	ip, [fp, #-60]
	add	r6, r1, r6, lsl #1
	str	r8, [fp, #-40]
	add	r8, lr, r3, lsl #2
	add	ip, r3, r1
	str	r6, [fp, #-68]
	add	r6, r5, r0
	add	r6, r6, r1
	add	r5, r5, r1
	lsl	r6, r6, #3
	add	r4, r10, r3
	str	r8, [fp, #-56]
	add	r8, ip, r2, lsl #2
	str	r6, [fp, #-212]
	add	r6, r10, lr
	lsl	r10, r5, #3
	lsl	r9, r4, #3
	str	r8, [fp, #-80]
	add	r8, r1, r0, lsl #2
	add	r4, r4, lr
	str	r10, [fp, #-188]
	add	r7, r2, lr
	lsl	r10, r4, #3
	str	r7, [fp, #-52]
	add	r7, r3, ip
	str	r7, [fp, #-44]
	add	r7, r2, r0
	str	r8, [fp, #-92]
	add	r8, r9, r2, lsl #4
	str	r10, [fp, #-168]
	lsl	r10, r6, #3
	ldr	r6, [fp, #-52]
	add	r7, r1, r7, lsl #1
	str	r8, [fp, #-64]
	add	r8, r3, ip
	ldr	r4, [fp, #-72]
	add	r8, r3, r8
	str	r7, [fp, #-84]
	add	r7, r2, lr
	add	r7, r2, r7
	str	r8, [fp, #-48]
	str	r10, [fp, #-160]
	lsl	r10, r6, #3
	ldr	r8, [fp, #-40]
	add	r9, r9, r2, lsl #5
	ldr	r5, [fp, #-60]
	add	r2, r2, r7
	add	r4, r4, ip
	str	r10, [fp, #-180]
	lsl	r4, r4, #3
	lsl	r10, r2, #3
	add	r8, r8, r3, lsl #4
	ldr	r6, [fp, #-48]
	add	r5, r5, ip
	str	r10, [fp, #-220]
	str	r4, [fp, #-204]
	lsl	r10, r7, #3
	lsl	r4, r5, #3
	str	r9, [fp, #-88]
	str	r8, [fp, #-76]
	lsl	lr, lr, #3
	ldr	r9, [fp, #-136]
	lsl	ip, ip, #3
	ldr	r8, [fp, #-40]
	str	r4, [fp, #-184]
	str	r10, [fp, #-200]
	add	r9, r9, r1
	ldr	r7, [fp, #-44]
	add	r1, r1, r0
	add	r8, r8, r3, lsl #5
	add	r3, r3, r6
	str	lr, [fp, #-144]
	lsl	r9, r9, #3
	ldr	r0, [fp, #-140]
	lsl	r4, r7, #3
	ldr	r7, [fp, #-84]
	ldr	r2, [fp, #-64]
	str	r4, [fp, #-156]
	lsl	r4, r3, #3
	ldr	r3, [fp, #-56]
	str	r4, [fp, #-172]
	lsl	r4, r6, #3
	ldr	r6, [fp, #-68]
	lsl	lr, r7, #3
	str	r4, [fp, #-164]
	str	lr, [fp, #-228]
	str	ip, [fp, #-148]
	lsl	r4, r6, #3
	str	r4, [fp, #-196]
	lsl	r4, r3, #3
	ldr	r3, [fp, #-80]
	str	r4, [fp, #-176]
	lsl	lr, r3, #3
	ldr	r3, [fp, #-92]
	str	lr, [fp, #-224]
	lsl	r3, r3, #3
	str	r3, [fp, #-236]
	lsl	r3, r1, #3
	add	r1, r2, r0
	ldr	r2, [fp, #-88]
	str	r1, [fp, #-192]
	str	r3, [fp, #-152]
	movw	r3, #:lower16:rhs
	add	r1, r2, r0
	ldr	r2, [fp, #-76]
	movt	r3, #:upper16:rhs
	str	r1, [fp, #-232]
	str	r3, [fp, #-244]
	mov	r3, #1
	add	r2, r2, r0
	str	r3, [fp, #-128]
	str	r2, [fp, #-208]
	add	r2, r8, r0
	mov	r8, #0
	str	r2, [fp, #-216]
	sub	r2, r9, #8
	str	r2, [fp, #-240]
.L4:
	ldr	r3, [fp, #-128]
	ldr	r0, [fp, #-156]
	ldr	r5, [fp, #-168]
	ldr	r2, [fp, #-244]
	add	r0, r8, r0
	ldr	r1, [fp, #-152]
	add	r4, r3, r3, lsl #2
	str	r0, [fp, #-52]
	add	r0, r8, r5
	ldr	r5, [fp, #-172]
	ldr	r3, [fp, #-140]
	add	r1, r8, r1
	add	r4, r2, r4, lsl #3
	ldr	r2, [fp, #-148]
	str	r1, [fp, #-48]
	add	r1, r8, r5
	ldr	r5, [fp, #-176]
	add	r10, r3, r8
	add	r2, r8, r2
	ldr	r3, [fp, #-144]
	str	r2, [fp, #-44]
	add	r4, r4, #800
	add	r2, r8, r5
	ldr	r5, [fp, #-180]
	add	r3, r8, r3
	ldr	r9, [fp, #-196]
	str	r3, [fp, #-40]
	add	r3, r8, r5
	ldr	r5, [fp, #-184]
	add	r9, r8, r9
	ldr	r7, [fp, #-192]
	str	r9, [fp, #-92]
	add	r5, r8, r5
	ldr	r6, [fp, #-188]
	str	r5, [fp, #-80]
	add	r7, r8, r7
	ldr	r5, [fp, #-200]
	str	r7, [fp, #-88]
	add	r6, r8, r6
	str	r6, [fp, #-84]
	add	r9, r8, r5
	ldr	r5, [fp, #-204]
	ldr	ip, [fp, #-160]
	str	r10, [fp, #-132]
	add	r7, r8, r5
	ldr	r5, [fp, #-208]
	add	lr, r8, ip
	str	r3, [fp, #-76]
	ldr	ip, [fp, #-164]
	add	r6, r8, r5
	ldr	r5, [fp, #-212]
	add	ip, r8, ip
	add	r5, r8, r5
	str	r5, [fp, #-96]
	ldr	r5, [fp, #-216]
	ldr	r3, [fp, #-220]
	add	r5, r8, r5
	add	r3, r8, r3
	str	r3, [fp, #-100]
	ldr	r3, [fp, #-224]
	add	r3, r8, r3
	str	r3, [fp, #-104]
	ldr	r3, [fp, #-228]
	add	r3, r8, r3
	str	r3, [fp, #-108]
	ldr	r3, [fp, #-232]
	add	r3, r8, r3
	str	r3, [fp, #-112]
	ldr	r3, [fp, #-236]
	add	r3, r8, r3
	str	r3, [fp, #-124]
	ldr	r3, [fp, #-240]
	add	r8, r3, r8
	ldr	r3, [fp, #-40]
	str	r8, [fp, #-116]
	ldr	r8, [fp, #-248]
	add	r3, r8, r3
	add	r1, r8, r1
	str	r3, [fp, #-40]
	add	lr, r8, lr
	ldr	r3, [fp, #-44]
	add	ip, r8, ip
	str	r1, [fp, #-68]
	add	r1, r8, r2
	ldr	r2, [fp, #-76]
	add	r0, r8, r0
	add	r3, r8, r3
	str	r1, [fp, #-72]
	str	r3, [fp, #-44]
	add	r10, r8, r10
	ldr	r3, [fp, #-48]
	add	r1, r8, r2
	ldr	r2, [fp, #-80]
	add	r9, r8, r9
	str	r1, [fp, #-76]
	add	r7, r8, r7
	add	r3, r8, r3
	str	lr, [fp, #-56]
	add	r1, r8, r2
	str	r3, [fp, #-48]
	ldr	r2, [fp, #-84]
	add	r6, r8, r6
	ldr	r3, [fp, #-52]
	str	r1, [fp, #-80]
	add	r1, r8, r2
	str	ip, [fp, #-60]
	add	r3, r8, r3
	str	r0, [fp, #-64]
	str	r3, [fp, #-52]
	mov	r3, r8
	str	r1, [fp, #-84]
	add	r5, r3, r5
	ldr	r2, [fp, #-88]
	add	r1, r8, r2
	ldr	r2, [fp, #-92]
	str	r1, [fp, #-88]
	add	r1, r8, r2
	ldr	r2, [fp, #-96]
	str	r1, [fp, #-92]
	add	r8, r8, r2
	ldr	r2, [fp, #-100]
	add	r1, r3, r2
	ldr	r2, [fp, #-104]
	str	r1, [fp, #-96]
	add	r1, r3, r2
	ldr	r2, [fp, #-108]
	str	r1, [fp, #-100]
	add	r1, r3, r2
	ldr	r2, [fp, #-112]
	str	r1, [fp, #-104]
	add	r1, r3, r2
	ldr	r2, [fp, #-124]
	str	r1, [fp, #-108]
	add	r1, r3, r2
	ldr	r2, [fp, #-116]
	str	r1, [fp, #-112]
	add	r3, r3, r2
	str	r3, [fp, #-116]
.L3:
	ldr	r1, [fp, #-40]
	ldrd	r2, [r1, #8]!
	str	r1, [fp, #-40]
	ldrd	r0, [r10, #8]!
	bl	__aeabi_dsub
	ldr	ip, [fp, #-44]
	strd	r0, [r10]
	ldr	r0, [fp, #-48]
	ldrd	r2, [r0, #8]!
	str	r0, [fp, #-48]
	ldrd	r0, [ip, #8]!
	str	ip, [fp, #-44]
	bl	__aeabi_dsub
	ldr	ip, [fp, #-44]
	ldr	lr, [fp, #-56]
	strd	r0, [ip]
	ldrd	r2, [lr, #8]!
	str	lr, [fp, #-56]
	ldr	lr, [fp, #-52]
	ldrd	r0, [lr, #8]!
	str	lr, [fp, #-52]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-52]
	ldr	lr, [fp, #-60]
	strd	r0, [r3]
	ldr	r1, [fp, #-64]
	ldrd	r2, [r1, #8]!
	str	r1, [fp, #-64]
	ldrd	r0, [lr, #8]!
	str	lr, [fp, #-60]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-60]
	ldr	ip, [fp, #-68]
	strd	r0, [r2]
	ldr	r0, [fp, #-72]
	ldrd	r2, [r0, #8]!
	str	r0, [fp, #-72]
	ldrd	r0, [ip, #8]!
	str	ip, [fp, #-68]
	bl	__aeabi_dsub
	ldr	ip, [fp, #-68]
	ldr	lr, [fp, #-76]
	strd	r0, [ip]
	ldr	r1, [fp, #-40]
	ldrd	r2, [r1]
	ldrd	r0, [lr, #8]!
	str	lr, [fp, #-76]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-76]
	ldr	lr, [fp, #-80]
	strd	r0, [r3]
	ldr	r0, [fp, #-48]
	ldrd	r2, [r0]
	ldrd	r0, [lr, #8]!
	str	lr, [fp, #-80]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-80]
	ldr	lr, [fp, #-56]
	strd	r0, [r2]
	ldrd	r2, [lr]
	ldr	lr, [fp, #-84]
	ldrd	r0, [lr, #8]!
	str	lr, [fp, #-84]
	bl	__aeabi_dsub
	ldr	lr, [fp, #-84]
	ldr	ip, [fp, #-88]
	ldr	r2, [fp, #-64]
	strd	r0, [lr]
	ldrd	r0, [ip, #8]!
	ldrd	r2, [r2]
	str	ip, [fp, #-88]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-88]
	ldr	ip, [fp, #-92]
	strd	r0, [r3]
	ldr	r3, [fp, #-72]
	ldrd	r0, [ip, #8]!
	ldrd	r2, [r3]
	str	ip, [fp, #-92]
	bl	__aeabi_dsub
	ldr	ip, [fp, #-92]
	strd	r0, [ip]
	ldr	r1, [fp, #-40]
	ldrd	r2, [r1]
	ldrd	r0, [r9, #8]!
	bl	__aeabi_dsub
	strd	r0, [r9]
	ldr	r0, [fp, #-48]
	ldrd	r2, [r0]
	ldrd	r0, [r7, #8]!
	bl	__aeabi_dsub
	ldr	lr, [fp, #-56]
	strd	r0, [r7]
	ldrd	r0, [r6, #8]!
	ldrd	r2, [lr]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-64]
	strd	r0, [r6]
	ldrd	r0, [r8, #8]!
	ldrd	r2, [r2]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-72]
	strd	r0, [r8]
	ldrd	r0, [r5, #8]!
	ldrd	r2, [r3]
	bl	__aeabi_dsub
	strd	r0, [r5]
	ldr	r1, [fp, #-40]
	ldr	lr, [fp, #-96]
	ldrd	r2, [r1]
	ldrd	r0, [lr, #8]!
	str	lr, [fp, #-96]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-96]
	ldr	ip, [fp, #-100]
	strd	r0, [r2]
	ldr	r0, [fp, #-48]
	ldrd	r2, [r0]
	ldrd	r0, [ip, #8]!
	str	ip, [fp, #-100]
	bl	__aeabi_dsub
	ldr	ip, [fp, #-100]
	ldr	lr, [fp, #-56]
	strd	r0, [ip]
	ldrd	r2, [lr]
	ldr	lr, [fp, #-104]
	ldrd	r0, [lr, #8]!
	str	lr, [fp, #-104]
	bl	__aeabi_dsub
	ldr	lr, [fp, #-104]
	ldr	r2, [fp, #-64]
	strd	r0, [lr]
	ldr	lr, [fp, #-108]
	ldrd	r2, [r2]
	ldrd	r0, [lr, #8]!
	str	lr, [fp, #-108]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-108]
	ldr	lr, [fp, #-112]
	strd	r0, [r2]
	ldr	r0, [fp, #-72]
	ldrd	r2, [r0]
	ldrd	r0, [lr, #8]!
	str	lr, [fp, #-112]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-112]
	strd	r0, [r3]
	ldrd	r0, [r10]
	ldrd	r2, [r9]
	bl	__aeabi_dsub
	ldr	ip, [fp, #-44]
	strd	r0, [r10]
	ldrd	r2, [r7]
	ldrd	r0, [ip]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-52]
	ldr	ip, [fp, #-44]
	strd	r0, [ip]
	ldrd	r0, [r3]
	ldrd	r2, [r6]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-52]
	ldr	r2, [fp, #-60]
	strd	r0, [r3]
	ldrd	r0, [r2]
	ldrd	r2, [r8]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-60]
	ldr	ip, [fp, #-68]
	strd	r0, [r2]
	ldrd	r0, [ip]
	ldrd	r2, [r5]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-76]
	ldr	ip, [fp, #-68]
	strd	r0, [ip]
	ldrd	r0, [r3]
	ldrd	r2, [r9]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-76]
	ldr	r2, [fp, #-80]
	strd	r0, [r3]
	ldrd	r0, [r2]
	ldrd	r2, [r7]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-80]
	ldr	lr, [fp, #-84]
	strd	r0, [r2]
	ldrd	r0, [lr]
	ldrd	r2, [r6]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-88]
	ldr	lr, [fp, #-84]
	strd	r0, [lr]
	ldrd	r0, [r3]
	ldrd	r2, [r8]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-88]
	ldr	ip, [fp, #-92]
	strd	r0, [r3]
	ldrd	r0, [ip]
	ldrd	r2, [r5]
	bl	__aeabi_dsub
	ldr	ip, [fp, #-92]
	strd	r0, [ip]
	ldr	r1, [fp, #-96]
	ldrd	r2, [r9]
	ldrd	r0, [r1]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-96]
	ldr	ip, [fp, #-100]
	strd	r0, [r2]
	ldrd	r0, [ip]
	ldrd	r2, [r7]
	bl	__aeabi_dsub
	ldr	ip, [fp, #-100]
	strd	r0, [ip]
	ldr	r1, [fp, #-104]
	ldrd	r2, [r6]
	ldrd	r0, [r1]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-108]
	ldr	ip, [fp, #-104]
	strd	r0, [ip]
	ldrd	r0, [r2]
	ldrd	r2, [r8]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-108]
	ldr	r3, [fp, #-112]
	strd	r0, [r2]
	ldrd	r0, [r3]
	ldrd	r2, [r5]
	bl	__aeabi_dsub
	ldr	lr, [fp, #-112]
	ldrd	r2, [r4, #8]
	strd	r0, [lr]
	ldrd	r0, [r4]
	strd	r2, [fp, #-124]
	bl	__aeabi_dsub
	ldrd	r2, [r4, #16]
	strd	r0, [r4]
	ldrd	r0, [fp, #-124]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-116]
	strd	r0, [r4, #8]
	add	r4, r4, #800
	cmp	r10, r3
	bne	.L3
	ldr	r3, [fp, #-128]
	ldr	r2, [fp, #-136]
	ldr	r8, [fp, #-132]
	add	r3, r3, #1
	cmp	r2, r3
	str	r3, [fp, #-128]
	bne	.L4
.L1:
	sub	sp, fp, #32
	@ sp needed
	pop	{r4, r5, r6, r7, r8, r9, r10, fp, pc}
	.size	x_solve, .-x_solve
	.comm	rhs,16000,8
	.global	gp22
	.data
	.align	2
	.set	.LANCHOR0,. + 0
	.type	gp22, %object
	.size	gp22, 4
gp22:
	.word	20
	.ident	"GCC: (GNU) 7.0.0 20161102 (experimental)"
	.section	.note.GNU-stack,"",%progbits
	.cpu cortex-a9
	.eabi_attribute 20, 1
	.eabi_attribute 21, 1
	.eabi_attribute 23, 3
	.eabi_attribute 24, 1
	.eabi_attribute 25, 1
	.eabi_attribute 26, 2
	.eabi_attribute 30, 2
	.eabi_attribute 34, 1
	.eabi_attribute 18, 4
	.file	"bt.c"
	.global	__aeabi_dsub
	.text
	.align	2
	.global	x_solve
	.syntax unified
	.arm
	.fpu softvfp
	.type	x_solve, %function
x_solve:
	@ args = 0, pretend = 0, frame = 80
	@ frame_needed = 1, uses_anonymous_args = 0
	movw	r3, #:lower16:.LANCHOR0
	push	{r4, r5, r6, r7, r8, r9, r10, fp, lr}
	movt	r3, #:upper16:.LANCHOR0
	add	fp, sp, #32
	sub	sp, sp, #84
	ldr	r3, [r3]
	mov	r2, #200
	cmp	r3, #1
	mul	r1, r3, r3
	lsl	lr, r3, #3
	str	lr, [fp, #-92]
	mul	r2, r2, r1
	add	r0, r1, r1, lsl #2
	lsl	ip, r1, #3
	lsl	r0, r0, #3
	str	ip, [fp, #-96]
	add	r2, r2, #8
	str	r0, [fp, #-100]
	sub	sp, sp, r2
	ble	.L1
	ldr	r4, .L9
	add	r3, r3, r3, lsl #2
	lsr	r2, r0, #3
	lsr	r0, ip, #3
	add	ip, r2, r0
	lsl	r8, r0, #4
	add	r3, r4, r3, lsl #3
	lsl	ip, ip, #3
	lsl	r0, r0, #5
	lsl	r10, r2, #4
	lsl	r9, r2, #5
	str	r4, [fp, #-88]
	sub	r3, r3, #40
	str	ip, [fp, #-108]
	add	r2, sp, lr
	str	r3, [fp, #-104]
	sub	r3, lr, #8
	str	r0, [fp, #-68]
	str	r2, [fp, #-84]
	str	r3, [fp, #-112]
.L4:
	ldr	r5, [fp, #-84]
	ldr	r3, [fp, #-100]
	add	r6, r3, r5
	ldr	r3, [fp, #-96]
	add	r7, r3, r5
	ldr	r3, [fp, #-108]
	mov	r2, r7
	mov	r7, r6
	mov	r6, r2
	add	r4, r5, r3
	ldr	r3, [fp, #-112]
	str	r4, [fp, #-40]
	add	r3, r3, r5
	str	r3, [fp, #-80]
	ldr	r3, [fp, #-88]
	str	r3, [fp, #-76]
	mov	r3, r9
	mov	r9, r5
	str	r3, [fp, #-44]
.L3:
	ldrd	r0, [r9, #8]!
	ldrd	r2, [r7, #8]!
	bl	__aeabi_dsub
	ldr	r5, [fp, #-40]
	strd	r0, [r9]
	add	r1, r9, r8
	add	r0, r8, r1
	add	ip, r7, r8
	ldrd	r2, [r5, #8]!
	add	r4, r8, ip
	str	r1, [fp, #-52]
	str	r0, [fp, #-48]
	ldrd	r0, [r6, #8]!
	str	ip, [fp, #-60]
	str	r4, [fp, #-72]
	bl	__aeabi_dsub
	add	r3, r5, r8
	strd	r0, [r6]
	str	r3, [fp, #-64]
	add	r3, r6, r8
	mov	r4, r3
	ldrd	r0, [r9, r8]
	ldrd	r2, [r7, r8]
	bl	__aeabi_dsub
	strd	r0, [r9, r8]
	ldrd	r0, [r6, r8]
	str	r5, [fp, #-40]
	ldrd	r2, [r5, r8]
	bl	__aeabi_dsub
	ldr	r5, [fp, #-68]
	strd	r0, [r6, r8]
	ldrd	r2, [r7, r5]
	ldrd	r0, [r9, r5]
	bl	__aeabi_dsub
	strd	r0, [r9, r5]
	ldrd	r0, [r9, r10]
	ldrd	r2, [r7]
	bl	__aeabi_dsub
	ldr	r5, [fp, #-40]
	strd	r0, [r9, r10]
	ldrd	r0, [r6, r10]
	ldrd	r2, [r5]
	bl	__aeabi_dsub
	ldr	r5, [fp, #-52]
	strd	r0, [r6, r10]
	ldrd	r2, [r7, r8]
	ldrd	r0, [r5, r10]
	bl	__aeabi_dsub
	strd	r0, [r5, r10]
	ldr	r5, [fp, #-40]
	ldrd	r0, [r4, r10]
	ldrd	r2, [r5, r8]
	bl	__aeabi_dsub
	ldr	r5, [fp, #-68]
	strd	r0, [r4, r10]
	str	r4, [fp, #-56]
	ldrd	r2, [r7, r5]
	ldr	r4, [fp, #-48]
	ldrd	r0, [r4, r10]
	bl	__aeabi_dsub
	strd	r0, [r4, r10]
	ldrd	r2, [r7]
	ldrd	r0, [r7, r10]
	bl	__aeabi_dsub
	ldr	r4, [fp, #-40]
	strd	r0, [r7, r10]
	ldrd	r2, [r4]
	ldrd	r0, [r4, r10]
	bl	__aeabi_dsub
	ldr	r5, [fp, #-60]
	strd	r0, [r4, r10]
	ldrd	r2, [r7, r8]
	ldrd	r0, [r5, r10]
	bl	__aeabi_dsub
	strd	r0, [r5, r10]
	ldr	r5, [fp, #-64]
	ldrd	r2, [r4, r8]
	ldrd	r0, [r5, r10]
	bl	__aeabi_dsub
	strd	r0, [r5, r10]
	ldr	r5, [fp, #-68]
	ldr	r4, [fp, #-72]
	ldrd	r2, [r7, r5]
	ldrd	r0, [r4, r10]
	bl	__aeabi_dsub
	strd	r0, [r4, r10]
	ldr	r4, [fp, #-44]
	ldrd	r2, [r7]
	ldrd	r0, [r9, r4]
	bl	__aeabi_dsub
	strd	r0, [r9, r4]
	ldrd	r0, [r6, r4]
	ldr	r3, [fp, #-40]
	ldrd	r2, [r3]
	bl	__aeabi_dsub
	ldr	r5, [fp, #-52]
	strd	r0, [r6, r4]
	ldrd	r0, [r5, r4]
	ldrd	r2, [r7, r8]
	bl	__aeabi_dsub
	mov	r2, r5
	ldr	r3, [fp, #-40]
	mov	r5, r4
	strd	r0, [r2, r4]
	ldr	r4, [fp, #-56]
	ldrd	r2, [r3, r8]
	ldrd	r0, [r4, r5]
	bl	__aeabi_dsub
	strd	r0, [r4, r5]
	ldr	r4, [fp, #-48]
	ldrd	r0, [r4, r5]
	ldr	r5, [fp, #-68]
	ldrd	r2, [r7, r5]
	bl	__aeabi_dsub
	ldr	ip, [fp, #-44]
	strd	r0, [r4, ip]
	ldrd	r2, [r7, r10]
	ldrd	r0, [r9]
	bl	__aeabi_dsub
	ldr	r4, [fp, #-40]
	strd	r0, [r9]
	ldrd	r0, [r6]
	ldrd	r2, [r4, r10]
	bl	__aeabi_dsub
	ldr	r4, [fp, #-60]
	strd	r0, [r6]
	ldrd	r0, [r9, r8]
	ldrd	r2, [r4, r10]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-64]
	strd	r0, [r9, r8]
	ldrd	r0, [r6, r8]
	ldrd	r2, [r3, r10]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-72]
	strd	r0, [r6, r8]
	ldrd	r0, [r9, r5]
	ldrd	r2, [r3, r10]
	bl	__aeabi_dsub
	strd	r0, [r9, r5]
	ldrd	r0, [r9, r10]
	ldrd	r2, [r7, r10]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-40]
	strd	r0, [r9, r10]
	ldrd	r0, [r6, r10]
	ldrd	r2, [r3, r10]
	bl	__aeabi_dsub
	ldr	r5, [fp, #-52]
	strd	r0, [r6, r10]
	ldrd	r2, [r4, r10]
	ldrd	r0, [r5, r10]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-64]
	ldr	r4, [fp, #-56]
	strd	r0, [r5, r10]
	ldrd	r2, [r3, r10]
	ldrd	r0, [r4, r10]
	bl	__aeabi_dsub
	ldr	r5, [fp, #-72]
	strd	r0, [r4, r10]
	ldr	r4, [fp, #-48]
	ldrd	r2, [r5, r10]
	ldrd	r0, [r4, r10]
	bl	__aeabi_dsub
	strd	r0, [r4, r10]
	ldr	r4, [fp, #-44]
	ldrd	r2, [r7, r10]
	ldrd	r0, [r9, r4]
	bl	__aeabi_dsub
	mov	ip, r4
	strd	r0, [r9, r4]
	ldr	r4, [fp, #-40]
	ldrd	r0, [r6, ip]
	ldrd	r2, [r4, r10]
	mov	r4, ip
	bl	__aeabi_dsub
	ldr	ip, [fp, #-60]
	mov	r3, r4
	strd	r0, [r6, r4]
	ldr	r4, [fp, #-52]
	ldrd	r0, [r4, r3]
	ldrd	r2, [ip, r10]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-44]
	strd	r0, [r4, r3]
	ldr	r4, [fp, #-56]
	ldrd	r0, [r4, r3]
	ldr	r3, [fp, #-64]
	ldrd	r2, [r3, r10]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-44]
	ldr	lr, [fp, #-48]
	strd	r0, [r4, r3]
	ldrd	r0, [lr, r3]
	ldrd	r2, [r5, r10]
	bl	__aeabi_dsub
	ldr	r3, [fp, #-76]
	ldr	lr, [fp, #-48]
	ldr	r2, [fp, #-44]
	ldrd	r4, [r3, #8]
	strd	r0, [lr, r2]
	ldrd	r0, [r3]
	mov	r2, r4
	mov	r3, r5
	bl	__aeabi_dsub
	ldr	r3, [fp, #-76]
	strd	r0, [r3]
	mov	r0, r4
	mov	r1, r5
	mov	r4, r3
	ldrd	r2, [r3, #16]
	bl	__aeabi_dsub
	ldr	r2, [fp, #-80]
	add	r3, r4, #800
	strd	r0, [r4, #8]
	str	r3, [fp, #-76]
	cmp	r9, r2
	bne	.L3
	ldr	r2, [fp, #-84]
	ldr	r1, [fp, #-92]
	ldr	r3, [fp, #-88]
	ldr	r9, [fp, #-44]
	add	r2, r2, r1
	str	r2, [fp, #-84]
	add	r3, r3, #40
	ldr	r2, [fp, #-104]
	str	r3, [fp, #-88]
	cmp	r3, r2
	bne	.L4
.L1:
	sub	sp, fp, #32
	@ sp needed
	pop	{r4, r5, r6, r7, r8, r9, r10, fp, pc}
.L10:
	.align	2
.L9:
	.word	rhs+840
	.size	x_solve, .-x_solve
	.comm	rhs,16000,8
	.global	gp22
	.data
	.align	2
	.set	.LANCHOR0,. + 0
	.type	gp22, %object
	.size	gp22, 4
gp22:
	.word	20
	.ident	"GCC: (GNU) 7.0.0 20161102 (experimental)"
	.section	.note.GNU-stack,"",%progbits
	* tree-ssa-loop-ivopts.c (determine_group_iv_cost_address): Rename
	'sum_cost' to 'max_cost'.  Use maximum of per-use costs instead of sum.

diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index f9211ad..7d40ae8 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -5122,41 +5122,42 @@ determine_group_iv_cost_address (struct ivopts_data *data,
   bool can_autoinc;
   iv_inv_expr_ent *inv_expr = NULL;
   struct iv_use *use = group->vuses[0];
-  comp_cost sum_cost = no_cost, cost;
+  comp_cost max_cost = no_cost, cost;
 
   cost = get_computation_cost (data, use, cand, true,
 			       &depends_on, &can_autoinc, &inv_expr);
 
-  sum_cost = cost;
-  if (!sum_cost.infinite_cost_p () && cand->ainc_use == use)
+  max_cost = cost;
+  if (!max_cost.infinite_cost_p () && cand->ainc_use == use)
     {
       if (can_autoinc)
-	sum_cost -= cand->cost_step;
+	max_cost -= cand->cost_step;
       /* If we generated the candidate solely for exploiting autoincrement
 	 opportunities, and it turns out it can't be used, set the cost to
 	 infinity to make sure we ignore it.  */
       else if (cand->pos == IP_AFTER_USE || cand->pos == IP_BEFORE_USE)
-	sum_cost = infinite_cost;
+	max_cost = infinite_cost;
     }
 
   /* Uses in a group can share setup code, so only add setup cost once.  */
   cost -= cost.scratch;
   /* Compute and add costs for rest uses of this group.  */
-  for (i = 1; i < group->vuses.length () && !sum_cost.infinite_cost_p (); i++)
+  for (i = 1; i < group->vuses.length () && !max_cost.infinite_cost_p (); i++)
     {
       struct iv_use *next = group->vuses[i];
 
       /* TODO: We could skip computing cost for sub iv_use when it has the
 	 same cost as the first iv_use, but the cost really depends on the
 	 offset and where the iv_use is.  */
-	cost = get_computation_cost (data, next, cand, true,
-				     NULL, &can_autoinc, NULL);
-      sum_cost += cost;
+      cost = get_computation_cost (data, next, cand, true,
+				   NULL, &can_autoinc, NULL);
+      if (max_cost < cost)
+	max_cost = cost;
     }
-  set_group_iv_cost (data, group, cand, sum_cost, depends_on,
+  set_group_iv_cost (data, group, cand, max_cost, depends_on,
 		     NULL_TREE, ERROR_MARK, inv_expr);
 
-  return !sum_cost.infinite_cost_p ();
+  return !max_cost.infinite_cost_p ();
 }
 
 /* Computes value of candidate CAND at position AT in iteration NITER, and

Reply via email to