Hello, I encountered an issue with the PRE optimization pass, which generates worse code than no optimization at all.
This is the test function:

void foo(int *data, int *m_v4w, int num)
{
    int i;
    int m0;

    for (i = 0; i < num; i++) {
        int *data1 = (int *)(data[i] - 2);
        int *data2 = (int *)data[i];
        int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
        int d0, d1, d2, d3;

        d0 = data1[0];
        d1 = data2[0];
        d2 = data1[1];
        d3 = data2[1];

        tmp0 = m_v4w[0];
        tmp1 = m_v4w[2];
        tmp2 = m_v4w[4];
        tmp3 = m_v4w[6];
        tmp4 = m_v4w[8];
        tmp5 = m_v4w[10];
        tmp6 = m_v4w[12];
        tmp7 = m_v4w[14];

        m0  = tmp0 * d0;
        m0 += tmp1 * d0;
        m0 += tmp2 * d1;
        m0 += tmp3 * d1;
        m0 += tmp4 * d2;
        m0 += tmp5 * d2;
        m0 += tmp6 * d3;
        m0 += tmp7 * d3;

        data2[0] = m0;
    }
}

The following is the code generated for our processor (it is easy to read). The PRE pass moves the address expressions (reg + constant offset) out of the loop. This increases both code size and cycle count unnecessarily, since our architecture supports a reg + constant-offset addressing mode. It gets even worse in bigger loops: too many live registers cause spills in register allocation, and performance suffers further. This happens with both 4.4 and trunk. I also tested on x86, which shows the same behavior.

gcc_compiled.:
        .section  .text, "axU"
        .align    8
        .global   foo
        .type     foo,@function
foo:
        cmplew p0, r2, zr
        sbpt p0.0, [link]
        addw r9, r1, #0x08 : addw r8, r1, #0x10
        addw r7, r1, #0x18 : addw r6, r1, #0x20
        addw r5, r1, #0x28 : addw r4, r1, #0x30
        addw r3, r1, #0x38
.L3:
        loop r2,.L6
        ldw r10, [r0], #4! : ldw r11, [r1]
        ldw r14, [r9] : ldw r22, [r7]
        ldw r15, [r8] : ldw r21, [r5]
        ldw r20, [r3] : ldw r13, [r6]
        ldw r12, [r4]
        ldw r19, [r10]
        ldw r18, [r10, #-2] : ldw r17, [r10, #2]
        ldw r16, [r10, #4] : addw r14, r14, r11
        addw r15, r22, r15 : addw r13, r21, r13
        addw r12, r20, r12
        mulw r15, r15, r19
        mulw r14, r14, r18 : mulw r13, r13, r17
        mulw r12, r12, r16
        addw r11, r15, r14
        addw r11, r11, r13
        addw r11, r11, r12
.L6:
        stw r11, [r10]
.L7:
        sbl [link]
        .size     foo,.-foo

The following is the assembly code generated with the -fno-tree-pre option.

foo:
        cmplew p0, r2, zr
        sbpt p0.0, [link]
.L3:
        loop r2,.L6
        ldw r3, [r0], #4! : ldw r4, [r1]
        ldw r7, [r1, #8] : ldw r15, [r1, #24]
        ldw r8, [r1, #16] : ldw r14, [r1, #40]
        ldw r13, [r1, #56] : ldw r6, [r1, #32]
        ldw r5, [r1, #48]
        ldw r12, [r3]
        ldw r11, [r3, #-2] : ldw r10, [r3, #2]
        ldw r9, [r3, #4] : addw r7, r7, r4
        addw r8, r15, r8 : addw r6, r14, r6
        addw r5, r13, r5
        mulw r8, r8, r12
        mulw r7, r7, r11 : mulw r6, r6, r10
        mulw r5, r5, r9
        addw r4, r8, r7
        addw r4, r4, r6
        addw r4, r4, r5
.L6:
        stw r4, [r3]
.L7:
        sbl [link]
        .size     foo,.-foo
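To make the register-pressure issue concrete, here is a rough C-level sketch of what the loop looks like after PRE has hoisted the address expressions. This is a hand-written illustration, not actual GCC dump output; the names foo_after_pre and p2..p14 are mine.

/* Hand-written sketch of the loop after PRE (illustration only).
   The seven address expressions &m_v4w[2] .. &m_v4w[14] are hoisted
   out of the loop into temporaries, so seven extra registers stay
   live across the whole loop body.  */
void foo_after_pre(int *data, int *m_v4w, int num)
{
    int *p2  = &m_v4w[2];   /* addw r9, r1, #0x08 */
    int *p4  = &m_v4w[4];   /* addw r8, r1, #0x10 */
    int *p6  = &m_v4w[6];   /* addw r7, r1, #0x18 */
    int *p8  = &m_v4w[8];   /* addw r6, r1, #0x20 */
    int *p10 = &m_v4w[10];  /* addw r5, r1, #0x28 */
    int *p12 = &m_v4w[12];  /* addw r4, r1, #0x30 */
    int *p14 = &m_v4w[14];  /* addw r3, r1, #0x38 */
    int i;

    for (i = 0; i < num; i++) {
        int *data1 = (int *)(data[i] - 2);
        int *data2 = (int *)data[i];
        int d0 = data1[0], d1 = data2[0], d2 = data1[1], d3 = data2[1];
        int m0;

        /* The loads themselves stay inside the loop (they may alias the
           store to data2[0]); only the addresses were hoisted.  */
        m0  = m_v4w[0] * d0;
        m0 += *p2  * d0;
        m0 += *p4  * d1;
        m0 += *p6  * d1;
        m0 += *p8  * d2;
        m0 += *p10 * d2;
        m0 += *p12 * d3;
        m0 += *p14 * d3;

        data2[0] = m0;
    }
}

With a reg + constant-offset addressing mode, each *pN load could instead be a single ldw rX, [r1, #offset] as in the -fno-tree-pre version, and none of the seven pointer registers would need to stay live across the loop.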