Hello,
I ran into an issue with the PRE optimization: it generates worse
code than no optimization at all.

This is the test function:

void foo(int *data, int *m_v4w, int num)
{
  int i;
  int m0;
  for( i=0; i<num; i++ )
  {
    int *data1 = (int *)(data[i] - 2);
    int *data2 = (int *)data[i];
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int d0, d1, d2, d3;

    d0 = data1[0];
    d1 = data2[0];
    d2 = data1[1];
    d3 = data2[1];

    tmp0 = m_v4w[0];
    tmp1 = m_v4w[2];
    tmp2 = m_v4w[4];
    tmp3 = m_v4w[6];
    tmp4 = m_v4w[8];
    tmp5 = m_v4w[10];
    tmp6 = m_v4w[12];
    tmp7 = m_v4w[14];

    m0 = tmp0 * d0;
    m0 += tmp1 * d0;
    m0 += tmp2 * d1;
    m0 += tmp3 * d1;
    m0 += tmp4 * d2;
    m0 += tmp5 * d2;
    m0 += tmp6 * d3;
    m0 += tmp7 * d3;

    data2[0] =  m0;
  }             
}   

The following is the code generated for our processor (it is easy to read).
The PRE pass moves the address expressions (reg + constant offset) out of
the loop. This increases both code size and cycle count unnecessarily,
since our architecture supports a reg + constant offset addressing mode.
It gets even worse in bigger loops: the extra live registers cause spills
during register allocation, and performance suffers further. This happens
with both 4.4 and trunk. I also tested on x86, which shows the same
behavior. A C-level sketch of what PRE does here follows the listing.

gcc_compiled.:
        .section .text, "axU"
        .align 8
        .global foo
        .type   foo,@function
foo:
        cmplew p0, r2, zr
        sbpt p0.0, [link]
        addw r9, r1, #0x08      :       addw r8, r1, #0x10
        addw r7, r1, #0x18      :       addw r6, r1, #0x20
        addw r5, r1, #0x28      :       addw r4, r1, #0x30
        addw r3, r1, #0x38
.L3:
        loop  r2,.L6
        ldw r10, [r0], #4!      :       ldw r11, [r1]
        ldw r14, [r9]   :       ldw r22, [r7]
        ldw r15, [r8]   :       ldw r21, [r5]
        ldw r20, [r3]   :       ldw r13, [r6]
        ldw r12, [r4]
        ldw r19, [r10]
        ldw r18, [r10, #-2]     :       ldw r17, [r10, #2]
        ldw r16, [r10, #4]      :       addw r14, r14, r11
        addw r15, r22, r15      :       addw r13, r21, r13
        addw r12, r20, r12
        mulw r15, r15, r19
        mulw r14, r14, r18      :       mulw r13, r13, r17
        mulw r12, r12, r16
        addw r11, r15, r14
        addw r11, r11, r13
        addw r11, r11, r12
.L6:
        stw r11, [r10]
.L7:
        sbl [link]
        .size   foo,.-foo
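
At the source level, the transformation is roughly equivalent to rewriting
the loop as below. This is only an illustration of the hoisting visible in
the listing above (the real transformation happens on GIMPLE), and the
names foo_after_pre and p2..p14 are mine, not the compiler's:

void foo_after_pre(int *data, int *m_v4w, int num)
{
  int i;
  /* PRE computes these seven addresses once before the loop
     (the addw r9..r3 instructions above), so seven extra
     registers stay live across the whole loop.  */
  int *p2  = &m_v4w[2];
  int *p4  = &m_v4w[4];
  int *p6  = &m_v4w[6];
  int *p8  = &m_v4w[8];
  int *p10 = &m_v4w[10];
  int *p12 = &m_v4w[12];
  int *p14 = &m_v4w[14];

  for( i=0; i<num; i++ )
  {
    int *data1 = (int *)(data[i] - 2);
    int *data2 = (int *)data[i];
    int d0 = data1[0];
    int d1 = data2[0];
    int d2 = data1[1];
    int d3 = data2[1];
    int m0;

    /* The loop body now loads through the hoisted pointers instead
       of using m_v4w[k], i.e. the reg + constant offset mode.  */
    m0  = m_v4w[0] * d0;
    m0 += *p2  * d0;
    m0 += *p4  * d1;
    m0 += *p6  * d1;
    m0 += *p8  * d2;
    m0 += *p10 * d2;
    m0 += *p12 * d3;
    m0 += *p14 * d3;

    data2[0] = m0;
  }
}

With -fno-tree-pre the loads stay in the m_v4w[k] form and map directly
onto the addressing mode, as the second listing below shows.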


The following is the assembly code generated with the -fno-tree-pre option.
foo:
        cmplew p0, r2, zr
        sbpt p0.0, [link]
.L3:
        loop  r2,.L6
        ldw r3, [r0], #4!       :       ldw r4, [r1]
        ldw r7, [r1, #8]        :       ldw r15, [r1, #24]
        ldw r8, [r1, #16]       :       ldw r14, [r1, #40]
        ldw r13, [r1, #56]      :       ldw r6, [r1, #32]
        ldw r5, [r1, #48]
        ldw r12, [r3]
        ldw r11, [r3, #-2]      :       ldw r10, [r3, #2]
        ldw r9, [r3, #4]        :       addw r7, r7, r4
        addw r8, r15, r8        :       addw r6, r14, r6
        addw r5, r13, r5
        mulw r8, r8, r12
        mulw r7, r7, r11        :       mulw r6, r6, r10
        mulw r5, r5, r9
        addw r4, r8, r7
        addw r4, r4, r6
        addw r4, r4, r5
.L6:
        stw r4, [r3]
.L7:
        sbl [link]
        .size   foo,.-foo


