Hello,

I looked at the following code to see how GCC4 and GCC3 differ in their
use of the POST_INC addressing mode (and other similar modes).

void tst(char * __restrict__ a, char * __restrict__ b){
  *a++ = *b++;
  *a++ = *b++;
  *a++ = *b++;
  *a++ = *b++;
  *a++ = *b++;
  *a++ = *b++;
  *a = *b;
}


With an ARM processor as the target, GCC 4.2.2 generates the following
assembly:
tst:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        mov     r2, r1
        ldrb    ip, [r2], #1    @ zero_extendqisi2
        mov     r3, r0
        strb    ip, [r3], #1
        ldrb    r1, [r1, #1]    @ zero_extendqisi2
        strb    r1, [r0, #1]
        ldrb    r1, [r2, #1]    @ zero_extendqisi2
        strb    r1, [r3, #1]
        add     r2, r2, #1
        ldrb    r1, [r2, #1]    @ zero_extendqisi2
        add     r3, r3, #1
        strb    r1, [r3, #1]
        add     r2, r2, #1
        ldrb    r1, [r2, #1]    @ zero_extendqisi2
        add     r3, r3, #1
        strb    r1, [r3, #1]
        add     r2, r2, #1
        ldrb    r1, [r2, #1]    @ zero_extendqisi2
        add     r3, r3, #1
        strb    r1, [r3, #1]
        ldrb    r2, [r2, #2]    @ zero_extendqisi2
        @ lr needed for prologue
        strb    r2, [r3, #2]
        bx      lr
        .size   tst, .-tst
        .ident  "GCC: (GNU) 4.2.2"

GCC 3.4.6, by contrast, generates much better code by using the POST_INC
addressing mode extensively:

tst:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        ldrb    r3, [r1], #1    @ zero_extendqisi2
        strb    r3, [r0], #1
        ldrb    r3, [r1], #1    @ zero_extendqisi2
        strb    r3, [r0], #1
        ldrb    r3, [r1], #1    @ zero_extendqisi2
        strb    r3, [r0], #1
        ldrb    r3, [r1], #1    @ zero_extendqisi2
        strb    r3, [r0], #1
        ldrb    r3, [r1], #1    @ zero_extendqisi2
        strb    r3, [r0], #1
        ldrb    r3, [r1], #1    @ zero_extendqisi2
        strb    r3, [r0], #1
        ldrb    r3, [r1, #0]    @ zero_extendqisi2
        @ lr needed for prologue
        strb    r3, [r0, #0]
        mov     pc, lr
        .size   tst, .-tst
        .ident  "GCC: (GNU) 3.4.6"

I looked at the dumped tree file tst.c.102t.final_cleanup:
tst (a, b)
{
  char * restrict a.54;
  char * restrict a.53;
  char * restrict a.52;
  char * restrict a.51;
  char * restrict a.50;
  char * restrict b.48;
  char * restrict b.47;
  char * restrict b.46;
  char * restrict b.45;
  char * restrict b.44;

<bb 2>:
  *a = *b;
  a.50 = a + 1B;
  b.44 = b + 1B;
  *a.50 = *b.44;
  a.51 = a.50 + 1B;
  b.45 = b.44 + 1B;
  *a.51 = *b.45;
  a.52 = a.51 + 1B;
  b.46 = b.45 + 1B;
  *a.52 = *b.46;
  a.53 = a.52 + 1B;
  b.47 = b.46 + 1B;
  *a.53 = *b.47;
  a.54 = a.53 + 1B;
  b.48 = b.47 + 1B;
  *a.54 = *b.48;
  *(a.54 + 1B) = *(b.48 + 1B);
  return;

}
I believe this is a fundamental issue with the Tree-SSA IR. The POST_INC
addressing mode requires a pattern in which the same variable is both read
and written by the increment (both USE and DEF), whereas the SSA form
produces a different variable for each DEF (a.50, a.51, ... and b.44,
b.45, ... in the dump above). Therefore, GCC4 cannot efficiently use
POST_INC and other similar addressing modes. Is there any solution to
overcome this problem? Any suggestions would be greatly appreciated.


Bingfeng Mei
Broadcom UK

Reply via email to