[ARM]Extra load store/instructions compared to gcc-3.4

Alexey Kravets Wed, 25 Apr 2012 05:17:21 -0700

Hi guys,
I have a test case (shell sort, see attached) compiled with different
ARM compilers:
GCC-4.6.3, GCC-3.4.6, and ARMCC.


Both ARMCC and GCC-3.4.6  generate quite optimal assembly while GCC-4.6.3
inserts extra load/store instructions compared to the other compilers.

Can the SSA representation usage in modern GCC be the reason for this?

If so, has anyone tried to do something about it?

The generated assembly codes are attached:
sort-3.4.s: Assembly, generated by the GCC-3.4.6
sort-4.6.3.s: Assembly, generated by the GCC-4.6.3
sort-armcc.s: Assembly, generated by the ARMCC

% armcc
ARM C/C++ Compiler, 4.1 [Build 713]

The file has been compiled with following options:
for GCC:
-O3
for ARMCC:
-O3 -Otime


-- 
Alexey Kravets
mr.kayr...@gmail.com

        .file   "sort.i"
        .global __divsi3
        .text
        .align  2
        .global shell_sort
        .type   shell_sort, %function
shell_sort:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        stmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
        mov     fp, r1
        sub     r7, r0, #4
        mov     r8, #1
.L2:
        add     r3, r8, r8, asl #1
        add     r8, r3, #1
        cmp     r8, fp
        ble     .L2
.L17:
        mov     r0, r8
        mov     r1, #3
        bl      __divsi3
        add     r9, r0, #1
        cmp     r9, fp
        mov     r8, r0
        bgt     .L16
.L26:
        ldr     sl, [r7, r9, asl #2]
        mov     r4, r9
        b       .L11
.L25:
        ldr     r5, [r7, r6, asl #2]
        mov     r0, r5
        bl      strcmp
        cmp     r0, #0
        ble     .L12
        str     r5, [r7, r4, asl #2]
        mov     r4, r6
.L11:
        cmp     r4, r8
        rsb     r6, r8, r4
        mov     r1, sl
        bgt     .L25
.L12:
        add     r9, r9, #1
        cmp     r9, fp
        str     sl, [r7, r4, asl #2]
        ble     .L26
.L16:
        cmp     r8, #1
        bgt     .L17
        ldmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
        .size   shell_sort, .-shell_sort
        .ident  "GCC: (GNU) 3.4.6"

#include <string.h>
void shell_sort(char *strings[], int n)
{
    int h, i, j;
    char *v;

    strings--;        /* Make array 1 origin */
    h = 1;
    do {h = h * 3 + 1;} while (h <= n);
    do {
        h = h / 3;
        for (i = h + 1; i <= n; i++) {
            v = strings[i];
            j = i;
            while (j > h && strcmp(strings[j-h], v) > 0) {
                strings[j] = strings[j-h];
                j = j-h;
            }
            strings[j] = v;
        }
    }
    while (h > 1);
}

        .cpu cortex-a9
        .eabi_attribute 27, 3
        .fpu vfp3
        .eabi_attribute 20, 1
        .eabi_attribute 21, 1
        .eabi_attribute 23, 3
        .eabi_attribute 24, 1
        .eabi_attribute 25, 1
        .eabi_attribute 26, 2
        .eabi_attribute 30, 2
        .eabi_attribute 34, 1
        .eabi_attribute 18, 2
        .file   "shell_sort.c"
        .text
        .align  2
        .global shell_sort
        .type   shell_sort, %function
shell_sort:
        @ args = 0, pretend = 0, frame = 40
        @ frame_needed = 0, uses_anonymous_args = 0
        stmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
        mov     r3, r1
        mov     r9, #1
        sub     sp, sp, #44
        sub     r2, r0, #4
        str     r0, [sp, #28]
        str     r1, [sp, #32]
        str     r2, [sp, #24]
.L2:
        add     r9, r9, r9, asl #1
        add     r9, r9, #1
        cmp     r9, r3
        ble     .L2
        ldr     r2, [sp, #32]
        movw    r3, #21846
        movt    r3, 21845
        str     r3, [sp, #36]
        add     r2, r2, #1
        str     r2, [sp, #20]
.L9:
        ldr     r2, [sp, #36]
        smull   r2, r3, r2, r9
        ldr     r2, [sp, #32]
        sub     r9, r3, r9, asr #31
        add     r3, r9, #1
        cmp     r2, r3
        str     r3, [sp, #8]
        blt     .L3
        rsb     r3, r9, r9, asl #30
        mov     r2, r9, asl #2
        mov     r3, r3, asl #2
        str     r2, [sp, #0]
        rsb     fp, r9, #0
        str     r3, [sp, #4]
        ldr     r3, [sp, #28]
        add     r3, r3, r2
        ldr     r2, [sp, #28]
        str     r3, [sp, #12]
        str     r2, [sp, #16]
.L7:
        ldr     r8, [sp, #12]
        ldr     r3, [sp, #8]
        mov     r2, r8
        ldr     sl, [r2], #4
        cmp     r9, r3
        str     r2, [sp, #12]
        bge     .L4
        ldr     r4, [sp, #16]
        mov     r7, r3
        b       .L5
.L6:
        ldr     r3, [sp, #0]
        cmp     r9, r6
        ldr     r2, [sp, #4]
        mov     r7, r6
        str     r5, [r4, r3]
        add     r4, r4, r2
        bge     .L4
.L5:
        ldr     r5, [r4, #0]
        mov     r1, sl
        add     r6, r7, fp
        mov     r8, r4
        mov     r0, r5
        bl      strcmp
        cmp     r0, #0
        bgt     .L6
        ldr     r3, [sp, #24]
        add     r8, r3, r7, asl #2
.L4:
        ldr     r2, [sp, #8]
        ldr     r3, [sp, #20]
        str     sl, [r8, #0]
        add     r2, r2, #1
        str     r2, [sp, #8]
        cmp     r2, r3
        ldr     r2, [sp, #16]
        add     r2, r2, #4
        str     r2, [sp, #16]
        bne     .L7
.L3:
        cmp     r9, #1
        bgt     .L9
        add     sp, sp, #44
        ldmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
        .size   shell_sort, .-shell_sort
        .ident  "GCC: (VDLinux.RC1) 4.6.3 20120105 (prerelease)"
        .section        .note.GNU-stack,"",%progbits

; generated by ARM C/C++ Compiler, 4.1 [Build 713]
; commandline armcc [-S --cpu=Cortex-A9 --fpu=VFPv3 -O3 -Otime shell_sort.c]
        ARM
        REQUIRE8
        PRESERVE8

        AREA ||.text||, CODE, READONLY, ALIGN=2

shell_sort PROC
        PUSH     {r4-r12,lr}
        MOV      r10,r1
        SUB      r6,r0,#4
        MOV      r5,#1
|L0.16|
        ADD      r0,r5,r5,LSL #1
        ADD      r5,r0,#1
        CMP      r5,r10
        BLE      |L0.16|
        LDR      r11,|L0.140|
|L0.36|
        SMULL    r1,r0,r11,r5
        SUB      r5,r0,r0,ASR #31
        ADD      r8,r5,#1
        CMP      r8,r10
        BGT      |L0.128|
|L0.56|
        LDR      r9,[r6,r8,LSL #2]
        MOV      r4,r8
        B        |L0.80|
|L0.68|
        LDR      r0,[r6,r7,LSL #2]
        STR      r0,[r6,r4,LSL #2]
        MOV      r4,r7
|L0.80|
        CMP      r4,r5
        BLE      |L0.112|
        SUB      r7,r4,r5
        MOV      r1,r9
        LDR      r0,[r6,r7,LSL #2]
        BL       strcmp
        CMP      r0,#0
        BGT      |L0.68|
|L0.112|
        ADD      r8,r8,#1
        CMP      r8,r10
        STR      r9,[r6,r4,LSL #2]
        BLE      |L0.56|
|L0.128|
        CMP      r5,#1
        BGT      |L0.36|
        POP      {r4-r12,pc}
        ENDP

|L0.140|
        DCD      0x55555556

        AREA ||.arm_vfe_header||, DATA, READONLY, NOALLOC, ALIGN=2

        DCD      0x00000000

        EXPORT shell_sort [CODE]

        IMPORT ||Lib$$Request$$armlib|| [CODE,WEAK]
        IMPORT strcmp [CODE]

        ATTR FILESCOPE
        ATTR SETVALUE Tag_ABI_PCS_wchar_t,2
        ATTR SETVALUE Tag_ABI_enum_size,1
        ATTR SETVALUE Tag_ABI_optimization_goals,2
        ATTR SETSTRING Tag_conformance,"2.06"
        ATTR SETVALUE AV,18,1

        ASSERT {ENDIAN} = "little"
        ASSERT {INTER} = {TRUE}
        ASSERT {ROPI} = {FALSE}
        ASSERT {RWPI} = {FALSE}
        ASSERT {IEEE_FULL} = {FALSE}
        ASSERT {IEEE_PART} = {FALSE}
        ASSERT {IEEE_JAVA} = {FALSE}
        END

[ARM]Extra load store/instructions compared to gcc-3.4

Reply via email to