Hi guys, I have a test case (shell sort, see attached) compiled with different ARM compilers: GCC-4.6.3, GCC-3.4.6, and ARMCC.
Both ARMCC and GCC-3.4.6 generate quite optimal assembly while GCC-4.6.3 inserts extra load/store instructions compared to the other compilers. Can the SSA representation usage in modern GCC be the reason for this? If so, has anyone tried to do something about it? The generated assembly codes are attached: sort-3.4.s: Assembly, generated by the GCC-3.4.6 sort-4.6.3.s: Assembly, generated by the GCC-4.6.3 sort-armcc.s: Assembly, generated by the ARMCC % armcc ARM C/C++ Compiler, 4.1 [Build 713] The file has been compiled with following options: for GCC: -O3 for ARMCC: -O3 -Otime -- Alexey Kravets mr.kayr...@gmail.com
.file "sort.i" .global __divsi3 .text .align 2 .global shell_sort .type shell_sort, %function shell_sort: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 stmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr} mov fp, r1 sub r7, r0, #4 mov r8, #1 .L2: add r3, r8, r8, asl #1 add r8, r3, #1 cmp r8, fp ble .L2 .L17: mov r0, r8 mov r1, #3 bl __divsi3 add r9, r0, #1 cmp r9, fp mov r8, r0 bgt .L16 .L26: ldr sl, [r7, r9, asl #2] mov r4, r9 b .L11 .L25: ldr r5, [r7, r6, asl #2] mov r0, r5 bl strcmp cmp r0, #0 ble .L12 str r5, [r7, r4, asl #2] mov r4, r6 .L11: cmp r4, r8 rsb r6, r8, r4 mov r1, sl bgt .L25 .L12: add r9, r9, #1 cmp r9, fp str sl, [r7, r4, asl #2] ble .L26 .L16: cmp r8, #1 bgt .L17 ldmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc} .size shell_sort, .-shell_sort .ident "GCC: (GNU) 3.4.6"
#include <string.h> void shell_sort(char *strings[], int n) { int h, i, j; char *v; strings--; /* Make array 1 origin */ h = 1; do {h = h * 3 + 1;} while (h <= n); do { h = h / 3; for (i = h + 1; i <= n; i++) { v = strings[i]; j = i; while (j > h && strcmp(strings[j-h], v) > 0) { strings[j] = strings[j-h]; j = j-h; } strings[j] = v; } } while (h > 1); }
.cpu cortex-a9 .eabi_attribute 27, 3 .fpu vfp3 .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 .eabi_attribute 24, 1 .eabi_attribute 25, 1 .eabi_attribute 26, 2 .eabi_attribute 30, 2 .eabi_attribute 34, 1 .eabi_attribute 18, 2 .file "shell_sort.c" .text .align 2 .global shell_sort .type shell_sort, %function shell_sort: @ args = 0, pretend = 0, frame = 40 @ frame_needed = 0, uses_anonymous_args = 0 stmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr} mov r3, r1 mov r9, #1 sub sp, sp, #44 sub r2, r0, #4 str r0, [sp, #28] str r1, [sp, #32] str r2, [sp, #24] .L2: add r9, r9, r9, asl #1 add r9, r9, #1 cmp r9, r3 ble .L2 ldr r2, [sp, #32] movw r3, #21846 movt r3, 21845 str r3, [sp, #36] add r2, r2, #1 str r2, [sp, #20] .L9: ldr r2, [sp, #36] smull r2, r3, r2, r9 ldr r2, [sp, #32] sub r9, r3, r9, asr #31 add r3, r9, #1 cmp r2, r3 str r3, [sp, #8] blt .L3 rsb r3, r9, r9, asl #30 mov r2, r9, asl #2 mov r3, r3, asl #2 str r2, [sp, #0] rsb fp, r9, #0 str r3, [sp, #4] ldr r3, [sp, #28] add r3, r3, r2 ldr r2, [sp, #28] str r3, [sp, #12] str r2, [sp, #16] .L7: ldr r8, [sp, #12] ldr r3, [sp, #8] mov r2, r8 ldr sl, [r2], #4 cmp r9, r3 str r2, [sp, #12] bge .L4 ldr r4, [sp, #16] mov r7, r3 b .L5 .L6: ldr r3, [sp, #0] cmp r9, r6 ldr r2, [sp, #4] mov r7, r6 str r5, [r4, r3] add r4, r4, r2 bge .L4 .L5: ldr r5, [r4, #0] mov r1, sl add r6, r7, fp mov r8, r4 mov r0, r5 bl strcmp cmp r0, #0 bgt .L6 ldr r3, [sp, #24] add r8, r3, r7, asl #2 .L4: ldr r2, [sp, #8] ldr r3, [sp, #20] str sl, [r8, #0] add r2, r2, #1 str r2, [sp, #8] cmp r2, r3 ldr r2, [sp, #16] add r2, r2, #4 str r2, [sp, #16] bne .L7 .L3: cmp r9, #1 bgt .L9 add sp, sp, #44 ldmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc} .size shell_sort, .-shell_sort .ident "GCC: (VDLinux.RC1) 4.6.3 20120105 (prerelease)" .section .note.GNU-stack,"",%progbits
; generated by ARM C/C++ Compiler, 4.1 [Build 713] ; commandline armcc [-S --cpu=Cortex-A9 --fpu=VFPv3 -O3 -Otime shell_sort.c] ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 shell_sort PROC PUSH {r4-r12,lr} MOV r10,r1 SUB r6,r0,#4 MOV r5,#1 |L0.16| ADD r0,r5,r5,LSL #1 ADD r5,r0,#1 CMP r5,r10 BLE |L0.16| LDR r11,|L0.140| |L0.36| SMULL r1,r0,r11,r5 SUB r5,r0,r0,ASR #31 ADD r8,r5,#1 CMP r8,r10 BGT |L0.128| |L0.56| LDR r9,[r6,r8,LSL #2] MOV r4,r8 B |L0.80| |L0.68| LDR r0,[r6,r7,LSL #2] STR r0,[r6,r4,LSL #2] MOV r4,r7 |L0.80| CMP r4,r5 BLE |L0.112| SUB r7,r4,r5 MOV r1,r9 LDR r0,[r6,r7,LSL #2] BL strcmp CMP r0,#0 BGT |L0.68| |L0.112| ADD r8,r8,#1 CMP r8,r10 STR r9,[r6,r4,LSL #2] BLE |L0.56| |L0.128| CMP r5,#1 BGT |L0.36| POP {r4-r12,pc} ENDP |L0.140| DCD 0x55555556 AREA ||.arm_vfe_header||, DATA, READONLY, NOALLOC, ALIGN=2 DCD 0x00000000 EXPORT shell_sort [CODE] IMPORT ||Lib$$Request$$armlib|| [CODE,WEAK] IMPORT strcmp [CODE] ATTR FILESCOPE ATTR SETVALUE Tag_ABI_PCS_wchar_t,2 ATTR SETVALUE Tag_ABI_enum_size,1 ATTR SETVALUE Tag_ABI_optimization_goals,2 ATTR SETSTRING Tag_conformance,"2.06" ATTR SETVALUE AV,18,1 ASSERT {ENDIAN} = "little" ASSERT {INTER} = {TRUE} ASSERT {ROPI} = {FALSE} ASSERT {RWPI} = {FALSE} ASSERT {IEEE_FULL} = {FALSE} ASSERT {IEEE_PART} = {FALSE} ASSERT {IEEE_JAVA} = {FALSE} END