Hello, I am looking at GCC's loop unrolling and find it quite inefficient compared with a manually unrolled loop, even for a very simple loop. Below are a simple loop and its manually unrolled version. I didn't apply any tricks to the manually unrolled one; it is an exact replication of the original loop body. I expected that, with -funroll-loops, the first version would produce code of similar quality to the second one. However, when compiled for the ARM target with mainline GCC, the two functions produce very different results.
The GCC-unrolled version mainly suffers from two issues. First, the load/store offsets are held in registers, so extra ADD instructions are needed to advance the offset for each unrolled iteration. In contrast, the manually unrolled code makes efficient use of immediate offsets and needs only one ADD to adjust the base register at the end. Second, the alias (dependence) analysis is overly conservative. The LOAD instruction of the next unrolled iteration cannot be moved past the previous STORE instruction, even though the two clearly do not alias. I suspect the failure of alias analysis is related to the first issue, i.e. the handling of base and offset addresses. The .sched2 file shows that the first loop body requires 57 cycles whereas the second one takes 50 cycles for arm9 (56 cycles vs. 34 cycles for XScale). It becomes even worse for our VLIW port, due to the longer latency of MUL and load instructions and the inability to fill all slots (120 cycles vs. 20 cycles). From analyzing the compilation phases, I believe that if loop unrolling happened at the tree level, or if we had an optimizing pass like "ivopts" after loop unrolling at the RTL level, GCC could produce far more efficient unrolled code. The "ivopts" pass really does a wonderful job of optimizing induction variables. Strangely, I found some unrolling functions at the tree level, but there is no independent tree-level loop unrolling pass except "cunroll", which does complete unrolling. What prevents such a tree-level unrolling pass from being added? Or are there any suggestions for improving the existing RTL-level unrolling? Thanks in advance.
Cheers, Bingfeng Mei Broadcom UK void Unroll( short s, int * restrict b_inout, int *restrict out) { int i; for (i=0; i<64; i++) { b_inout[i] = b_inout[i] * s; } } void ManualUnroll( short s, int * restrict b_inout, int *restrict out) { int i; for (i=0; i<64;) { b_inout[i] = b_inout[i] * s; i++; b_inout[i] = b_inout[i] * s; i++; b_inout[i] = b_inout[i] * s; i++; b_inout[i] = b_inout[i] * s; i++; b_inout[i] = b_inout[i] * s; i++; b_inout[i] = b_inout[i] * s; i++; b_inout[i] = b_inout[i] * s; i++; b_inout[i] = b_inout[i] * s; i++; } } arm-elf-gcc tst2.c -O2 -std=c99 -S -v -fdump-tree-all -da -mcpu=arm9 -funroll-loops Unroll: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 @ link register save eliminated. mov r0, r0, asl #16 stmfd sp!, {r4, r5, r6} mov r4, r1 mov r6, r0, asr #16 mov r5, #0 .L2: ldr r1, [r4, r5] add ip, r5, #4 mul r0, r6, r1 str r0, [r4, r5] ldr r3, [r4, ip] add r0, ip, #4 mul r2, r6, r3 str r2, [r4, ip] ldr r1, [r4, r0] add ip, r5, #12 mul r3, r6, r1 str r3, [r4, r0] ldr r2, [r4, ip] add r1, r5, #16 mul r3, r6, r2 str r3, [r4, ip] ldr r0, [r4, r1] add ip, r5, #20 mul r3, r6, r0 str r3, [r4, r1] ldr r2, [r4, ip] add r1, r5, #24 mul r0, r6, r2 str r0, [r4, ip] ldr r3, [r4, r1] add ip, r5, #28 mul r0, r6, r3 str r0, [r4, r1] ldr r2, [r4, ip] add r5, r5, #32 mul r3, r6, r2 cmp r5, #256 str r3, [r4, ip] bne .L2 ldmfd sp!, {r4, r5, r6} bx lr .size Unroll, .-Unroll arm-elf-gcc tst2.c -O2 -std=c99 -S -v -fdump-tree-all -da -mcpu=arm9 ManualUnroll: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 @ link register save eliminated. 
mov r0, r0, asl #16 stmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp} mov sl, r1 mov r9, r0, asr #16 add fp, r1, #256 .L7: ldr r3, [sl, #0] ldr r2, [sl, #4] ldr r1, [sl, #8] ldr r0, [sl, #12] ldr ip, [sl, #16] add r4, sl, #20 ldmia r4, {r4, r5, r6} @ phole ldm mul r7, r9, r3 mul r8, r9, r2 mul r3, r9, r1 mul r2, r9, r0 mul r1, r9, ip mul r0, r9, r4 mul ip, r9, r5 mul r4, r9, r6 stmia sl, {r7, r8} @ phole stm str r3, [sl, #8] str r2, [sl, #12] str r1, [sl, #16] str r0, [sl, #20] str ip, [sl, #24] str r4, [sl, #28] add sl, sl, #32 cmp sl, fp bne .L7 ldmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp} bx lr .size ManualUnroll, .-ManualUnroll .ident "GCC: (GNU) 4.4.0 20080530 (experimental)"