The stack usage of the code generated by gcc-4.x looks inefficient on x86 and x86_64. A simple test case is below: ======== #define copy_from_asm(x, addr, err) \ asm volatile( \ "1:\tmovl %2, %1\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "\txor %1,%1\n" \ "\tmov $1,%0\n" \ "\tjmp 2b\n" \ ".previous\n" \ : "=r" (err), "=r" (x) \ : "m" (*(int*)(addr)), "0" (err))
#define copy_from(x, addr, err) do { \ (err) = 0; \ copy_from_asm((x), (addr), (err)); \ } while (0) #define copy(x, addr) ({ \ int __err; \ copy_from((x), (addr), __err); \ __err; \ }) int src[32]; int dst[32]; #define my_copy(x) do { err |= copy(dst[x], &src[x]); } while (0) int test(void) { int err = 0; my_copy( 0); my_copy( 1); my_copy( 2); my_copy( 3); my_copy( 4); my_copy( 5); my_copy( 6); my_copy( 7); my_copy( 8); my_copy( 9); my_copy(10); my_copy(11); my_copy(12); my_copy(13); my_copy(14); my_copy(15); my_copy(16); my_copy(17); my_copy(18); my_copy(19); my_copy(20); my_copy(21); my_copy(22); my_copy(23); my_copy(24); my_copy(25); my_copy(26); my_copy(27); my_copy(28); my_copy(29); my_copy(30); my_copy(31); return err; } ====== I compiled this test case with gcc-3.4.6, 4.2.4, 4.3.2 and 4.4-20081205, using the compile options "-g -Os -mno-red-zone". The code sizes of the resulting objects are below: $ size test.o.* text data bss dec hex filename 945 0 0 945 3b1 test.o.34 1157 0 0 1157 485 test.o.42 1133 0 0 1133 46d test.o.43 1201 0 0 1201 4b1 test.o.44 gcc-3.4.6 generates: 0000000000000000 <test>: 0: 31 c9 xor %ecx,%ecx 2: 8b 05 00 00 00 00 mov 0x0(%rip),%eax # 8 <test+0x8> 8: 89 05 00 00 00 00 mov %eax,0x0(%rip) # e <test+0xe> e: 31 c0 xor %eax,%eax 10: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # 16 <test+0x16> 16: 09 c1 or %eax,%ecx 18: 89 15 00 00 00 00 mov %edx,0x0(%rip) # 1e <test+0x1e> 1e: 31 c0 xor %eax,%eax 20: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # 26 <test+0x26> 26: 09 c1 or %eax,%ecx gcc-4.4 generates: 0000000000000000 <test>: 0: 41 57 push %r15 2: 31 c0 xor %eax,%eax 4: 41 56 push %r14 6: 41 55 push %r13 8: 41 89 c5 mov %eax,%r13d b: 41 54 push %r12 d: 55 push %rbp e: 53 push %rbx f: 48 83 ec 58 sub $0x58,%rsp 13: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # 19 <test+0x19> 19: 89 15 00 00 00 00 mov %edx,0x0(%rip) # 1f <test+0x1f> 1f: 41 89 c6 mov %eax,%r14d 22: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # 28 <test+0x28> 28: 89 15 00 00 00 00 mov %edx,0x0(%rip) # 2e <test+0x2e> 
2e: 41 89 c4 mov %eax,%r12d ... bf: 31 d2 xor %edx,%edx c1: 44 8b 3d 00 00 00 00 mov 0x0(%rip),%r15d # c8 <test+0xc8> c8: 89 54 24 04 mov %edx,0x4(%rsp) cc: 44 89 3d 00 00 00 00 mov %r15d,0x0(%rip) # d3 <test+0xd3> d3: 45 31 ff xor %r15d,%r15d d6: 8b 15 00 00 00 00 mov 0x0(%rip),%edx # dc <test+0xdc> dc: 44 89 7c 24 54 mov %r15d,0x54(%rsp) ... 26d: 0b 54 24 54 or 0x54(%rsp),%edx 271: 0b 54 24 50 or 0x50(%rsp),%edx 275: 0b 54 24 4c or 0x4c(%rsp),%edx 279: 0b 54 24 48 or 0x48(%rsp),%edx ... On gcc-4.x, the error values are temporarily stored on the stack, and all of the stored values are OR'ed together at the end. This stack usage seems inefficient. -- Summary: [4.2/4.3/4.4 regression] Inefficient stack usage Product: gcc Version: 4.4.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: h-shimamoto at ct dot jp dot nec dot com GCC host triplet: x86_64-unknown-linux-gnu http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38533