The code generated by gcc-4.x uses the stack inefficiently on x86 and
x86_64. A simple test case is below:
/*
 * copy_from_asm(x, addr, err) - load an int from memory with inline asm.
 *
 * Operands: %0 = err (output, tied to the input err through the "0"
 * constraint), %1 = x (output register), %2 = *(int *)addr (memory input).
 *
 * Label 1 is the movl that loads *addr into x; label 2 is the point right
 * after it.  The .fixup section holds recovery code that zeroes x, sets err
 * to 1 and jumps back to label 2.  That fixup code has no label of its own
 * and nothing in this snippet branches to it, so on the straight-line path
 * err keeps the value it had on entry.
 * NOTE(review): this looks modelled on the Linux kernel's uaccess fixup
 * pattern, minus the exception-table entry -- confirm with the reporter.
 */
#define copy_from_asm(x, addr, err)     \
asm volatile(                           \
        "1:\tmovl %2, %1\n"             \
        "2:\n"                          \
        ".section .fixup,\"ax\"\n"      \
        "\txor %1,%1\n"                 \
        "\tmov $1,%0\n"                 \
        "\tjmp 2b\n"                    \
        ".previous\n"                   \
        : "=r" (err), "=r" (x)          \
        : "m" (*(int*)(addr)), "0" (err))

/*
 * copy_from(x, addr, err) - clear err, then load *addr into x through
 * copy_from_asm().  err must be an lvalue; it is assigned 0 before the asm
 * statement and is also passed to it as an in/out operand.
 */
#define copy_from(x, addr, err) do {            \
        (err) = 0;                              \
        copy_from_asm((x), (addr), (err));      \
} while (0)

/*
 * copy(x, addr) - expression form of copy_from().
 *
 * A GNU statement expression: declares a fresh local __err, runs
 * copy_from(x, addr, __err) and yields __err as the value of the whole
 * expression.  Each expansion therefore has its own error variable.
 */
#define copy(x, addr)   ({              \
        int __err;                      \
        copy_from((x), (addr), __err);  \
        __err;                          \
})

/* Source and destination buffers for the copies; 32 ints each, used by my_copy(). */
int src[32];
int dst[32];

/*
 * my_copy(x) - copy src[x] into dst[x] via copy() and OR the result into a
 * variable named err, which must be in scope where the macro is expanded.
 */
#define my_copy(x)      do { err |= copy(dst[x], &src[x]); } while (0)

/*
 * test() - copy all 32 elements of src[] into dst[], one my_copy() per
 * element, accumulating each copy's error value into err with OR.
 *
 * The copies are written out individually rather than in a loop; this is
 * the form that was compiled for the code-size and disassembly comparison
 * in this report.
 *
 * Returns the OR of the 32 per-copy error values.
 */
int test(void)
{
        int err = 0;

        my_copy( 0); my_copy( 1); my_copy( 2); my_copy( 3);
        my_copy( 4); my_copy( 5); my_copy( 6); my_copy( 7);
        my_copy( 8); my_copy( 9); my_copy(10); my_copy(11);
        my_copy(12); my_copy(13); my_copy(14); my_copy(15);
        my_copy(16); my_copy(17); my_copy(18); my_copy(19);
        my_copy(20); my_copy(21); my_copy(22); my_copy(23);
        my_copy(24); my_copy(25); my_copy(26); my_copy(27);
        my_copy(28); my_copy(29); my_copy(30); my_copy(31);

        return err;
}

I compiled this test case with gcc-3.4.6, 4.2.4, 4.3.2 and 4.4-20081205,
using the compile options "-g -Os -mno-red-zone".
The code sizes of the resulting objects are below:
$ size test.o.*
   text    data     bss     dec     hex filename
    945       0       0     945     3b1 test.o.34
   1157       0       0    1157     485 test.o.42
   1133       0       0    1133     46d test.o.43
   1201       0       0    1201     4b1 test.o.44

gcc-3.4.6 generates:
0000000000000000 <test>:
   0:   31 c9                   xor    %ecx,%ecx
   2:   8b 05 00 00 00 00       mov    0x0(%rip),%eax        # 8 <test+0x8>
   8:   89 05 00 00 00 00       mov    %eax,0x0(%rip)        # e <test+0xe>
   e:   31 c0                   xor    %eax,%eax
  10:   8b 15 00 00 00 00       mov    0x0(%rip),%edx        # 16 <test+0x16>
  16:   09 c1                   or     %eax,%ecx
  18:   89 15 00 00 00 00       mov    %edx,0x0(%rip)        # 1e <test+0x1e>
  1e:   31 c0                   xor    %eax,%eax
  20:   8b 15 00 00 00 00       mov    0x0(%rip),%edx        # 26 <test+0x26>
  26:   09 c1                   or     %eax,%ecx

gcc-4.4 generates:
0000000000000000 <test>:
   0:   41 57                   push   %r15
   2:   31 c0                   xor    %eax,%eax
   4:   41 56                   push   %r14
   6:   41 55                   push   %r13
   8:   41 89 c5                mov    %eax,%r13d
   b:   41 54                   push   %r12
   d:   55                      push   %rbp
   e:   53                      push   %rbx
   f:   48 83 ec 58             sub    $0x58,%rsp
  13:   8b 15 00 00 00 00       mov    0x0(%rip),%edx        # 19 <test+0x19>
  19:   89 15 00 00 00 00       mov    %edx,0x0(%rip)        # 1f <test+0x1f>
  1f:   41 89 c6                mov    %eax,%r14d
  22:   8b 15 00 00 00 00       mov    0x0(%rip),%edx        # 28 <test+0x28>
  28:   89 15 00 00 00 00       mov    %edx,0x0(%rip)        # 2e <test+0x2e>
  2e:   41 89 c4                mov    %eax,%r12d
  bf:   31 d2                   xor    %edx,%edx
  c1:   44 8b 3d 00 00 00 00    mov    0x0(%rip),%r15d        # c8 <test+0xc8>
  c8:   89 54 24 04             mov    %edx,0x4(%rsp)
  cc:   44 89 3d 00 00 00 00    mov    %r15d,0x0(%rip)        # d3 <test+0xd3>
  d3:   45 31 ff                xor    %r15d,%r15d
  d6:   8b 15 00 00 00 00       mov    0x0(%rip),%edx        # dc <test+0xdc>
  dc:   44 89 7c 24 54          mov    %r15d,0x54(%rsp)
 26d:   0b 54 24 54             or     0x54(%rsp),%edx
 271:   0b 54 24 50             or     0x50(%rsp),%edx
 275:   0b 54 24 4c             or     0x4c(%rsp),%edx
 279:   0b 54 24 48             or     0x48(%rsp),%edx

On gcc-4.x, the error values are temporarily stored on the stack, and all of
the stored values are ORed together at the end. This stack usage seems
inefficient.

           Summary: [4.2/4.3/4.4 regression] Inefficient stack usage
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: h-shimamoto at ct dot jp dot nec dot com
  GCC host triplet: x86_64-unknown-linux-gnu

Reply via email to