http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55953
Bug #: 55953 Summary: hand loop faster than builtin memset Classification: Unclassified Product: gcc Version: unknown Status: UNCONFIRMED Severity: normal Priority: P3 Component: c AssignedTo: unassig...@gcc.gnu.org ReportedBy: dushis...@mail.ru variant 1: char c[100]; void f(void) { for(int i=0; i < 100; ++i) c[i] = '0'; } assembly: push %rbp vmovdqa 0x117(%rip),%ymm0 # 0x400960 mov %rsp,%rbp pop %rbp movb $0x30,0x20086c(%rip)# 0x6010c0 <c+96> vmovdqa %ymm0,0x200804(%rip)# 0x601060 <c> vmovdqa %ymm0,0x20081c(%rip)# 0x601080 <c+32> vmovdqa %ymm0,0x200834(%rip)# 0x6010a0 <c+64> movb $0x30,0x20084e(%rip)# 0x6010c1 <c+97> movb $0x30,0x200848(%rip)# 0x6010c2 <c+98> movb $0x30,0x200842(%rip)# 0x6010c3 <c+99> vzeroupper retq variant 2: char c[100]; void f(void) { memset(c, '0', 100); } assembly: movabs $0x3030303030303030,%rax movl $0x30303030,0x20086c(%rip) # 0x6010c0 <c+96> mov %rax,0x200805(%rip) # 0x601060 <c> mov %rax,0x200806(%rip) # 0x601068 <c+8> mov %rax,0x200807(%rip) # 0x601070 <c+16> mov %rax,0x200808(%rip) # 0x601078 <c+24> mov %rax,0x200809(%rip) # 0x601080 <c+32> mov %rax,0x20080a(%rip) # 0x601088 <c+40> mov %rax,0x20080b(%rip) # 0x601090 <c+48> mov %rax,0x20080c(%rip) # 0x601098 <c+56> mov %rax,0x20080d(%rip) # 0x6010a0 <c+64> mov %rax,0x20080e(%rip) # 0x6010a8 <c+72> mov %rax,0x20080f(%rip) # 0x6010b0 <c+80> mov %rax,0x200810(%rip) # 0x6010b8 <c+88> retq The first variant takes (for (size_t i = 0; i < 10000000; ++i) f();): 0.150000 secs, abs 0.150255. The second variant takes: 0.170000 secs, abs 0.175502. CPU: Intel i7, gcc --version: gcc (Gentoo 4.7.2 p1.3, pie-0.5.5) 4.7.2, compile options: -Ofast -march=native. Expected behaviour: the assembly code for both variants should be the same, and it should be that of variant 1, or something faster.