https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111657
--- Comment #5 from Uroš Bizjak <ubizjak at gmail dot com> ---
I have tried compiling with -mtune=nocona, which has the following table:
/* Two-entry table of memcpy expansion strategies used for -mtune=nocona.
   NOTE(review): the definition of stringop_algs is not shown here.  Each
   inner triple looks like {size threshold, algorithm, flag}, with -1
   presumably meaning "no upper limit", and the leading `libcall` looks like
   the choice for copies of unknown size -- confirm against the type's
   declaration.  The two rows presumably correspond to 32-bit and 64-bit
   code generation respectively -- TODO confirm.  */
static stringop_algs nocona_memcpy[2] = {
{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
{100000, unrolled_loop, false}, {-1, libcall, false}}}};
and the compiler produces the expected code in both cases (it uses unrolled_loop
when rep movsq is unavailable):
# foo: copies 30 quadwords (240 bytes) from the object t, addressed relative
# to the value loaded from %fs:0, into the buffer addressed by %rdi, using a
# single rep movsq.
# NOTE(review): %rdi is never written in this function, so the destination
# pointer comes from the caller -- presumably the first SysV AMD64 argument;
# confirm against the C test case.
foo:
movq %fs:0, %rdx                # rdx = quadword stored at %fs:0 (presumably the thread pointer -- confirm)
leaq t@tpoff(%rdx), %rsi        # rsi = rdx + t@tpoff, the source address of t
movl $30, %ecx                  # element count = 30 quadwords; the 32-bit write also zeroes the upper half of rcx
rep movsq                       # copy rcx quadwords from (%rsi) to (%rdi)
ret
# bar: copies 240 bytes from the object s, read through the %gs segment
# override, into the buffer addressed by %rdi. No rep movsq is used here;
# the copy is an unrolled loop moving 32 bytes per iteration (7 iterations =
# 224 bytes) followed by a straight-line 16-byte tail.
# NOTE(review): %rdi is not written before the loop, so the destination
# pointer comes from the caller -- presumably the first SysV AMD64 argument;
# confirm against the C test case.
bar:
xorl %edx, %edx                 # edx = running byte offset, starts at 0
.L4:
movl %edx, %eax                 # rax = zero-extended copy of the offset, used as the index below
movq %gs:s(%rax), %r9           # load four consecutive quadwords from %gs:s + offset ...
movq %gs:s+8(%rax), %r8
movq %gs:s+16(%rax), %rsi
movq %gs:s+24(%rax), %rcx
movq %r9, (%rdi,%rax)           # ... then store them at the same offset in the destination
movq %r8, 8(%rdi,%rax)
movq %rsi, 16(%rdi,%rax)
movq %rcx, 24(%rdi,%rax)
addl $32, %edx                  # advance the offset by the 32 bytes just copied
cmpl $224, %edx
jb .L4                          # unsigned compare: repeat while offset < 224
addq %rdx, %rdi                 # rdi = destination + 224; rdx upper half is zero because only 32-bit writes touched it
movq %gs:s(%rdx), %rax          # tail: copy the quadword at offset 224 ...
movq %rax, (%rdi)
movq %gs:s+8(%rdx), %rax        # ... and the quadword at offset 232, for 240 bytes in total
movq %rax, 8(%rdi)
ret