Hi!

As the testcase shows, for the memset (x, 0, y); snippet which handles y from 16 to 31 inclusive for some tunings, we generate:

.L36:
        movq $0, (%rdi)
        movq $0, 8(%rdi)
        movq $0, -8(%rsi,%rdi)

which is correct only for y from 16 to 24 inclusive. If y is 25 to 31, we clear the first 16 bytes and the last 8 bytes of the buffer, but leave 1 to 7 bytes in between untouched.

With this patch we emit:

.L36:
        movq $0, (%rdi)
        movq $0, 8(%rdi)
        movq $0, -16(%rsi,%rdi)
        movq $0, -8(%rsi,%rdi)

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2014-01-21 Jakub Jelinek <ja...@redhat.com> PR target/59003 * config/i386/i386.c (expand_small_movmem_or_setmem): If mode is smaller than size, perform several stores or loads and stores at dst + count - size to store or copy all of size bytes, rather than just last modesize bytes. * gcc.dg/tree-prof/pr59003.c: New test. --- gcc/config/i386/i386.c.jj 2014-01-20 19:12:56.000000000 +0100 +++ gcc/config/i386/i386.c 2014-01-21 16:46:19.965094839 +0100 @@ -23397,16 +23397,24 @@ expand_small_movmem_or_setmem (rtx destm } destmem = offset_address (destmem, count, 1); - destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)), + destmem = offset_address (destmem, GEN_INT (-2 * size), GET_MODE_SIZE (mode)); - if (issetmem) - emit_move_insn (destmem, gen_lowpart (mode, value)); - else + if (!issetmem) { srcmem = offset_address (srcmem, count, 1); - srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)), + srcmem = offset_address (srcmem, GEN_INT (-2 * size), GET_MODE_SIZE (mode)); - emit_move_insn (destmem, srcmem); + } + for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) + { + if (issetmem) + emit_move_insn (destmem, gen_lowpart (mode, value)); + else + { + emit_move_insn (destmem, srcmem); + srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); + } + destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); } emit_jump_insn (gen_jump (done_label)); emit_barrier (); --- gcc/testsuite/gcc.dg/tree-prof/pr59003.c.jj 2014-01-21 15:43:58.004385383 +0100 +++ gcc/testsuite/gcc.dg/tree-prof/pr59003.c 2014-01-21 15:45:51.000000000 +0100 @@ -0,0 +1,29 @@ +/* PR target/59003 */ +/* { dg-options "-O2" } */ +/* { dg-options "-O2 -mtune=amdfam10" { target i?86-*-* x86_64-*-* } } */ + +__attribute__((noinline, noclone)) void * +foo (void *p, unsigned int q) +{ + return __builtin_memset (p, 0, q * 4UL); +} + +char buf[128] __attribute__((aligned (32))); + +int +main 
() +{ + int i; + for (i = 0; i < 100000; i++) + foo (buf + 4, 1 + (i & 1)); + for (i = 0; i < 128; i++) + { + buf[i] = 'X'; + asm volatile ("" : : : "memory"); + } + foo (buf + 32, 7); + for (i = 0; i < 128; i++) + if (buf[i] != ((i < 32 || i >= 32 + 28) ? 'X' : 0)) + __builtin_abort (); + return 0; +} Jakub