Hi!

As the testcase shows, for some tunings the memset (x, 0, y); code path
that handles y from 16 to 31 inclusive is currently expanded to:
 .L36:
        movq    $0, (%rdi)
        movq    $0, 8(%rdi)
        movq    $0, -8(%rsi,%rdi)
which is correct only for y from 16 to 24 inclusive.  If y is 25 to 31,
we clear the first 16 bytes and the last 8 bytes of the buffer, but leave
the 1 to 7 bytes in between untouched.
With this patch we emit:
 .L36:
        movq    $0, (%rdi)
        movq    $0, 8(%rdi)
        movq    $0, -16(%rsi,%rdi)
        movq    $0, -8(%rsi,%rdi)

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2014-01-21  Jakub Jelinek  <ja...@redhat.com>

        PR target/59003
        * config/i386/i386.c (expand_small_movmem_or_setmem): If mode is
        smaller than size, perform several stores or loads and stores
        at dst + count - size to store or copy all of size bytes, rather
        than just last modesize bytes.

        * gcc.dg/tree-prof/pr59003.c: New test.

--- gcc/config/i386/i386.c.jj   2014-01-20 19:12:56.000000000 +0100
+++ gcc/config/i386/i386.c      2014-01-21 16:46:19.965094839 +0100
@@ -23397,16 +23397,24 @@ expand_small_movmem_or_setmem (rtx destm
     }
 
   destmem = offset_address (destmem, count, 1);
-  destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
+  destmem = offset_address (destmem, GEN_INT (-2 * size),
                            GET_MODE_SIZE (mode));
-  if (issetmem)
-    emit_move_insn (destmem, gen_lowpart (mode, value));
-  else
+  if (!issetmem)
     {
       srcmem = offset_address (srcmem, count, 1);
-      srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
+      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
                               GET_MODE_SIZE (mode));
-      emit_move_insn (destmem, srcmem);
+    }
+  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
+    {
+      if (issetmem)
+       emit_move_insn (destmem, gen_lowpart (mode, value));
+      else
+       {
+         emit_move_insn (destmem, srcmem);
+         srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
+       }
+      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
     }
   emit_jump_insn (gen_jump (done_label));
   emit_barrier ();
--- gcc/testsuite/gcc.dg/tree-prof/pr59003.c.jj 2014-01-21 15:43:58.004385383 +0100
+++ gcc/testsuite/gcc.dg/tree-prof/pr59003.c    2014-01-21 15:45:51.000000000 +0100
@@ -0,0 +1,29 @@
+/* PR target/59003 */
+/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mtune=amdfam10" { target i?86-*-* x86_64-*-* } } */
+
+__attribute__((noinline, noclone)) void *
+foo (void *p, unsigned int q)
+{
+  return __builtin_memset (p, 0, q * 4UL);
+}
+
+char buf[128] __attribute__((aligned (32)));
+
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 100000; i++)
+    foo (buf + 4, 1 + (i & 1));
+  for (i = 0; i < 128; i++)
+    {
+      buf[i] = 'X';
+      asm volatile ("" : : : "memory");
+    }
+  foo (buf + 32, 7);
+  for (i = 0; i < 128; i++)
+    if (buf[i] != ((i < 32 || i >= 32 + 28) ? 'X' : 0))
+      __builtin_abort ();
+  return 0;
+}

        Jakub

Reply via email to