Hi, 

The below patch fixes the FFT/Scimark regression caused by useless prefetch
generation.

This fix tries to make prefetch less aggressive by prefetching arrays in the
inner loop, when the step is invariant in the entire loop nest.

GCC currently tries to prefetch invariant steps when they are in the inner
loop. But does not check if the step is variant in outer loops.

In the scimark FFT case, the trip count of the inner loop varies by a non
constant step, which is invariant in the inner loop. 
But the step variable is varying in outer loop. This makes
inner loop trip count small (at run time varies sometimes as small as 1
iteration) 

Prefetching ahead x iteration when the inner loop trip count is smaller than x
leads to useless prefetches. 

Flag used: -O3 -march=amdfam10 

Before 
**                                                              **
** SciMark2 Numeric Benchmark, see http://math.nist.gov/scimark **
** for details. (Results can be submitted to p...@nist.gov)     **
**                                                              **
Using       2.00 seconds min time per kenel.
Composite Score:          550.50
FFT             Mflops:    38.66    (N=1024)
SOR             Mflops:   617.61    (100 x 100)
MonteCarlo:     Mflops:   173.74
Sparse matmult  Mflops:   675.63    (N=1000, nz=5000)
LU              Mflops:  1246.88    (M=100, N=100)


After 
**                                                              **
** SciMark2 Numeric Benchmark, see http://math.nist.gov/scimark **
** for details. (Results can be submitted to p...@nist.gov)     **
**                                                              **
Using       2.00 seconds min time per kenel.
Composite Score:          639.20
FFT             Mflops:   479.19    (N=1024)
SOR             Mflops:   617.61    (100 x 100)
MonteCarlo:     Mflops:   173.18
Sparse matmult  Mflops:   679.13    (N=1000, nz=5000)
LU              Mflops:  1246.88    (M=100, N=100)

GCC regression "make check -k" passes with x86_64-unknown-linux-gnu
New tests that PASS:

gcc.dg/pr53397-1.c scan-assembler prefetcht0
gcc.dg/pr53397-1.c scan-tree-dump aprefetch "Issued prefetch"
gcc.dg/pr53397-1.c (test for excess errors)
gcc.dg/pr53397-2.c scan-tree-dump aprefetch "loop variant step"
gcc.dg/pr53397-2.c scan-tree-dump aprefetch "Not prefetching"
gcc.dg/pr53397-2.c (test for excess errors)


Checked CPU2006 and polyhedron on latest AMD processor, no regressions noted.

Ok to commit in trunk?

regards,
Venkat

gcc/ChangeLog
+2012-10-01  Venkataramanan Kumar  <venkataramanan.ku...@amd.com>
+
+   * tree-ssa-loop-prefetch.c (gather_memory_references_ref):$
+   Perform non constant step prefetching in inner loop, only $
+   when it is invariant in the entire loop nest.  $
+   * testsuite/gcc.dg/pr53397-1.c: New test case $
+   Checks we are prefecthing for loop invariant steps$
+   * testsuite/gcc.dg/pr53397-2.c: New test case$
+   Checks we are not prefecthing for loop variant steps
+


Index: gcc/testsuite/gcc.dg/pr53397-1.c
===================================================================
--- gcc/testsuite/gcc.dg/pr53397-1.c    (revision 0)
+++ gcc/testsuite/gcc.dg/pr53397-1.c    (revision 0)
@@ -0,0 +1,28 @@
+/* Prefetching when the step is loop invariant.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -fprefetch-loop-arrays -fdump-tree-aprefetch-details 
--param min-insn-to-prefetch-ratio=3 --param simultaneous-prefetches=10 
-fdump-tree-aprefetch-details" } */
+
+
+double data[16384];
+void prefetch_when_non_constant_step_is_invariant(int step, int n)
+{
+     int a;
+     int b;
+     for (a = 1; a < step; a++) {
+        for (b = 0; b < n; b += 2 * step) {
+
+          int i = 2*(b + a);
+          int j = 2*(b + a + step);
+
+
+          data[j]   = data[i];
+          data[j+1] = data[i+1];
+        }
+     }
+}
+
+/* { dg-final { scan-tree-dump "Issued prefetch" "aprefetch" } } */
+/* { dg-final { scan-assembler "prefetcht0" } } */
+
+/* { dg-final { cleanup-tree-dump "aprefetch" } } */
Index: gcc/testsuite/gcc.dg/pr53397-2.c
===================================================================
--- gcc/testsuite/gcc.dg/pr53397-2.c    (revision 0)
+++ gcc/testsuite/gcc.dg/pr53397-2.c    (revision 0)
@@ -0,0 +1,29 @@
+/* Not prefetching when the step is loop variant.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -fprefetch-loop-arrays -fdump-tree-aprefetch-details 
--param min-insn-to-prefetch-ratio=3 --param simultaneous-prefetches=10 
-fdump-tree-aprefetch-details" } */
+
+
+double data[16384];
+void donot_prefetch_when_non_constant_step_is_variant(int step, int n)
+{ 
+     int a;
+     int b;
+     for (a = 1; a < step; a++,step*=2) {
+        for (b = 0; b < n; b += 2 * step) {
+
+          int i = 2*(b + a);
+          int j = 2*(b + a + step);
+
+
+          data[j]   = data[i];
+          data[j+1] = data[i+1];
+        }
+     } 
+}
+
+/* { dg-final { scan-tree-dump "Not prefetching" "aprefetch" } } */
+/* { dg-final { scan-tree-dump "loop variant step" "aprefetch" } } */
+
+/* { dg-final { cleanup-tree-dump "aprefetch" } } */
+
Index: gcc/tree-ssa-loop-prefetch.c
===================================================================
--- gcc/tree-ssa-loop-prefetch.c        (revision 191642)
+++ gcc/tree-ssa-loop-prefetch.c        (working copy)
@@ -523,6 +523,7 @@
   tree base, step;
   HOST_WIDE_INT delta;
   struct mem_ref_group *agrp;
+  loop_p ploop;
 
   if (get_base_address (ref) == NULL)
     return false;
@@ -537,10 +538,50 @@
   if (may_be_nonaddressable_p (base))
     return false;
 
-  /* Limit non-constant step prefetching only to the innermost loops.  */
-  if (!cst_and_fits_in_hwi (step) && loop->inner != NULL)
-    return false;
+  /* Limit non-constant step prefetching only to the innermost loops and 
+     only when the step is invariant in the entire loop nest. */
+  if (!cst_and_fits_in_hwi (step))
+    {
+      if( loop->inner != NULL)
+        {
+          if (dump_file && (dump_flags & TDF_DETAILS))
+            {
+              fprintf (dump_file, "Reference %p:\n", (void *) ref);
+              fprintf (dump_file, "(base " );
+              print_generic_expr (dump_file, base, TDF_SLIM);
+              fprintf (dump_file, ", step ");
+              print_generic_expr (dump_file, step, TDF_TREE);
+              fprintf (dump_file, ")\n");
+              fprintf (dump_file, "Ignoring %p, non-constant step prefetching\
+                is limited to inner most loops \n",(void *) ref);
+            }
+            return false;    
+         }
+      else
+        {
+          ploop = loop;
 
+          while (loop_outer (ploop))
+            {
+              if (!expr_invariant_in_loop_p (ploop , step))
+                {
+                  if (dump_file && (dump_flags & TDF_DETAILS))
+                    {
+                      fprintf (dump_file, "Reference %p:\n", (void *) ref);
+                      fprintf (dump_file, "(base " );
+                      print_generic_expr (dump_file, base, TDF_SLIM);
+                      fprintf (dump_file, ", step ");
+                      print_generic_expr (dump_file, step, TDF_TREE);
+                      fprintf (dump_file, ")\n");
+                      fprintf (dump_file, "Not prefetching, ignoring %p due to 
loop variant step\n",(void *) ref);
+                    }
+                      return false;                 
+                }
+                ploop = loop_outer (ploop);
+            }
+        }
+    }
+
   /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
      are integer constants.  */
   agrp = find_or_create_group (refs, base, step);

Reply via email to