Hi,
while looking into the schedules produced for Bulldozer and Core I noticed that
they do not seem to match reality.  This is because ix86_issue_rate limits those
CPUs to 3 instructions per cycle, while they are designed to do 4, and because
ix86_adjust_cost is somewhat confused.

I also added the stack engine to modern chips, even though the scheduler doesn't
really understand that multiple push operations can happen in one cycle. At
least it now handles the stack updates in sequences of push/pop operations.

I did not update the Bulldozer issue rates yet.  The current scheduler model
won't allow it to execute more than 3 instructions per cycle (and 2 for
version 3).  I think bdver1.md/bdver3.md need to be updated first.

I am testing on x86_64-linux and will commit if there are no complaints.

Honza

        * i386.c (ix86_issue_rate): Pentium4/Nocona issue 2 instructions
        per cycle, Core/CoreI7/Haswell 4 instructions per cycle.
        (ix86_adjust_cost): Add stack engine to modern AMD chips;
        fix for core; remove Atom that mistakenly shared code with AMD.
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 203011)
+++ config/i386/i386.c  (working copy)
@@ -24435,17 +24435,14 @@ ix86_issue_rate (void)
     case PROCESSOR_SLM:
     case PROCESSOR_K6:
     case PROCESSOR_BTVER2:
+    case PROCESSOR_PENTIUM4:
+    case PROCESSOR_NOCONA:
       return 2;
 
     case PROCESSOR_PENTIUMPRO:
-    case PROCESSOR_PENTIUM4:
-    case PROCESSOR_CORE2:
-    case PROCESSOR_COREI7:
-    case PROCESSOR_HASWELL:
     case PROCESSOR_ATHLON:
     case PROCESSOR_K8:
     case PROCESSOR_AMDFAM10:
-    case PROCESSOR_NOCONA:
     case PROCESSOR_GENERIC:
     case PROCESSOR_BDVER1:
     case PROCESSOR_BDVER2:
@@ -24453,6 +24450,11 @@ ix86_issue_rate (void)
     case PROCESSOR_BTVER1:
       return 3;
 
+    case PROCESSOR_CORE2:
+    case PROCESSOR_COREI7:
+    case PROCESSOR_HASWELL:
+      return 4;
+
     default:
       return 1;
     }
@@ -24709,10 +24711,15 @@ ix86_adjust_cost (rtx insn, rtx link, rt
     case PROCESSOR_BDVER3:
     case PROCESSOR_BTVER1:
     case PROCESSOR_BTVER2:
-    case PROCESSOR_ATOM:
     case PROCESSOR_GENERIC:
       memory = get_attr_memory (insn);
 
+      /* Stack engine allows push&pop instructions to execute in parallel.  */
+      if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+         && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8))
+       return 0;
+
       /* Show ability of reorder buffer to hide latency of load by executing
         in parallel with previous instruction in case
         previous instruction is not needed to compute the address.  */
@@ -24737,6 +24744,29 @@ ix86_adjust_cost (rtx insn, rtx link, rt
          else
            cost = 0;
        }
+      break;
+
+    case PROCESSOR_CORE2:
+    case PROCESSOR_COREI7:
+    case PROCESSOR_HASWELL:
+      memory = get_attr_memory (insn);
+
+      /* Stack engine allows push&pop instructions to execute in parallel.  */
+      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+         && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+       return 0;
+
+      /* Show ability of reorder buffer to hide latency of load by executing
+        in parallel with previous instruction in case
+        previous instruction is not needed to compute the address.  */
+      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+         && !ix86_agi_dependent (dep_insn, insn))
+       {
+         if (cost >= 4)
+           cost -= 4;
+         else
+           cost = 0;
+       }
       break;
 
     case PROCESSOR_SLM:

Reply via email to