This is a patch to prevent the scheduler from scheduling a compare and its conditional branch apart, in order to increase macro-fusion opportunities on recent x86 platforms.
double __attribute__ ((noinline)) bar (double sum); int a[100]; double bar (double sum) { int i; for (i = 0; i < 1000000; i++) sum += (0.5 + (a[i%100] - 128)); return sum; } int main() { double total; int i; for (i = 0; i < 1000; i++) total += bar (i); return total != 0.333; } ~/workarea/gcc-r201963/build/install/bin/gcc -O2 -mtune=corei7-avx 1.c -o 1.out The binary of the kernel loop in func bar () is: 401180: 89 c8 mov %ecx,%eax 401182: 66 0f 57 c9 xorpd %xmm1,%xmm1 401186: f7 ee imul %esi 401188: 89 c8 mov %ecx,%eax 40118a: c1 f8 1f sar $0x1f,%eax 40118d: c1 fa 05 sar $0x5,%edx 401190: 29 c2 sub %eax,%edx 401192: b8 64 00 00 00 mov $0x64,%eax 401197: 0f af d0 imul %eax,%edx 40119a: 89 c8 mov %ecx,%eax 40119c: 83 c1 01 add $0x1,%ecx 40119f: 29 d0 sub %edx,%eax 4011a1: 48 98 cltq 4011a3: 8b 04 85 60 51 6c 00 mov 0x6c5160(,%rax,4),%eax 4011aa: 83 c0 80 add $0xffffff80,%eax 4011ad: 81 f9 40 42 0f 00 cmp $0xf4240,%ecx 4011b3: f2 0f 2a c8 cvtsi2sd %eax,%xmm1 4011b7: f2 0f 58 ca addsd %xmm2,%xmm1 4011bb: f2 0f 58 c1 addsd %xmm1,%xmm0 4011bf: 75 bf jne 401180 <bar+0x10> Here cmp (addr: 4011ad) and jne (addr: 4011bf) are not consecutive in object code, but they are consecutive before sched2 pass. If we manually keep the cmp and jne together, the performance of 1.out changes from 2.40s to 2.31s on a sandybridge machine. Perf stat result shows that UOPS_RETIRED.MACRO_FUSED event increases from 131,075 to 1,000,130,308, and UOPS_RETIRED.ANY event decreases from 23,002,543,637 to 22,002,511,525. The patch is to reschedule cmp and jmp to make them consecutive. It is done at the end of scheduling each block before schedule result is commited. bootstrapped and regression ok on x86_64-linux-gnu. ok for trunk? 2013-09-03 Wei Mi <w...@google.com> * haifa-sched.c (move_insns): New function. (adjust_for_macro_fusion): Ditto. (schedule_block): Call adjust_for_macro_fusion before commit schedule. * doc/tm.texi.in: Generated. * doc/tm.texi: Ditto. 
* config/i386/x86-tune.def (DEF_TUNE): Add m_COREI7 for X86_TUNE_FUSE_CMP_AND_BRANCH. * config/i386/i386.c (ix86_macro_fusion_p): New function. (ix86_macro_fusion_pair_p): Ditto. * target.def: Add macro_fusion_p and macro_fusion_pair_p in sched group. Index: haifa-sched.c =================================================================== --- haifa-sched.c (revision 201963) +++ haifa-sched.c (working copy) @@ -5605,6 +5605,56 @@ choose_ready (struct ready_list *ready, } } +/* Move insn scheduled_insns[I] to the position J in scheduled_insns. */ + +static void +move_insns (int i, int j) +{ + rtx insn = scheduled_insns[i]; + scheduled_insns.ordered_remove (i); + scheduled_insns.safe_insert (j, insn); +} + +/* If the last cond jump and the cond register setting insn are consecutive + before scheduling, and are scheduled away from each other, this func + tries to rearrange insns in scheduled_insns and keep those two insns + together. This is good for performance on microarchitectures supporting + macro-fusion. */ + +static void +adjust_for_macro_fusion () +{ + int i = -1, length; + unsigned int condreg1, condreg2; + rtx cc_reg_1; + rtx insn; + rtx last = scheduled_insns.last(); + + targetm.fixed_condition_code_regs (&condreg1, &condreg2); + cc_reg_1 = gen_rtx_REG (CCmode, condreg1); + length = scheduled_insns.length (); + if (any_condjump_p (last) && reg_referenced_p (cc_reg_1, PATTERN (last))) + { + for (i = length - 2; i >= 0; i--) + { + insn = scheduled_insns[i]; + if (modified_in_p (cc_reg_1, insn)) + break; + } + } + if (i < 0 || i == length - 2) + return; + + if (NEXT_INSN (insn) != last) + return; + + if (!targetm.sched.macro_fusion_pair_p + || !targetm.sched.macro_fusion_pair_p (insn, last)) + return; + + move_insns (i, length - 2); +} + /* This function is called when we have successfully scheduled a block. It uses the schedule stored in the scheduled_insns vector to rearrange the RTL. 
PREV_HEAD is used as the anchor to which we @@ -6421,6 +6471,9 @@ schedule_block (basic_block *target_bb, if (success) { + if (targetm.sched.macro_fusion_p + && targetm.sched.macro_fusion_p ()) + adjust_for_macro_fusion (); commit_schedule (prev_head, tail, target_bb); if (sched_verbose) fprintf (sched_dump, ";; total time = %d\n", clock_var); Index: doc/tm.texi.in =================================================================== --- doc/tm.texi.in (revision 201963) +++ doc/tm.texi.in (working copy) @@ -4940,6 +4940,10 @@ them: try the first ones in this list fi @hook TARGET_SCHED_REORDER2 +@hook TARGET_SCHED_MACRO_FUSION_P + +@hook TARGET_SCHED_MACRO_FUSION_PAIR_P + @hook TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK @hook TARGET_SCHED_INIT Index: doc/tm.texi =================================================================== --- doc/tm.texi (revision 201963) +++ doc/tm.texi (working copy) @@ -6553,6 +6553,18 @@ scheduling one insn causes other insns t cycle. These other insns can then be taken into account properly. @end deftypefn +@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_P (void) +This hook is used to check whether target platform supports macro fusion. +@end deftypefn + +@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_PAIR_P (rtx @var{condgen}, rtx @var{condjmp}) +This hook is used to check whether two insns could be macro fused for +target microarchitecture. Now it is used in scheduler to adjust scheduling +result for macro-fusion. If this hook returns true for the given insn pair +(@var{condgen} and @var{condjmp}), scheduler will reschedule @var{condgen} +to the position just before condjmp before commit the scheduling result. 
+@end deftypefn + @deftypefn {Target Hook} void TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK (rtx @var{head}, rtx @var{tail}) This hook is called after evaluation forward dependencies of insns in chain given by two parameter values (@var{head} and @var{tail} Index: config/i386/x86-tune.def =================================================================== --- config/i386/x86-tune.def (revision 201963) +++ config/i386/x86-tune.def (working copy) @@ -196,7 +196,8 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction with a subsequent conditional jump instruction into a single compare-and-branch uop. */ -DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER) +DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", + m_COREI7 | m_BDVER) /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag will impact LEA instruction selection. */ DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM) Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 201963) +++ config/i386/i386.c (working copy) @@ -24850,6 +24850,99 @@ ia32_multipass_dfa_lookahead (void) } } +/* Return true if target platform supports macro-fusion. */ + +static bool +ix86_macro_fusion_p () +{ + if (TARGET_FUSE_CMP_AND_BRANCH) + return true; + else + return false; +} + +/* Check whether current microarchitecture support macro fusion + for insn pair "CONDGEN + CONDJMP". Refer to + "Intel Architectures Optimization Reference Manual". */ + +static bool +ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp) +{ + rtx src; + if (strcmp (ix86_tune_string, "corei7")) + { + /* For Nehalem. */ + rtx single_set = single_set (condgen); + /* Nehalem doesn't support macro-fusion for add/sub+jmp. 
*/ + if (single_set == NULL_RTX) + return false; + + src = SET_SRC (single_set); + if (GET_CODE (src) != COMPARE) + return false; + + /* Nehalem doesn't support macro-fusion for cmp/test MEM-IMM + insn pattern. */ + if ((MEM_P (XEXP (src, 0)) + && CONST_INT_P (XEXP (src, 1))) + || (MEM_P (XEXP (src, 1)) + && CONST_INT_P (XEXP (src, 0)))) + return false; + + /* Nehalem doesn't support macro-fusion for add/sub/dec/inc + jmp. */ + if (get_attr_type (condgen) != TYPE_TEST + && get_attr_type (condgen) != TYPE_ICMP) + return false; + return true; + } + else if (strcmp (ix86_tune_string, "corei7-avx")) + { + /* For Sandybridge. */ + enum rtx_code ccode; + rtx compare_set = NULL_RTX, test_if, cond; + rtx single_set = single_set (condgen); + if (single_set != NULL_RTX) + compare_set = single_set; + else + { + int i; + rtx pat = PATTERN (condgen); + for (i = 0; i < XVECLEN (pat, 0); i++) + if (GET_CODE (XVECEXP (pat, 0, i)) == SET + && GET_CODE (SET_SRC (XVECEXP (pat, 0, i))) == COMPARE) + compare_set = XVECEXP (pat, 0, i); + } + + if (compare_set == NULL_RTX) + return false; + src = SET_SRC (compare_set); + if (GET_CODE (src) != COMPARE) + return false; + + /* Sandybridge doesn't support macro-fusion for cmp/test MEM-IMM + insn pattern. */ + if ((MEM_P (XEXP (src, 0)) + && CONST_INT_P (XEXP (src, 1))) + || (MEM_P (XEXP (src, 1)) + && CONST_INT_P (XEXP (src, 0)))) + return false; + + /* Sandybridge doesn't support macro-fusion for inc/dec + + unsigned comparison jmp. */ + test_if = SET_SRC (pc_set (condjmp)); + cond = XEXP (test_if, 0); + ccode = GET_CODE (cond); + if (get_attr_type (condgen) == TYPE_INCDEC + && (ccode == GEU + || ccode == GTU + || ccode == LEU + || ccode == LTU)) + return false; + return true; + } + return false; +} + /* Try to reorder ready list to take advantage of Atom pipelined IMUL execution. 
It is applied if (1) IMUL instruction is on the top of list; @@ -42982,6 +43075,10 @@ ix86_memmodel_check (unsigned HOST_WIDE_ #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ ia32_multipass_dfa_lookahead +#undef TARGET_SCHED_MACRO_FUSION_P +#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p +#undef TARGET_SCHED_MACRO_FUSION_PAIR_P +#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p #undef TARGET_FUNCTION_OK_FOR_SIBCALL #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall Index: target.def =================================================================== --- target.def (revision 201963) +++ target.def (working copy) @@ -1041,6 +1041,20 @@ scheduling one insn causes other insns t cycle. These other insns can then be taken into account properly.", int, (FILE *file, int verbose, rtx *ready, int *n_readyp, int clock), NULL) +DEFHOOK +(macro_fusion_p, + "This hook is used to check whether target platform supports macro fusion.", + bool, (void), NULL) + +DEFHOOK +(macro_fusion_pair_p, + "This hook is used to check whether two insns could be macro fused for\n\ +target microarchitecture. Now it is used in scheduler to adjust scheduling\n\ +result for macro-fusion. If this hook returns true for the given insn pair\n\ +(@var{condgen} and @var{condjmp}), scheduler will reschedule @var{condgen}\n\ +to the position just before condjmp before commit the scheduling result.", + bool, (rtx condgen, rtx condjmp), NULL) + /* The following member value is a pointer to a function called after evaluation forward dependencies of insns in chain given by two parameter values (head and tail correspondingly). */