This is a patch to prevent the scheduler from scheduling a compare and its conditional branch apart, in order to increase macro-fusion opportunities on recent x86 platforms.
double __attribute__ ((noinline)) bar (double sum); int a[100]; double bar (double sum) { int i; for (i = 0; i < 1000000; i++) sum += (0.5 + (a[i%100] - 128)); return sum; } int main() { double total; int i; for (i = 0; i < 1000; i++) total += bar (i); return total != 0.333; } ~/workarea/gcc-r201963/build/install/bin/gcc -O2 -mtune=corei7-avx 1.c -o 1.out The binary of the kernel loop in func bar () is: 401180: 89 c8 mov %ecx,%eax 401182: 66 0f 57 c9 xorpd %xmm1,%xmm1 401186: f7 ee imul %esi 401188: 89 c8 mov %ecx,%eax 40118a: c1 f8 1f sar $0x1f,%eax 40118d: c1 fa 05 sar $0x5,%edx 401190: 29 c2 sub %eax,%edx 401192: b8 64 00 00 00 mov $0x64,%eax 401197: 0f af d0 imul %eax,%edx 40119a: 89 c8 mov %ecx,%eax 40119c: 83 c1 01 add $0x1,%ecx 40119f: 29 d0 sub %edx,%eax 4011a1: 48 98 cltq 4011a3: 8b 04 85 60 51 6c 00 mov 0x6c5160(,%rax,4),%eax 4011aa: 83 c0 80 add $0xffffff80,%eax 4011ad: 81 f9 40 42 0f 00 cmp $0xf4240,%ecx 4011b3: f2 0f 2a c8 cvtsi2sd %eax,%xmm1 4011b7: f2 0f 58 ca addsd %xmm2,%xmm1 4011bb: f2 0f 58 c1 addsd %xmm1,%xmm0 4011bf: 75 bf jne 401180 <bar+0x10> Here cmp (addr: 4011ad) and jne (addr: 4011bf) are not consecutive in object code, but they are consecutive before sched2 pass. If we manually keep the cmp and jne together, the performance of 1.out changes from 2.40s to 2.31s on a sandybridge machine. Perf stat result shows that UOPS_RETIRED.MACRO_FUSED event increases from 131,075 to 1,000,130,308, and UOPS_RETIRED.ANY event decreases from 23,002,543,637 to 22,002,511,525. The patch is to reschedule cmp and jmp to make them consecutive. It is done at the end of scheduling each block before schedule result is commited. bootstrapped and regression ok on x86_64-linux-gnu. ok for trunk? 2013-09-03 Wei Mi <w...@google.com> * haifa-sched.c (move_insns): New function. (adjust_for_macro_fusion): Ditto. (schedule_block): Call adjust_for_macro_fusion before commit schedule. * doc/tm.texi.in: Generated. * doc/tm.texi: Ditto. 
* config/i386/x86-tune.def (DEF_TUNE): Add m_COREI7 for X86_TUNE_FUSE_CMP_AND_BRANCH. * config/i386/i386.c (ix86_macro_fusion_p): New function. (ix86_macro_fusion_pair_p): Ditto. * target.def: Add macro_fusion_p and macro_fusion_pair_p in sched group. Index: haifa-sched.c =================================================================== --- haifa-sched.c (revision 201963) +++ haifa-sched.c (working copy) @@ -5605,6 +5605,56 @@ choose_ready (struct ready_list *ready, } } +/* Move insn scheduled_insns[I] to the position J in scheduled_insns. */ + +static void +move_insns (int i, int j) +{ + rtx insn = scheduled_insns[i]; + scheduled_insns.ordered_remove (i); + scheduled_insns.safe_insert (j, insn); +} + +/* If the last cond jump and the cond register setting insn are consecutive + before scheduling, and are scheduled away from each other, this func + tries to rearrange insns in scheduled_insns and keep those two insns + together. This is good for performance on microarchitectures supporting + macro-fusion. */ + +static void +adjust_for_macro_fusion () +{ + int i = -1, length; + unsigned int condreg1, condreg2; + rtx cc_reg_1; + rtx insn; + rtx last = scheduled_insns.last(); + + targetm.fixed_condition_code_regs (&condreg1, &condreg2); + cc_reg_1 = gen_rtx_REG (CCmode, condreg1); + length = scheduled_insns.length (); + if (any_condjump_p (last) && reg_referenced_p (cc_reg_1, PATTERN (last))) + { + for (i = length - 2; i >= 0; i--) + { + insn = scheduled_insns[i]; + if (modified_in_p (cc_reg_1, insn)) + break; + } + } + if (i < 0 || i == length - 2) + return; + + if (NEXT_INSN (insn) != last) + return; + + if (!targetm.sched.macro_fusion_pair_p + || !targetm.sched.macro_fusion_pair_p (insn, last)) + return; + + move_insns (i, length - 2); +} + /* This function is called when we have successfully scheduled a block. It uses the schedule stored in the scheduled_insns vector to rearrange the RTL. 
PREV_HEAD is used as the anchor to which we @@ -6421,6 +6471,9 @@ schedule_block (basic_block *target_bb, if (success) { + if (targetm.sched.macro_fusion_p + && targetm.sched.macro_fusion_p ()) + adjust_for_macro_fusion (); commit_schedule (prev_head, tail, target_bb); if (sched_verbose) fprintf (sched_dump, ";; total time = %d\n", clock_var); Index: doc/tm.texi.in =================================================================== --- doc/tm.texi.in (revision 201963) +++ doc/tm.texi.in (working copy) @@ -4940,6 +4940,10 @@ them: try the first ones in this list fi @hook TARGET_SCHED_REORDER2 +@hook TARGET_SCHED_MACRO_FUSION_P + +@hook TARGET_SCHED_MACRO_FUSION_PAIR_P + @hook TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK @hook TARGET_SCHED_INIT Index: doc/tm.texi =================================================================== --- doc/tm.texi (revision 201963) +++ doc/tm.texi (working copy) @@ -6553,6 +6553,18 @@ scheduling one insn causes other insns t cycle. These other insns can then be taken into account properly. @end deftypefn +@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_P (void) +This hook is used to check whether target platform supports macro fusion. +@end deftypefn + +@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_PAIR_P (rtx @var{condgen}, rtx @var{condjmp}) +This hook is used to check whether two insns could be macro fused for +target microarchitecture. Now it is used in scheduler to adjust scheduling +result for macro-fusion. If this hook returns true for the given insn pair +(@var{condgen} and @var{condjmp}), scheduler will reschedule @var{condgen} +to the position just before condjmp before commit the scheduling result. 
+@end deftypefn + @deftypefn {Target Hook} void TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK (rtx @var{head}, rtx @var{tail}) This hook is called after evaluation forward dependencies of insns in chain given by two parameter values (@var{head} and @var{tail} Index: config/i386/x86-tune.def =================================================================== --- config/i386/x86-tune.def (revision 201963) +++ config/i386/x86-tune.def (working copy) @@ -196,7 +196,8 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction with a subsequent conditional jump instruction into a single compare-and-branch uop. */ -DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER) +DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", + m_COREI7 | m_BDVER) /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag will impact LEA instruction selection. */ DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM) Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 201963) +++ config/i386/i386.c (working copy) @@ -24850,6 +24850,99 @@ ia32_multipass_dfa_lookahead (void) } } +/* Return true if target platform supports macro-fusion. */ + +static bool +ix86_macro_fusion_p () +{ + if (TARGET_FUSE_CMP_AND_BRANCH) + return true; + else + return false; +} + +/* Check whether current microarchitecture support macro fusion + for insn pair "CONDGEN + CONDJMP". Refer to + "Intel Architectures Optimization Reference Manual". */ + +static bool +ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp) +{ + rtx src; + if (strcmp (ix86_tune_string, "corei7")) + { + /* For Nehalem. */ + rtx single_set = single_set (condgen); + /* Nehalem doesn't support macro-fusion for add/sub+jmp. 
*/ + if (single_set == NULL_RTX) + return false; + + src = SET_SRC (single_set); + if (GET_CODE (src) != COMPARE) + return false; + + /* Nehalem doesn't support macro-fusion for cmp/test MEM-IMM + insn pattern. */ + if ((MEM_P (XEXP (src, 0)) + && CONST_INT_P (XEXP (src, 1))) + || (MEM_P (XEXP (src, 1)) + && CONST_INT_P (XEXP (src, 0)))) + return false; + + /* Nehalem doesn't support macro-fusion for add/sub/dec/inc + jmp. */ + if (get_attr_type (condgen) != TYPE_TEST + && get_attr_type (condgen) != TYPE_ICMP) + return false; + return true; + } + else if (strcmp (ix86_tune_string, "corei7-avx")) + { + /* For Sandybridge. */ + enum rtx_code ccode; + rtx compare_set = NULL_RTX, test_if, cond; + rtx single_set = single_set (condgen); + if (single_set != NULL_RTX) + compare_set = single_set; + else + { + int i; + rtx pat = PATTERN (condgen); + for (i = 0; i < XVECLEN (pat, 0); i++) + if (GET_CODE (XVECEXP (pat, 0, i)) == SET + && GET_CODE (SET_SRC (XVECEXP (pat, 0, i))) == COMPARE) + compare_set = XVECEXP (pat, 0, i); + } + + if (compare_set == NULL_RTX) + return false; + src = SET_SRC (compare_set); + if (GET_CODE (src) != COMPARE) + return false; + + /* Sandybridge doesn't support macro-fusion for cmp/test MEM-IMM + insn pattern. */ + if ((MEM_P (XEXP (src, 0)) + && CONST_INT_P (XEXP (src, 1))) + || (MEM_P (XEXP (src, 1)) + && CONST_INT_P (XEXP (src, 0)))) + return false; + + /* Sandybridge doesn't support macro-fusion for inc/dec + + unsigned comparison jmp. */ + test_if = SET_SRC (pc_set (condjmp)); + cond = XEXP (test_if, 0); + ccode = GET_CODE (cond); + if (get_attr_type (condgen) == TYPE_INCDEC + && (ccode == GEU + || ccode == GTU + || ccode == LEU + || ccode == LTU)) + return false; + return true; + } + return false; +} + /* Try to reorder ready list to take advantage of Atom pipelined IMUL execution. 
It is applied if (1) IMUL instruction is on the top of list; @@ -42982,6 +43075,10 @@ ix86_memmodel_check (unsigned HOST_WIDE_ #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ ia32_multipass_dfa_lookahead +#undef TARGET_SCHED_MACRO_FUSION_P +#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p +#undef TARGET_SCHED_MACRO_FUSION_PAIR_P +#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p #undef TARGET_FUNCTION_OK_FOR_SIBCALL #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall Index: target.def =================================================================== --- target.def (revision 201963) +++ target.def (working copy) @@ -1041,6 +1041,20 @@ scheduling one insn causes other insns t cycle. These other insns can then be taken into account properly.", int, (FILE *file, int verbose, rtx *ready, int *n_readyp, int clock), NULL) +DEFHOOK +(macro_fusion_p, + "This hook is used to check whether target platform supports macro fusion.", + bool, (void), NULL) + +DEFHOOK +(macro_fusion_pair_p, + "This hook is used to check whether two insns could be macro fused for\n\ +target microarchitecture. Now it is used in scheduler to adjust scheduling\n\ +result for macro-fusion. If this hook returns true for the given insn pair\n\ +(@var{condgen} and @var{condjmp}), scheduler will reschedule @var{condgen}\n\ +to the position just before condjmp before commit the scheduling result.", + bool, (rtx condgen, rtx condjmp), NULL) + /* The following member value is a pointer to a function called after evaluation forward dependencies of insns in chain given by two parameter values (head and tail correspondingly). */