Hi, this patch addresses the missed optimization reported in http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60738
Currently, in process_single_reg_class_operands, any allocno A conflicting with a single reg class operand B is marked to never use the reg class in IRA. This is non-optimal when A is in a hot region while B is in a cold region. The patch allows A to use the register in the single reg class only if the hotness difference between A and B is large enough. The patch also extends lra_split to make sure A is split in the code region for B instead of being spilled. Bootstrap and regression tests are OK for x86_64-linux-gnu. Is it OK for trunk? Thanks, Wei. ChangeLog: 2014-04-25 Wei Mi <w...@google.com> PR rtl-optimization/60738 * params.h: New param. * params.def: Ditto. * lra-constraints.c (need_for_split_p): Allow lra-split in more cases. * ira-lives.c (process_single_reg_class_operands): Avoid adding the single reg class to the conflict hardreg set in some cases. ChangeLog: 2014-04-25 Wei Mi <w...@google.com> PR rtl-optimization/60738 * testsuite/gcc.target/i386/pr60738-2.c: New test. * testsuite/gcc.target/i386/pr60738-1.c: New test. Index: params.def =================================================================== --- params.def (revision 209253) +++ params.def (working copy) @@ -826,6 +826,11 @@ DEFPARAM (PARAM_LRA_MAX_CONSIDERED_RELOA "The max number of reload pseudos which are considered during spilling a non-reload pseudo", 500, 0, 0) +DEFPARAM (PARAM_LRA_SPLIT_FREQ_RATIO, + "lra-split-freq-ratio", + "The ratio used to check when lra split is preferred than spilled", + 9, 0, 0) + /* Switch initialization conversion will refuse to create arrays that are bigger than this parameter times the number of switch branches. 
*/ Index: ira-lives.c =================================================================== --- ira-lives.c (revision 209253) +++ ira-lives.c (working copy) @@ -1025,7 +1025,11 @@ process_single_reg_class_operands (bool { ira_object_t obj = ira_object_id_map[px]; a = OBJECT_ALLOCNO (obj); - if (a != operand_a) + /* If a is much hotter in some other region, don't add reg class + cl into its conflict hardreg set. Let lra_split to do splitting + here for operand_a. */ + if (a != operand_a + && (LRA_SPLIT_FREQ_RATIO * freq >= a->freq)) { /* We could increase costs of A instead of making it conflicting with the hard register. But it works worse Index: params.h =================================================================== --- params.h (revision 209253) +++ params.h (working copy) @@ -198,6 +198,8 @@ extern void init_param_values (int *para PARAM_VALUE (PARAM_IRA_LOOP_RESERVED_REGS) #define LRA_MAX_CONSIDERED_RELOAD_PSEUDOS \ PARAM_VALUE (PARAM_LRA_MAX_CONSIDERED_RELOAD_PSEUDOS) +#define LRA_SPLIT_FREQ_RATIO \ + PARAM_VALUE (PARAM_LRA_SPLIT_FREQ_RATIO) #define SWITCH_CONVERSION_BRANCH_RATIO \ PARAM_VALUE (PARAM_SWITCH_CONVERSION_BRANCH_RATIO) #define LOOP_INVARIANT_MAX_BBS_IN_LOOP \ Index: lra-constraints.c =================================================================== --- lra-constraints.c (revision 209253) +++ lra-constraints.c (working copy) @@ -129,6 +129,7 @@ #include "ira.h" #include "rtl-error.h" #include "lra-int.h" +#include "params.h" /* Value of LRA_CURR_RELOAD_NUM at the beginning of BB of the current insn. Remember that LRA_CURR_RELOAD_NUM is the number of emitted @@ -4632,8 +4633,13 @@ static bitmap_head ebb_global_regs; static inline bool need_for_split_p (HARD_REG_SET potential_reload_hard_regs, int regno) { + int freq; + rtx last_use_insn; int hard_regno = regno < FIRST_PSEUDO_REGISTER ? 
regno : reg_renumber[regno]; + last_use_insn = skip_usage_debug_insns (usage_insns[regno].insns); + freq = REG_FREQ_FROM_BB (BLOCK_FOR_INSN (last_use_insn)); + lra_assert (hard_regno >= 0); return ((TEST_HARD_REG_BIT (potential_reload_hard_regs, hard_regno) /* Don't split eliminable hard registers, otherwise we can @@ -4653,25 +4659,27 @@ need_for_split_p (HARD_REG_SET potential && (regno >= FIRST_PSEUDO_REGISTER || ! TEST_HARD_REG_BIT (call_used_reg_set, regno) || usage_insns[regno].calls_num == calls_num) - /* We need at least 2 reloads to make pseudo splitting - profitable. We should provide hard regno splitting in - any case to solve 1st insn scheduling problem when - moving hard register definition up might result in - impossibility to find hard register for reload pseudo of - small register class. */ - && (usage_insns[regno].reloads_num - + (regno < FIRST_PSEUDO_REGISTER ? 0 : 3) < reloads_num) - && (regno < FIRST_PSEUDO_REGISTER - /* For short living pseudos, spilling + inheritance can - be considered a substitution for splitting. - Therefore we do not splitting for local pseudos. It - decreases also aggressiveness of splitting. The - minimal number of references is chosen taking into - account that for 2 references splitting has no sense - as we can just spill the pseudo. */ - || (regno >= FIRST_PSEUDO_REGISTER - && lra_reg_info[regno].nrefs > 3 - && bitmap_bit_p (&ebb_global_regs, regno)))) + /* If + && ((LRA_SPLIT_FREQ_RATIO * freq < lra_reg_info[regno].freq) + /* We need at least 2 reloads to make pseudo splitting + profitable. We should provide hard regno splitting in + any case to solve 1st insn scheduling problem when + moving hard register definition up might result in + impossibility to find hard register for reload pseudo of + small register class. */ + || ((usage_insns[regno].reloads_num + + (regno < FIRST_PSEUDO_REGISTER ? 
0 : 3) < reloads_num) + && (regno < FIRST_PSEUDO_REGISTER + /* For short living pseudos, spilling + inheritance can + be considered a substitution for splitting. + Therefore we do not splitting for local pseudos. It + decreases also aggressiveness of splitting. The + minimal number of references is chosen taking into + account that for 2 references splitting has no sense + as we can just spill the pseudo. */ + || (regno >= FIRST_PSEUDO_REGISTER + && lra_reg_info[regno].nrefs > 3 + && bitmap_bit_p (&ebb_global_regs, regno)))))) || (regno >= FIRST_PSEUDO_REGISTER && need_for_call_save_p (regno))); } Index: testsuite/gcc.target/i386/pr60738-1.c =================================================================== --- testsuite/gcc.target/i386/pr60738-1.c (revision 0) +++ testsuite/gcc.target/i386/pr60738-1.c (revision 0) @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { scan-assembler "mov\[^\\n]*ecx," } } */ + +/* This test is to ensure r1 is lra-splitted in coldpath and leave + aside ecx for r2. */ + +int a, b, c, d, e, f, cond1, cond2; +void foo() { + int r1, r2, r3; + r1 = b; + r2 = d; + if (__builtin_expect(cond1 > 3, 0)) { + if (__builtin_expect(cond2 > 3, 0)) { + e = e * 5; + c = a << r1; + } + } + c = c << r2; + f = r1 + r2; +} Index: testsuite/gcc.target/i386/pr60738-2.c =================================================================== --- testsuite/gcc.target/i386/pr60738-2.c (revision 0) +++ testsuite/gcc.target/i386/pr60738-2.c (revision 0) @@ -0,0 +1,34 @@ +/* { dg-do compile { target { ia32 } } */ +/* { dg-options "-O2" } */ +/* { dg-final { scan-assembler "mov\[^\\n]*A\[^\\n]*, .ecx" } } */ +/* { dg-final { scan-assembler "mov\[^\\n]ecx, B\[^\\n]*" } } */ + +/* This test is to ensure no spill is generated for r1 + on hotpath because r1 can only use cl register. 
*/ + +int a, b, c, d, e, cond1, cond2, A[20], B[20]; +void foo() { + int r1, r2, r3, r4, r5, r6, r7, r8; + r2 = A[2]; + r3 = A[3]; + r4 = A[4]; + r5 = A[5]; + r6 = A[6]; + r7 = A[7]; + r8 = A[8]; + + if (__builtin_expect(cond1 > 3, 0)) { + if (__builtin_expect(cond2 > 3, 0)) { + r1 = b; + c = a << r1; + } + } + + B[2] = r2; + B[3] = r3; + B[4] = r4; + B[5] = r5; + B[6] = r6; + B[7] = r7; + B[8] = r8; +}