Hi,

This patch is to address the missing optimization reported in
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60738

Currently, in process_single_reg_class_operands, any allocno A that
conflicts with a single-reg-class operand B is marked in IRA as never
using that reg class. This is suboptimal when A is in a hot region
while B is in a cold region. The patch allows A to use the register in
the single reg class, but only if the hotness difference between A and
B is large enough. The patch also extends lra_split (via
need_for_split_p) to make sure A is split in the code region for B
instead of being spilled.

Bootstrap and regression tests pass on x86_64-linux-gnu. Is it OK for trunk?

Thanks,
Wei.

ChangeLog:

2014-04-25  Wei Mi  <w...@google.com>

        PR rtl-optimization/60738
        * params.h: New param.
        * params.def: Ditto.
        * lra-constraints.c (need_for_split_p): Allow lra-split in
        more cases.
        * ira-lives.c (process_single_reg_class_operands):
        Avoid adding the single reg class to the conflict hardreg set
        in some cases.


ChangeLog:

2014-04-25  Wei Mi  <w...@google.com>

        PR rtl-optimization/60738
        * testsuite/gcc.target/i386/pr60738-2.c: New test.
        * testsuite/gcc.target/i386/pr60738-1.c: New test.

Index: params.def
===================================================================
--- params.def  (revision 209253)
+++ params.def  (working copy)
@@ -826,6 +826,11 @@ DEFPARAM (PARAM_LRA_MAX_CONSIDERED_RELOA
          "The max number of reload pseudos which are considered during spilling a non-reload pseudo",
          500, 0, 0)

+DEFPARAM (PARAM_LRA_SPLIT_FREQ_RATIO,
+         "lra-split-freq-ratio",
+         "The ratio used to check when lra split is preferred than spilled",
+         9, 0, 0)
+
 /* Switch initialization conversion will refuse to create arrays that are
    bigger than this parameter times the number of switch branches.  */

Index: ira-lives.c
===================================================================
--- ira-lives.c (revision 209253)
+++ ira-lives.c (working copy)
@@ -1025,7 +1025,11 @@ process_single_reg_class_operands (bool
         {
          ira_object_t obj = ira_object_id_map[px];
          a = OBJECT_ALLOCNO (obj);
-         if (a != operand_a)
+         /* If a is much hotter in some other region, don't add reg class
+            cl into its conflict hardreg set.  Let lra_split do the
+            splitting here for operand_a.  */
+         if (a != operand_a
+             && (LRA_SPLIT_FREQ_RATIO * freq >= a->freq))
            {
              /* We could increase costs of A instead of making it
                 conflicting with the hard register.  But it works worse
Index: params.h
===================================================================
--- params.h    (revision 209253)
+++ params.h    (working copy)
@@ -198,6 +198,8 @@ extern void init_param_values (int *para
   PARAM_VALUE (PARAM_IRA_LOOP_RESERVED_REGS)
 #define LRA_MAX_CONSIDERED_RELOAD_PSEUDOS \
   PARAM_VALUE (PARAM_LRA_MAX_CONSIDERED_RELOAD_PSEUDOS)
+#define LRA_SPLIT_FREQ_RATIO \
+  PARAM_VALUE (PARAM_LRA_SPLIT_FREQ_RATIO)
 #define SWITCH_CONVERSION_BRANCH_RATIO \
   PARAM_VALUE (PARAM_SWITCH_CONVERSION_BRANCH_RATIO)
 #define LOOP_INVARIANT_MAX_BBS_IN_LOOP \
Index: lra-constraints.c
===================================================================
--- lra-constraints.c   (revision 209253)
+++ lra-constraints.c   (working copy)
@@ -129,6 +129,7 @@
 #include "ira.h"
 #include "rtl-error.h"
 #include "lra-int.h"
+#include "params.h"

 /* Value of LRA_CURR_RELOAD_NUM at the beginning of BB of the current
    insn.  Remember that LRA_CURR_RELOAD_NUM is the number of emitted
@@ -4632,8 +4633,13 @@ static bitmap_head ebb_global_regs;
 static inline bool
 need_for_split_p (HARD_REG_SET potential_reload_hard_regs, int regno)
 {
+  int freq;
+  rtx last_use_insn;
   int hard_regno = regno < FIRST_PSEUDO_REGISTER ? regno : reg_renumber[regno];

+  last_use_insn = skip_usage_debug_insns (usage_insns[regno].insns);
+  freq = REG_FREQ_FROM_BB (BLOCK_FOR_INSN (last_use_insn));
+
   lra_assert (hard_regno >= 0);
   return ((TEST_HARD_REG_BIT (potential_reload_hard_regs, hard_regno)
           /* Don't split eliminable hard registers, otherwise we can
@@ -4653,25 +4659,27 @@ need_for_split_p (HARD_REG_SET potential
           && (regno >= FIRST_PSEUDO_REGISTER
               || ! TEST_HARD_REG_BIT (call_used_reg_set, regno)
               || usage_insns[regno].calls_num == calls_num)
-          /* We need at least 2 reloads to make pseudo splitting
-             profitable.  We should provide hard regno splitting in
-             any case to solve 1st insn scheduling problem when
-             moving hard register definition up might result in
-             impossibility to find hard register for reload pseudo of
-             small register class.  */
-          && (usage_insns[regno].reloads_num
-              + (regno < FIRST_PSEUDO_REGISTER ? 0 : 3) < reloads_num)
-          && (regno < FIRST_PSEUDO_REGISTER
-              /* For short living pseudos, spilling + inheritance can
-                 be considered a substitution for splitting.
-                 Therefore we do not splitting for local pseudos.  It
-                 decreases also aggressiveness of splitting.  The
-                 minimal number of references is chosen taking into
-                 account that for 2 references splitting has no sense
-                 as we can just spill the pseudo.  */
-              || (regno >= FIRST_PSEUDO_REGISTER
-                  && lra_reg_info[regno].nrefs > 3
-                  && bitmap_bit_p (&ebb_global_regs, regno))))
+          /* Split if the pseudo is much hotter than this use point.  */
+          && ((LRA_SPLIT_FREQ_RATIO * freq < lra_reg_info[regno].freq)
+              /* We need at least 2 reloads to make pseudo splitting
+                 profitable.  We should provide hard regno splitting in
+                 any case to solve 1st insn scheduling problem when
+                 moving hard register definition up might result in
+                 impossibility to find hard register for reload pseudo of
+                 small register class.  */
+              || ((usage_insns[regno].reloads_num
+                   + (regno < FIRST_PSEUDO_REGISTER ? 0 : 3) < reloads_num)
+                  && (regno < FIRST_PSEUDO_REGISTER
+                      /* For short living pseudos, spilling + inheritance can
+                         be considered a substitution for splitting.
+                         Therefore we do not splitting for local pseudos.  It
+                         decreases also aggressiveness of splitting.  The
+                         minimal number of references is chosen taking into
+                         account that for 2 references splitting has no sense
+                         as we can just spill the pseudo.  */
+                      || (regno >= FIRST_PSEUDO_REGISTER
+                          && lra_reg_info[regno].nrefs > 3
+                          && bitmap_bit_p (&ebb_global_regs, regno))))))
          || (regno >= FIRST_PSEUDO_REGISTER && need_for_call_save_p (regno)));
 }
Index: testsuite/gcc.target/i386/pr60738-1.c
===================================================================
--- testsuite/gcc.target/i386/pr60738-1.c       (revision 0)
+++ testsuite/gcc.target/i386/pr60738-1.c       (revision 0)
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler "mov\[^\\n]*ecx," } } */
+
+/* This test is to ensure r1 is split by LRA on the cold path, leaving
+   ecx free for r2.  */
+
+int a, b, c, d, e, f, cond1, cond2;
+void foo() {
+  int r1, r2, r3;
+  r1 = b;
+  r2 = d;
+  if (__builtin_expect(cond1 > 3, 0)) {
+    if (__builtin_expect(cond2 > 3, 0)) {
+      e = e * 5;
+      c = a << r1;
+    }
+  }
+  c = c << r2;
+  f = r1 + r2;
+}
Index: testsuite/gcc.target/i386/pr60738-2.c
===================================================================
--- testsuite/gcc.target/i386/pr60738-2.c       (revision 0)
+++ testsuite/gcc.target/i386/pr60738-2.c       (revision 0)
@@ -0,0 +1,34 @@
+/* { dg-do compile { target { ia32 } } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler "mov\[^\\n]*A\[^\\n]*, .ecx" } } */
+/* { dg-final { scan-assembler "mov\[^\\n]*ecx, B\[^\\n]*" } } */
+
+/* This test is to ensure no spill is generated for r1
+   on hotpath because r1 can only use cl register.  */
+
+int a, b, c, d, e, cond1, cond2, A[20], B[20];
+void foo() {
+  int r1, r2, r3, r4, r5, r6, r7, r8;
+  r2 = A[2];
+  r3 = A[3];
+  r4 = A[4];
+  r5 = A[5];
+  r6 = A[6];
+  r7 = A[7];
+  r8 = A[8];
+
+  if (__builtin_expect(cond1 > 3, 0)) {
+    if (__builtin_expect(cond2 > 3, 0)) {
+      r1 = b;
+      c = a << r1;
+    }
+  }
+
+  B[2] = r2;
+  B[3] = r3;
+  B[4] = r4;
+  B[5] = r5;
+  B[6] = r6;
+  B[7] = r7;
+  B[8] = r8;
+}

Reply via email to