This patch catches a missed optimization opportunity where GCC currently
generates worse code than LLVM.  The issue, as nicely analyzed in bugzilla,
boils down to the following three insns in combine:

(insn 6 5 7 2 (parallel [
            (set (reg:DI 85)
                (ashift:DI (reg:DI 85)
                    (const_int 32 [0x20])))
            (clobber (reg:CC 17 flags))
        ]) "pr92180.c":4:10 564 {*ashldi3_1}
     (expr_list:REG_UNUSED (reg:CC 17 flags)
        (nil)))
(insn 7 6 14 2 (parallel [
            (set (reg:DI 84)
                (ior:DI (reg:DI 84)
                    (reg:DI 85)))
            (clobber (reg:CC 17 flags))
        ]) "pr92180.c":4:10 454 {*iordi_1}
     (expr_list:REG_DEAD (reg:DI 85)
        (expr_list:REG_UNUSED (reg:CC 17 flags)
            (nil))))
(insn 14 7 15 2 (set (reg/i:SI 0 ax)
        (subreg:SI (reg:DI 84) 0)) "pr92180.c":5:1 67 {*movsi_internal}
     (expr_list:REG_DEAD (reg:DI 84)
        (nil)))

Normally, combine/simplify-rtx would notice that insns 6 and 7
(which update highpart bits) are unnecessary as the final insn 14
only requires to lowpart bits.  The complication is that insn 14
sets a hard register in targetm.class_likely_spilled_p which
prevents combine from performing its simplifications, and removing
the redundant instructions.

At first glance a fix would appear to require changes to combine,
potentially affecting code generation on all small register class
targets...  An alternate (and I think clever) solution is to spot
that this problematic situation can be avoided by the backend.

At RTL expansion time, the middle-end has a clear separation between
pseudos and hard registers, so the RTL initially contains:

(insn 9 8 10 2 (set (reg:SI 86)
        (subreg:SI (reg:DI 82 [ _1 ]) 0)) "pr92180.c":6:10 -1
     (nil))
(insn 10 9 14 2 (set (reg:SI 83 [ <retval> ])
        (reg:SI 86)) "pr92180.c":6:10 -1
     (nil))
(insn 14 10 15 2 (set (reg/i:SI 0 ax)
        (reg:SI 83 [ <retval> ])) "pr92180.c":7:1 -1
     (nil))

which can be optimized without problems by combine; it is only the
intervening passes (initially fwprop1) that propagate computations
into sets of hard registers, and disable those opportunities.

The solution proposed here is to have the x86 backend/recog prevent
early RTL passes composing instructions (that set likely_spilled hard
registers) that they (combine) can't simplify, until after reload.
We allow sets from pseudo registers, immediate constants and memory
accesses, but anything more complicated is performed via a temporary
pseudo.  Not only does this simplify things for the register allocator,
but any remaining register-to-register moves are easily cleaned up
by the late optimization passes after reload, such as peephole2 and
cprop_hardreg.

This patch has been tested on x86_64-pc-linux-gnu with a
"make bootstrap" and a "make -k check" with no new failures.
Ok for mainline?


2020-08-17  Roger Sayle  <ro...@nextmovesoftware.com>

gcc/ChangeLog
        PR rtl-optimization/92180
        * config/i386/i386.c (ix86_hardreg_mov_ok): New function to
        determine whether (set DST SRC) should be allowed at this point.
        * config/i386/i386-protos.h (ix86_hardreg_mov_ok): Prototype here.
        * config/i386/i386-expand.c (ix86_expand_move): Check whether
        this is a complex set of a likely spilled hard register, and if
        so place the value in a pseudo, and load the hard reg from it.
        * config/i386/i386.md (*movdi_internal, *movsi_internal,
        *movhi_internal, *movqi_internal): Make these instructions
        conditional on ix86_hardreg_mov_ok.
        (*lea<mode>): Make this define_insn_and_split conditional on
        ix86_hardreg_mov_ok.

gcc/testsuite/ChangeLog
        PR rtl-optimization/92180
        * gcc.target/i386/pr92180.c: New test.


Thanks in advance,
Roger
--
Roger Sayle
NextMove Software
Cambridge, UK

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index f441ba9..e6e4433 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -190,6 +190,17 @@ ix86_expand_move (machine_mode mode, rtx operands[])
   op0 = operands[0];
   op1 = operands[1];
 
+  /* Avoid complex sets of likely spilled hard registers before reload.  */
+  if (!ix86_hardreg_mov_ok (op0, op1))
+    {
+      tmp = gen_reg_rtx (mode);
+      operands[0] = tmp;
+      ix86_expand_move (mode, operands);
+      operands[0] = op0;
+      operands[1] = tmp;
+      op1 = tmp;
+    }
+
   switch (GET_CODE (op1))
     {
     case CONST:
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index b6088f2..a10bc56 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -161,6 +161,7 @@ extern rtx ix86_find_base_term (rtx);
 extern bool ix86_check_movabs (rtx, int);
 extern bool ix86_check_no_addr_space (rtx);
 extern void ix86_split_idivmod (machine_mode, rtx[], bool);
+extern bool ix86_hardreg_mov_ok (rtx, rtx);
 
 extern rtx assign_386_stack_local (machine_mode, enum ix86_stack_slot);
 extern int ix86_attr_length_immediate_default (rtx_insn *, bool);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 10eb2dd..06a4d18 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18511,6 +18511,22 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
   return false;
 }
 
+/* Return true if a set of DST by the expression SRC should be allowed.
+   This prevents complex sets of likely_spilled hard regs before reload.  */
+
+bool
+ix86_hardreg_mov_ok (rtx dst, rtx src)
+{
+  /* Avoid complex sets of likely_spilled hard registers before reload.  */
+  if (REG_P (dst) && HARD_REGISTER_P (dst)
+      && !REG_P (src) && !MEM_P (src)
+      && !x86_64_immediate_operand (src, GET_MODE (dst))
+      && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))
+      && !reload_completed)
+    return false;
+  return true;
+}
+
 /* If we are copying between registers from different register sets
    (e.g. FP and integer), we may need a memory location.
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9d4e669..ab2fdc3 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2077,7 +2077,8 @@
     "=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,m,?r 
,?*Yd,?r,?*v,?*y,?*x,*k,*k ,*r,*m,*k")
        (match_operand:DI 1 "general_operand"
     "riFo,riF,Z,rem,i,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,v,*Yd,r   ,*v,r  ,*x 
,*y ,*r,*km,*k,*k,CBC"))]
-  "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && ix86_hardreg_mov_ok (operands[0], operands[1])"
 {
   switch (get_attr_type (insn))
     {
@@ -2297,7 +2298,8 @@
     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
        (match_operand:SI 1 "general_operand"
     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
-  "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && ix86_hardreg_mov_ok (operands[0], operands[1])"
 {
   switch (get_attr_type (insn))
     {
@@ -2405,7 +2407,8 @@
 (define_insn "*movhi_internal"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k")
        (match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,r,km,k,k,CBC"))]
-  "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && ix86_hardreg_mov_ok (operands[0], operands[1])"
 {
   switch (get_attr_type (insn))
     {
@@ -2494,7 +2497,8 @@
                        "=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k")
        (match_operand:QI 1 "general_operand"
                        "Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))]
-  "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && ix86_hardreg_mov_ok (operands[0], operands[1])"
 {
   char buf[128];
   const char *ops;
@@ -5149,7 +5153,7 @@
 (define_insn_and_split "*lea<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
        (match_operand:SWI48 1 "address_no_seg_operand" "Ts"))]
-  ""
+  "ix86_hardreg_mov_ok (operands[0], operands[1])"
 {
   if (SImode_address_operand (operands[1], VOIDmode))
     {
diff --git a/gcc/testsuite/gcc.target/i386/pr92180.c 
b/gcc/testsuite/gcc.target/i386/pr92180.c
new file mode 100644
index 0000000..0d5c4a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92180.c
@@ -0,0 +1,9 @@
+/* PR rtl-optimization/92180 */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned int foo() {
+  return __builtin_ia32_rdtsc();
+}
+
+/* { dg-final { scan-assembler-not "sal" } } */

Reply via email to