This patch tweaks the way that min and max are expanded, so that the
semantics of these operations are visible to the early RTL optimization
passes, until split into explicit comparison and conditional move
instructions. The good news is that i386.md already contains all of
the required logic (many thanks to Richard Biener and Uros Bizjak),
but this is currently only to enable scalar-to-vector (STV) synthesis
of min/max instructions.  This change enables this functionality for
all TARGET_CMOVE architectures for SImode, HImode and DImode.

My first attempt to support "min(reg,reg)" as an instruction revealed
why this functionality isn't already enabled: In PR rtl-optimization 91154
we end up generating a cmp instead of a test in gcc.target/i386/minmax-3.c
which has poor performance on AMD Opteron.  My solution to this is to
actually support "min(reg,general_operand)" allowing us to inspect
any immediate constant at the point this operation is split.  This
allows us to use "test" instructions for min/max against 0, 1 and -1.
As an added benefit it allows us to CSE large immediate constants,
reducing code size.

Previously, "int foo(int x) { return max(x,12345); }" would generate:

foo:    cmpl    $12345, %edi
        movl    $12345, %eax
        cmovge  %edi, %eax
        ret

with this patch we instead generate:

foo:    movl    $12345, %eax
        cmpl    %eax, %edi
        cmovge  %edi, %eax
        ret


I've also included a peephole2 to avoid the "movl $0,%eax" instructions
being produced by the register allocator.  Materializing constants as
late as possible reduces register pressure, but for const0_rtx on x86,
it's preferable to use "xor" by moving this set from between a flags
setting operation and its use.  This should also help instruction macro
fusion on some microarchitectures, where test/cmp and the following
instruction can sometimes be combined.

Previously, "int foo(int x) { return max(x,0); }" would generate:

foo:    testl   %edi, %edi
        movl    $0, %eax
        cmovns  %edi, %eax
        ret

with this patch we instead generate:
foo:    xorl    %eax, %eax
        testl   %edi, %edi
        cmovns  %edi, %eax
        ret

The benefits of having min/max explicit at the RTL level can be seen
from compiling the following example with "-O2 -fno-tree-reassoc".


#define max(a,b) (((a) > (b))? (a) : (b))

int foo(int x)
{
  int y = max(x,5);
  return max(y,10);
}

where GCC currently produces:

foo:    cmpl    $5, %edi
        movl    $5, %eax
        movl    $10, %edx
        cmovge  %edi, %eax
        cmpl    $10, %eax
        cmovl   %edx, %eax
        ret

and with this patch it instead now produces:

foo:    movl    $10, %eax
        cmpl    %eax, %edi
        cmovge  %edi, %eax
        ret


The original motivation was from looking at a performance critical
function in a quantum mechanics code, that performed MIN_EXPR and
MAX_EXPR of the same arguments (effectively a two-element sort),
where GCC was performing the comparison twice.  I'd hoped that it
might be possible to fuse these together, perhaps in combine, but
this patch is an intermediate step towards that goal.

This patch has been tested on x86_64-pc-linux-gnu with a make
bootstrap followed by make -k check with no new regressions.
Ok for mainline?


2020-07-30  Roger Sayle  <ro...@nextmovesoftware.com>

        * config/i386/i386.md (MAXMIN_IMODE): No longer needed.
        (<maxmin><mode>3):  Support SWI248 and general_operand for
        second operand, when TARGET_CMOVE.
        (<maxmin><mode>3_1 splitter): Optimize comparisons against
        0, 1 and -1 to use "test" instead of "cmp".
        (*<maxmin>di3_doubleword): Likewise, allow general_operand
        and enable on TARGET_CMOVE.
        (peephole2): Convert clearing a register after a flag setting
        instruction into an xor followed by the original flag setter.


Many thanks in advance.  Almost all of the credit goes to Uros and 
Richard for already implementing this feature, I've just copied the
transformations from optab expansion, that allow it to be enabled
without penalty (this late in the compilation).

Roger
--
Roger Sayle
NextMove Software
Cambridge, UK

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b24a455..717d55f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -18809,45 +18809,68 @@
 
 ;; min/max patterns
 
-(define_mode_iterator MAXMIN_IMODE
-  [(SI "TARGET_SSE4_1") (DI "TARGET_AVX512VL")])
 (define_code_attr maxmin_rel
   [(smax "GE") (smin "LE") (umax "GEU") (umin "LEU")])
 
 (define_expand "<code><mode>3"
   [(parallel
-    [(set (match_operand:MAXMIN_IMODE 0 "register_operand")
-         (maxmin:MAXMIN_IMODE
-           (match_operand:MAXMIN_IMODE 1 "register_operand")
-           (match_operand:MAXMIN_IMODE 2 "nonimmediate_operand")))
+    [(set (match_operand:SWI248 0 "register_operand")
+         (maxmin:SWI248
+           (match_operand:SWI248 1 "register_operand")
+           (match_operand:SWI248 2 "general_operand")))
      (clobber (reg:CC FLAGS_REG))])]
-  "TARGET_STV")
+  "TARGET_CMOVE")
 
 (define_insn_and_split "*<code><mode>3_1"
-  [(set (match_operand:MAXMIN_IMODE 0 "register_operand")
-       (maxmin:MAXMIN_IMODE
-         (match_operand:MAXMIN_IMODE 1 "register_operand")
-         (match_operand:MAXMIN_IMODE 2 "nonimmediate_operand")))
+  [(set (match_operand:SWI248 0 "register_operand")
+       (maxmin:SWI248
+         (match_operand:SWI248 1 "register_operand")
+         (match_operand:SWI248 2 "general_operand")))
    (clobber (reg:CC FLAGS_REG))]
-  "(TARGET_64BIT || <MODE>mode != DImode) && TARGET_STV
+  "TARGET_CMOVE
    && ix86_pre_reload_split ()"
   "#"
   "&& 1"
   [(set (match_dup 0)
-       (if_then_else:MAXMIN_IMODE (match_dup 3)
+       (if_then_else:SWI248 (match_dup 3)
          (match_dup 1)
          (match_dup 2)))]
 {
   machine_mode mode = <MODE>mode;
+  rtx cmp_op = operands[2];
 
-  if (!register_operand (operands[2], mode))
-    operands[2] = force_reg (mode, operands[2]);
+  if (!register_operand (cmp_op, mode))
+    operands[2] = force_reg (mode, cmp_op);
 
   enum rtx_code code = <maxmin_rel>;
-  machine_mode cmpmode = SELECT_CC_MODE (code, operands[1], operands[2]);
+
+  if (cmp_op == const1_rtx)
+    {
+      /* Convert smax (x, 1) into (x > 0 ? x : 1).
+        Convert umax (x, 1) into (x != 0 ? x : 1).
+        Convert ?min (x, 1) into (x <= 0 ? x : 1).  */
+      cmp_op = const0_rtx;
+      if (code == GE)
+       code = GT;
+      else if (code == GEU)
+       code = NE;
+    }
+  /* Convert smin (x, -1) into (x < 0 ? x : -1).  */
+  else if (cmp_op == constm1_rtx && code == LE)
+    {
+      cmp_op = const0_rtx;
+      code = LT;
+    }
+  /* Convert smax (x, -1) into (x >= 0 ? x : -1).  */
+  else if (cmp_op == constm1_rtx && code == GE)
+    cmp_op = const0_rtx;
+  else if (cmp_op != const0_rtx)
+    cmp_op = operands[2];
+
+  machine_mode cmpmode = SELECT_CC_MODE (code, operands[1], cmp_op);
   rtx flags = gen_rtx_REG (cmpmode, FLAGS_REG);
 
-  rtx tmp = gen_rtx_COMPARE (cmpmode, operands[1], operands[2]);
+  rtx tmp = gen_rtx_COMPARE (cmpmode, operands[1], cmp_op);
   emit_insn (gen_rtx_SET (flags, tmp));
 
   operands[3] = gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
@@ -18856,9 +18879,9 @@
 (define_insn_and_split "*<code>di3_doubleword"
   [(set (match_operand:DI 0 "register_operand")
        (maxmin:DI (match_operand:DI 1 "register_operand")
-                  (match_operand:DI 2 "nonimmediate_operand")))
+                  (match_operand:DI 2 "general_operand")))
    (clobber (reg:CC FLAGS_REG))]
-  "!TARGET_64BIT && TARGET_STV && TARGET_AVX512VL
+  "!TARGET_64BIT && TARGET_CMOVE
    && ix86_pre_reload_split ()"
   "#"
   "&& 1"
@@ -18910,6 +18933,20 @@
       gcc_unreachable ();
     }
 })
+
+;; Avoid clearing a register between a flags setting comparison and its use,
+;; i.e. prefer "xorl %eax,%eax; test/cmp" over "test/cmp; movl $0, %eax".
+(define_peephole2
+  [(set (reg FLAGS_REG) (match_operand 0))
+   (set (match_operand:SWI48 1 "register_operand") (const_int 0))]
+  "peep2_regno_dead_p (0, FLAGS_REG)
+   && !reg_overlap_mentioned_p (operands[1], operands[0])"
+  [(parallel [(set (match_dup 1) (const_int 0))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 2) (match_dup 0))]
+{
+  operands[2] = gen_rtx_REG (GET_MODE (operands[0]), FLAGS_REG);
+})
 
 ;; Misc patterns (?)
 

Reply via email to