Hi Jeff,

On 28/09/16 17:07, Jeff Law wrote:
On 09/28/2016 02:48 AM, Kyrill Tkachov wrote:
Hi all,

This patch tries to avoid materialising an immediate
when comparison has shown that it is already loaded.
This is performed during ifcvt and only when the target has the
conditional negate
or inverse optabs, which for now is only aarch64.

Thus for the code:
int
foo (int a, int b)
{
  return a == 5 ? -5 : b;
}

we currently emit on aarch64:
foo:
        mov     w2, -5
        cmp     w0, 5
        csel    w0, w1, w2, ne
        ret

but with this patch we emit:
foo:
        cmp     w0, 5
        csneg   w0, w1, w0, ne
        ret

Bootstrapped and tested on arm, aarch64, x86_64.

Ok for trunk?

Thanks,
Kyrill

P.S. The simpler form of the transformation: X == CST ? CST : Y --> X ==
CST ? X : Y
could also be beneficial IMO but I found that it can cause bad codegen
in combine
due to extending the live range of X. I'm still working on a way to work
around that,
but that is a separate piece of work anyway.
FWIW, the simpler form of the transformation is already done just prior to 
leaving SSA form in tree-ssa-uncprop.c.  So there may not be a ton of 
opportunities for the simpler form in the RTL optimizers.


Indeed there weren't that many places, but they were some. They helped in 
avoiding materialising the floating point 0.0
in particular in some cases.

My only concern here is this transformation isn't significantly different than 
the simpler form, which is apparently causing some kind of combine problem.

I'd like to understand the combine problem better before giving final approval 
to this patch.


Sure, I'm attaching the ifcvt patch that does the aforementioned simple 
transform.
The testcase it regresses is:
int
foo (float a, float b, float c, int d)
{
  return a > 5 && a < 10 ? 6 : 0;
}

As far as I got is:
The relevant pre-combine (aarch64) RTL without the patch is:
(insn 21 19 45 2 (set (reg:SI 87)
        (zero_extend:SI (subreg:QI (reg:SI 86) 0))) fbad.c:4 90 
{*zero_extendqisi2_aarch64}
     (expr_list:REG_DEAD (reg:SI 86)
        (nil)))
(insn 47 45 48 2 (set (reg:CC 66 cc)
        (compare:CC (reg:SI 87)
            (const_int 0 [0]))) fbad.c:4 392 {cmpsi}
     (expr_list:REG_DEAD (reg:SI 87)
        (nil)))
(insn 48 47 30 2 (set (reg:SI 76 [ <retval> ])
        (if_then_else:SI (ne (reg:CC 66 cc)
                (const_int 0 [0]))
            (reg:SI 89)
            (const_int 0 [0]))) fbad.c:4 444 {*cmovsi_insn}
     (expr_list:REG_DEAD (reg:SI 89)
        (expr_list:REG_DEAD (reg:CC 66 cc)
            (nil))))


and with the patch it is:
(insn 21 19 45 2 (set (reg:SI 87)
        (zero_extend:SI (subreg:QI (reg:SI 86) 0))) fbad.c:4 90 
{*zero_extendqisi2_aarch64}
     (expr_list:REG_DEAD (reg:SI 86)
        (nil)))
(insn 46 45 47 2 (set (reg:CC 66 cc)
        (compare:CC (reg:SI 87)
            (const_int 0 [0]))) fbad.c:4 392 {cmpsi}
     (nil))
(insn 47 46 30 2 (set (reg:SI 76 [ <retval> ])
        (if_then_else:SI (eq (reg:CC 66 cc)
                (const_int 0 [0]))
            (reg:SI 87)
            (reg:SI 89))) fbad.c:4 444 {*cmovsi_insn}
     (expr_list:REG_DEAD (reg:SI 89)
        (expr_list:REG_DEAD (reg:SI 87)
            (expr_list:REG_DEAD (reg:CC 66 cc)
                (nil)))))

Combine is trying to combine 21 -> 47 in the first case (21 -> 46 in the 
second) and without the
transformation ends up looking into the uses of CC as well, the IF_THEN_ELSE in 
insn 48.
But I think it only does that if reg 87 is dead in insn 47.
In the end what happens is it fails to merge the comparison and the use of the 
comparison
and ends up emitting extra code.
So before:
    fmov    s2, 5.0e+0
    fmov    s1, 1.0e+1
    mov    w0, 6
    fcmpe    s0, s2
    fccmpe    s0, s1, 0, gt
    csel    w0, w0, wzr, mi //conditional select
    ret

after:
foo:
    fmov    s2, 5.0e+0
    fmov    s1, 1.0e+1
    mov    w1, 6
    fcmpe    s0, s2
    fccmpe    s0, s1, 0, gt
    cset    w0, mi // conditional set
    cmp    w0, 0
    csel    w0, w0, w1, eq
    ret

I think it has to do with the logic to set and use added_sets_2 in try_combine
but I got lost trying to debug it, so I sort of attributed it to "extending 
live ranges
of registers is bad"

Kyrill



Jeff

diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 24542f008485e6c28e068030fa301f2ce040efc1..18cda590f3c17d0f7a48ba3e3f09bd715f60891b 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -1810,13 +1810,38 @@ noce_try_cmove (struct noce_if_info *if_info)
       && (CONSTANT_P (if_info->b) || register_operand (if_info->b, VOIDmode)))
     {
       start_sequence ();
+      rtx cond = if_info->cond;
 
-      code = GET_CODE (if_info->cond);
-      target = noce_emit_cmove (if_info, if_info->x, code,
-				XEXP (if_info->cond, 0),
-				XEXP (if_info->cond, 1),
-				if_info->a, if_info->b);
+      code = GET_CODE (cond);
+
+      rtx a = if_info->a;
+      rtx b = if_info->b;
+      /* Transform x == CST ? CST : y into x == CST ? x : y to avoid
+	 materializing CST for the conditional move.  Similarly for
+	 inequality.  */
+      if ((code == EQ || code == NE)
+	  && REG_P (XEXP (cond, 0))
+	  && GET_MODE_CLASS (GET_MODE (XEXP (cond, 0))) != MODE_CC
+	  && GET_MODE (XEXP (cond, 0)) == GET_MODE (if_info->x)
+	  && CONSTANT_P (XEXP (cond, 1)))
+	{
+	  rtx cst = XEXP (cond, 1);
+	  rtx non_cst = XEXP (cond, 0);
+	  rtx eq_side = code == EQ ? b : a;
+
+	  if (rtx_equal_p (eq_side, cst))
+	    {
+	      if (code == EQ)
+		b = non_cst;
+	      else
+		a = non_cst;
+	    }
+	}
 
+      target = noce_emit_cmove (if_info, if_info->x, code,
+				XEXP (cond, 0),
+				XEXP (cond, 1),
+				a, b);
       if (target)
 	{
 	  if (target != if_info->x)
@@ -1840,11 +1865,11 @@ noce_try_cmove (struct noce_if_info *if_info)
 	 we don't know about, so give them a chance before trying this
 	 approach.  */
       else if (!targetm.have_conditional_execution ()
-		&& CONST_INT_P (if_info->a) && CONST_INT_P (if_info->b))
+		&& CONST_INT_P (a) && CONST_INT_P (b))
 	{
 	  machine_mode mode = GET_MODE (if_info->x);
-	  HOST_WIDE_INT ifalse = INTVAL (if_info->a);
-	  HOST_WIDE_INT itrue = INTVAL (if_info->b);
+	  HOST_WIDE_INT ifalse = INTVAL (a);
+	  HOST_WIDE_INT itrue = INTVAL (b);
 	  rtx target = noce_emit_store_flag (if_info, if_info->x, false, -1);
 	  if (!target)
 	    {
diff --git a/gcc/testsuite/gcc.target/aarch64/ifcvt_avoid_const_materialization_1.c b/gcc/testsuite/gcc.target/aarch64/ifcvt_avoid_const_materialization_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..3fdd072f643d91cc63014c9450a40c2345729f64
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ifcvt_avoid_const_materialization_1.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Check that we avoid moving the immediate into a register
+   if comparison has shown that it already is in one of the registers.  */
+
+float
+foo (float a, float b)
+{
+  return a == 0.0f ? 0.0f : b;
+}
+
+double
+doublefoo (double a, double b)
+{
+  return a == 0.0 ? 0.0 : b;
+}
+
+int
+bar (int a, int b)
+{
+  return a == 4 ? 4 : b;
+}
+
+int
+baz (int a, int b)
+{
+  return a != 5 ? b : 5;
+}
+
+/* { dg-final { scan-assembler-not "mov\\tw\[0-9\]+" } } */
+/* { dg-final { scan-assembler-not "fmov\\t(s|d)\[0-9\]+" } } */

Reply via email to