Hi Jeff,
On 28/09/16 17:07, Jeff Law wrote:
On 09/28/2016 02:48 AM, Kyrill Tkachov wrote:
Hi all,
This patch tries to avoid materialising an immediate
when comparison has shown that it is already loaded.
This is performed during ifcvt and only when the target has the
conditional negate
or inverse optabs, which for now is only aarch64.
Thus for the code:
int
foo (int a, int b)
{
return a == 5 ? -5 : b;
}
we currently emit on aarch64:
foo:
mov w2, -5
cmp w0, 5
csel w0, w1, w2, ne
ret
but with this patch we emit:
foo:
cmp w0, 5
csneg w0, w1, w0, ne
ret
Bootstrapped and tested on arm, aarch64, x86_64.
Ok for trunk?
Thanks,
Kyrill
P.S. The simpler form of the transformation: X == CST ? CST : Y --> X ==
CST ? X : Y
could also be beneficial IMO but I found that it can cause bad codegen
in combine
due to extending the live range of X. I'm still working on a way to work
around that,
but that is a separate piece of work anyway.
FWIW, the simpler form of the transformation is already done just prior to
leaving SSA form in tree-ssa-uncprop.c. So there may not be a ton of
opportunities for the simpler form in the RTL optimizers.
Indeed there weren't that many places, but they were some. They helped in
avoiding materialising the floating point 0.0
in particular in some cases.
My only concern here is this transformation isn't significantly different than
the simpler form, which is apparently causing some kind of combine problem.
I'd like to understand the combine problem better before giving final approval
to this patch.
Sure, I'm attaching the ifcvt patch that does the aforementioned simple
transform.
The testcase it regresses is:
int
foo (float a, float b, float c, int d)
{
return a > 5 && a < 10 ? 6 : 0;
}
As far as I got is:
The relevant pre-combine (aarch64) RTL without the patch is:
(insn 21 19 45 2 (set (reg:SI 87)
(zero_extend:SI (subreg:QI (reg:SI 86) 0))) fbad.c:4 90
{*zero_extendqisi2_aarch64}
(expr_list:REG_DEAD (reg:SI 86)
(nil)))
(insn 47 45 48 2 (set (reg:CC 66 cc)
(compare:CC (reg:SI 87)
(const_int 0 [0]))) fbad.c:4 392 {cmpsi}
(expr_list:REG_DEAD (reg:SI 87)
(nil)))
(insn 48 47 30 2 (set (reg:SI 76 [ <retval> ])
(if_then_else:SI (ne (reg:CC 66 cc)
(const_int 0 [0]))
(reg:SI 89)
(const_int 0 [0]))) fbad.c:4 444 {*cmovsi_insn}
(expr_list:REG_DEAD (reg:SI 89)
(expr_list:REG_DEAD (reg:CC 66 cc)
(nil))))
and with the patch it is:
(insn 21 19 45 2 (set (reg:SI 87)
(zero_extend:SI (subreg:QI (reg:SI 86) 0))) fbad.c:4 90
{*zero_extendqisi2_aarch64}
(expr_list:REG_DEAD (reg:SI 86)
(nil)))
(insn 46 45 47 2 (set (reg:CC 66 cc)
(compare:CC (reg:SI 87)
(const_int 0 [0]))) fbad.c:4 392 {cmpsi}
(nil))
(insn 47 46 30 2 (set (reg:SI 76 [ <retval> ])
(if_then_else:SI (eq (reg:CC 66 cc)
(const_int 0 [0]))
(reg:SI 87)
(reg:SI 89))) fbad.c:4 444 {*cmovsi_insn}
(expr_list:REG_DEAD (reg:SI 89)
(expr_list:REG_DEAD (reg:SI 87)
(expr_list:REG_DEAD (reg:CC 66 cc)
(nil)))))
Combine is trying to combine 21 -> 47 in the first case (21 -> 46 in the
second) and without the
transformation ends up looking into the uses of CC as well, the IF_THEN_ELSE in
insn 48.
But I think it only does that if reg 87 is dead in insn 47.
In the end what happens is it fails to merge the comparison and the use of the
comparison
and ends up emitting extra code.
So before:
fmov s2, 5.0e+0
fmov s1, 1.0e+1
mov w0, 6
fcmpe s0, s2
fccmpe s0, s1, 0, gt
csel w0, w0, wzr, mi //conditional select
ret
after:
foo:
fmov s2, 5.0e+0
fmov s1, 1.0e+1
mov w1, 6
fcmpe s0, s2
fccmpe s0, s1, 0, gt
cset w0, mi // conditional set
cmp w0, 0
csel w0, w0, w1, eq
ret
I think it has to do with the logic to set and use added_sets_2 in try_combine
but I got lost trying to debug it, so I sort of attributed it to "extending
live ranges
of registers is bad"
Kyrill
Jeff
diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 24542f008485e6c28e068030fa301f2ce040efc1..18cda590f3c17d0f7a48ba3e3f09bd715f60891b 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -1810,13 +1810,38 @@ noce_try_cmove (struct noce_if_info *if_info)
&& (CONSTANT_P (if_info->b) || register_operand (if_info->b, VOIDmode)))
{
start_sequence ();
+ rtx cond = if_info->cond;
- code = GET_CODE (if_info->cond);
- target = noce_emit_cmove (if_info, if_info->x, code,
- XEXP (if_info->cond, 0),
- XEXP (if_info->cond, 1),
- if_info->a, if_info->b);
+ code = GET_CODE (cond);
+
+ rtx a = if_info->a;
+ rtx b = if_info->b;
+ /* Transform x == CST ? CST : y into x == CST ? x : y to avoid
+ materializing CST for the conditional move. Similarly for
+ inequality. */
+ if ((code == EQ || code == NE)
+ && REG_P (XEXP (cond, 0))
+ && GET_MODE_CLASS (GET_MODE (XEXP (cond, 0))) != MODE_CC
+ && GET_MODE (XEXP (cond, 0)) == GET_MODE (if_info->x)
+ && CONSTANT_P (XEXP (cond, 1)))
+ {
+ rtx cst = XEXP (cond, 1);
+ rtx non_cst = XEXP (cond, 0);
+ rtx eq_side = code == EQ ? b : a;
+
+ if (rtx_equal_p (eq_side, cst))
+ {
+ if (code == EQ)
+ b = non_cst;
+ else
+ a = non_cst;
+ }
+ }
+ target = noce_emit_cmove (if_info, if_info->x, code,
+ XEXP (cond, 0),
+ XEXP (cond, 1),
+ a, b);
if (target)
{
if (target != if_info->x)
@@ -1840,11 +1865,11 @@ noce_try_cmove (struct noce_if_info *if_info)
we don't know about, so give them a chance before trying this
approach. */
else if (!targetm.have_conditional_execution ()
- && CONST_INT_P (if_info->a) && CONST_INT_P (if_info->b))
+ && CONST_INT_P (a) && CONST_INT_P (b))
{
machine_mode mode = GET_MODE (if_info->x);
- HOST_WIDE_INT ifalse = INTVAL (if_info->a);
- HOST_WIDE_INT itrue = INTVAL (if_info->b);
+ HOST_WIDE_INT ifalse = INTVAL (a);
+ HOST_WIDE_INT itrue = INTVAL (b);
rtx target = noce_emit_store_flag (if_info, if_info->x, false, -1);
if (!target)
{
diff --git a/gcc/testsuite/gcc.target/aarch64/ifcvt_avoid_const_materialization_1.c b/gcc/testsuite/gcc.target/aarch64/ifcvt_avoid_const_materialization_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..3fdd072f643d91cc63014c9450a40c2345729f64
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ifcvt_avoid_const_materialization_1.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Check that we avoid moving the immediate into a register
+ if comparison has shown that it already is in one of the registers. */
+
+float
+foo (float a, float b)
+{
+ return a == 0.0f ? 0.0f : b;
+}
+
+double
+doublefoo (double a, double b)
+{
+ return a == 0.0 ? 0.0 : b;
+}
+
+int
+bar (int a, int b)
+{
+ return a == 4 ? 4 : b;
+}
+
+int
+baz (int a, int b)
+{
+ return a != 5 ? b : 5;
+}
+
+/* { dg-final { scan-assembler-not "mov\\tw\[0-9\]+" } } */
+/* { dg-final { scan-assembler-not "fmov\\t(s|d)\[0-9\]+" } } */