Hi,

recently I wondered why a snippet like the following is not being
if-converted at all on s390:

int foo (int *a, unsigned int n)
{
  int min = 999999;
  int bla = 0;
  for (int i = 0; i < n; i++)
    {
      if (a[i] < min)
        {
          min = a[i];
          bla = 1;
        }
    }

  if (bla)
    min += 1;
  return min;
}

After skimming through ifcvt's code, it turned out that there are two
paths that could handle such a situation.  At first I looked at
cond_move_process_if_block ().  One condition to check if the block is
suitable for processing is

      if (reg_overlap_mentioned_p (dest, cond))
        return FALSE;

i.e. abort if the destination of a conditional move appears in the
condition.  This means we do not handle minimum or maximum searches like
if (a < min) min = a; with this.  That might be intentional as
noce_try_minmax () can handle it but for me the reason was not entirely
obvious.

Disabling the condition above for the snippet

 if (a[i] < min)
 {
   min = a[i];
   bla = 1;
 }

yields assembly like the following on i386:

 cmpl    %edx, %ecx
 cmovg   %ecx, %edx
 cmpl    %edx, %ecx
 cmovg   %eax, %esi

meaning the compare is emitted multiple times and a write to a register
used by the compare is obviously wrong.

The compares are produced by noce_emit_cmove () calling
emit_conditional_move () which, in turn, seems to prepare and emit a
comparison every time it emits a conditional move.  Is this really
necessary or the correct thing to do?

--

The second ifcvt path uses noce_convert_multiple_sets () whose
preconditions don't seem as strict.  The conversion result is never
considered however, because costs are estimated as higher than the
non-converted version.  The cost estimation seems odd to me at best though:

Before noce_process_if_block () the original costs are set to
 if_info.original_cost = COSTS_N_INSNS (2);
but the adding of the then/else block costs is only performed after
noce_convert_multiple_sets () i.e. the costs after conversion will never
be lower than the original costs.  When artifically lowering the costs
we end up with the same assembly sequence as above including the
superfluous compares.

To address these issues I came up with a tentative patch that
 - adds cost for the original then basic block and adapts s390's
noce_conversion_profitable_p () hook to allow processing (magic value
for now).
 - extracts the compare from the first-emitted conditional move and uses
it for the subsequent conditional moves getting rid of all but the first
compare.

Test suite on s390 and i386 looks ok.

Some questions/remarks, comments welcome:
 - ifcvt performs creates things that it expects other passes to clean
up afterwards.  In the case of the superfluous compares this might be
possible but the code is actually wrong and we cannot rely on a pass
fixing wrong code.
 - The extraction of the compare from the conditional move is pretty
ad-hoc and pattern-dependent currently.  Is there a way to do it in a
more backend-independent fashion?
 - Branch mispredict costs don't seem to play a major role in ifcvt.
Shouldn't they be accounted for in all cases, maybe weighted by bb
probabilities?

Regards
 Robin
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 0c72098..e9cdf5d 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -78,6 +78,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "rtl-iter.h"
 #include "intl.h"
 #include "tm-constrs.h"
+#include "ifcvt.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -15671,6 +15672,22 @@ s390_asan_shadow_offset (void)
   return TARGET_64BIT ? HOST_WIDE_INT_1U << 52 : HOST_WIDE_INT_UC (0x20000000);
 }
 
+static bool
+s390_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
+{
+  bool speed_p = if_info->speed_p;
+
+  /* Cost up the new sequence.  */
+  unsigned int cost = seq_cost (seq, speed_p) - 5;
+
+  if (cost <= if_info->original_cost)
+    return true;
+
+  /* When compiling for size, we can make a reasonably accurately guess
+     at the size growth.  When compiling for speed, use the maximum.  */
+  return speed_p && cost <= if_info->max_seq_cost;
+}
+
 /* Initialize GCC target structure.  */
 
 #undef  TARGET_ASM_ALIGNED_HI_OP
@@ -15902,6 +15919,9 @@ s390_asan_shadow_offset (void)
 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT s390_support_vector_misalignment
 
+#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
+#define TARGET_NOCE_CONVERSION_PROFITABLE_P s390_noce_conversion_profitable_p
+
 #undef TARGET_VECTOR_ALIGNMENT
 #define TARGET_VECTOR_ALIGNMENT s390_vector_alignment
 
diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index fd682a4..e0b3f48 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -772,6 +772,8 @@ static int noce_try_store_flag_constants (struct noce_if_info *);
 static int noce_try_store_flag_mask (struct noce_if_info *);
 static rtx noce_emit_cmove (struct noce_if_info *, rtx, enum rtx_code, rtx,
 			    rtx, rtx, rtx);
+static rtx noce_emit_cmove (struct noce_if_info *, rtx, enum rtx_code, rtx,
+			    rtx, rtx, rtx, rtx);
 static int noce_try_cmove (struct noce_if_info *);
 static int noce_try_cmove_arith (struct noce_if_info *);
 static rtx noce_get_alt_condition (struct noce_if_info *, rtx, rtx_insn **);
@@ -1654,7 +1656,7 @@ noce_try_store_flag_mask (struct noce_if_info *if_info)
 
 static rtx
 noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
-		 rtx cmp_a, rtx cmp_b, rtx vfalse, rtx vtrue)
+		 rtx cmp_a, rtx cmp_b, rtx vfalse, rtx vtrue, rtx cached_cmp)
 {
   rtx target ATTRIBUTE_UNUSED;
   int unsignedp ATTRIBUTE_UNUSED;
@@ -1702,7 +1704,8 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
 
   target = emit_conditional_move (x, code, cmp_a, cmp_b, VOIDmode,
 				  vtrue, vfalse, GET_MODE (x),
-				  unsignedp);
+				  unsignedp, cached_cmp);
+
   if (target)
     return target;
 
@@ -1740,7 +1743,8 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
 
       target = emit_conditional_move (promoted_target, code, cmp_a, cmp_b,
 				      VOIDmode, reg_vtrue, reg_vfalse,
-				      GET_MODE (reg_vtrue), unsignedp);
+				      GET_MODE (reg_vtrue), unsignedp,
+				      cached_cmp);
       /* Nope, couldn't do it in that mode either.  */
       if (!target)
 	return NULL_RTX;
@@ -1755,6 +1759,14 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
     return NULL_RTX;
 }
 
+static rtx
+noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
+		 rtx cmp_a, rtx cmp_b, rtx vfalse, rtx vtrue)
+{
+  return noce_emit_cmove (if_info, x, code, cmp_a, cmp_b, vfalse, vtrue,
+			  NULL_RTX);
+}
+
 /* Try only simple constants and registers here.  More complex cases
    are handled in noce_try_cmove_arith after noce_try_store_flag_arith
    has had a go at it.  */
@@ -3074,6 +3086,54 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
   return false;
 }
 
+/* Precondition: valid_noce_bla_block.  */
+static void
+get_bb_cost (basic_block test_bb, unsigned int *cost)
+{
+  rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
+  rtx last_set = NULL_RTX;
+
+  last_set = single_set (last_insn);
+
+  rtx x = SET_DEST (last_set);
+  rtx_insn *first_insn = first_active_insn (test_bb);
+  rtx first_set = single_set (first_insn);
+
+  if (!first_set)
+    return;
+
+  /* We have a single simple set, that's okay.  */
+  bool speed_p = optimize_bb_for_speed_p (test_bb);
+
+  if (first_insn == last_insn)
+    {
+      *cost += insn_rtx_cost (first_set, speed_p);
+      return;
+    }
+
+  rtx_insn *prev_last_insn = PREV_INSN (last_insn);
+  gcc_assert (prev_last_insn);
+
+  int potential_cost = insn_rtx_cost (last_set, speed_p);
+  rtx_insn *insn;
+  FOR_BB_INSNS (test_bb, insn)
+    {
+      if (insn != last_insn)
+	{
+	  if (!active_insn_p (insn))
+	    continue;
+
+	  rtx sset = single_set (insn);
+	  gcc_assert (sset);
+
+	  potential_cost += insn_rtx_cost (sset, speed_p);
+	}
+    }
+
+  *cost += potential_cost;
+  return;
+}
+
 /* We have something like:
 
      if (x > y)
@@ -3142,6 +3202,10 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
   auto_vec<rtx_insn *> unmodified_insns;
   int count = 0;
 
+  rtx cached_cmp;
+  bool have_cached_cmp = false;
+  bool first = true;
+
   FOR_BB_INSNS (then_bb, insn)
     {
       /* Skip over non-insns.  */
@@ -3214,9 +3278,33 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
 	  old_val = lowpart_subreg (dst_mode, old_val, src_mode);
 	}
 
-      /* Actually emit the conditional move.  */
-      rtx temp_dest = noce_emit_cmove (if_info, temp, cond_code,
-				       x, y, new_val, old_val);
+      rtx temp_dest;
+      rtx_insn *cmove_insn;
+
+      if (first || !have_cached_cmp)
+	{
+	  temp_dest = noce_emit_cmove (if_info, temp, cond_code,
+				       x, y, new_val, old_val, NULL_RTX);
+	  if (temp_dest != NULL_RTX)
+	    {
+	      first = false;
+	      cmove_insn = get_last_insn ();
+	      if (cmove_insn != NULL_RTX
+		  && GET_CODE (PATTERN (cmove_insn)) == SET
+		  && GET_CODE (SET_SRC (PATTERN (cmove_insn))) == IF_THEN_ELSE
+		  && XEXP (SET_SRC (PATTERN (cmove_insn)), 0) != NULL_RTX)
+		{
+		  have_cached_cmp = true;
+		  cached_cmp = XEXP (SET_SRC (PATTERN (cmove_insn)), 0);
+		}
+	    }
+	}
+      else
+	{
+	  /* Actually emit the conditional move.  */
+	  temp_dest = noce_emit_cmove (if_info, temp, cond_code,
+				       x, y, new_val, old_val, cached_cmp);
+	}
 
       /* If we failed to expand the conditional move, drop out and don't
 	 try to continue.  */
@@ -3390,7 +3478,11 @@ noce_process_if_block (struct noce_if_info *if_info)
       && !HAVE_cc0
       && bb_ok_for_noce_convert_multiple_sets (then_bb))
     {
-      if (noce_convert_multiple_sets (if_info))
+      struct noce_if_info tmp_info = *if_info;
+      unsigned int cost = tmp_info.original_cost;
+      get_bb_cost (then_bb, &cost);
+      tmp_info.original_cost += cost;
+      if (noce_convert_multiple_sets (&tmp_info))
 	{
 	  if (dump_file && if_info->transform_name)
 	    fprintf (dump_file, "if-conversion succeeded through %s\n",
@@ -3796,6 +3888,7 @@ cond_move_convert_if_block (struct noce_if_info *if_infop,
 
       target = noce_emit_cmove (if_infop, dest, code, cond_arg0, cond_arg1,
 				t, e);
+
       if (!target)
 	return false;
 
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 8fd5d91..eb64749 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -4223,7 +4223,7 @@ emit_indirect_jump (rtx loc)
 rtx
 emit_conditional_move (rtx target, enum rtx_code code, rtx op0, rtx op1,
 		       machine_mode cmode, rtx op2, rtx op3,
-		       machine_mode mode, int unsignedp)
+		       machine_mode mode, int unsignedp, rtx cached_cmp)
 {
   rtx comparison;
   rtx_insn *last;
@@ -4305,7 +4305,10 @@ emit_conditional_move (rtx target, enum rtx_code code, rtx op0, rtx op1,
 	      struct expand_operand ops[4];
 
 	      create_output_operand (&ops[0], target, mode);
-	      create_fixed_operand (&ops[1], comparison);
+	      if (cached_cmp != NULL_RTX)
+		create_fixed_operand (&ops[1], cached_cmp);
+	      else
+		create_fixed_operand (&ops[1], comparison);
 	      create_input_operand (&ops[2], op2, mode);
 	      create_input_operand (&ops[3], op3, mode);
 	      if (maybe_expand_insn (icode, 4, ops))
@@ -4336,6 +4339,16 @@ emit_conditional_move (rtx target, enum rtx_code code, rtx op0, rtx op1,
     }
 }
 
+rtx
+emit_conditional_move (rtx target, enum rtx_code code, rtx op0, rtx op1,
+		       machine_mode cmode, rtx op2, rtx op3,
+		       machine_mode mode, int unsignedp)
+{
+  return emit_conditional_move (target, code, op0, op1, cmode, op2, op3,
+				mode, unsignedp, NULL_RTX);
+
+}
+
 
 /* Emit a conditional negate or bitwise complement using the
    negcc or notcc optabs if available.  Return NULL_RTX if such operations
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 07d07fe..b522c8b 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -262,6 +262,8 @@ extern void emit_indirect_jump (rtx);
 
 /* Emit a conditional move operation.  */
 rtx emit_conditional_move (rtx, enum rtx_code, rtx, rtx, machine_mode,
+			   rtx, rtx, machine_mode, int, rtx);
+rtx emit_conditional_move (rtx, enum rtx_code, rtx, rtx, machine_mode,
 			   rtx, rtx, machine_mode, int);
 
 /* Emit a conditional negate or bitwise complement operation.  */

Reply via email to