On 17/06/13 19:07, Richard Biener wrote:
On Mon, 17 Jun 2013, Kugan wrote:

Hi,

I am attempting to fix Bug 43721 - Failure to optimise (a/b) and (a%b) into
single __aeabi_idivmod call
(http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721)

execute_cse_sincos tree level pass does similar cse so I attempted to use
similar approach here. Div/mod cse is not really using built-in functions
though at this level.

The issue with performing the transform at the same time as we
transform SINCOS is that the vectorizer will now no longer be able
to vectorize these loops.  It would need to be taught how to
handle the builtin calls (basically undo the transformation, I don't
know of any ISA that can do vectorized combined div/mod).  Which
means it should rather be done at the point we CSE reciprocals
(which also replaces computes with builtin target function calls).

Thanks Richard. Since execute_cse_reciprocals is handling reciprocals only, I added another pass to do divmod. Is that OK?

For the case of div and mod operations, after CSE is performed, there isnt a
way to represent the resulting stament in gimple. We will endup with divmod
taking two arguments and returning double the size of one arguments in the
three address format (divmod will return reminder and quotient so the return
type is double the size of argument type).

Since GIMPLE_ASSIGN will result in type checking failure in this case, I
atempted use built-in functions (GIMPLE_CALL to represent the runtime library
call). Name for the function here  is target specific and can be obtained from
sdivmod_optab so the builtin function name defined in tree level is not used.
I am not entirelt sure this is the right approach so I am attaching the first
cut of the patch to get your feedback and understand the right approach to
this problem.

If we don't want to expose new builtins to the user (I'm not sure we
want that), then using "internal functions" is an easier way to
avoid these issues (see gimple.h and internal-fn.(def|h)).

I have now changed to use internal functions. Thanks for that.

Generally the transform looks useful to me as it moves forward with
the general idea of moving pattern recognition done during RTL expansion
to an earlier place.

For the use of a larger integer type and shifts to represent
the modulo and division result I don't think that's the very best
idea.  Instead resorting to a complex integer type as return
value looks more appealing (similar to sincos using cexpi here).
That way you also avoid the ugly hard-coding of bit-sizes.


I have changed it to use complex integers now.

+  if (HAVE_divsi3
+       || (GET_MODE_BITSIZE (TYPE_MODE (type)) != 32)

watch out for types whose TYPE_PRECISION is not the bitsize
of their mode.  Also it should be GET_MODE_PRECISION here.

+       || !optab_libfunc (TYPE_UNSIGNED (type)? udivmod_optab :
sdivmod_optab,
+                            TYPE_MODE (type)))

targets that use a libfunc should also get this optimization, as
it always removes computations.  I think the proper test is
for whether the target can do division and/or modulus without
using a libfunc, not whether there is a divmod optab/libfunc.

I guess best way to do is by defining a target hook and let the target define the required behaviour. Is that what you had in mind?

I have attached a modified patch with these changes.

Others knowing this piece of the compiler better may want to comment
here, of course.

Thanks,
Richard.




Thanks,
Kugan
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index f030b56..3fae80e 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11375,3 +11375,8 @@ It returns true if the target supports GNU indirect functions.
 The support includes the assembler, linker and dynamic linker.
 The default value of this hook is based on target's libc.
 @end deftypefn
+
+@deftypefn {Target Hook} bool TARGET_COMBINE_DIVMOD (enum machine_mode @var{mode})
+This target hook returns @code{true} if the target provides divmod libcall operation for the machine mode @var{mode} and must be used to combine integer division and modulus operations. Return @code{false} otherwise.
+@end deftypefn
+
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index cc25fec..12974b1 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -11198,3 +11198,6 @@ memory model bits are allowed.
 @hook TARGET_ATOMIC_TEST_AND_SET_TRUEVAL
 
 @hook TARGET_HAS_IFUNC_P
+
+@hook TARGET_COMBINE_DIVMOD
+
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index b841abd..0db06f1 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -61,6 +61,62 @@ get_multi_vector_move (tree array_type, convert_optab optab)
   return icode;
 }
 
+/* Expand DIVMOD call STMT.  */
+static void
+expand_DIVMOD (gimple stmt)
+{
+  tree type, lhs, arg0, arg1;
+  rtx op0, op1, res0, res1, target;
+  enum machine_mode mode, compute_mode;
+  rtx libval;
+  rtx libfunc = NULL_RTX;
+  bool is_unsigned;
+
+  lhs = gimple_call_lhs (stmt);
+  arg0 = gimple_call_arg (stmt, 0);
+  arg1 = gimple_call_arg (stmt, 1);
+  mode = TYPE_MODE (TREE_TYPE (arg0));
+  is_unsigned = TYPE_UNSIGNED (TREE_TYPE (arg0));
+
+  op0 = expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  op1 = expand_expr (arg1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+
+  /* Now convert to the best mode to use.  */
+  for (compute_mode = mode; compute_mode != VOIDmode;
+       compute_mode = GET_MODE_WIDER_MODE (compute_mode))
+    if (optab_libfunc (is_unsigned ? udivmod_optab : sdivmod_optab, compute_mode))
+      break;
+
+  if (compute_mode != mode)
+    {
+      op0 = convert_modes (compute_mode, mode, op0, is_unsigned);
+      op1 = convert_modes (compute_mode, mode, op1, is_unsigned);
+    }
+
+  /* Get the libcall.  */
+  libfunc = optab_libfunc (is_unsigned ? udivmod_optab : sdivmod_optab, compute_mode);
+  gcc_assert (libfunc);
+  machine_mode libval_mode
+    = smallest_mode_for_size (2 * GET_MODE_BITSIZE (compute_mode),
+                              MODE_INT);
+  libval = emit_library_call_value (libfunc, NULL_RTX, LCT_CONST,
+                                    libval_mode, 2,
+                                    op0, compute_mode,
+                                    op1, compute_mode);
+
+  /* Get quotient and remainder into two registers.  */
+  res0 = simplify_gen_subreg (mode, libval, libval_mode, 0);
+  res1 = simplify_gen_subreg (mode, libval, libval_mode, GET_MODE_SIZE (compute_mode));
+
+  /* Now build the complex integer target.  */
+  expand_expr (build2 (COMPLEX_EXPR, TREE_TYPE (lhs),
+                       make_tree (TREE_TYPE (arg0), res0),
+                       make_tree (TREE_TYPE (arg1), res1)),
+               target, VOIDmode, EXPAND_NORMAL);
+}
+
+
 /* Expand LOAD_LANES call STMT.  */
 
 static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 8900d90..4c5d4e0 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -40,3 +40,4 @@ along with GCC; see the file COPYING3.  If not see
 
 DEF_INTERNAL_FN (LOAD_LANES, ECF_CONST | ECF_LEAF)
 DEF_INTERNAL_FN (STORE_LANES, ECF_CONST | ECF_LEAF)
+DEF_INTERNAL_FN (DIVMOD, ECF_CONST | ECF_LEAF)
diff --git a/gcc/passes.c b/gcc/passes.c
index c8b03ee..03cea4c 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -1492,6 +1492,7 @@ init_optimization_passes (void)
 	}
       NEXT_PASS (pass_lower_vector_ssa);
       NEXT_PASS (pass_cse_reciprocals);
+      NEXT_PASS (pass_cse_divmod);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_strength_reduction);
       NEXT_PASS (pass_dominator);
diff --git a/gcc/target.def b/gcc/target.def
index 3ba3e0a..8036a97 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2500,6 +2500,15 @@ DEFHOOK
  bool, (const_tree field, enum machine_mode mode),
  default_member_type_forces_blk)
 
+/* True if div and mod operations for MODE should be combined.  */
+DEFHOOK
+(combine_divmod,
+ "This target hook returns @code{true} if the target provides divmod libcall\
+ operation for the machine mode @var{mode} and must be used to combine\
+ integer division and modulus operations. Return @code{false} otherwise.",
+ bool, (enum machine_mode mode),
+ default_combine_divmod)
+
 /* Return the class for a secondary reload, and fill in extra information.  */
 DEFHOOK
 (secondary_reload,
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index d3a3f5f..afa8076 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1560,6 +1560,14 @@ default_member_type_forces_blk (const_tree, enum machine_mode)
   return false;
 }
 
+/* Default version of combine_divmod.  */
+
+bool
+default_combine_divmod (enum machine_mode)
+{
+  return false;
+}
+
 /* Default version of canonicalize_comparison.  */
 
 void
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 2da6fb8..97dd036 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -198,3 +198,5 @@ extern void default_asm_output_ident_directive (const char*);
 
 extern enum machine_mode default_cstore_mode (enum insn_code);
 extern bool default_member_type_forces_blk (const_tree, enum machine_mode);
+
+extern bool default_combine_divmod (enum machine_mode);
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index b8c59a7..4fe7d24 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -329,6 +329,7 @@ extern struct gimple_opt_pass pass_stdarg;
 extern struct gimple_opt_pass pass_early_warn_uninitialized;
 extern struct gimple_opt_pass pass_late_warn_uninitialized;
 extern struct gimple_opt_pass pass_cse_reciprocals;
+extern struct gimple_opt_pass pass_cse_divmod;
 extern struct gimple_opt_pass pass_cse_sincos;
 extern struct gimple_opt_pass pass_optimize_bswap;
 extern struct gimple_opt_pass pass_optimize_widening_mul;
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index e9c32b3..8e09fe9 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -147,6 +147,15 @@ static struct
 
 static struct
 {
+  /* Number of combined dovmod calls inserted.  */
+  int divmod_calls_inserted;
+
+  /* Number of instructions uses divmod result.  */
+  int divmod_result_used;
+} divmod_stats;
+
+static struct
+{
   /* Number of cexpi calls inserted.  */
   int inserted;
 } sincos_stats;
@@ -664,7 +673,7 @@ struct gimple_opt_pass pass_cse_reciprocals =
    statements in the vector.  */
 
 static bool
-maybe_record_sincos (vec<gimple> *stmts,
+maybe_record_stmt (vec<gimple> *stmts,
 		     basic_block *top_bb, gimple use_stmt)
 {
   basic_block use_bb = gimple_bb (use_stmt);
@@ -684,6 +693,229 @@ maybe_record_sincos (vec<gimple> *stmts,
   return true;
 }
 
+/* Look for div and mod statements with the same operands and
+   create a library call if needed.  We first walk over all
+   immediate uses of one of the operand looking for matched statement
+   that we can combine.  In a second pass replace the statement with
+   a library calls and statements with the results from library call.  */
+
+static bool
+execute_cse_div_mod_1 (gimple stmt)
+{
+  gimple_stmt_iterator gsi;
+  imm_use_iterator use_iter;
+  gimple use_stmt;
+  bool found = false;
+  vec<gimple> stmts = vNULL;
+  basic_block top_bb = NULL;
+  bool cfg_changed = false;
+  int i;
+  tree type, op1, op2;
+
+  op1 = gimple_assign_rhs1 (stmt);
+  op2 = gimple_assign_rhs2 (stmt);
+  type = TREE_TYPE (op1);
+
+  /* Skip if both the operands and constant. */
+  if (TREE_CODE (op1) == INTEGER_CST && TREE_CODE (op2) == INTEGER_CST)
+    return false;
+
+  /* Skip if the target doesnt support or require it.  */
+  if (!targetm.combine_divmod (TYPE_MODE (type)))
+    return false;
+
+  stmts.safe_push (stmt);
+  top_bb = gimple_bb (stmt);
+
+  /* look fpr a TRUNC_MOD_EXPR coresponding to stmt
+     TRUNC_DIV_EXPR s to combine.  */
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, (TREE_CODE (op2) == SSA_NAME) ? op2 : op1)
+    {
+      if (is_gimple_assign (use_stmt)
+          && (gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR))
+        {
+          tree rhs1 = gimple_assign_rhs1 (use_stmt);
+          tree rhs2 = gimple_assign_rhs2 (use_stmt);
+
+          if ((rhs1 == op1 && rhs2 == op2)
+              ||(rhs1 == op1 && rhs2 == op2))
+            found |= maybe_record_stmt (&stmts, &top_bb, use_stmt);
+        }
+    }
+
+  /* If we have matched instructions to combine, try creatiing Library
+     call and use the results.  */
+  if (found)
+    {
+      gimple def_stmt1 = NULL, def_stmt2 = NULL;
+      bool insert_after_op1_def, insert_after_op2_def;
+      tree res, rhs;
+      gimple assign_stmt, call_stmt;
+      tree return_type = build_complex_type (type);
+
+      if (TREE_CODE (op1) == SSA_NAME)
+        def_stmt1 = SSA_NAME_DEF_STMT (op1);
+      if (TREE_CODE (op2) == SSA_NAME)
+        def_stmt2 = SSA_NAME_DEF_STMT (op2);
+
+      /* Is the call to be insterted adter op1 define stmt.  */
+      insert_after_op1_def = TREE_CODE (op1) == SSA_NAME
+                            && !SSA_NAME_IS_DEFAULT_DEF (op1)
+                            && gimple_code (def_stmt1) != GIMPLE_PHI
+                            && gimple_bb (def_stmt1) == top_bb;
+
+      /* Is the call to be insterted adter op2 define stmt.  */
+      insert_after_op2_def = TREE_CODE (op2) == SSA_NAME
+                            && !SSA_NAME_IS_DEFAULT_DEF (op2)
+                            && gimple_code (def_stmt2) != GIMPLE_PHI
+                            && gimple_bb (def_stmt2) == top_bb;
+
+      /* Create a library call and instert it at the place idenified.  */
+      call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
+      res = make_temp_ssa_name (return_type,
+                                call_stmt, "divmod");
+      gimple_call_set_lhs (call_stmt, res);
+      divmod_stats.divmod_calls_inserted++;
+
+      /* Insert call at the beginning of top_bb but not earlier
+         than the name def statement.  */
+      if (insert_after_op1_def || insert_after_op2_def)
+        {
+          if (insert_after_op1_def && insert_after_op2_def)
+            {
+              for (gsi = gsi_start_bb (top_bb);
+                   !gsi_end_p (gsi); gsi_next (&gsi))
+                {
+                  gimple g = gsi_stmt (gsi);
+                  if (g == def_stmt1)
+                    {
+                      gsi = gsi_for_stmt (def_stmt2);
+                      break;
+                    }
+                  else if (g == def_stmt2)
+                    {
+                      gsi = gsi_for_stmt (def_stmt1);
+                      break;
+                    }
+                }
+            }
+
+          /* Only one of the definition is in the basic block, place after
+             that.  */
+          else if (insert_after_op1_def)
+            gsi = gsi_for_stmt (def_stmt1);
+          else
+            gsi = gsi_for_stmt (def_stmt2);
+
+          gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
+        }
+      else
+        {
+          /* op1 and op2 are defined before the top_bb.  Insert as first
+             non label instruction.  */
+          gsi = gsi_after_labels (top_bb);
+          gsi_insert_before (&gsi, call_stmt, GSI_SAME_STMT);
+        }
+
+      /* Adjust the recorded old statements to use the results.  */
+      for (i = 0; stmts.iterate (i, &use_stmt); ++i)
+        {
+          gsi = gsi_for_stmt (use_stmt);
+          divmod_stats.divmod_result_used++;
+
+          switch (gimple_assign_rhs_code (use_stmt))
+            {
+            case TRUNC_MOD_EXPR:
+              /* Get the top 32 bit from result and assign it.  */
+              rhs = fold_build1 (IMAGPART_EXPR, type, res);
+              break;
+
+            case TRUNC_DIV_EXPR:
+              /* Get the bottom 32 bit from result and assign it.  */
+              rhs = fold_build1 (REALPART_EXPR, type, res);
+              break;
+
+            default:
+              gcc_unreachable ();
+            }
+
+          /* Replace the statement with a copy.  */
+          assign_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt), rhs);
+          gsi_replace (&gsi, assign_stmt, true);
+
+          if (gimple_purge_dead_eh_edges (gimple_bb (assign_stmt)))
+            cfg_changed = true;
+        }
+    }
+
+  stmts.release ();
+  return cfg_changed;
+}
+
+/* Go through all the floating-point SSA_NAMEs, and call
+   execute_cse_reciprocals_1 on each of them.  */
+static unsigned int
+execute_cse_divmod (void)
+{
+  basic_block bb;
+
+  memset (&divmod_stats, 0, sizeof (divmod_stats));
+  calculate_dominance_info (CDI_DOMINATORS);
+  calculate_dominance_info (CDI_POST_DOMINATORS);
+  FOR_EACH_BB (bb)
+    {
+      gimple_stmt_iterator gsi;
+      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+        {
+          gimple stmt = gsi_stmt (gsi);
+
+          if (is_gimple_assign (stmt)
+              && (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_binary)
+              && (gimple_assign_rhs_code (stmt) ==  TRUNC_DIV_EXPR))
+            {
+              execute_cse_div_mod_1 (stmt);
+            }
+        }
+    }
+
+  statistics_counter_event (cfun, "number of combined divmod calls inserted",
+                            divmod_stats.divmod_calls_inserted);
+  statistics_counter_event (cfun, "number of instructions uses divmod result",
+                            divmod_stats.divmod_result_used);
+
+  free_dominance_info (CDI_DOMINATORS);
+  free_dominance_info (CDI_POST_DOMINATORS);
+  return 0;
+}
+
+
+static bool
+gate_cse_divmod (void)
+{
+  return optimize;
+}
+
+struct gimple_opt_pass pass_cse_divmod =
+{
+ {
+  GIMPLE_PASS,
+  "divmod",                             /* name */
+  OPTGROUP_NONE,                        /* optinfo_flags */
+  gate_cse_divmod,                      /* gate */
+  execute_cse_divmod,                   /* execute */
+  NULL,                                 /* sub */
+  NULL,                                 /* next */
+  0,                                    /* static_pass_number */
+  TV_NONE,                              /* tv_id */
+  PROP_ssa,                             /* properties_required */
+  0,                                    /* properties_provided */
+  0,                                    /* properties_destroyed */
+  0,                                    /* todo_flags_start */
+  TODO_update_ssa | TODO_verify_ssa
+    | TODO_verify_stmts                /* todo_flags_finish */
+ }
+};
+
 /* Look for sin, cos and cexpi calls with the same argument NAME and
    create a single call to cexpi CSEing the result in this case.
    We first walk over all immediate uses of the argument collecting
@@ -717,15 +949,15 @@ execute_cse_sincos_1 (tree name)
       switch (DECL_FUNCTION_CODE (fndecl))
 	{
 	CASE_FLT_FN (BUILT_IN_COS):
-	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
+          seen_cos |= maybe_record_stmt (&stmts, &top_bb, use_stmt) ? 1 : 0;
 	  break;
 
 	CASE_FLT_FN (BUILT_IN_SIN):
-	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
+          seen_sin |= maybe_record_stmt (&stmts, &top_bb, use_stmt) ? 1 : 0;
 	  break;
 
 	CASE_FLT_FN (BUILT_IN_CEXPI):
-	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
+          seen_cexpi |= maybe_record_stmt (&stmts, &top_bb, use_stmt) ? 1 : 0;
 	  break;
 
 	default:;

Reply via email to