New version of the patch, with all of Richard Sandiford's comments
applied and retested.
Ok for commit?
Kenny
2012-03-31 Kenneth Zadeck <zad...@naturalbridge.com>
* toplev.c (backend_init_target): Call initializer for lower-subreg
pass.
* lower-subreg.c (target_info): New static var.
(compute_move_cost, profitable_shift_p, init_lower_subreg): New
functions.
(find_pseudo_copy, resolve_simple_move): Added code to only split
based on costs.
(find_decomposable_subregs): Added code to mark as decomposable
moves that are not profitable.
(find_decomposable_shift_zext): Added code to only decompose
shifts and zext if profitable.
(resolve_shift_zext): Added comment.
(decompose_multiword_subregs): Dump list of profitable
transformations. Add code to skip non profitable transformations.
*rtl.h(init_lower_subreg): Added declaration.
Index: toplev.c
===================================================================
--- toplev.c (revision 186034)
+++ toplev.c (working copy)
@@ -1660,6 +1660,7 @@ backend_init_target (void)
/* rtx_cost is mode-dependent, so cached values need to be recomputed
on a mode change. */
init_expmed ();
+ init_lower_subreg ();
/* We may need to recompute regno_save_code[] and regno_restore_code[]
after a mode change as well. */
Index: lower-subreg.c
===================================================================
--- lower-subreg.c (revision 186034)
+++ lower-subreg.c (working copy)
@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.
#include "tree-pass.h"
#include "df.h"
+
#ifdef STACK_GROWS_DOWNWARD
# undef STACK_GROWS_DOWNWARD
# define STACK_GROWS_DOWNWARD 1
@@ -52,10 +53,36 @@ DEF_VEC_P (bitmap);
DEF_VEC_ALLOC_P (bitmap,heap);
/* Decompose multi-word pseudo-registers into individual
- pseudo-registers when possible. This is possible when all the uses
- of a multi-word register are via SUBREG, or are copies of the
- register to another location. Breaking apart the register permits
- more CSE and permits better register allocation. */
+ pseudo-registers when possible and profitable. This is possible
+ when all the uses of a multi-word register are via SUBREG, or are
+ copies of the register to another location. Breaking apart the
+ register permits more CSE and permits better register allocation.
+ This is profitable if the machine does not have move instructions
+ to do this.
+
+ This pass only splits moves with modes that are wider than
+ word_mode and ASHIFTs, LSHIFTRTs and ZERO_EXTENDs with integer
+ modes that are twice the width of word_mode. The latter could be
+ generalized if there was a need to do this, but the trend in
+ architectures is to not need this.
+
+ There are two useful preprocessor defines for use by maintainers:
+
+ #define LOG_COSTS
+
+ if you wish to see the actual cost estimates that are being used
+ for each mode wider than word mode and the cost estimates for zero
+ extension and the shifts. This can be useful when port maintainers
+ are tuning insn rtx costs.
+
+ #define FORCE_LOWERING
+
+ if you wish to test the pass with all the transformation forced on.
+ This can be useful for finding bugs in the transformations.
+
+ Both of these should not be enabled at the same time. */
+
+#define FORCE_LOWERING
/* Bit N in this bitmap is set if regno N is used in a context in
which we can decompose it. */
@@ -75,8 +102,188 @@ static bitmap subreg_context;
copy from reg M to reg N. */
static VEC(bitmap,heap) *reg_copy_graph;
-/* Return whether X is a simple object which we can take a word_mode
- subreg of. */
+static struct {
+
+ /* This pass can transform 4 different operations: move, ashift,
+ lshiftrt, and zero_extend. There is a boolean vector for move
+ splitting that is indexed by mode and is true for each mode that is
+ to have its copies split. The other three operations are only done
+ for one mode so they are only controlled by a single boolean .*/
+ bool move_modes_to_split[MAX_MACHINE_MODE];
+
+ /* Other splitting operations to be done on the this platform. */
+ bool splitting_ashift[MAX_BITS_PER_WORD];
+ bool splitting_lshiftrt[MAX_BITS_PER_WORD];
+ bool splitting_zext;
+
+ bool something_to_do;
+
+ /* Precomputed cost values used to determine if lowering shift
+ operations is profitable. */
+ int word_mode_move_cost;
+ int move_zero_cost;
+} target_info;
+
+/* See what the move cost is. */
+static int
+compute_move_cost (enum machine_mode mode)
+{
+ rtx src = gen_rtx_REG (mode, FIRST_PSEUDO_REGISTER);
+ rtx trg = gen_rtx_REG (mode, FIRST_PSEUDO_REGISTER + 1);
+ rtx pat = gen_rtx_SET (VOIDmode, trg, src);
+
+ return insn_rtx_cost (pat, true);
+}
+
+
+/* Return true if it is profitable to lower and shift by SHIFT_AMT.
+ CODE can be either ASHIFT or LSHIFTRT. */
+static bool
+profitable_shift_p (enum rtx_code code, int shift_amt)
+{
+ rtx trg = gen_rtx_REG (GET_MODE_WIDER_MODE (word_mode), FIRST_PSEUDO_REGISTER);
+ rtx src = gen_rtx_REG (GET_MODE_WIDER_MODE (word_mode), FIRST_PSEUDO_REGISTER + 1);
+ int wide_cost
+ = insn_rtx_cost (gen_rtx_SET (VOIDmode, trg,
+ gen_rtx_fmt_ee (code, GET_MODE_WIDER_MODE (word_mode),
+ src, GEN_INT (shift_amt))),
+ true);
+#ifdef FORCE_LOWERING
+ return true;
+#endif
+
+#ifdef LOG_COSTS
+ fprintf (stderr, "shift code %d, shift amt %d, wide_cost %d\n",
+ code, shift_amt, wide_cost);
+#endif
+ if (shift_amt == BITS_PER_WORD)
+ return wide_cost
+ >= target_info.word_mode_move_cost + target_info.move_zero_cost;
+ else
+ {
+ int narrow_cost;
+
+ trg = gen_rtx_REG (word_mode, FIRST_PSEUDO_REGISTER);
+ src = gen_rtx_REG (word_mode, FIRST_PSEUDO_REGISTER + 1);
+ narrow_cost
+ = insn_rtx_cost (gen_rtx_SET
+ (VOIDmode, trg,
+ gen_rtx_fmt_ee (code, word_mode,
+ src,
+ GEN_INT (shift_amt - BITS_PER_WORD))),
+ true);
+
+#ifdef LOG_COSTS
+ fprintf (stderr, "narrow_cost %d\n", narrow_cost);
+#endif
+ return wide_cost > narrow_cost + target_info.move_zero_cost;
+ }
+}
+
+
+/* Initialize pass once per execution. This envolves determining
+ which operations on the machine are profitable. If none are found,
+ then the pass just returns when called. */
+
+void
+init_lower_subreg (void)
+{
+ rtx trg = gen_rtx_REG (word_mode, FIRST_PSEUDO_REGISTER);
+ rtx pat;
+ unsigned int i;
+ int factor;
+
+ memset (&target_info, 0, sizeof target_info);
+
+ target_info.word_mode_move_cost = compute_move_cost (word_mode);
+ target_info.move_zero_cost
+ = insn_rtx_cost (gen_rtx_SET (VOIDmode, trg,
+ CONST0_RTX (word_mode)), true);
+
+#ifdef LOG_COSTS
+ fprintf (stderr, "word mode move cost %d\n", target_info.word_mode_move_cost);
+ fprintf (stderr, "move 0 cost %d\n", target_info.move_zero_cost);
+#endif
+ for (i = 0; i < MAX_MACHINE_MODE; i++)
+ {
+ int t;
+ factor = GET_MODE_SIZE (i) / UNITS_PER_WORD;
+
+ if (factor <= 1)
+ continue;
+
+ t = compute_move_cost ((enum machine_mode) i);
+
+#ifdef LOG_COSTS
+ fprintf (stderr, "mode %s, move cost %d, factor %d\n",
+ mode_name[i], t, factor);
+#endif
+ if (t >= target_info.word_mode_move_cost * factor)
+ {
+ target_info.move_modes_to_split[i] = true;
+ target_info.something_to_do = true;
+ }
+#ifdef FORCE_LOWERING
+ target_info.move_modes_to_split[i] = true;
+ target_info.something_to_do = true;
+#endif
+ }
+
+ /* For the moves and shifts, the only case that is checked is one
+ where the mode of the target is an integer mode twice the width
+ of the word_mode.
+
+ If it is not profitable to split a double word move then do not
+ even consider the shifts or the zero extension. */
+ if (target_info.move_modes_to_split[(int) GET_MODE_WIDER_MODE (word_mode)])
+ {
+ /* The only case here to check to see if moving the upper part with a
+ zero is cheaper than doing the zext itself. */
+ pat = gen_rtx_SET (VOIDmode, trg,
+ gen_rtx_ZERO_EXTEND (GET_MODE_WIDER_MODE (word_mode),
+ gen_reg_rtx (word_mode)));
+
+ if (insn_rtx_cost (pat, true)
+ >= target_info.word_mode_move_cost + target_info.move_zero_cost)
+ {
+ target_info.splitting_zext = true;
+ target_info.something_to_do = true;
+ }
+
+#ifdef FORCE_LOWERING
+ target_info.splitting_zext = true;
+ target_info.something_to_do = true;
+#endif
+ /* For the shifts there are three shift amounts that need to be
+ checked: word_mode, word_mode + 1 and word_mode * 1.5. The first
+ of these can be done without a shift. The second and third case
+ are the same on large machines but may be different on small
+ machines that special case shift 1 and 2. If any of these are
+ found to be useful, then we set the flags to consider those cases
+ and when the actual shift amount is known, redo the cost
+ calculation. */
+
+ for (i = 0; i < BITS_PER_WORD; i++)
+ {
+ bool p = profitable_shift_p (ASHIFT, BITS_PER_WORD + i);
+
+ target_info.splitting_ashift[i] = p;
+ target_info.something_to_do |= p;
+
+ p = profitable_shift_p (LSHIFTRT, BITS_PER_WORD + i);
+
+ target_info.splitting_lshiftrt[i] = p;
+ target_info.something_to_do |= p;
+
+#ifdef FORCE_LOWERING
+ target_info.splitting_ashift[i] = true;
+ target_info.splitting_lshiftrt[i] = true;
+ target_info.something_to_do = true;
+#endif
+ }
+ }
+}
+
static bool
simple_move_operand (rtx x)
@@ -173,7 +380,7 @@ find_pseudo_copy (rtx set)
if (HARD_REGISTER_NUM_P (rd) || HARD_REGISTER_NUM_P (rs))
return false;
- if (GET_MODE_SIZE (GET_MODE (dest)) <= UNITS_PER_WORD)
+ if (!target_info.move_modes_to_split[(int) GET_MODE (dest)])
return false;
b = VEC_index (bitmap, reg_copy_graph, rs);
@@ -335,6 +542,11 @@ find_decomposable_subregs (rtx *px, void
bitmap_set_bit (decomposable_context, regno);
break;
case SIMPLE_MOVE:
+ /* If this is marked as a simple move with a mode this
+ large, it is because find_pseudo_copy returned false
+ because this was not a mode we wanted to split. */
+ if (!target_info.move_modes_to_split[(int) GET_MODE (x)])
+ bitmap_set_bit (non_decomposable_context, regno);
break;
default:
gcc_unreachable ();
@@ -668,7 +887,7 @@ resolve_simple_move (rtx set, rtx insn)
orig_mode = GET_MODE (dest);
words = (GET_MODE_SIZE (orig_mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
- if (words <= 1)
+ if (!target_info.move_modes_to_split[(int) GET_MODE (dest)])
return insn;
start_sequence ();
@@ -941,17 +1160,29 @@ find_decomposable_shift_zext (rtx insn)
rtx set;
rtx op;
rtx op_operand;
+ enum machine_mode mode;
set = single_set (insn);
if (!set)
return 0;
op = SET_SRC (set);
- if (GET_CODE (op) != ASHIFT
- && GET_CODE (op) != LSHIFTRT
- && GET_CODE (op) != ZERO_EXTEND)
+ mode = GET_MODE (op);
+
+ if (mode != GET_MODE_WIDER_MODE (word_mode))
return 0;
+ switch (GET_CODE (op))
+ {
+ case ASHIFT:
+ case LSHIFTRT:
+ case ZERO_EXTEND:
+ break;
+
+ default:
+ return 0;
+ }
+
op_operand = XEXP (op, 0);
if (!REG_P (SET_DEST (set)) || !REG_P (op_operand)
|| HARD_REGISTER_NUM_P (REGNO (SET_DEST (set)))
@@ -959,25 +1190,51 @@ find_decomposable_shift_zext (rtx insn)
|| !SCALAR_INT_MODE_P (GET_MODE (op)))
return 0;
- if (GET_CODE (op) == ZERO_EXTEND)
- {
- if (GET_MODE (op_operand) != word_mode
- || GET_MODE_BITSIZE (GET_MODE (op)) != 2 * BITS_PER_WORD)
- return 0;
- }
- else /* left or right shift */
+ switch (GET_CODE (op))
{
- if (!CONST_INT_P (XEXP (op, 1))
- || INTVAL (XEXP (op, 1)) < BITS_PER_WORD
- || GET_MODE_BITSIZE (GET_MODE (op_operand)) != 2 * BITS_PER_WORD)
+ case ASHIFT:
+ {
+ int shift_val;
+
+ if (!CONST_INT_P (XEXP (op, 1))
+ || INTVAL (XEXP (op, 1)) < BITS_PER_WORD)
+ return 0;
+
+ shift_val = INTVAL (XEXP (op, 1));
+ if (target_info.splitting_ashift[shift_val - BITS_PER_WORD])
+ return 0;
+
+ bitmap_set_bit (decomposable_context, REGNO (op_operand));
+ break;
+ }
+
+ case LSHIFTRT:
+ {
+ int shift_val;
+
+ if (!CONST_INT_P (XEXP (op, 1))
+ || INTVAL (XEXP (op, 1)) < BITS_PER_WORD)
+ return 0;
+
+ shift_val = INTVAL (XEXP (op, 1));
+ if (target_info.splitting_lshiftrt[shift_val - BITS_PER_WORD])
+ return 0;
+
+ bitmap_set_bit (decomposable_context, REGNO (op_operand));
+ break;
+ }
+
+ case ZERO_EXTEND:
+ if (GET_MODE (op_operand) != word_mode || !target_info.splitting_zext)
return 0;
+ break;
+
+ default:
+ gcc_unreachable ();
}
bitmap_set_bit (decomposable_context, REGNO (SET_DEST (set)));
- if (GET_CODE (op) != ZERO_EXTEND)
- bitmap_set_bit (decomposable_context, REGNO (op_operand));
-
return 1;
}
@@ -1008,6 +1265,8 @@ resolve_shift_zext (rtx insn)
op_operand = XEXP (op, 0);
+ /* We can tear this operation apart only if the regs were already
+ torn apart. */
if (!resolve_reg_p (SET_DEST (set)) && !resolve_reg_p (op_operand))
return NULL_RTX;
@@ -1083,8 +1351,57 @@ decompose_multiword_subregs (void)
unsigned int max;
basic_block bb;
- if (df)
- df_set_flags (DF_DEFER_INSN_RESCAN);
+ if (dump_file)
+ {
+ unsigned int i;
+ const char *sep;
+
+ for (i = 0; i < MAX_MACHINE_MODE; i++)
+ {
+ if (GET_MODE_SIZE (i) > UNITS_PER_WORD)
+ fprintf (dump_file, "%s mode %s for copy lowering.\n",
+ target_info.move_modes_to_split[i] ? "Splitting" : "Skipping", mode_name[i]);
+ }
+
+ fprintf (dump_file, "%s mode %s for zero_extend lowering.\n",
+ target_info.splitting_zext ? "Splitting" : "Skipping",
+ mode_name[GET_MODE_WIDER_MODE (word_mode)]);
+
+
+ fprintf (dump_file, "Splitting mode %s for ashift lowering with shift amounts = ",
+ mode_name[GET_MODE_WIDER_MODE (word_mode)]);
+ sep = "";
+ for (i = 0; i < BITS_PER_WORD; i++)
+ {
+ if (target_info.splitting_ashift[i])
+ {
+ fprintf (dump_file, "%s%d", sep, i + BITS_PER_WORD);
+ sep = ",";
+ }
+ }
+
+ fprintf (dump_file, "\nSplitting mode %s for lshiftrt lowering with shift amounts = ",
+ mode_name[GET_MODE_WIDER_MODE (word_mode)]);
+ sep = "";
+ for (i = 0; i < BITS_PER_WORD; i++)
+ {
+ if (target_info.splitting_lshiftrt[i])
+ {
+ fprintf (dump_file, "%s%d", sep, i + BITS_PER_WORD);
+ sep = ",";
+ }
+ }
+
+ fprintf (dump_file, "\n");
+
+ if (!target_info.something_to_do)
+ fprintf (dump_file, "Nothing to lower for port.\n\n");
+ }
+
+
+ /* Check if this target even has any modes to consider lowering. */
+ if (!target_info.something_to_do)
+ return;
max = max_reg_num ();
@@ -1094,24 +1411,39 @@ decompose_multiword_subregs (void)
all the insns. */
{
unsigned int i;
+ bool useful_modes_seen = false;
for (i = FIRST_PSEUDO_REGISTER; i < max; ++i)
{
- if (regno_reg_rtx[i] != NULL
- && GET_MODE_SIZE (GET_MODE (regno_reg_rtx[i])) > UNITS_PER_WORD)
- break;
+ if (regno_reg_rtx[i] != NULL)
+ {
+ enum machine_mode mode = GET_MODE (regno_reg_rtx[i]);
+ if (regno_reg_rtx[i] != NULL && target_info.move_modes_to_split [mode])
+ {
+ useful_modes_seen = true;
+ break;
+ }
+ }
+ }
+
+ if (!useful_modes_seen)
+ {
+ if (dump_file)
+ fprintf (dump_file, "Nothing to lower in this function.\n");
+ return;
}
- if (i == max)
- return;
}
- if (df)
- run_word_dce ();
+ if (df)
+ {
+ df_set_flags (DF_DEFER_INSN_RESCAN);
+ run_word_dce ();
+ }
- /* FIXME: When the dataflow branch is merged, we can change this
- code to look for each multi-word pseudo-register and to find each
- insn which sets or uses that register. That should be faster
- than scanning all the insns. */
+ /* FIXME: It may be possible to change this code to look for each
+ multi-word pseudo-register and to find each insn which sets or
+ uses that register. That should be faster than scanning all the
+ insns. */
decomposable_context = BITMAP_ALLOC (NULL);
non_decomposable_context = BITMAP_ALLOC (NULL);
Index: rtl.h
===================================================================
--- rtl.h (revision 186034)
+++ rtl.h (working copy)
@@ -2523,6 +2523,9 @@ extern void init_expmed (void);
extern void expand_inc (rtx, rtx);
extern void expand_dec (rtx, rtx);
+/* In lower-subreg.c */
+extern void init_lower_subreg (void);
+
/* In gcse.c */
extern bool can_copy_p (enum machine_mode);
extern bool can_assign_to_reg_without_clobbers_p (rtx);