Ping. Thanks, Bill
On Thu, 2012-05-03 at 09:33 -0500, William J. Schmidt wrote: > This patch was posted for comment back in February during stage 4. It > addresses a performance issue noted in the EEMBC routelookup benchmark > on a common idiom: > > if (...) > x = y->left; > else > x = y->right; > > If the two loads can be hoisted out of the if/else, the if/else can be > replaced by a conditional move instruction on architectures that support > one. Because this speculates one of the loads, the patch constrains the > optimization to avoid introducing page faults. > > Bootstrapped and regression tested on powerpc-unknown-linux-gnu with no > new failures. The patch provides significant improvement to the > routelookup benchmark, and is neutral on SPEC cpu2000/cpu2006. > > One question is what optimization level should be required for this. > Because of the speculation, -O3 might be in order. I don't believe > -Ofast is required as there is no potential correctness issue involved. > Right now the patch doesn't check the optimization level (like the rest > of the phi-opt transforms), which is likely a poor choice. > > Ok for trunk? > > Thanks, > Bill > > > 2012-05-03 Bill Schmidt <wschm...@linux.vnet.ibm.com> > > * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward > declaration. > (hoist_adjacent_loads, gate_hoist_loads): New forward declarations. > (tree_ssa_phiopt): Call gate_hoist_loads. > (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call. > (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call > hoist_adjacent_loads. > (local_reg_dependence): New function. > (local_mem_dependence): Likewise. > (hoist_adjacent_loads): Likewise. > (gate_hoist_loads): Likewise. > * common.opt (fhoist-adjacent-loads): New switch. > * Makefile.in (tree-ssa-phiopt.o): Added dependencies. > * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param. > > > Index: gcc/tree-ssa-phiopt.c > =================================================================== > --- gcc/tree-ssa-phiopt.c (revision 187057) > +++ gcc/tree-ssa-phiopt.c (working copy) > @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3. If not see > #include "cfgloop.h" > #include "tree-data-ref.h" > #include "tree-pretty-print.h" > +#include "gimple-pretty-print.h" > +#include "insn-config.h" > +#include "expr.h" > +#include "optabs.h" > > +#ifndef HAVE_conditional_move > +#define HAVE_conditional_move (0) > +#endif > + > static unsigned int tree_ssa_phiopt (void); > -static unsigned int tree_ssa_phiopt_worker (bool); > +static unsigned int tree_ssa_phiopt_worker (bool, bool); > static bool conditional_replacement (basic_block, basic_block, > edge, edge, gimple, tree, tree); > static int value_replacement (basic_block, basic_block, > @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b > static bool cond_if_else_store_replacement (basic_block, basic_block, > basic_block); > static struct pointer_set_t * get_non_trapping (void); > static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree); > +static void hoist_adjacent_loads (basic_block, basic_block, > + basic_block, basic_block); > +static bool gate_hoist_loads (void); > > /* This pass tries to replaces an if-then-else block with an > assignment. We have four kinds of transformations. Some of these > @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_ > bb2: > x = PHI <x' (bb0), ...>; > > - A similar transformation is done for MAX_EXPR. */ > + A similar transformation is done for MAX_EXPR. > > + > + This pass also performs a fifth transformation of a slightly different > + flavor. > + > + Adjacent Load Hoisting > + ---------------------- > + > + This transformation replaces > + > + bb0: > + if (...) goto bb2; else goto bb1; > + bb1: > + x1 = (<expr>).field1; > + goto bb3; > + bb2: > + x2 = (<expr>).field2; > + bb3: > + # x = PHI <x1, x2>; > + > + with > + > + bb0: > + x1 = (<expr>).field1; > + x2 = (<expr>).field2; > + if (...) goto bb2; else goto bb1; > + bb1: > + goto bb3; > + bb2: > + bb3: > + # x = PHI <x1, x2>; > + > + The purpose of this transformation is to enable generation of conditional > + move instructions such as Intel CMOVE or PowerPC ISEL. Because one of > + the loads is speculative, the transformation is restricted to very > + specific cases to avoid introducing a page fault. We are looking for > + the common idiom: > + > + if (...) > + x = y->left; > + else > + x = y->right; > + > + where left and right are typically adjacent pointers in a tree structure. > */ > + > static unsigned int > tree_ssa_phiopt (void) > { > - return tree_ssa_phiopt_worker (false); > + return tree_ssa_phiopt_worker (false, gate_hoist_loads ()); > } > > /* This pass tries to transform conditional stores into unconditional > @@ -190,7 +245,7 @@ tree_ssa_phiopt (void) > static unsigned int > tree_ssa_cs_elim (void) > { > - return tree_ssa_phiopt_worker (true); > + return tree_ssa_phiopt_worker (true, false); > } > > /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */ > @@ -227,9 +282,11 @@ static tree condstoretemp; > /* The core routine of conditional store replacement and normal > phi optimizations. Both share much of the infrastructure in how > to match applicable basic block patterns. DO_STORE_ELIM is true > - when we want to do conditional store replacement, false otherwise. */ > + when we want to do conditional store replacement, false otherwise. > + DO_HOIST_LOADS is true when we want to hoist adjacent loads out > + of diamond control flow patterns, false otherwise. */ > static unsigned int > -tree_ssa_phiopt_worker (bool do_store_elim) > +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads) > { > basic_block bb; > basic_block *bb_order; > @@ -312,6 +369,20 @@ static unsigned int > cfgchanged = true; > continue; > } > + else if (do_hoist_loads > + && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest) > + { > + basic_block bb3 = EDGE_SUCC (bb1, 0)->dest; > + > + if (single_succ_p (bb1) > + && single_succ_p (bb2) > + && single_pred_p (bb1) > + && single_pred_p (bb2) > + && EDGE_COUNT (bb->succs) == 2 > + && EDGE_COUNT (bb3->preds) == 2) > + hoist_adjacent_loads (bb, bb1, bb2, bb3); > + continue; > + } > else > continue; > > @@ -1707,6 +1778,220 @@ cond_if_else_store_replacement (basic_block then_b > return ok; > } > > +/* Return TRUE iff EXPR contains any SSA names that are defined in BB. */ > + > +static bool > +local_reg_dependence (tree expr, basic_block bb) > +{ > + int i; > + > + if (TREE_CODE (expr) == SSA_NAME) > + { > + if (SSA_NAME_IS_DEFAULT_DEF (expr)) > + return false; > + else > + return (gimple_bb (SSA_NAME_DEF_STMT (expr)) == bb); > + } > + > + for (i = 0; i < TREE_OPERAND_LENGTH (expr); i++) > + if (TREE_OPERAND (expr, i) > + && local_reg_dependence (TREE_OPERAND (expr, i), bb)) > + return true; > + > + return false; > +} > + > +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB. */ > + > +static bool > +local_mem_dependence (gimple stmt, basic_block bb) > +{ > + tree vuse = gimple_vuse (stmt); > + gimple def; > + > + if (!vuse) > + return false; > + > + def = SSA_NAME_DEF_STMT (vuse); > + return (def && gimple_bb (def) == bb); > +} > + > +/* Given a "diamond" control-flow pattern where BB0 tests a condition, > + BB1 and BB2 are "then" and "else" blocks dependent on this test, > + and BB3 rejoins control flow following BB1 and BB2, look for > + opportunities to hoist loads as follows. If BB3 contains a PHI of > + two loads, one each occurring in BB1 and BB2, and the loads are > + provably of adjacent fields in the same structure, then move both > + loads into BB0. Of course this can only be done if there are no > + dependencies preventing such motion. > + > + One of the hoisted loads will always be speculative, so the > + transformation is currently conservative: > + > + - The fields must be strictly adjacent. > + - Hoisting is only done on aligned pointer fields. > + - The two fields must occupy a single memory block that is > + guaranteed to not cross a page boundary. > + > + The last is difficult to prove, as such memory blocks should be > + aligned on the minimum of the stack alignment boundary and the > + alignment guaranteed by heap allocation interfaces. Thus we rely > + on a parameter for the alignment value. > + > + Provided a good value is used for the last case, the first two > + restrictions can probably be relaxed. */ > + > +static void > +hoist_adjacent_loads (basic_block bb0, basic_block bb1, > + basic_block bb2, basic_block bb3) > +{ > + int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN); > + gimple_stmt_iterator gsi; > + > + /* Walk the phis in bb3 looking for an opportunity. We are looking > + for phis of two SSA names, one each of which is defined in bb1 and > + bb2. */ > + for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gimple phi_stmt = gsi_stmt (gsi); > + gimple def1, def2; > + tree arg1, arg2, ref1, ref2, field1, field2; > + tree tree_offset1, tree_offset2, tree_size1, tree_size2; > + int offset1, offset2, size1, size2, block_offset; > + unsigned align1, align2; > + gimple_stmt_iterator gsi2; > + > + if (gimple_phi_num_args (phi_stmt) != 2) > + continue; > + > + arg1 = gimple_phi_arg_def (phi_stmt, 0); > + arg2 = gimple_phi_arg_def (phi_stmt, 1); > + > + if (TREE_CODE (arg1) != SSA_NAME > + || TREE_CODE (arg2) != SSA_NAME > + || SSA_NAME_IS_DEFAULT_DEF (arg1) > + || SSA_NAME_IS_DEFAULT_DEF (arg2) > + /* FORNOW: Only do this optimization for pointer types. */ > + || !POINTER_TYPE_P (TREE_TYPE (arg1))) > + continue; > + > + def1 = SSA_NAME_DEF_STMT (arg1); > + def2 = SSA_NAME_DEF_STMT (arg2); > + > + if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2) > + && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2)) > + continue; > + > + /* Unless -fhoist-adjacent-loads was specified, check the mode > + of the arguments to be sure a conditional move can be generated > + for it. */ > + if (flag_hoist_adjacent_loads != 1 > + && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1)))) > + continue; > + > + /* Both statements must be assignments whose RHS is a COMPONENT_REF. > */ > + if (!gimple_assign_single_p (def1) > + || !gimple_assign_single_p (def2)) > + continue; > + > + ref1 = gimple_assign_rhs1 (def1); > + ref2 = gimple_assign_rhs1 (def2); > + > + if (TREE_CODE (ref1) != COMPONENT_REF > + || TREE_CODE (ref2) != COMPONENT_REF) > + continue; > + > + /* The zeroth operand of the two component references must be > + identical. It is not sufficient to compare get_base_address of > + the two references, because this could allow for different > + elements of the same array in the two trees. It is not safe to > + assume that the existence of one array element implies the > + existence of a different one. */ > + if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), > 0)) > + continue; > + > + field1 = TREE_OPERAND (ref1, 1); > + field2 = TREE_OPERAND (ref2, 1); > + > + tree_offset1 = DECL_FIELD_BIT_OFFSET (field1); > + tree_offset2 = DECL_FIELD_BIT_OFFSET (field2); > + tree_size1 = DECL_SIZE_UNIT (field1); > + tree_size2 = DECL_SIZE_UNIT (field2); > + > + if (TREE_CODE (tree_offset1) != INTEGER_CST > + || TREE_CODE (tree_offset2) != INTEGER_CST > + || TREE_CODE (tree_size1) != INTEGER_CST > + || TREE_CODE (tree_size2) != INTEGER_CST) > + continue; > + > + /* FORNOW: The two field references must be byte-aligned, naturally > + aligned within the record, and adjacent. */ > + offset1 = TREE_INT_CST_LOW (tree_offset1); > + offset2 = TREE_INT_CST_LOW (tree_offset2); > + size1 = TREE_INT_CST_LOW (tree_size1); > + size2 = TREE_INT_CST_LOW (tree_size2); > + align1 = TYPE_ALIGN (TREE_TYPE (arg1)); > + align2 = TYPE_ALIGN (TREE_TYPE (arg2)); > + > + if (offset1 % BITS_PER_UNIT != 0 > + || offset2 % BITS_PER_UNIT != 0 > + || offset1 % align1 != 0 > + || offset2 % align2 != 0) > + continue; > + > + offset1 /= BITS_PER_UNIT; > + offset2 /= BITS_PER_UNIT; > + > + if (offset1 + size1 != offset2 > + && offset2 + size2 != offset1) > + continue; > + > + /* The two field references must fit within a block of memory that > + is guaranteed to be on the same page. */ > + block_offset = MIN (offset1, offset2) % param_align; > + > + if (block_offset + size1 + size2 > param_align) > + continue; > + > + /* The two expressions cannot be dependent upon SSA names > + or vdefs defined in bb1/bb2. */ > + if (local_reg_dependence (TREE_OPERAND (ref1, 0), bb1) > + || local_reg_dependence (TREE_OPERAND (ref2, 0), bb2) > + || local_mem_dependence (def1, bb1) > + || local_mem_dependence (def2, bb2)) > + continue; > + > + /* The conditions are satisfied; hoist the loads from bb1 and bb2 into > + bb0. */ > + gsi2 = gsi_for_stmt (def1); > + gsi_move_to_bb_end (&gsi2, bb0); > + gsi2 = gsi_for_stmt (def2); > + gsi_move_to_bb_end (&gsi2, bb0); > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, > + "\nHoisting adjacent loads from %d and %d into %d: \n", > + bb1->index, bb2->index, bb0->index); > + print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS); > + print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS); > + } > + } > +} > + > +/* Determine whether we should attempt to hoist adjacent loads out of > + diamond patterns in pass_phiopt. Always hoist loads if > + -fhoist-adjacent-loads is specified, or if the target machine has > + a conditional move instruction and -fno-hoist-adjacent-loads is > + not specified. */ > + > +static bool > +gate_hoist_loads (void) > +{ > + return (flag_hoist_adjacent_loads == 1 > + || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0)); > +} > + > /* Always do these optimizations if we have SSA > trees to work on. */ > static bool > Index: gcc/common.opt > =================================================================== > --- gcc/common.opt (revision 187057) > +++ gcc/common.opt (working copy) > @@ -1186,6 +1186,11 @@ fgraphite-identity > Common Report Var(flag_graphite_identity) Optimization > Enable Graphite Identity transformation > > +fhoist-adjacent-loads > +Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization > +Enable hoisting adjacent loads to encourage generating conditional move > +instructions > + > floop-parallelize-all > Common Report Var(flag_loop_parallelize_all) Optimization > Mark all loops as parallel > Index: gcc/Makefile.in > =================================================================== > --- gcc/Makefile.in (revision 187057) > +++ gcc/Makefile.in (working copy) > @@ -2351,7 +2351,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H) > $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \ > $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \ > $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \ > - $(TREE_DATA_REF_H) tree-pretty-print.h > + $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \ > + insn-config.h $(EXPR_H) $(OPTABS_H) > tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ > $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \ > $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) > \ > Index: gcc/params.def > =================================================================== > --- gcc/params.def (revision 187057) > +++ gcc/params.def (working copy) > @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS, > "track string lengths", > 1000, 0, 0) > > +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c. */ > +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN, > + "min-cmove-struct-align", > + "Minimum byte alignment to assume for structures in the stack " > + "or heap when considering load hoisting for conditional moves", > + 16, 8, 256) > + > /* > Local variables: > mode:c > >