Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 186804)
+++ gcc/doc/invoke.texi	(working copy)
@@ -410,7 +410,8 @@ Objective-C and Objective-C++ Dialects}.
 -ftree-loop-if-convert-stores -ftree-loop-im @gol
 -ftree-phiprop -ftree-loop-distribution -ftree-loop-distribute-patterns @gol
 -ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize @gol
--ftree-parallelize-loops=@var{n} -ftree-pre -ftree-pta -ftree-reassoc @gol
+-ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta @gol
+-ftree-reassoc @gol
 -ftree-sink -ftree-sra -ftree-switch-conversion -ftree-tail-merge @gol
 -ftree-ter -ftree-vect-loop-version -ftree-vectorize -ftree-vrp @gol
 -funit-at-a-time -funroll-all-loops -funroll-loops @gol
@@ -6267,8 +6268,8 @@ invoking @option{-O2} on programs that u
 Optimize yet more.  @option{-O3} turns on all optimizations specified
 by @option{-O2} and also turns on the @option{-finline-functions},
 @option{-funswitch-loops}, @option{-fpredictive-commoning},
-@option{-fgcse-after-reload}, @option{-ftree-vectorize} and
-@option{-fipa-cp-clone} options.
+@option{-fgcse-after-reload}, @option{-ftree-vectorize},
+@option{-ftree-partial-pre} and @option{-fipa-cp-clone} options.
 
 @item -O0
 @opindex O0
@@ -7063,6 +7064,11 @@ at @option{-O} and higher.
 Perform partial redundancy elimination (PRE) on trees.  This flag is
 enabled by default at @option{-O2} and @option{-O3}.
 
+@item -ftree-partial-pre
+@opindex ftree-partial-pre
+Make partial redundancy elimination (PRE) more aggressive.  This flag is
+enabled by default at @option{-O3}.
+
 @item -ftree-forwprop
 @opindex ftree-forwprop
 Perform forward propagation on trees.  This flag is enabled by default
Index: gcc/opts.c
===================================================================
--- gcc/opts.c	(revision 186804)
+++ gcc/opts.c	(working copy)
@@ -499,6 +499,7 @@ static const struct default_options defa
     { OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 },
     { OPT_LEVELS_3_PLUS, OPT_ftree_vectorize, NULL, 1 },
     { OPT_LEVELS_3_PLUS, OPT_fipa_cp_clone, NULL, 1 },
+    { OPT_LEVELS_3_PLUS, OPT_ftree_partial_pre, NULL, 1 },
 
     /* -Ofast adds optimizations to -O3.  */
     { OPT_LEVELS_FAST, OPT_ffast_math, NULL, 1 },
Index: gcc/tree-ssa-pre.c
===================================================================
--- gcc/tree-ssa-pre.c	(revision 186804)
+++ gcc/tree-ssa-pre.c	(working copy)
@@ -3774,20 +3774,51 @@ do_partial_partial_insertion (basic_bloc
 		}
 	      else
 		avail[bprime->index] = edoubleprime;
-
 	    }
 
 	  /* If we can insert it, it's not the same value
 	     already existing along every predecessor, and
 	     it's defined by some predecessor, it is
 	     partially redundant.  */
-	  if (!cant_insert && by_all && dbg_cnt (treepre_insert))
+	  if (!cant_insert && by_all)
 	    {
-	      pre_stats.pa_insert++;
-	      if (insert_into_preds_of_block (block, get_expression_id (expr),
-					      avail))
-		new_stuff = true;
-	    }
+	      edge succ;
+	      bool do_insertion = false;
+
+	      /* Insert only if we can remove a later expression on a path
+		 that we want to optimize for speed.
+		 The phi node that we will be inserting in BLOCK is not free,
+		 and inserting it for the sake of !optimize_for_speed successor
+		 may cause regressions on the speed path.  */
+	      FOR_EACH_EDGE (succ, ei, block->succs)
+		{
+		  if (bitmap_set_contains_value (PA_IN (succ->dest), val))
+		    {
+		      if (optimize_edge_for_speed_p (succ))
+			do_insertion = true;
+		    }
+		}
+
+	      if (!do_insertion)
+		{
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file, "Skipping partial partial redundancy "
+			       "for expression ");
+		      print_pre_expr (dump_file, expr);
+		      fprintf (dump_file, " (%04d), not partially anticipated "
+			       "on any to be optimized for speed edges\n", val);
+		    }
+		}
+	      else if (dbg_cnt (treepre_insert))
+		{
+		  pre_stats.pa_insert++;
+		  if (insert_into_preds_of_block (block,
+						  get_expression_id (expr),
+						  avail))
+		    new_stuff = true;
+		}	   
+	    } 
 	  free (avail);
 	}
     }
@@ -4948,7 +4979,8 @@ execute_pre (bool do_fre)
 {
   unsigned int todo = 0;
 
-  do_partial_partial = optimize > 2 && optimize_function_for_speed_p (cfun);
+  do_partial_partial =
+    flag_tree_partial_pre && optimize_function_for_speed_p (cfun);
 
   /* This has to happen before SCCVN runs because
      loop_optimizer_init may create new phis, etc.  */
Index: gcc/common.opt
===================================================================
--- gcc/common.opt	(revision 186804)
+++ gcc/common.opt	(working copy)
@@ -2020,6 +2020,10 @@ ftree-pre
 Common Report Var(flag_tree_pre) Optimization
 Enable SSA-PRE optimization on trees
 
+ftree-partial-pre
+Common Report Var(flag_tree_partial_pre) Optimization
+In SSA-PRE optimization on trees, enable partial-partial redundancy elimination
+
 ftree-pta
 Common Report Var(flag_tree_pta) Init(1) Optimization
 Perform function-local points-to analysis on trees.
