diff --git a/gcc/bb-reorder.c b/gcc/bb-reorder.c
index 0d29b2d..f50f5c2 100644
--- a/gcc/bb-reorder.c
+++ b/gcc/bb-reorder.c
@@ -179,7 +179,10 @@ static void find_traces_1_round (int, int, gcov_type, struct trace *, int *,
 				 int, fibheap_t *, int);
 static basic_block copy_bb (basic_block, edge, basic_block, int);
 static fibheapkey_t bb_to_key (basic_block);
-static bool better_edge_p (const_basic_block, const_edge, int, int, int, int, const_edge);
+static bool better_edge_p (const_basic_block, const_edge, int, int, int, int,
+			   const_edge);
+static bool connect_better_edge_p (const_edge, bool, int, const_edge,
+				   struct trace *);
 static void connect_traces (int, struct trace *);
 static bool copy_bb_p (const_basic_block, int);
 static bool push_to_next_round_p (const_basic_block, int, int, int, gcov_type);
@@ -437,6 +440,7 @@ find_traces_1_round (int branch_th, int exec_th, gcov_type count_th,
   /* Heap for discarded basic blocks which are possible starting points for
      the next round.  */
   fibheap_t new_heap = fibheap_new ();
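+  /* When optimizing for size, prefer the original block order: blocks are
+     not pushed to later rounds and the frequency/count thresholds below
+     are ignored.  */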
+  bool for_size = optimize_function_for_size_p (cfun);
 
   while (!fibheap_empty (*heap))
     {
@@ -456,10 +460,11 @@ find_traces_1_round (int branch_th, int exec_th, gcov_type count_th,
       /* If the BB's frequency is too low send BB to the next round.  When
 	 partitioning hot/cold blocks into separate sections, make sure all
 	 the cold blocks (and ONLY the cold blocks) go into the (extra) final
-	 round.  */
+	 round.  Do not push to the next round when optimizing for size.  */
 
-      if (push_to_next_round_p (bb, round, number_of_rounds, exec_th,
-				count_th))
+      if (!for_size
+	  && push_to_next_round_p (bb, round, number_of_rounds, exec_th,
+				   count_th))
 	{
 	  int key = bb_to_key (bb);
 	  bbd[bb->index].heap = new_heap;
@@ -530,10 +535,11 @@ find_traces_1_round (int branch_th, int exec_th, gcov_type count_th,
 		}
 
 	      /* Edge that cannot be fallthru or improbable or infrequent
-		 successor (i.e. it is unsuitable successor).  */
+		 successor (i.e. it is unsuitable successor).  When optimizing
+		 for size, ignore the frequency and probability.  */
 	      if (!(e->flags & EDGE_CAN_FALLTHRU) || (e->flags & EDGE_COMPLEX)
-		  || prob < branch_th || EDGE_FREQUENCY (e) < exec_th
-		  || e->count < count_th)
+		  || ((prob < branch_th || EDGE_FREQUENCY (e) < exec_th
+		       || e->count < count_th) && !for_size))
 		continue;
 
 	      /* If partitioning hot/cold basic blocks, don't consider edges
@@ -558,6 +564,14 @@ find_traces_1_round (int branch_th, int exec_th, gcov_type count_th,
 	  /* Add all non-selected successors to the heaps.  */
 	  FOR_EACH_EDGE (e, ei, bb->succs)
 	    {
+	      /* Wait for the predecessors: when optimizing for size, do not
+		 extend the trace into a block that has multiple successors
+		 or predecessors.  */
+	      if ((e == best_edge) && for_size
+		  && (EDGE_COUNT (best_edge->dest->succs) > 1
+		      || EDGE_COUNT (best_edge->dest->preds) > 1))
+		best_edge = NULL;
+
 	      if (e == best_edge
 		  || e->dest == EXIT_BLOCK_PTR
 		  || bb_visited_trace (e->dest))
@@ -596,11 +610,12 @@ find_traces_1_round (int branch_th, int exec_th, gcov_type count_th,
 		    {
 		      /* When partitioning hot/cold basic blocks, make sure
 			 the cold blocks (and only the cold blocks) all get
-			 pushed to the last round of trace collection.  */
+			 pushed to the last round of trace collection.  When
+			 optimizing for size, do not push to the next round.  */
 
-		      if (push_to_next_round_p (e->dest, round,
-						number_of_rounds,
-						exec_th, count_th))
+		      if (!for_size && push_to_next_round_p (e->dest, round,
+							     number_of_rounds,
+							     exec_th, count_th))
 			which_heap = new_heap;
 		    }
 
@@ -681,6 +696,8 @@ find_traces_1_round (int branch_th, int exec_th, gcov_type count_th,
 		  (i.e. 2 * B->frequency >= EDGE_FREQUENCY (AC) )
 		  Best ordering is then A B C.
 
+		  When optimizing for size, A B C is always the best order.
+
 		  This situation is created for example by:
 
 		  if (A) B;
@@ -700,7 +717,8 @@ find_traces_1_round (int branch_th, int exec_th, gcov_type count_th,
 			    & EDGE_CAN_FALLTHRU)
 			&& !(single_succ_edge (e->dest)->flags & EDGE_COMPLEX)
 			&& single_succ (e->dest) == best_edge->dest
-			&& 2 * e->dest->frequency >= EDGE_FREQUENCY (best_edge))
+			&& (2 * e->dest->frequency >= EDGE_FREQUENCY (best_edge)
+			    || for_size))
 		      {
 			best_edge = e;
 			if (dump_file)
@@ -820,6 +838,10 @@ bb_to_key (basic_block bb)
   edge_iterator ei;
   int priority = 0;
 
+  /* Use the block index as the key to keep the original block order.  */
+  if (optimize_function_for_size_p (cfun))
+    return bb->index;
+
   /* Do not start in probably never executed blocks.  */
 
   if (BB_PARTITION (bb) == BB_COLD_PARTITION
@@ -864,6 +886,13 @@ better_edge_p (const_basic_block bb, const_edge e, int prob, int freq, int best_
   int diff_prob = best_prob / 10;
   int diff_freq = best_freq / 10;
 
+  if (optimize_function_for_size_p (cfun))
+    {
+      /* The smaller index is better; it keeps the original order.  */
+      return !cur_best_edge
+	     || cur_best_edge->dest->index > e->dest->index;
+    }
+
   if (prob > best_prob + diff_prob)
     /* The edge has higher probability than the temporary best edge.  */
     is_better_edge = true;
@@ -899,6 +928,49 @@ better_edge_p (const_basic_block bb, const_edge e, int prob, int freq, int best_
   return is_better_edge;
 }
 
+/* Return true when the edge E is better than the temporary best edge
+   CUR_BEST_EDGE.  If SRC_INDEX_P is true, the function compares the src bb of
+   E and CUR_BEST_EDGE; otherwise it compares the dest bb.
+   BEST_LEN is the trace length of the src (or dest) bb in CUR_BEST_EDGE.
+   TRACES records the information about traces.
+   When optimizing for size, the edge with the smaller index is better.
+   When optimizing for speed, the edge with higher probability or longer trace
+   is better.  */
+
+static bool
+connect_better_edge_p (const_edge e, bool src_index_p, int best_len,
+		       const_edge cur_best_edge, struct trace *traces)
+{
+  int e_index;
+  int b_index;
+
+  if (!cur_best_edge)
+    return true;
+
+  if (optimize_function_for_size_p (cfun))
+    {
+      e_index = src_index_p ? e->src->index : e->dest->index;
+      b_index = src_index_p ? cur_best_edge->src->index
+			      : cur_best_edge->dest->index;
+      /* The smaller index is better; it keeps the original order.  */
+      return b_index > e_index;
+    }
+  else if (src_index_p)
+    {
+      e_index = e->src->index;
+      return e->probability > cur_best_edge->probability
+	     || (e->probability == cur_best_edge->probability
+		 && (traces[bbd[e_index].end_of_trace].length > best_len));
+    }
+  else
+    {
+      e_index = e->dest->index;
+      return e->probability > cur_best_edge->probability
+	     || (e->probability == cur_best_edge->probability
+		 && (traces[bbd[e_index].start_of_trace].length > best_len));
+    }
+}
+
 /* Connect traces in array TRACES, N_TRACES is the count of traces.  */
 
 static void
@@ -912,6 +984,7 @@ connect_traces (int n_traces, struct trace *traces)
   int current_partition;
   int freq_threshold;
   gcov_type count_threshold;
+  bool for_size = optimize_function_for_size_p (cfun);
 
   freq_threshold = max_entry_frequency * DUPLICATION_THRESHOLD / 1000;
   if (max_entry_count < INT_MAX / 1000)
@@ -975,10 +1048,7 @@ connect_traces (int n_traces, struct trace *traces)
 		  && bbd[si].end_of_trace >= 0
 		  && !connected[bbd[si].end_of_trace]
 		  && (BB_PARTITION (e->src) == current_partition)
-		  && (!best
-		      || e->probability > best->probability
-		      || (e->probability == best->probability
-			  && traces[bbd[si].end_of_trace].length > best_len)))
+		  && connect_better_edge_p (e, true, best_len, best, traces))
 		{
 		  best = e;
 		  best_len = traces[bbd[si].end_of_trace].length;
@@ -1021,17 +1091,50 @@ connect_traces (int n_traces, struct trace *traces)
 		  && bbd[di].start_of_trace >= 0
 		  && !connected[bbd[di].start_of_trace]
 		  && (BB_PARTITION (e->dest) == current_partition)
-		  && (!best
-		      || e->probability > best->probability
-		      || (e->probability == best->probability
-			  && traces[bbd[di].start_of_trace].length > best_len)))
+		  && connect_better_edge_p (e, false, best_len, best, traces))
 		{
 		  best = e;
 		  best_len = traces[bbd[di].start_of_trace].length;
 		}
 	    }
 
-	  if (best)
+	  if (for_size)
+	    {
+	      if (!best)
+		break;
+
+	      /* It is OK to connect block n with block n + 1 or a block
+		 before n.  Otherwise only connect a block that, ignoring
+		 back edges, has only one predecessor (e.g. a loop header).  */
+	      if (best->dest->index > (traces[t].last->index + 1))
+		{
+		  int count = EDGE_COUNT (best->dest->preds);
+
+		  FOR_EACH_EDGE (e, ei, best->dest->preds)
+		    if (e->flags & EDGE_DFS_BACK)
+		      count--;
+
+		  /* If dest has multiple predecessors, skip it.  Expect
+		     block dest->index - 1 to connect with it later.  */
+		  if (count != 1)
+		    break;
+		}
+
+	      /* Only connect Trace n with Trace n + 1.  This is conservative
+		 but keeps the order as close to the original as possible.  */
+	      if (last_trace != bbd[best->dest->index].start_of_trace - 1)
+		break;
+
+	      if (dump_file)
+		{
+		  fprintf (dump_file, "Connection: %d %d\n",
+			   best->src->index, best->dest->index);
+		}
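+	      /* Append the trace starting at BEST->dest after LAST_TRACE.  */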
+	      t = bbd[best->dest->index].start_of_trace;
+	      traces[last_trace].last->aux = traces[t].first;
+	      connected[t] = true;
+	      last_trace = t;
+	    }
+	  else if (best)
 	    {
 	      if (dump_file)
 		{
@@ -1169,6 +1272,10 @@ copy_bb_p (const_basic_block bb, int code_may_grow)
   int max_size = uncond_jump_length;
   rtx insn;
 
+  /* Avoid duplicating blocks for size.  */
+  if (optimize_function_for_size_p (cfun))
+    return false;
+
   if (!bb->frequency)
     return false;
   if (EDGE_COUNT (bb->preds) < 2)
@@ -2352,15 +2459,6 @@ gate_handle_reorder_blocks (void)
 {
   if (targetm.cannot_modify_jumps_p ())
     return false;
-  /* Don't reorder blocks when optimizing for size because extra jump insns may
-     be created; also barrier may create extra padding.
-
-     More correctly we should have a block reordering mode that tried to
-     minimize the combined size of all the jumps.  This would more or less
-     automatically remove extra jumps, but would also try to use more short
-     jumps instead of long jumps.  */
-  if (!optimize_function_for_speed_p (cfun))
-    return false;
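+  /* Block reordering is now also done when optimizing for size; in that
+     case the heuristics keep the blocks close to their original order to
+     avoid extra jumps and padding.  */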
   return (optimize > 0
 	  && (flag_reorder_blocks || flag_reorder_blocks_and_partition));
 }
