Index: gcc/config/arm/arm-cores.def
===================================================================
--- gcc/config/arm/arm-cores.def	(revision 203599)
+++ gcc/config/arm/arm-cores.def	(working copy)
@@ -134,8 +134,8 @@
 ARM_CORE("cortex-r4f",	  cortexr4f,	7R,				 FL_LDSCHED, cortex)
 ARM_CORE("cortex-r5",	  cortexr5,	7R,				 FL_LDSCHED | FL_ARM_DIV, cortex)
 ARM_CORE("cortex-r7",	  cortexr7,	7R,				 FL_LDSCHED | FL_ARM_DIV, cortex)
-ARM_CORE("cortex-m4",	  cortexm4,	7EM,				 FL_LDSCHED, cortex)
-ARM_CORE("cortex-m3",	  cortexm3,	7M,				 FL_LDSCHED, cortex)
+ARM_CORE("cortex-m4",	  cortexm4,	7EM,				 FL_LDSCHED, v7m)
+ARM_CORE("cortex-m3",	  cortexm3,	7M,				 FL_LDSCHED, v7m)
 ARM_CORE("cortex-m1",	  cortexm1,	6M,				 FL_LDSCHED, v6m)
 ARM_CORE("cortex-m0",	  cortexm0,	6M,				 FL_LDSCHED, v6m)
 ARM_CORE("cortex-m0plus", cortexm0plus,	6M,				 FL_LDSCHED, v6m)
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 203599)
+++ gcc/config/arm/arm.c	(working copy)
@@ -267,6 +267,7 @@
 static unsigned int arm_autovectorize_vector_sizes (void);
 static int arm_default_branch_cost (bool, bool);
 static int arm_cortex_a5_branch_cost (bool, bool);
+static int arm_cortex_m_branch_cost (bool, bool);
 
 static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
 					     const unsigned char *sel);
@@ -1150,7 +1151,7 @@
 {
   arm_slowmul_rtx_costs,
   NULL,
-  NULL,
+  NULL,						/* Sched adj cost.  */
   3,						/* Constant limit.  */
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
@@ -1166,7 +1167,7 @@
 {
   arm_fastmul_rtx_costs,
   NULL,
-  NULL,
+  NULL,						/* Sched adj cost.  */
   1,						/* Constant limit.  */
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
@@ -1185,7 +1186,7 @@
 {
   arm_fastmul_rtx_costs,
   NULL,
-  NULL,
+  NULL,						/* Sched adj cost.  */
   1,						/* Constant limit.  */
   3,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
@@ -1217,7 +1218,7 @@
 {
   arm_9e_rtx_costs,
   NULL,
-  NULL,
+  NULL,						/* Sched adj cost.  */
   1,						/* Constant limit.  */
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
@@ -1233,7 +1234,7 @@
 {
   arm_9e_rtx_costs,
   NULL,
-  NULL,
+  NULL,						/* Sched adj cost.  */
   1,						/* Constant limit.  */
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
@@ -1250,7 +1251,7 @@
 {
   arm_9e_rtx_costs,
   &generic_extra_costs,
-  NULL,
+  NULL,						/* Sched adj cost.  */
   1,						/* Constant limit.  */
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
@@ -1266,7 +1267,7 @@
 {
   arm_9e_rtx_costs,
   &cortexa15_extra_costs,
-  NULL,
+  NULL,						/* Sched adj cost.  */
   1,						/* Constant limit.  */
   2,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
@@ -1285,7 +1286,7 @@
 {
   arm_9e_rtx_costs,
   NULL,
-  NULL,
+  NULL,						/* Sched adj cost.  */
   1,						/* Constant limit.  */
   1,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
@@ -1313,13 +1314,36 @@
   false                                         /* Prefer Neon for 64-bits bitops.  */
 };
 
+/* armv7m tuning.  On Cortex-M4 cores for example, MOVW/MOVT take a single
+   cycle to execute each.  An LDR from the constant pool also takes two cycles
+   to execute, but mildly increases pipelining opportunity (consecutive
+   loads/stores can be pipelined together, saving one cycle), and may also
+   improve icache utilisation.  Hence we prefer the constant pool for such
+   processors.  */
+
+const struct tune_params arm_v7m_tune =
+{
+  arm_9e_rtx_costs,
+  &generic_extra_costs,
+  NULL,						/* Sched adj cost.  */
+  1,						/* Constant limit.  */
+  5,						/* Max cond insns.  */
+  ARM_PREFETCH_NOT_BENEFICIAL,
+  true,						/* Prefer constant pool.  */
+  arm_cortex_m_branch_cost,
+  false,					/* Prefer LDRD/STRD.  */
+  {false, false},				/* Prefer non short circuit.  */
+  &arm_default_vec_cost,                        /* Vectorizer costs.  */
+  false                                         /* Prefer Neon for 64-bits bitops.  */
+};
+
 /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
    arm_v6t2_tune. It is used for cortex-m0, cortex-m1 and cortex-m0plus.  */
 const struct tune_params arm_v6m_tune =
 {
   arm_9e_rtx_costs,
   NULL,
-  NULL,
+  NULL,						/* Sched adj cost.  */
   1,						/* Constant limit.  */
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
@@ -11108,6 +11132,20 @@
   return speed_p ? 0 : arm_default_branch_cost (speed_p, predictable_p);
 }
 
+/* Thumb-2 branches are relatively cheap on Cortex-M processors ("1 + P cycles"
+   on Cortex-M4, where P varies from 1 to 3 according to some criteria), since
+   sequences of non-executed instructions in IT blocks probably take the same
+   amount of time as executed instructions (and the IT instruction itself takes
+   space in icache).  This function was experimentally determined to give good
+   results on a popular embedded benchmark.  */
+
+static int
+arm_cortex_m_branch_cost (bool speed_p, bool predictable_p)
+{
+  return (TARGET_32BIT && speed_p) ? 1
+         : arm_default_branch_cost (speed_p, predictable_p);
+}
+
 static bool fp_consts_inited = false;
 
 static REAL_VALUE_TYPE value_fp0;
Index: gcc/testsuite/gcc.dg/tree-ssa/forwprop-28.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/forwprop-28.c	(revision 203599)
+++ gcc/testsuite/gcc.dg/tree-ssa/forwprop-28.c	(working copy)
@@ -1,5 +1,9 @@
 /* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-*"} } } */
 /* { dg-options "-O2 -fdump-tree-forwprop1" } */
+/* Skip on ARM Cortex-M, where LOGICAL_OP_NON_SHORT_CIRCUIT is set to false,
+   leading to two conditional jumps when evaluating an && condition.  Forwprop1
+   is not able to optimize this.  */
+/* { dg-skip-if "" { arm_cortex_m } } */
 
 extern char *frob (void);
 extern _Bool testit (void);
Index: gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-4.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-4.c	(revision 203599)
+++ gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-4.c	(working copy)
@@ -59,9 +59,9 @@
    code we missed the edge when the first conditional is false
    (b_elt is zero, which means the second conditional is always
    zero.  */
-/* ARM Cortex-M0 defined LOGICAL_OP_NON_SHORT_CIRCUIT to false,
+/* ARM Cortex-M defined LOGICAL_OP_NON_SHORT_CIRCUIT to false,
    so skip below test.  */
-/* { dg-final { scan-tree-dump-times "Threaded" 3 "dom1" { target { ! { { mips*-*-* avr-*-* } || { arm_cortex_m && arm_thumb1 } } } } } } */
+/* { dg-final { scan-tree-dump-times "Threaded" 3 "dom1" { target { ! { { mips*-*-* avr-*-* } || { arm_cortex_m } } } } } } */
 /* MIPS defines LOGICAL_OP_NON_SHORT_CIRCUIT to 0, so we split both
    "a_elt || b_elt" and "b_elt && kill_elt" into two conditions each,
    rather than using "(var1 != 0) op (var2 != 0)".  Also, as on other targets,
Index: gcc/testsuite/gcc.dg/tree-ssa/ssa-vrp-thread-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/ssa-vrp-thread-1.c	(revision 203599)
+++ gcc/testsuite/gcc.dg/tree-ssa/ssa-vrp-thread-1.c	(working copy)
@@ -26,6 +26,8 @@
   oof ();
 }
 
-/* { dg-final { scan-tree-dump-times "Threaded" 1 "vrp1" }  } */
+/* ARM Cortex-M defined LOGICAL_OP_NON_SHORT_CIRCUIT to false,
+   so skip below test.  */
+/* { dg-final { { scan-tree-dump-times "Threaded" 1 "vrp1" } || { arm_cortex_m } }  } */
 /* { dg-final { cleanup-tree-dump "vrp1" } } */
 
Index: gcc/testsuite/gcc.dg/tree-ssa/vrp47.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/vrp47.c	(revision 203599)
+++ gcc/testsuite/gcc.dg/tree-ssa/vrp47.c	(working copy)
@@ -6,10 +6,10 @@
 /* { dg-do compile { target { ! "mips*-*-* s390*-*-*  avr-*-* mn10300-*-*" } } } */
 /* { dg-options "-O2 -fdump-tree-vrp1 -fdump-tree-dom1 -fdump-tree-vrp2" } */
 /* { dg-additional-options "-march=i586" { target { { i?86-*-* x86_64-*-* } && ia32 } } } */
-/* Skip on ARM Cortex-M0, where LOGICAL_OP_NON_SHORT_CIRCUIT is set to false,
+/* Skip on ARM Cortex-M, where LOGICAL_OP_NON_SHORT_CIRCUIT is set to false,
    leading to two conditional jumps when evaluating an && condition.  VRP is
    not able to optimize this.  */
-/* { dg-skip-if "" { arm_cortex_m && arm_thumb1} } */
+/* { dg-skip-if "" { arm_cortex_m } } */
 
 int h(int x, int y)
 {
Index: gcc/testsuite/gcc.dg/tree-ssa/vrp87.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/vrp87.c	(revision 203599)
+++ gcc/testsuite/gcc.dg/tree-ssa/vrp87.c	(working copy)
@@ -2,6 +2,10 @@
 
 /* { dg-options "-O2 -fdump-tree-vrp2-details -fdump-tree-cddce2-details" } */
 /* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */
+/* Skip on ARM Cortex-M, where LOGICAL_OP_NON_SHORT_CIRCUIT is set to false,
+   leading to two conditional jumps when evaluating an && condition.  VRP is
+   not able to optimize this.  */
+/* { dg-skip-if "" { arm_cortex_m } } */
 
 struct bitmap_head_def;
 typedef struct bitmap_head_def *bitmap;
