Hi,
This patch minimizes the use of the literal pool on armv7-m targets
that load data from flash more slowly than they fetch instructions
from flash.  The usual literal load instruction is replaced by a
MOVW/MOVT pair.  A new option, -mslow-flash-data, is added for this
purpose.  So far this feature doesn't support PIC code or targets
that aren't based on armv7-m.
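
To illustrate, taking the address of a global g currently goes
through the literal pool:

    ldr   r0, .L2
    ...
  .L2:
    .word g

With -mslow-flash-data the constant is synthesized inline instead (a
sketch of the intended output; the actual registers and pool labels
depend on the compiler's choices):

    movw  r0, #:lower16:g
    movt  r0, #:upper16:g
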
Tested with the GCC regression test suite on QEMU for cortex-m3.  No
new regressions.
Is it OK for trunk?
BR,
Terry
2013-11-06 Terry Guo <terry....@arm.com>
* doc/invoke.texi (-mslow-flash-data): Document new option.
* config/arm/arm.opt (mslow-flash-data): New option.
* config/arm/arm-protos.h
(arm_max_const_double_inline_cost): Declare it.
* config/arm/arm.h (TARGET_USE_MOVT): Always true when
literal pools are disabled.
(arm_disable_literal_pool): Declare it.
* config/arm/arm.c (arm_disable_literal_pool): New
variable.
(arm_option_override): Handle new option.
(thumb2_legitimate_address_p): Reject certain address
forms when literal pools are disabled.
(arm_max_const_double_inline_cost): New function.
* config/arm/arm.md (types.md): Include it a little
earlier.
(use_literal_pool): New attribute.
(enabled): Use new attribute.
(split pattern): Replace symbol+offset with MOVW/MOVT.
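
For reference, the option can be exercised with something like the
following (a hypothetical invocation; the target triplet and CPU flag
depend on your toolchain, while -O2 -mthumb -mslow-flash-data match
the new testcase's dg-options):

    arm-none-eabi-gcc -O2 -mthumb -mcpu=cortex-m3 -mslow-flash-data -S foo.c
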
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 944cf10..c5b16da 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -121,6 +121,7 @@ extern rtx arm_gen_compare_reg (RTX_CODE, rtx, rtx, rtx);
extern rtx arm_gen_return_addr_mask (void);
extern void arm_reload_in_hi (rtx *);
extern void arm_reload_out_hi (rtx *);
+extern int arm_max_const_double_inline_cost (void);
extern int arm_const_double_inline_cost (rtx);
extern bool arm_const_double_by_parts (rtx);
extern bool arm_const_double_by_immediates (rtx);
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 1781b75..25927a1 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -329,7 +329,9 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
/* Should MOVW/MOVT be used in preference to a constant pool. */
#define TARGET_USE_MOVT \
- (arm_arch_thumb2 && !optimize_size && !current_tune->prefer_constant_pool)
+ (arm_arch_thumb2 \
+ && (arm_disable_literal_pool \
+ || (!optimize_size && !current_tune->prefer_constant_pool)))
/* We could use unified syntax for arm mode, but for now we just use it
for Thumb-2. */
@@ -554,6 +556,9 @@ extern int arm_arch_thumb_hwdiv;
than core registers. */
extern int prefer_neon_for_64bits;
+/* Nonzero if we shouldn't use literal pools in generated code.  */
+extern int arm_disable_literal_pool;
+
#ifndef TARGET_DEFAULT
#define TARGET_DEFAULT (MASK_APCS_FRAME)
#endif
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 78554e8..de2a9c0 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -864,6 +864,9 @@ int arm_arch_thumb_hwdiv;
than core registers. */
int prefer_neon_for_64bits = 0;
+/* Nonzero if we shouldn't use literal pools in generated code.  */
+int arm_disable_literal_pool = 0;
+
/* In case of a PRE_INC, POST_INC, PRE_DEC, POST_DEC memory reference,
we must report the mode of the memory reference from
TARGET_PRINT_OPERAND to TARGET_PRINT_OPERAND_ADDRESS. */
@@ -2505,6 +2508,16 @@ arm_option_override (void)
if (TARGET_APCS_FRAME)
flag_shrink_wrap = false;
+  /* -mslow-flash-data is only supported for non-pic armv7-m targets.  */
+ if (target_slow_flash_data
+ && ((!(arm_arch7 && !arm_arch_notm) && !arm_arch7em)
+ || (TARGET_THUMB1 || flag_pic || TARGET_NEON)))
+ error ("-mslow-flash-data only supports non-pic code on armv7-m targets");
+
+ /* Currently, for slow flash data, we just disable literal pools. */
+ if (target_slow_flash_data)
+ arm_disable_literal_pool = 1;
+
/* Register global variables with the garbage collector. */
arm_add_gc_roots ();
}
@@ -6348,6 +6361,25 @@ thumb2_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p)
&& thumb2_legitimate_index_p (mode, xop0, strict_p)));
}
+  /* Normally we can assign constant values to target registers without
+     the help of the constant pool.  But there are cases in which we
+     have to use the constant pool, for example:
+     1) assigning a label to a register;
+     2) sign-extending an 8-bit value to 32 bits and assigning it to
+        a register.
+
+     A constant pool access of the form:
+     (set (reg r0) (mem (symbol_ref (".LC0"))))
+     will cause the use of the literal pool (later, in arm_reorg).  So
+     here we mark such a form as invalid; the compiler will then
+     adjust it into:
+     (set (reg r0) (symbol_ref (".LC0")))
+     (set (reg r0) (mem (reg r0))).
+     No extra register is required, and (mem (reg r0)) won't cause
+     the use of literal pools.  */
+  else if (arm_disable_literal_pool && code == SYMBOL_REF
+           && CONSTANT_POOL_ADDRESS_P (x))
+    return 0;
+
else if (GET_MODE_CLASS (mode) != MODE_FLOAT
&& code == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x)
@@ -16114,6 +16146,18 @@ push_minipool_fix (rtx insn, HOST_WIDE_INT address, rtx *loc,
minipool_fix_tail = fix;
}
+/* Return the maximum allowed cost of synthesizing a 64-bit constant
+   inline.  Returns 99 if we always want to synthesize the value.  */
+int
+arm_max_const_double_inline_cost ()
+{
+ /* Let the value get synthesized to avoid the use of literal pools. */
+ if (arm_disable_literal_pool)
+ return 99;
+
+ return ((optimize_size || arm_ld_sched) ? 3 : 4);
+}
+
/* Return the cost of synthesizing a 64-bit constant VAL inline.
Returns the number of insns needed, or 99 if we don't know how to
do it. */
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 3726201..4f32e42 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -82,6 +82,9 @@
;; Processor type. This is created automatically from arm-cores.def.
(include "arm-tune.md")
+;; Instruction classification types
+(include "types.md")
+
; IS_THUMB is set to 'yes' when we are generating Thumb code, and 'no' when
; generating ARM code. This is used to control the length of some insn
; patterns that share the same RTL in both ARM and Thumb code.
@@ -191,6 +194,12 @@
(const_string "yes")]
(const_string "no")))
+(define_attr "use_literal_pool" "no,yes"
+ (cond [(and (eq_attr "type" "f_loads,f_loadd")
+ (match_test "CONSTANT_P (operands[1])"))
+ (const_string "yes")]
+ (const_string "no")))
+
; Allows an insn to disable certain alternatives for reasons other than
; arch support.
(define_attr "insn_enabled" "no,yes"
@@ -210,6 +219,10 @@
(match_test "arm_restrict_it"))
(const_string "no")
+ (and (eq_attr "use_literal_pool" "yes")
+ (match_test "arm_disable_literal_pool"))
+ (const_string "no")
+
(eq_attr "arch_enabled" "no")
(const_string "no")
@@ -245,9 +258,6 @@
(set_attr "length" "4")
(set_attr "pool_range" "250")])
-;; Instruction classification types
-(include "types.md")
-
; Load scheduling, set from the arm_ld_sched variable
; initialized by arm_option_override()
(define_attr "ldsched" "no,yes" (const (symbol_ref "arm_ld_sched")))
@@ -6021,7 +6031,7 @@
"TARGET_32BIT
&& reload_completed
&& (arm_const_double_inline_cost (operands[1])
- <= ((optimize_size || arm_ld_sched) ? 3 : 4))"
+ <= arm_max_const_double_inline_cost ())"
[(const_int 0)]
"
arm_split_constant (SET, SImode, curr_insn,
@@ -6284,6 +6294,47 @@
"
)
+;; The normal way to do (symbol + offset) requires at least three
+;; instructions (depending on how big the offset is), as below:
+;; movw r0, #:lower16:g
+;; movt r0, #:upper16:g
+;; adds r0, #4
+;;
+;; A better way would be:
+;; movw r0, #:lower16:g+4
+;; movt r0, #:upper16:g+4
+;;
+;; The limitation of this approach is that the offset must be a 16-bit
+;; signed value, because the current assembler only supports REL-type
+;; relocations for such cases.  If the more powerful RELA type is
+;; supported in the future, we should update this pattern accordingly.
+(define_split
+ [(set (match_operand:SI 0 "arm_general_register_operand" "")
+ (const:SI (plus:SI (match_operand:SI 1 "general_operand" "")
+ (match_operand:SI 2 "const_int_operand" ""))))]
+ "TARGET_THUMB2
+ && arm_disable_literal_pool
+ && reload_completed
+ && GET_CODE (operands[1]) == SYMBOL_REF"
+ [(clobber (const_int 0))]
+ "
+ int offset = INTVAL (operands[2]);
+
+ if (offset < -0x8000 || offset > 0x7fff)
+ {
+ arm_emit_movpair (operands[0], operands[1]);
+      emit_insn (gen_rtx_SET (SImode, operands[0],
+                              gen_rtx_PLUS (SImode, operands[0], operands[2])));
+ }
+ else
+ {
+      rtx op = gen_rtx_CONST (SImode,
+                              gen_rtx_PLUS (SImode, operands[1], operands[2]));
+ arm_emit_movpair (operands[0], op);
+ }
+ "
+)
+
;; Split symbol_refs at the later stage (after cprop), instead of generating
;; movt/movw pair directly at expand. Otherwise corresponding high_sum
;; and lo_sum would be merged back into memory load at cprop. However,
diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
index 9b74038..0d24c32 100644
--- a/gcc/config/arm/arm.opt
+++ b/gcc/config/arm/arm.opt
@@ -267,3 +267,7 @@ Enable unaligned word and halfword accesses to packed data.
mneon-for-64bits
Target Report RejectNegative Var(use_neon_for_64bits) Init(0)
Use Neon to perform 64-bits operations rather than core registers.
+
+mslow-flash-data
+Target Report Var(target_slow_flash_data) Init(0)
+Assume loading data from flash is slower than fetching instructions.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index adbc45b..a5991cb 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -534,6 +534,7 @@ Objective-C and Objective-C++ Dialects}.
-mfix-cortex-m3-ldrd @gol
-munaligned-access @gol
-mneon-for-64bits @gol
+-mslow-flash-data @gol
-mrestrict-it}
@emph{AVR Options}
@@ -12295,6 +12296,12 @@ Enables using Neon to handle scalar 64-bits operations. This is
disabled by default since the cost of moving data from core registers
to Neon is high.
+@item -mslow-flash-data
+@opindex mslow-flash-data
+Assume loading data from flash is slower than fetching instructions.
+Therefore literal loads are minimized for better performance.
+This option is off by default.
+
@item -mrestrict-it
@opindex mrestrict-it
Restricts generation of IT blocks to conform to the rules of ARMv8.
diff --git a/gcc/testsuite/gcc.target/arm/thumb2-slow-flash-data.c b/gcc/testsuite/gcc.target/arm/thumb2-slow-flash-data.c
new file mode 100644
index 0000000..9852ea5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/thumb2-slow-flash-data.c
@@ -0,0 +1,74 @@
+/* The option -mslow-flash-data is just for performance tuning; it
+   doesn't totally disable the use of literal pools.  But for the
+   simple cases below, the use of the literal pool should be replaced
+   by movw/movt or a read-only constant pool.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_cortex_m } */
+/* { dg-require-effective-target arm_thumb2_ok } */
+/* { dg-options "-O2 -mthumb -mslow-flash-data" } */
+
+float sf;
+double df;
+long long l;
+static char *p = "Hello World";
+
+float
+testsf (float *p)
+{
+ if (*p > 1.1234f)
+ return 2.1234f;
+ else
+ return 3.1234f;
+}
+
+double
+testdf (double *p)
+{
+ if (*p > 4.1234)
+ return 2.1234;
+ else
+ return 3.1234;
+}
+
+long long
+testll (long long *p)
+{
+ if (*p > 0x123456789ABCDEFll)
+ return 0x111111111ll;
+ else
+ return 0x222222222ll;
+}
+
+char *
+testchar ()
+{
+ return p + 4;
+}
+
+int
+foo (int a, int b)
+{
+ int i;
+  volatile int *labelref = &&label1;
+
+ if (a > b)
+ {
+ while (i < b)
+ {
+ a += *labelref;
+ i += 1;
+ }
+ goto *labelref;
+ }
+ else
+ b = b + 3;
+
+ a = a * b;
+
+label1:
+ return a + b;
+}
+
+/* { dg-final { scan-assembler-times "movt" 13 } } */
+/* { dg-final { scan-assembler-times "movt.*LC0\\+4" 1 } } */