Hi all,
This is the 4.8 backport of the Cortex-A53 erratum 835769 workaround.
4.8 doesn't have rtx_insns and the type attributes are different.
Other than that there's not much different from the trunk version.
Bootstrapped and tested on aarch64-none-linux-gnu with and without the
workaround enabled.
Compiled various large benchmarks with it.
Ok for the 4.8 branch?
Thanks,
Kyrill
2014-10-17 Kyrylo Tkachov <kyrylo.tkac...@arm.com>
* config/aarch64/aarch64.opt (mfix-cortex-a53-835769): New option.
* config/aarch64/aarch64.h (ADJUST_INSN_LENGTH): Define.
(FINAL_PRESCAN_INSN): Likewise.
* config/aarch64/aarch64.c (is_mem_p): New function.
(has_memory_op): Likewise.
(aarch64_prev_real_insn): Likewise.
(is_madd_op): Likewise.
(dep_between_memop_and_curr): Likewise.
(aarch64_madd_needs_nop): Likewise.
(aarch64_final_prescan_insn): Likewise.
commit a6620e170704e4967f84ebd315cf25738ae3c3f2
Author: Kyrylo Tkachov <kyrylo.tkac...@arm.com>
Date: Thu Oct 16 11:14:15 2014 +0100
[AArch64] Add -mfix-cortex-a53-835769 fix
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 5d0072f..07ff703 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -247,6 +247,8 @@ aarch64_builtin_vectorized_function (tree fndecl,
extern void aarch64_split_combinev16qi (rtx operands[3]);
extern void aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
+extern bool aarch64_madd_needs_nop (rtx);
+extern void aarch64_final_prescan_insn (rtx);
extern bool
aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0ac9ba1..f507278 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6040,6 +6040,131 @@ aarch64_mangle_type (const_tree type)
return NULL;
}
+
+/* Callback for for_each_rtx: return nonzero iff *X is a MEM rtx.  */
+
+static int
+is_mem_p (rtx *x, void *data ATTRIBUTE_UNUSED)
+{
+  return MEM_P (*x) ? 1 : 0;
+}
+
+
+/* Return true if the pattern of MEM_INSN mentions a MEM rtx anywhere.  */
+
+static bool
+has_memory_op (rtx mem_insn)
+{
+  rtx body = PATTERN (mem_insn);
+  return for_each_rtx (&body, is_mem_p, NULL) != 0;
+}
+
+
+/* Return the nearest insn before INSN that is both real and recognized,
+   i.e. the first earlier rtx that will generate an assembly instruction.
+   Return NULL if INSN is NULL or if there is no such insn.  */
+
+static rtx
+aarch64_prev_real_insn (rtx insn)
+{
+  if (insn == NULL)
+    return NULL;
+
+  for (;;)
+    {
+      insn = prev_real_insn (insn);
+      if (insn == NULL || recog_memoized (insn) >= 0)
+        return insn;
+    }
+}
+
+/* Return true iff T1 is one of the multiply-accumulate v8type values.  */
+
+static bool
+is_madd_op (enum attr_v8type t1)
+{
+  bool is_madd = (t1 == V8TYPE_MADD);
+  return is_madd || t1 == V8TYPE_MADDL;
+}
+
+
+/* Return true if the register set by the SET rtx MEMOP overlaps a register
+   operand, other than operand 0, of the insn described by recog_data.  */
+
+static bool
+dep_between_memop_and_curr (rtx memop)
+{
+  rtx dest;
+  int i;
+
+  gcc_assert (GET_CODE (memop) == SET);
+
+  dest = SET_DEST (memop);
+  if (!REG_P (dest))
+    return false;
+
+  for (i = 1; i < recog_data.n_operands; i++)
+    {
+      rtx op = recog_data.operand[i];
+      if (!REG_P (op))
+        continue;
+      if (reg_overlap_mentioned_p (dest, op))
+        return true;
+    }
+  return false;
+}
+
+
+
+/* When working around the Cortex-A53 erratum 835769,
+   given insn INSN, return true if it is a 64-bit multiply-accumulate
+   instruction and has a preceding memory instruction such that a NOP
+   should be inserted between them.  */
+
+bool
+aarch64_madd_needs_nop (rtx insn)
+{
+  enum attr_v8type attr_type;
+  rtx prev;
+  rtx body;
+
+  if (!aarch64_fix_a53_err835769)
+    return false;
+
+  if (!INSN_P (insn) || recog_memoized (insn) < 0)
+    return false;
+
+  attr_type = get_attr_v8type (insn);
+  if (!is_madd_op (attr_type))
+    return false;
+
+  prev = aarch64_prev_real_insn (insn);
+  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN,
+     and callers need not have extracted INSN; make recog_data describe it.  */
+  extract_constrain_insn_cached (insn);
+  if (!prev || !has_memory_op (prev))
+    return false;
+
+  body = single_set (prev);
+  /* If the previous insn is a memory op and there is no dependency between
+     it and the madd, emit a nop between them.  If we know it's a memop but
+     body is NULL, return true to be safe.  */
+  if (GET_MODE (recog_data.operand[0]) == DImode
+      && (!body || !dep_between_memop_and_curr (body)))
+    return true;
+  return false;
+}
+
+/* Implement FINAL_PRESCAN_INSN: output a NOP ahead of INSN when needed.  */
+
+void
+aarch64_final_prescan_insn (rtx insn)
+{
+  if (aarch64_madd_needs_nop (insn))
+    fputs ("\tnop // between mem op and mult-accumulate\n", asm_out_file);
+}
+
+
/* Return the equivalent letter for size. */
static unsigned char
sizetochar (int size)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 19ac5eb..1a8b993 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -465,6 +465,18 @@ enum target_cpus
(TARGET_CPU_generic | (AARCH64_CPU_DEFAULT_FLAGS << 6))
#endif
+/* When a NOP is going to be inserted before a mult-accumulate insn, add 4
+   to the insn's LENGTH so that conditional branching code is updated
+   appropriately.  */
+#define ADJUST_INSN_LENGTH(insn, length)        \
+  do {                                          \
+    if (aarch64_madd_needs_nop (insn))          \
+      (length) += 4;                            \
+  } while (0)
+
+#define FINAL_PRESCAN_INSN(INSN, OPVEC, NOPERANDS)      \
+    aarch64_final_prescan_insn (INSN); /* OPVEC, NOPERANDS unused.  */
+
/* The processor for which instructions should be scheduled. */
extern enum aarch64_processor aarch64_tune;
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 3518248..f414ad4 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -67,6 +67,10 @@ mgeneral-regs-only
Target Report RejectNegative Mask(GENERAL_REGS_ONLY)
Generate code which uses only the general registers
+mfix-cortex-a53-835769
+Target Report Var(aarch64_fix_a53_err835769) Init(0)
+Workaround for ARM Cortex-A53 Erratum number 835769
+
mlittle-endian
Target Report RejectNegative InverseMask(BIG_END)
Assume target CPU is configured as little endian
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index c96ef22..d3ac468 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -475,6 +475,7 @@ Objective-C and Objective-C++ Dialects}.
-mstrict-align @gol
-momit-leaf-frame-pointer -mno-omit-leaf-frame-pointer @gol
-mtls-dialect=desc -mtls-dialect=traditional @gol
+-mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
@emph{Adapteva Epiphany Options}
@@ -10934,6 +10935,14 @@ of TLS variables. This is the default.
Use traditional TLS as the thread-local storage mechanism for dynamic accesses
of TLS variables.
+@item -mfix-cortex-a53-835769
+@itemx -mno-fix-cortex-a53-835769
+@opindex mfix-cortex-a53-835769
+@opindex mno-fix-cortex-a53-835769
+Enable or disable the workaround for the ARM Cortex-A53 erratum number 835769.
+This will involve inserting a NOP instruction between memory instructions and
+64-bit integer multiply-accumulate instructions.
+
@item -march=@var{name}
@opindex march
Specify the name of the target architecture, optionally suffixed by one or