On 10/10/14 15:18, Richard Earnshaw wrote:
On 10/10/14 11:53, Kyrill Tkachov wrote:
Hi all,


Some early revisions of the Cortex-A53 have an erratum (835769) whereby
it is possible for a 64-bit multiply-accumulate instruction in
AArch64 state to generate an incorrect result.  The details are quite
complex and hard to determine statically, since branches in the code
may exist in some circumstances, but all cases end with a memory
(load, store, or prefetch) instruction followed immediately by the
multiply-accumulate operation.

The safest work-around for this issue is to make the compiler avoid
emitting multiply-accumulate instructions immediately after memory
instructions and the simplest way to do this is to insert a NOP. A
linker patching technique can also be used, by moving potentially
affected multiply-accumulate instruction into a patch region and
replacing the original instruction with a branch to the patch.

This patch achieves the compiler part by using the final prescan pass.
The insn length attribute is updated accordingly using ADJUST_INSN_LENGTH
so that conditional branches work properly.

The fix is disabled by default and can be turned on by the
-mfix-cortex-a53-835769 compiler option.

I'm attaching a trunk and a 4.9 version of the patch.
The 4.9 version is different only in that rtx_insn* is replaced by rtx.

Tested on aarch64-none-linux-gnu (and bootstrap with the option) and
built various large benchmarks with it.

Ok?

Thanks,
Kyrill

2014-10-10  Kyrylo Tkachov<kyrylo.tkac...@arm.com>
              Ramana Radhakrishnan<ramana.radhakrish...@arm.com>

       * config/aarch64/aarch64.h (FINAL_PRESCAN_INSN): Define.
       (ADJUST_INSN_LENGTH): Define.
       * config/aarch64/aarch64.opt (mfix-cortex-a53-835769): New option.
       * config/aarch64/aarch64.c (is_mem_p): New function.
       (is_memory_op): Likewise.
       (aarch64_prev_real_insn): Likewise.
       (is_madd_op): Likewise.
       (dep_between_memop_and_curr): Likewise.
       (aarch64_madd_needs_nop): Likewise.
       (aarch64_final_prescan_insn): Likewise.
       * doc/invoke.texi (AArch64 Options): Document -mfix-cortex-a53-835769
       and -mno-fix-cortex-a53-835769 options.


a53-workaround-new.patch


commit 6ee2bec04fbce15b13b59b54ef4fe11aeda74ff4
Author: Kyrill Tkachov <kyrylo.tkac...@arm.com>
Date:   Wed Oct 8 12:48:34 2014 +0000

     [AArch64] Add final prescan workaround for Cortex-A53 erratum.

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index b5f53d2..c57a467 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -308,6 +308,8 @@ aarch64_builtin_vectorized_function (tree fndecl,
extern void aarch64_split_combinev16qi (rtx operands[3]);
  extern void aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
+extern bool aarch64_madd_needs_nop (rtx_insn *);
+extern void aarch64_final_prescan_insn (rtx_insn *);
  extern bool
  aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
  void aarch64_atomic_assign_expand_fenv (tree *, tree *, tree *);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5144c35..76a2480 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7586,6 +7586,128 @@ aarch64_mangle_type (const_tree type)
    return NULL;
  }
+static int
+is_mem_p (rtx *x, void *data ATTRIBUTE_UNUSED)
+{
+  return MEM_P (*x);
+}
+
+static bool
+is_memory_op (rtx_insn *mem_insn)
+{
+   rtx pattern = PATTERN (mem_insn);
+   return for_each_rtx (&pattern, is_mem_p, NULL);
+}
+
+/* Find the first rtx_insn before insn that will generate an assembly
+   instruction.  */
+
+static rtx_insn *
+aarch64_prev_real_insn (rtx_insn *insn)
+{
+  if (!insn)
+    return NULL;
+
+  do
+    {
+      insn = prev_real_insn (insn);
+    }
+  while (insn && recog_memoized (insn) < 0);
+
+  return insn;
+}
+
+static bool
+is_madd_op (enum attr_type t1)
+{
+  unsigned int i;
+  /* A number of these may be AArch32 only.  */
+  enum attr_type mlatypes[] = {
+    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
+    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
+    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
+  };
+
+  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
+    {
+      if (t1 == mlatypes[i])
+       return true;
+    }
+
+  return false;
+}
+
+/* Check if there is a register dependency between a load and the insn
+   for which we hold recog_data.  */
+
+static bool
+dep_between_memop_and_curr (rtx memop)
+{
+  rtx load_reg;
+  int opno;
+
+  if (!memop)
+    return false;
+
+  if (!REG_P (SET_DEST (memop)))
+    return false;
+
+  load_reg = SET_DEST (memop);
+  for (opno = 0; opno < recog_data.n_operands; opno++)
+    {
+      rtx operand = recog_data.operand[opno];
+      if (REG_P (operand)
+          && reg_overlap_mentioned_p (load_reg, operand))
+        return true;
+
+    }
+  return false;
+}
+
+bool
+aarch64_madd_needs_nop (rtx_insn* insn)
+{
+  enum attr_type attr_type;
+  rtx_insn *prev;
+  rtx body;
+
+  if (!aarch64_fix_a53_err835769)
+    return false;
+
+  if (recog_memoized (insn) < 0)
+    return false;
+
+  attr_type = get_attr_type (insn);
+  if (!is_madd_op (attr_type))
+    return false;
+
+  prev = aarch64_prev_real_insn (insn);
+  if (!prev)
+    return false;
+
+  body = single_set (prev);
+
+  /* If the previous insn is a memory op and there is no dependency between
+     it and the madd, emit a nop between them.  If we know the previous insn is
+     a memory op but body is NULL, emit the nop to be safe, it's probably a
+     load/store pair insn.  */
+  if (is_memory_op (prev)
+      && GET_MODE (recog_data.operand[0]) == DImode
+      && (!dep_between_memop_and_curr (body)))
+    return true;
+
+  return false;
+
+}
+
+void
+aarch64_final_prescan_insn (rtx_insn *insn)
+{
+  if (aarch64_madd_needs_nop (insn))
+    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
+}
+
+
  /* Return the equivalent letter for size.  */
  static char
  sizetochar (int size)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index db950da..e9e5fd8 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -486,6 +486,15 @@ enum target_cpus
    (TARGET_CPU_generic | (AARCH64_CPU_DEFAULT_FLAGS << 6))
  #endif
+/* If inserting NOP before a mult-accumulate insn remember to adjust the
+   length so that conditional branching code is updated appropriately.  */
+#define ADJUST_INSN_LENGTH(insn, length)       \
+  if (aarch64_madd_needs_nop (insn))           \
+    length += 4;
+
+#define FINAL_PRESCAN_INSN(INSN, OPVEC, NOPERANDS)     \
+    aarch64_final_prescan_insn (INSN);                 \
+
  /* The processor for which instructions should be scheduled.  */
  extern enum aarch64_processor aarch64_tune;
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index f5a15b7..77deb2e 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -67,6 +67,10 @@ mgeneral-regs-only
  Target Report RejectNegative Mask(GENERAL_REGS_ONLY)
  Generate code which uses only the general registers
+mfix-cortex-a53-835769
+Target Report Var(aarch64_fix_a53_err835769) Init(0)
+Workaround for ARM Cortex-A53 Erratum number 835769
+
  mlittle-endian
  Target Report RejectNegative InverseMask(BIG_END)
  Assume target CPU is configured as little endian
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5fe7e15..acc9dd9 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -489,6 +489,7 @@ Objective-C and Objective-C++ Dialects}.
  -mstrict-align @gol
  -momit-leaf-frame-pointer  -mno-omit-leaf-frame-pointer @gol
  -mtls-dialect=desc  -mtls-dialect=traditional @gol
+-mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
  -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
@emph{Adapteva Epiphany Options}
@@ -11744,6 +11745,14 @@ of TLS variables.  This is the default.
  Use traditional TLS as the thread-local storage mechanism for dynamic accesses
  of TLS variables.
+@item -mfix-cortex-a53-835769
+@itemx -mno-fix-cortex-a53-835769
+@opindex mfix-cortex-a53-835769
+@opindex mno-fix-cortex-a53-835769
+Enable or disable the workaround for the ARM Cortex-A53 erratum number 835769.
+This will involve inserting a NOP instruction between memory instructions and
+64-bit integer multiply-accumulate instructions.
+
  @item -march=@var{name}
  @opindex march
  Specify the name of the target architecture, optionally suffixed by one or


a53-49-workaround.patch


commit 48cc738e2c38dd372ae054d30964dd780478c4d1
Author: Kyrill Tkachov <kyrylo.tkac...@arm.com>
Date:   Tue Oct 7 12:42:01 2014 +0000

     [AArch64] Add workaround for Cortex-A53 erratum 835769
2014-10-07 Kyrylo Tkachov <kyrylo.tkac...@arm.com>
                 Ramana Radhakrishnan  <ramana.radhakrish...@arm.com>
* config/aarch64/aarch64.h (FINAL_PRESCAN_INSN): Define.
        (ADJUST_INSN_LENGTH): Define.
        * config/aarch64/aarch64.opt (mfix-cortex-a53-835769): New
        option.
        * config/aarch64/aarch64.c (is_mem_p): New function.
        (is_memory_op): Likewise.
        (aarch64_prev_real_insn): Likewise.
        (is_madd_op): Likewise.
        (dep_between_memop_and_curr): Likewise.
        (aarch64_madd_needs_nop): Likewise.
        (aarch64_final_prescan_insn): Likewise.
        * doc/invoke.texi (AArch64 Options): Document
        -mfix-cortex-a53-835769 and -mno-fix-cortex-a53-835769 options.


+
+/* Check if there is a register dependency between a load and the insn
+   for which we hold recog_data.  */
+
+static bool
+dep_between_memop_and_curr (rtx memop)
+{
+  rtx load_reg;
+  int opno;
+
+  if (!memop)
+    return false;
+
+  if (!REG_P (SET_DEST (memop)))
+    return false;
+
+  load_reg = SET_DEST (memop);
+  for (opno = 0; opno < recog_data.n_operands; opno++)
+    {
+      rtx operand = recog_data.operand[opno];
+      if (REG_P (operand)
+          && reg_overlap_mentioned_p (load_reg, operand))
+        return true;
+
+    }
+  return false;
+}
+
+bool
+aarch64_madd_needs_nop (rtx insn)
+{
+  enum attr_type attr_type;
+  rtx prev;
+  rtx body;
+
+  if (!aarch64_fix_a53_err835769)
+    return false;
+
+  if (recog_memoized (insn) < 0)
+    return false;
+
+  attr_type = get_attr_type (insn);
+  if (!is_madd_op (attr_type))
+    return false;
+
+  prev = aarch64_prev_real_insn (insn);
+  if (!prev)
+    return false;
+
+  body = single_set (prev);
+
+  /* If the previous insn is a memory op and there is no dependency between
+     it and the madd, emit a nop between them.  If we know the previous insn is
+     a memory op but body is NULL, emit the nop to be safe, it's probably a
+     load/store pair insn.  */
+  if (is_memory_op (prev)
+      && GET_MODE (recog_data.operand[0]) == DImode
+      && (!dep_between_memop_and_curr (body)))
+    return true;
+
It seems a bit strange to me that we expect dep_between_memop_and_curr
to detect the non-single_set() case and act conservatively.  Better
would be to pull that test out to this level, giving us:


   body = single_set (prev);
   if (!body)
     /* Complex memory operation, possibly a load/store pair insn.  Just
        be conservative for now.  */
     return true;

Can do, but I think it should be:
body = single_set (prev);
if (!body && is_memory_op (prev))
  return true;

since we want to do this only when we really do know it's a memory op.
And restructure this a bit so that is_memory_op called checked only once...

Thanks,
Kyrill



   ...

Now dep_between... can assert that it is passed a valid SET() rtx.

R.



Reply via email to