* Claudiu Zissulescu <claz...@gmail.com> [2018-10-10 11:00:16 +0300]:

> Handle store cacheline hazard for A700 cpus by inserting two NOP_S
> between ST ST LD or their logical equivalent (like ST ST NOP_S NOP_S
> J_L.D LD)
> 
> gcc/
> 2016-08-01  Claudiu Zissulescu  <claz...@synopsys.com>
> 
>       * config/arc/arc-arch.h (ARC_TUNE_ARC7XX): New tune value.
>       * config/arc/arc.c (arc_active_insn): New function.
>       (check_store_cacheline_hazard): Likewise.
>       (workaround_arc_anomaly): Use check_store_cacheline_hazard.
>       (arc_override_options): Disable delay slot scheduler for older
>       A7.
>       (arc_store_addr_hazard_p): New implementation, old one renamed to
>       ...
>       (arc_store_addr_hazard_internal_p): Renamed.
>       (arc_reorg): Don't combine into brcc instructions which are part
>       of hardware hazard solution.
>       * config/arc/arc.md (attr tune): Consider new arc7xx tune value.
>       (tune_arc700): Likewise.
>       * config/arc/arc.opt (arc7xx): New tune value.
>       * config/arc/arc700.md: Improve A7 scheduler.

Basically happy with this, most just a few missing header comments on
new functions.

Thanks,
Andrew


> ---
>  gcc/config/arc/arc-arch.h |   1 +
>  gcc/config/arc/arc.c      | 142 ++++++++++++++++++++++++++++++++------
>  gcc/config/arc/arc.md     |   8 ++-
>  gcc/config/arc/arc.opt    |   3 +
>  gcc/config/arc/arc700.md  |  18 +----
>  5 files changed, 132 insertions(+), 40 deletions(-)
> 
> diff --git a/gcc/config/arc/arc-arch.h b/gcc/config/arc/arc-arch.h
> index 859af0684b8..ad540607e55 100644
> --- a/gcc/config/arc/arc-arch.h
> +++ b/gcc/config/arc/arc-arch.h
> @@ -71,6 +71,7 @@ enum arc_tune_attr
>    {
>      ARC_TUNE_NONE,
>      ARC_TUNE_ARC600,
> +    ARC_TUNE_ARC7XX,
>      ARC_TUNE_ARC700_4_2_STD,
>      ARC_TUNE_ARC700_4_2_XMAC,
>      ARC_TUNE_CORE_3,
> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
> index ab7735d6b38..90454928379 100644
> --- a/gcc/config/arc/arc.c
> +++ b/gcc/config/arc/arc.c
> @@ -1308,6 +1308,10 @@ arc_override_options (void)
>    if (TARGET_LONG_CALLS_SET)
>      target_flags &= ~MASK_MILLICODE_THUNK_SET;
>  
> +  /* A7 has an issue with delay slots.  */
> +  if (TARGET_ARC700 && (arc_tune != ARC_TUNE_ARC7XX))
> +    flag_delayed_branch = 0;
> +
>    /* These need to be done at start up.  It's convenient to do them here.  */
>    arc_init ();
>  }
> @@ -7529,11 +7533,91 @@ arc_invalid_within_doloop (const rtx_insn *insn)
>    return NULL;
>  }
>  
> +static rtx_insn *
> +arc_active_insn (rtx_insn *insn)

Missing header comment.

> +{
> +  rtx_insn *nxt = next_active_insn (insn);
> +
> +  if (nxt && GET_CODE (PATTERN (nxt)) == ASM_INPUT)
> +    nxt = next_active_insn (nxt);
> +  return nxt;
> +}
> +
> +/* Search for a sequence made out of two stores and a given number of
> +   loads, insert a nop if required.  */
> +
> +static void
> +check_store_cacheline_hazard (void)
> +{
> +  rtx_insn *insn, *succ0, *insn1;
> +  bool found = false;
> +
> +  for (insn = get_insns (); insn; insn = arc_active_insn (insn))
> +    {
> +      succ0 = arc_active_insn (insn);
> +
> +      if (!succ0)
> +     return;
> +
> +      if (!single_set (insn) || !single_set (succ0))
> +     continue;
> +
> +      if ((get_attr_type (insn) != TYPE_STORE)
> +       || (get_attr_type (succ0) != TYPE_STORE))
> +     continue;
> +
> +      /* Found at least two consecutive stores.  Goto the end of the
> +      store sequence.  */
> +      for (insn1 = succ0; insn1; insn1 = arc_active_insn (insn1))
> +     if (!single_set (insn1) || get_attr_type (insn1) != TYPE_STORE)
> +       break;
> +
> +      /* Now, check the next two instructions for the following cases:
> +         1. next instruction is a LD => insert 2 nops between store
> +         sequence and load.
> +      2. next-next instruction is a LD => inset 1 nop after the store
> +         sequence.  */
> +      if (insn1 && single_set (insn1)
> +       && (get_attr_type (insn1) == TYPE_LOAD))
> +     {
> +       found = true;
> +       emit_insn_before (gen_nopv (), insn1);
> +       emit_insn_before (gen_nopv (), insn1);
> +     }
> +      else
> +     {
> +       if (insn1 && (get_attr_type (insn1) == TYPE_COMPARE))
> +         {
> +           /* REG_SAVE_NOTE is used by Haifa scheduler, we are in
> +              reorg, so it is safe to reuse it for avoiding the
> +              current compare insn to be part of a BRcc
> +              optimization.  */
> +           add_reg_note (insn1, REG_SAVE_NOTE, GEN_INT (3));
> +         }
> +       insn1 = arc_active_insn (insn1);
> +       if (insn1 && single_set (insn1)
> +           && (get_attr_type (insn1) == TYPE_LOAD))
> +         {
> +           found = true;
> +           emit_insn_before (gen_nopv (), insn1);
> +         }
> +     }
> +
> +      insn = insn1;
> +      if (found)
> +     {
> +       /* warning (0, "Potential lockup sequence found, patching"); */

I'm not a fan of this approach.  I'd rather the comment explain what
problem was found and patched, and why displaying a warning is not
appropriate.  The commented out code just leaves me asking ... why?

> +       found = false;
> +     }
> +    }
> +}
> +
>  /* Return true if a load instruction (CONSUMER) uses the same address as a
>     store instruction (PRODUCER).  This function is used to avoid st/ld
>     address hazard in ARC700 cores.  */
> -bool
> -arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer)
> +
> +static bool
> +arc_store_addr_hazard_internal_p (rtx_insn* producer, rtx_insn* consumer)
>  {
>    rtx in_set, out_set;
>    rtx out_addr, in_addr;
> @@ -7581,6 +7665,14 @@ arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* 
> consumer)
>    return false;
>  }
>  
> +bool
> +arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer)
> +{

Missing header comment.

> +  if (TARGET_ARC700 && (arc_tune != ARC_TUNE_ARC7XX))
> +    return true;
> +  return arc_store_addr_hazard_internal_p (producer, consumer);
> +}
> +
>  /* The same functionality as arc_hazard.  It is called in machine
>     reorg before any other optimization.  Hence, the NOP size is taken
>     into account when doing branch shortening.  */
> @@ -7589,6 +7681,7 @@ static void
>  workaround_arc_anomaly (void)
>  {
>    rtx_insn *insn, *succ0;
> +  rtx_insn *succ1;
>  
>    /* For any architecture: call arc_hazard here.  */
>    for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
> @@ -7600,27 +7693,30 @@ workaround_arc_anomaly (void)
>       }
>      }
>  
> -  if (TARGET_ARC700)
> -    {
> -      rtx_insn *succ1;
> +  if (!TARGET_ARC700)
> +    return;
>  
> -      for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
> -     {
> -       succ0 = next_real_insn (insn);
> -       if (arc_store_addr_hazard_p (insn, succ0))
> -         {
> -           emit_insn_after (gen_nopv (), insn);
> -           emit_insn_after (gen_nopv (), insn);
> -           continue;
> -         }
> +  /* Old A7 are suffering of a cache hazard, and we need to insert two
> +     nops between any sequence of stores and a load.  */
> +  if (arc_tune != ARC_TUNE_ARC7XX)
> +    check_store_cacheline_hazard ();
>  
> -       /* Avoid adding nops if the instruction between the ST and LD is
> -          a call or jump.  */
> -       succ1 = next_real_insn (succ0);
> -       if (succ0 && !JUMP_P (succ0) && !CALL_P (succ0)
> -           && arc_store_addr_hazard_p (insn, succ1))
> -         emit_insn_after (gen_nopv (), insn);
> +  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
> +    {
> +      succ0 = next_real_insn (insn);
> +      if (arc_store_addr_hazard_internal_p (insn, succ0))
> +     {
> +       emit_insn_after (gen_nopv (), insn);
> +       emit_insn_after (gen_nopv (), insn);
> +       continue;
>       }
> +
> +      /* Avoid adding nops if the instruction between the ST and LD is
> +      a call or jump.  */
> +      succ1 = next_real_insn (succ0);
> +      if (succ0 && !JUMP_P (succ0) && !CALL_P (succ0)
> +       && arc_store_addr_hazard_internal_p (insn, succ1))
> +     emit_insn_after (gen_nopv (), insn);
>      }
>  }
>  
> @@ -8291,11 +8387,15 @@ arc_reorg (void)
>             if (!link_insn)
>               continue;
>             else
> -             /* Check if this is a data dependency.  */
>               {
> +               /* Check if this is a data dependency.  */
>                 rtx op, cc_clob_rtx, op0, op1, brcc_insn, note;
>                 rtx cmp0, cmp1;
>  
> +               /* Make sure we can use it for brcc insns.  */
> +               if (find_reg_note (link_insn, REG_SAVE_NOTE, GEN_INT (3)))
> +                 continue;
> +
>                 /* Ok this is the set cc. copy args here.  */
>                 op = XEXP (pc_target, 0);
>  
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index fb8a1c9ee09..caf7deda505 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -600,11 +600,13 @@
>  ;;   somehow modify them to become inelegible for delay slots if a decision
>  ;;   is made that makes conditional execution required.
>  
> -(define_attr "tune" "none,arc600,arc700_4_2_std,arc700_4_2_xmac, core_3, \
> -archs4x, archs4xd, archs4xd_slow"
> +(define_attr "tune" "none,arc600,arc7xx,arc700_4_2_std,arc700_4_2_xmac, \
> +core_3, archs4x, archs4xd, archs4xd_slow"
>    (const
>     (cond [(symbol_ref "arc_tune == TUNE_ARC600")
>         (const_string "arc600")
> +       (symbol_ref "arc_tune == ARC_TUNE_ARC7XX")
> +       (const_string "arc7xx")
>         (symbol_ref "arc_tune == TUNE_ARC700_4_2_STD")
>         (const_string "arc700_4_2_std")
>         (symbol_ref "arc_tune == TUNE_ARC700_4_2_XMAC")
> @@ -619,7 +621,7 @@ archs4x, archs4xd, archs4xd_slow"
>        (const_string "none"))))
>  
>  (define_attr "tune_arc700" "false,true"
> -  (if_then_else (eq_attr "tune" "arc700_4_2_std, arc700_4_2_xmac")
> +  (if_then_else (eq_attr "tune" "arc7xx, arc700_4_2_std, arc700_4_2_xmac")
>               (const_string "true")
>               (const_string "false")))
>  
> diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt
> index 93e18af1d27..bcffb2720ba 100644
> --- a/gcc/config/arc/arc.opt
> +++ b/gcc/config/arc/arc.opt
> @@ -262,6 +262,9 @@ Enum(arc_tune_attr) String(arc600) Value(ARC_TUNE_ARC600)
>  EnumValue
>  Enum(arc_tune_attr) String(arc601) Value(ARC_TUNE_ARC600)
>  
> +EnumValue
> +Enum(arc_tune_attr) String(arc7xx) Value(ARC_TUNE_ARC7XX)
> +
>  EnumValue
>  Enum(arc_tune_attr) String(arc700) Value(ARC_TUNE_ARC700_4_2_STD)
>  
> diff --git a/gcc/config/arc/arc700.md b/gcc/config/arc/arc700.md
> index a0f9f74a9f2..cbb868d8dcd 100644
> --- a/gcc/config/arc/arc700.md
> +++ b/gcc/config/arc/arc700.md
> @@ -145,28 +145,14 @@
>  ; no functional unit runs when blockage is reserved
>  (exclusion_set "blockage" "core, multiplier")
>  
> -(define_insn_reservation "data_load_DI" 4
> -  (and (eq_attr "tune_arc700" "true")
> -       (eq_attr "type" "load")
> -       (match_operand:DI 0 "" ""))
> -  "issue+dmp, issue+dmp, dmp_write_port, dmp_write_port")
> -
>  (define_insn_reservation "data_load" 3
>    (and (eq_attr "tune_arc700" "true")
> -       (eq_attr "type" "load")
> -       (not (match_operand:DI 0 "" "")))
> +       (eq_attr "type" "load"))
>    "issue+dmp, nothing, dmp_write_port")
>  
> -(define_insn_reservation "data_store_DI" 2
> -  (and (eq_attr "tune_arc700" "true")
> -       (eq_attr "type" "store")
> -       (match_operand:DI 0 "" ""))
> -  "issue+dmp_write_port, issue+dmp_write_port")
> -
>  (define_insn_reservation "data_store" 1
>    (and (eq_attr "tune_arc700" "true")
> -       (eq_attr "type" "store")
> -       (not (match_operand:DI 0 "" "")))
> +       (eq_attr "type" "store"))
>    "issue+dmp_write_port")
>  
>  (define_bypass 3 "data_store" "data_load" "arc_store_addr_hazard_p")
> -- 
> 2.17.1
> 

Reply via email to