On Fri, May 24, 2024 at 9:27 AM Richard Biener <rguent...@suse.de> wrote:
>
> On Thu, 23 May 2024, Manolis Tsamis wrote:
>
> > This pass detects cases of expensive store forwarding and tries to avoid them
> > by reordering the stores and using suitable bit insertion sequences.
> > For example it can transform this:
> > For example it can transform this:
> >
> >      strb    w2, [x1, 1]
> >      ldr     x0, [x1]      # Expensive store forwarding to larger load.
> >
> > To:
> >
> >      ldr     x0, [x1]
> >      strb    w2, [x1]
> >      bfi     x0, x2, 0, 8
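
For reference, C code of the following shape (mirroring the included
testcases) can produce exactly this store/load pair on aarch64:

    typedef union {
        char arr_8[8];
        long long_value;
    } DataUnion;

    long ssll (DataUnion *data, char x)
    {
      data->arr_8[1] = x;       /* strb w2, [x1, 1] */
      return data->long_value;  /* ldr  x0, [x1]    */
    }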
>
> How do we represent atomics?  If the latter is a load-acquire or release
> the transform would be invalid.
>
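Agreed, the transform must not apply there.  A minimal C-level sketch of
the hazard (illustrative, with made-up names, not code from the patch):

    #include <stdatomic.h>

    extern _Atomic long flag;   /* the wide location, loaded with acquire */
    extern char *payload;       /* assumed to alias the bytes of flag */

    long reader (char x)
    {
      payload[1] = x;   /* small store; sinking it below the load changes
                           when it becomes visible to other threads */
      return atomic_load_explicit (&flag, memory_order_acquire);
    }

The pass currently drops candidates on volatile_refs_p; whether that also
covers the target's atomic load/store patterns needs double-checking.
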
> > Assembly like this can appear with bitfields or type punning / unions.
> > On stress-ng when running the cpu-union microbenchmark the following
> > speedups have been observed.
> >
> >   Neoverse-N1:      +29.4%
> >   Intel Coffeelake: +13.1%
> >   AMD 5950X:        +17.5%
> >
> >       PR rtl-optimization/48696
> >
> > gcc/ChangeLog:
> >
> >       * Makefile.in: Add avoid-store-forwarding.o.
> >       * common.opt: New option -favoid-store-forwarding.
> >       * params.opt: New param store-forwarding-max-distance.
> >       * passes.def: Schedule a new pass.
> >       * tree-pass.h (make_pass_rtl_avoid_store_forwarding): Declare.
> >       * avoid-store-forwarding.cc: New file.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.dg/avoid-store-forwarding-1.c: New test.
> >       * gcc.dg/avoid-store-forwarding-2.c: New test.
> >       * gcc.dg/avoid-store-forwarding-3.c: New test.
> >
> > Signed-off-by: Manolis Tsamis <manolis.tsa...@vrull.eu>
> > ---
> >
> >  gcc/Makefile.in                               |   1 +
> >  gcc/avoid-store-forwarding.cc                 | 554 ++++++++++++++++++
> >  gcc/common.opt                                |   4 +
> >  gcc/params.opt                                |   4 +
> >  gcc/passes.def                                |   1 +
> >  .../gcc.dg/avoid-store-forwarding-1.c         |  46 ++
> >  .../gcc.dg/avoid-store-forwarding-2.c         |  39 ++
> >  .../gcc.dg/avoid-store-forwarding-3.c         |  31 +
> >  gcc/tree-pass.h                               |   1 +
> >  9 files changed, 681 insertions(+)
> >  create mode 100644 gcc/avoid-store-forwarding.cc
> >  create mode 100644 gcc/testsuite/gcc.dg/avoid-store-forwarding-1.c
> >  create mode 100644 gcc/testsuite/gcc.dg/avoid-store-forwarding-2.c
> >  create mode 100644 gcc/testsuite/gcc.dg/avoid-store-forwarding-3.c
> >
> > diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> > index a7f15694c34..be969b1ca1d 100644
> > --- a/gcc/Makefile.in
> > +++ b/gcc/Makefile.in
> > @@ -1681,6 +1681,7 @@ OBJS = \
> >       statistics.o \
> >       stmt.o \
> >       stor-layout.o \
> > +     avoid-store-forwarding.o \
> >       store-motion.o \
> >       streamer-hooks.o \
> >       stringpool.o \
> > diff --git a/gcc/avoid-store-forwarding.cc b/gcc/avoid-store-forwarding.cc
> > new file mode 100644
> > index 00000000000..d90627c4872
> > --- /dev/null
> > +++ b/gcc/avoid-store-forwarding.cc
> > @@ -0,0 +1,554 @@
> > +/* Avoid store forwarding optimization pass.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   Contributed by VRULL GmbH.
> > +
> > +   This file is part of GCC.
> > +
> > +   GCC is free software; you can redistribute it and/or modify it
> > +   under the terms of the GNU General Public License as published by
> > +   the Free Software Foundation; either version 3, or (at your option)
> > +   any later version.
> > +
> > +   GCC is distributed in the hope that it will be useful, but
> > +   WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   General Public License for more details.
> > +
> > +   You should have received a copy of the GNU General Public License
> > +   along with GCC; see the file COPYING3.  If not see
> > +   <http://www.gnu.org/licenses/>.  */
> > +
> > +#include "config.h"
> > +#include "system.h"
> > +#include "coretypes.h"
> > +#include "backend.h"
> > +#include "rtl.h"
> > +#include "alias.h"
> > +#include "rtlanal.h"
> > +#include "tree-pass.h"
> > +#include "cselib.h"
> > +#include "predict.h"
> > +#include "insn-config.h"
> > +#include "expmed.h"
> > +#include "recog.h"
> > +#include "regset.h"
> > +#include "df.h"
> > +#include "expr.h"
> > +#include "memmodel.h"
> > +#include "emit-rtl.h"
> > +#include "vec.h"
> > +
> > +/* This pass tries to detect and avoid cases of store forwarding.
> > +   On many processors there is a large penalty when smaller stores are
> > +   forwarded to larger loads.  The idea used to avoid the stall is to move
> > +   the store after the load and in addition emit a bit insert sequence so
> > +   the load register has the correct value.  For example the following:
> > +
> > +     strb    w2, [x1, 1]
> > +     ldr     x0, [x1]
> > +
> > +   Will be transformed to:
> > +
> > +     ldr     x0, [x1]
> > +     and     w2, w2, 255
> > +     strb    w2, [x1]
> > +     bfi     x0, x2, 0, 8
> > +*/
> > +
> > +namespace {
> > +
> > +const pass_data pass_data_avoid_store_forwarding =
> > +{
> > +  RTL_PASS, /* type.  */
> > +  "avoid_store_forwarding", /* name.  */
> > +  OPTGROUP_NONE, /* optinfo_flags.  */
> > +  TV_NONE, /* tv_id.  */
> > +  0, /* properties_required.  */
> > +  0, /* properties_provided.  */
> > +  0, /* properties_destroyed.  */
> > +  0, /* todo_flags_start.  */
> > +  TODO_df_finish /* todo_flags_finish.  */
> > +};
> > +
> > +class pass_rtl_avoid_store_forwarding : public rtl_opt_pass
> > +{
> > +public:
> > +  pass_rtl_avoid_store_forwarding (gcc::context *ctxt)
> > +    : rtl_opt_pass (pass_data_avoid_store_forwarding, ctxt)
> > +  {}
> > +
> > +  /* opt_pass methods: */
> > +  virtual bool gate (function *)
> > +    {
> > +      return flag_avoid_store_forwarding && optimize >= 1;
> > +    }
> > +
> > +  virtual unsigned int execute (function *) override;
> > +}; // class pass_rtl_avoid_store_forwarding
> > +
> > +typedef struct
> > +{
> > +  /* The store instruction that is a store forwarding candidate.  */
> > +  rtx_insn *store_insn;
> > +  /* SET_DEST (single_set (store_insn)).  */
> > +  rtx store_mem;
> > +  /* The temporary that will hold the stored value at the original store
> > +     position.  */
> > +  rtx mov_reg;
> > +  /* The instruction sequence that inserts the stored value's bits at the
> > +     appropriate position in the loaded value.  */
> > +  rtx_insn *bits_insert_insns;
> > +  /* The byte offset for the store's position within the load.  */
> > +  HOST_WIDE_INT offset;
> > +
> > +  unsigned int insn_cnt;
> > +  bool remove;
> > +  bool forwarded;
> > +} store_info;
> > +
> > +static unsigned int stats_sf_detected = 0;
> > +static unsigned int stats_sf_avoided = 0;
> > +
> > +static rtx
> > +get_load_mem (rtx expr)
> > +{
> > +  if (!expr)
> > +    return NULL_RTX;
> > +
> > +  rtx mem = SET_SRC (expr);
> > +
> > +  if (GET_CODE (mem) == ZERO_EXTEND
> > +      || GET_CODE (mem) == SIGN_EXTEND)
> > +    mem = XEXP (mem, 0);
> > +
> > +  if (MEM_P (mem))
> > +    return mem;
> > +  else
> > +    return NULL_RTX;
> > +}
> > +
> > +/* Return true iff a store to STORE_MEM would write to a sub-region of bytes
> > +   from what LOAD_MEM would read.  If true also store the relative byte offset
> > +   of the store within the load to OFF_VAL.  */
> > +
> > +static bool
> > +is_store_forwarding (rtx store_mem, rtx load_mem, HOST_WIDE_INT *off_val)
> > +{
> > +  if (known_ge (GET_MODE_SIZE (GET_MODE (store_mem)),
> > +             GET_MODE_SIZE (GET_MODE (load_mem))))
> > +    return false;
> > +
> > +  rtx off = simplify_gen_binary (MINUS, GET_MODE (XEXP (store_mem, 0)),
> > +                              XEXP (store_mem, 0), XEXP (load_mem, 0));
> > +
> > +  if (CONST_INT_P (off))
> > +    {
> > +      *off_val = INTVAL (off);
> > +      scalar_int_mode store_mode, load_mode;
> > +      if (is_int_mode (GET_MODE (store_mem), &store_mode)
> > +       && is_int_mode (GET_MODE (load_mem), &load_mode))
>
> This is a quite severe limitation - most forwarding issues I ran into
> are caused by vectorization where we have scalar stores and a vector
> load.  And it happens for both integer and floating point elements.
>
Addressed in V2. I haven't searched extensively for such cases yet and
don't have performance numbers, but the pass now works fine with scalar
-> vector store forwarding (I added a simple testcase for it too).
I also added some simple costing for the generated bit-insert sequences:
depending on the architecture, the scalar -> vector sequences may be
large and/or unprofitable, but on others (e.g. aarch64) they're just a
few instructions, so I can see the benefit.
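
The new testcase is roughly of the following shape (a sketch with made-up
names; the actual test in V2 may differ):

    typedef int v4si __attribute__ ((vector_size (16)));

    typedef union {
        int arr_4[4];
        v4si vec;
    } DataUnionV;

    v4si ssvl (DataUnionV *data, int x)
    {
      data->arr_4[0] = x;   /* scalar store ...  */
      return data->vec;     /* ... forwarded to a wider vector load */
    }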

> > +     {
> > +       HOST_WIDE_INT store_mode_size = GET_MODE_SIZE (store_mode);
> > +       HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> > +
> > +       return *off_val >= 0
> > +              && (*off_val + store_mode_size <= load_mode_size);
> > +     }
> > +    }
> > +
> > +  return false;
> > +}
> > +
> > +/* Return a bit insertion sequence that would make DEST have the correct value
> > +   if the store represented by STORE_INFO were to be moved after DEST.  */
> > +
> > +static rtx_insn *
> > +generate_bit_insert_sequence (store_info *store_info, rtx dest,
> > +                           machine_mode load_inner_mode)
> > +{
> > +  scalar_int_mode store_mode, load_mode;
> > +  if (!is_int_mode (GET_MODE (store_info->store_mem), &store_mode)
> > +      || !is_int_mode (load_inner_mode, &load_mode))
> > +    return NULL;
> > +
> > +  HOST_WIDE_INT load_mem_size = GET_MODE_SIZE (load_mode);
> > +  HOST_WIDE_INT store_mem_size = GET_MODE_SIZE (store_mode);
> > +  HOST_WIDE_INT bf_offset_bytes;
> > +
> > +  if (BYTES_BIG_ENDIAN)
> > +    bf_offset_bytes = load_mem_size - store_mem_size - store_info->offset;
> > +  else
> > +    bf_offset_bytes = store_info->offset;
> > +
> > +  start_sequence ();
> > +  store_bit_field (dest, store_mem_size * BITS_PER_UNIT,
> > +                bf_offset_bytes * BITS_PER_UNIT, 0, 0,
> > +                GET_MODE (dest), store_info->mov_reg,
> > +                false, false);
> > +  rtx_insn *insns = get_insns ();
>
> While convenient this can actually end up spilling to memory which
> would be worse.  I think you want to call the actual workers for
> the cases you want to handle which can fail instead of spilling.
>
Addressed in V2. I decided to just iterate the generated sequence and
search for MEMs with contains_mem_rtx_p; I believe it is desirable not
to expose the worker functions. This should also make the pass
independent of any future refactorings of the store_bit_field helper
functions.
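
For reference, the check is along these lines (a sketch; the exact V2
code may differ):

    start_sequence ();
    store_bit_field (dest, store_mem_size * BITS_PER_UNIT,
                     bf_offset_bytes * BITS_PER_UNIT, 0, 0,
                     GET_MODE (dest), store_info->mov_reg, false, false);
    rtx_insn *insns = get_insns ();
    end_sequence ();

    /* Reject any sequence that touches memory: a spill would reintroduce
       the very store/load traffic this pass tries to avoid.  */
    for (rtx_insn *insn = insns; insn; insn = NEXT_INSN (insn))
      if (contains_mem_rtx_p (PATTERN (insn)))
        return NULL;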

> > +  end_sequence ();
> > +
> > +  return insns;
> > +}
> > +
> > +/* Given a list of small stores that are forwarded to LOAD_INSN, try to
> > +   rearrange them so that a store-forwarding penalty doesn't occur.  */
> > +
> > +static bool
> > +process_forwardings (vec<store_info> &stores, rtx_insn *load_insn)
> > +{
> > +  rtx load = single_set (load_insn);
> > +  machine_mode load_inner_mode = GET_MODE (get_load_mem (load));
> > +
> > +  /* If the stores cover all the bytes of the load without overlap then we can
> > +     eliminate the load entirely and use the computed value instead.  */
> > +  HOST_WIDE_INT load_size
> > +    = GET_MODE_SIZE (as_a <scalar_int_mode> (GET_MODE (get_load_mem (load))));
> > +  sbitmap forwarded_bytes = sbitmap_alloc (load_size);
> > +  bitmap_clear (forwarded_bytes);
> > +
> > +  unsigned int i;
> > +  store_info* it;
> > +  FOR_EACH_VEC_ELT (stores, i, it)
> > +    {
> > +      HOST_WIDE_INT store_size
> > +     = GET_MODE_SIZE (as_a <scalar_int_mode> (GET_MODE (it->store_mem)));
> > +      if (bitmap_bit_in_range_p (forwarded_bytes, it->offset,
> > +                              it->offset + store_size - 1))
> > +     break;
> > +      bitmap_set_range (forwarded_bytes, it->offset, store_size);
> > +    }
> > +
> > +  bitmap_not (forwarded_bytes, forwarded_bytes);
> > +  bool eliminate_load = bitmap_empty_p (forwarded_bytes);
> > +
> > +  stats_sf_detected++;
> > +
> > +  if (dump_file)
> > +    {
> > +      fprintf (dump_file, "Store forwarding%s detected:\n",
> > +            (stores.length () > 1) ? "s" : "");
> > +
> > +      FOR_EACH_VEC_ELT (stores, i, it)
> > +     {
> > +       fprintf (dump_file, "From: ");
> > +       print_rtl_single (dump_file, it->store_insn);
> > +     }
> > +
> > +      fprintf (dump_file, "To: ");
> > +      print_rtl_single (dump_file, load_insn);
> > +
> > +      if (eliminate_load)
> > +     fprintf (dump_file, "(Load elimination candidate)\n");
> > +    }
> > +
> > +  rtx dest;
> > +  if (eliminate_load)
> > +    dest = gen_reg_rtx (load_inner_mode);
> > +  else
> > +    dest = SET_DEST (load);
> > +
> > +  int move_to_front = -1;
> > +
> > +  /* Check if we can emit bit insert instructions for all forwarded stores.  */
> > +  FOR_EACH_VEC_ELT (stores, i, it)
> > +    {
> > +      it->mov_reg = gen_reg_rtx (GET_MODE (it->store_mem));
> > +      rtx_insn *insns = NULL;
> > +
> > +      /* If we're eliminating the load then find the store with zero offset
> > +      and use it as the base register to avoid a bit insert.  */
> > +      if (eliminate_load && it->offset == 0)
> > +     {
> > +       start_sequence ();
> > +
> > +       /* We can use a paradoxical subreg to force this to a wider mode, as
> > +          the only use will be inserting the bits (i.e., we don't care about
> > +          the value of the higher bits).  */
> > +       rtx ext0 = gen_rtx_SUBREG (GET_MODE (dest), it->mov_reg, 0);
> > +       rtx_insn *move0 = emit_move_insn (dest, ext0);
> > +       if (recog_memoized (move0) >= 0)
> > +         {
> > +           insns = get_insns ();
> > +           move_to_front = (int) i;
> > +         }
> > +
> > +       end_sequence ();
> > +     }
> > +
> > +      if (!insns)
> > +     insns = generate_bit_insert_sequence (&(*it), dest, load_inner_mode);
> > +
> > +      if (!insns)
> > +     {
> > +       if (dump_file)
> > +         {
> > +           fprintf (dump_file, "Failed due to: ");
> > +           print_rtl_single (dump_file, it->store_insn);
> > +         }
> > +       return false;
> > +     }
> > +
> > +      it->bits_insert_insns = insns;
> > +    }
> > +
> > +  /* If we have a move instead of bit insert, it needs to be emitted first in
> > +     the resulting sequence.  */
> > +  if (move_to_front != -1)
> > +    {
> > +      stores.safe_push (stores[move_to_front]);
> > +      stores.ordered_remove (move_to_front);
> > +    }
> > +
> > +  if (dump_file)
> > +    {
> > +      fprintf (dump_file, "Store forwarding%s avoided with bit inserts:\n",
> > +            (stores.length () > 1) ? "s" : "");
> > +
> > +      FOR_EACH_VEC_ELT (stores, i, it)
> > +     {
> > +       if (stores.length () > 1)
> > +         {
> > +           fprintf (dump_file, "For: ");
> > +           print_rtl_single (dump_file, it->store_insn);
> > +         }
> > +
> > +       fprintf (dump_file, "With sequence:\n");
> > +
> > +       for (rtx_insn *insn = it->bits_insert_insns; insn;
> > +            insn = NEXT_INSN (insn))
> > +         {
> > +           fprintf (dump_file, "  ");
> > +           print_rtl_single (dump_file, insn);
> > +         }
> > +     }
> > +    }
> > +
> > +  stats_sf_avoided++;
> > +
> > +  if (eliminate_load)
> > +    {
> > +      machine_mode outer_mode = GET_MODE (SET_DEST (load));
> > +      rtx_code extend = ZERO_EXTEND;
> > +      if (outer_mode != load_inner_mode)
> > +     extend = GET_CODE (SET_SRC (load));
> > +
> > +      rtx load_value = simplify_gen_unary (extend, outer_mode, dest,
> > +                                        load_inner_mode);
> > +      rtx load_move = gen_move_insn (SET_DEST (load), load_value);
> > +      df_insn_rescan (emit_insn_after (load_move, load_insn));
> > +    }
> > +
> > +  FOR_EACH_VEC_ELT (stores, i, it)
> > +    {
> > +      /* Emit code that updates the loaded value to account for the
> > +      missing store.  */
> > +      df_insn_rescan (emit_insn_after (it->bits_insert_insns, load_insn));
> > +    }
> > +
> > +  FOR_EACH_VEC_ELT (stores, i, it)
> > +    {
> > +      rtx store_set = single_set (it->store_insn);
> > +      /* Create a register move at the store's original position to save the
> > +      stored value.  */
> > +      rtx mov1 = gen_move_insn (it->mov_reg, SET_SRC (store_set));
> > +      df_insn_rescan (emit_insn_before (mov1, it->store_insn));
> > +      /* Create a new store after the load with the saved original value.
> > +      This avoids the forwarding stall.  */
> > +      rtx mov2 = gen_move_insn (SET_DEST (store_set), it->mov_reg);
> > +      df_insn_rescan (emit_insn_after (mov2, load_insn));
> > +      /* Done, delete the original store.  */
> > +      set_insn_deleted (it->store_insn);
> > +    }
> > +
> > +  df_insn_rescan (load_insn);
> > +
> > +  if (eliminate_load)
> > +    set_insn_deleted (load_insn);
> > +
> > +  return true;
> > +}
> > +
> > +/* Process BB for expensive store forwardings.  */
> > +
> > +static void
> > +avoid_store_forwarding (basic_block bb)
> > +{
> > +  auto_vec<store_info, 8> store_exprs;
> > +  rtx_insn *insn;
> > +  unsigned int insn_cnt = 0;
> > +
> > +  FOR_BB_INSNS (bb, insn)
> > +    {
> > +      if (!NONDEBUG_INSN_P (insn))
> > +     continue;
> > +
> > +      rtx set = single_set (insn);
> > +
> > +      /* Store forwarding issues are unlikely if we cross a call.
> > +      Clear store forwarding candidates if we can't understand INSN.  */
> > +      if (CALL_P (insn) || !set || volatile_refs_p (set))
> > +     {
> > +       store_exprs.truncate (0);
> > +       continue;
> > +     }
> > +
> > +      rtx load_mem = get_load_mem (set);
> > +      int removed_count = 0;
> > +
> > +      if (MEM_P (SET_DEST (set)))
> > +     {
> > +       /* Record store forwarding candidate.  */
> > +       store_info info;
> > +       info.store_insn = insn;
> > +       info.store_mem = SET_DEST (set);
> > +       info.insn_cnt = insn_cnt;
> > +       info.remove = false;
> > +       info.forwarded = false;
> > +       store_exprs.safe_push (info);
> > +     }
> > +      else if (load_mem)
> > +     {
> > +       /* Process load for possible store forwardings.  */
> > +       auto_vec<store_info> forwardings;
> > +       bool partial_forwarding = false;
> > +       bool remove_rest = false;
> > +
> > +       unsigned int i;
> > +       store_info *it;
> > +       FOR_EACH_VEC_ELT_REVERSE (store_exprs, i, it)
> > +         {
>
> If you have a basic-block with a lot of stores and loads this becomes
> quadratic.  In practice the store buffer of CPUs has limited size
> and stores remain in the store buffer for a finite time.  I think
> it makes sense to only keep a fixed number of stores (use a
> configurable --param?) and likewise limit the distance between
> store-load candidates you inspect (or maybe just that, by counting
> non-debug insns).  That makes it technically O(1) but at least
> avoids quadraticness for large BBs.
>
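For reference, the posted version already bounds the store-to-load distance
via --param=store-forwarding-max-distance (see the end of
avoid_store_forwarding below), which also ages candidates out of
store_exprs.  Directly capping the candidate list could look roughly like
this sketch, where store-forwarding-max-stores is a made-up param that is
not part of this patch:

    /* Hypothetical: keep at most a fixed number of tracked stores so each
       load inspects a bounded window of candidates.  */
    if (store_exprs.length () >= (unsigned) param_store_forwarding_max_stores)
      store_exprs.ordered_remove (0);
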
> > +           rtx store_mem = it->store_mem;
> > +           HOST_WIDE_INT off_val;
> > +
> > +           if (remove_rest)
> > +             {
> > +               it->remove = true;
> > +               removed_count++;
> > +             }
> > +           else if (is_store_forwarding (store_mem, load_mem, &off_val))
> > +             {
> > +               /* Check if moving this store after the load is legal.  */
> > +               bool write_dep = false;
> > +               for (unsigned int j = store_exprs.length () - 1; j != i; j--)
> > +                 if (!store_exprs[j].forwarded
> > +                     && output_dependence (store_mem,
> > +                                           store_exprs[j].store_mem))
> > +                   {
> > +                     write_dep = true;
> > +                     break;
> > +                   }
> > +
> > +               if (!write_dep)
> > +                 {
> > +                   it->forwarded = true;
> > +                   it->offset = off_val;
> > +                   forwardings.safe_push (*it);
> > +                 }
> > +               else
> > +                 partial_forwarding = true;
> > +
> > +               it->remove = true;
> > +               removed_count++;
> > +             }
> > +           else if (true_dependence (store_mem, GET_MODE (store_mem),
> > +                                     load_mem))
> > +             {
> > +               /* We cannot keep a store forwarding candidate if it possibly
> > +                  interferes with this load.  */
> > +               it->remove = true;
> > +               removed_count++;
> > +               remove_rest = true;
> > +             }
> > +         }
> > +
> > +       if (!forwardings.is_empty () && !partial_forwarding)
> > +         process_forwardings (forwardings, insn);
> > +     }
> > +      else
> > +     {
> > +       rtx reg = SET_DEST (set);
> > +
> > +       while (GET_CODE (reg) == ZERO_EXTRACT
> > +             || GET_CODE (reg) == STRICT_LOW_PART
> > +             || GET_CODE (reg) == SUBREG)
> > +         reg = XEXP (reg, 0);
> > +
> > +       /* Drop store forwarding candidates when the address register is
> > +          overwritten.  */
> > +       if (REG_P (reg))
> > +         {
> > +           bool remove_rest = false;
> > +           unsigned int i;
> > +           store_info *it;
> > +           FOR_EACH_VEC_ELT_REVERSE (store_exprs, i, it)
> > +             {
> > +               if (remove_rest
> > +                   || reg_overlap_mentioned_p (reg, it->store_mem))
> > +                 {
> > +                   it->remove = true;
> > +                   removed_count++;
> > +                   remove_rest = true;
> > +                 }
> > +             }
> > +         }
> > +       else
> > +         {
> > +           /* We can't understand INSN.  */
> > +           store_exprs.truncate (0);
> > +           continue;
> > +         }
> > +     }
> > +
> > +      if (removed_count)
> > +     {
> > +       unsigned int i, j;
> > +       store_info *it;
> > +       VEC_ORDERED_REMOVE_IF (store_exprs, i, j, it, it->remove);
> > +     }
> > +
> > +      /* Don't consider store forwarding if the RTL instruction distance is
> > +      more than PARAM_STORE_FORWARDING_MAX_DISTANCE.  */
> > +      if (!store_exprs.is_empty ()
> > +       && (store_exprs[0].insn_cnt
> > +           + param_store_forwarding_max_distance <= insn_cnt))
> > +     store_exprs.ordered_remove (0);
> > +
> > +      insn_cnt++;
> > +    }
> > +}
> > +
> > +unsigned int
> > +pass_rtl_avoid_store_forwarding::execute (function *fn)
> > +{
> > +  df_set_flags (DF_DEFER_INSN_RESCAN);
> > +  df_note_add_problem ();
> > +
> > +  init_alias_analysis ();
> > +  cselib_init (CSELIB_RECORD_MEMORY | CSELIB_PRESERVE_CONSTANTS);
> > +
> > +  stats_sf_detected = 0;
> > +  stats_sf_avoided = 0;
> > +
> > +  basic_block bb;
> > +  FOR_EACH_BB_FN (bb, fn)
> > +    avoid_store_forwarding (bb);
> > +
> > +  end_alias_analysis ();
> > +  cselib_finish ();
> > +  df_analyze ();
> > +
> > +  statistics_counter_event (fn, "Store forwardings detected: ",
> > +                         stats_sf_detected);
> > +  statistics_counter_event (fn, "Store forwardings avoided: ",
> > +                         stats_sf_avoided);
> > +
> > +  return 0;
> > +}
> > +
> > +} // anon namespace.
> > +
> > +rtl_opt_pass *
> > +make_pass_rtl_avoid_store_forwarding (gcc::context *ctxt)
> > +{
> > +  return new pass_rtl_avoid_store_forwarding (ctxt);
> > +}
> > diff --git a/gcc/common.opt b/gcc/common.opt
> > index 2c078fdd1f8..2fcf7170c2a 100644
> > --- a/gcc/common.opt
> > +++ b/gcc/common.opt
> > @@ -1747,6 +1747,10 @@ fgcse-sm
> >  Common Var(flag_gcse_sm) Init(0) Optimization
> >  Perform store motion after global common subexpression elimination.
> >
> > +favoid-store-forwarding
> > +Common Var(flag_avoid_store_forwarding) Init(0) Optimization
> > +Try to avoid store forwarding.
> > +
> >  fgcse-las
> >  Common Var(flag_gcse_las) Init(0) Optimization
> >  Perform redundant load after store elimination in global common subexpression
> > diff --git a/gcc/params.opt b/gcc/params.opt
> > index d34ef545bf0..b8115f5c27a 100644
> > --- a/gcc/params.opt
> > +++ b/gcc/params.opt
> > @@ -1032,6 +1032,10 @@ Allow the store merging pass to introduce unaligned stores if it is legal to do
> >  Common Joined UInteger Var(param_store_merging_max_size) Init(65536) IntegerRange(1, 65536) Param Optimization
> >  Maximum size of a single store merging region in bytes.
> >
> > +-param=store-forwarding-max-distance=
> > +Common Joined UInteger Var(param_store_forwarding_max_distance) Init(10) IntegerRange(1, 1000) Param Optimization
> > +Maximum instruction distance between a small store and the larger load it may be forwarded to.
> > +
> >  -param=switch-conversion-max-branch-ratio=
> >  Common Joined UInteger Var(param_switch_conversion_branch_ratio) Init(8) IntegerRange(1, 65536) Param Optimization
> >  The maximum ratio between array size and switch branches for a switch conversion to take place.
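
For reference, with this patch applied the pass and its knobs can be
exercised with a command line along the lines of:

    gcc -O2 -favoid-store-forwarding \
        --param=store-forwarding-max-distance=20 \
        -fdump-rtl-avoid_store_forwarding test.c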
> > diff --git a/gcc/passes.def b/gcc/passes.def
> > index 1cbbd413097..1e608774707 100644
> > --- a/gcc/passes.def
> > +++ b/gcc/passes.def
> > @@ -462,6 +462,7 @@ along with GCC; see the file COPYING3.  If not see
> >        NEXT_PASS (pass_lower_subreg);
> >        NEXT_PASS (pass_df_initialize_opt);
> >        NEXT_PASS (pass_cse);
> > +      NEXT_PASS (pass_rtl_avoid_store_forwarding);
> >        NEXT_PASS (pass_rtl_fwprop);
> >        NEXT_PASS (pass_rtl_cprop);
> >        NEXT_PASS (pass_rtl_pre);
> > diff --git a/gcc/testsuite/gcc.dg/avoid-store-forwarding-1.c b/gcc/testsuite/gcc.dg/avoid-store-forwarding-1.c
> > new file mode 100644
> > index 00000000000..0775aee898b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/avoid-store-forwarding-1.c
> > @@ -0,0 +1,46 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-rtl-avoid_store_forwarding" } */
> > +
> > +typedef union {
> > +    char arr_8[8];
> > +    long long_value;
> > +} DataUnion;
> > +
> > +long ssll_1 (DataUnion *data, char x)
> > +{
> > +  data->arr_8[0] = x;
> > +  return data->long_value;
> > +}
> > +
> > +long ssll_2 (DataUnion *data, char x)
> > +{
> > +  data->arr_8[1] = x;
> > +  return data->long_value;
> > +}
> > +
> > +long ssll_3 (DataUnion *data, char x)
> > +{
> > +  data->arr_8[7] = x;
> > +  return data->long_value;
> > +}
> > +
> > +long ssll_4 (DataUnion **data, char x)
> > +{
> > +  (*data)->arr_8[0] = x;
> > +  return (*data)->long_value;
> > +}
> > +
> > +long ssll_5 (DataUnion **data, char x)
> > +{
> > +  (*data)->arr_8[1] = x;
> > +  return (*data)->long_value;
> > +}
> > +
> > +long ssll_6 (DataUnion **data, char x)
> > +{
> > +  (*data)->arr_8[7] = x;
> > +  return (*data)->long_value;
> > +}
> > +
> > +/* { dg-final { scan-rtl-dump-times "Store forwarding detected" 6 "avoid_store_forwarding" } } */
> > +/* { dg-final { scan-rtl-dump-times "Store forwarding avoided" 6 "avoid_store_forwarding" } } */
> > diff --git a/gcc/testsuite/gcc.dg/avoid-store-forwarding-2.c b/gcc/testsuite/gcc.dg/avoid-store-forwarding-2.c
> > new file mode 100644
> > index 00000000000..cd81aa248fe
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/avoid-store-forwarding-2.c
> > @@ -0,0 +1,39 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-rtl-avoid_store_forwarding" } */
> > +
> > +typedef union {
> > +    char arr_8[8];
> > +    int long_value;
> > +} DataUnion1;
> > +
> > +long no_ssll_1 (DataUnion1 *data, char x)
> > +{
> > +  data->arr_8[4] = x;
> > +  return data->long_value;
> > +}
> > +
> > +long no_ssll_2 (DataUnion1 *data, char x)
> > +{
> > +  data->arr_8[5] = x;
> > +  return data->long_value;
> > +}
> > +
> > +typedef union {
> > +    char arr_8[8];
> > +    short long_value[4];
> > +} DataUnion2;
> > +
> > +long no_ssll_3 (DataUnion2 *data, char x)
> > +{
> > +  data->arr_8[4] = x;
> > +  return data->long_value[1];
> > +}
> > +
> > +long no_ssll_4 (DataUnion2 *data, char x)
> > +{
> > +  data->arr_8[0] = x;
> > +  return data->long_value[1];
> > +}
> > +
> > +/* { dg-final { scan-rtl-dump-times "Store forwarding detected" 0 "avoid_store_forwarding" } } */
> > +/* { dg-final { scan-rtl-dump-times "Store forwarding avoided" 0 "avoid_store_forwarding" } } */
> > diff --git a/gcc/testsuite/gcc.dg/avoid-store-forwarding-3.c b/gcc/testsuite/gcc.dg/avoid-store-forwarding-3.c
> > new file mode 100644
> > index 00000000000..3175f882c86
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/avoid-store-forwarding-3.c
> > @@ -0,0 +1,31 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-rtl-avoid_store_forwarding" } */
> > +
> > +typedef union {
> > +    char arr_8[8];
> > +    long long_value;
> > +} DataUnion;
> > +
> > +long ssll_multi_1 (DataUnion **data, char x)
> > +{
> > +  (*data)->arr_8[0] = x;
> > +  (*data)->arr_8[2] = x;
> > +  return (*data)->long_value;
> > +}
> > +
> > +long ssll_multi_2 (DataUnion **data, char x)
> > +{
> > +  (*data)->arr_8[0] = x;
> > +  (*data)->arr_8[1] = 11;
> > +  return (*data)->long_value;
> > +}
> > +
> > +long ssll_multi_3 (DataUnion **data, char x, short y)
> > +{
> > +  (*data)->arr_8[1] = x;
> > +  __builtin_memcpy((*data)->arr_8 + 4, &y, sizeof(short));
> > +  return (*data)->long_value;
> > +}
> > +
> > +/* { dg-final { scan-rtl-dump-times "Store forwardings detected" 3 "avoid_store_forwarding" } } */
> > +/* { dg-final { scan-rtl-dump-times "Store forwardings avoided" 3 "avoid_store_forwarding" } } */
> > diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> > index 29267589eeb..49957ba3373 100644
> > --- a/gcc/tree-pass.h
> > +++ b/gcc/tree-pass.h
> > @@ -570,6 +570,7 @@ extern rtl_opt_pass *make_pass_rtl_dse3 (gcc::context *ctxt);
> >  extern rtl_opt_pass *make_pass_rtl_cprop (gcc::context *ctxt);
> >  extern rtl_opt_pass *make_pass_rtl_pre (gcc::context *ctxt);
> >  extern rtl_opt_pass *make_pass_rtl_hoist (gcc::context *ctxt);
> > +extern rtl_opt_pass *make_pass_rtl_avoid_store_forwarding (gcc::context *ctxt);
> >  extern rtl_opt_pass *make_pass_rtl_store_motion (gcc::context *ctxt);
> >  extern rtl_opt_pass *make_pass_cse_after_global_opts (gcc::context *ctxt);
> >  extern rtl_opt_pass *make_pass_rtl_ifcvt (gcc::context *ctxt);
> >
>
> --
> Richard Biener <rguent...@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
