Oops.  Fixed post title.

On Thu, 2015-03-26 at 10:23 -0500, Bill Schmidt wrote:
> Hi,
> 
> This is a follow-up to
> https://gcc.gnu.org/ml/gcc-patches/2015-03/msg01310.html, which
> backported the POWER-specific little-endian swap optimization pass to
> the 4.9 branch.  We also need to backport this to the 4.8 branch.  This
> patch does that.
> 
> The patch is very similar to the 4.9 backport, except for two things.
> First, the passes infrastructure changed quite a bit between 4.8 and
> 4.9, so the code to describe the new pass to the pass manager is
> somewhat different.  Second, I've omitted three of the test cases, which
> happen to fail on 4.8 for unrelated reasons.  (We run out of volatile
> registers and end up saving non-volatiles to the stack in the prologue,
> which generates load/swap sequences for now.)  
> 
> Tested on powerpc64le-unknown-linux-gnu with no regressions.  Is this OK
> for 4.8?
> 
> Thanks,
> Bill
> 
> 
> [gcc]
> 
> 2015-03-26  Bill Schmidt  <wschm...@linux.vnet.ibm.com>
> 
>       Backport of r214242, r214254, and bug fix patches from mainline
>       * config/rs6000/rs6000.c (tree-pass.h): New #include.
>       (rs6000_analyze_swaps): New declaration.
>       (gate_analyze_swaps): New function.
>       (execute_analyze_swaps): Likewise.
>       (pass_analyze_swaps): New struct rtl_opt_pass.
>       (rs6000_option_override): Register swap-optimization pass.
>       (swap_web_entry): New class.
>       (special_handling_values): New enum.
>       (union_defs): New function.
>       (union_uses): Likewise.
>       (insn_is_load_p): Likewise.
>       (insn_is_store_p): Likewise.
>       (insn_is_swap_p): Likewise.
>       (rtx_is_swappable_p): Likewise.
>       (insn_is_swappable_p): Likewise.
>       (chain_purpose): New enum.
>       (chain_contains_only_swaps): New function.
>       (mark_swaps_for_removal): Likewise.
>       (swap_const_vector_halves): Likewise.
>       (adjust_subreg_index): Likewise.
>       (permute_load): Likewise.
>       (permute_store): Likewise.
>       (adjust_extract): Likewise.
>       (adjust_splat): Likewise.
>       (handle_special_swappables): Likewise.
>       (replace_swap_with_copy): Likewise.
>       (dump_swap_insn_table): Likewise.
>       (rs6000_analyze_swaps): Likewise.
>       * config/rs6000/rs6000.opt (moptimize-swaps): New option.
>       * df.h (web_entry_base): New class, replacing struct web_entry.
>       (web_entry_base::pred): New method.
>       (web_entry_base::set_pred): Likewise.
>       (web_entry_base::unionfind_root): Likewise.
>       (web_entry_base::unionfind_union): Likewise.
>       (unionfind_root): Delete external reference.
>       (unionfind_union): Likewise.
>       (union_defs): Likewise.
>       * web.c (web_entry_base::unionfind_root): Convert to method.
>       (web_entry_base::unionfind_union): Likewise.
>       (web_entry): New class.
>       (union_match_dups): Convert to use class structure.
>       (union_defs): Likewise.
>       (entry_register): Likewise.
>       (web_main): Likewise.
> 
> 
> [gcc/testsuite]
> 
> 2015-03-26  Bill Schmidt  <wschm...@linux.vnet.ibm.com>
> 
>       Backport r214254 and related tests from mainline
>       * gcc.target/powerpc/swaps-p8-1.c: New test.
>       * gcc.target/powerpc/swaps-p8-3.c: New test.
>       * gcc.target/powerpc/swaps-p8-4.c: New test.
>       * gcc.target/powerpc/swaps-p8-5.c: New test.
>       * gcc.target/powerpc/swaps-p8-6.c: New test.
>       * gcc.target/powerpc/swaps-p8-7.c: New test.
>       * gcc.target/powerpc/swaps-p8-8.c: New test.
>       * gcc.target/powerpc/swaps-p8-9.c: New test.
>       * gcc.target/powerpc/swaps-p8-10.c: New test.
>       * gcc.target/powerpc/swaps-p8-11.c: New test.
>       * gcc.target/powerpc/swaps-p8-12.c: New test.
>       * gcc.target/powerpc/swaps-p8-13.c: New test.
>       * gcc.target/powerpc/swaps-p8-15.c: New test.
>       * gcc.target/powerpc/swaps-p8-17.c: New test.
> 
> 
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c        (revision 221696)
> +++ gcc/config/rs6000/rs6000.c        (working copy)
> @@ -61,6 +61,7 @@
>  #include "tree-vectorizer.h"
>  #include "dumpfile.h"
>  #include "real.h"
> +#include "tree-pass.h"
>  #if TARGET_XCOFF
>  #include "xcoffout.h"  /* get declarations of xcoff_*_section_name */
>  #endif
> @@ -1153,6 +1154,7 @@ static bool rs6000_secondary_reload_move (enum rs6
>                                         enum machine_mode,
>                                         secondary_reload_info *,
>                                         bool);
> +static unsigned int rs6000_analyze_swaps (function *);
> 
>  /* Hash table stuff for keeping track of TOC entries.  */
> 
> @@ -4046,6 +4048,37 @@ rs6000_option_override_internal (bool global_init_
>    return ret;
>  }
> 
> +static bool
> +gate_analyze_swaps (void)
> +{
> +  return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
> +       && rs6000_optimize_swaps);
> +}
> +
> +static unsigned int
> +execute_analyze_swaps (void)
> +{
> +  return rs6000_analyze_swaps (cfun);
> +}
> +
> +struct rtl_opt_pass pass_analyze_swaps =
> +{
> +  RTL_PASS,
> +  "swaps", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  gate_analyze_swaps, /* has_gate */
> +  execute_analyze_swaps, /* has_execute */
> +  NULL, /* sub */
> +  NULL, /* next */
> +  0, /* static_pass_number */
> +  TV_NONE, /* tv_id */
> +  0, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_df_finish, /* todo_flags_finish */
> +};
> +
>  /* Implement TARGET_OPTION_OVERRIDE.  On the RS/6000 this is used to
>     define the target cpu type.  */
> 
> @@ -4053,6 +4086,13 @@ static void
>  rs6000_option_override (void)
>  {
>    (void) rs6000_option_override_internal (true);
> +
> +  /* Register machine-specific passes.  This needs to be done at start-up.
> +     It's convenient to do it here (like i386 does).  */
> +  static struct register_pass_info analyze_swaps_info
> +    = { &pass_analyze_swaps.pass, "cse1", 1, PASS_POS_INSERT_BEFORE };
> +
> +  register_pass (&analyze_swaps_info);
>  }
> 
>  
> @@ -33210,7 +33250,1148 @@ emit_fusion_gpr_load (rtx target, rtx mem)
> 
>    return "";
>  }
> +
> +/* Analyze vector computations and remove unnecessary doubleword
> +   swaps (xxswapdi instructions).  This pass is performed only
> +   for little-endian VSX code generation.
> 
> +   For this specific case, loads and stores of 4x32 and 2x64 vectors
> +   are inefficient.  These are implemented using the lxvd2x and
> +   stxvd2x instructions, which invert the order of doublewords in
> +   a vector register.  Thus the code generation inserts an xxswapdi
> +   after each such load, and prior to each such store.  (For spill
> +   code after register assignment, an additional xxswapdi is inserted
> +   following each store in order to return a hard register to its
> +   unpermuted value.)
> +
> +   The extra xxswapdi instructions reduce performance.  This can be
> +   particularly bad for vectorized code.  The purpose of this pass
> +   is to reduce the number of xxswapdi instructions required for
> +   correctness.
> +
> +   The primary insight is that much code that operates on vectors
> +   does not care about the relative order of elements in a register,
> +   so long as the correct memory order is preserved.  If we have
> +   a computation where all input values are provided by lxvd2x/xxswapdi
> +   sequences, all outputs are stored using xxswapdi/stxvd2x sequences,
> +   and all intermediate computations are pure SIMD (independent of
> +   element order), then all the xxswapdi's associated with the loads
> +   and stores may be removed.
> +
> +   This pass uses some of the infrastructure and logical ideas from
> +   the "web" pass in web.c.  We create maximal webs of computations
> +   fitting the description above using union-find.  Each such web is
> +   then optimized by removing its unnecessary xxswapdi instructions.
> +
> +   The pass is placed prior to global optimization so that we can
> +   perform the optimization in the safest and simplest way possible;
> +   that is, by replacing each xxswapdi insn with a register copy insn.
> +   Subsequent forward propagation will remove copies where possible.
> +
> +   There are some operations sensitive to element order for which we
> +   can still allow the operation, provided we modify those operations.
> +   These include CONST_VECTORs, for which we must swap the first and
> +   second halves of the constant vector; and SUBREGs, for which we
> +   must adjust the byte offset to account for the swapped doublewords.
> +   A remaining opportunity would be non-immediate-form splats, for
> +   which we should adjust the selected lane of the input.  We should
> +   also make code generation adjustments for sum-across operations,
> +   since this is a common vectorizer reduction.
> +
> +   Because we run prior to the first split, we can see loads and stores
> +   here that match *vsx_le_perm_{load,store}_<mode>.  These are vanilla
> +   vector loads and stores that have not yet been split into a permuting
> +   load/store and a swap.  (One way this can happen is with a builtin
> +   call to vec_vsx_{ld,st}.)  We can handle these as well, but rather
> +   than deleting a swap, we convert the load/store into a permuting
> +   load/store (which effectively removes the swap).  */
> +
> +/* Notes on Permutes
> +
> +   We do not currently handle computations that contain permutes.  There
> +   is a general transformation that can be performed correctly, but it
> +   may introduce more expensive code than it replaces.  To handle these
> +   would require a cost model to determine when to perform the optimization.
> +   This commentary records how this could be done if desired.
> +
> +   The most general permute is something like this (example for V16QI):
> +
> +   (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI))
> +                     (parallel [(const_int a0) (const_int a1)
> +                                 ...
> +                                (const_int a14) (const_int a15)]))
> +
> +   where a0,...,a15 are in [0,31] and select elements from op1 and op2
> +   to produce in the result.
> +
> +   Regardless of mode, we can convert the PARALLEL to a mask of 16
> +   byte-element selectors.  Let's call this M, with M[i] representing
> +   the ith byte-element selector value.  Then if we swap doublewords
> +   throughout the computation, we can get correct behavior by replacing
> +   M with M' as follows:
> +
> +            { M[i+8]+8 : i < 8, M[i+8] in [0,7] U [16,23]
> +    M'[i] = { M[i+8]-8 : i < 8, M[i+8] in [8,15] U [24,31]
> +            { M[i-8]+8 : i >= 8, M[i-8] in [0,7] U [16,23]
> +            { M[i-8]-8 : i >= 8, M[i-8] in [8,15] U [24,31]
> +
> +   This seems promising at first, since we are just replacing one mask
> +   with another.  But certain masks are preferable to others.  If M
> +   is a mask that matches a vmrghh pattern, for example, M' certainly
> +   will not.  Instead of a single vmrghh, we would generate a load of
> +   M' and a vperm.  So we would need to know how many xxswapd's we can
> +   remove as a result of this transformation to determine if it's
> +   profitable; and preferably the logic would need to be aware of all
> +   the special preferable masks.
> +
> +   Another form of permute is an UNSPEC_VPERM, in which the mask is
> +   already in a register.  In some cases, this mask may be a constant
> +   that we can discover with ud-chains, in which case the above
> +   transformation is ok.  However, the common usage here is for the
> +   mask to be produced by an UNSPEC_LVSL, in which case the mask 
> +   cannot be known at compile time.  In such a case we would have to
> +   generate several instructions to compute M' as above at run time,
> +   and a cost model is needed again.  */
> +
> +/* This is based on the union-find logic in web.c.  web_entry_base is
> +   defined in df.h.  */
> +class swap_web_entry : public web_entry_base
> +{
> + public:
> +  /* Pointer to the insn.  */
> +  rtx insn;
> +  /* Set if insn contains a mention of a vector register.  All other
> +     fields are undefined if this field is unset.  */
> +  unsigned int is_relevant : 1;
> +  /* Set if insn is a load.  */
> +  unsigned int is_load : 1;
> +  /* Set if insn is a store.  */
> +  unsigned int is_store : 1;
> +  /* Set if insn is a doubleword swap.  This can either be a register swap
> +     or a permuting load or store (test is_load and is_store for this).  */
> +  unsigned int is_swap : 1;
> +  /* Set if the insn has a live-in use of a parameter register.  */
> +  unsigned int is_live_in : 1;
> +  /* Set if the insn has a live-out def of a return register.  */
> +  unsigned int is_live_out : 1;
> +  /* Set if the insn contains a subreg reference of a vector register.  */
> +  unsigned int contains_subreg : 1;
> +  /* Set if the insn contains a 128-bit integer operand.  */
> +  unsigned int is_128_int : 1;
> +  /* Set if this is a call-insn.  */
> +  unsigned int is_call : 1;
> +  /* Set if this insn does not perform a vector operation for which
> +     element order matters, or if we know how to fix it up if it does.
> +     Undefined if is_swap is set.  */
> +  unsigned int is_swappable : 1;
> +  /* A nonzero value indicates what kind of special handling for this
> +     insn is required if doublewords are swapped.  Undefined if
> +     is_swappable is not set.  */
> +  unsigned int special_handling : 3;
> +  /* Set if the web represented by this entry cannot be optimized.  */
> +  unsigned int web_not_optimizable : 1;
> +  /* Set if this insn should be deleted.  */
> +  unsigned int will_delete : 1;
> +};
> +
> +enum special_handling_values {
> +  SH_NONE = 0,
> +  SH_CONST_VECTOR,
> +  SH_SUBREG,
> +  SH_NOSWAP_LD,
> +  SH_NOSWAP_ST,
> +  SH_EXTRACT,
> +  SH_SPLAT
> +};
> +
> +/* Union INSN with all insns containing definitions that reach USE.
> +   Detect whether USE is live-in to the current function.  */
> +static void
> +union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use)
> +{
> +  struct df_link *link = DF_REF_CHAIN (use);
> +
> +  if (!link)
> +    insn_entry[INSN_UID (insn)].is_live_in = 1;
> +
> +  while (link)
> +    {
> +      if (DF_REF_IS_ARTIFICIAL (link->ref))
> +     insn_entry[INSN_UID (insn)].is_live_in = 1;
> +
> +      if (DF_REF_INSN_INFO (link->ref))
> +     {
> +       rtx def_insn = DF_REF_INSN (link->ref);
> +       (void)unionfind_union (insn_entry + INSN_UID (insn),
> +                              insn_entry + INSN_UID (def_insn));
> +     }
> +
> +      link = link->next;
> +    }
> +}
> +
> +/* Union INSN with all insns containing uses reached from DEF.
> +   Detect whether DEF is live-out from the current function.  */
> +static void
> +union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def)
> +{
> +  struct df_link *link = DF_REF_CHAIN (def);
> +
> +  if (!link)
> +    insn_entry[INSN_UID (insn)].is_live_out = 1;
> +
> +  while (link)
> +    {
> +      /* This could be an eh use or some other artificial use;
> +      we treat these all the same (killing the optimization).  */
> +      if (DF_REF_IS_ARTIFICIAL (link->ref))
> +     insn_entry[INSN_UID (insn)].is_live_out = 1;
> +
> +      if (DF_REF_INSN_INFO (link->ref))
> +     {
> +       rtx use_insn = DF_REF_INSN (link->ref);
> +       (void)unionfind_union (insn_entry + INSN_UID (insn),
> +                              insn_entry + INSN_UID (use_insn));
> +     }
> +
> +      link = link->next;
> +    }
> +}
> +
> +/* Return 1 iff INSN is a load insn, including permuting loads that
> +   represent an lxvd2x instruction; else return 0.  */
> +static unsigned int
> +insn_is_load_p (rtx insn)
> +{
> +  rtx body = PATTERN (insn);
> +
> +  if (GET_CODE (body) == SET)
> +    {
> +      if (GET_CODE (SET_SRC (body)) == MEM)
> +     return 1;
> +
> +      if (GET_CODE (SET_SRC (body)) == VEC_SELECT
> +       && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM)
> +     return 1;
> +
> +      return 0;
> +    }
> +
> +  if (GET_CODE (body) != PARALLEL)
> +    return 0;
> +
> +  rtx set = XVECEXP (body, 0, 0);
> +
> +  if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM)
> +    return 1;
> +
> +  return 0;
> +}
> +
> +/* Return 1 iff INSN is a store insn, including permuting stores that
> +   represent an stxvd2x instruction; else return 0.  */
> +static unsigned int
> +insn_is_store_p (rtx insn)
> +{
> +  rtx body = PATTERN (insn);
> +  if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM)
> +    return 1;
> +  if (GET_CODE (body) != PARALLEL)
> +    return 0;
> +  rtx set = XVECEXP (body, 0, 0);
> +  if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM)
> +    return 1;
> +  return 0;
> +}
> +
> +/* Return 1 iff INSN swaps doublewords.  This may be a reg-reg swap,
> +   a permuting load, or a permuting store.  */
> +static unsigned int
> +insn_is_swap_p (rtx insn)
> +{
> +  rtx body = PATTERN (insn);
> +  if (GET_CODE (body) != SET)
> +    return 0;
> +  rtx rhs = SET_SRC (body);
> +  if (GET_CODE (rhs) != VEC_SELECT)
> +    return 0;
> +  rtx parallel = XEXP (rhs, 1);
> +  if (GET_CODE (parallel) != PARALLEL)
> +    return 0;
> +  unsigned int len = XVECLEN (parallel, 0);
> +  if (len != 2 && len != 4 && len != 8 && len != 16)
> +    return 0;
> +  for (unsigned int i = 0; i < len / 2; ++i)
> +    {
> +      rtx op = XVECEXP (parallel, 0, i);
> +      if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i)
> +     return 0;
> +    }
> +  for (unsigned int i = len / 2; i < len; ++i)
> +    {
> +      rtx op = XVECEXP (parallel, 0, i);
> +      if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2)
> +     return 0;
> +    }
> +  return 1;
> +}
> +
> +/* Return 1 iff OP is an operand that will not be affected by having
> +   vector doublewords swapped in memory.  */
> +static unsigned int
> +rtx_is_swappable_p (rtx op, unsigned int *special)
> +{
> +  enum rtx_code code = GET_CODE (op);
> +  int i, j;
> +  rtx parallel;
> +
> +  switch (code)
> +    {
> +    case LABEL_REF:
> +    case SYMBOL_REF:
> +    case CLOBBER:
> +    case REG:
> +      return 1;
> +
> +    case VEC_CONCAT:
> +    case ASM_INPUT:
> +    case ASM_OPERANDS:
> +      return 0;
> +
> +    case CONST_VECTOR:
> +      {
> +     *special = SH_CONST_VECTOR;
> +     return 1;
> +      }
> +
> +    case VEC_DUPLICATE:
> +      /* Opportunity: If XEXP (op, 0) has the same mode as the result,
> +      and XEXP (op, 1) is a PARALLEL with a single QImode const int,
> +      it represents a vector splat for which we can do special
> +      handling.  */
> +      if (GET_CODE (XEXP (op, 0)) == CONST_INT)
> +     return 1;
> +      else if (GET_CODE (XEXP (op, 0)) == REG
> +            && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
> +     /* This catches V2DF and V2DI splat, at a minimum.  */
> +     return 1;
> +      else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
> +     /* If the duplicated item is from a select, defer to the select
> +        processing to see if we can change the lane for the splat.  */
> +     return rtx_is_swappable_p (XEXP (op, 0), special);
> +      else
> +     return 0;
> +
> +    case VEC_SELECT:
> +      /* A vec_extract operation is ok if we change the lane.  */
> +      if (GET_CODE (XEXP (op, 0)) == REG
> +       && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
> +       && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
> +       && XVECLEN (parallel, 0) == 1
> +       && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
> +     {
> +       *special = SH_EXTRACT;
> +       return 1;
> +     }
> +      else
> +     return 0;
> +
> +    case UNSPEC:
> +      {
> +     /* Various operations are unsafe for this optimization, at least
> +        without significant additional work.  Permutes are obviously
> +        problematic, as both the permute control vector and the ordering
> +        of the target values are invalidated by doubleword swapping.
> +        Vector pack and unpack modify the number of vector lanes.
> +        Merge-high/low will not operate correctly on swapped operands.
> +        Vector shifts across element boundaries are clearly uncool,
> +        as are vector select and concatenate operations.  Vector
> +        sum-across instructions define one operand with a specific
> +        order-dependent element, so additional fixup code would be
> +        needed to make those work.  Vector set and non-immediate-form
> +        vector splat are element-order sensitive.  A few of these
> +        cases might be workable with special handling if required.  */
> +     int val = XINT (op, 1);
> +     switch (val)
> +       {
> +       default:
> +         break;
> +       case UNSPEC_VMRGH_DIRECT:
> +       case UNSPEC_VMRGL_DIRECT:
> +       case UNSPEC_VPACK_SIGN_SIGN_SAT:
> +       case UNSPEC_VPACK_SIGN_UNS_SAT:
> +       case UNSPEC_VPACK_UNS_UNS_MOD:
> +       case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
> +       case UNSPEC_VPACK_UNS_UNS_SAT:
> +       case UNSPEC_VPERM:
> +       case UNSPEC_VPERM_UNS:
> +       case UNSPEC_VPERMHI:
> +       case UNSPEC_VPERMSI:
> +       case UNSPEC_VPKPX:
> +       case UNSPEC_VSLDOI:
> +       case UNSPEC_VSLO:
> +       case UNSPEC_VSRO:
> +       case UNSPEC_VSUM2SWS:
> +       case UNSPEC_VSUM4S:
> +       case UNSPEC_VSUM4UBS:
> +       case UNSPEC_VSUMSWS:
> +       case UNSPEC_VSUMSWS_DIRECT:
> +       case UNSPEC_VSX_CONCAT:
> +       case UNSPEC_VSX_SET:
> +       case UNSPEC_VSX_SLDWI:
> +       case UNSPEC_VUNPACK_HI_SIGN:
> +       case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
> +       case UNSPEC_VUNPACK_LO_SIGN:
> +       case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
> +       case UNSPEC_VUPKHPX:
> +       case UNSPEC_VUPKHS_V4SF:
> +       case UNSPEC_VUPKHU_V4SF:
> +       case UNSPEC_VUPKLPX:
> +       case UNSPEC_VUPKLS_V4SF:
> +       case UNSPEC_VUPKLU_V4SF:
> +       /* The following could be handled as an idiom with XXSPLTW.
> +          These place a scalar in BE element zero, but the XXSPLTW
> +          will currently expect it in BE element 2 in a swapped
> +          region.  When one of these feeds an XXSPLTW with no other
> +          defs/uses either way, we can avoid the lane change for
> +          XXSPLTW and things will be correct.  TBD.  */
> +       case UNSPEC_VSX_CVDPSPN:
> +       case UNSPEC_VSX_CVSPDP:
> +       case UNSPEC_VSX_CVSPDPN:
> +         return 0;
> +       case UNSPEC_VSPLT_DIRECT:
> +         *special = SH_SPLAT;
> +         return 1;
> +       }
> +      }
> +
> +    default:
> +      break;
> +    }
> +
> +  const char *fmt = GET_RTX_FORMAT (code);
> +  int ok = 1;
> +
> +  for (i = 0; i < GET_RTX_LENGTH (code); ++i)
> +    if (fmt[i] == 'e' || fmt[i] == 'u')
> +      {
> +     unsigned int special_op = SH_NONE;
> +     ok &= rtx_is_swappable_p (XEXP (op, i), &special_op);
> +     /* Ensure we never have two kinds of special handling
> +        for the same insn.  */
> +     if (*special != SH_NONE && special_op != SH_NONE
> +         && *special != special_op)
> +       return 0;
> +     *special = special_op;
> +      }
> +    else if (fmt[i] == 'E')
> +      for (j = 0; j < XVECLEN (op, i); ++j)
> +     {
> +       unsigned int special_op = SH_NONE;
> +       ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op);
> +       /* Ensure we never have two kinds of special handling
> +          for the same insn.  */
> +       if (*special != SH_NONE && special_op != SH_NONE
> +           && *special != special_op)
> +         return 0;
> +       *special = special_op;
> +     }
> +
> +  return ok;
> +}
> +
> +/* Return 1 iff INSN is an operand that will not be affected by
> +   having vector doublewords swapped in memory (in which case
> +   *SPECIAL is unchanged), or that can be modified to be correct
> +   if vector doublewords are swapped in memory (in which case
> +   *SPECIAL is changed to a value indicating how).  */
> +static unsigned int
> +insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn,
> +                  unsigned int *special)
> +{
> +  /* Calls are always bad.  */
> +  if (GET_CODE (insn) == CALL_INSN)
> +    return 0;
> +
> +  /* Loads and stores seen here are not permuting, but we can still
> +     fix them up by converting them to permuting ones.  Exceptions:
> +     UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL
> +     body instead of a SET; and UNSPEC_STVE, which has an UNSPEC
> +     for the SET source.  */
> +  rtx body = PATTERN (insn);
> +  int i = INSN_UID (insn);
> +
> +  if (insn_entry[i].is_load)
> +    {
> +      if (GET_CODE (body) == SET)
> +     {
> +       *special = SH_NOSWAP_LD;
> +       return 1;
> +     }
> +      else
> +     return 0;
> +    }
> +
> +  if (insn_entry[i].is_store)
> +    {
> +      if (GET_CODE (body) == SET && GET_CODE (SET_SRC (body)) != UNSPEC)
> +     {
> +       *special = SH_NOSWAP_ST;
> +       return 1;
> +     }
> +      else
> +     return 0;
> +    }
> +
> +  /* Otherwise check the operands for vector lane violations.  */
> +  return rtx_is_swappable_p (body, special);
> +}
> +
> +enum chain_purpose { FOR_LOADS, FOR_STORES };
> +
> +/* Return true if the UD or DU chain headed by LINK is non-empty,
> +   and every entry on the chain references an insn that is a
> +   register swap.  Furthermore, if PURPOSE is FOR_LOADS, each such
> +   register swap must have only permuting loads as reaching defs.
> +   If PURPOSE is FOR_STORES, each such register swap must have only
> +   register swaps or permuting stores as reached uses.  */
> +static bool
> +chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link,
> +                        enum chain_purpose purpose)
> +{
> +  if (!link)
> +    return false;
> +
> +  for (; link; link = link->next)
> +    {
> +      if (!VECTOR_MODE_P (GET_MODE (DF_REF_REG (link->ref))))
> +     continue;
> +
> +      if (DF_REF_IS_ARTIFICIAL (link->ref))
> +     return false;
> +
> +      rtx reached_insn = DF_REF_INSN (link->ref);
> +      unsigned uid = INSN_UID (reached_insn);
> +
> +      if (!insn_entry[uid].is_swap || insn_entry[uid].is_load
> +       || insn_entry[uid].is_store)
> +     return false;
> +
> +      if (purpose == FOR_LOADS)
> +     {
> +       df_ref *use_rec;
> +       for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
> +         {
> +           df_ref use = *use_rec;
> +           struct df_link *swap_link = DF_REF_CHAIN (use);
> +
> +           while (swap_link)
> +             {
> +               if (DF_REF_IS_ARTIFICIAL (link->ref))
> +                 return false;
> +
> +               rtx swap_def_insn = DF_REF_INSN (swap_link->ref);
> +               unsigned uid2 = INSN_UID (swap_def_insn);
> +
> +               /* Only permuting loads are allowed.  */
> +               if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load)
> +                 return false;
> +
> +               swap_link = swap_link->next;
> +             }
> +         }
> +     }
> +      else if (purpose == FOR_STORES)
> +     {
> +       df_ref *def_rec;
> +       for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
> +         {
> +           df_ref def = *def_rec;
> +           struct df_link *swap_link = DF_REF_CHAIN (def);
> +
> +           while (swap_link)
> +             {
> +               if (DF_REF_IS_ARTIFICIAL (link->ref))
> +                 return false;
> +
> +               rtx swap_use_insn = DF_REF_INSN (swap_link->ref);
> +               unsigned uid2 = INSN_UID (swap_use_insn);
> +
> +               /* Permuting stores or register swaps are allowed.  */
> +               if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load)
> +                 return false;
> +
> +               swap_link = swap_link->next;
> +             }
> +         }
> +     }
> +    }
> +
> +  return true;
> +}
> +
> +/* Mark the xxswapdi instructions associated with permuting loads and
> +   stores for removal.  Note that we only flag them for deletion here,
> +   as there is a possibility of a swap being reached from multiple
> +   loads, etc.  */
> +static void
> +mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i)
> +{
> +  rtx insn = insn_entry[i].insn;
> +  unsigned uid = INSN_UID (insn);
> +
> +  if (insn_entry[i].is_load)
> +    {
> +      df_ref *def_rec;
> +      for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
> +     {
> +       df_ref def = *def_rec;
> +       struct df_link *link = DF_REF_CHAIN (def);
> +
> +       /* We know by now that these are swaps, so we can delete
> +          them confidently.  */
> +       while (link)
> +         {
> +           rtx use_insn = DF_REF_INSN (link->ref);
> +           insn_entry[INSN_UID (use_insn)].will_delete = 1;
> +           link = link->next;
> +         }
> +     }
> +    }
> +  else if (insn_entry[i].is_store)
> +    {
> +      df_ref *use_rec;
> +      for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
> +     {
> +       df_ref use = *use_rec;
> +       /* Ignore uses for addressability.  */
> +       machine_mode mode = GET_MODE (DF_REF_REG (use));
> +       if (!VECTOR_MODE_P (mode))
> +         continue;
> +
> +       struct df_link *link = DF_REF_CHAIN (use);
> +
> +       /* We know by now that these are swaps, so we can delete
> +          them confidently.  */
> +       while (link)
> +         {
> +           rtx def_insn = DF_REF_INSN (link->ref);
> +           insn_entry[INSN_UID (def_insn)].will_delete = 1;
> +           link = link->next;
> +         }
> +     }
> +    }
> +}
> +
> +/* OP is either a CONST_VECTOR or an expression containing one.
> +   Swap the first half of the vector with the second in the first
> +   case.  Recurse to find it in the second.  */
> +static void
> +swap_const_vector_halves (rtx op)
> +{
> +  int i;
> +  enum rtx_code code = GET_CODE (op);
> +  if (GET_CODE (op) == CONST_VECTOR)
> +    {
> +      int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2;
> +      for (i = 0; i < half_units; ++i)
> +     {
> +       rtx temp = CONST_VECTOR_ELT (op, i);
> +       CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units);
> +       CONST_VECTOR_ELT (op, i + half_units) = temp;
> +     }
> +    }
> +  else
> +    {
> +      int j;
> +      const char *fmt = GET_RTX_FORMAT (code);
> +      for (i = 0; i < GET_RTX_LENGTH (code); ++i)
> +     if (fmt[i] == 'e' || fmt[i] == 'u')
> +       swap_const_vector_halves (XEXP (op, i));
> +     else if (fmt[i] == 'E')
> +       for (j = 0; j < XVECLEN (op, i); ++j)
> +         swap_const_vector_halves (XVECEXP (op, i, j));
> +    }
> +}
> +
> +/* Find all subregs of a vector expression that perform a narrowing,
> +   and adjust the subreg index to account for doubleword swapping.  */
> +static void
> +adjust_subreg_index (rtx op)
> +{
> +  enum rtx_code code = GET_CODE (op);
> +  if (code == SUBREG
> +      && (GET_MODE_SIZE (GET_MODE (op))
> +       < GET_MODE_SIZE (GET_MODE (XEXP (op, 0)))))
> +    {
> +      unsigned int index = SUBREG_BYTE (op);
> +      if (index < 8)
> +     index += 8;
> +      else
> +     index -= 8;
> +      SUBREG_BYTE (op) = index;
> +    }
> +
> +  const char *fmt = GET_RTX_FORMAT (code);
> +  int i,j;
> +  for (i = 0; i < GET_RTX_LENGTH (code); ++i)
> +    if (fmt[i] == 'e' || fmt[i] == 'u')
> +      adjust_subreg_index (XEXP (op, i));
> +    else if (fmt[i] == 'E')
> +      for (j = 0; j < XVECLEN (op, i); ++j)
> +     adjust_subreg_index (XVECEXP (op, i, j));
> +}
> +
> +/* Convert the non-permuting load INSN to a permuting one.  */
> +static void
> +permute_load (rtx insn)
> +{
> +  rtx body = PATTERN (insn);
> +  rtx mem_op = SET_SRC (body);
> +  rtx tgt_reg = SET_DEST (body);
> +  machine_mode mode = GET_MODE (tgt_reg);
> +  int n_elts = GET_MODE_NUNITS (mode);
> +  int half_elts = n_elts / 2;
> +  rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
> +  int i, j;
> +  for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
> +    XVECEXP (par, 0, i) = GEN_INT (j);
> +  for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
> +    XVECEXP (par, 0, i) = GEN_INT (j);
> +  rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par);
> +  SET_SRC (body) = sel;
> +  INSN_CODE (insn) = -1; /* Force re-recognition.  */
> +  df_insn_rescan (insn);
> +
> +  if (dump_file)
> +    fprintf (dump_file, "Replacing load %d with permuted load\n",
> +          INSN_UID (insn));
> +}
> +
> +/* Convert the non-permuting store INSN to a permuting one.  */
> +static void
> +permute_store (rtx insn)
> +{
> +  rtx body = PATTERN (insn);
> +  rtx src_reg = SET_SRC (body);
> +  machine_mode mode = GET_MODE (src_reg);
> +  int n_elts = GET_MODE_NUNITS (mode);
> +  int half_elts = n_elts / 2;
> +  rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
> +  int i, j;
> +  for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
> +    XVECEXP (par, 0, i) = GEN_INT (j);
> +  for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
> +    XVECEXP (par, 0, i) = GEN_INT (j);
> +  rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par);
> +  SET_SRC (body) = sel;
> +  INSN_CODE (insn) = -1; /* Force re-recognition.  */
> +  df_insn_rescan (insn);
> +
> +  if (dump_file)
> +    fprintf (dump_file, "Replacing store %d with permuted store\n",
> +          INSN_UID (insn));
> +}
> +
> +/* Given INSN that contains a vector extract operation, adjust the index
> +   of the extracted lane to account for the doubleword swap.  */
> +static void
> +adjust_extract (rtx insn)
> +{
> +  rtx src = SET_SRC (PATTERN (insn));
> +  /* The vec_select may be wrapped in a vec_duplicate for a splat, so
> +     account for that.  */
> +  rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
> +  rtx par = XEXP (sel, 1);
> +  int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
> +  int lane = INTVAL (XVECEXP (par, 0, 0));
> +  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
> +  XVECEXP (par, 0, 0) = GEN_INT (lane);
> +  INSN_CODE (insn) = -1; /* Force re-recognition.  */
> +  df_insn_rescan (insn);
> +
> +  if (dump_file)
> +    fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
> +}
> +
> +/* Given INSN that contains a vector direct-splat operation, adjust the index
> +   of the source lane to account for the doubleword swap.  */
> +static void
> +adjust_splat (rtx insn)
> +{
> +  rtx body = PATTERN (insn);
> +  rtx unspec = XEXP (body, 1);
> +  int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
> +  int lane = INTVAL (XVECEXP (unspec, 0, 1));
> +  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
> +  XVECEXP (unspec, 0, 1) = GEN_INT (lane);
> +  INSN_CODE (insn) = -1; /* Force re-recognition.  */
> +  df_insn_rescan (insn);
> +
> +  if (dump_file)
> +    fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
> +}
> +
> +/* The insn described by INSN_ENTRY[I] can be swapped, but only
> +   with special handling.  Take care of that here.  */
> +static void
> +handle_special_swappables (swap_web_entry *insn_entry, unsigned i)
> +{
> +  rtx insn = insn_entry[i].insn;
> +  rtx body = PATTERN (insn);
> +
> +  switch (insn_entry[i].special_handling)
> +    {
> +    default:
> +      gcc_unreachable ();
> +    case SH_CONST_VECTOR:
> +      {
> +     /* A CONST_VECTOR will only show up somewhere in the RHS of a SET.  */
> +     gcc_assert (GET_CODE (body) == SET);
> +     rtx rhs = SET_SRC (body);
> +     swap_const_vector_halves (rhs);
> +     if (dump_file)
> +       fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
> +     break;
> +      }
> +    case SH_SUBREG:
> +      /* A subreg of the same size is already safe.  For subregs that
> +      select a smaller portion of a reg, adjust the index for
> +      swapped doublewords.  */
> +      adjust_subreg_index (body);
> +      if (dump_file)
> +     fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
> +      break;
> +    case SH_NOSWAP_LD:
> +      /* Convert a non-permuting load to a permuting one.  */
> +      permute_load (insn);
> +      break;
> +    case SH_NOSWAP_ST:
> +      /* Convert a non-permuting store to a permuting one.  */
> +      permute_store (insn);
> +      break;
> +    case SH_EXTRACT:
> +      /* Change the lane on an extract operation.  */
> +      adjust_extract (insn);
> +      break;
> +    case SH_SPLAT:
> +      /* Change the lane on a direct-splat operation.  */
> +      adjust_splat (insn);
> +      break;
> +    }
> +}
> +
> +/* Find the insn from the Ith table entry, which is known to be a
> +   register swap Y = SWAP(X).  Replace it with a copy Y = X.  */
> +static void
> +replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i)
> +{
> +  rtx insn = insn_entry[i].insn;
> +  rtx body = PATTERN (insn);
> +  rtx src_reg = XEXP (SET_SRC (body), 0);
> +  rtx copy = gen_rtx_SET (VOIDmode, SET_DEST (body), src_reg);
> +  rtx new_insn = emit_insn_before (copy, insn);
> +  set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
> +  df_insn_rescan (new_insn);
> +
> +  if (dump_file)
> +    {
> +      unsigned int new_uid = INSN_UID (new_insn);
> +      fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid);
> +    }
> +
> +  df_insn_delete (BLOCK_FOR_INSN (insn), INSN_UID (insn));
> +  remove_insn (insn);
> +  INSN_DELETED_P (insn) = 1;
> +}
> +
> +/* Dump the swap table to DUMP_FILE.  */
> +static void
> +dump_swap_insn_table (swap_web_entry *insn_entry)
> +{
> +  int e = get_max_uid ();
> +  fprintf (dump_file, "\nRelevant insns with their flag settings\n\n");
> +
> +  for (int i = 0; i < e; ++i)
> +    if (insn_entry[i].is_relevant)
> +      {
> +     swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred ();
> +     fprintf (dump_file, "%6d %6d  ", i,
> +              pred_entry && pred_entry->insn
> +              ? INSN_UID (pred_entry->insn) : 0);
> +     if (insn_entry[i].is_load)
> +       fputs ("load ", dump_file);
> +     if (insn_entry[i].is_store)
> +       fputs ("store ", dump_file);
> +     if (insn_entry[i].is_swap)
> +       fputs ("swap ", dump_file);
> +     if (insn_entry[i].is_live_in)
> +       fputs ("live-in ", dump_file);
> +     if (insn_entry[i].is_live_out)
> +       fputs ("live-out ", dump_file);
> +     if (insn_entry[i].contains_subreg)
> +       fputs ("subreg ", dump_file);
> +     if (insn_entry[i].is_128_int)
> +       fputs ("int128 ", dump_file);
> +     if (insn_entry[i].is_call)
> +       fputs ("call ", dump_file);
> +     if (insn_entry[i].is_swappable)
> +       {
> +         fputs ("swappable ", dump_file);
> +         if (insn_entry[i].special_handling == SH_CONST_VECTOR)
> +           fputs ("special:constvec ", dump_file);
> +         else if (insn_entry[i].special_handling == SH_SUBREG)
> +           fputs ("special:subreg ", dump_file);
> +         else if (insn_entry[i].special_handling == SH_NOSWAP_LD)
> +           fputs ("special:load ", dump_file);
> +         else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
> +           fputs ("special:store ", dump_file);
> +         else if (insn_entry[i].special_handling == SH_EXTRACT)
> +           fputs ("special:extract ", dump_file);
> +         else if (insn_entry[i].special_handling == SH_SPLAT)
> +           fputs ("special:splat ", dump_file);
> +       }
> +     if (insn_entry[i].web_not_optimizable)
> +       fputs ("unoptimizable ", dump_file);
> +     if (insn_entry[i].will_delete)
> +       fputs ("delete ", dump_file);
> +     fputs ("\n", dump_file);
> +      }
> +  fputs ("\n", dump_file);
> +}
> +
> +/* Main entry point for this pass.  */
> +unsigned int
> +rs6000_analyze_swaps (function *fun)
> +{
> +  swap_web_entry *insn_entry;
> +  basic_block bb;
> +  rtx insn;
> +
> +  /* Dataflow analysis for use-def chains.  */
> +  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
> +  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
> +  df_analyze ();
> +  df_set_flags (DF_DEFER_INSN_RESCAN);
> +
> +  /* Allocate structure to represent webs of insns.  */
> +  insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
> +
> +  /* Walk the insns to gather basic data.  */
> +  FOR_ALL_BB_FN (bb, fun)
> +    FOR_BB_INSNS (bb, insn)
> +    {
> +      unsigned int uid = INSN_UID (insn);
> +      if (NONDEBUG_INSN_P (insn))
> +     {
> +       insn_entry[uid].insn = insn;
> +
> +       if (GET_CODE (insn) == CALL_INSN)
> +         insn_entry[uid].is_call = 1;
> +
> +       /* Walk the uses and defs to see if we mention vector regs.
> +          Record any constraints on optimization of such mentions.  */
> +       df_ref *use_rec;
> +       for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
> +         {
> +           df_ref mention = *use_rec;
> +           /* We use DF_REF_REAL_REG here to get inside any subregs.  */
> +           machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
> +
> +           /* If a use gets its value from a call insn, it will be
> +              a hard register and will look like (reg:V4SI 3 3).
> +              The df analysis creates two mentions for GPR3 and GPR4,
> +              both DImode.  We must recognize this and treat it as a
> +              vector mention to ensure the call is unioned with this
> +              use.  */
> +           if (mode == DImode && DF_REF_INSN_INFO (mention))
> +             {
> +               rtx feeder = DF_REF_INSN (mention);
> +               /* FIXME:  It is pretty hard to get from the df mention
> +                  to the mode of the use in the insn.  We arbitrarily
> +                  pick a vector mode here, even though the use might
> +                  be a real DImode.  We can be too conservative
> +                  (create a web larger than necessary) because of
> +                  this, so consider eventually fixing this.  */
> +               if (GET_CODE (feeder) == CALL_INSN)
> +                 mode = V4SImode;
> +             }
> +
> +           if (VECTOR_MODE_P (mode) || mode == TImode)
> +             {
> +               insn_entry[uid].is_relevant = 1;
> +               if (mode == TImode || mode == V1TImode)
> +                 insn_entry[uid].is_128_int = 1;
> +               if (DF_REF_INSN_INFO (mention))
> +                 insn_entry[uid].contains_subreg
> +                   = !rtx_equal_p (DF_REF_REG (mention),
> +                                   DF_REF_REAL_REG (mention));
> +               union_defs (insn_entry, insn, mention);
> +             }
> +         }
> +       df_ref *def_rec;
> +       for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
> +         {
> +           df_ref mention = *def_rec;
> +           /* We use DF_REF_REAL_REG here to get inside any subregs.  */
> +           machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
> +
> +           /* If we're loading up a hard vector register for a call,
> +              it looks like (set (reg:V4SI 9 9) (...)).  The df
> +              analysis creates two mentions for GPR9 and GPR10, both
> +              DImode.  So relying on the mode from the mentions
> +              isn't sufficient to ensure we union the call into the
> +              web with the parameter setup code.  */
> +           if (mode == DImode && GET_CODE (insn) == SET
> +               && VECTOR_MODE_P (GET_MODE (SET_DEST (insn))))
> +             mode = GET_MODE (SET_DEST (insn));
> +
> +           if (VECTOR_MODE_P (mode) || mode == TImode)
> +             {
> +               insn_entry[uid].is_relevant = 1;
> +               if (mode == TImode || mode == V1TImode)
> +                 insn_entry[uid].is_128_int = 1;
> +               if (DF_REF_INSN_INFO (mention))
> +                 insn_entry[uid].contains_subreg
> +                   = !rtx_equal_p (DF_REF_REG (mention),
> +                                   DF_REF_REAL_REG (mention));
> +               /* REG_FUNCTION_VALUE_P is not valid for subregs. */
> +               else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention)))
> +                 insn_entry[uid].is_live_out = 1;
> +               union_uses (insn_entry, insn, mention);
> +             }
> +         }
> +
> +       if (insn_entry[uid].is_relevant)
> +         {
> +           /* Determine if this is a load or store.  */
> +           insn_entry[uid].is_load = insn_is_load_p (insn);
> +           insn_entry[uid].is_store = insn_is_store_p (insn);
> +
> +           /* Determine if this is a doubleword swap.  If not,
> +              determine whether it can legally be swapped.  */
> +           if (insn_is_swap_p (insn))
> +             insn_entry[uid].is_swap = 1;
> +           else
> +             {
> +               unsigned int special = SH_NONE;
> +               insn_entry[uid].is_swappable
> +                 = insn_is_swappable_p (insn_entry, insn, &special);
> +               if (special != SH_NONE && insn_entry[uid].contains_subreg)
> +                 insn_entry[uid].is_swappable = 0;
> +               else if (special != SH_NONE)
> +                 insn_entry[uid].special_handling = special;
> +               else if (insn_entry[uid].contains_subreg)
> +                 insn_entry[uid].special_handling = SH_SUBREG;
> +             }
> +         }
> +     }
> +    }
> +
> +  if (dump_file)
> +    {
> +      fprintf (dump_file, "\nSwap insn entry table when first built\n");
> +      dump_swap_insn_table (insn_entry);
> +    }
> +
> +  /* Record unoptimizable webs.  */
> +  unsigned e = get_max_uid (), i;
> +  for (i = 0; i < e; ++i)
> +    {
> +      if (!insn_entry[i].is_relevant)
> +     continue;
> +
> +      swap_web_entry *root
> +     = (swap_web_entry*)(&insn_entry[i])->unionfind_root ();
> +      unsigned uid = INSN_UID (insn_entry[i].insn);
> +
> +      if (insn_entry[i].is_live_in || insn_entry[i].is_live_out
> +       || (insn_entry[i].contains_subreg
> +           && insn_entry[i].special_handling != SH_SUBREG)
> +       || insn_entry[i].is_128_int || insn_entry[i].is_call
> +       || !(insn_entry[i].is_swappable || insn_entry[i].is_swap))
> +     root->web_not_optimizable = 1;
> +
> +      /* If we have loads or stores that aren't permuting then the
> +      optimization isn't appropriate.  */
> +      else if ((insn_entry[i].is_load || insn_entry[i].is_store)
> +       && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
> +     root->web_not_optimizable = 1;
> +
> +      /* If we have permuting loads or stores that are not accompanied
> +      by a register swap, the optimization isn't appropriate.  */
> +      else if (insn_entry[i].is_load && insn_entry[i].is_swap)
> +     {
> +       df_ref *def_rec;
> +
> +       for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
> +         {
> +           df_ref def = *def_rec;
> +           struct df_link *link = DF_REF_CHAIN (def);
> +
> +           if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS))
> +             {
> +               root->web_not_optimizable = 1;
> +               break;
> +             }
> +         }
> +     }
> +      else if (insn_entry[i].is_store && insn_entry[i].is_swap)
> +     {
> +       df_ref *use_rec;
> +
> +       for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
> +         {
> +           df_ref use = *use_rec;
> +           struct df_link *link = DF_REF_CHAIN (use);
> +
> +           if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES))
> +             {
> +               root->web_not_optimizable = 1;
> +               break;
> +             }
> +         }
> +     }
> +    }
> +
> +  if (dump_file)
> +    {
> +      fprintf (dump_file, "\nSwap insn entry table after web analysis\n");
> +      dump_swap_insn_table (insn_entry);
> +    }
> +
> +  /* For each load and store in an optimizable web (which implies
> +     the loads and stores are permuting), find the associated
> +     register swaps and mark them for removal.  Due to various
> +     optimizations we may mark the same swap more than once.  Also
> +     perform special handling for swappable insns that require it.  */
> +  for (i = 0; i < e; ++i)
> +    if ((insn_entry[i].is_load || insn_entry[i].is_store)
> +     && insn_entry[i].is_swap)
> +      {
> +     swap_web_entry* root_entry
> +       = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
> +     if (!root_entry->web_not_optimizable)
> +       mark_swaps_for_removal (insn_entry, i);
> +      }
> +    else if (insn_entry[i].is_swappable && insn_entry[i].special_handling)
> +      {
> +     swap_web_entry* root_entry
> +       = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
> +     if (!root_entry->web_not_optimizable)
> +       handle_special_swappables (insn_entry, i);
> +      }
> +
> +  /* Now delete the swaps marked for removal.  */
> +  for (i = 0; i < e; ++i)
> +    if (insn_entry[i].will_delete)
> +      replace_swap_with_copy (insn_entry, i);
> +
> +  /* Clean up.  */
> +  free (insn_entry);
> +  return 0;
> +}
> +
>  
>  struct gcc_target targetm = TARGET_INITIALIZER;
> 
> Index: gcc/config/rs6000/rs6000.opt
> ===================================================================
> --- gcc/config/rs6000/rs6000.opt      (revision 221696)
> +++ gcc/config/rs6000/rs6000.opt      (working copy)
> @@ -585,3 +585,7 @@ Allow double variables in upper registers with -mc
>  mupper-regs-sf
>  Target Undocumented Mask(UPPER_REGS_SF) Var(rs6000_isa_flags)
>  Allow float variables in upper registers with -mcpu=power8 or -mp8-vector
> +
> +moptimize-swaps
> +Target Undocumented Var(rs6000_optimize_swaps) Init(1) Save
> +Analyze and remove doubleword swaps from VSX computations.
> Index: gcc/df.h
> ===================================================================
> --- gcc/df.h  (revision 221696)
> +++ gcc/df.h  (working copy)
> @@ -1132,20 +1132,22 @@ df_get_artificial_uses (unsigned int bb_index)
> 
>  /* web */
> 
> -/* This entry is allocated for each reference in the insn stream.  */
> -struct web_entry
> +class web_entry_base
>  {
> -  /* Pointer to the parent in the union/find tree.  */
> -  struct web_entry *pred;
> -  /* Newly assigned register to the entry.  Set only for roots.  */
> -  rtx reg;
> -  void* extra_info;
> + private:
> +  /* Reference to the parent in the union/find tree.  */
> +  web_entry_base *pred_pvt;
> +
> + public:
> +  /* Accessors.  */
> +  web_entry_base *pred () { return pred_pvt; }
> +  void set_pred (web_entry_base *p) { pred_pvt = p; }
> +
> +  /* Find representative in union-find tree.  */
> +  web_entry_base *unionfind_root ();
> +
> +  /* Union with another set, returning TRUE if they are already unioned.  */
> +  friend bool unionfind_union (web_entry_base *first, web_entry_base *second);
>  };
> 
> -extern struct web_entry *unionfind_root (struct web_entry *);
> -extern bool unionfind_union (struct web_entry *, struct web_entry *);
> -extern void union_defs (df_ref, struct web_entry *,
> -                     unsigned int *used, struct web_entry *,
> -                     bool (*fun) (struct web_entry *, struct web_entry *));
> -
>  #endif /* GCC_DF_H */
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c     (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c     (working copy)
> @@ -0,0 +1,35 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +
> +void abort();
> +
> +#define N 16
> +
> +signed char ca[N] __attribute__((aligned(16)));
> +signed char cb[] __attribute__((aligned(16)))
> +  = {8, 7, 6, 5, 4, 3, 2,  1,  0, -1, -2, -3, -4, -5, -6, -7};
> +signed char cc[] __attribute__((aligned(16)))
> +  = {1, 1, 2, 2, 3, 3, 2,  2,  1,  1,  0,  0, -1, -1, -2, -2};
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  for (i = 0; i < N; i++) {
> +    ca[i] = cb[i] - cc[i];
> +  }
> +}
> +
> +int main ()
> +{
> +  signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5};
> +  int i;
> +  foo ();
> +  for (i = 0; i < N; ++i)
> +    if (ca[i] != cd[i])
> +      abort ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c    (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c    (working copy)
> @@ -0,0 +1,42 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  for (i = 0; i < N; i++) {
> +    ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3;
> +  }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> +  int i;
> +  for (i = 0; i < N; ++i) {
> +    cb[i] = 3 * i - 2048;
> +    cc[i] = -5 * i + 93;
> +    cd[i] = i % 2 ? 1 : -1;
> +  }
> +}
> +
> +int main ()
> +{
> +  int i;
> +  init ();
> +  foo ();
> +  for (i = 0; i < N; ++i)
> +    if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3)
> +      abort ();
> +    else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3)
> +      abort ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c    (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c    (working copy)
> @@ -0,0 +1,53 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +#include <altivec.h>
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +int hey;
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  vector int va, vb, vc, vd, tmp;
> +  vector unsigned int threes = vec_splat_u32(3);
> +  for (i = 0; i < N; i+=4) {
> +    vb = vec_vsx_ld (0, &cb[i]);
> +    vc = vec_vsx_ld (0, &cc[i]);
> +    vd = vec_vsx_ld (0, &cd[i]);
> +    tmp = vec_add (vb, vc);
> +    tmp = vec_sub (tmp, vd);
> +    tmp = vec_sra (tmp, threes);
> +    hey = tmp[3];
> +    vec_vsx_st (tmp, 0, &ca[i]);
> +  }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> +  int i;
> +  for (i = 0; i < N; ++i) {
> +    cb[i] = 3 * i - 2048;
> +    cc[i] = -5 * i + 93;
> +    cd[i] = i + 14;
> +  }
> +}
> +
> +int main ()
> +{
> +  int i;
> +  init ();
> +  foo ();
> +  for (i = 0; i < N; ++i)
> +    if (ca[i] != (-3 * i - 1969) >> 3)
> +      abort ();
> +  if (hey != ca[N-1])
> +    abort ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c    (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c    (working copy)
> @@ -0,0 +1,56 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +
> +#include "altivec.h"
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +int hey;
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  vector int va, vb, vc, vd, tmp;
> +  vector unsigned int threes = vec_splat_u32(3);
> +  for (i = 0; i < N; i+=4) {
> +    vb = vec_vsx_ld (0, &cb[i]);
> +    vc = vec_vsx_ld (0, &cc[i]);
> +    vd = vec_vsx_ld (0, &cd[i]);
> +    tmp = vec_add (vb, vc);
> +    tmp = vec_sub (tmp, vd);
> +    tmp = vec_sra (tmp, threes);
> +    hey = tmp[3];
> +    vec_vsx_st (tmp, 0, &ca[i]);
> +  }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> +  int i;
> +  for (i = 0; i < N; ++i) {
> +    cb[i] = 3 * i - 2048;
> +    cc[i] = -5 * i + 93;
> +    cd[i] = i + 14;
> +  }
> +}
> +
> +int main ()
> +{
> +  int i;
> +  init ();
> +  foo ();
> +  for (i = 0; i < N; ++i)
> +    if (ca[i] != (-3 * i - 1969) >> 3)
> +      abort ();
> +  if (hey != ca[N-1])
> +    abort ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c    (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c    (working copy)
> @@ -0,0 +1,54 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +#include <altivec.h>
> +void abort ();
> +
> +#define N 4096
> +long long ca[N] __attribute__((aligned(16)));
> +long long cb[N] __attribute__((aligned(16)));
> +long long cc[N] __attribute__((aligned(16)));
> +long long cd[N] __attribute__((aligned(16)));
> +long long x;
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  vector long long va, vb, vc, vd, tmp;
> +  volatile unsigned long long three = 3;
> +  vector unsigned long long threes = vec_splats (three);
> +  for (i = 0; i < N; i+=2) {
> +    vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
> +    vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
> +    vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
> +    tmp = vec_add (vb, vc);
> +    tmp = vec_sub (tmp, vd);
> +    tmp = vec_sra (tmp, threes);
> +    x = vec_extract (tmp, 0);
> +    vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
> +  }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> +  int i;
> +  for (i = 0; i < N; ++i) {
> +    cb[i] = 3 * i - 2048;
> +    cc[i] = -5 * i + 93;
> +    cd[i] = i + 14;
> +  }
> +}
> +
> +int main ()
> +{
> +  int i;
> +  init ();
> +  foo ();
> +  for (i = 0; i < N; ++i)
> +    if (ca[i] != (-3 * i - 1969) >> 3)
> +      abort ();
> +  if (x != ca[N-1])
> +    abort ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c    (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c    (working copy)
> @@ -0,0 +1,51 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler "xxspltw" } } */
> +
> +/* Currently the analyze_swaps phase cannot optimize this loop because
> +   of the presence of an UNSPEC_VSX_CVDPSPN.  At such time as this is 
> +   handled, we need to add a 'scan-assembler-not "xxpermdi"' directive to
> +   this test.  */
> +#include <altivec.h>
> +void abort();
> +
> +#define N 4096
> +#define M 10000000
> +vector float ca[N][4] = {0};
> +vector float cb[N][4] = {0};
> +vector float cc[N][4] = {0};
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  for (i = 0; i < N; i++) {
> +    cc[i][0] = vec_mul(vec_splats(cb[i][0][0]), ca[i][0]);
> +    cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][1]), ca[i][1]);
> +    cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][2]), ca[i][2]);
> +    cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][3]), ca[i][3]);
> +
> +    cc[i][1] = vec_mul(vec_splats(cb[i][1][0]), ca[i][0]);
> +    cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][1]), ca[i][1]);
> +    cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][2]), ca[i][2]);
> +    cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][3]), ca[i][3]);
> +    
> +    cc[i][2] = vec_mul(vec_splats(cb[i][2][0]), ca[i][0]);
> +    cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][1]), ca[i][1]);
> +    cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][2]), ca[i][2]);
> +    cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][3]), ca[i][3]);
> +    
> +    cc[i][3] = vec_mul(vec_splats(cb[i][3][0]), ca[i][0]);
> +    cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][1]), ca[i][1]);
> +    cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][2]), ca[i][2]);
> +    cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][3]), ca[i][3]);
> +  }
> +}
> +
> +int main ()
> +{
> +  foo ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c    (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c    (working copy)
> @@ -0,0 +1,15 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O1" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "xxpermdi" } } */
> +
> +/* Verify that we don't try to do permute removal in the presence of
> +   vec_ste.  This used to ICE.  */
> +#include <altivec.h>
> +
> +void f (void *p)
> +{
> +  vector unsigned int u32 = vec_vsx_ld (1, (const unsigned int *)p);
> +  vec_ste (u32, 1, (unsigned int *)p);
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c     (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c     (working copy)
> @@ -0,0 +1,43 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +
> +void abort ();
> +
> +#define N 4096
> +signed char ca[N] __attribute__((aligned(16)));
> +signed char cb[N] __attribute__((aligned(16)));
> +signed char cc[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  for (i = 0; i < N; i++) {
> +    ca[i] = cb[i] - cc[i];
> +  }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> +  int i, ii;
> +  for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) {
> +    cb[i] = ii - 128;
> +    cc[i] = ii/2 - 64;
> +  }
> +}
> +
> +int main ()
> +{
> +  int i, ii;
> +  init ();
> +  foo ();
> +  for (i = 0; i < N; ++i) {
> +    ii = i % 128;
> +    if (ca[i] != ii - ii/2 - 64)
> +      abort ();
> +  }
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c     (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c     (working copy)
> @@ -0,0 +1,45 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  for (i = 0; i < N; i++) {
> +    ca[i] = (cb[i] + cc[i]) * cd[i];
> +  }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> +  int i;
> +  for (i = 0; i < N; ++i) {
> +    cb[i] = 3 * i - 2048;
> +    cc[i] = -5 * i + 93;
> +    cd[i] = i % 2 ? 1 : -1;
> +  }
> +}
> +
> +int main ()
> +{
> +  int i;
> +  init ();
> +  foo ();
> +  for (i = 0; i < N; ++i)
> +    if (i % 2 == 1 && ca[i] != -2 * i - 1955)
> +      abort ();
> +    else if (i % 2 == 0 && ca[i] != 1955 + 2 * i)
> +      abort ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c     (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c     (working copy)
> @@ -0,0 +1,45 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  for (i = 0; i < N; i++) {
> +    ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3;
> +  }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> +  int i;
> +  for (i = 0; i < N; ++i) {
> +    cb[i] = 3 * i - 2048;
> +    cc[i] = -5 * i + 93;
> +    cd[i] = i % 2 ? 1 : -1;
> +  }
> +}
> +
> +int main ()
> +{
> +  int i;
> +  init ();
> +  foo ();
> +  for (i = 0; i < N; ++i)
> +    if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3)
> +      abort ();
> +    else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3)
> +      abort ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c     (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c     (working copy)
> @@ -0,0 +1,32 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +void abort();
> +
> +#define N 16
> +
> +signed char ca[N] __attribute__((aligned(16)));
> +signed char cb[] __attribute__((aligned(16)))
> +  = {8, 7, 6, 5, 4, 3, 2,  1,  0, -1, -2, -3, -4, -5, -6, -7};
> +signed char cc[] __attribute__((aligned(16)))
> +  = {1, 1, 2, 2, 3, 3, 2,  2,  1,  1,  0,  0, -1, -1, -2, -2};
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  for (i = 0; i < N; i++) {
> +    ca[i] = cb[i] - cc[i];
> +  }
> +}
> +
> +int main ()
> +{
> +  signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5};
> +  int i;
> +  foo ();
> +  for (i = 0; i < N; ++i)
> +    if (ca[i] != cd[i])
> +      abort ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c     (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c     (working copy)
> @@ -0,0 +1,38 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +void abort ();
> +
> +#define N 256
> +signed char ca[N] __attribute__((aligned(16)));
> +signed char cb[N] __attribute__((aligned(16)));
> +signed char cc[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  for (i = 0; i < N; i++) {
> +    ca[i] = cb[i] - cc[i];
> +  }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> +  int i;
> +  for (i = 0; i < N; ++i) {
> +    cb[i] = i - 128;
> +    cc[i] = i/2 - 64;
> +  }
> +}
> +
> +int main ()
> +{
> +  int i;
> +  init ();
> +  foo ();
> +  for (i = 0; i < N; ++i)
> +    if (ca[i] != i - i/2 - 64)
> +      abort ();
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c     (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c     (working copy)
> @@ -0,0 +1,40 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +void abort ();
> +
> +#define N 4096
> +signed char ca[N] __attribute__((aligned(16)));
> +signed char cb[N] __attribute__((aligned(16)));
> +signed char cc[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  for (i = 0; i < N; i++) {
> +    ca[i] = cb[i] - cc[i];
> +  }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> +  int i, ii;
> +  for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) {
> +    cb[i] = ii - 128;
> +    cc[i] = ii/2 - 64;
> +  }
> +}
> +
> +int main ()
> +{
> +  int i, ii;
> +  init ();
> +  foo ();
> +  for (i = 0; i < N; ++i) {
> +    ii = i % 128;
> +    if (ca[i] != ii - ii/2 - 64)
> +      abort ();
> +  }
> +  return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c     (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c     (working copy)
> @@ -0,0 +1,42 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> +  int i;
> +  for (i = 0; i < N; i++) {
> +    ca[i] = (cb[i] + cc[i]) * cd[i];
> +  }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> +  int i;
> +  for (i = 0; i < N; ++i) {
> +    cb[i] = 3 * i - 2048;
> +    cc[i] = -5 * i + 93;
> +    cd[i] = i % 2 ? 1 : -1;
> +  }
> +}
> +
> +int main ()
> +{
> +  int i;
> +  init ();
> +  foo ();
> +  for (i = 0; i < N; ++i)
> +    if (i % 2 == 1 && ca[i] != -2 * i - 1955)
> +      abort ();
> +    else if (i % 2 == 0 && ca[i] != 1955 + 2 * i)
> +      abort ();
> +  return 0;
> +}
> Index: gcc/web.c
> ===================================================================
> --- gcc/web.c (revision 221696)
> +++ gcc/web.c (working copy)
> @@ -53,17 +53,17 @@ along with GCC; see the file COPYING3.  If not see
> 
>  /* Find the root of unionfind tree (the representative of set).  */
> 
> -struct web_entry *
> -unionfind_root (struct web_entry *element)
> +web_entry_base *
> +web_entry_base::unionfind_root ()
>  {
> -  struct web_entry *element1 = element, *element2;
> +  web_entry_base *element = this, *element1 = this, *element2;
> 
> -  while (element->pred)
> -    element = element->pred;
> -  while (element1->pred)
> +  while (element->pred ())
> +    element = element->pred ();
> +  while (element1->pred ())
>      {
> -      element2 = element1->pred;
> -      element1->pred = element;
> +      element2 = element1->pred ();
> +      element1->set_pred (element);
>        element1 = element2;
>      }
>    return element;
> @@ -74,23 +74,32 @@ along with GCC; see the file COPYING3.  If not see
>     nothing is done.  Otherwise, return false.  */
> 
>  bool
> -unionfind_union (struct web_entry *first, struct web_entry *second)
> +unionfind_union (web_entry_base *first, web_entry_base *second)
>  {
> -  first = unionfind_root (first);
> -  second = unionfind_root (second);
> +  first = first->unionfind_root ();
> +  second = second->unionfind_root ();
>    if (first == second)
>      return true;
> -  second->pred = first;
> +  second->set_pred (first);
>    return false;
>  }
> 
> +class web_entry : public web_entry_base
> +{
> + private:
> +  rtx reg_pvt;
> +
> + public:
> +  rtx reg () { return reg_pvt; }
> +  void set_reg (rtx r) { reg_pvt = r; }
> +};
> +
>  /* For INSN, union all defs and uses that are linked by match_dup.
>     FUN is the function that does the union.  */
> 
>  static void
> -union_match_dups (rtx insn, struct web_entry *def_entry,
> -               struct web_entry *use_entry,
> -               bool (*fun) (struct web_entry *, struct web_entry *))
> +union_match_dups (rtx insn, web_entry *def_entry, web_entry *use_entry,
> +               bool (*fun) (web_entry_base *, web_entry_base *))
>  {
>    struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
>    df_ref *use_link = DF_INSN_INFO_USES (insn_info);
> @@ -157,9 +166,9 @@ static void
>     the values 0 and 1 are reserved for use by entry_register.  */
> 
>  void
> -union_defs (df_ref use, struct web_entry *def_entry,
> -         unsigned int *used, struct web_entry *use_entry,
> -         bool (*fun) (struct web_entry *, struct web_entry *))
> +union_defs (df_ref use, web_entry *def_entry,
> +         unsigned int *used, web_entry *use_entry,
> +         bool (*fun) (web_entry_base *, web_entry_base *))
>  {
>    struct df_insn_info *insn_info = DF_REF_INSN_INFO (use);
>    struct df_link *link = DF_REF_CHAIN (use);
> @@ -260,15 +269,15 @@ void
>  /* Find the corresponding register for the given entry.  */
> 
>  static rtx
> -entry_register (struct web_entry *entry, df_ref ref, unsigned int *used)
> +entry_register (web_entry *entry, df_ref ref, unsigned int *used)
>  {
> -  struct web_entry *root;
> +  web_entry *root;
>    rtx reg, newreg;
> 
>    /* Find the corresponding web and see if it has been visited.  */
> -  root = unionfind_root (entry);
> -  if (root->reg)
> -    return root->reg;
> +  root = (web_entry *)entry->unionfind_root ();
> +  if (root->reg ())
> +    return root->reg ();
> 
>    /* We are seeing this web for the first time, do the assignment.  */
>    reg = DF_REF_REAL_REG (ref);
> @@ -292,7 +301,7 @@ static rtx
>                REGNO (newreg));
>      }
> 
> -  root->reg = newreg;
> +  root->set_reg (newreg);
>    return newreg;
>  }
> 
> @@ -326,8 +335,8 @@ gate_handle_web (void)
>  static unsigned int
>  web_main (void)
>  {
> -  struct web_entry *def_entry;
> -  struct web_entry *use_entry;
> +  web_entry *def_entry;
> +  web_entry *use_entry;
>    unsigned int max = max_reg_num ();
>    unsigned int *used;
>    basic_block bb;
> @@ -364,9 +373,9 @@ web_main (void)
>      }
> 
> +  /* Record the number of uses and defs at the beginning of the optimization.  */
> -  def_entry = XCNEWVEC (struct web_entry, DF_DEFS_TABLE_SIZE());
> +  def_entry = XCNEWVEC (web_entry, DF_DEFS_TABLE_SIZE());
>    used = XCNEWVEC (unsigned, max);
> -  use_entry = XCNEWVEC (struct web_entry, uses_num);
> +  use_entry = XCNEWVEC (web_entry, uses_num);
> 
>    /* Produce the web.  */
>    FOR_ALL_BB (bb)
> 
> 


Reply via email to