Oops. Fixed post title.
On Thu, 2015-03-26 at 10:23 -0500, Bill Schmidt wrote:
> Hi,
>
> This is a follow-up to
> https://gcc.gnu.org/ml/gcc-patches/2015-03/msg01310.html, which
> backported the POWER-specific little-endian swap optimization pass to
> the 4.9 branch. We also need to backport this to the 4.8 branch. This
> patch does that.
>
> The patch is very similar to the 4.9 backport, except for two things.
> First, the passes infrastructure changed quite a bit between 4.8 and
> 4.9, so the code to describe the new pass to the pass manager is
> somewhat different. Second, I've omitted three of the test cases, which
> happen to fail on 4.8 for unrelated reasons. (We run out of volatile
> registers and end up saving non-volatiles to the stack in the prologue,
> which generates load/swap sequences for now.)
>
> Tested on powerpc64le-unknown-linux-gnu with no regressions. Is this OK
> for 4.8?
>
> Thanks,
> Bill
>
>
> [gcc]
>
> 2015-03-26 Bill Schmidt <wschm...@linux.vnet.ibm.com>
>
> Backport of r214242, r214254, and bug fix patches from mainline
> * config/rs6000/rs6000.c (tree-pass.h): New #include.
> (rs6000_analyze_swaps): New declaration.
> (gate_analyze_swaps): New function.
> (execute_analyze_swaps): Likewise.
> (pass_analyze_swaps): New struct rtl_opt_pass.
> (rs6000_option_override): Register swap-optimization pass.
> (swap_web_entry): New class.
> (special_handling_values): New enum.
> (union_defs): New function.
> (union_uses): Likewise.
> (insn_is_load_p): Likewise.
> (insn_is_store_p): Likewise.
> (insn_is_swap_p): Likewise.
> (rtx_is_swappable_p): Likewise.
> (insn_is_swappable_p): Likewise.
> (chain_purpose): New enum.
> (chain_contains_only_swaps): New function.
> (mark_swaps_for_removal): Likewise.
> (swap_const_vector_halves): Likewise.
> (adjust_subreg_index): Likewise.
> (permute_load): Likewise.
> (permute_store): Likewise.
> (adjust_extract): Likewise.
> (adjust_splat): Likewise.
> (handle_special_swappables): Likewise.
> (replace_swap_with_copy): Likewise.
> (dump_swap_insn_table): Likewise.
> (rs6000_analyze_swaps): Likewise.
> * config/rs6000/rs6000.opt (moptimize-swaps): New option.
> * df.h (web_entry_base): New class, replacing struct web_entry.
> (web_entry_base::pred): New method.
> (web_entry_base::set_pred): Likewise.
> (web_entry_base::unionfind_root): Likewise.
> (web_entry_base::unionfind_union): Likewise.
> (unionfind_root): Delete external reference.
> (unionfind_union): Likewise.
> (union_defs): Likewise.
> * web.c (web_entry_base::unionfind_root): Convert to method.
> (web_entry_base::unionfind_union): Likewise.
> (web_entry): New class.
> (union_match_dups): Convert to use class structure.
> (union_defs): Likewise.
> (entry_register): Likewise.
> (web_main): Likewise.
>
>
> [gcc/testsuite]
>
> 2015-03-26 Bill Schmidt <wschm...@linux.vnet.ibm.com>
>
> Backport r214254 and related tests from mainline
> * gcc.target/powerpc/swaps-p8-1.c: New test.
> * gcc.target/powerpc/swaps-p8-3.c: New test.
> * gcc.target/powerpc/swaps-p8-4.c: New test.
> * gcc.target/powerpc/swaps-p8-5.c: New test.
> * gcc.target/powerpc/swaps-p8-6.c: New test.
> * gcc.target/powerpc/swaps-p8-7.c: New test.
> * gcc.target/powerpc/swaps-p8-8.c: New test.
> * gcc.target/powerpc/swaps-p8-9.c: New test.
> * gcc.target/powerpc/swaps-p8-10.c: New test.
> * gcc.target/powerpc/swaps-p8-11.c: New test.
> * gcc.target/powerpc/swaps-p8-12.c: New test.
> * gcc.target/powerpc/swaps-p8-13.c: New test.
> * gcc.target/powerpc/swaps-p8-15.c: New test.
> * gcc.target/powerpc/swaps-p8-17.c: New test.
>
>
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c (revision 221696)
> +++ gcc/config/rs6000/rs6000.c (working copy)
> @@ -61,6 +61,7 @@
> #include "tree-vectorizer.h"
> #include "dumpfile.h"
> #include "real.h"
> +#include "tree-pass.h"
> #if TARGET_XCOFF
> #include "xcoffout.h" /* get declarations of xcoff_*_section_name */
> #endif
> @@ -1153,6 +1154,7 @@ static bool rs6000_secondary_reload_move (enum rs6
> enum machine_mode,
> secondary_reload_info *,
> bool);
> +static unsigned int rs6000_analyze_swaps (function *);
>
> /* Hash table stuff for keeping track of TOC entries. */
>
> @@ -4046,6 +4048,37 @@ rs6000_option_override_internal (bool global_init_
> return ret;
> }
>
> +static bool
> +gate_analyze_swaps (void)
> +{
> + return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
> + && rs6000_optimize_swaps);
> +}
> +
> +static unsigned int
> +execute_analyze_swaps (void)
> +{
> + return rs6000_analyze_swaps (cfun);
> +}
> +
> +struct rtl_opt_pass pass_analyze_swaps =
> +{
> + RTL_PASS,
> + "swaps", /* name */
> + OPTGROUP_NONE, /* optinfo_flags */
> + gate_analyze_swaps, /* gate */
> + execute_analyze_swaps, /* execute */
> + NULL, /* sub */
> + NULL, /* next */
> + 0, /* static_pass_number */
> + TV_NONE, /* tv_id */
> + 0, /* properties_required */
> + 0, /* properties_provided */
> + 0, /* properties_destroyed */
> + 0, /* todo_flags_start */
> + TODO_df_finish, /* todo_flags_finish */
> +};
> +
> /* Implement TARGET_OPTION_OVERRIDE. On the RS/6000 this is used to
> define the target cpu type. */
>
> @@ -4053,6 +4086,13 @@ static void
> rs6000_option_override (void)
> {
> (void) rs6000_option_override_internal (true);
> +
> + /* Register machine-specific passes. This needs to be done at start-up.
> + It's convenient to do it here (like i386 does). */
> + static struct register_pass_info analyze_swaps_info
> + = { &pass_analyze_swaps.pass, "cse1", 1, PASS_POS_INSERT_BEFORE };
> +
> + register_pass (&analyze_swaps_info);
> }
>
>
> @@ -33210,7 +33250,1148 @@ emit_fusion_gpr_load (rtx target, rtx mem)
>
> return "";
> }
> +
> +/* Analyze vector computations and remove unnecessary doubleword
> + swaps (xxswapdi instructions). This pass is performed only
> + for little-endian VSX code generation.
> +
> + For this specific case, loads and stores of 4x32 and 2x64 vectors
> + are inefficient. These are implemented using the lxvd2x and
> + stxvd2x instructions, which invert the order of doublewords in
> + a vector register. Thus the code generation inserts an xxswapdi
> + after each such load, and prior to each such store. (For spill
> + code after register assignment, an additional xxswapdi is inserted
> + following each store in order to return a hard register to its
> + unpermuted value.)
> +
> + The extra xxswapdi instructions reduce performance. This can be
> + particularly bad for vectorized code. The purpose of this pass
> + is to reduce the number of xxswapdi instructions required for
> + correctness.
> +
> + The primary insight is that much code that operates on vectors
> + does not care about the relative order of elements in a register,
> + so long as the correct memory order is preserved. If we have
> + a computation where all input values are provided by lxvd2x/xxswapdi
> + sequences, all outputs are stored using xxswapdi/stxvd2x sequences,
> + and all intermediate computations are pure SIMD (independent of
> + element order), then all the xxswapdi's associated with the loads
> + and stores may be removed.
> +
> + This pass uses some of the infrastructure and logical ideas from
> + the "web" pass in web.c. We create maximal webs of computations
> + fitting the description above using union-find. Each such web is
> + then optimized by removing its unnecessary xxswapdi instructions.
> +
> + The pass is placed prior to global optimization so that we can
> + perform the optimization in the safest and simplest way possible;
> + that is, by replacing each xxswapdi insn with a register copy insn.
> + Subsequent forward propagation will remove copies where possible.
> +
> + There are some operations sensitive to element order for which we
> + can still allow the operation, provided we modify those operations.
> + These include CONST_VECTORs, for which we must swap the first and
> + second halves of the constant vector; and SUBREGs, for which we
> + must adjust the byte offset to account for the swapped doublewords.
> + A remaining opportunity would be non-immediate-form splats, for
> + which we should adjust the selected lane of the input. We should
> + also make code generation adjustments for sum-across operations,
> + since this is a common vectorizer reduction.
> +
> + Because we run prior to the first split, we can see loads and stores
> + here that match *vsx_le_perm_{load,store}_<mode>. These are vanilla
> + vector loads and stores that have not yet been split into a permuting
> + load/store and a swap. (One way this can happen is with a builtin
> + call to vec_vsx_{ld,st}.) We can handle these as well, but rather
> + than deleting a swap, we convert the load/store into a permuting
> + load/store (which effectively removes the swap). */
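> +
> +/* A minimal sketch of the idiom in question (editorial illustration;
> +   register numbers are arbitrary).  A little-endian vector copy
> +   currently expands to
> +
> +     lxvd2x 0,0,9       # load with doublewords reversed
> +     xxpermdi 0,0,0,2   # xxswapdi to restore element order
> +     xxpermdi 0,0,0,2   # xxswapdi to prepare for the store
> +     stxvd2x 0,0,10     # store with doublewords reversed
> +
> +   Both xxswapdi instructions belong to an optimizable web and are
> +   replaced by copies, leaving just the load and store.  */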
> +
> +/* Notes on Permutes
> +
> + We do not currently handle computations that contain permutes. There
> + is a general transformation that can be performed correctly, but it
> + may introduce more expensive code than it replaces. To handle these
> + would require a cost model to determine when to perform the optimization.
> + This commentary records how this could be done if desired.
> +
> + The most general permute is something like this (example for V16QI):
> +
> +     (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI))
> +                       (parallel [(const_int a0) (const_int a1)
> +                                  ...
> +                                  (const_int a14) (const_int a15)]))
> +
> + where a0,...,a15 are in [0,31] and select elements from op1 and op2
> + to be produced in the result.
> +
> + Regardless of mode, we can convert the PARALLEL to a mask of 16
> + byte-element selectors. Let's call this M, with M[i] representing
> + the ith byte-element selector value. Then if we swap doublewords
> + throughout the computation, we can get correct behavior by replacing
> + M with M' as follows:
> +
> +            { M[i+8]+8 : i < 8,  M[i+8] in [0,7]  U [16,23]
> +   M'[i] =  { M[i+8]-8 : i < 8,  M[i+8] in [8,15] U [24,31]
> +            { M[i-8]+8 : i >= 8, M[i-8] in [0,7]  U [16,23]
> +            { M[i-8]-8 : i >= 8, M[i-8] in [8,15] U [24,31]
> +
> + This seems promising at first, since we are just replacing one mask
> + with another. But certain masks are preferable to others. If M
> + is a mask that matches a vmrghh pattern, for example, M' certainly
> + will not. Instead of a single vmrghh, we would generate a load of
> + M' and a vperm. So we would need to know how many xxswapd's we can
> + remove as a result of this transformation to determine if it's
> + profitable; and preferably the logic would need to be aware of all
> + the special preferable masks.
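> +
> + As a concrete check (an editorial example, not from the original
> + patch): the byte mask for vmrghh is
> + M = {0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23}, and the rules above
> + give M' = {12,13,28,29,14,15,30,31,8,9,24,25,10,11,26,27}, which
> + matches no single merge or pack instruction.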
> +
> + Another form of permute is an UNSPEC_VPERM, in which the mask is
> + already in a register. In some cases, this mask may be a constant
> + that we can discover with ud-chains, in which case the above
> + transformation is ok. However, the common usage here is for the
> + mask to be produced by an UNSPEC_LVSL, in which case the mask
> + cannot be known at compile time. In such a case we would have to
> + generate several instructions to compute M' as above at run time,
> + and a cost model is needed again. */
> +
> +/* This is based on the union-find logic in web.c. web_entry_base is
> + defined in df.h. */
> +class swap_web_entry : public web_entry_base
> +{
> + public:
> + /* Pointer to the insn. */
> + rtx insn;
> + /* Set if insn contains a mention of a vector register. All other
> + fields are undefined if this field is unset. */
> + unsigned int is_relevant : 1;
> + /* Set if insn is a load. */
> + unsigned int is_load : 1;
> + /* Set if insn is a store. */
> + unsigned int is_store : 1;
> + /* Set if insn is a doubleword swap. This can either be a register swap
> + or a permuting load or store (test is_load and is_store for this). */
> + unsigned int is_swap : 1;
> + /* Set if the insn has a live-in use of a parameter register. */
> + unsigned int is_live_in : 1;
> + /* Set if the insn has a live-out def of a return register. */
> + unsigned int is_live_out : 1;
> + /* Set if the insn contains a subreg reference of a vector register. */
> + unsigned int contains_subreg : 1;
> + /* Set if the insn contains a 128-bit integer operand. */
> + unsigned int is_128_int : 1;
> + /* Set if this is a call-insn. */
> + unsigned int is_call : 1;
> + /* Set if this insn does not perform a vector operation for which
> + element order matters, or if we know how to fix it up if it does.
> + Undefined if is_swap is set. */
> + unsigned int is_swappable : 1;
> + /* A nonzero value indicates what kind of special handling for this
> + insn is required if doublewords are swapped. Undefined if
> + is_swappable is not set. */
> + unsigned int special_handling : 3;
> + /* Set if the web represented by this entry cannot be optimized. */
> + unsigned int web_not_optimizable : 1;
> + /* Set if this insn should be deleted. */
> + unsigned int will_delete : 1;
> +};
> +
> +enum special_handling_values {
> + SH_NONE = 0,
> + SH_CONST_VECTOR,
> + SH_SUBREG,
> + SH_NOSWAP_LD,
> + SH_NOSWAP_ST,
> + SH_EXTRACT,
> + SH_SPLAT
> +};
> +
> +/* Union INSN with all insns containing definitions that reach USE.
> + Detect whether USE is live-in to the current function. */
> +static void
> +union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use)
> +{
> + struct df_link *link = DF_REF_CHAIN (use);
> +
> + if (!link)
> + insn_entry[INSN_UID (insn)].is_live_in = 1;
> +
> + while (link)
> + {
> + if (DF_REF_IS_ARTIFICIAL (link->ref))
> + insn_entry[INSN_UID (insn)].is_live_in = 1;
> +
> + if (DF_REF_INSN_INFO (link->ref))
> + {
> + rtx def_insn = DF_REF_INSN (link->ref);
> + (void)unionfind_union (insn_entry + INSN_UID (insn),
> + insn_entry + INSN_UID (def_insn));
> + }
> +
> + link = link->next;
> + }
> +}
> +
> +/* Union INSN with all insns containing uses reached from DEF.
> + Detect whether DEF is live-out from the current function. */
> +static void
> +union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def)
> +{
> + struct df_link *link = DF_REF_CHAIN (def);
> +
> + if (!link)
> + insn_entry[INSN_UID (insn)].is_live_out = 1;
> +
> + while (link)
> + {
> + /* This could be an eh use or some other artificial use;
> + we treat these all the same (killing the optimization). */
> + if (DF_REF_IS_ARTIFICIAL (link->ref))
> + insn_entry[INSN_UID (insn)].is_live_out = 1;
> +
> + if (DF_REF_INSN_INFO (link->ref))
> + {
> + rtx use_insn = DF_REF_INSN (link->ref);
> + (void)unionfind_union (insn_entry + INSN_UID (insn),
> + insn_entry + INSN_UID (use_insn));
> + }
> +
> + link = link->next;
> + }
> +}
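> +
> +/* Together these two routines build the webs: an insn is unioned
> +   with every insn that defines one of its inputs (union_defs) and
> +   with every insn that uses one of its results (union_uses), so
> +   each web is a transitive closure over def-use connectivity.  */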
> +
> +/* Return 1 iff INSN is a load insn, including permuting loads that
> + represent an lxvd2x instruction; else return 0. */
> +static unsigned int
> +insn_is_load_p (rtx insn)
> +{
> + rtx body = PATTERN (insn);
> +
> + if (GET_CODE (body) == SET)
> + {
> + if (GET_CODE (SET_SRC (body)) == MEM)
> + return 1;
> +
> + if (GET_CODE (SET_SRC (body)) == VEC_SELECT
> + && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM)
> + return 1;
> +
> + return 0;
> + }
> +
> + if (GET_CODE (body) != PARALLEL)
> + return 0;
> +
> + rtx set = XVECEXP (body, 0, 0);
> +
> + if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM)
> + return 1;
> +
> + return 0;
> +}
> +
> +/* Return 1 iff INSN is a store insn, including permuting stores that
> + represent an stxvd2x instruction; else return 0. */
> +static unsigned int
> +insn_is_store_p (rtx insn)
> +{
> + rtx body = PATTERN (insn);
> + if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM)
> + return 1;
> + if (GET_CODE (body) != PARALLEL)
> + return 0;
> + rtx set = XVECEXP (body, 0, 0);
> + if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM)
> + return 1;
> + return 0;
> +}
> +
> +/* Return 1 iff INSN swaps doublewords. This may be a reg-reg swap,
> + a permuting load, or a permuting store. */
> +static unsigned int
> +insn_is_swap_p (rtx insn)
> +{
> + rtx body = PATTERN (insn);
> + if (GET_CODE (body) != SET)
> + return 0;
> + rtx rhs = SET_SRC (body);
> + if (GET_CODE (rhs) != VEC_SELECT)
> + return 0;
> + rtx parallel = XEXP (rhs, 1);
> + if (GET_CODE (parallel) != PARALLEL)
> + return 0;
> + unsigned int len = XVECLEN (parallel, 0);
> + if (len != 2 && len != 4 && len != 8 && len != 16)
> + return 0;
> + for (unsigned int i = 0; i < len / 2; ++i)
> + {
> + rtx op = XVECEXP (parallel, 0, i);
> + if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i)
> + return 0;
> + }
> + for (unsigned int i = len / 2; i < len; ++i)
> + {
> + rtx op = XVECEXP (parallel, 0, i);
> + if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2)
> + return 0;
> + }
> + return 1;
> +}
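> +
> +/* For reference, the V2DI form of the swap recognized above is
> +
> +     (set (reg:V2DI 64)
> +          (vec_select:V2DI (reg:V2DI 65)
> +                           (parallel [(const_int 1) (const_int 0)])))
> +
> +   (register numbers arbitrary); the V4SI form uses
> +   (parallel [2 3 0 1]), and so on for V8HI and V16QI.  */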
> +
> +/* Return 1 iff OP is an operand that will not be affected by having
> + vector doublewords swapped in memory. */
> +static unsigned int
> +rtx_is_swappable_p (rtx op, unsigned int *special)
> +{
> + enum rtx_code code = GET_CODE (op);
> + int i, j;
> + rtx parallel;
> +
> + switch (code)
> + {
> + case LABEL_REF:
> + case SYMBOL_REF:
> + case CLOBBER:
> + case REG:
> + return 1;
> +
> + case VEC_CONCAT:
> + case ASM_INPUT:
> + case ASM_OPERANDS:
> + return 0;
> +
> + case CONST_VECTOR:
> + {
> + *special = SH_CONST_VECTOR;
> + return 1;
> + }
> +
> + case VEC_DUPLICATE:
> + /* Opportunity: If XEXP (op, 0) has the same mode as the result,
> + and XEXP (op, 1) is a PARALLEL with a single QImode const int,
> + it represents a vector splat for which we can do special
> + handling. */
> + if (GET_CODE (XEXP (op, 0)) == CONST_INT)
> + return 1;
> + else if (GET_CODE (XEXP (op, 0)) == REG
> + && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
> + /* This catches V2DF and V2DI splat, at a minimum. */
> + return 1;
> + else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
> + /* If the duplicated item is from a select, defer to the select
> + processing to see if we can change the lane for the splat. */
> + return rtx_is_swappable_p (XEXP (op, 0), special);
> + else
> + return 0;
> +
> + case VEC_SELECT:
> + /* A vec_extract operation is ok if we change the lane. */
> + if (GET_CODE (XEXP (op, 0)) == REG
> + && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
> + && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
> + && XVECLEN (parallel, 0) == 1
> + && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
> + {
> + *special = SH_EXTRACT;
> + return 1;
> + }
> + else
> + return 0;
> +
> + case UNSPEC:
> + {
> + /* Various operations are unsafe for this optimization, at least
> + without significant additional work. Permutes are obviously
> + problematic, as both the permute control vector and the ordering
> + of the target values are invalidated by doubleword swapping.
> + Vector pack and unpack modify the number of vector lanes.
> + Merge-high/low will not operate correctly on swapped operands.
> + Vector shifts across element boundaries are clearly uncool,
> + as are vector select and concatenate operations. Vector
> + sum-across instructions define one operand with a specific
> + order-dependent element, so additional fixup code would be
> + needed to make those work. Vector set and non-immediate-form
> + vector splat are element-order sensitive. A few of these
> + cases might be workable with special handling if required. */
> + int val = XINT (op, 1);
> + switch (val)
> + {
> + default:
> + break;
> + case UNSPEC_VMRGH_DIRECT:
> + case UNSPEC_VMRGL_DIRECT:
> + case UNSPEC_VPACK_SIGN_SIGN_SAT:
> + case UNSPEC_VPACK_SIGN_UNS_SAT:
> + case UNSPEC_VPACK_UNS_UNS_MOD:
> + case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
> + case UNSPEC_VPACK_UNS_UNS_SAT:
> + case UNSPEC_VPERM:
> + case UNSPEC_VPERM_UNS:
> + case UNSPEC_VPERMHI:
> + case UNSPEC_VPERMSI:
> + case UNSPEC_VPKPX:
> + case UNSPEC_VSLDOI:
> + case UNSPEC_VSLO:
> + case UNSPEC_VSRO:
> + case UNSPEC_VSUM2SWS:
> + case UNSPEC_VSUM4S:
> + case UNSPEC_VSUM4UBS:
> + case UNSPEC_VSUMSWS:
> + case UNSPEC_VSUMSWS_DIRECT:
> + case UNSPEC_VSX_CONCAT:
> + case UNSPEC_VSX_SET:
> + case UNSPEC_VSX_SLDWI:
> + case UNSPEC_VUNPACK_HI_SIGN:
> + case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
> + case UNSPEC_VUNPACK_LO_SIGN:
> + case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
> + case UNSPEC_VUPKHPX:
> + case UNSPEC_VUPKHS_V4SF:
> + case UNSPEC_VUPKHU_V4SF:
> + case UNSPEC_VUPKLPX:
> + case UNSPEC_VUPKLS_V4SF:
> + case UNSPEC_VUPKLU_V4SF:
> + /* The following could be handled as an idiom with XXSPLTW.
> + These place a scalar in BE element zero, but the XXSPLTW
> + will currently expect it in BE element 2 in a swapped
> + region. When one of these feeds an XXSPLTW with no other
> + defs/uses either way, we can avoid the lane change for
> + XXSPLTW and things will be correct. TBD. */
> + case UNSPEC_VSX_CVDPSPN:
> + case UNSPEC_VSX_CVSPDP:
> + case UNSPEC_VSX_CVSPDPN:
> + return 0;
> + case UNSPEC_VSPLT_DIRECT:
> + *special = SH_SPLAT;
> + return 1;
> + }
> + }
> +
> + default:
> + break;
> + }
> +
> + const char *fmt = GET_RTX_FORMAT (code);
> + int ok = 1;
> +
> + for (i = 0; i < GET_RTX_LENGTH (code); ++i)
> + if (fmt[i] == 'e' || fmt[i] == 'u')
> + {
> + unsigned int special_op = SH_NONE;
> + ok &= rtx_is_swappable_p (XEXP (op, i), &special_op);
> + /* Ensure we never have two kinds of special handling
> + for the same insn. */
> + if (*special != SH_NONE && special_op != SH_NONE
> + && *special != special_op)
> + return 0;
> + *special = special_op;
> + }
> + else if (fmt[i] == 'E')
> + for (j = 0; j < XVECLEN (op, i); ++j)
> + {
> + unsigned int special_op = SH_NONE;
> + ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op);
> + /* Ensure we never have two kinds of special handling
> + for the same insn. */
> + if (*special != SH_NONE && special_op != SH_NONE
> + && *special != special_op)
> + return 0;
> + *special = special_op;
> + }
> +
> + return ok;
> +}
> +
> +/* Return 1 iff INSN is an operand that will not be affected by
> + having vector doublewords swapped in memory (in which case
> + *SPECIAL is unchanged), or that can be modified to be correct
> + if vector doublewords are swapped in memory (in which case
> + *SPECIAL is changed to a value indicating how). */
> +static unsigned int
> +insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn,
> + unsigned int *special)
> +{
> + /* Calls are always bad. */
> + if (GET_CODE (insn) == CALL_INSN)
> + return 0;
> +
> + /* Loads and stores seen here are not permuting, but we can still
> + fix them up by converting them to permuting ones. Exceptions:
> + UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL
> + body instead of a SET; and UNSPEC_STVE, which has an UNSPEC
> + for the SET source. */
> + rtx body = PATTERN (insn);
> + int i = INSN_UID (insn);
> +
> + if (insn_entry[i].is_load)
> + {
> + if (GET_CODE (body) == SET)
> + {
> + *special = SH_NOSWAP_LD;
> + return 1;
> + }
> + else
> + return 0;
> + }
> +
> + if (insn_entry[i].is_store)
> + {
> + if (GET_CODE (body) == SET && GET_CODE (SET_SRC (body)) != UNSPEC)
> + {
> + *special = SH_NOSWAP_ST;
> + return 1;
> + }
> + else
> + return 0;
> + }
> +
> + /* Otherwise check the operands for vector lane violations. */
> + return rtx_is_swappable_p (body, special);
> +}
> +
> +enum chain_purpose { FOR_LOADS, FOR_STORES };
> +
> +/* Return true if the UD or DU chain headed by LINK is non-empty,
> + and every entry on the chain references an insn that is a
> + register swap. Furthermore, if PURPOSE is FOR_LOADS, each such
> + register swap must have only permuting loads as reaching defs.
> + If PURPOSE is FOR_STORES, each such register swap must have only
> + register swaps or permuting stores as reached uses. */
> +static bool
> +chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link,
> + enum chain_purpose purpose)
> +{
> + if (!link)
> + return false;
> +
> + for (; link; link = link->next)
> + {
> + if (!VECTOR_MODE_P (GET_MODE (DF_REF_REG (link->ref))))
> + continue;
> +
> + if (DF_REF_IS_ARTIFICIAL (link->ref))
> + return false;
> +
> + rtx reached_insn = DF_REF_INSN (link->ref);
> + unsigned uid = INSN_UID (reached_insn);
> +
> + if (!insn_entry[uid].is_swap || insn_entry[uid].is_load
> + || insn_entry[uid].is_store)
> + return false;
> +
> + if (purpose == FOR_LOADS)
> + {
> + df_ref *use_rec;
> + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
> + {
> + df_ref use = *use_rec;
> + struct df_link *swap_link = DF_REF_CHAIN (use);
> +
> + while (swap_link)
> + {
> + if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
> + return false;
> +
> + rtx swap_def_insn = DF_REF_INSN (swap_link->ref);
> + unsigned uid2 = INSN_UID (swap_def_insn);
> +
> + /* Only permuting loads are allowed. */
> + if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load)
> + return false;
> +
> + swap_link = swap_link->next;
> + }
> + }
> + }
> + else if (purpose == FOR_STORES)
> + {
> + df_ref *def_rec;
> + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
> + {
> + df_ref def = *def_rec;
> + struct df_link *swap_link = DF_REF_CHAIN (def);
> +
> + while (swap_link)
> + {
> + if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
> + return false;
> +
> + rtx swap_use_insn = DF_REF_INSN (swap_link->ref);
> + unsigned uid2 = INSN_UID (swap_use_insn);
> +
> + /* Permuting stores or register swaps are allowed. */
> + if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load)
> + return false;
> +
> + swap_link = swap_link->next;
> + }
> + }
> + }
> + }
> +
> + return true;
> +}
> +
> +/* Mark the xxswapdi instructions associated with permuting loads and
> + stores for removal. Note that we only flag them for deletion here,
> + as there is a possibility of a swap being reached from multiple
> + loads, etc. */
> +static void
> +mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i)
> +{
> + rtx insn = insn_entry[i].insn;
> + unsigned uid = INSN_UID (insn);
> +
> + if (insn_entry[i].is_load)
> + {
> + df_ref *def_rec;
> + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
> + {
> + df_ref def = *def_rec;
> + struct df_link *link = DF_REF_CHAIN (def);
> +
> + /* We know by now that these are swaps, so we can delete
> + them confidently. */
> + while (link)
> + {
> + rtx use_insn = DF_REF_INSN (link->ref);
> + insn_entry[INSN_UID (use_insn)].will_delete = 1;
> + link = link->next;
> + }
> + }
> + }
> + else if (insn_entry[i].is_store)
> + {
> + df_ref *use_rec;
> + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
> + {
> + df_ref use = *use_rec;
> + /* Ignore uses for addressability. */
> + machine_mode mode = GET_MODE (DF_REF_REG (use));
> + if (!VECTOR_MODE_P (mode))
> + continue;
> +
> + struct df_link *link = DF_REF_CHAIN (use);
> +
> + /* We know by now that these are swaps, so we can delete
> + them confidently. */
> + while (link)
> + {
> + rtx def_insn = DF_REF_INSN (link->ref);
> + insn_entry[INSN_UID (def_insn)].will_delete = 1;
> + link = link->next;
> + }
> + }
> + }
> +}
> +
> +/* OP is either a CONST_VECTOR or an expression containing one.
> + Swap the first half of the vector with the second in the first
> + case. Recurse to find it in the second. */
> +static void
> +swap_const_vector_halves (rtx op)
> +{
> + int i;
> + enum rtx_code code = GET_CODE (op);
> + if (GET_CODE (op) == CONST_VECTOR)
> + {
> + int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2;
> + for (i = 0; i < half_units; ++i)
> + {
> + rtx temp = CONST_VECTOR_ELT (op, i);
> + CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units);
> + CONST_VECTOR_ELT (op, i + half_units) = temp;
> + }
> + }
> + else
> + {
> + int j;
> + const char *fmt = GET_RTX_FORMAT (code);
> + for (i = 0; i < GET_RTX_LENGTH (code); ++i)
> + if (fmt[i] == 'e' || fmt[i] == 'u')
> + swap_const_vector_halves (XEXP (op, i));
> + else if (fmt[i] == 'E')
> + for (j = 0; j < XVECLEN (op, i); ++j)
> + swap_const_vector_halves (XVECEXP (op, i, j));
> + }
> +}
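> +
> +/* Example of the transformation above: a V4SI constant { 1, 2, 3, 4 }
> +   has half_units == 2 and becomes { 3, 4, 1, 2 }; the doublewords
> +   exchange places while the order within each doubleword is kept.  */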
> +
> +/* Find all subregs of a vector expression that perform a narrowing,
> + and adjust the subreg index to account for doubleword swapping. */
> +static void
> +adjust_subreg_index (rtx op)
> +{
> + enum rtx_code code = GET_CODE (op);
> + if (code == SUBREG
> + && (GET_MODE_SIZE (GET_MODE (op))
> + < GET_MODE_SIZE (GET_MODE (XEXP (op, 0)))))
> + {
> + unsigned int index = SUBREG_BYTE (op);
> + if (index < 8)
> + index += 8;
> + else
> + index -= 8;
> + SUBREG_BYTE (op) = index;
> + }
> +
> + const char *fmt = GET_RTX_FORMAT (code);
> + int i,j;
> + for (i = 0; i < GET_RTX_LENGTH (code); ++i)
> + if (fmt[i] == 'e' || fmt[i] == 'u')
> + adjust_subreg_index (XEXP (op, i));
> + else if (fmt[i] == 'E')
> + for (j = 0; j < XVECLEN (op, i); ++j)
> + adjust_subreg_index (XVECEXP (op, i, j));
> +}
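> +
> +/* For instance, a narrowing (subreg:DI (reg:V2DI R) 0) becomes
> +   (subreg:DI (reg:V2DI R) 8) once the doublewords are swapped, and
> +   vice versa; same-size subregs are left alone.  */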
> +
> +/* Convert the non-permuting load INSN to a permuting one. */
> +static void
> +permute_load (rtx insn)
> +{
> + rtx body = PATTERN (insn);
> + rtx mem_op = SET_SRC (body);
> + rtx tgt_reg = SET_DEST (body);
> + machine_mode mode = GET_MODE (tgt_reg);
> + int n_elts = GET_MODE_NUNITS (mode);
> + int half_elts = n_elts / 2;
> + rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
> + int i, j;
> + for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
> + XVECEXP (par, 0, i) = GEN_INT (j);
> + for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
> + XVECEXP (par, 0, i) = GEN_INT (j);
> + rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par);
> + SET_SRC (body) = sel;
> + INSN_CODE (insn) = -1; /* Force re-recognition. */
> + df_insn_rescan (insn);
> +
> + if (dump_file)
> + fprintf (dump_file, "Replacing load %d with permuted load\n",
> + INSN_UID (insn));
> +}
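> +
> +/* After this rewrite, a V4SI load (set (reg:V4SI R) (mem:V4SI A))
> +   has the form
> +
> +     (set (reg:V4SI R)
> +          (vec_select:V4SI (mem:V4SI A)
> +                           (parallel [2 3 0 1])))
> +
> +   i.e. it is recognized as a permuting load (R and A illustrative).  */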
> +
> +/* Convert the non-permuting store INSN to a permuting one. */
> +static void
> +permute_store (rtx insn)
> +{
> + rtx body = PATTERN (insn);
> + rtx src_reg = SET_SRC (body);
> + machine_mode mode = GET_MODE (src_reg);
> + int n_elts = GET_MODE_NUNITS (mode);
> + int half_elts = n_elts / 2;
> + rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
> + int i, j;
> + for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
> + XVECEXP (par, 0, i) = GEN_INT (j);
> + for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
> + XVECEXP (par, 0, i) = GEN_INT (j);
> + rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par);
> + SET_SRC (body) = sel;
> + INSN_CODE (insn) = -1; /* Force re-recognition. */
> + df_insn_rescan (insn);
> +
> + if (dump_file)
> + fprintf (dump_file, "Replacing store %d with permuted store\n",
> + INSN_UID (insn));
> +}
> +
> +/* Given INSN containing a vector extract operation, adjust the index
> + of the extracted lane to account for the doubleword swap. */
> +static void
> +adjust_extract (rtx insn)
> +{
> + rtx src = SET_SRC (PATTERN (insn));
> + /* The vec_select may be wrapped in a vec_duplicate for a splat, so
> + account for that. */
> + rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
> + rtx par = XEXP (sel, 1);
> + int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
> + int lane = INTVAL (XVECEXP (par, 0, 0));
> + lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
> + XVECEXP (par, 0, 0) = GEN_INT (lane);
> + INSN_CODE (insn) = -1; /* Force re-recognition. */
> + df_insn_rescan (insn);
> +
> + if (dump_file)
> + fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
> +}
> +
> +/* Given INSN containing a vector direct-splat operation, adjust the
> + index of the source lane to account for the doubleword swap. */
> +static void
> +adjust_splat (rtx insn)
> +{
> + rtx body = PATTERN (insn);
> + rtx unspec = XEXP (body, 1);
> + int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
> + int lane = INTVAL (XVECEXP (unspec, 0, 1));
> + lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
> + XVECEXP (unspec, 0, 1) = GEN_INT (lane);
> + INSN_CODE (insn) = -1; /* Force re-recognition. */
> + df_insn_rescan (insn);
> +
> + if (dump_file)
> + fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
> +}
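> +
> +/* The lane correction in the two routines above is the same:
> +   with V4SI (half_elts == 2), lane 0 maps to 2, 1 to 3, 2 to 0,
> +   and 3 to 1, mirroring the doubleword exchange.  */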
> +
> +/* The insn described by INSN_ENTRY[I] can be swapped, but only
> + with special handling. Take care of that here. */
> +static void
> +handle_special_swappables (swap_web_entry *insn_entry, unsigned i)
> +{
> + rtx insn = insn_entry[i].insn;
> + rtx body = PATTERN (insn);
> +
> + switch (insn_entry[i].special_handling)
> + {
> + default:
> + gcc_unreachable ();
> + case SH_CONST_VECTOR:
> + {
> + /* A CONST_VECTOR will only show up somewhere in the RHS of a SET. */
> + gcc_assert (GET_CODE (body) == SET);
> + rtx rhs = SET_SRC (body);
> + swap_const_vector_halves (rhs);
> + if (dump_file)
> + fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
> + break;
> + }
> + case SH_SUBREG:
> + /* A subreg of the same size is already safe. For subregs that
> + select a smaller portion of a reg, adjust the index for
> + swapped doublewords. */
> + adjust_subreg_index (body);
> + if (dump_file)
> + fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
> + break;
> + case SH_NOSWAP_LD:
> + /* Convert a non-permuting load to a permuting one. */
> + permute_load (insn);
> + break;
> + case SH_NOSWAP_ST:
> + /* Convert a non-permuting store to a permuting one. */
> + permute_store (insn);
> + break;
> + case SH_EXTRACT:
> + /* Change the lane on an extract operation. */
> + adjust_extract (insn);
> + break;
> + case SH_SPLAT:
> + /* Change the lane on a direct-splat operation. */
> + adjust_splat (insn);
> + break;
> + }
> +}
> +
> +/* Find the insn from the Ith table entry, which is known to be a
> + register swap Y = SWAP(X). Replace it with a copy Y = X. */
> +static void
> +replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i)
> +{
> + rtx insn = insn_entry[i].insn;
> + rtx body = PATTERN (insn);
> + rtx src_reg = XEXP (SET_SRC (body), 0);
> + rtx copy = gen_rtx_SET (VOIDmode, SET_DEST (body), src_reg);
> + rtx new_insn = emit_insn_before (copy, insn);
> + set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
> + df_insn_rescan (new_insn);
> +
> + if (dump_file)
> + {
> + unsigned int new_uid = INSN_UID (new_insn);
> + fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid);
> + }
> +
> + df_insn_delete (BLOCK_FOR_INSN (insn), INSN_UID (insn));
> + remove_insn (insn);
> + INSN_DELETED_P (insn) = 1;
> +}
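> +
> +/* Thus (set (reg:V2DI 64) (vec_select:V2DI (reg:V2DI 65)
> +   (parallel [1 0]))) becomes the plain copy
> +   (set (reg:V2DI 64) (reg:V2DI 65)); later forward propagation
> +   removes the copy where possible (register numbers arbitrary).  */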
> +
> +/* Dump the swap table to DUMP_FILE. */
> +static void
> +dump_swap_insn_table (swap_web_entry *insn_entry)
> +{
> + int e = get_max_uid ();
> + fprintf (dump_file, "\nRelevant insns with their flag settings\n\n");
> +
> + for (int i = 0; i < e; ++i)
> + if (insn_entry[i].is_relevant)
> + {
> + swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred ();
> + fprintf (dump_file, "%6d %6d ", i,
> + pred_entry && pred_entry->insn
> + ? INSN_UID (pred_entry->insn) : 0);
> + if (insn_entry[i].is_load)
> + fputs ("load ", dump_file);
> + if (insn_entry[i].is_store)
> + fputs ("store ", dump_file);
> + if (insn_entry[i].is_swap)
> + fputs ("swap ", dump_file);
> + if (insn_entry[i].is_live_in)
> + fputs ("live-in ", dump_file);
> + if (insn_entry[i].is_live_out)
> + fputs ("live-out ", dump_file);
> + if (insn_entry[i].contains_subreg)
> + fputs ("subreg ", dump_file);
> + if (insn_entry[i].is_128_int)
> + fputs ("int128 ", dump_file);
> + if (insn_entry[i].is_call)
> + fputs ("call ", dump_file);
> + if (insn_entry[i].is_swappable)
> + {
> + fputs ("swappable ", dump_file);
> + if (insn_entry[i].special_handling == SH_CONST_VECTOR)
> + fputs ("special:constvec ", dump_file);
> + else if (insn_entry[i].special_handling == SH_SUBREG)
> + fputs ("special:subreg ", dump_file);
> + else if (insn_entry[i].special_handling == SH_NOSWAP_LD)
> + fputs ("special:load ", dump_file);
> + else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
> + fputs ("special:store ", dump_file);
> + else if (insn_entry[i].special_handling == SH_EXTRACT)
> + fputs ("special:extract ", dump_file);
> + else if (insn_entry[i].special_handling == SH_SPLAT)
> + fputs ("special:splat ", dump_file);
> + }
> + if (insn_entry[i].web_not_optimizable)
> + fputs ("unoptimizable ", dump_file);
> + if (insn_entry[i].will_delete)
> + fputs ("delete ", dump_file);
> + fputs ("\n", dump_file);
> + }
> + fputs ("\n", dump_file);
> +}
> +
> +/* Main entry point for this pass. */
> +unsigned int
> +rs6000_analyze_swaps (function *fun)
> +{
> + swap_web_entry *insn_entry;
> + basic_block bb;
> + rtx insn;
> +
> + /* Dataflow analysis for use-def chains. */
> + df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
> + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
> + df_analyze ();
> + df_set_flags (DF_DEFER_INSN_RESCAN);
> +
> + /* Allocate structure to represent webs of insns. */
> + insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
> +
> + /* Walk the insns to gather basic data. */
> + FOR_ALL_BB_FN (bb, fun)
> + FOR_BB_INSNS (bb, insn)
> + {
> + unsigned int uid = INSN_UID (insn);
> + if (NONDEBUG_INSN_P (insn))
> + {
> + insn_entry[uid].insn = insn;
> +
> + if (GET_CODE (insn) == CALL_INSN)
> + insn_entry[uid].is_call = 1;
> +
> + /* Walk the uses and defs to see if we mention vector regs.
> + Record any constraints on optimization of such mentions. */
> + df_ref *use_rec;
> + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
> + {
> + df_ref mention = *use_rec;
> + /* We use DF_REF_REAL_REG here to get inside any subregs. */
> + machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
> +
> + /* If a use gets its value from a call insn, it will be
> + a hard register and will look like (reg:V4SI 3 3).
> + The df analysis creates two mentions for GPR3 and GPR4,
> + both DImode. We must recognize this and treat it as a
> + vector mention to ensure the call is unioned with this
> + use. */
> + if (mode == DImode && DF_REF_INSN_INFO (mention))
> + {
> + rtx feeder = DF_REF_INSN (mention);
> + /* FIXME: It is pretty hard to get from the df mention
> + to the mode of the use in the insn. We arbitrarily
> + pick a vector mode here, even though the use might
> + be a real DImode. We can be too conservative
> + (create a web larger than necessary) because of
> + this, so consider eventually fixing this. */
> + if (GET_CODE (feeder) == CALL_INSN)
> + mode = V4SImode;
> + }
> +
> + if (VECTOR_MODE_P (mode) || mode == TImode)
> + {
> + insn_entry[uid].is_relevant = 1;
> + if (mode == TImode || mode == V1TImode)
> + insn_entry[uid].is_128_int = 1;
> + if (DF_REF_INSN_INFO (mention))
> + insn_entry[uid].contains_subreg
> + = !rtx_equal_p (DF_REF_REG (mention),
> + DF_REF_REAL_REG (mention));
> + union_defs (insn_entry, insn, mention);
> + }
> + }
> + df_ref *def_rec;
> + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
> + {
> + df_ref mention = *def_rec;
> + /* We use DF_REF_REAL_REG here to get inside any subregs. */
> + machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
> +
> + /* If we're loading up a hard vector register for a call,
> + it looks like (set (reg:V4SI 9 9) (...)). The df
> + analysis creates two mentions for GPR9 and GPR10, both
> + DImode. So relying on the mode from the mentions
> + isn't sufficient to ensure we union the call into the
> + web with the parameter setup code. */
> + if (mode == DImode && GET_CODE (PATTERN (insn)) == SET
> + && VECTOR_MODE_P (GET_MODE (SET_DEST (PATTERN (insn)))))
> + mode = GET_MODE (SET_DEST (PATTERN (insn)));
> +
> + if (VECTOR_MODE_P (mode) || mode == TImode)
> + {
> + insn_entry[uid].is_relevant = 1;
> + if (mode == TImode || mode == V1TImode)
> + insn_entry[uid].is_128_int = 1;
> + if (DF_REF_INSN_INFO (mention))
> + insn_entry[uid].contains_subreg
> + = !rtx_equal_p (DF_REF_REG (mention),
> + DF_REF_REAL_REG (mention));
> + /* REG_FUNCTION_VALUE_P is not valid for subregs. */
> + else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention)))
> + insn_entry[uid].is_live_out = 1;
> + union_uses (insn_entry, insn, mention);
> + }
> + }
> +
> + if (insn_entry[uid].is_relevant)
> + {
> + /* Determine if this is a load or store. */
> + insn_entry[uid].is_load = insn_is_load_p (insn);
> + insn_entry[uid].is_store = insn_is_store_p (insn);
> +
> + /* Determine if this is a doubleword swap. If not,
> + determine whether it can legally be swapped. */
> + if (insn_is_swap_p (insn))
> + insn_entry[uid].is_swap = 1;
> + else
> + {
> + unsigned int special = SH_NONE;
> + insn_entry[uid].is_swappable
> + = insn_is_swappable_p (insn_entry, insn, &special);
> + if (special != SH_NONE && insn_entry[uid].contains_subreg)
> + insn_entry[uid].is_swappable = 0;
> + else if (special != SH_NONE)
> + insn_entry[uid].special_handling = special;
> + else if (insn_entry[uid].contains_subreg)
> + insn_entry[uid].special_handling = SH_SUBREG;
> + }
> + }
> + }
> + }
> +
> + if (dump_file)
> + {
> + fprintf (dump_file, "\nSwap insn entry table when first built\n");
> + dump_swap_insn_table (insn_entry);
> + }
> +
> + /* Record unoptimizable webs. */
> + unsigned e = get_max_uid (), i;
> + for (i = 0; i < e; ++i)
> + {
> + if (!insn_entry[i].is_relevant)
> + continue;
> +
> + swap_web_entry *root
> + = (swap_web_entry*)(&insn_entry[i])->unionfind_root ();
> + unsigned uid = INSN_UID (insn_entry[i].insn);
> +
> + if (insn_entry[i].is_live_in || insn_entry[i].is_live_out
> + || (insn_entry[i].contains_subreg
> + && insn_entry[i].special_handling != SH_SUBREG)
> + || insn_entry[i].is_128_int || insn_entry[i].is_call
> + || !(insn_entry[i].is_swappable || insn_entry[i].is_swap))
> + root->web_not_optimizable = 1;
> +
> + /* If we have loads or stores that aren't permuting then the
> + optimization isn't appropriate. */
> + else if ((insn_entry[i].is_load || insn_entry[i].is_store)
> + && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
> + root->web_not_optimizable = 1;
> +
> + /* If we have permuting loads or stores that are not accompanied
> + by a register swap, the optimization isn't appropriate. */
> + else if (insn_entry[i].is_load && insn_entry[i].is_swap)
> + {
> + df_ref *def_rec;
> +
> + for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
> + {
> + df_ref def = *def_rec;
> + struct df_link *link = DF_REF_CHAIN (def);
> +
> + if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS))
> + {
> + root->web_not_optimizable = 1;
> + break;
> + }
> + }
> + }
> + else if (insn_entry[i].is_store && insn_entry[i].is_swap)
> + {
> + df_ref *use_rec;
> +
> + for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
> + {
> + df_ref use = *use_rec;
> + struct df_link *link = DF_REF_CHAIN (use);
> +
> + if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES))
> + {
> + root->web_not_optimizable = 1;
> + break;
> + }
> + }
> + }
> + }
> +
> + if (dump_file)
> + {
> + fprintf (dump_file, "\nSwap insn entry table after web analysis\n");
> + dump_swap_insn_table (insn_entry);
> + }
> +
> + /* For each load and store in an optimizable web (which implies
> + the loads and stores are permuting), find the associated
> + register swaps and mark them for removal. Due to various
> + optimizations we may mark the same swap more than once. Also
> + perform special handling for swappable insns that require it. */
> + for (i = 0; i < e; ++i)
> + if ((insn_entry[i].is_load || insn_entry[i].is_store)
> + && insn_entry[i].is_swap)
> + {
> + swap_web_entry* root_entry
> + = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
> + if (!root_entry->web_not_optimizable)
> + mark_swaps_for_removal (insn_entry, i);
> + }
> + else if (insn_entry[i].is_swappable && insn_entry[i].special_handling)
> + {
> + swap_web_entry* root_entry
> + = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
> + if (!root_entry->web_not_optimizable)
> + handle_special_swappables (insn_entry, i);
> + }
> +
> + /* Now delete the swaps marked for removal. */
> + for (i = 0; i < e; ++i)
> + if (insn_entry[i].will_delete)
> + replace_swap_with_copy (insn_entry, i);
> +
> + /* Clean up. */
> + free (insn_entry);
> + return 0;
> +}
> +
>
> struct gcc_target targetm = TARGET_INITIALIZER;
>
> Index: gcc/config/rs6000/rs6000.opt
> ===================================================================
> --- gcc/config/rs6000/rs6000.opt (revision 221696)
> +++ gcc/config/rs6000/rs6000.opt (working copy)
> @@ -585,3 +585,7 @@ Allow double variables in upper registers with -mc
> mupper-regs-sf
> Target Undocumented Mask(UPPER_REGS_SF) Var(rs6000_isa_flags)
> Allow float variables in upper registers with -mcpu=power8 or -mp8-vector
> +
> +moptimize-swaps
> +Target Undocumented Var(rs6000_optimize_swaps) Init(1) Save
> +Analyze and remove doubleword swaps from VSX computations.
> Index: gcc/df.h
> ===================================================================
> --- gcc/df.h (revision 221696)
> +++ gcc/df.h (working copy)
> @@ -1132,20 +1132,22 @@ df_get_artificial_uses (unsigned int bb_index)
>
> /* web */
>
> -/* This entry is allocated for each reference in the insn stream. */
> -struct web_entry
> +class web_entry_base
> {
> - /* Pointer to the parent in the union/find tree. */
> - struct web_entry *pred;
> - /* Newly assigned register to the entry. Set only for roots. */
> - rtx reg;
> - void* extra_info;
> + private:
> + /* Reference to the parent in the union/find tree. */
> + web_entry_base *pred_pvt;
> +
> + public:
> + /* Accessors. */
> + web_entry_base *pred () { return pred_pvt; }
> + void set_pred (web_entry_base *p) { pred_pvt = p; }
> +
> + /* Find representative in union-find tree. */
> + web_entry_base *unionfind_root ();
> +
> + /* Union with another set, returning TRUE if they are already unioned. */
> + friend bool unionfind_union (web_entry_base *first, web_entry_base *second);
> };
>
> -extern struct web_entry *unionfind_root (struct web_entry *);
> -extern bool unionfind_union (struct web_entry *, struct web_entry *);
> -extern void union_defs (df_ref, struct web_entry *,
> - unsigned int *used, struct web_entry *,
> - bool (*fun) (struct web_entry *, struct web_entry *));
> -
> #endif /* GCC_DF_H */
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c (working copy)
> @@ -0,0 +1,35 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +
> +void abort();
> +
> +#define N 16
> +
> +signed char ca[N] __attribute__((aligned(16)));
> +signed char cb[] __attribute__((aligned(16)))
> + = {8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7};
> +signed char cc[] __attribute__((aligned(16)))
> + = {1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2};
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + for (i = 0; i < N; i++) {
> + ca[i] = cb[i] - cc[i];
> + }
> +}
> +
> +int main ()
> +{
> + signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5};
> + int i;
> + foo ();
> + for (i = 0; i < N; ++i)
> + if (ca[i] != cd[i])
> + abort ();
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c (working copy)
> @@ -0,0 +1,42 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + for (i = 0; i < N; i++) {
> + ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3;
> + }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> + int i;
> + for (i = 0; i < N; ++i) {
> + cb[i] = 3 * i - 2048;
> + cc[i] = -5 * i + 93;
> + cd[i] = i % 2 ? 1 : -1;
> + }
> +}
> +
> +int main ()
> +{
> + int i;
> + init ();
> + foo ();
> + for (i = 0; i < N; ++i)
> + if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3)
> + abort ();
> + else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3)
> + abort ();
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c (working copy)
> @@ -0,0 +1,53 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +#include <altivec.h>
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +int hey;
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + vector int va, vb, vc, vd, tmp;
> + vector unsigned int threes = vec_splat_u32(3);
> + for (i = 0; i < N; i+=4) {
> + vb = vec_vsx_ld (0, &cb[i]);
> + vc = vec_vsx_ld (0, &cc[i]);
> + vd = vec_vsx_ld (0, &cd[i]);
> + tmp = vec_add (vb, vc);
> + tmp = vec_sub (tmp, vd);
> + tmp = vec_sra (tmp, threes);
> + hey = tmp[3];
> + vec_vsx_st (tmp, 0, &ca[i]);
> + }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> + int i;
> + for (i = 0; i < N; ++i) {
> + cb[i] = 3 * i - 2048;
> + cc[i] = -5 * i + 93;
> + cd[i] = i + 14;
> + }
> +}
> +
> +int main ()
> +{
> + int i;
> + init ();
> + foo ();
> + for (i = 0; i < N; ++i)
> + if (ca[i] != (-3 * i - 1969) >> 3)
> + abort ();
> + if (hey != ca[N-1])
> + abort ();
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c (working copy)
> @@ -0,0 +1,56 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +
> +#include "altivec.h"
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +int hey;
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + vector int va, vb, vc, vd, tmp;
> + vector unsigned int threes = vec_splat_u32(3);
> + for (i = 0; i < N; i+=4) {
> + vb = vec_vsx_ld (0, &cb[i]);
> + vc = vec_vsx_ld (0, &cc[i]);
> + vd = vec_vsx_ld (0, &cd[i]);
> + tmp = vec_add (vb, vc);
> + tmp = vec_sub (tmp, vd);
> + tmp = vec_sra (tmp, threes);
> + hey = tmp[3];
> + vec_vsx_st (tmp, 0, &ca[i]);
> + }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> + int i;
> + for (i = 0; i < N; ++i) {
> + cb[i] = 3 * i - 2048;
> + cc[i] = -5 * i + 93;
> + cd[i] = i + 14;
> + }
> +}
> +
> +int main ()
> +{
> + int i;
> + init ();
> + foo ();
> + for (i = 0; i < N; ++i)
> + if (ca[i] != (-3 * i - 1969) >> 3)
> + abort ();
> + if (hey != ca[N-1])
> + abort ();
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c (working copy)
> @@ -0,0 +1,54 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +#include <altivec.h>
> +void abort ();
> +
> +#define N 4096
> +long long ca[N] __attribute__((aligned(16)));
> +long long cb[N] __attribute__((aligned(16)));
> +long long cc[N] __attribute__((aligned(16)));
> +long long cd[N] __attribute__((aligned(16)));
> +long long x;
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + vector long long va, vb, vc, vd, tmp;
> + volatile unsigned long long three = 3;
> + vector unsigned long long threes = vec_splats (three);
> + for (i = 0; i < N; i+=2) {
> + vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
> + vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
> + vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
> + tmp = vec_add (vb, vc);
> + tmp = vec_sub (tmp, vd);
> + tmp = vec_sra (tmp, threes);
> + x = vec_extract (tmp, 0);
> + vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
> + }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> + int i;
> + for (i = 0; i < N; ++i) {
> + cb[i] = 3 * i - 2048;
> + cc[i] = -5 * i + 93;
> + cd[i] = i + 14;
> + }
> +}
> +
> +int main ()
> +{
> + int i;
> + init ();
> + foo ();
> + for (i = 0; i < N; ++i)
> + if (ca[i] != (-3 * i - 1969) >> 3)
> + abort ();
> + if (x != ca[N-1])
> + abort ();
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c (working copy)
> @@ -0,0 +1,51 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler "xxspltw" } } */
> +
> +/* Currently the analyze_swaps phase cannot optimize this loop because
> + of the presence of an UNSPEC_VSX_CVDPSPN. At such time as this is
> + handled, we need to add a 'scan-assembler-not "xxpermdi"' directive to
> + this test. */
> +#include <altivec.h>
> +void abort();
> +
> +#define N 4096
> +#define M 10000000
> +vector float ca[N][4] = {0};
> +vector float cb[N][4] = {0};
> +vector float cc[N][4] = {0};
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + for (i = 0; i < N; i++) {
> + cc[i][0] = vec_mul(vec_splats(cb[i][0][0]), ca[i][0]);
> + cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][1]), ca[i][1]);
> + cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][2]), ca[i][2]);
> + cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][3]), ca[i][3]);
> +
> + cc[i][1] = vec_mul(vec_splats(cb[i][1][0]), ca[i][0]);
> + cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][1]), ca[i][1]);
> + cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][2]), ca[i][2]);
> + cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][3]), ca[i][3]);
> +
> + cc[i][2] = vec_mul(vec_splats(cb[i][2][0]), ca[i][0]);
> + cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][1]), ca[i][1]);
> + cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][2]), ca[i][2]);
> + cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][3]), ca[i][3]);
> +
> + cc[i][3] = vec_mul(vec_splats(cb[i][3][0]), ca[i][0]);
> + cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][1]), ca[i][1]);
> + cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][2]), ca[i][2]);
> + cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][3]), ca[i][3]);
> + }
> +}
> +
> +int main ()
> +{
> + foo ();
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c (working copy)
> @@ -0,0 +1,15 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O1" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "xxpermdi" } } */
> +
> +/* Verify that we don't try to do permute removal in the presence of
> + vec_ste. This used to ICE. */
> +#include <altivec.h>
> +
> +void f (void *p)
> +{
> + vector unsigned int u32 = vec_vsx_ld (1, (const unsigned int *)p);
> + vec_ste (u32, 1, (unsigned int *)p);
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c (working copy)
> @@ -0,0 +1,43 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +
> +void abort ();
> +
> +#define N 4096
> +signed char ca[N] __attribute__((aligned(16)));
> +signed char cb[N] __attribute__((aligned(16)));
> +signed char cc[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + for (i = 0; i < N; i++) {
> + ca[i] = cb[i] - cc[i];
> + }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> + int i, ii;
> + for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) {
> + cb[i] = ii - 128;
> + cc[i] = ii/2 - 64;
> + }
> +}
> +
> +int main ()
> +{
> + int i, ii;
> + init ();
> + foo ();
> + for (i = 0; i < N; ++i) {
> + ii = i % 128;
> + if (ca[i] != ii - ii/2 - 64)
> + abort ();
> + }
> + return 0;
> +}
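
Aside on the scan-assembler patterns used by this and the following
tests: on little-endian POWER8, lxvd2x and stxvd2x access the two
doublewords in big-endian order, so the compiler normally pairs each
one with an xxpermdi to correct the element order.  When a web is
closed and nothing in it observes element numbering, as in the loop
above, the swaps cancel in pairs and the pass deletes them, which is
why the tests expect lxvd2x/stxvd2x but no xxpermdi.  The cancellation
is sound because the doubleword swap is an involution; a small
illustration, not from the patch:

#include <altivec.h>

/* The permutation performed by the lxvd2x/stxvd2x fixup: swap the
   two doublewords.  Applying it twice is the identity, so paired
   swaps in a closed web can simply be deleted.  */
vector unsigned char
swap_doublewords (vector unsigned char v)
{
  vector unsigned char sel =
    { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
  return vec_perm (v, v, sel);
}
/* swap_doublewords (swap_doublewords (v)) == v for every v.  */
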
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c (working copy)
> @@ -0,0 +1,45 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + for (i = 0; i < N; i++) {
> + ca[i] = (cb[i] + cc[i]) * cd[i];
> + }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> + int i;
> + for (i = 0; i < N; ++i) {
> + cb[i] = 3 * i - 2048;
> + cc[i] = -5 * i + 93;
> + cd[i] = i % 2 ? 1 : -1;
> + }
> +}
> +
> +int main ()
> +{
> + int i;
> + init ();
> + foo ();
> + for (i = 0; i < N; ++i)
> + if (i % 2 == 1 && ca[i] != -2 * i - 1955)
> + abort ();
> + else if (i % 2 == 0 && ca[i] != 1955 + 2 * i)
> + abort ();
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c (working copy)
> @@ -0,0 +1,45 @@
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +/* { dg-final { scan-assembler "lxvd2x" } } */
> +/* { dg-final { scan-assembler "stxvd2x" } } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + for (i = 0; i < N; i++) {
> + ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3;
> + }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> + int i;
> + for (i = 0; i < N; ++i) {
> + cb[i] = 3 * i - 2048;
> + cc[i] = -5 * i + 93;
> + cd[i] = i % 2 ? 1 : -1;
> + }
> +}
> +
> +int main ()
> +{
> + int i;
> + init ();
> + foo ();
> + for (i = 0; i < N; ++i)
> + if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3)
> + abort ();
> + else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3)
> + abort ();
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c (working copy)
> @@ -0,0 +1,32 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +void abort ();
> +
> +#define N 16
> +
> +signed char ca[N] __attribute__((aligned(16)));
> +signed char cb[] __attribute__((aligned(16)))
> + = {8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7};
> +signed char cc[] __attribute__((aligned(16)))
> + = {1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2};
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + for (i = 0; i < N; i++) {
> + ca[i] = cb[i] - cc[i];
> + }
> +}
> +
> +int main ()
> +{
> + signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5};
> + int i;
> + foo ();
> + for (i = 0; i < N; ++i)
> + if (ca[i] != cd[i])
> + abort ();
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c (working copy)
> @@ -0,0 +1,38 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +void abort ();
> +
> +#define N 256
> +signed char ca[N] __attribute__((aligned(16)));
> +signed char cb[N] __attribute__((aligned(16)));
> +signed char cc[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + for (i = 0; i < N; i++) {
> + ca[i] = cb[i] - cc[i];
> + }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> + int i;
> + for (i = 0; i < N; ++i) {
> + cb[i] = i - 128;
> + cc[i] = i/2 - 64;
> + }
> +}
> +
> +int main ()
> +{
> + int i;
> + init ();
> + foo ();
> + for (i = 0; i < N; ++i)
> + if (ca[i] != i - i/2 - 64)
> + abort ();
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c (working copy)
> @@ -0,0 +1,40 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +void abort ();
> +
> +#define N 4096
> +signed char ca[N] __attribute__((aligned(16)));
> +signed char cb[N] __attribute__((aligned(16)));
> +signed char cc[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + for (i = 0; i < N; i++) {
> + ca[i] = cb[i] - cc[i];
> + }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> + int i, ii;
> + for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) {
> + cb[i] = ii - 128;
> + cc[i] = ii/2 - 64;
> + }
> +}
> +
> +int main ()
> +{
> + int i, ii;
> + init ();
> + foo ();
> + for (i = 0; i < N; ++i) {
> + ii = i % 128;
> + if (ca[i] != ii - ii/2 - 64)
> + abort ();
> + }
> + return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c (working copy)
> @@ -0,0 +1,42 @@
> +/* { dg-do run { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3" } */
> +
> +void abort ();
> +
> +#define N 4096
> +int ca[N] __attribute__((aligned(16)));
> +int cb[N] __attribute__((aligned(16)));
> +int cc[N] __attribute__((aligned(16)));
> +int cd[N] __attribute__((aligned(16)));
> +
> +__attribute__((noinline)) void foo ()
> +{
> + int i;
> + for (i = 0; i < N; i++) {
> + ca[i] = (cb[i] + cc[i]) * cd[i];
> + }
> +}
> +
> +__attribute__((noinline)) void init ()
> +{
> + int i;
> + for (i = 0; i < N; ++i) {
> + cb[i] = 3 * i - 2048;
> + cc[i] = -5 * i + 93;
> + cd[i] = i % 2 ? 1 : -1;
> + }
> +}
> +
> +int main ()
> +{
> + int i;
> + init ();
> + foo ();
> + for (i = 0; i < N; ++i)
> + if (i % 2 == 1 && ca[i] != -2 * i - 1955)
> + abort ();
> + else if (i % 2 == 0 && ca[i] != 1955 + 2 * i)
> + abort ();
> + return 0;
> +}
> Index: gcc/web.c
> ===================================================================
> --- gcc/web.c (revision 221696)
> +++ gcc/web.c (working copy)
> @@ -53,17 +53,17 @@ along with GCC; see the file COPYING3. If not see
>
> /* Find the root of unionfind tree (the representative of set). */
>
> -struct web_entry *
> -unionfind_root (struct web_entry *element)
> +web_entry_base *
> +web_entry_base::unionfind_root ()
> {
> - struct web_entry *element1 = element, *element2;
> + web_entry_base *element = this, *element1 = this, *element2;
>
> - while (element->pred)
> - element = element->pred;
> - while (element1->pred)
> + while (element->pred ())
> + element = element->pred ();
> + while (element1->pred ())
> {
> - element2 = element1->pred;
> - element1->pred = element;
> + element2 = element1->pred ();
> + element1->set_pred (element);
> element1 = element2;
> }
> return element;
> @@ -74,23 +74,32 @@ along with GCC; see the file COPYING3. If not see
> nothing is done. Otherwise, return false. */
>
> bool
> -unionfind_union (struct web_entry *first, struct web_entry *second)
> +unionfind_union (web_entry_base *first, web_entry_base *second)
> {
> - first = unionfind_root (first);
> - second = unionfind_root (second);
> + first = first->unionfind_root ();
> + second = second->unionfind_root ();
> if (first == second)
> return true;
> - second->pred = first;
> + second->set_pred (first);
> return false;
> }
>
> +class web_entry : public web_entry_base
> +{
> + private:
> + rtx reg_pvt;
> +
> + public:
> + rtx reg () { return reg_pvt; }
> + void set_reg (rtx r) { reg_pvt = r; }
> +};
> +
> /* For INSN, union all defs and uses that are linked by match_dup.
> FUN is the function that does the union. */
>
> static void
> -union_match_dups (rtx insn, struct web_entry *def_entry,
> - struct web_entry *use_entry,
> - bool (*fun) (struct web_entry *, struct web_entry *))
> +union_match_dups (rtx insn, web_entry *def_entry, web_entry *use_entry,
> + bool (*fun) (web_entry_base *, web_entry_base *))
> {
> struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
> df_ref *use_link = DF_INSN_INFO_USES (insn_info);
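
Aside for readers less familiar with web.c: the converted routines
above are a standard union-find with path compression.  unionfind_root
walks the pred links to the representative and then re-points every
node on the walked path directly at it; unionfind_union links one
representative beneath the other and reports whether the two entries
were already in the same set.  A self-contained sketch of the same
structure outside GCC, with invented names:

#include <assert.h>
#include <stddef.h>

struct node { struct node *pred; };   /* null pred marks a root */

static struct node *
find_root (struct node *e)
{
  struct node *root = e, *next;
  while (root->pred)                  /* walk to the representative */
    root = root->pred;
  while (e->pred)                     /* compress the walked path */
    {
      next = e->pred;
      e->pred = root;
      e = next;
    }
  return root;
}

/* Union the sets containing a and b; return 1 if they were already
   unioned, mirroring unionfind_union above.  */
static int
unite (struct node *a, struct node *b)
{
  a = find_root (a);
  b = find_root (b);
  if (a == b)
    return 1;
  b->pred = a;
  return 0;
}

int
main (void)
{
  struct node n[4] = { { NULL }, { NULL }, { NULL }, { NULL } };
  unite (&n[0], &n[1]);
  unite (&n[2], &n[3]);
  assert (find_root (&n[1]) != find_root (&n[3]));
  unite (&n[1], &n[3]);
  assert (find_root (&n[0]) == find_root (&n[2]));
  return 0;
}
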
> @@ -157,9 +166,9 @@ static void
> the values 0 and 1 are reserved for use by entry_register. */
>
> void
> -union_defs (df_ref use, struct web_entry *def_entry,
> - unsigned int *used, struct web_entry *use_entry,
> - bool (*fun) (struct web_entry *, struct web_entry *))
> +union_defs (df_ref use, web_entry *def_entry,
> + unsigned int *used, web_entry *use_entry,
> + bool (*fun) (web_entry_base *, web_entry_base *))
> {
> struct df_insn_info *insn_info = DF_REF_INSN_INFO (use);
> struct df_link *link = DF_REF_CHAIN (use);
> @@ -260,15 +269,15 @@ void
> /* Find the corresponding register for the given entry. */
>
> static rtx
> -entry_register (struct web_entry *entry, df_ref ref, unsigned int *used)
> +entry_register (web_entry *entry, df_ref ref, unsigned int *used)
> {
> - struct web_entry *root;
> + web_entry *root;
> rtx reg, newreg;
>
> /* Find the corresponding web and see if it has been visited. */
> - root = unionfind_root (entry);
> - if (root->reg)
> - return root->reg;
> + root = (web_entry *)entry->unionfind_root ();
> + if (root->reg ())
> + return root->reg ();
>
> /* We are seeing this web for the first time, do the assignment. */
> reg = DF_REF_REAL_REG (ref);
> @@ -292,7 +301,7 @@ static rtx
> REGNO (newreg));
> }
>
> - root->reg = newreg;
> + root->set_reg (newreg);
> return newreg;
> }
>
> @@ -326,8 +335,8 @@ gate_handle_web (void)
> static unsigned int
> web_main (void)
> {
> - struct web_entry *def_entry;
> - struct web_entry *use_entry;
> + web_entry *def_entry;
> + web_entry *use_entry;
> unsigned int max = max_reg_num ();
> unsigned int *used;
> basic_block bb;
> @@ -364,9 +373,9 @@ web_main (void)
> }
>
> /* Record the number of uses and defs at the beginning of the
> optimization. */
> - def_entry = XCNEWVEC (struct web_entry, DF_DEFS_TABLE_SIZE());
> + def_entry = XCNEWVEC (web_entry, DF_DEFS_TABLE_SIZE());
> used = XCNEWVEC (unsigned, max);
> - use_entry = XCNEWVEC (struct web_entry, uses_num);
> + use_entry = XCNEWVEC (web_entry, uses_num);
>
> /* Produce the web. */
> FOR_ALL_BB (bb)
>
>