CC: james.greenha...@arm.com
CC: kyrylo.tkac...@foss.arm.com
---
gcc/config.gcc | 2 +-
gcc/config/aarch64/aarch64-passes.def | 1 +
gcc/config/aarch64/aarch64-protos.h | 49 +
gcc/config/aarch64/aarch64-tuning-flags.def | 2 +
gcc/config/aarch64/aarch64.c | 48 +-
.../aarch64/falkor-tag-collision-avoidance.c | 857 ++++++++++++++++++
gcc/config/aarch64/t-aarch64 | 9 +
7 files changed, 922 insertions(+), 46 deletions(-)
create mode 100644 gcc/config/aarch64/falkor-tag-collision-avoidance.c
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 63162aab676..c66dda0770e 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -304,7 +304,7 @@ aarch64*-*-*)
extra_headers="arm_fp16.h arm_neon.h arm_acle.h"
c_target_objs="aarch64-c.o"
cxx_target_objs="aarch64-c.o"
- extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o"
+ extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o
falkor-tag-collision-avoidance.o"
target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c"
target_has_targetm_common=yes
;;
diff --git a/gcc/config/aarch64/aarch64-passes.def
b/gcc/config/aarch64/aarch64-passes.def
index 87747b420b0..f61a8870aa1 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -19,3 +19,4 @@
<http://www.gnu.org/licenses/>. */
INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering);
+INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
diff --git a/gcc/config/aarch64/aarch64-protos.h
b/gcc/config/aarch64/aarch64-protos.h
index 87c6ae20278..0a4558c2023 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -283,6 +283,49 @@ struct tune_params
const struct cpu_prefetch_tune *prefetch;
};
+/* Classifies an address.
+
+ ADDRESS_REG_IMM
+ A simple base register plus immediate offset.
+
+ ADDRESS_REG_WB
+ A base register indexed by immediate offset with writeback.
+
+ ADDRESS_REG_REG
+ A base register indexed by (optionally scaled) register.
+
+ ADDRESS_REG_UXTW
+ A base register indexed by (optionally scaled) zero-extended register.
+
+ ADDRESS_REG_SXTW
+ A base register indexed by (optionally scaled) sign-extended register.
+
+ ADDRESS_LO_SUM
+ A LO_SUM rtx with a base register and "LO12" symbol relocation.
+
+ ADDRESS_SYMBOLIC:
+ A constant symbolic address, in pc-relative literal pool. */
+
+enum aarch64_address_type {
+ ADDRESS_REG_IMM,
+ ADDRESS_REG_WB,
+ ADDRESS_REG_REG,
+ ADDRESS_REG_UXTW,
+ ADDRESS_REG_SXTW,
+ ADDRESS_LO_SUM,
+ ADDRESS_SYMBOLIC
+};
+
+/* Address information. */
+struct aarch64_address_info {
+ enum aarch64_address_type type;
+ rtx base;
+ rtx offset;
+ poly_int64 const_offset;
+ int shift;
+ enum aarch64_symbol_type symbol_type;
+};
+
#define AARCH64_FUSION_PAIR(x, name) \
AARCH64_FUSE_##name##_index,
/* Supported fusion operations. */
@@ -546,6 +589,11 @@ void aarch64_swap_ldrstr_operands (rtx *, bool);
extern void aarch64_asm_output_pool_epilogue (FILE *, const char *,
tree, HOST_WIDE_INT);
+
+extern bool aarch64_classify_address (struct aarch64_address_info *, rtx,
+ machine_mode, bool,
+ aarch64_addr_query_type = ADDR_QUERY_M);
+
/* Defined in common/config/aarch64-common.c. */
bool aarch64_handle_option (struct gcc_options *, struct gcc_options *,
const struct cl_decoded_option *, location_t);
@@ -556,6 +604,7 @@ std::string aarch64_get_extension_string_for_isa_flags
(unsigned long,
unsigned long);
rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
+rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *ctxt);
poly_uint64 aarch64_regmode_natural_size (machine_mode);
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index fb9700ca42f..88a2b25ab43 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -44,4 +44,6 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend",
CHEAP_SHIFT_EXTEND)
/* Disallow load/store pair instructions on Q-registers. */
AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
+AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
+
#undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b88e7cac27a..31dc58b7cb5 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -78,48 +78,6 @@
/* Defined for convenience. */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
-/* Classifies an address.
-
- ADDRESS_REG_IMM
- A simple base register plus immediate offset.
-
- ADDRESS_REG_WB
- A base register indexed by immediate offset with writeback.
-
- ADDRESS_REG_REG
- A base register indexed by (optionally scaled) register.
-
- ADDRESS_REG_UXTW
- A base register indexed by (optionally scaled) zero-extended register.
-
- ADDRESS_REG_SXTW
- A base register indexed by (optionally scaled) sign-extended register.
-
- ADDRESS_LO_SUM
- A LO_SUM rtx with a base register and "LO12" symbol relocation.
-
- ADDRESS_SYMBOLIC:
- A constant symbolic address, in pc-relative literal pool. */
-
-enum aarch64_address_type {
- ADDRESS_REG_IMM,
- ADDRESS_REG_WB,
- ADDRESS_REG_REG,
- ADDRESS_REG_UXTW,
- ADDRESS_REG_SXTW,
- ADDRESS_LO_SUM,
- ADDRESS_SYMBOLIC
-};
-
-struct aarch64_address_info {
- enum aarch64_address_type type;
- rtx base;
- rtx offset;
- poly_int64 const_offset;
- int shift;
- enum aarch64_symbol_type symbol_type;
-};
-
/* Information about a legitimate vector immediate operand. */
struct simd_immediate_info
{
@@ -906,7 +864,7 @@ static const struct tune_params qdf24xx_tunings =
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
&qdf24xx_prefetch_tune
};
@@ -5712,10 +5670,10 @@ virt_or_elim_regno_p (unsigned regno)
If it is, fill in INFO appropriately. STRICT_P is true if
REG_OK_STRICT is in effect. */
-static bool
+bool
aarch64_classify_address (struct aarch64_address_info *info,
rtx x, machine_mode mode, bool strict_p,
- aarch64_addr_query_type type = ADDR_QUERY_M)
+ aarch64_addr_query_type type)
{
enum rtx_code code = GET_CODE (x);
rtx op0, op1;
diff --git a/gcc/config/aarch64/falkor-tag-collision-avoidance.c
b/gcc/config/aarch64/falkor-tag-collision-avoidance.c
new file mode 100644
index 00000000000..49b517cc1c7
--- /dev/null
+++ b/gcc/config/aarch64/falkor-tag-collision-avoidance.c
@@ -0,0 +1,857 @@
+/* Tag Collision Avoidance pass for Falkor.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#define IN_TARGET_CODE 1
+
+#include "config.h"
+#define INCLUDE_LIST
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "rtl.h"
+#include "tree.h"
+#include "tree-pass.h"
+#include "aarch64-protos.h"
+#include "hash-map.h"
+#include "cfgloop.h"
+#include "cfgrtl.h"
+#include "rtl-iter.h"
+#include "df.h"
+#include "memmodel.h"
+#include "optabs.h"
+#include "regs.h"
+#include "recog.h"
+#include "regrename.h"
+#include "print-rtl.h"
+
+/* The Falkor hardware prefetching system uses the encoding of the registers
+ and offsets of loads to decide which of the multiple hardware prefetchers to
+ assign the load to. This has the positive effect of accelerating prefetches
+ when all related loads with uniform strides are assigned to the same
+ prefetcher unit. The down side is that because of the way the assignment
+ works, multiple unrelated loads may end up on the same prefetch unit, thus
+ causing the unit to bounce between different sets of addresses and never
+ train correctly. The point of this pass is to avoid such collisions so that
+ unrelated loads are spread out to different prefetchers. It also makes a
+ rudimentary attempt to ensure that related loads with the same tags don't
+ get moved out unnecessarily.
+
+ Perhaps a future enhancement would be to make a more concerted attempt to
+ get related loads under the same tag. See the memcpy/memset implementation
+ for falkor in glibc to understand the kind of impact this can have on
+ falkor.
+
+ The assignment of loads is based on a tag that is computed from the encoding
+ of the first destination register (the only destination in case of LDR), the
+ base register and the offset (either the register or the immediate value, as
+ encoded in the instruction). This is what the 14 bit tag looks like:
+
+ |<- 6 bits ->|<- 4b ->|<- 4b ->|
+ --------------------------------
+ | OFFSET | SRC | DST |
+ --------------------------------
+
+ For all cases, the SRC and DST are the 4 LSB of the encoding of the register
+ in the instruction. Offset computation is more involved and is as follows:
+
+ - For register offset addressing: 4 LSB of the offset register with the MSB
+ of the 6 bits set to 1.
+
+ - For immediate offset: 4 LSB of the encoded immediate offset. The encoding
+ depends on the width of the load and is expressed as multiples of the
+ width.
+
+ - For loads with update: 4 LSB of the offset. The encoding here is the
+ exact number by which the base is offset and incremented.
+
+ Based on the above it is clear that registers 0 and 16 will result in
+ collisions, 1 and 17 and so on. This pass detects such collisions within a
+ def/use chain of the source register in a loop and tries to resolve the
+ collision by renaming one of the destination registers. */
+
+/* Get the destination part of the tag. */
+#define TAG_GET_DEST(__tag) ((__tag) & 0xf)
+
+/* Get the tag with the destination part updated. */
+#define TAG_UPDATE_DEST(__tag, __dest) (((__tag) & ~0xf) | (__dest & 0xf))
+
+#define MAX_PREFETCH_STRIDE 2048
+
+/* The instruction information structure. This is used to cache information
+ about the INSN that we derive when traversing through all of the insns in
+ loops. */
+class tag_insn_info
+{
+public:
+ rtx_insn *insn;
+ rtx dest;
+ rtx base;
+ rtx offset;
+ bool writeback;
+ bool ldp;
+
+ tag_insn_info (rtx_insn *i, rtx d, rtx b, rtx o, bool w, bool p)
+ : insn (i), dest (d), base (b), offset (o), writeback (w), ldp (p)
+ {}
+
+ /* Compute the tag based on BASE, DEST and OFFSET of the load. */
+ unsigned tag ()
+ {
+ unsigned int_offset = 0;
+ rtx offset = this->offset;
+ unsigned dest = REGNO (this->dest);
+ unsigned base = REGNO (this->base);
+ machine_mode dest_mode = GET_MODE (this->dest);
+
+ /* Falkor does not support SVE; GET_LOAD_INFO ensures that the
+ destination mode is constant here. */
+ unsigned dest_mode_size = GET_MODE_SIZE (dest_mode).to_constant ();
+
+ /* For loads of larger than 16 bytes, the DEST part of the tag is 0. */
+ if ((dest_mode_size << this->ldp) > 16)
+ dest = 0;
+
+ if (offset && REG_P (offset))
+ int_offset = (1 << 5) | REGNO (offset);
+ else if (offset && CONST_INT_P (offset))
+ {
+ int_offset = INTVAL (offset);
+ int_offset /= dest_mode_size;
+ if (!this->writeback)
+ int_offset >>= 2;
+ }
+ return ((dest & 0xf)
+ | ((base & 0xf) << 4)
+ | ((int_offset & 0x3f) << 8));
+ }
+};
+
+/* Hash map to traverse and process instructions with colliding tags. */
+typedef hash_map <rtx, auto_vec <tag_insn_info *> > tag_map_t;
+
+/* Vector of instructions with colliding tags. */
+typedef auto_vec <tag_insn_info *> insn_info_list_t;
+
+/* Pair of instruction information and unavailable register set to pass to
+ CHECK_COLLIDING_TAGS. */
+typedef std::pair <tag_insn_info *, HARD_REG_SET *> arg_pair_t;
+
+
+/* Callback to free all tag_insn_info objects. */
+bool
+free_insn_info (const rtx &t ATTRIBUTE_UNUSED, insn_info_list_t *v,
+ void *arg ATTRIBUTE_UNUSED)
+{
+ while (v->length () > 0)
+ delete v->pop ();
+
+ return true;
+}
+
+
+/* Add all aliases of the register to the unavailable register set. REG is the
+ smallest register number that can then be used to reference its aliases.
+ UNAVAILABLE is the hard register set to add the ignored register numbers to
+ and MODE is the mode in which the registers would have been used. */
+static void
+ignore_all_aliases (HARD_REG_SET *unavailable, machine_mode mode, unsigned reg)
+{
+ add_to_hard_reg_set (unavailable, mode, reg);
+ add_to_hard_reg_set (unavailable, mode, reg + 16);
+ add_to_hard_reg_set (unavailable, mode, reg + 32);
+ add_to_hard_reg_set (unavailable, mode, reg + 48);
+}
+
+
+/* Callback to check which destination registers are unavailable to us for
+ renaming because of the base and offset colliding. This is a callback that
+ gets called for every name value pair (T, V) in the TAG_MAP. The ARG is an
+ std::pair of the tag_insn_info of the original insn and the hard register
+ set UNAVAILABLE that is used to record hard register numbers that cannot be
+ used for the renaming. This always returns true since we want to traverse
+ through the entire TAG_MAP. */
+bool
+check_colliding_tags (const rtx &t, const insn_info_list_t &v, arg_pair_t *arg)
+{
+ HARD_REG_SET *unavailable = arg->second;
+ unsigned orig_tag = arg->first->tag ();
+ unsigned tag = INTVAL (t);
+ machine_mode mode = GET_MODE (arg->first->dest);
+
+ /* Can't collide with emptiness. */
+ if (v.length () == 0)
+ return true;
+
+ /* Drop all aliased destination registers that result in the same
+ tag. It is not necessary to drop all of them but we do anyway
+ because it is quicker than checking ranges. */
+ if (TAG_UPDATE_DEST (tag, 0) == TAG_UPDATE_DEST (orig_tag, 0))
+ ignore_all_aliases (unavailable, mode, TAG_GET_DEST (tag));
+
+ return true;
+}
+
+
+/* Initialize and build a set of hard register numbers UNAVAILABLE to avoid for
+ renaming. INSN_INFO is the original insn, TAG_MAP is the map of the list of
+ insns indexed by their tags, HEAD is the def/use chain head of the
+ destination register of the original insn. The routine returns the super
+ class of register classes that may be used during the renaming. */
+static enum reg_class
+init_unavailable (tag_insn_info *insn_info, tag_map_t &tag_map, du_head_p head,
+ HARD_REG_SET *unavailable)
+{
+ unsigned dest = head->regno;
+ enum reg_class super_class = NO_REGS;
+ machine_mode mode = GET_MODE (insn_info->dest);
+
+ CLEAR_HARD_REG_SET (*unavailable);
+
+ for (struct du_chain *tmp = head->first; tmp; tmp = tmp->next_use)
+ {
+ if (DEBUG_INSN_P (tmp->insn))
+ continue;
+
+ IOR_COMPL_HARD_REG_SET (*unavailable, reg_class_contents[tmp->cl]);
+ super_class = reg_class_superunion[(int) super_class][(int) tmp->cl];
+ }
+
+ for (unsigned i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (fixed_regs[i] || global_regs[i])
+ add_to_hard_reg_set (unavailable, mode, i);
+
+ arg_pair_t arg = arg_pair_t (insn_info, unavailable);
+
+ /* Exclude all registers that would lead to collisions with other loads. */
+ tag_map.traverse <arg_pair_t *, check_colliding_tags> (&arg);
+
+ /* Finally, also ignore all aliases of the current reg. */
+ ignore_all_aliases (unavailable, mode, dest & 0xf);
+
+ return super_class;
+}
+
+
+/* Find a suitable and available register and rename the chain of occurrences
+ of the register defined in the def/use chain headed by HEAD in which INSN
+ exists. CUR_TAG, TAGS and TAG_MAP are used to determine which registers are
+ unavailable due to a potential collision due to the rename. The routine
+ returns the register number in case of a successful rename or -1 to indicate
+ failure. */
+static int
+rename_chain (tag_insn_info *insn_info, tag_map_t &tag_map, du_head_p head)
+{
+ unsigned dest_regno = head->regno;
+
+ if (head->cannot_rename || head->renamed)
+ return -1;
+
+ HARD_REG_SET unavailable;
+
+ enum reg_class super_class = init_unavailable (insn_info, tag_map, head,
+ &unavailable);
+
+ unsigned new_regno = find_rename_reg (head, super_class, &unavailable,
+ dest_regno, false);
+
+ /* Attempt to rename as long as regrename doesn't just throw the same
+ register at us. */
+ if (new_regno != dest_regno && regrename_do_replace (head, new_regno))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "\tInsn %d: Renamed %d to %d\n",
+ INSN_UID (insn_info->insn), dest_regno, new_regno);
+
+ return new_regno;
+ }
+
+ return -1;
+}
+
+
+/* Go through the def/use chains for the register and find the chain for this
+ insn to rename. The function returns the hard register number in case of a
+ successful rename and -1 otherwise. */
+static int
+rename_dest (tag_insn_info *insn_info, tag_map_t &tag_map)
+{
+ struct du_chain *chain = NULL;
+ du_head_p head = NULL;
+ int i;
+
+ /* Search the chain where this instruction is (one of) the root. */
+ rtx_insn *insn = insn_info->insn;
+ operand_rr_info *dest_op_info = insn_rr[INSN_UID (insn)].op_info;
+ unsigned dest_regno = REGNO (insn_info->dest);
+
+ for (i = 0; i < dest_op_info->n_chains; i++)
+ {
+ /* The register tracked by this chain does not match the
+ destination register of insn. */
+ if (dest_op_info->heads[i]->regno != dest_regno)
+ continue;
+
+ head = dest_op_info->heads[i];
+ /* The chain was merged in another, find the new head. */
+ if (!head->first)
+ head = regrename_chain_from_id (head->id);
+
+ for (chain = head->first; chain; chain = chain->next_use)
+ /* Found the insn in the chain, so try renaming the register in this
+ chain. */
+ if (chain->insn == insn)
+ return rename_chain (insn_info, tag_map, head);
+ }
+
+ return -1;
+}
+
+
+/* Flag to track if the map has changed. */
+static bool map_changed = false;
+
+/* The actual reallocation logic. For each vector of collisions V, try to
+ resolve the collision by attempting to rename the destination register of
+ all but one of the loads. This is a callback that is invoked for each
+ name-value pair (T, V) in TAG_MAP. The function returns true whenever it
+ returns unchanged and false otherwise to halt traversal. */
+bool
+avoid_collisions_1 (const rtx &t, insn_info_list_t *v, tag_map_t *tag_map)
+{
+ /* We need at least two loads to cause a tag collision, return unchanged. */
+ if (v->length () < 2)
+ return true;
+
+ tag_insn_info *vec_start = v->pop ();
+ tag_insn_info *insn_info = vec_start;
+
+ /* Try to rename at least one register to reduce the collision. If we
+ iterate all the way through, we end up dropping one of the loads from the
+ list. This is fine because we want at most one element to ensure that a
+ subsequent rename attempt does not end up worsening the collision. */
+ do
+ {
+ int new_regno;
+
+ if ((new_regno = rename_dest (insn_info, *tag_map)) != -1)
+ {
+ rtx new_tag = GEN_INT (TAG_UPDATE_DEST (INTVAL (t), new_regno));
+
+ tag_map->get_or_insert (new_tag).safe_push (insn_info);
+ df_set_regs_ever_live (new_regno, true);
+ map_changed = true;
+ return false;
+ }
+
+ v->safe_insert (0, insn_info);
+ insn_info = v->pop ();
+ }
+ while (insn_info != vec_start);
+
+ if (dump_file)
+ fprintf (dump_file, "\t>> Failed to rename destination in insn %d\n\t>>",
+ INSN_UID (insn_info->insn));
+
+ /* Drop the last element and move on to the next tag. */
+ delete insn_info;
+ return true;
+}
+
+
+/* For each set of collisions, attempt to rename the registers or insert a move
+ to avoid the collision. We repeatedly traverse through TAG_MAP using
+ AVOID_COLLISIONS_1 trying to rename registers to avoid collisions until a
+ full traversal results in no change in the map. */
+static void
+avoid_collisions (tag_map_t &tag_map)
+{
+ do
+ {
+ map_changed = false;
+ tag_map.traverse <tag_map_t *, avoid_collisions_1> (&tag_map);
+ }
+ while (map_changed);
+}
+
+
+
+/* Find the use def chain in which INSN exists and then see if there is a
+ definition inside the loop and outside it. We use this as a simple
+ approximation to determine whether the base register is an IV. The basic
+ idea is to find INSN in the use-def chains for its base register and find
+ all definitions that reach it. Of all these definitions, there should be at
+ least one definition that is a simple addition of a constant value, either
+ as a binary operation or a pre or post update.
+
+ The function returns true if the base register is estimated to be an IV. */
+static bool
+iv_p (rtx_insn *insn, rtx reg, struct loop *loop)
+{
+ df_ref ause;
+ unsigned regno = REGNO (reg);
+
+ /* Ignore loads from the stack. */
+ if (regno == SP_REGNUM)
+ return false;
+
+ for (ause = DF_REG_USE_CHAIN (regno); ause; ause = DF_REF_NEXT_REG (ause))
+ {
+ if (!DF_REF_INSN_INFO (ause)
+ || !NONDEBUG_INSN_P (DF_REF_INSN (ause)))
+ continue;
+
+ if (insn != DF_REF_INSN (ause))
+ continue;
+
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ df_ref def_rec;
+
+ FOR_EACH_INSN_INFO_DEF (def_rec, insn_info)
+ {
+ rtx_insn *insn = DF_REF_INSN (def_rec);
+ basic_block bb = BLOCK_FOR_INSN (insn);
+
+ if (dominated_by_p (CDI_DOMINATORS, bb, loop->header)
+ && bb->loop_father == loop)
+ {
+ if (recog_memoized (insn) < 0)
+ continue;
+
+ rtx pat = PATTERN (insn);
+
+ /* Prefetch or clobber; unlikely to be a constant stride. The
+ falkor software prefetcher tuning is pretty conservative, so
+ its presence indicates that the access pattern is probably
+ strided but most likely with an unknown stride size or a
+ stride size that is quite large. */
+ if (GET_CODE (pat) != SET)
+ continue;
+
+ rtx x = SET_SRC (pat);
+ if (GET_CODE (x) == ZERO_EXTRACT
+ || GET_CODE (x) == ZERO_EXTEND
+ || GET_CODE (x) == SIGN_EXTEND)
+ x = XEXP (x, 0);
+
+ /* Loading the value from memory; unlikely to be a constant
+ stride. */
+ if (MEM_P (x))
+ continue;
+
+ /* An increment or decrement by a constant MODE_SIZE amount or
+ the result of a binary expression is likely to be an IV. */
+ if (GET_CODE (x) == POST_INC
+ || GET_CODE (x) == POST_DEC
+ || GET_CODE (x) == PRE_INC
+ || GET_CODE (x) == PRE_DEC)
+ return true;
+ else if (BINARY_P (x)
+ && (CONST_INT_P (XEXP (x, 0))
+ || CONST_INT_P (XEXP (x, 1))))
+ {
+ rtx stride = (CONST_INT_P (XEXP (x, 0))
+ ? XEXP (x, 0) : XEXP (x, 1));
+
+ /* Don't bother with very long strides because the prefetcher
+ is unable to train on them anyway. */
+ if (INTVAL (stride) < MAX_PREFETCH_STRIDE)
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+ return false;
+}
+
+
+/* Return true if SRC is a strided load in the LOOP, false otherwise.
+ If it is a strided load, set the BASE and OFFSET. Also, if this is
+ a pre/post increment load, set PRE_POST to true. */
+static bool
+valid_src_p (rtx src, rtx_insn *insn, struct loop *loop, bool *pre_post,
+ rtx *base, rtx *offset, bool load_pair)
+{
+ subrtx_var_iterator::array_type array;
+ rtx x = NULL_RTX;
+
+ FOR_EACH_SUBRTX_VAR (iter, array, src, NONCONST)
+ if (MEM_P (*iter))
+ {
+ x = *iter;
+ break;
+ }
+
+ if (!x)
+ return false;
+
+ struct aarch64_address_info addr;
+ machine_mode mode = GET_MODE (x);
+
+ if (!aarch64_classify_address (&addr, XEXP (x, 0), mode, true))
+ return false;
+
+ unsigned regno = REGNO (addr.base);
+ if (global_regs[regno] || fixed_regs[regno])
+ return false;
+
+ if (addr.type == ADDRESS_REG_WB)
+ {
+ unsigned code = GET_CODE (XEXP (x, 0));
+
+ *pre_post = true;
+ *base = addr.base;
+
+ if (code == PRE_MODIFY || code == POST_MODIFY)
+ *offset = addr.offset;
+ else
+ {
+ /*Writeback is only supported for fixed-width modes. */
+ unsigned int_offset = GET_MODE_SIZE (mode).to_constant ();
+
+ /* For post-incremented load pairs we would increment the base twice
+ over, so make that adjustment. */
+ if (load_pair && (code == POST_INC || code == POST_DEC))
+ int_offset *= 2;
+
+ *offset = GEN_INT (int_offset);
+ }
+ return true;
+ }
+ else if (addr.type == ADDRESS_REG_IMM || addr.type == ADDRESS_REG_REG)
+ {
+ /* Check if the load is strided. */
+ if (!iv_p (insn, addr.base, loop))
+ return false;
+
+ *base = addr.base;
+ *offset = addr.offset;
+ return true;
+ }
+
+ return false;
+}
+
+
+/* Return true if INSN is a strided load in LOOP. If it is a strided load, set
+ the DEST, BASE and OFFSET. Also, if this is a pre/post increment load, set
+ PRE_POST to true.
+
+ The routine does checks on the destination of the insn and depends on
+ STRIDED_LOAD_P to check the source and fill in the BASE and OFFSET. */
+static bool
+get_load_info (rtx_insn *insn, struct loop *loop, rtx *dest, rtx *base,
+ rtx *offset, bool *pre_post, bool *ldp)
+{
+ if (!INSN_P (insn) || recog_memoized (insn) < 0)
+ return false;
+
+ rtx pat = PATTERN (insn);
+ unsigned code = GET_CODE (pat);
+ bool load_pair = (code == PARALLEL);
+
+ /* For a load pair we need only the first base and destination
+ registers. We however need to ensure that our pre/post increment
+ offset is doubled; we do that in STRIDED_LOAD_P. */
+ if (load_pair)
+ {
+ pat = XVECEXP (pat, 0, 0);
+ code = GET_CODE (pat);
+ }
+
+ if (code != SET)
+ return false;
+
+ rtx dest_rtx = SET_DEST (pat);
+
+ if (!REG_P (dest_rtx))
+ return false;
+
+ /* Don't try to rename R0 or V0 because we could be messing with the return
+ value. */
+ if (REGNO (dest_rtx) == R0_REGNUM || REGNO (dest_rtx) == V0_REGNUM)
+ return false;
+
+ unsigned regno = REGNO (dest_rtx);
+ machine_mode mode = GET_MODE (dest_rtx);
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+
+ /* Falkor does not support SVE vectors. */
+ if (!GET_MODE_SIZE (mode).is_constant ())
+ return false;
+
+ /* Ignore vector struct or lane loads. */
+ if (GET_MODE_SIZE (mode).to_constant ()
+ != GET_MODE_SIZE (inner_mode).to_constant ())
+ return false;
+
+ /* The largest width we want to bother with is a load of a pair of
+ quad-words. */
+ if ((GET_MODE_SIZE (mode).to_constant () << load_pair)
+ > GET_MODE_SIZE (OImode))
+ return false;
+
+ /* Ignore loads into the stack pointer because it is unlikely to be a
+ stream. */
+ if (regno == SP_REGNUM)
+ return false;
+
+ if (valid_src_p (SET_SRC (pat), insn, loop, pre_post, base, offset,
+ load_pair))
+ {
+ *dest = dest_rtx;
+ *ldp = load_pair;
+
+ return true;
+ }
+
+ return false;
+}
+
+
+/* Return whether INSN and CAND are in the same def/use chain. */
+static bool
+in_same_chain (rtx_insn *insn, rtx_insn *cand, unsigned regno)
+{
+ struct du_chain *chain = NULL;
+ du_head_p head = NULL;
+ int i;
+
+ /* Search the chain where this instruction is (one of) the root. */
+ operand_rr_info *op_info = insn_rr[INSN_UID (insn)].op_info;
+
+ for (i = 0; i < op_info->n_chains; i++)
+ {
+ /* The register tracked by this chain does not match the
+ dest register of insn. */
+ if (op_info->heads[i]->regno != regno)
+ continue;
+
+ head = op_info->heads[i];
+ /* The chain was merged in another, find the new head. */
+ if (!head->first)
+ head = regrename_chain_from_id (head->id);
+
+ bool found_insn = false, found_cand = false;
+
+ for (chain = head->first; chain; chain = chain->next_use)
+ {
+ rtx *loc = &SET_DEST (PATTERN (chain->insn));
+
+ if (chain->loc != loc)
+ continue;
+
+ if (chain->insn == insn)
+ found_insn = true;
+
+ if (chain->insn == cand)
+ found_cand = true;
+
+ if (found_insn && found_cand)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+
+/* Callback function to traverse the tag map and drop loads that have the same
+ destination and and in the same chain of occurrence. Routine always returns
+ true to allow traversal through all of TAG_MAP. */
+bool
+single_dest_per_chain (const rtx &t ATTRIBUTE_UNUSED, insn_info_list_t *v,
+ void *arg ATTRIBUTE_UNUSED)
+{
+ for (int i = v->length () - 1; i>= 1; i--)
+ {
+ tag_insn_info *insn_info = (*v)[i];
+
+ for (int j = v->length () - 2; j >= 0; j--)
+ {
+ /* Filter out destinations in the same chain. */
+ if (in_same_chain (insn_info->insn, (*v)[j]->insn,
+ REGNO (insn_info->dest)))
+ {
+ v->ordered_remove (j);
+ i = v->length ();
+ break;
+ }
+ }
+ }
+
+ return true;
+}
+
+
+/* Callback invoked for each name-value pair (T, INSN_INFO) to dump the insn
+ list INSN_INFO for tag T. */
+bool
+dump_insn_list (const rtx &t, const insn_info_list_t &insn_info,
+ void *unused ATTRIBUTE_UNUSED)
+{
+ gcc_assert (dump_file);
+ fprintf (dump_file, "Tag 0x%lx ::\n", INTVAL (t));
+
+ for (unsigned i = 0; i < insn_info.length (); i++)
+ dump_insn_slim (dump_file, insn_info[i]->insn);
+
+ fprintf (dump_file, "\n");
+
+ return true;
+}
+
+
+/* Record all loads in LOOP into TAG_MAP indexed by the falkor hardware
+ prefetcher memory tags. */
+static void
+record_loads (tag_map_t &tag_map, struct loop *loop)
+{
+ rtx_insn *insn;
+ basic_block *body, bb;
+
+ body = get_loop_body (loop);
+
+ for (unsigned i = 0; i < loop->num_nodes; i++)
+ {
+ bb = body[i];
+ FOR_BB_INSNS (bb, insn)
+ {
+ rtx base = NULL_RTX;
+ rtx dest = NULL_RTX;
+ rtx offset = NULL_RTX;
+ bool writeback = false;
+ bool ldp = false;
+
+ if (!INSN_P (insn) || DEBUG_INSN_P (insn))
+ continue;
+
+ if (get_load_info (insn, loop, &dest, &base, &offset, &writeback,
+ &ldp))
+ {
+ tag_insn_info *i = new tag_insn_info (insn, dest, base, offset,
+ writeback, ldp);
+ rtx tag = GEN_INT (i->tag ());
+ tag_map.get_or_insert (tag).safe_push (i);
+ }
+ }
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Loop %d: Tag map generated.\n", loop->num);
+ tag_map.traverse <void *, dump_insn_list> (NULL);
+ }
+
+ /* Try to reduce the dataset before launching into the rename attempt. Drop
+ destinations in the same collision chain that appear in the same def/use
+ chain, all as defs. These chains will move together in a rename so
+ there's no point in keeping both in there. */
+ tag_map.traverse <void *, single_dest_per_chain> (NULL);
+}
+
+
+/* Tag collision avoidance pass for Falkor. The pass runs in two phases for
+ each loop; the first phase collects all loads that we consider as
+ interesting for renaming into a tag-indexed map of lists. The second phase
+ renames the destination register of the loads in an attempt to spread out
+ the loads into different tags. */
+void
+execute_tag_collision_avoidance ()
+{
+ struct loop *loop;
+
+ df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+ df_chain_add_problem (DF_UD_CHAIN);
+ df_compute_regs_ever_live (true);
+ df_analyze ();
+ df_set_flags (DF_DEFER_INSN_RESCAN);
+
+ regrename_init (true);
+ regrename_analyze (NULL);
+
+ compute_bb_for_insn ();
+ calculate_dominance_info (CDI_DOMINATORS);
+ loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+
+ FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+ {
+ tag_map_t tag_map (512);
+
+ record_loads (tag_map, loop);
+ avoid_collisions (tag_map);
+ if (dump_file)
+ {
+ fprintf (dump_file, "Loop %d: Completed rename.\n", loop->num);
+ tag_map.traverse <void *, dump_insn_list> (NULL);
+ }
+ tag_map.traverse <void *, free_insn_info> (NULL);
+ }
+
+ loop_optimizer_finalize ();
+ free_dominance_info (CDI_DOMINATORS);
+ regrename_finish ();
+}
+
+
+const pass_data pass_data_tag_collision_avoidance =
+{
+ RTL_PASS, /* type */
+ "tag_collision_avoidance", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_NONE, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+
+
+class pass_tag_collision_avoidance : public rtl_opt_pass
+{
+public:
+ pass_tag_collision_avoidance (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_tag_collision_avoidance, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ {
+ return ((aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS)
+ && optimize >= 2);
+ }
+
+ virtual unsigned int execute (function *)
+ {
+ execute_tag_collision_avoidance ();
+ return 0;
+ }
+
+}; // class pass_tag_collision_avoidance
+
+
+/* Create a new pass instance. */
+rtl_opt_pass *
+make_pass_tag_collision_avoidance (gcc::context *ctxt)
+{
+ return new pass_tag_collision_avoidance (ctxt);
+}
diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
index 0be1f0d63aa..f185b404ce6 100644
--- a/gcc/config/aarch64/t-aarch64
+++ b/gcc/config/aarch64/t-aarch64
@@ -67,6 +67,15 @@ cortex-a57-fma-steering.o:
$(srcdir)/config/aarch64/cortex-a57-fma-steering.c \
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
$(srcdir)/config/aarch64/cortex-a57-fma-steering.c
+falkor-tag-collision-avoidance.o: $(srcdir)/config/aarch64/falkor-tag-collision-avoidance.c \
+ $(CONFIG_H) $(SYSTEM_H) $(TM_H) $(REGS_H) insn-config.h $(RTL_BASE_H) \
+ dominance.h cfg.h cfganal.h $(BASIC_BLOCK_H) $(INSN_ATTR_H) $(RECOG_H) \
+ output.h hash-map.h $(DF_H) $(OBSTACK_H) $(TARGET_H) $(RTL_H) \
+ $(CONTEXT_H) $(TREE_PASS_H) regrename.h \
+ $(srcdir)/config/aarch64/aarch64-protos.h
+ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+ $(srcdir)/config/aarch64/falkor-tag-collision-avoidance.c
+
comma=,
MULTILIB_OPTIONS = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst
$(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG))))
MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG))