Hello Alex: On 21/05/24 10:22 pm, Alex Coplan wrote: > Hi Ajit, > > I've left some more comments below. It's getting there now, thanks for > your patience. > > On 21/05/2024 20:32, Ajit Agarwal wrote: >> Hello Alex/Richard: >> >> All comments are addressed. >> >> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end >> to support multiple targets. >> >> Common infrastructure of load store pair fusion is divided into >> target independent and target dependent code. >> >> Target independent code is structured in the following files. >> gcc/pair-fusion.h >> gcc/pair-fusion.cc >> >> Target independent code is the Generic code with pure virtual >> function to interface betwwen target independent and dependent >> code. >> >> Bootstrapped and regtested on aarch64-linux-gnu. >> >> Thabks & Regards >> Ajit >> >> >> aarch64, middle-end: Move pair_fusion pass from aarch64 to middle-end >> >> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end >> to support multiple targets. >> >> Common infrastructure of load store pair fusion is divided into >> target independent and target dependent code. >> >> Target independent code is structured in the following files. >> gcc/pair-fusion.h >> gcc/pair-fusion.cc >> >> Target independent code is the Generic code with pure virtual >> function to interface betwwen target independent and dependent >> code. >> >> 2024-05-21 Ajit Kumar Agarwal <aagar...@linux.ibm.com> >> >> gcc/ChangeLog: >> >> * pair-fusion.h: Generic header code for load store pair fusion >> that can be shared across different architectures. >> * pair-fusion.cc: Generic source code implementation for >> load store pair fusion that can be shared across different >> architectures. >> * Makefile.in: Add new object file pair-fusion.o. >> * config/aarch64/aarch64-ldp-fusion.cc: Delete generic code and move it >> to pair-fusion.cc in the middle-end. >> * config/aarch64/t-aarch64: Add header file dependency pair-fusion.h. > > insert "on" after dependency. >
Addressed in v3 of the patch. >> --- >> gcc/Makefile.in | 1 + >> gcc/config/aarch64/aarch64-ldp-fusion.cc | 3282 +--------------------- >> gcc/config/aarch64/t-aarch64 | 2 +- >> gcc/pair-fusion.cc | 3013 ++++++++++++++++++++ >> gcc/pair-fusion.h | 189 ++ >> 5 files changed, 3280 insertions(+), 3207 deletions(-) >> create mode 100644 gcc/pair-fusion.cc >> create mode 100644 gcc/pair-fusion.h >> >> diff --git a/gcc/Makefile.in b/gcc/Makefile.in >> index a7f15694c34..643342f623d 100644 >> --- a/gcc/Makefile.in >> +++ b/gcc/Makefile.in >> @@ -1563,6 +1563,7 @@ OBJS = \ >> ipa-strub.o \ >> ipa.o \ >> ira.o \ >> + pair-fusion.o \ >> ira-build.o \ >> ira-costs.o \ >> ira-conflicts.o \ >> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc >> b/gcc/config/aarch64/aarch64-ldp-fusion.cc >> index 085366cdf68..612f62060bc 100644 >> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc >> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc >> @@ -40,262 +40,13 @@ >> >> using namespace rtl_ssa; > > I think we should drop this, since the public interface and remaining > backend code in this file is independent of RTL-SSA. I think you should > also drop the inlcude of "rtl-ssa.h" from this file. These two > changes will force you to get the header file (pair-fusion.h) right. > > With these changes we can also significantly thin out the include list > in this file. The current set of includes is: > > #define INCLUDE_ALGORITHM > #define INCLUDE_FUNCTIONAL > #define INCLUDE_LIST > #define INCLUDE_TYPE_TRAITS > #include "config.h" > #include "system.h" > #include "coretypes.h" > #include "backend.h" > #include "rtl.h" > #include "df.h" > #include "rtl-iter.h" > #include "rtl-ssa.h" > #include "cfgcleanup.h" > #include "tree-pass.h" > #include "ordered-hash-map.h" > #include "tree-dfa.h" > #include "fold-const.h" > #include "tree-hash-traits.h" > #include "print-tree.h" > #include "insn-attr.h" > > I think instead the following should be enough for this file: > > #include "config.h" > #include "system.h" > #include "coretypes.h" > #include "backend.h" > #include "rtl.h" > #include "memmodel.h" > #include "emit-rtl.h" > #include "tm_p.h" > #include "rtl-iter.h" > #include "tree-pass.h" > #include "insn-attr.h" > #include "pair-fusion.h" > Addressed in v3 of the patch. >> >> +#include "pair-fusion.h" >> + >> static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7; >> static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1)); >> static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1; >> static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1; >> > <snip> >> diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 >> index 78713558e7d..bdada08be70 100644 >> --- a/gcc/config/aarch64/t-aarch64 >> +++ b/gcc/config/aarch64/t-aarch64 >> @@ -203,7 +203,7 @@ aarch64-early-ra.o: >> $(srcdir)/config/aarch64/aarch64-early-ra.cc \ >> aarch64-ldp-fusion.o: $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc \ >> $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \ >> $(RTL_SSA_H) cfgcleanup.h tree-pass.h ordered-hash-map.h tree-dfa.h \ >> - fold-const.h tree-hash-traits.h print-tree.h >> + fold-const.h tree-hash-traits.h print-tree.h pair-fusion.h >> $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ >> $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc >> >> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc >> new file mode 100644 >> index 00000000000..c873a0f90a8 >> --- /dev/null >> +++ b/gcc/pair-fusion.cc >> @@ -0,0 +1,3013 @@ >> +// Pair Mem fusion generic class implementation. > > How about: > > // Pass to fuse adjacent loads/stores into paired memory accesses. > > instead? > Addressed in v3 of the patch. >> +// Copyright (C) 2024 Free Software Foundation, Inc. >> +// >> +// This file is part of GCC. >> +// >> +// GCC is free software; you can redistribute it and/or modify it >> +// under the terms of the GNU General Public License as published by >> +// the Free Software Foundation; either version 3, or (at your option) >> +// any later version. >> +// >> +// GCC is distributed in the hope that it will be useful, but >> +// WITHOUT ANY WARRANTY; without even the implied warranty of >> +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> +// General Public License for more details. >> +// >> +// You should have received a copy of the GNU General Public License >> +// along with GCC; see the file COPYING3. If not see >> +// <http://www.gnu.org/licenses/>. >> + >> +#define INCLUDE_ALGORITHM >> +#define INCLUDE_FUNCTIONAL >> +#define INCLUDE_LIST >> +#define INCLUDE_TYPE_TRAITS >> +#include "config.h" >> +#include "system.h" >> +#include "coretypes.h" >> +#include "backend.h" >> +#include "rtl.h" >> +#include "df.h" >> +#include "rtl-iter.h" >> +#include "rtl-ssa.h" >> +#include "cfgcleanup.h" >> +#include "tree-pass.h" >> +#include "ordered-hash-map.h" >> +#include "tree-dfa.h" >> +#include "fold-const.h" >> +#include "tree-hash-traits.h" >> +#include "print-tree.h" >> +#include "insn-attr.h" > > I don't think you need this last include any more. > Addressed in v3 of the patch. >> + >> +using namespace rtl_ssa; >> + >> +#include "pair-fusion.h" >> + >> +// We pack these fields (load_p, fpsimd_p, and size) into an integer >> +// (LFS) which we use as part of the key into the main hash tables. >> +// >> +// The idea is that we group candidates together only if they agree on >> +// the fields below. Candidates that disagree on any of these >> +// properties shouldn't be merged together. >> +struct lfs_fields >> +{ >> + bool load_p; >> + bool fpsimd_p; >> + unsigned size; >> +}; >> + >> +using insn_list_t = std::list<insn_info *>; >> + >> +// Information about the accesses at a given offset from a particular >> +// base. Stored in an access_group, see below. >> +struct access_record >> +{ >> + poly_int64 offset; >> + std::list<insn_info *> cand_insns; >> + std::list<access_record>::iterator place; >> + >> + access_record (poly_int64 off) : offset (off) {} >> +}; >> + >> +// A group of accesses where adjacent accesses could be ldp/stp >> +// candidates. The splay tree supports efficient insertion, >> +// while the list supports efficient iteration. >> +struct access_group >> +{ >> + splay_tree<access_record *> tree; >> + std::list<access_record> list; >> + >> + template<typename Alloc> >> + inline void track (Alloc node_alloc, poly_int64 offset, insn_info *insn); >> +}; >> + >> +// Test if this base candidate is viable according to HAZARDS. >> +bool base_cand::viable () const >> +{ >> + return !hazards[0] || !hazards[1] || (*hazards[0] > *hazards[1]); >> +} >> + >> +// Information about an alternate base. For a def_info D, it may >> +// instead be expressed as D = BASE + OFFSET. >> +struct alt_base >> +{ >> + def_info *base; >> + poly_int64 offset; >> +}; >> + >> +// Virtual base class for load/store walkers used in alias analysis. >> +struct alias_walker >> +{ >> + virtual bool conflict_p (int &budget) const = 0; >> + virtual insn_info *insn () const = 0; >> + virtual bool valid () const = 0; >> + virtual void advance () = 0; >> +}; >> + >> + >> +pair_fusion::pair_fusion () >> +{ >> + calculate_dominance_info (CDI_DOMINATORS); >> + df_analyze (); >> + crtl->ssa = new rtl_ssa::function_info (cfun); >> +} >> + >> +pair_fusion::~pair_fusion () >> +{ >> + if (crtl->ssa->perform_pending_updates ()) >> + cleanup_cfg (0); >> + >> + free_dominance_info (CDI_DOMINATORS); >> + >> + delete crtl->ssa; >> + crtl->ssa = nullptr; >> +} >> + >> +// This is the main function to start the pass. >> +void >> +pair_fusion::run () >> +{ >> + if (!track_loads_p () && !track_stores_p ()) >> + return; >> + >> + for (auto bb : crtl->ssa->bbs ()) >> + process_block (bb); >> +} >> + >> +// State used by the pass for a given basic block. >> +struct pair_fusion_bb_info >> +{ >> + using def_hash = nofree_ptr_hash<def_info>; >> + using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>; >> + using def_key_t = pair_hash<def_hash, int_hash<int, -1, -2>>; >> + >> + // Map of <tree base, LFS> -> access_group. >> + ordered_hash_map<expr_key_t, access_group> expr_map; >> + >> + // Map of <RTL-SSA def_info *, LFS> -> access_group. >> + ordered_hash_map<def_key_t, access_group> def_map; >> + >> + // Given the def_info for an RTL base register, express it as an offset >> from >> + // some canonical base instead. >> + // >> + // Canonicalizing bases in this way allows us to identify adjacent >> accesses >> + // even if they see different base register defs. >> + hash_map<def_hash, alt_base> canon_base_map; >> + >> + static const size_t obstack_alignment = sizeof (void *); >> + >> + pair_fusion_bb_info (bb_info *bb, pair_fusion *d) >> + : m_bb (bb), m_pass (d), m_emitted_tombstone (false) >> + { >> + obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE, >> + obstack_alignment, obstack_chunk_alloc, >> + obstack_chunk_free); >> + } >> + ~pair_fusion_bb_info () >> + { >> + obstack_free (&m_obstack, nullptr); >> + >> + if (m_emitted_tombstone) >> + { >> + bitmap_release (&m_tombstone_bitmap); >> + bitmap_obstack_release (&m_bitmap_obstack); >> + } >> + } >> + >> + inline void track_access (insn_info *, bool load, rtx mem); >> + inline void transform (); >> + inline void cleanup_tombstones (); >> + >> +private: >> + obstack m_obstack; >> + bb_info *m_bb; >> + pair_fusion *m_pass; >> + >> + // State for keeping track of tombstone insns emitted for this BB. >> + bitmap_obstack m_bitmap_obstack; >> + bitmap_head m_tombstone_bitmap; >> + bool m_emitted_tombstone; >> + >> + inline splay_tree_node<access_record *> *node_alloc (access_record *); >> + >> + template<typename Map> >> + inline void traverse_base_map (Map &map); >> + inline void transform_for_base (int load_size, access_group &group); >> + >> + inline void merge_pairs (insn_list_t &, insn_list_t &, >> + bool load_p, unsigned access_size); >> + >> + inline bool try_fuse_pair (bool load_p, unsigned access_size, >> + insn_info *i1, insn_info *i2); >> + >> + inline bool fuse_pair (bool load_p, unsigned access_size, >> + int writeback, >> + insn_info *i1, insn_info *i2, >> + base_cand &base, >> + const insn_range_info &move_range); >> + >> + inline void track_tombstone (int uid); >> + >> + inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs); >> +}; > > Nit: need a blank line here. > Addressed in v3 of the patch. >> +splay_tree_node<access_record *> * >> +pair_fusion_bb_info::node_alloc (access_record *access) >> +{ >> + using T = splay_tree_node<access_record *>; >> + void *addr = obstack_alloc (&m_obstack, sizeof (T)); >> + return new (addr) T (access); >> +} >> + > <snip> >> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h >> new file mode 100644 >> index 00000000000..80a3581191e >> --- /dev/null >> +++ b/gcc/pair-fusion.h >> @@ -0,0 +1,189 @@ >> +// Pair Mem fusion generic header file. > > Let's have a similar comment to the top of pair-fusion.cc: > > // Pass to fuse adjacent loads/stores into paired memory accesses. > // > // This file contains the definition of the virtual base class which is > // overriden by targets that make use of the pass. > >> +// Copyright (C) 2024 Free Software Foundation, Inc. >> +// >> +// This file is part of GCC. >> +// >> +// GCC is free software; you can redistribute it and/or modify it >> +// under the terms of the GNU General Public License as published by >> +// the Free Software Foundation; either version 3, or (at your option) >> +// any later version. >> +// >> +// GCC is distributed in the hope that it will be useful, but >> +// WITHOUT ANY WARRANTY; without even the implied warranty of >> +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> +// General Public License for more details. >> +// >> +// You should have received a copy of the GNU General Public License >> +// along with GCC; see the file COPYING3. If not see >> +// <http://www.gnu.org/licenses/>. >> + >> +namespace rtl_ssa { >> + class def_info; >> + class insn_info; >> + class insn_range_info; > > You need to forward-declare bb_info as well. Also, since we've dropped the > `using namespace rtl_ssa;` from the header, you need to prefix the various > rtl_ssa names with rtl_ssa:: in the header. > > See the comments at the top of aarch64-ldp-fusion.cc: you'll notice > these problems if you remove the rtl-ssa #include (and using namespace) > there. > Addressed in v3 of the patch. >> +} >> + >> +// Information about a potential base candidate, used in try_fuse_pair. >> +// There may be zero, one, or two viable RTL bases for a given pair. >> +struct base_cand >> +{ >> + // DEF is the def of the base register to be used by the pair. >> + def_info *def; >> + >> + // FROM_INSN is -1 if the base candidate is already shared by both >> + // candidate insns. Otherwise it holds the index of the insn from >> + // which the base originated. >> + // >> + // In the case that the base is shared, either DEF is already used >> + // by both candidate accesses, or both accesses see different versions >> + // of the same regno, in which case DEF is the def consumed by the >> + // first candidate access. >> + int from_insn; >> + >> + // To form a pair, we do so by moving the first access down and the second >> + // access up. To determine where to form the pair, and whether or not >> + // it is safe to form the pair, we track instructions which cannot be >> + // re-ordered past due to either dataflow or alias hazards. >> + // >> + // Since we allow changing the base used by an access, the choice of >> + // base can change which instructions act as re-ordering hazards for >> + // this pair (due to different dataflow). We store the initial >> + // dataflow hazards for this choice of base candidate in HAZARDS. >> + // >> + // These hazards act as re-ordering barriers to each candidate insn >> + // respectively, in program order. >> + // >> + // Later on, when we take alias analysis into account, we narrow >> + // HAZARDS accordingly. >> + insn_info *hazards[2]; >> + >> + base_cand (def_info *def, int insn) >> + : def (def), from_insn (insn), hazards {nullptr, nullptr} {} >> + >> + base_cand (def_info *def) : base_cand (def, -1) {} >> + >> + // Test if this base candidate is viable according to HAZARDS. >> + inline bool viable () const; >> +}; >> + >> +struct alias_walker; >> + >> +// When querying should_handle_writeback, this enum is used to >> +// qualify which opportunities we are asking about. >> +enum class writeback { >> + // Only those writeback opportunities that arise from existing >> + // auto-increment accesses. >> + EXISTING, >> + >> + // All writeback opportunities, including those that involve folding >> + // base register updates into a non-writeback pair. >> + ALL >> +}; >> + >> +// This class can be overriden by targets to give a pass that fuses >> +// adjacent loads and stores into load/store pair instructions. >> +// >> +// The target can override the various virtual functions to customize >> +// the behaviour of the pass as appropriate for the target. >> +struct pair_fusion { >> + pair_fusion (); >> + >> + // Given: >> + // - an rtx REG_OP, the non-memory operand in a load/store insn, >> + // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and >> + // - a boolean LOAD_P (true iff the insn is a load), then: >> + // return true if the access should be considered an FP/SIMD access. >> + // Such accesses are segregated from GPR accesses, since we only want >> + // to form pairs for accesses that use the same register file. >> + virtual bool fpsimd_op_p (rtx, machine_mode, bool) >> + { >> + return false; >> + } >> + >> + // Return true if we should consider forming pairs from memory >> + // accesses with operand mode MODE at this stage in compilation. >> + virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0; >> + >> + // Return true iff REG_OP is a suitable register operand for a paired >> + // memory access, where LOAD_P is true if we're asking about loads and >> + // false for stores. MODE gives the mode of the operand. >> + virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op, >> + machine_mode mode) = 0; >> + >> + // Return alias check limit. >> + // This is needed to avoid unbounded quadratic behaviour when >> + // performing alias analysis. >> + virtual int pair_mem_alias_check_limit () = 0; >> + >> + // Return true if we should try to handle writeback opportunities. >> + // WHICH determines the kinds of writeback opportunities the caller >> + // is asking about. >> + virtual bool should_handle_writeback (enum writeback which) = 0; >> + >> + // Given BASE_MEM, the mem from the lower candidate access for a pair, >> + // and LOAD_P (true if the access is a load), check if we should proceed >> + // to form the pair given the target's code generation policy on >> + // paired accesses. >> + virtual bool pair_mem_ok_with_policy (rtx base_mem, bool load_p) = 0; >> + >> + // Generate the pattern for a paired access. PATS gives the patterns >> + // for the individual memory accesses (which by this point must share a >> + // common base register). If WRITEBACK is non-NULL, then this rtx >> + // describes the update to the base register that should be performed by >> + // the resulting insn. LOAD_P is true iff the accesses are loads. >> + virtual rtx gen_pair (rtx *pats, rtx writeback, bool load_p) = 0; >> + >> + // Return true if INSN is a paired memory access. If so, set LOAD_P to >> + // true iff INSN is a load pair. >> + virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0; >> + >> + // Return true if we should track loads. >> + virtual bool track_loads_p () >> + { >> + return true; >> + } >> + >> + // Return true if we should track stores. >> + virtual bool track_stores_p () >> + { >> + return true; >> + } >> + >> + // Return true if OFFSET is in range for a paired memory access. >> + virtual bool pair_mem_in_range_p (HOST_WIDE_INT offset) = 0; >> + >> + // Given a load/store pair insn in PATTERN, unpack the insn, storing >> + // the register operands in REGS, and returning the mem. LOAD_P is >> + // true for loads and false for stores. >> + virtual rtx destructure_pair (rtx regs[2], rtx pattern, bool load_p) = 0; >> + >> + // Given a pair mem in MEM, register operands in REGS, and an rtx >> + // representing the effect of writeback on the base register in WB_EFFECT, >> + // return an insn representing a writeback variant of this pair. >> + // LOAD_P is true iff the pair is a load. >> + // This is used when promoting existing non-writeback pairs to writeback >> + // variants. >> + virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem, >> + rtx regs[2], bool load_p) = 0; >> + >> + inline void process_block (bb_info *bb); >> + inline insn_info *find_trailing_add (insn_info *insns[2], >> + const insn_range_info &pair_range, >> + int initial_writeback, >> + rtx *writeback_effect, >> + def_info **add_def, >> + def_info *base_def, >> + poly_int64 initial_offset, >> + unsigned access_size); >> + inline int get_viable_bases (insn_info *insns[2], >> + vec<base_cand> &base_cands, >> + rtx cand_mems[2], >> + unsigned access_size, >> + bool reversed); >> + inline void do_alias_analysis (insn_info *alias_hazards[4], >> + alias_walker *walkers[4], >> + bool load_p); >> + inline void try_promote_writeback (insn_info *insn, bool load_p); >> + void run (); >> + ~pair_fusion (); >> +}; >> -- >> 2.39.3 >> > > Thanks, > Alex> Thanks & Regards Ajit