Hi Ajit,

I've left some more comments below.  It's getting there now, thanks for
your patience.
On 21/05/2024 20:32, Ajit Agarwal wrote:
> Hello Alex/Richard:
>
> All comments are addressed.
>
> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
> to support multiple targets.
>
> Common infrastructure of load store pair fusion is divided into
> target independent and target dependent code.
>
> Target independent code is structured in the following files.
> gcc/pair-fusion.h
> gcc/pair-fusion.cc
>
> Target independent code is the generic code with pure virtual
> functions to interface between target independent and dependent
> code.
>
> Bootstrapped and regtested on aarch64-linux-gnu.
>
> Thanks & Regards
> Ajit
>
>
> aarch64, middle-end: Move pair_fusion pass from aarch64 to middle-end
>
> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
> to support multiple targets.
>
> Common infrastructure of load store pair fusion is divided into
> target independent and target dependent code.
>
> Target independent code is structured in the following files.
> gcc/pair-fusion.h
> gcc/pair-fusion.cc
>
> Target independent code is the generic code with pure virtual
> functions to interface between target independent and dependent
> code.
>
> 2024-05-21  Ajit Kumar Agarwal  <aagar...@linux.ibm.com>
>
> gcc/ChangeLog:
>
> 	* pair-fusion.h: Generic header code for load store pair fusion
> 	that can be shared across different architectures.
> 	* pair-fusion.cc: Generic source code implementation for
> 	load store pair fusion that can be shared across different
> 	architectures.
> 	* Makefile.in: Add new object file pair-fusion.o.
> 	* config/aarch64/aarch64-ldp-fusion.cc: Delete generic code and move it
> 	to pair-fusion.cc in the middle-end.
> 	* config/aarch64/t-aarch64: Add header file dependency pair-fusion.h.

Insert "on" after "dependency".

> ---
>  gcc/Makefile.in                          |    1 +
>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 3282 +---------------------
>  gcc/config/aarch64/t-aarch64             |    2 +-
>  gcc/pair-fusion.cc                       | 3013 ++++++++++++++++++++
>  gcc/pair-fusion.h                        |  189 ++
>  5 files changed, 3280 insertions(+), 3207 deletions(-)
>  create mode 100644 gcc/pair-fusion.cc
>  create mode 100644 gcc/pair-fusion.h
>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index a7f15694c34..643342f623d 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1563,6 +1563,7 @@ OBJS = \
> 	ipa-strub.o \
> 	ipa.o \
> 	ira.o \
> +	pair-fusion.o \
> 	ira-build.o \
> 	ira-costs.o \
> 	ira-conflicts.o \
> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> index 085366cdf68..612f62060bc 100644
> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> @@ -40,262 +40,13 @@
>
>  using namespace rtl_ssa;

I think we should drop this, since the public interface and the remaining
backend code in this file are independent of RTL-SSA.  I think you should
also drop the include of "rtl-ssa.h" from this file.  These two changes
will force you to get the header file (pair-fusion.h) right.

With these changes we can also significantly thin out the include list in
this file.
The current set of includes is:

#define INCLUDE_ALGORITHM
#define INCLUDE_FUNCTIONAL
#define INCLUDE_LIST
#define INCLUDE_TYPE_TRAITS
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "df.h"
#include "rtl-iter.h"
#include "rtl-ssa.h"
#include "cfgcleanup.h"
#include "tree-pass.h"
#include "ordered-hash-map.h"
#include "tree-dfa.h"
#include "fold-const.h"
#include "tree-hash-traits.h"
#include "print-tree.h"
#include "insn-attr.h"

I think instead the following should be enough for this file:

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "memmodel.h"
#include "emit-rtl.h"
#include "tm_p.h"
#include "rtl-iter.h"
#include "tree-pass.h"
#include "insn-attr.h"
#include "pair-fusion.h"

>
> +#include "pair-fusion.h"
> +
>  static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7;
>  static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1));
>  static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1;
>  static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1;
> <snip>
> diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
> index 78713558e7d..bdada08be70 100644
> --- a/gcc/config/aarch64/t-aarch64
> +++ b/gcc/config/aarch64/t-aarch64
> @@ -203,7 +203,7 @@ aarch64-early-ra.o: $(srcdir)/config/aarch64/aarch64-early-ra.cc \
>  aarch64-ldp-fusion.o: $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc \
> 	$(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \
> 	$(RTL_SSA_H) cfgcleanup.h tree-pass.h ordered-hash-map.h tree-dfa.h \
> -	fold-const.h tree-hash-traits.h print-tree.h
> +	fold-const.h tree-hash-traits.h print-tree.h pair-fusion.h
> 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
> 		$(srcdir)/config/aarch64/aarch64-ldp-fusion.cc
>
> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
> new file mode 100644
> index 00000000000..c873a0f90a8
> --- /dev/null
> +++ b/gcc/pair-fusion.cc
> @@ -0,0 +1,3013 @@
> +// Pair Mem fusion generic class implementation.

How about:

// Pass to fuse adjacent loads/stores into paired memory accesses.

instead?

> +// Copyright (C) 2024 Free Software Foundation, Inc.
> +//
> +// This file is part of GCC.
> +//
> +// GCC is free software; you can redistribute it and/or modify it
> +// under the terms of the GNU General Public License as published by
> +// the Free Software Foundation; either version 3, or (at your option)
> +// any later version.
> +//
> +// GCC is distributed in the hope that it will be useful, but
> +// WITHOUT ANY WARRANTY; without even the implied warranty of
> +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +// General Public License for more details.
> +//
> +// You should have received a copy of the GNU General Public License
> +// along with GCC; see the file COPYING3.  If not see
> +// <http://www.gnu.org/licenses/>.
> +
> +#define INCLUDE_ALGORITHM
> +#define INCLUDE_FUNCTIONAL
> +#define INCLUDE_LIST
> +#define INCLUDE_TYPE_TRAITS
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "backend.h"
> +#include "rtl.h"
> +#include "df.h"
> +#include "rtl-iter.h"
> +#include "rtl-ssa.h"
> +#include "cfgcleanup.h"
> +#include "tree-pass.h"
> +#include "ordered-hash-map.h"
> +#include "tree-dfa.h"
> +#include "fold-const.h"
> +#include "tree-hash-traits.h"
> +#include "print-tree.h"
> +#include "insn-attr.h"

I don't think you need this last include any more.
> +
> +using namespace rtl_ssa;
> +
> +#include "pair-fusion.h"
> +
> +// We pack these fields (load_p, fpsimd_p, and size) into an integer
> +// (LFS) which we use as part of the key into the main hash tables.
> +//
> +// The idea is that we group candidates together only if they agree on
> +// the fields below.  Candidates that disagree on any of these
> +// properties shouldn't be merged together.
> +struct lfs_fields
> +{
> +  bool load_p;
> +  bool fpsimd_p;
> +  unsigned size;
> +};
> +
> +using insn_list_t = std::list<insn_info *>;
> +
> +// Information about the accesses at a given offset from a particular
> +// base.  Stored in an access_group, see below.
> +struct access_record
> +{
> +  poly_int64 offset;
> +  std::list<insn_info *> cand_insns;
> +  std::list<access_record>::iterator place;
> +
> +  access_record (poly_int64 off) : offset (off) {}
> +};
> +
> +// A group of accesses where adjacent accesses could be ldp/stp
> +// candidates.  The splay tree supports efficient insertion,
> +// while the list supports efficient iteration.
> +struct access_group
> +{
> +  splay_tree<access_record *> tree;
> +  std::list<access_record> list;
> +
> +  template<typename Alloc>
> +  inline void track (Alloc node_alloc, poly_int64 offset, insn_info *insn);
> +};
> +
> +// Test if this base candidate is viable according to HAZARDS.
> +bool base_cand::viable () const
> +{
> +  return !hazards[0] || !hazards[1] || (*hazards[0] > *hazards[1]);
> +}
> +
> +// Information about an alternate base.  For a def_info D, it may
> +// instead be expressed as D = BASE + OFFSET.
> +struct alt_base
> +{
> +  def_info *base;
> +  poly_int64 offset;
> +};
> +
> +// Virtual base class for load/store walkers used in alias analysis.
> +struct alias_walker
> +{
> +  virtual bool conflict_p (int &budget) const = 0;
> +  virtual insn_info *insn () const = 0;
> +  virtual bool valid () const = 0;
> +  virtual void advance () = 0;
> +};
> +
> +
> +pair_fusion::pair_fusion ()
> +{
> +  calculate_dominance_info (CDI_DOMINATORS);
> +  df_analyze ();
> +  crtl->ssa = new rtl_ssa::function_info (cfun);
> +}
> +
> +pair_fusion::~pair_fusion ()
> +{
> +  if (crtl->ssa->perform_pending_updates ())
> +    cleanup_cfg (0);
> +
> +  free_dominance_info (CDI_DOMINATORS);
> +
> +  delete crtl->ssa;
> +  crtl->ssa = nullptr;
> +}
> +
> +// This is the main function to start the pass.
> +void
> +pair_fusion::run ()
> +{
> +  if (!track_loads_p () && !track_stores_p ())
> +    return;
> +
> +  for (auto bb : crtl->ssa->bbs ())
> +    process_block (bb);
> +}
> +
> +// State used by the pass for a given basic block.
> +struct pair_fusion_bb_info
> +{
> +  using def_hash = nofree_ptr_hash<def_info>;
> +  using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>;
> +  using def_key_t = pair_hash<def_hash, int_hash<int, -1, -2>>;
> +
> +  // Map of <tree base, LFS> -> access_group.
> +  ordered_hash_map<expr_key_t, access_group> expr_map;
> +
> +  // Map of <RTL-SSA def_info *, LFS> -> access_group.
> +  ordered_hash_map<def_key_t, access_group> def_map;
> +
> +  // Given the def_info for an RTL base register, express it as an offset from
> +  // some canonical base instead.
> +  //
> +  // Canonicalizing bases in this way allows us to identify adjacent accesses
> +  // even if they see different base register defs.
> +  hash_map<def_hash, alt_base> canon_base_map;
> +
> +  static const size_t obstack_alignment = sizeof (void *);
> +
> +  pair_fusion_bb_info (bb_info *bb, pair_fusion *d)
> +    : m_bb (bb), m_pass (d), m_emitted_tombstone (false)
> +  {
> +    obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
> +				obstack_alignment, obstack_chunk_alloc,
> +				obstack_chunk_free);
> +  }
> +  ~pair_fusion_bb_info ()
> +  {
> +    obstack_free (&m_obstack, nullptr);
> +
> +    if (m_emitted_tombstone)
> +      {
> +	bitmap_release (&m_tombstone_bitmap);
> +	bitmap_obstack_release (&m_bitmap_obstack);
> +      }
> +  }
> +
> +  inline void track_access (insn_info *, bool load, rtx mem);
> +  inline void transform ();
> +  inline void cleanup_tombstones ();
> +
> +private:
> +  obstack m_obstack;
> +  bb_info *m_bb;
> +  pair_fusion *m_pass;
> +
> +  // State for keeping track of tombstone insns emitted for this BB.
> +  bitmap_obstack m_bitmap_obstack;
> +  bitmap_head m_tombstone_bitmap;
> +  bool m_emitted_tombstone;
> +
> +  inline splay_tree_node<access_record *> *node_alloc (access_record *);
> +
> +  template<typename Map>
> +  inline void traverse_base_map (Map &map);
> +  inline void transform_for_base (int load_size, access_group &group);
> +
> +  inline void merge_pairs (insn_list_t &, insn_list_t &,
> +			   bool load_p, unsigned access_size);
> +
> +  inline bool try_fuse_pair (bool load_p, unsigned access_size,
> +			     insn_info *i1, insn_info *i2);
> +
> +  inline bool fuse_pair (bool load_p, unsigned access_size,
> +			 int writeback,
> +			 insn_info *i1, insn_info *i2,
> +			 base_cand &base,
> +			 const insn_range_info &move_range);
> +
> +  inline void track_tombstone (int uid);
> +
> +  inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
> +};

Nit: need a blank line here.

> +splay_tree_node<access_record *> *
> +pair_fusion_bb_info::node_alloc (access_record *access)
> +{
> +  using T = splay_tree_node<access_record *>;
> +  void *addr = obstack_alloc (&m_obstack, sizeof (T));
> +  return new (addr) T (access);
> +}
> +
> <snip>
> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
> new file mode 100644
> index 00000000000..80a3581191e
> --- /dev/null
> +++ b/gcc/pair-fusion.h
> @@ -0,0 +1,189 @@
> +// Pair Mem fusion generic header file.

Let's have a similar comment to the top of pair-fusion.cc:

// Pass to fuse adjacent loads/stores into paired memory accesses.
//
// This file contains the definition of the virtual base class which is
// overridden by targets that make use of the pass.

> +// Copyright (C) 2024 Free Software Foundation, Inc.
> +//
> +// This file is part of GCC.
> +//
> +// GCC is free software; you can redistribute it and/or modify it
> +// under the terms of the GNU General Public License as published by
> +// the Free Software Foundation; either version 3, or (at your option)
> +// any later version.
> +//
> +// GCC is distributed in the hope that it will be useful, but
> +// WITHOUT ANY WARRANTY; without even the implied warranty of
> +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +// General Public License for more details.
> +//
> +// You should have received a copy of the GNU General Public License
> +// along with GCC; see the file COPYING3.  If not see
> +// <http://www.gnu.org/licenses/>.
> +
> +namespace rtl_ssa {
> +  class def_info;
> +  class insn_info;
> +  class insn_range_info;

You need to forward-declare bb_info as well.

Also, since we've dropped the `using namespace rtl_ssa;` from the header,
you need to prefix the various rtl_ssa names with rtl_ssa:: in the header.
See the comments at the top of aarch64-ldp-fusion.cc: you'll notice these
problems if you remove the rtl-ssa #include (and using namespace) there.
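To illustrate the shape I have in mind (untested sketch only, not the exact
code I'd expect you to commit): the header keeps just the forward
declarations it needs and then qualifies each use explicitly, e.g.

namespace rtl_ssa {
  class bb_info;          // needed by pair_fusion::process_block
  class def_info;
  class insn_info;
  class insn_range_info;
}

// Uses of the RTL-SSA types are then qualified explicitly:
struct base_cand
{
  // DEF is the def of the base register to be used by the pair.
  rtl_ssa::def_info *def;

  int from_insn;

  rtl_ssa::insn_info *hazards[2];

  base_cand (rtl_ssa::def_info *def, int insn)
    : def (def), from_insn (insn), hazards {nullptr, nullptr} {}

  base_cand (rtl_ssa::def_info *def) : base_cand (def, -1) {}

  inline bool viable () const;
};

and similarly for the rtl_ssa names in the pair_fusion member declarations
(process_block, find_trailing_add, and so on).  Since the header only refers
to these types through pointers and references, the forward declarations
should be enough, and only pair-fusion.cc itself needs the full rtl-ssa.h.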
> +}
> +
> +// Information about a potential base candidate, used in try_fuse_pair.
> +// There may be zero, one, or two viable RTL bases for a given pair.
> +struct base_cand
> +{
> +  // DEF is the def of the base register to be used by the pair.
> +  def_info *def;
> +
> +  // FROM_INSN is -1 if the base candidate is already shared by both
> +  // candidate insns.  Otherwise it holds the index of the insn from
> +  // which the base originated.
> +  //
> +  // In the case that the base is shared, either DEF is already used
> +  // by both candidate accesses, or both accesses see different versions
> +  // of the same regno, in which case DEF is the def consumed by the
> +  // first candidate access.
> +  int from_insn;
> +
> +  // To form a pair, we do so by moving the first access down and the second
> +  // access up.  To determine where to form the pair, and whether or not
> +  // it is safe to form the pair, we track instructions which cannot be
> +  // re-ordered past due to either dataflow or alias hazards.
> +  //
> +  // Since we allow changing the base used by an access, the choice of
> +  // base can change which instructions act as re-ordering hazards for
> +  // this pair (due to different dataflow).  We store the initial
> +  // dataflow hazards for this choice of base candidate in HAZARDS.
> +  //
> +  // These hazards act as re-ordering barriers to each candidate insn
> +  // respectively, in program order.
> +  //
> +  // Later on, when we take alias analysis into account, we narrow
> +  // HAZARDS accordingly.
> +  insn_info *hazards[2];
> +
> +  base_cand (def_info *def, int insn)
> +    : def (def), from_insn (insn), hazards {nullptr, nullptr} {}
> +
> +  base_cand (def_info *def) : base_cand (def, -1) {}
> +
> +  // Test if this base candidate is viable according to HAZARDS.
> +  inline bool viable () const;
> +};
> +
> +struct alias_walker;
> +
> +// When querying should_handle_writeback, this enum is used to
> +// qualify which opportunities we are asking about.
> +enum class writeback {
> +  // Only those writeback opportunities that arise from existing
> +  // auto-increment accesses.
> +  EXISTING,
> +
> +  // All writeback opportunities, including those that involve folding
> +  // base register updates into a non-writeback pair.
> +  ALL
> +};
> +
> +// This class can be overriden by targets to give a pass that fuses
> +// adjacent loads and stores into load/store pair instructions.
> +//
> +// The target can override the various virtual functions to customize
> +// the behaviour of the pass as appropriate for the target.
> +struct pair_fusion {
> +  pair_fusion ();
> +
> +  // Given:
> +  // - an rtx REG_OP, the non-memory operand in a load/store insn,
> +  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
> +  // - a boolean LOAD_P (true iff the insn is a load), then:
> +  // return true if the access should be considered an FP/SIMD access.
> +  // Such accesses are segregated from GPR accesses, since we only want
> +  // to form pairs for accesses that use the same register file.
> +  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
> +  {
> +    return false;
> +  }
> +
> +  // Return true if we should consider forming pairs from memory
> +  // accesses with operand mode MODE at this stage in compilation.
> +  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
> +
> +  // Return true iff REG_OP is a suitable register operand for a paired
> +  // memory access, where LOAD_P is true if we're asking about loads and
> +  // false for stores.  MODE gives the mode of the operand.
> +  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
> +				      machine_mode mode) = 0;
> +
> +  // Return alias check limit.
> +  // This is needed to avoid unbounded quadratic behaviour when
> +  // performing alias analysis.
> +  virtual int pair_mem_alias_check_limit () = 0;
> +
> +  // Return true if we should try to handle writeback opportunities.
> +  // WHICH determines the kinds of writeback opportunities the caller
> +  // is asking about.
> +  virtual bool should_handle_writeback (enum writeback which) = 0;
> +
> +  // Given BASE_MEM, the mem from the lower candidate access for a pair,
> +  // and LOAD_P (true if the access is a load), check if we should proceed
> +  // to form the pair given the target's code generation policy on
> +  // paired accesses.
> +  virtual bool pair_mem_ok_with_policy (rtx base_mem, bool load_p) = 0;
> +
> +  // Generate the pattern for a paired access.  PATS gives the patterns
> +  // for the individual memory accesses (which by this point must share a
> +  // common base register).  If WRITEBACK is non-NULL, then this rtx
> +  // describes the update to the base register that should be performed by
> +  // the resulting insn.  LOAD_P is true iff the accesses are loads.
> +  virtual rtx gen_pair (rtx *pats, rtx writeback, bool load_p) = 0;
> +
> +  // Return true if INSN is a paired memory access.  If so, set LOAD_P to
> +  // true iff INSN is a load pair.
> +  virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0;
> +
> +  // Return true if we should track loads.
> +  virtual bool track_loads_p ()
> +  {
> +    return true;
> +  }
> +
> +  // Return true if we should track stores.
> +  virtual bool track_stores_p ()
> +  {
> +    return true;
> +  }
> +
> +  // Return true if OFFSET is in range for a paired memory access.
> +  virtual bool pair_mem_in_range_p (HOST_WIDE_INT offset) = 0;
> +
> +  // Given a load/store pair insn in PATTERN, unpack the insn, storing
> +  // the register operands in REGS, and returning the mem.  LOAD_P is
> +  // true for loads and false for stores.
> +  virtual rtx destructure_pair (rtx regs[2], rtx pattern, bool load_p) = 0;
> +
> +  // Given a pair mem in MEM, register operands in REGS, and an rtx
> +  // representing the effect of writeback on the base register in WB_EFFECT,
> +  // return an insn representing a writeback variant of this pair.
> +  // LOAD_P is true iff the pair is a load.
> +  // This is used when promoting existing non-writeback pairs to writeback
> +  // variants.
> +  virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem,
> +					  rtx regs[2], bool load_p) = 0;
> +
> +  inline void process_block (bb_info *bb);
> +  inline insn_info *find_trailing_add (insn_info *insns[2],
> +				       const insn_range_info &pair_range,
> +				       int initial_writeback,
> +				       rtx *writeback_effect,
> +				       def_info **add_def,
> +				       def_info *base_def,
> +				       poly_int64 initial_offset,
> +				       unsigned access_size);
> +  inline int get_viable_bases (insn_info *insns[2],
> +			       vec<base_cand> &base_cands,
> +			       rtx cand_mems[2],
> +			       unsigned access_size,
> +			       bool reversed);
> +  inline void do_alias_analysis (insn_info *alias_hazards[4],
> +				 alias_walker *walkers[4],
> +				 bool load_p);
> +  inline void try_promote_writeback (insn_info *insn, bool load_p);
> +  void run ();
> +  ~pair_fusion ();
> +};
> --
> 2.39.3
>

Thanks,
Alex