Ping ^3! Thanks & Regards Ajit
-------- Forwarded Message -------- Subject: [PING ^3][PATCH v2] rs6000: Add new pass for replacement of contiguous addresses vector load lxv with lxvp Date: Mon, 27 Nov 2023 10:15:26 +0530 From: Ajit Agarwal <aagar...@linux.ibm.com> To: Kewen.Lin <li...@linux.ibm.com>, Segher Boessenkool <seg...@kernel.crashing.org>, Peter Bergner <berg...@linux.ibm.com> Ping^3! Thanks & Regards Ajit -------- Forwarded Message -------- Subject: [PING ^2][PATCH v2] rs6000: Add new pass for replacement of contiguous addresses vector load lxv with lxvp Date: Fri, 10 Nov 2023 12:34:31 +0530 From: Ajit Agarwal <aagar...@linux.ibm.com> To: gcc-patches <gcc-patches@gcc.gnu.org> CC: Kewen.Lin <li...@linux.ibm.com>, Segher Boessenkool <seg...@kernel.crashing.org>, Peter Bergner <berg...@linux.ibm.com> Ping ^2. On 23/10/23 2:02 pm, Ajit Agarwal wrote: > > > Ping ^1. > > -------- Forwarded Message -------- > Subject: [PING ^0][PATCH v2] rs6000: Add new pass for replacement of > contiguous addresses vector load lxv with lxvp > Date: Sun, 15 Oct 2023 17:43:24 +0530 > From: Ajit Agarwal <aagar...@linux.ibm.com> > To: gcc-patches <gcc-patches@gcc.gnu.org> > CC: Segher Boessenkool <seg...@kernel.crashing.org>, Kewen.Lin > <li...@linux.ibm.com>, Peter Bergner <berg...@linux.ibm.com> > > Hello All: > > Please review. > > Thanks & Regards > Ajit > > > -------- Forwarded Message -------- > Subject: [PATCH v2] rs6000: Add new pass for replacement of contiguous > addresses vector load lxv with lxvp > Date: Sun, 8 Oct 2023 00:34:27 +0530 > From: Ajit Agarwal <aagar...@linux.ibm.com> > To: gcc-patches <gcc-patches@gcc.gnu.org> > CC: Segher Boessenkool <seg...@kernel.crashing.org>, Peter Bergner > <berg...@linux.ibm.com>, Kewen.Lin <li...@linux.ibm.com> > > Hello All: > > This patch adds a new pass to replace contiguous addresses vector load lxv with > mma instruction > lxvp. This patch addresses one regression failure on the ARM architecture. > > Bootstrapped and regtested on powerpc64-linux-gnu. 
> > Thanks & Regards > Ajit > > > rs6000: Add new pass for replacement of contiguous lxv with lxvp. > > New pass to replace contiguous addresses lxv with lxvp. This pass > is registered after ree rtl pass. > > 2023-10-07 Ajit Kumar Agarwal <aagar...@linux.ibm.com> > > gcc/ChangeLog: > > * config/rs6000/rs6000-passes.def: Registered vecload pass. > * config/rs6000/rs6000-vecload-opt.cc: Add new pass. > * config.gcc: Add new executable. > * config/rs6000/rs6000-protos.h: Add new prototype for vecload > pass. > * config/rs6000/rs6000.cc: Add new prototype for vecload pass. > * config/rs6000/t-rs6000: Add new rule. > > gcc/testsuite/ChangeLog: > > * g++.target/powerpc/vecload.C: New test. > --- > gcc/config.gcc | 4 +- > gcc/config/rs6000/rs6000-passes.def | 1 + > gcc/config/rs6000/rs6000-protos.h | 2 + > gcc/config/rs6000/rs6000-vecload-opt.cc | 234 +++++++++++++++++++++ > gcc/config/rs6000/rs6000.cc | 3 +- > gcc/config/rs6000/t-rs6000 | 4 + > gcc/testsuite/g++.target/powerpc/vecload.C | 15 ++ > 7 files changed, 260 insertions(+), 3 deletions(-) > create mode 100644 gcc/config/rs6000/rs6000-vecload-opt.cc > create mode 100644 gcc/testsuite/g++.target/powerpc/vecload.C > > diff --git a/gcc/config.gcc b/gcc/config.gcc > index ee46d96bf62..482ab094b89 100644 > --- a/gcc/config.gcc > +++ b/gcc/config.gcc > @@ -515,7 +515,7 @@ or1k*-*-*) > ;; > powerpc*-*-*) > cpu_type=rs6000 > - extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" > + extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o > rs6000-vecload-opt.o" > extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" > extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o" > extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h" > @@ -552,7 +552,7 @@ riscv*) > ;; > rs6000*-*-*) > extra_options="${extra_options} g.opt fused-madd.opt > rs6000/rs6000-tables.opt" > - extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" > + extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o > 
rs6000-vecload-opt.o" > extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" > target_gtfiles="$target_gtfiles > \$(srcdir)/config/rs6000/rs6000-logue.cc > \$(srcdir)/config/rs6000/rs6000-call.cc" > target_gtfiles="$target_gtfiles > \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc" > diff --git a/gcc/config/rs6000/rs6000-passes.def > b/gcc/config/rs6000/rs6000-passes.def > index ca899d5f7af..9ecf8ce6a9c 100644 > --- a/gcc/config/rs6000/rs6000-passes.def > +++ b/gcc/config/rs6000/rs6000-passes.def > @@ -28,6 +28,7 @@ along with GCC; see the file COPYING3. If not see > The power8 does not have instructions that automaticaly do the byte > swaps > for loads and stores. */ > INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps); > + INSERT_PASS_AFTER (pass_ree, 1, pass_analyze_vecload); > > /* Pass to do the PCREL_OPT optimization that combines the load of an > external symbol's address along with a single load or store using that > diff --git a/gcc/config/rs6000/rs6000-protos.h > b/gcc/config/rs6000/rs6000-protos.h > index f70118ea40f..9c44bae33d3 100644 > --- a/gcc/config/rs6000/rs6000-protos.h > +++ b/gcc/config/rs6000/rs6000-protos.h > @@ -91,6 +91,7 @@ extern int mems_ok_for_quad_peep (rtx, rtx); > extern bool gpr_or_gpr_p (rtx, rtx); > extern bool direct_move_p (rtx, rtx); > extern bool quad_address_p (rtx, machine_mode, bool); > +extern bool mode_supports_dq_form (machine_mode); > extern bool quad_load_store_p (rtx, rtx); > extern bool fusion_gpr_load_p (rtx, rtx, rtx, rtx); > extern void expand_fusion_gpr_load (rtx *); > @@ -344,6 +345,7 @@ class rtl_opt_pass; > > extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *); > extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *); > +extern rtl_opt_pass *make_pass_analyze_vecload (gcc::context *); > extern bool rs6000_sum_of_two_registers_p (const_rtx expr); > extern bool rs6000_quadword_masked_address_p (const_rtx exp); > extern rtx rs6000_gen_lvx (enum machine_mode, rtx, rtx); > diff --git 
a/gcc/config/rs6000/rs6000-vecload-opt.cc > b/gcc/config/rs6000/rs6000-vecload-opt.cc > new file mode 100644 > index 00000000000..63ee733af89 > --- /dev/null > +++ b/gcc/config/rs6000/rs6000-vecload-opt.cc > @@ -0,0 +1,234 @@ > +/* Subroutines used to replace lxv with lxvp > + for p10 little-endian VSX code. > + Copyright (C) 2020-2023 Free Software Foundation, Inc. > + Contributed by Ajit Kumar Agarwal <aagar...@linux.ibm.com>. > + > + This file is part of GCC. > + > + GCC is free software; you can redistribute it and/or modify it > + under the terms of the GNU General Public License as published > + by the Free Software Foundation; either version 3, or (at your > + option) any later version. > + > + GCC is distributed in the hope that it will be useful, but WITHOUT > + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY > + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public > + License for more details. > + > + You should have received a copy of the GNU General Public License > + along with GCC; see the file COPYING3. If not see > + <http://www.gnu.org/licenses/>. */ > + > +#define IN_TARGET_CODE 1 > + > +#include "config.h" > +#include "system.h" > +#include "coretypes.h" > +#include "backend.h" > +#include "rtl.h" > +#include "tree.h" > +#include "memmodel.h" > +#include "df.h" > +#include "tm_p.h" > +#include "ira.h" > +#include "print-tree.h" > +#include "varasm.h" > +#include "explow.h" > +#include "expr.h" > +#include "output.h" > +#include "tree-pass.h" > +#include "regs.h" > +#include "rtx-vector-builder.h" > +#include "rs6000-protos.h" > + > +static inline bool > +quad_address_offset_p (HOST_WIDE_INT offset) > +{ > + return (IN_RANGE (offset, -32768, 32767) && ((offset) & 0xf) == 0); > +} > + > +/* Replace identified lxv with lxvp. 
*/ > +static void > +replace_lxv_with_lxvp (rtx_insn *insn1, rtx_insn *insn2) > +{ > + rtx body = PATTERN (insn1); > + rtx src_exp = SET_SRC (body); > + rtx dest_exp = SET_DEST (body); > + rtx lxv; > + rtx insn2_body = PATTERN (insn2); > + rtx insn2_dest_exp = SET_DEST (insn2_body); > + unsigned int regno = REGNO (dest_exp); > + > + if (regno > REGNO (insn2_dest_exp)) > + { > + df_set_regs_ever_live (REGNO (dest_exp), false); > + df_set_regs_ever_live (REGNO (insn2_dest_exp), true); > + SET_REGNO (dest_exp, REGNO (insn2_dest_exp)); > + dest_exp->used = 1; > + df_set_regs_ever_live (REGNO (insn2_dest_exp), false); > + df_set_regs_ever_live (regno, true); > + SET_REGNO (insn2_dest_exp, regno); > + insn2_dest_exp->used = 1; > + } > + rtx opnd = gen_rtx_REG (OOmode, REGNO (dest_exp)); > + PUT_MODE (src_exp, OOmode); > + lxv = gen_movoo (opnd, src_exp); > + rtx_insn *new_insn = emit_insn_before (lxv, insn1); > + set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn1)); > + df_insn_rescan (new_insn); > + > + if (dump_file) > + { > + unsigned int new_uid = INSN_UID (new_insn); > + fprintf (dump_file, "Replacing lxv %d with lxvp %d\n", > + INSN_UID (insn1), new_uid); > + } > + df_insn_delete (insn1); > + remove_insn (insn1); > + df_insn_delete (insn2); > + remove_insn (insn2); > + insn1->set_deleted (); > + insn2->set_deleted (); > +} > + > +/* Identify lxv instruction that are candidate of continguous > + addresses and replace them with mma instruction lxvp. 
*/ > +unsigned int > +rs6000_analyze_vecload (function *fun) > +{ > + basic_block bb; > + rtx_insn *insn, *curr_insn = 0; > + rtx_insn *insn1 = 0, *insn2 = 0; > + bool first_vec_insn = false; > + unsigned int offset = 0; > + unsigned int regno = 0; > + > + FOR_ALL_BB_FN (bb, fun) > + FOR_BB_INSNS_SAFE (bb, insn, curr_insn) > + { > + if (NONDEBUG_INSN_P (insn) && GET_CODE (PATTERN (insn)) == SET) > + { > + rtx set = single_set (insn); > + rtx src = SET_SRC (set); > + machine_mode mode = GET_MODE (SET_DEST (set)); > + bool dest_fp_p, dest_vmx_p, dest_vsx_p = false; > + rtx dest = SET_DEST (PATTERN (insn)); > + int dest_regno; > + > + if (REG_P (dest)) > + { > + dest_regno = REGNO (dest); > + dest_fp_p = FP_REGNO_P (dest_regno); > + dest_vmx_p = ALTIVEC_REGNO_P (dest_regno); > + dest_vsx_p = dest_fp_p | dest_vmx_p; > + } > + else > + { > + dest_regno = -1; > + dest_fp_p = dest_vmx_p = dest_vsx_p = false; > + } > + > + if (TARGET_VSX && TARGET_MMA && dest_vsx_p) > + { > + if (mode_supports_dq_form (mode) > + && dest_regno >= 0 && MEM_P (src) > + && quad_address_p (XEXP (src, 0), mode, true)) > + { > + if (first_vec_insn) > + { > + rtx addr = XEXP (src, 0); > + insn2 = insn; > + > + if (GET_CODE (addr) != PLUS) > + return false; > + > + rtx op0 = XEXP (addr, 0); > + if (!REG_P (op0) || !INT_REG_OK_FOR_BASE_P (op0, true)) > + return false; > + > + rtx op1 = XEXP (addr, 1); > + if (!CONST_INT_P (op1)) > + return false; > + > + mem_attrs attrs (*get_mem_attrs (src)); > + bool reg_attrs_found = false; > + > + if (REG_P (dest) && REG_ATTRS (dest)) > + { > + poly_int64 off = REG_ATTRS (dest)->offset; > + if (known_ge (off, 0)) > + reg_attrs_found = true; > + } > + if ((attrs.offset_known_p && known_ge (attrs.offset, 0)) > + && reg_attrs_found > + && quad_address_offset_p (INTVAL (op1)) > + && (regno == REGNO (op0)) > + && ((INTVAL (op1) - offset) == 16)) > + { > + replace_lxv_with_lxvp (insn1, insn2); > + return true; > + } > + } > + if (REG_P (XEXP (src, 0)) > + && GET_CODE 
(XEXP (src, 0)) != PLUS) > + { > + mem_attrs attrs (*get_mem_attrs (src)); > + if (attrs.offset_known_p) > + offset = attrs.offset; > + if (offset == 0 && REG_P (dest) && REG_ATTRS (dest)) > + offset = REG_ATTRS (dest)->offset; > + regno = REGNO (XEXP (src,0)); > + first_vec_insn = true; > + insn1 = insn; > + } > + } > + } > + } > + } > + return false; > +} > + > +const pass_data pass_data_analyze_vecload = > +{ > + RTL_PASS, /* type */ > + "vecload", /* name */ > + OPTGROUP_NONE, /* optinfo_flags */ > + TV_NONE, /* tv_id */ > + 0, /* properties_required */ > + 0, /* properties_provided */ > + 0, /* properties_destroyed */ > + 0, /* todo_flags_start */ > + TODO_df_finish, /* todo_flags_finish */ > +}; > + > +class pass_analyze_vecload : public rtl_opt_pass > +{ > +public: > + pass_analyze_vecload(gcc::context *ctxt) > + : rtl_opt_pass(pass_data_analyze_vecload, ctxt) > + {} > + > + /* opt_pass methods: */ > + virtual bool gate (function *) > + { > + return (optimize > 0 && TARGET_VSX); > + } > + > + virtual unsigned int execute (function *fun) > + { > + return rs6000_analyze_vecload (fun); > + } > + > + opt_pass *clone () > + { > + return new pass_analyze_vecload (m_ctxt); > + } > + > +}; // class pass_analyze_vecload > + > +rtl_opt_pass * > +make_pass_analyze_vecload (gcc::context *ctxt) > +{ > + return new pass_analyze_vecload (ctxt); > +} > + > diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc > index cc9253bb040..dba545271e0 100644 > --- a/gcc/config/rs6000/rs6000.cc > +++ b/gcc/config/rs6000/rs6000.cc > @@ -387,7 +387,7 @@ mode_supports_vmx_dform (machine_mode mode) > /* Return true if we have D-form addressing in VSX registers. This > addressing > is more limited than normal d-form addressing in that the offset must be > aligned on a 16-byte boundary. 
*/ > -static inline bool > +bool > mode_supports_dq_form (machine_mode mode) > { > return ((reg_addr[mode].addr_mask[RELOAD_REG_ANY] & RELOAD_REG_QUAD_OFFSET) > @@ -1178,6 +1178,7 @@ static bool rs6000_secondary_reload_move (enum > rs6000_reg_type, > secondary_reload_info *, > bool); > rtl_opt_pass *make_pass_analyze_swaps (gcc::context*); > +rtl_opt_pass *make_pass_analyze_vecload (gcc::context*); > > /* Hash table stuff for keeping track of TOC entries. */ > > diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 > index f183b42ce1d..da7ae26e88b 100644 > --- a/gcc/config/rs6000/t-rs6000 > +++ b/gcc/config/rs6000/t-rs6000 > @@ -47,6 +47,10 @@ rs6000-builtin.o: $(srcdir)/config/rs6000/rs6000-builtin.cc > $(COMPILE) $< > $(POSTCOMPILE) > > +rs6000-vecload-opt.o: $(srcdir)/config/rs6000/rs6000-vecload-opt.cc > + $(COMPILE) $< > + $(POSTCOMPILE) > + > build/rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.cc > build/rbtree.o: $(srcdir)/config/rs6000/rbtree.cc > > diff --git a/gcc/testsuite/g++.target/powerpc/vecload.C > b/gcc/testsuite/g++.target/powerpc/vecload.C > new file mode 100644 > index 00000000000..f1689ad6522 > --- /dev/null > +++ b/gcc/testsuite/g++.target/powerpc/vecload.C > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target powerpc_p9vector_ok } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -mmma" } */ > + > +#include <altivec.h> > + > +void > +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src) > +{ > + __vector_quad acc; > + __builtin_mma_xvf32ger(&acc, src, ptr[0]); > + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); > + *dst = acc; > +} > +/* { dg-final { scan-assembler {\mlxvp\M} } } */