Hello Richard: On 12/08/24 5:30 pm, Richard Sandiford wrote: > Ajit Agarwal <aagar...@linux.ibm.com> writes: >> [...] >> +static void >> +update_change (set_info *set) >> +{ >> + if (!set->has_any_uses ()) >> + return; >> + >> + auto *use = *set->all_uses ().begin (); >> + do >> + { >> + auto *next_use = use->next_use (); >> + if (use->is_in_phi ()) >> + { >> + update_change (use->phi ()); >> + } >> + else >> + { >> + crtl->ssa->remove_use (use); > > This isn't right. AFAICT it's simply removing uses from a debug > instruction even though the debug instruction pattern still refers > to the register. > > Instead, the debug instruction needs to be updated to refer to the > correct half of the new double-width load result. If the update fails > for any reason, the debug insn should be "reset" (i.e. changed to > > INSN_VAR_LOCATION_LOC (use_rtl) = gen_rtx_UNKNOWN_VAR_LOC (); > > see insn_combination::substitute_debug_use in late-combine.cc for > an example). >
Addressed in v8 of the patch. >> + } >> + use = next_use; >> + } >> + while (use); >> +} >> [...] >> +// Generate new reg rtx with copy of OLD_DEST for OOmode pair. >> +static rtx >> +new_reg_rtx (rtx old_dest) >> +{ >> + rtx new_dest_exp = gen_reg_rtx (OOmode); >> + ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest); >> + REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest); >> + REG_POINTER (new_dest_exp) = REG_POINTER (old_dest); >> + REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest); >> + max_regno = max_reg_num (); >> + return new_dest_exp; >> +} > > The new register is a different size and mode from OLD_DEST, so it isn't > appropriate to copy across this much information. The caller should just > use gen_reg_rtx directly. > Addressed in v8 of the patch. >> + >> +// Set subreg with use of INSN given SRC rtx instruction. >> +static void >> +set_load_subreg (insn_info *i1, rtx src) >> +{ >> + rtx set = single_set (i1->rtl()); >> + rtx old_dest = SET_DEST (set); >> + >> + for (auto def : i1->defs ()) >> + { >> + auto set = dyn_cast<set_info *> (def); >> + for (auto use : set->nondebug_insn_uses ()) >> + { >> + insn_info *info = use->insn (); >> + if (!info || !info->rtl ()) >> + continue; >> + >> + rtx_insn *rtl_insn = info->rtl (); >> + df_ref ref; >> + >> + FOR_EACH_INSN_USE (ref, rtl_insn) >> + { >> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); >> + if (REG_P (dest_exp) >> + && DF_REF_REGNO (ref) == REGNO (dest_exp)) >> + { >> + rtx *loc = DF_REF_LOC (ref); >> + insn_propagation prop (rtl_insn, old_dest, src); >> + if (GET_CODE (*loc) == SUBREG) >> + { >> + if (!prop.apply_to_pattern (loc)) >> + { >> + if (dump_file != NULL) >> + { >> + fprintf (dump_file, >> + "Cannot propagate insn \n"); >> + print_rtl_single (dump_file, rtl_insn); > > We can't simply ignore the failure, since it would leave the instruction > referring to the wrong register. We either need to assert that the failure > can't happen (dangerous) or bail out of the load fusion if propagation fails. > Addressed in v8 of the patch. >> + } >> + return; >> + } >> + } >> + else >> + *loc = copy_rtx (src); >> + } >> + } >> + } > > We shouldn't have two ways of doing this. The above FOR_EACH_INSN_USE > loop should be replaced by something like: > > insn_propagation prop (rtl_insn, old_dest, src); > if (!prop.apply_to_pattern (loc)) > ... > > so that the substitution is done in one step, and only done using > insn_propagation. > > Like I mentioned before, this change needs to be described as an > insn_change as well, so that rtl-ssa framework will see it. This is > important because later insn movement decisions require the def-use > information to be up-to-date. > Addressed in v8 of the patch. >> + } >> +} >> + >> +// Set subreg for OO mode store pair to generate registers in pairs >> +// given insn_info I1 and I2. >> +static void >> +set_multiword_subreg_store (insn_info *i1, insn_info *i2) >> +{ >> + rtx_insn *insn1 = i1->rtl (); >> + rtx_insn *insn2 = i2->rtl (); >> + rtx body = PATTERN (insn1); >> + rtx src_exp = SET_SRC (body); >> + rtx insn2_body = PATTERN (insn2); >> + rtx insn2_dest_exp = SET_DEST (insn2_body); >> + machine_mode mode = GET_MODE (src_exp); >> + int regoff; >> + rtx src; >> + rtx addr = XEXP (insn2_dest_exp, 0); >> + >> + PUT_MODE_RAW (src_exp, OOmode); >> + if (GET_CODE (addr) == PLUS >> + && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1))) >> + regoff = 16; >> + else >> + regoff = 0; >> + >> + src = simplify_gen_subreg (mode, >> + src_exp, GET_MODE (src_exp), >> + regoff); >> + >> + set_store_subreg (i1, src, regoff); >> + >> + int regoff1 = 0; >> + rtx src1; >> + >> + src1 = simplify_gen_subreg (mode, >> + src_exp, GET_MODE (src_exp), >> + regoff1); >> + >> + set_store_subreg (i2, src1, regoff1); >> + set_rescan_store (i1); >> + set_rescan_store (i2); >> + df_insn_rescan (insn1); >> +} >> + >> +// Return true iff insn I1 has already existing subreg. >> +static bool >> +use_has_subreg_p (insn_info *i1) >> +{ >> + for (auto def : i1->defs ()) >> + { >> + auto set = dyn_cast<set_info *> (def); >> + for (auto use : set->nondebug_insn_uses ()) >> + { >> + insn_info *info = use->insn (); >> + if (info && info->rtl ()) >> + { >> + rtx_insn *rtl_insn = info->rtl (); >> + df_ref ref; >> + FOR_EACH_INSN_USE (ref, rtl_insn) >> + { >> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ())); >> + if (REG_P (dest_exp) >> + && DF_REF_REGNO (ref) == REGNO (dest_exp)) >> + { >> + rtx *loc = DF_REF_LOC (ref); >> + if (GET_CODE (*loc) == SUBREG) >> + return true; >> + } >> + } >> + } >> + } >> + } >> + return false; >> +} >> + >> +// Set subreg for OO mode pair load for existing subreg rtx to generate >> +// registers in pairs given insn_info I2 and I2. >> +static void >> +set_multiword_subreg_load (insn_info *i1, insn_info *i2) >> + >> +{ >> + rtx_insn *insn1 = i1->rtl (); >> + rtx body = PATTERN (insn1); >> + rtx dest_exp = SET_DEST (body); >> + machine_mode mode = GET_MODE (dest_exp); >> + int regoff1; >> + regoff1 = 16; >> + >> + bool subreg_p = false; >> + >> + if (use_has_subreg_p (i1)) >> + subreg_p = true; > > Like I said in the last review, I don't think we should have a separate > path for uses that involved subregs and uses that didn't. What is currently > the "subreg_p" path should be the only path. We should use it even if > there were no references involving subregs. > > We shouldn't in general change the width of an existing pseudo register, > since it would invalidate REG_ATTRS and any debug information associated > with the register. The rs6000 form of load fusion should always create > a fresh pseudo register for the double-width fused result. > Addressed in v8 of the patch. >> + >> + rtx new_dest_exp = dest_exp; >> + >> + if (subreg_p) >> + new_dest_exp = new_reg_rtx (dest_exp); >> + else >> + PUT_MODE_RAW (new_dest_exp, OOmode); >> + >> + rtx src = simplify_gen_subreg (mode, >> + new_dest_exp, >> + OOmode, >> + regoff1); >> + >> + set_load_subreg (i1, src); >> + if (subreg_p) >> + propagate_insn (i1, new_dest_exp); >> + >> + int regoff = 0; >> + rtx sset = single_set (i2->rtl ()); >> + rtx insn2_dest_exp = SET_DEST (sset); >> + machine_mode insn2_mode = GET_MODE (insn2_dest_exp); >> + >> + src = simplify_gen_subreg (insn2_mode, >> + new_dest_exp, >> + OOmode, >> + regoff); >> + >> + set_load_subreg (i2, src); >> + >> + if (subreg_p) >> + propagate_insn (i2, new_dest_exp); >> + >> + if (subreg_p) >> + { >> + auto attempt = crtl->ssa->new_change_attempt (); >> + resource_info resource = { GET_MODE (new_dest_exp), REGNO >> (new_dest_exp) }; >> + auto *set = crtl->ssa->allocate<set_info> (i1, resource); >> + if (set) >> + { >> + auto def = find_access (i1->defs (), REGNO (new_dest_exp)); >> + if (!def) >> + i1->defs() = insert_access (attempt, set, i1->defs()); >> + } > > Changes to rtl-ssa insn_infos should always happen through insn_changes, > rather than by direct modification of insns. This allows the framework > to do all the required book-keeping. E.g. the above does not extend > the function_info::m_defs array. (And it shouldn't extend that array > directly -- rtl-ssa should do that internally.) > > We already have: > > function_info::create_set > > which creates a temporary definition of something, for use in an > insn_change. I'm not sure that it handles completely new registers, > because no-one's needed that yet, but it would be a reasonable thing > to add (in function_info::change_insns). > > In summary: > > - please always use a new pseudo register for the fused load result > - please always use insn_propagation to update uses of the load results > - please create an insn_change for each use, and add the changes to the > existing list of insn_changes made by the main pair-fusion.cc code > - please don't try to change the rtl-ssa ir directly from rs6000 code; > instead, describe the new instruction using insn_changes > - once we know that the fusion is going to go ahead, we should update > all debug uses to refer to the new register half, in a similar way > to late-combine's insn_combination::substitute_optional_uses > (perhaps I should try to provide an abstraction around that). > Addressed in v8 of the patch. > Thanks, > Richard > Thanks & Regards Ajit >> + } >> + >> + set_rescan_load (i1); >> + set_rescan_load (i2); >> + df_insn_rescan (insn1); >> +} >> + >> +// Set subreg for OO mode pair to generate sequential registers given >> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false >> +// if store insn. >> +void >> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2, >> + bool load_p) >> +{ >> + if (load_p) >> + set_multiword_subreg_load (i1, i2); >> + else >> + set_multiword_subreg_store (i1, i2); >> +} >> + >> +rtx >> +rs6000_pair_fusion::gen_pair (rtx *pats, rtx, bool load_p) >> +{ >> + rtx i1 = pats[0]; >> + rtx src_exp = SET_SRC (i1); >> + rtx dest_exp = SET_DEST (i1); >> + PUT_MODE_RAW (src_exp, OOmode); >> + PUT_MODE_RAW (dest_exp, OOmode); >> + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest_exp), >> + gen_rtvec (1, src_exp), >> + UNSPEC_LXVP); >> + rtx set = gen_rtx_SET (dest_exp, unspec); >> + if (dump_file) >> + { >> + if (load_p) >> + fprintf (dump_file, "lxv with lxvp "); >> + else >> + fprintf (dump_file, "stxv with stxvp "); >> + print_rtl_single (dump_file, set); >> + } >> + return set; >> +} >> + >> +const pass_data pass_data_mem_fusion = >> +{ >> + RTL_PASS, /* type */ >> + "mem_fusion", /* name */ >> + OPTGROUP_NONE, /* optinfo_flags */ >> + TV_NONE, /* tv_id */ >> + 0, /* properties_required */ >> + 0, /* properties_provided */ >> + 0, /* properties_destroyed */ >> + 0, /* todo_flags_start */ >> + TODO_df_finish, /* todo_flags_finish */ >> +}; >> + >> +class pass_mem_fusion : public rtl_opt_pass >> +{ >> +public: >> + pass_mem_fusion (gcc::context *ctxt) >> + : rtl_opt_pass (pass_data_mem_fusion, ctxt) >> + {} >> + >> + opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);} >> + >> + /* opt_pass methods: */ >> + bool gate (function *) >> + { >> + return (optimize > 0 && TARGET_VSX && TARGET_POWER10); >> + } >> + >> + unsigned int execute (function *) final override >> + { >> + rs6000_pair_fusion pass; >> + pass.run (); >> + return 0; >> + } >> +}; // class pass_mem_fusion >> + >> +rtl_opt_pass * >> +make_pass_mem_fusion (gcc::context *ctxt) >> +{ >> + return new pass_mem_fusion (ctxt); >> +} >> diff --git a/gcc/config/rs6000/rs6000-passes.def >> b/gcc/config/rs6000/rs6000-passes.def >> index 46a0d0b8c56..0b48f57014d 100644 >> --- a/gcc/config/rs6000/rs6000-passes.def >> +++ b/gcc/config/rs6000/rs6000-passes.def >> @@ -28,7 +28,9 @@ along with GCC; see the file COPYING3. If not see >> The power8 does not have instructions that automaticaly do the byte >> swaps >> for loads and stores. */ >> INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps); >> - >> + /* Pass to replace adjacent memory addresses lxv/stxv instruction with >> + lxvp/stxvp instruction. */ >> + INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion); >> /* Pass to do the PCREL_OPT optimization that combines the load of an >> external symbol's address along with a single load or store using that >> address as a base register. */ >> diff --git a/gcc/config/rs6000/rs6000-protos.h >> b/gcc/config/rs6000/rs6000-protos.h >> index b40557a8557..5295ff66950 100644 >> --- a/gcc/config/rs6000/rs6000-protos.h >> +++ b/gcc/config/rs6000/rs6000-protos.h >> @@ -342,6 +342,7 @@ namespace gcc { class context; } >> class rtl_opt_pass; >> >> extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *); >> +extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *); >> extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *); >> extern bool rs6000_sum_of_two_registers_p (const_rtx expr); >> extern bool rs6000_quadword_masked_address_p (const_rtx exp); >> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >> index 5ed64b1e686..449d9e446d5 100644 >> --- a/gcc/config/rs6000/rs6000.cc >> +++ b/gcc/config/rs6000/rs6000.cc >> @@ -27442,7 +27442,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> reg_mode = word_mode; >> reg_mode_size = GET_MODE_SIZE (reg_mode); >> >> - gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode)); >> + gcc_assert (mode == OOmode >> + || reg_mode_size * nregs == GET_MODE_SIZE (mode)); >> >> /* TDmode residing in FP registers is special, since the ISA requires that >> the lower-numbered word of a register pair is always the most >> significant >> @@ -27489,6 +27490,11 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> int reg_mode_nregs = hard_regno_nregs (reg, reg_mode); >> if (MEM_P (dst)) >> { >> + rtx addr = XEXP (dst, 0); >> + rtx opnd1 = NULL_RTX; >> + if (addr && GET_CODE (addr) == PLUS) >> + opnd1 = XEXP (addr,1); >> + >> unsigned offset = 0; >> unsigned size = GET_MODE_SIZE (reg_mode); >> >> @@ -27502,7 +27508,13 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> { >> unsigned subreg >> = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); >> - rtx dst2 = adjust_address (dst, reg_mode, offset); >> + rtx dst2 = dst; >> + >> + if ((GET_CODE (addr) != PLUS >> + || (opnd1 && CONST_INT_P(opnd1)))) >> + dst2 = adjust_address (dst, reg_mode, offset); >> + else >> + PUT_MODE_RAW (dst, reg_mode); >> rtx src2 = gen_rtx_REG (reg_mode, reg + subreg); >> offset += size; >> emit_insn (gen_rtx_SET (dst2, src2)); >> @@ -27513,15 +27525,25 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> >> if (MEM_P (src)) >> { >> + rtx addr = XEXP (src, 0); >> + rtx opnd1 = NULL_RTX; >> + if (addr && GET_CODE (addr) == PLUS) >> + opnd1 = XEXP (addr,1); >> + >> unsigned offset = 0; >> unsigned size = GET_MODE_SIZE (reg_mode); >> >> - for (int i = 0; i < nregs; i += reg_mode_nregs) >> + for (int i = nregs-1; i >= 0; i -= reg_mode_nregs) >> { >> unsigned subreg >> = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i); >> rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg); >> - rtx src2 = adjust_address (src, reg_mode, offset); >> + rtx src2 = src; >> + >> + if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1)))) >> + src2 = adjust_address (src, reg_mode, offset); >> + else >> + PUT_MODE_RAW (src2, reg_mode); >> offset += size; >> emit_insn (gen_rtx_SET (dst2, src2)); >> } >> @@ -27529,7 +27551,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> /* If we are writing an accumulator register, we have to >> prime it after we've written it. */ >> if (TARGET_MMA >> - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) >> + && REG_P (dst) && GET_MODE (dst) == XOmode >> + && FP_REGNO_P (REGNO (dst))) >> emit_insn (gen_mma_xxmtacc (dst, dst)); >> >> return; >> @@ -27622,9 +27645,12 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> { >> for (i = nregs - 1; i >= 0; i--) >> { >> - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); >> - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); >> - emit_insn (gen_rtx_SET (dst_i, src_i)); >> + if (REG_P (dst) && REG_P (src)) >> + { >> + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i); >> + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i); >> + emit_insn (gen_rtx_SET (dst_i, src_i)); >> + } >> } >> } >> else >> @@ -27639,7 +27665,8 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> /* If we are writing an accumulator register, we have to >> prime it after we've written it. */ >> if (TARGET_MMA >> - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst))) >> + && REG_P (dst) && GET_MODE (dst) == XOmode >> + && FP_REGNO_P (REGNO (dst))) >> emit_insn (gen_mma_xxmtacc (dst, dst)); >> } >> else >> @@ -27696,7 +27723,7 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> >> /* If the base register we are using to address memory is >> also a destination reg, then change that register last. */ >> - if (REG_P (breg) >> + if (REG_P (dst) && REG_P (breg) >> && REGNO (breg) >= REGNO (dst) >> && REGNO (breg) < REGNO (dst) + nregs) >> j = REGNO (breg) - REGNO (dst); >> @@ -27794,9 +27821,12 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> /* XO/OO are opaque so cannot use subregs. */ >> if (mode == OOmode || mode == XOmode ) >> { >> - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); >> - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); >> - emit_insn (gen_rtx_SET (dst_i, src_i)); >> + if (REG_P (dst) && REG_P (src)) >> + { >> + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j); >> + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j); >> + emit_insn (gen_rtx_SET (dst_i, src_i)); >> + } >> } >> else >> emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode, >> @@ -27814,7 +27844,9 @@ rs6000_split_multireg_move (rtx dst, rtx src) >> if (restore_basereg != NULL_RTX) >> emit_insn (restore_basereg); >> } >> + return; >> } >> + >> >> /* Return true if the peephole2 can combine a load involving a combination >> of >> an addis instruction and a load with an offset that can be fused >> together on >> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md >> index f59be536570..99f070ee24d 100644 >> --- a/gcc/config/rs6000/rs6000.md >> +++ b/gcc/config/rs6000/rs6000.md >> @@ -159,6 +159,7 @@ >> UNSPEC_XXSPLTIW_CONST >> UNSPEC_FMAX >> UNSPEC_FMIN >> + UNSPEC_LXVP >> ]) >> >> ;; >> diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 >> index 155788de40a..a052163eec4 100644 >> --- a/gcc/config/rs6000/t-rs6000 >> +++ b/gcc/config/rs6000/t-rs6000 >> @@ -34,6 +34,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc >> $(COMPILE) $< >> $(POSTCOMPILE) >> >> +rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc >> + $(COMPILE) $< >> + $(POSTCOMPILE) >> + >> + >> rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc >> $(COMPILE) $< >> $(POSTCOMPILE) >> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc >> index 31d2c21c88f..b1521253474 100644 >> --- a/gcc/pair-fusion.cc >> +++ b/gcc/pair-fusion.cc >> @@ -312,9 +312,9 @@ static int >> encode_lfs (lfs_fields fields) >> { >> int size_log2 = exact_log2 (fields.size); >> - gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4); >> - return ((int)fields.load_p << 3) >> - | ((int)fields.fpsimd_p << 2) >> + gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9); >> + return ((int)fields.load_p << 4) >> + | ((int)fields.fpsimd_p << 3) >> | (size_log2 - 2); >> } >> >> @@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields) >> static lfs_fields >> decode_lfs (int lfs) >> { >> - bool load_p = (lfs & (1 << 3)); >> - bool fpsimd_p = (lfs & (1 << 2)); >> + bool load_p = (lfs & (1 << 4)); >> + bool fpsimd_p = (lfs & (1 << 3)); >> unsigned size = 1U << ((lfs & 3) + 2); >> return { load_p, fpsimd_p, size }; >> } >> @@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool >> load_p, rtx mem) >> if (MEM_VOLATILE_P (mem)) >> return; >> >> + if (load_p && !m_pass->fuseable_load_p (insn)) >> + return; >> + >> // Ignore writeback accesses if the hook says to do so. >> if (!m_pass->should_handle_writeback (writeback_type::EXISTING) >> && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC) >> @@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, >> } >> >> rtx reg_notes = combine_reg_notes (first, second, load_p); >> - >> + m_pass->set_multiword_subreg (i1, i2, load_p); >> rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p); >> insn_change *pair_change = nullptr; >> auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) { >> @@ -1828,18 +1831,18 @@ pair_fusion_bb_info::fuse_pair (bool load_p, >> changes.safe_push (make_delete (first)); >> pair_change = make_change (second); >> changes.safe_push (pair_change); >> - >> pair_change->move_range = move_range; >> pair_change->new_defs = merge_access_arrays (attempt, >> input_defs[0], >> input_defs[1]); >> + m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes); >> gcc_assert (pair_change->new_defs.is_valid ()); >> - >> pair_change->new_uses >> = merge_access_arrays (attempt, >> drop_memory_access (input_uses[0]), >> drop_memory_access (input_uses[1])); >> gcc_assert (pair_change->new_uses.is_valid ()); >> + m_pass->set_uses (i1, i2, &pair_change, &attempt); >> set_pair_pat (pair_change); >> } >> else >> @@ -1872,6 +1875,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, >> store_def, >> change->new_defs); >> gcc_assert (change->new_defs.is_valid ()); >> + m_pass->set_defs (i1, i2, &change, &attempt); >> change->move_range = move_range; >> pair_change = change; >> break; >> @@ -1908,6 +1912,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p, >> change->new_defs = insert_access (attempt, new_set, >> change->new_defs); >> gcc_assert (change->new_defs.is_valid ()); >> + m_pass->set_defs (i1, i2, &change, &attempt); >> pair_change = change; >> break; >> } >> @@ -2405,6 +2410,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, >> unsigned access_size, >> reg_ops[i] = XEXP (pats[i], !load_p); >> } >> >> + if (!load_p && !m_pass->fuseable_store_p (i1, i2)) >> + { >> + if (dump_file) >> + fprintf (dump_file, >> + "punting on store-mem-pairs due to non fuseable cand >> (%d,%d)\n", >> + insns[0]->uid (), insns[1]->uid ()); >> + return false; >> + } >> + >> if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1])) >> { >> if (dump_file) >> @@ -2997,6 +3011,8 @@ void pair_fusion::process_block (bb_info *bb) >> if (GET_CODE (pat) != SET) >> continue; >> >> + change_existing_multword_mode (rti); >> + >> if (track_stores && MEM_P (XEXP (pat, 0))) >> bb_state.track_access (insn, false, XEXP (pat, 0)); >> else if (track_loads && MEM_P (XEXP (pat, 1))) >> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h >> index 45e4edceecb..b3af49de892 100644 >> --- a/gcc/pair-fusion.h >> +++ b/gcc/pair-fusion.h >> @@ -26,8 +26,12 @@ namespace rtl_ssa { >> class insn_info; >> class insn_range_info; >> class bb_info; >> + class insn_change; >> + class insn_range_info; >> } >> >> +class obstack_watermark; >> + >> // Information about a potential base candidate, used in try_fuse_pair. >> // There may be zero, one, or two viable RTL bases for a given pair. >> struct base_cand >> @@ -142,6 +146,19 @@ struct pair_fusion { >> // true iff INSN is a load pair. >> virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0; >> >> + // Given INSN change multiword mode load and store to respective >> + // unspec instruction. >> + virtual void change_existing_multword_mode (rtx_insn *insn) = 0; >> + >> + // Given INSN and watermark ATTEMPT and PAIR_CHANGE sets the >> + // new rtx with INSN. Remove all uses of definition that are >> + // removed given CHANGES. >> + virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first, >> + obstack_watermark *attempt, >> + rtl_ssa::insn_change **pair_change, >> + auto_vec<rtl_ssa::insn_change *> &changes) >> + = 0; >> + >> // Return true if we should track loads. >> virtual bool track_loads_p () >> { >> @@ -171,6 +188,37 @@ struct pair_fusion { >> virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem, >> rtx regs[2], bool load_p) = 0; >> >> + // Given insn_info pair I1 and I2, sets subreg with multiword registers >> + // to assign register pairs by allocators. >> + // LOAD_P is true iff the pair is a load. >> + virtual void set_multiword_subreg (rtl_ssa::insn_info *i1, >> + rtl_ssa::insn_info *i2, >> + bool load_p) = 0; >> + >> + // Given insn_info pair I1 and I2, sets uses in PAIR_CHANGE. >> + virtual void set_uses (rtl_ssa::insn_info *i1, >> + rtl_ssa::insn_info *i2, >> + rtl_ssa::insn_change **pair_change, >> + obstack_watermark *attempt) = 0; >> + >> + // Given insn_info pair I1 and I2, sets defss in PAIR_CHANGE. >> + virtual void set_defs (rtl_ssa::insn_info *i1, >> + rtl_ssa::insn_info *i2, >> + rtl_ssa::insn_change **pair_change, >> + obstack_watermark *attempt) = 0; >> + >> + >> + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform >> + // store mem pairs. >> + // Return true if feasible to perform store mem pairs otherwise false. >> + virtual bool fuseable_store_p (rtl_ssa::insn_info *i1, >> + rtl_ssa::insn_info *i2) = 0; >> + >> + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform >> + // load mem pairs. >> + // Return true if feasible to perform load mem pairs otherwise false. >> + virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0; >> + >> void process_block (rtl_ssa::bb_info *bb); >> rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2], >> const rtl_ssa::insn_range_info >> diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h >> index 8be04f1aa96..27a732b7353 100644 >> --- a/gcc/rtl-ssa/functions.h >> +++ b/gcc/rtl-ssa/functions.h >> @@ -222,6 +222,13 @@ public: >> template<typename T, typename... Ts> >> T *change_alloc (obstack_watermark &wm, Ts... args); >> >> + auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; } >> + >> + template<typename T, typename... Ts> >> + T *allocate (Ts... args); >> + >> + void remove_use (use_info *); >> + >> private: >> class bb_phi_info; >> class build_info; >> @@ -231,9 +238,6 @@ private: >> // allocate_temp during its lifetime. >> obstack_watermark temp_watermark () { return &m_temp_obstack; } >> >> - template<typename T, typename... Ts> >> - T *allocate (Ts... args); >> - >> template<typename T, typename... Ts> >> T *allocate_temp (Ts... args); >> >> @@ -269,7 +273,6 @@ private: >> static void insert_use_after (use_info *, use_info *); >> >> void add_use (use_info *); >> - void remove_use (use_info *); >> >> insn_info::order_node *need_order_node (insn_info *); >> >> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C >> b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C >> new file mode 100644 >> index 00000000000..d10ff0cdf36 >> --- /dev/null >> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C >> @@ -0,0 +1,22 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target power10_ok } */ >> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ >> + >> +#include <altivec.h> >> + >> +void >> +foo2 () >> +{ >> + __vector_quad *dst1; >> + __vector_quad *dst2; >> + vector unsigned char src; >> + __vector_quad acc; >> + vector unsigned char *ptr; >> + __builtin_mma_xvf32ger(&acc, src, ptr[0]); >> + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); >> + *dst1 = acc; >> + __builtin_mma_xvf32ger(&acc, src, ptr[2]); >> + __builtin_mma_xvf32gerpp(&acc, src, ptr[3]); >> + *dst2 = acc; >> +} >> +/* { dg-final { scan-assembler {\mlxvp\M} } } */ >> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C >> b/gcc/testsuite/g++.target/powerpc/mem-fusion.C >> new file mode 100644 >> index 00000000000..c523572cf3c >> --- /dev/null >> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C >> @@ -0,0 +1,15 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target power10_ok } */ >> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ >> + >> +#include <altivec.h> >> + >> +void >> +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char >> src) >> +{ >> + __vector_quad acc; >> + __builtin_mma_xvf32ger(&acc, src, ptr[0]); >> + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]); >> + *dst = acc; >> +} >> +/* { dg-final { scan-assembler {\mlxvp\M} } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c >> b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c >> index 69ee826e1be..ae29127f954 100644 >> --- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c >> +++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c >> @@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t >> *vec) >> dst[13] = acc; >> } >> >> -/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */ >> -/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */ >> +/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */ >> +/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */ >> /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */ >> /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */ >> /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */