Hi, I've added an assert to this patch to detect a related condition. Prior to this patch, the prologue code was causing us to incorrectly call the expand-time routine rs6000_emit_vsx_le_store(). This was harmless, but we should have caught this sooner. The patch removes this problem, but we should still be on the lookout for such behavior in the future. I'll commit this version after 5.1 is released, if there are no objections.
Thanks, Bill 2015-04-20 Bill Schmidt <wschm...@linux.vnet.ibm.com> * config/rs6000/altivec.md (*altivec_lvx_<mode>_internal): Remove asterisk from name so this can be generated directly. (*altivec_stvx_<mode>_internal): Likewise. * config/rs6000/rs6000.c (rs6000_emit_le_vsx_store): Add assert that this is never called during or after reload/lra. (rs6000_frame_related): Remove split_reg argument and logic that references it. (emit_frame_save): Remove last parameter from call to rs6000_frame_related. (rs6000_emit_prologue): Remove last parameter from eight calls to rs6000_frame_related. Force generation of stvx instruction for Altivec register saves. Remove split_reg handling, which is no longer needed. (rs6000_emit_epilogue): Force generation of lvx instruction for Altivec register restores. Index: gcc/config/rs6000/altivec.md =================================================================== --- gcc/config/rs6000/altivec.md (revision 222230) +++ gcc/config/rs6000/altivec.md (working copy) @@ -2455,7 +2455,7 @@ } }) -(define_insn "*altivec_lvx_<mode>_internal" +(define_insn "altivec_lvx_<mode>_internal" [(parallel [(set (match_operand:VM2 0 "register_operand" "=v") (match_operand:VM2 1 "memory_operand" "Z")) @@ -2478,7 +2478,7 @@ } }) -(define_insn "*altivec_stvx_<mode>_internal" +(define_insn "altivec_stvx_<mode>_internal" [(parallel [(set (match_operand:VM2 0 "memory_operand" "=Z") (match_operand:VM2 1 "register_operand" "v")) Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 222230) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -8371,6 +8371,11 @@ rs6000_emit_le_vsx_store (rtx dest, rtx source, ma { rtx tmp, permute_src, permute_tmp; + /* This should never be called during or after reload, because it does + not re-permute the source register. It is intended only for use + during expand. */ + gcc_assert (!reload_in_progress && !lra_in_progress && !reload_completed); + /* Use V2DImode to do swaps of types with 128-bit scalare parts (TImode, V1TImode). */ if (mode == TImode || mode == V1TImode) @@ -22768,7 +22773,7 @@ output_probe_stack_range (rtx reg1, rtx reg2) static rtx rs6000_frame_related (rtx insn, rtx reg, HOST_WIDE_INT val, - rtx reg2, rtx rreg, rtx split_reg) + rtx reg2, rtx rreg) { rtx real, temp; @@ -22859,11 +22864,6 @@ rs6000_frame_related (rtx insn, rtx reg, HOST_WIDE } } - /* If a store insn has been split into multiple insns, the - true source register is given by split_reg. */ - if (split_reg != NULL_RTX) - real = gen_rtx_SET (VOIDmode, SET_DEST (real), split_reg); - RTX_FRAME_RELATED_P (insn) = 1; add_reg_note (insn, REG_FRAME_RELATED_EXPR, real); @@ -22971,7 +22971,7 @@ emit_frame_save (rtx frame_reg, machine_mode mode, reg = gen_rtx_REG (mode, regno); insn = emit_insn (gen_frame_store (reg, frame_reg, offset)); return rs6000_frame_related (insn, frame_reg, frame_reg_to_sp, - NULL_RTX, NULL_RTX, NULL_RTX); + NULL_RTX, NULL_RTX); } /* Emit an offset memory reference suitable for a frame store, while @@ -23551,7 +23551,7 @@ rs6000_emit_prologue (void) insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, p)); rs6000_frame_related (insn, frame_reg_rtx, sp_off - frame_off, - treg, GEN_INT (-info->total_size), NULL_RTX); + treg, GEN_INT (-info->total_size)); sp_off = frame_off = info->total_size; } @@ -23636,7 +23636,7 @@ rs6000_emit_prologue (void) insn = emit_move_insn (mem, reg); rs6000_frame_related (insn, frame_reg_rtx, sp_off - frame_off, - NULL_RTX, NULL_RTX, NULL_RTX); + NULL_RTX, NULL_RTX); END_USE (0); } } @@ -23692,7 +23692,7 @@ rs6000_emit_prologue (void) info->lr_save_offset, DFmode, sel); rs6000_frame_related (insn, ptr_reg, sp_off, - NULL_RTX, NULL_RTX, NULL_RTX); + NULL_RTX, NULL_RTX); if (lr) END_USE (0); } @@ -23771,7 +23771,7 @@ rs6000_emit_prologue (void) SAVRES_SAVE | SAVRES_GPR); rs6000_frame_related (insn, spe_save_area_ptr, sp_off - save_off, - NULL_RTX, NULL_RTX, NULL_RTX); + NULL_RTX, NULL_RTX); } /* Move the static chain pointer back. */ @@ -23821,7 +23821,7 @@ rs6000_emit_prologue (void) info->lr_save_offset + ptr_off, reg_mode, sel); rs6000_frame_related (insn, ptr_reg, sp_off - ptr_off, - NULL_RTX, NULL_RTX, NULL_RTX); + NULL_RTX, NULL_RTX); if (lr) END_USE (0); } @@ -23837,7 +23837,7 @@ rs6000_emit_prologue (void) info->gp_save_offset + frame_off + reg_size * i); insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, p)); rs6000_frame_related (insn, frame_reg_rtx, sp_off - frame_off, - NULL_RTX, NULL_RTX, NULL_RTX); + NULL_RTX, NULL_RTX); } else if (!WORLD_SAVE_P (info)) { @@ -24160,7 +24160,7 @@ rs6000_emit_prologue (void) info->altivec_save_offset + ptr_off, 0, V4SImode, SAVRES_SAVE | SAVRES_VR); rs6000_frame_related (insn, scratch_reg, sp_off - ptr_off, - NULL_RTX, NULL_RTX, NULL_RTX); + NULL_RTX, NULL_RTX); if (REGNO (frame_reg_rtx) == REGNO (scratch_reg)) { /* The oddity mentioned above clobbered our frame reg. */ @@ -24176,7 +24176,7 @@ rs6000_emit_prologue (void) for (i = info->first_altivec_reg_save; i <= LAST_ALTIVEC_REGNO; ++i) if (info->vrsave_mask & ALTIVEC_REG_BIT (i)) { - rtx areg, savereg, mem, split_reg; + rtx areg, savereg, mem; int offset; offset = (info->altivec_save_offset + frame_off @@ -24192,20 +24192,13 @@ rs6000_emit_prologue (void) mem = gen_frame_mem (V4SImode, gen_rtx_PLUS (Pmode, frame_reg_rtx, areg)); - insn = emit_move_insn (mem, savereg); + /* Rather than emitting a generic move, force use of the stvx + instruction, which we always want. In particular we don't + want xxpermdi/stxvd2x for little endian. */ + insn = emit_insn (gen_altivec_stvx_v4si_internal (mem, savereg)); - /* When we split a VSX store into two insns, we need to make - sure the DWARF info knows which register we are storing. - Pass it in to be used on the appropriate note. */ - if (!BYTES_BIG_ENDIAN - && GET_CODE (PATTERN (insn)) == SET - && GET_CODE (SET_SRC (PATTERN (insn))) == VEC_SELECT) - split_reg = savereg; - else - split_reg = NULL_RTX; - rs6000_frame_related (insn, frame_reg_rtx, sp_off - frame_off, - areg, GEN_INT (offset), split_reg); + areg, GEN_INT (offset)); } } @@ -24847,7 +24840,10 @@ rs6000_emit_epilogue (int sibcall) mem = gen_frame_mem (V4SImode, addr); reg = gen_rtx_REG (V4SImode, i); - emit_move_insn (reg, mem); + /* Rather than emitting a generic move, force use of the + lvx instruction, which we always want. In particular + we don't want lxvd2x/xxpermdi for little endian. */ + (void) emit_insn (gen_altivec_lvx_v4si_internal (reg, mem)); } } @@ -25050,7 +25046,10 @@ rs6000_emit_epilogue (int sibcall) mem = gen_frame_mem (V4SImode, addr); reg = gen_rtx_REG (V4SImode, i); - emit_move_insn (reg, mem); + /* Rather than emitting a generic move, force use of the + lvx instruction, which we always want. In particular + we don't want lxvd2x/xxpermdi for little endian. */ + (void) emit_insn (gen_altivec_lvx_v4si_internal (reg, mem)); } } On Tue, 2015-04-07 at 11:11 -0400, David Edelsohn wrote: > On Mon, Apr 6, 2015 at 2:19 PM, Bill Schmidt > <wschm...@linux.vnet.ibm.com> wrote: > > Hi, > > > > The prologue and epilogue code to save/restore Altivec registers uses > > the generic emit_move_insn logic. This means that when VSX is available > > on a little-endian target, we will generate xxswapd/stxvd2x for prologue > > saves, and lxvd2x/xxswapd for epilogue restores. This happens too late > > to be cleaned up by swap optimization. Since the stack save slots are > > aligned, we should always use lvx and stvx for this purpose instead. > > This improves performance on LE targets, is performance-neutral for BE, > > and is always safe. > > > > This change causes the previously failing test > > gcc.target/powerpc/swaps-p8-2.c to succeed again. This test started > > failing when an unrelated change raised register pressure on the vector > > registers, causing us to generate the above save/restore sequences. The > > test was testing whether all xxswapd instructions had been removed. > > With this change, we no longer generate them for the save/restores, and > > the test succeeds. > > > > Because we no longer generate the two-instruction sequences, we no > > longer need the code in rs6000_frame_related to identify the true source > > register for the two-instruction store. I've removed that along with > > the extra argument it required, and adjusted the callers. > > > > Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no > > regressions, and one fixed failure. Is this ok for trunk after 5.1 > > branches? I'd also like to backport this as far as 4.8 for the > > performance improvement. > > > > Thanks, > > Bill > > > >