Author: Pengfei Wang
Date: 2019-11-05T12:52:54-08:00
New Revision: 9a9b6492a66c3f83e58f5b4e451797b6baf7f3ea
URL: https://github.com/llvm/llvm-project/commit/9a9b6492a66c3f83e58f5b4e451797b6baf7f3ea
DIFF: https://github.com/llvm/llvm-project/commit/9a9b6492a66c3f83e58f5b4e451797b6baf7f3ea.diff

LOG: [WinEH] Allocate space in funclets stack to save XMM CSRs

Summary:
This is an alternate approach to D63396.

Currently, funclets reuse the same stack slots that are used in the parent
function for saving callee-saved XMM registers. If the parent function
modifies a callee-saved XMM register before an exception is thrown, the
catch handler will overwrite the original saved value.

This patch allocates space in the funclet's stack for saving callee-saved
XMM registers and uses RSP instead of RBP to access that memory.

Signed-off-by: Pengfei Wang <pengfei.w...@intel.com>

Reviewers: rnk, RKSimon, craig.topper, annita.zhang, LuoYuanke, andrew.w.kaylor

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D66596

Signed-off-by: Pengfei Wang <pengfei.w...@intel.com>
llvm-svn: 370005

(cherry picked from commit 564fb58a32a808c34d809820d00e2f23c0307a71)

Added: 
    llvm/test/CodeGen/X86/win64-funclet-savexmm.ll

Modified: 
    llvm/lib/Target/X86/X86FrameLowering.cpp
    llvm/lib/Target/X86/X86FrameLowering.h
    llvm/lib/Target/X86/X86MachineFunctionInfo.h
    llvm/lib/Target/X86/X86RegisterInfo.cpp
    llvm/test/CodeGen/X86/avx512-intel-ocl.ll
    llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll
    llvm/test/CodeGen/X86/x86-interrupt_cc.ll

Removed: 
    

################################################################################
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index e310fe069117..854156b2bc8e 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -1396,9 +1396,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
       int FI;
       if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
         if (X86::FR64RegClass.contains(Reg)) {
+          int Offset;
           unsigned IgnoredFrameReg;
-          int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
-          Offset += SEHFrameOffset;
+          if (IsWin64Prologue && IsFunclet)
+            Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
+          else
+            Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) +
+                     SEHFrameOffset;
 
           HasWinCFI = true;
           assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
@@ -1554,9 +1558,13 @@ X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
 
 unsigned
 X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
+  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   // This is the size of the pushed CSRs.
-  unsigned CSSize =
-      MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+  // This is the size of callee saved XMMs.
+  const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+  unsigned XMMSize = WinEHXMMSlotInfo.size() *
+                     TRI->getSpillSize(X86::VR128RegClass);
   // This is the amount of stack a funclet needs to allocate.
   unsigned UsedSize;
   EHPersonality Personality =
@@ -1576,7 +1584,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
   unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment());
   // Subtract out the size of the callee saved registers. This is how much stack
   // each funclet will allocate.
-  return FrameSizeMinusRBP - CSSize;
+  return FrameSizeMinusRBP + XMMSize - CSSize;
 }
 
 static bool isTailCallOpcode(unsigned Opc) {
@@ -1850,6 +1858,20 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
   return Offset + FPDelta;
 }
 
+int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF,
+                                              int FI, unsigned &FrameReg) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+  const auto it = WinEHXMMSlotInfo.find(FI);
+
+  if (it == WinEHXMMSlotInfo.end())
+    return getFrameIndexReference(MF, FI, FrameReg);
+
+  FrameReg = TRI->getStackRegister();
+  return alignTo(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second;
+}
+
 int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
                                                int FI, unsigned &FrameReg,
                                                int Adjustment) const {
@@ -1948,6 +1970,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 
   unsigned CalleeSavedFrameSize = 0;
+  unsigned XMMCalleeSavedFrameSize = 0;
+  auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
   int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
 
   int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
@@ -2025,12 +2049,20 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
     unsigned Size = TRI->getSpillSize(*RC);
     unsigned Align = TRI->getSpillAlignment(*RC);
     // ensure alignment
-    SpillSlotOffset -= std::abs(SpillSlotOffset) % Align;
+    assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86");
+    SpillSlotOffset = -alignTo(-SpillSlotOffset, Align);
+
     // spill into slot
     SpillSlotOffset -= Size;
     int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
     CSI[i - 1].setFrameIdx(SlotIndex);
     MFI.ensureMaxAlignment(Align);
+
+    // Save the start offset and size of XMM in stack frame for funclets.
+    if (X86::VR128RegClass.contains(Reg)) {
+      WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize;
+      XMMCalleeSavedFrameSize += Size;
+    }
   }
 
   return true;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index d32746e3a36e..c5218cc09b8a 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -99,6 +99,8 @@ class X86FrameLowering : public TargetFrameLowering {
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const override;
 
+  int getWin64EHFrameIndexRef(const MachineFunction &MF,
+                              int FI, unsigned &SPReg) const;
   int getFrameIndexReferenceSP(const MachineFunction &MF,
                                int FI, unsigned &SPReg, int Adjustment) const;
   int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index d7e535598d81..5cb80a082b56 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -36,6 +36,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   /// is stashed.
   signed char RestoreBasePointerOffset = 0;
 
+  /// WinEHXMMSlotInfo - Slot information of XMM registers in the stack frame
+  /// in bytes.
+  DenseMap<int, unsigned> WinEHXMMSlotInfo;
+
   /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
   /// stack frame in bytes.
   unsigned CalleeSavedFrameSize = 0;
 
@@ -120,6 +124,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   void setRestoreBasePointer(const MachineFunction *MF);
   int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
 
+  DenseMap<int, unsigned>& getWinEHXMMSlotInfo() { return WinEHXMMSlotInfo; }
+  const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const {
+    return WinEHXMMSlotInfo; }
+
   unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
   void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 2e2f1f9e438a..c8966dfffa0c 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -692,12 +692,27 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) {
   return true;
 }
 
+static bool isFuncletReturnInstr(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case X86::CATCHRET:
+  case X86::CLEANUPRET:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("impossible");
+}
+
 void
 X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                      int SPAdj, unsigned FIOperandNum,
                                      RegScavenger *RS) const {
   MachineInstr &MI = *II;
-  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  bool IsEHFuncletEpilogue = MBBI == MBB.end() ? false
+                                               : isFuncletReturnInstr(*MBBI);
   const X86FrameLowering *TFI = getFrameLowering(MF);
 
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
@@ -709,6 +724,8 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
            MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
           "Return instruction can only reference SP relative frame objects");
     FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0);
+  } else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) {
+    FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr);
   } else {
     FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr);
   }
diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
index defedd2a7f63..751d610c2ca7 100644
--- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -94,12 +94,12 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-64, %esp
 ; X32-NEXT:    subl $256, %esp ## imm = 0x100
-; X32-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill
+; X32-NEXT:    vmovaps %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill
 ; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl %eax, (%esp)
 ; X32-NEXT:    calll _func_float16_ptr
-; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0 ## 64-byte Folded Reload
+; X32-NEXT:    vaddps {{[-0-9]+}}(%e{{[sb]}}p), %zmm0, %zmm0 ## 64-byte Folded Reload
 ; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
 ; X32-NEXT:    movl %ebp, %esp
 ; X32-NEXT:    popl %ebp
@@ -184,110 +184,110 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; WIN64-KNL-LABEL: test_prolog_epilog:
 ; WIN64-KNL:       # %bb.0:
 ; WIN64-KNL-NEXT:    pushq %rbp
-; WIN64-KNL-NEXT:    subq $1328, %rsp # imm = 0x530
+; WIN64-KNL-NEXT:    subq $1264, %rsp # imm = 0x4F0
 ; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
-; WIN64-KNL-NEXT:    kmovw %k7, 1198(%rbp) # 2-byte Spill
-;
WIN64-KNL-NEXT: kmovw %k6, 1196(%rbp) # 2-byte Spill -; WIN64-KNL-NEXT: kmovw %k5, 1194(%rbp) # 2-byte Spill -; WIN64-KNL-NEXT: kmovw %k4, 1192(%rbp) # 2-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm21, 1104(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm20, 992(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm16, 704(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm15, 640(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm14, 576(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm13, 512(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm12, 448(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm11, 384(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm10, 320(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm9, 256(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm8, 192(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm7, 128(%rbp) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm6, 64(%rbp) # 64-byte Spill +; WIN64-KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; WIN64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; WIN64-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; WIN64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; WIN64-KNL-NEXT: andq $-64, %rsp ; WIN64-KNL-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) ; WIN64-KNL-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) ; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-KNL-NEXT: callq func_float16 -; WIN64-KNL-NEXT: vmovaps 64(%rbp), %zmm6 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 128(%rbp), %zmm7 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 192(%rbp), %zmm8 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 256(%rbp), %zmm9 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 320(%rbp), %zmm10 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 384(%rbp), %zmm11 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 448(%rbp), %zmm12 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 512(%rbp), %zmm13 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 576(%rbp), %zmm14 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 640(%rbp), %zmm15 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 704(%rbp), %zmm16 # 
64-byte Reload -; WIN64-KNL-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 992(%rbp), %zmm20 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps 1104(%rbp), %zmm21 # 64-byte Reload -; WIN64-KNL-NEXT: kmovw 1192(%rbp), %k4 # 2-byte Reload -; WIN64-KNL-NEXT: kmovw 1194(%rbp), %k5 # 2-byte Reload -; WIN64-KNL-NEXT: kmovw 1196(%rbp), %k6 # 2-byte Reload -; WIN64-KNL-NEXT: kmovw 1198(%rbp), %k7 # 2-byte Reload -; WIN64-KNL-NEXT: leaq 1200(%rbp), %rsp +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; WIN64-KNL-NEXT: leaq 1136(%rbp), %rsp ; WIN64-KNL-NEXT: popq %rbp ; WIN64-KNL-NEXT: retq ; ; WIN64-SKX-LABEL: test_prolog_epilog: ; WIN64-SKX: # %bb.0: ; WIN64-SKX-NEXT: pushq %rbp -; WIN64-SKX-NEXT: subq $1328, %rsp # imm = 0x530 +; WIN64-SKX-NEXT: subq $1264, %rsp # imm = 0x4F0 ; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rbp -; WIN64-SKX-NEXT: kmovq %k7, 1192(%rbp) # 8-byte Spill -; WIN64-SKX-NEXT: kmovq %k6, 1184(%rbp) # 8-byte Spill -; WIN64-SKX-NEXT: kmovq %k5, 1176(%rbp) # 8-byte Spill -; WIN64-SKX-NEXT: kmovq %k4, 1168(%rbp) # 8-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm21, 1056(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm20, 960(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm16, 704(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm15, 640(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm14, 576(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm13, 512(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm12, 448(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm11, 384(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm10, 320(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm9, 256(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm8, 192(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm7, 
128(%rbp) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm6, 64(%rbp) # 64-byte Spill +; WIN64-SKX-NEXT: kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; WIN64-SKX-NEXT: andq $-64, %rsp ; WIN64-SKX-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) ; WIN64-SKX-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) ; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-SKX-NEXT: callq func_float16 -; WIN64-SKX-NEXT: vmovaps 64(%rbp), %zmm6 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 128(%rbp), %zmm7 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 192(%rbp), %zmm8 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 256(%rbp), %zmm9 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 320(%rbp), %zmm10 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 384(%rbp), %zmm11 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 448(%rbp), %zmm12 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 512(%rbp), %zmm13 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 576(%rbp), %zmm14 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 640(%rbp), %zmm15 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 704(%rbp), %zmm16 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 960(%rbp), %zmm20 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 1056(%rbp), %zmm21 # 64-byte Reload -; WIN64-SKX-NEXT: kmovq 1168(%rbp), %k4 # 8-byte Reload -; WIN64-SKX-NEXT: kmovq 1176(%rbp), %k5 # 8-byte Reload -; WIN64-SKX-NEXT: kmovq 1184(%rbp), %k6 # 8-byte Reload -; WIN64-SKX-NEXT: kmovq 1192(%rbp), %k7 # 8-byte Reload -; WIN64-SKX-NEXT: leaq 1200(%rbp), %rsp +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload 
+; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload +; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 8-byte Reload +; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; WIN64-SKX-NEXT: leaq 1136(%rbp), %rsp ; WIN64-SKX-NEXT: popq %rbp ; WIN64-SKX-NEXT: retq ; @@ -296,47 +296,47 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl ; X64-KNL-NEXT: pushq %rsi ; X64-KNL-NEXT: pushq %rdi ; X64-KNL-NEXT: subq $1064, %rsp ## imm = 0x428 -; X64-KNL-NEXT: kmovw %k7, {{[0-9]+}}(%rsp) ## 2-byte Spill -; X64-KNL-NEXT: kmovw %k6, {{[0-9]+}}(%rsp) ## 2-byte Spill -; X64-KNL-NEXT: kmovw %k5, {{[0-9]+}}(%rsp) ## 2-byte Spill -; X64-KNL-NEXT: kmovw %k4, {{[0-9]+}}(%rsp) ## 2-byte Spill -; X64-KNL-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill +; X64-KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; X64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; X64-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; X64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; X64-KNL-NEXT: vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill 
+; X64-KNL-NEXT: vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill ; X64-KNL-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill ; X64-KNL-NEXT: callq _func_float16 ; X64-KNL-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload -; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k4 ## 2-byte Reload -; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k5 ## 2-byte Reload -; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k6 ## 2-byte Reload -; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k7 ## 2-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload +; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload +; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload +; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload ; X64-KNL-NEXT: addq $1064, %rsp ## imm = 0x428 ; X64-KNL-NEXT: popq %rdi ; X64-KNL-NEXT: popq %rsi @@ -346,49 +346,49 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl ; X64-SKX: ## %bb.0: ; X64-SKX-NEXT: pushq %rsi ; X64-SKX-NEXT: 
pushq %rdi -; X64-SKX-NEXT: subq $1192, %rsp ## imm = 0x4A8 -; X64-SKX-NEXT: kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill -; X64-SKX-NEXT: kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill -; X64-SKX-NEXT: kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill -; X64-SKX-NEXT: kmovq %k4, {{[0-9]+}}(%rsp) ## 8-byte Spill -; X64-SKX-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill +; X64-SKX-NEXT: subq $1064, %rsp ## imm = 0x428 +; X64-SKX-NEXT: kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-SKX-NEXT: vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill ; X64-SKX-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill ; X64-SKX-NEXT: callq _func_float16 ; X64-SKX-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload -; X64-SKX-NEXT: vmovups 
{{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload -; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k4 ## 8-byte Reload -; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload -; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload -; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload -; X64-SKX-NEXT: addq $1192, %rsp ## imm = 0x4A8 +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload +; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload +; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 8-byte Reload +; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 8-byte Reload +; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 8-byte Reload +; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 8-byte Reload +; X64-SKX-NEXT: addq $1064, %rsp ## imm = 0x428 ; X64-SKX-NEXT: popq %rdi ; X64-SKX-NEXT: popq %rsi ; X64-SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll b/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll index 1160101792ff..e8bccdabdcd4 100644 --- a/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll +++ b/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll @@ -51,3 +51,18 @@ catch: ; CHECK: popq %rbp ; CHECK: retq ; CHECK: .seh_handlerdata +; CHECK: # %catch +; CHECK: movq %rdx, 16(%rsp) +; CHECK: pushq %rbp +; CHECK: .seh_pushreg 5 +; CHECK: subq $48, %rsp +; CHECK: .seh_stackalloc 48 +; CHECK: leaq 64(%rdx), %rbp +; CHECK: movapd %xmm6, 32(%rsp) +; CHECK: .seh_savexmm 6, 32 +; CHECK: .seh_endprologue +; CHECK: movapd 32(%rsp), %xmm6 +; CHECK: leaq .LBB0_1(%rip), %rax +; CHECK: addq $48, %rsp +; CHECK: popq %rbp +; CHECK: retq # CATCHRET diff --git a/llvm/test/CodeGen/X86/win64-funclet-savexmm.ll b/llvm/test/CodeGen/X86/win64-funclet-savexmm.ll new file mode 100644 index 000000000000..62ddebb9a5a0 --- /dev/null +++ b/llvm/test/CodeGen/X86/win64-funclet-savexmm.ll @@ -0,0 +1,115 @@ +; RUN: llc -mtriple=x86_64-pc-windows-msvc -mattr=+avx < %s | FileCheck %s + +; void bar(int a, int b, int c, int d, int e); +; void baz(int x); +; +; void foo(int a, int b, int c, int d, int e) +; { +; __asm("nop" ::: "bx", "cx", "xmm5", "xmm6", "ymm7"); +; try { +; bar(a, b, c, d, e); +; } +; catch (...) 
{ +; baz(a); +; if (a) +; __asm("nop" ::: "xmm8"); +; } +; } + +%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] } + +$"??_R0H@8" = comdat any + +@"??_7type_info@@6B@" = external constant i8* +@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat + +declare dso_local i32 @__CxxFrameHandler3(...) +declare dso_local void @"?bar@@YAXHHHHH@Z"(i32, i32, i32, i32, i32) +declare dso_local void @"?baz@@YAXH@Z"(i32) + +define dso_local void @"?foo@@YAXHHHHH@Z"(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +entry: + %e.addr = alloca i32, align 4 + %d.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %a.addr = alloca i32, align 4 + store i32 %e, i32* %e.addr, align 4 + store i32 %d, i32* %d.addr, align 4 + store i32 %c, i32* %c.addr, align 4 + store i32 %b, i32* %b.addr, align 4 + store i32 %a, i32* %a.addr, align 4 + call void asm sideeffect "nop", "~{bx},~{cx},~{xmm5},~{xmm6},~{ymm7}"() + %0 = load i32, i32* %e.addr, align 4 + %1 = load i32, i32* %d.addr, align 4 + %2 = load i32, i32* %c.addr, align 4 + %3 = load i32, i32* %b.addr, align 4 + %4 = load i32, i32* %a.addr, align 4 + invoke void @"?bar@@YAXHHHHH@Z"(i32 %4, i32 %3, i32 %2, i32 %1, i32 %0) + to label %invoke.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %5 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %6 = catchpad within %5 [i8* null, i32 64, i8* null] + %7 = load i32, i32* %a.addr, align 4 + call void @"?baz@@YAXH@Z"(i32 %7) [ "funclet"(token %6) ] + %8 = load i32, i32* %a.addr, align 4 + %tobool = icmp ne i32 %8, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %catch + call void asm sideeffect "nop", "~{xmm8}"() [ "funclet"(token %6) ] + br label %if.end + +invoke.cont: ; preds = %entry + br label %try.cont + +if.end: ; preds = %if.then, %catch + catchret from %6 to label %catchret.dest + +catchret.dest: ; preds = %if.end + br label %try.cont + +try.cont: ; preds = %catchret.dest, %invoke.cont + ret void +} + +; CHECK: # %catch +; CHECK: movq %rdx, 16(%rsp) +; CHECK: pushq %rbp +; CHECK: .seh_pushreg 5 +; CHECK: pushq %rbx +; CHECK: .seh_pushreg 3 +; CHECK: subq $88, %rsp +; CHECK: .seh_stackalloc 88 +; CHECK: leaq 112(%rdx), %rbp +; CHECK: vmovaps %xmm8, 48(%rsp) +; CHECK: .seh_savexmm 8, 48 +; CHECK: vmovaps %xmm7, 64(%rsp) +; CHECK: .seh_savexmm 7, 64 +; CHECK: vmovaps %xmm6, 80(%rsp) +; CHECK: .seh_savexmm 6, 80 +; CHECK: .seh_endprologue +; CHECK: movl -{{[0-9]+}}(%rbp), %ecx +; CHECK: vmovaps 80(%rsp), %xmm6 +; CHECK: vmovaps 64(%rsp), %xmm7 +; CHECK: vmovaps 48(%rsp), %xmm8 +; CHECK: leaq .LBB0_1(%rip), %rax +; CHECK: addq $88, %rsp +; CHECK: popq %rbx +; CHECK: popq %rbp +; CHECK: retq # CATCHRET + +; CHECK-LABEL: "$handlerMap$0$?foo@@YAXHHHHH@Z": +; CHECK-NEXT: .long 64 # Adjectives +; CHECK-NEXT: .long 0 # Type +; CHECK-NEXT: .long 0 # CatchObjOffset +; CHECK-NEXT: .long "?catch$2@?0??foo@@YAXHHHHH@Z@4HA"@IMGREL # Handler +; Sum of: +; 16 RDX store offset +; 16 two pushes +; 72 stack alloc +; CHECK-NEXT: .long 120 # ParentFrameOffset + diff --git a/llvm/test/CodeGen/X86/x86-interrupt_cc.ll b/llvm/test/CodeGen/X86/x86-interrupt_cc.ll index 09f82b46c216..2043816f3a07 100644 --- a/llvm/test/CodeGen/X86/x86-interrupt_cc.ll +++ b/llvm/test/CodeGen/X86/x86-interrupt_cc.ll @@ -294,7 +294,7 @@ define x86_intrcc void @foo(i8* %frame) { ; CHECK64-SKX-NEXT: kmovq 
%k0, {{[0-9]+}}(%rsp) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x84,0x24,0x30,0x08,0x00,0x00] ; CHECK64-SKX-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0xbc,0x24,0xe0,0x07,0x00,0x00] +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x7c,0x24,0x1f] ; CHECK64-SKX-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x74,0x24,0x1e] ; CHECK64-SKX-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill @@ -398,7 +398,7 @@ define x86_intrcc void @foo(i8* %frame) { ; CHECK64-SKX-NEXT: .cfi_offset %xmm28, -448 ; CHECK64-SKX-NEXT: .cfi_offset %xmm29, -384 ; CHECK64-SKX-NEXT: .cfi_offset %xmm30, -320 -; CHECK64-SKX-NEXT: .cfi_offset %xmm31, -224 +; CHECK64-SKX-NEXT: .cfi_offset %xmm31, -256 ; CHECK64-SKX-NEXT: .cfi_offset %k0, -144 ; CHECK64-SKX-NEXT: .cfi_offset %k1, -136 ; CHECK64-SKX-NEXT: .cfi_offset %k2, -128 @@ -474,7 +474,7 @@ define x86_intrcc void @foo(i8* %frame) { ; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x74,0x24,0x1e] ; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0xbc,0x24,0xe0,0x07,0x00,0x00] +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x7c,0x24,0x1f] ; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x84,0x24,0x30,0x08,0x00,0x00] ; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload @@ -635,7 +635,7 @@ define x86_intrcc void @foo(i8* %frame) { ; CHECK32-SKX-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x84,0x24,0x30,0x02,0x00,0x00] ; CHECK32-SKX-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0xbc,0x24,0xe0,0x01,0x00,0x00] +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07] ; CHECK32-SKX-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) ## 64-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06] ; CHECK32-SKX-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) ## 64-byte Spill @@ -661,7 +661,7 @@ define x86_intrcc void @foo(i8* %frame) { ; CHECK32-SKX-NEXT: .cfi_offset %xmm4, -384 ; CHECK32-SKX-NEXT: .cfi_offset %xmm5, -320 ; CHECK32-SKX-NEXT: .cfi_offset %xmm6, -256 -; CHECK32-SKX-NEXT: .cfi_offset %xmm7, -160 +; CHECK32-SKX-NEXT: .cfi_offset %xmm7, -192 ; CHECK32-SKX-NEXT: .cfi_offset %k0, -80 ; CHECK32-SKX-NEXT: .cfi_offset %k1, -72 ; CHECK32-SKX-NEXT: .cfi_offset %k2, -64 @@ -689,7 +689,7 @@ define x86_intrcc void @foo(i8* %frame) { ; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm6 ## 64-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06] ; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm7 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0xbc,0x24,0xe0,0x01,0x00,0x00] +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07] ; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x84,0x24,0x30,0x02,0x00,0x00] ; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ## 8-byte Reload _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org 
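For readers following the log message rather than the full diff, the standalone C++ sketch below mirrors the two formulas the patch introduces: getWinEHFuncletFrameSize now adds one spill slot per callee-saved XMM register on top of the funclet's aligned stack usage, and getWin64EHFrameIndexRef addresses those slots from RSP, just above the aligned outgoing-call area, instead of reusing the parent's RBP-relative slots. The constants, helper names, and example numbers here are illustrative assumptions (16-byte stack alignment, 16-byte XMM spills), not LLVM's actual interfaces.

#include <cstdio>
#include <vector>

// Assumed constants; in LLVM they come from the target, via getStackAlignment()
// and TRI->getSpillSize(X86::VR128RegClass), not from hard-coded values.
constexpr unsigned kStackAlign = 16;   // Win64 stack alignment (assumed)
constexpr unsigned kXMMSpillSize = 16; // spill size of one XMM register (assumed)

// Round Value up to a multiple of Align (same role as llvm::alignTo).
static unsigned alignUp(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

// Mirrors the shape of getWinEHFuncletFrameSize after this patch: the pushed
// GPR CSRs (CSSize) are still not re-allocated by the funclet, but one slot
// per callee-saved XMM register now is, on top of the funclet's aligned usage.
unsigned funcletFrameSize(unsigned CSSize, unsigned UsedSize, unsigned NumXMMCSRs) {
  unsigned FrameSizeMinusRBP = alignUp(CSSize + UsedSize, kStackAlign);
  unsigned XMMSize = NumXMMCSRs * kXMMSpillSize;
  return FrameSizeMinusRBP + XMMSize - CSSize;
}

// Mirrors the shape of getWin64EHFrameIndexRef: an XMM save slot is addressed
// from RSP, just above the funclet's aligned outgoing-call area, so it never
// aliases the parent frame's RBP-relative save area.
unsigned xmmSlotOffsetFromRSP(unsigned MaxCallFrameSize, unsigned SlotOffset) {
  return alignUp(MaxCallFrameSize, kStackAlign) + SlotOffset;
}

int main() {
  // Made-up example inputs (not taken from the tests above): 16 bytes of
  // pushed GPR CSRs, 32 bytes of funclet-local usage, three XMM CSRs.
  unsigned Alloc = funcletFrameSize(/*CSSize=*/16, /*UsedSize=*/32, /*NumXMMCSRs=*/3);
  std::printf("funclet stack allocation: %u bytes\n", Alloc);

  // Offsets within the XMM save area (0, 16, 32, ...), as WinEHXMMSlotInfo
  // records them, translated to RSP-relative addresses.
  std::vector<unsigned> SlotOffsets = {0, 16, 32};
  for (unsigned Off : SlotOffsets)
    std::printf("XMM slot at %u(%%rsp)\n",
                xmmSlotOffsetFromRSP(/*MaxCallFrameSize=*/32, Off));
  return 0;
}

With these made-up numbers the funclet allocates 80 bytes and the three XMM slots land at 32, 48, and 64 bytes above RSP; the exact values in real code depend on the funclet's personality-specific UsedSize and call-frame size, as computed in the diff above.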