https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120598
>From 3a82883ef143c15cd3f213b83d3bda492c0e0d9c Mon Sep 17 00:00:00 2001
From: jofrn <jofer...@amd.com>
Date: Thu, 19 Dec 2024 11:19:39 -0500
Subject: [PATCH] [SelectionDAG][X86] Widen <2 x T> vector types for atomic
 load

Vector types of 2 elements must be widened. This change does so for
vector types of atomic load in SelectionDAG, so that it can translate
aligned vectors with more than one element. It also combines the v2
type into its equivalent scalar type to avoid a move into a vector
register.

commit-id:2894ccd1
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  6 +-
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 39 ++++++++-
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 79 ++++++++++++++++++-
 llvm/test/CodeGen/X86/atomic-load-store.ll    | 41 ++++++++++
 4 files changed, 159 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index b81c9f87cb27d7..3b3dddc44e3682 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1046,6 +1046,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
   SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N);
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
+  SDValue WidenVecRes_ATOMIC_LOAD(AtomicSDNode *N);
   SDValue WidenVecRes_LOAD(SDNode* N);
   SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N);
   SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N);
@@ -1129,8 +1130,9 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   /// resulting wider type. It takes:
   ///   LdChain: list of chains for the load to be generated.
   ///   Ld:      load to widen
-  SDValue GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
-                              LoadSDNode *LD);
+  template <typename T>
+  SDValue GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain, T *LD,
+                              bool IsAtomic = false);
 
   /// Helper function to generate a set of extension loads to load a vector with
   /// a resulting wider type. It takes:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index c85e4ba2cfa5a7..7c4caa96244b8b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4515,6 +4515,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     break;
   case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
   case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
+  case ISD::ATOMIC_LOAD:
+    Res = WidenVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N));
+    break;
   case ISD::LOAD:              Res = WidenVecRes_LOAD(N); break;
   case ISD::STEP_VECTOR:
   case ISD::SPLAT_VECTOR:
@@ -5901,6 +5904,30 @@ SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
                      N->getOperand(1), N->getOperand(2));
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_ATOMIC_LOAD(AtomicSDNode *N) {
+  SmallVector<SDValue, 16> LdChain; // Chain for the series of load
+  SDValue Result = GenWidenVectorLoads(LdChain, N, true /*IsAtomic*/);
+
+  if (Result) {
+    // If we generate a single load, we can use that for the chain. Otherwise,
+    // build a factor node to remember the multiple loads are independent and
+    // chain to that.
+    SDValue NewChain;
+    if (LdChain.size() == 1)
+      NewChain = LdChain[0];
+    else
+      NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, LdChain);
+
+    // Modified the chain - switch anything that used the old chain to use
+    // the new one.
+    ReplaceValueWith(SDValue(N, 1), NewChain);
+
+    return Result;
+  }
+
+  report_fatal_error("Unable to widen atomic vector load");
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
   ISD::LoadExtType ExtType = LD->getExtensionType();
@@ -7699,8 +7726,9 @@ static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
   return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp);
 }
 
+template <typename T>
 SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
-                                              LoadSDNode *LD) {
+                                              T *LD, bool IsAtomic) {
   // The strategy assumes that we can efficiently load power-of-two widths.
   // The routine chops the vector into the largest vector loads with the same
   // element type or scalar loads and then recombines it to the widen vector
@@ -7757,8 +7785,13 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
     } while (TypeSize::isKnownGT(RemainingWidth, NewVTWidth));
   }
 
-  SDValue LdOp = DAG.getLoad(*FirstVT, dl, Chain, BasePtr, LD->getPointerInfo(),
-                             LD->getOriginalAlign(), MMOFlags, AAInfo);
+  SDValue LdOp;
+  if (IsAtomic)
+    LdOp = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, *FirstVT, *FirstVT, Chain,
+                         BasePtr, LD->getMemOperand());
+  else
+    LdOp = DAG.getLoad(*FirstVT, dl, Chain, BasePtr, LD->getPointerInfo(),
+                       LD->getOriginalAlign(), MMOFlags, AAInfo);
   LdChain.push_back(LdOp.getValue(1));
 
   // Check if we can load the element with one instruction.
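At the IR level, the widening path amounts to roughly the following
sketch (illustrative only: the function name is invented here, and the
actual transform operates on SelectionDAG nodes during type
legalization rather than on IR):

define <2 x i16> @widen_sketch(ptr %p) {
  ; A single i32 atomic load covers both i16 lanes; the acquire
  ; ordering is carried by the one scalar load.
  %scalar = load atomic i32, ptr %p acquire, align 4
  %vec = bitcast i32 %scalar to <2 x i16>
  ret <2 x i16> %vec
}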
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fda93a2eb38745..eca5547a4307d5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2672,7 +2672,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::STRICT_FP_ROUND,
                        ISD::INTRINSIC_VOID,
                        ISD::INTRINSIC_WO_CHAIN,
-                       ISD::INTRINSIC_W_CHAIN});
+                       ISD::INTRINSIC_W_CHAIN,
+                       ISD::ATOMIC_LOAD});
 
   computeRegisterProperties(Subtarget.getRegisterInfo());
 
@@ -52242,6 +52243,81 @@ static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
   return SDValue();
 }
 
+static MVT getScalarTypeFromVectorType(MVT VT) {
+  if (VT == MVT::v2i8)
+    return MVT::i16;
+  if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16)
+    return MVT::i32;
+  if (VT == MVT::v2i32 || VT == MVT::v2f32)
+    return MVT::i64;
+  LLVM_DEBUG(dbgs() << VT << '\n');
+  llvm_unreachable("Invalid VT for scalar type translation");
+}
+
+static SDValue combineAtomicLoad(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget &Subtarget) {
+  auto &MRI = DAG.getMachineFunction().getRegInfo();
+  auto &TRI = *Subtarget.getRegisterInfo();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  auto *Ld = cast<AtomicSDNode>(N);
+  SDLoc dl(Ld);
+  EVT RegVT = Ld->getValueType(0);
+  assert(RegVT == Ld->getMemoryVT());
+  EVT OldVT = N->getValueType(0);
+
+  // If N has a vector type, then load the elements together.
+  // i.e. `v2i16 = AtomicLoad` is treated as
+  // `i32 = AtomicLoad`.
+  if (OldVT.isVector() && OldVT.getVectorNumElements() == 2) {
+    MVT VT = getScalarTypeFromVectorType(N->getValueType(0).getSimpleVT());
+
+    SDValue NewLd = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT,
+                                  Ld->getChain(), Ld->getBasePtr(),
+                                  Ld->getMemOperand());
+
+    // Find the register to copy into so that its vector register may be
+    // rewritten with a scalar register.
+    SDNode *CopyToReg = nullptr;
+    for (SDNode *User : N->users())
+      if (User->getOpcode() == ISD::CopyToReg) {
+        CopyToReg = User;
+        break;
+      }
+
+    if (CopyToReg) {
+      Register RegToCopyTo = cast<RegisterSDNode>(CopyToReg->getOperand(1))->getReg();
+
+      // Check if it is legal to replace the register.
+      const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(RegToCopyTo);
+      assert(RC);
+      unsigned PhysRegSize = TRI.getRegSizeInBits(*RC);
+      const bool IsPhysRegToReplace = Register::isPhysicalRegister(RegToCopyTo) &&
+          (PhysRegSize == VT.getSizeInBits() || RegToCopyTo == X86::XMM0);
+      const bool IsVirtRegToReplace = Register::isVirtualRegister(RegToCopyTo) &&
+          TLI.getRegClassFor(VT) == MRI.getRegClass(RegToCopyTo);
+
+      if (IsPhysRegToReplace || IsVirtRegToReplace) {
+        SDValue VecReg = CopyToReg->getOperand(1);
+        SDValue Glue = NewLd.getValue(0);
+        Register NewReg = MRI.createVirtualRegister(TLI.getRegClassFor(VT));
+        SDValue Ret = DAG.getCopyToReg(NewLd, dl, NewReg, NewLd, Glue);
+
+        // Replace the register.
+        DAG.ReplaceAllUsesOfValueWith(VecReg, DAG.getRegister(NewReg, VT));
+
+        // Replace the nodes.
+        DAG.ReplaceAllUsesOfValueWith(SDValue(CopyToReg, 1), Ret.getValue(1));
+        DAG.ReplaceAllUsesOfValueWith(SDValue(CopyToReg, 0), Ret.getValue(0));
+        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLd);
+        return NewLd;
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
@@ -59172,6 +59248,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::AVGFLOORU:      return combineAVG(N, DAG, DCI, Subtarget);
   case X86ISD::BEXTR:
   case X86ISD::BEXTRI:      return combineBEXTR(N, DAG, DCI, Subtarget);
+  case ISD::ATOMIC_LOAD:    return combineAtomicLoad(N, DAG, DCI, Subtarget);
   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
   case ISD::STORE:          return combineStore(N, DAG, DCI, Subtarget);
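With this combine, for example, a 4-byte-aligned <2 x half> atomic
load should select to a single 32-bit scalar load via the v2f16 row of
getScalarTypeFromVectorType. A hypothetical test in the style of the
ones added below (not part of this patch):

define <2 x half> @atomic_vec2_half(ptr %x) {
  ; Expected to lower to one 32-bit scalar load under this combine;
  ; the exact codegen is an assumption, not taken from the patch.
  %ret = load atomic <2 x half>, ptr %x acquire, align 4
  ret <2 x half> %ret
}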
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 39e9fdfa5e62b0..7b2e5e60eca20e 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -146,6 +146,47 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind {
   ret <1 x i64> %ret
 }
 
+define <2 x i8> @atomic_vec2_i8(ptr %x) {
+; CHECK3-LABEL: atomic_vec2_i8:
+; CHECK3:       ## %bb.0:
+; CHECK3-NEXT:    movzwl (%rdi), %eax
+; CHECK3-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_vec2_i8:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movw (%rdi), %ax
+; CHECK0-NEXT:    retq
+  %ret = load atomic <2 x i8>, ptr %x acquire, align 4
+  ret <2 x i8> %ret
+}
+
+define <2 x i16> @atomic_vec2_i16(ptr %x) {
+; CHECK-LABEL: atomic_vec2_i16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl (%rdi), %eax
+; CHECK-NEXT:    retq
+  %ret = load atomic <2 x i16>, ptr %x acquire, align 4
+  ret <2 x i16> %ret
+}
+
+define <2 x i32> @atomic_vec2_i32_align(ptr %x) {
+; CHECK-LABEL: atomic_vec2_i32_align:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    retq
+  %ret = load atomic <2 x i32>, ptr %x acquire, align 8
+  ret <2 x i32> %ret
+}
+
+define <2 x float> @atomic_vec2_float_align(ptr %x) {
+; CHECK-LABEL: atomic_vec2_float_align:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    retq
+  %ret = load atomic <2 x float>, ptr %x acquire, align 8
+  ret <2 x float> %ret
+}
+
 define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
 ; CHECK3-LABEL: atomic_vec1_ptr:
 ; CHECK3:       ## %bb.0:
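Note that each new test is naturally aligned, which is what allows a
single scalar instruction. An under-aligned variant such as the
hypothetical one below (not covered by these tests) would presumably
not take this path, since under-aligned atomic loads are generally
expanded to __atomic_* library calls before instruction selection:

define <2 x i32> @atomic_vec2_i32_underaligned(ptr %x) {
  ; align 4 is below the 8-byte size of <2 x i32>, so a single movq is
  ; not guaranteed here.
  %ret = load atomic <2 x i32>, ptr %x acquire, align 4
  ret <2 x i32> %ret
}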