================
@@ -5386,6 +5386,153 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+                                         MachineInstr &MI,
+                                         Intrinsic::ID IID) const {
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
+
+  Register Src1, Src2;
+  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+    Src1 = MI.getOperand(3).getReg();
+    if (IID == Intrinsic::amdgcn_writelane) {
+      Src2 = MI.getOperand(4).getReg();
+    }
+  }
+
+  LLT Ty = MRI.getType(DstReg);
+  unsigned Size = Ty.getSizeInBits();
+
+  if (Size == 32) {
+    if (Ty.isScalar())
+      // Already legal
+      return true;
+
+    Register Src0Valid = B.buildBitcast(S32, Src0).getReg(0);
+    MachineInstrBuilder LaneOpDst;
+    switch (IID) {
+    case Intrinsic::amdgcn_readfirstlane: {
+      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid);
+      break;
+    }
+    case Intrinsic::amdgcn_readlane: {
+      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1);
+      break;
+    }
+    case Intrinsic::amdgcn_writelane: {
+      Register Src2Valid = B.buildBitcast(S32, Src2).getReg(0);
+      LaneOpDst = B.buildIntrinsic(IID, {S32})
+                      .addUse(Src0Valid)
+                      .addUse(Src1)
+                      .addUse(Src2Valid);
+    }
+    }
+
+    Register LaneOpDstReg = LaneOpDst.getReg(0);
+    B.buildBitcast(DstReg, LaneOpDstReg);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if (Size < 32) {
+    Register Src0Cast = MRI.getType(Src0).isScalar()
+                            ? Src0
+                            : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
+    Register Src0Valid = B.buildAnyExt(S32, Src0Cast).getReg(0);
+
+    MachineInstrBuilder LaneOpDst;
+    switch (IID) {
+    case Intrinsic::amdgcn_readfirstlane: {
+      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid);
+      break;
+    }
+    case Intrinsic::amdgcn_readlane: {
+      LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0Valid).addUse(Src1);
+      break;
+    }
+    case Intrinsic::amdgcn_writelane: {
+      Register Src2Cast =
+          MRI.getType(Src2).isScalar()
+              ? Src2
+              : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
+      Register Src2Valid = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
+      LaneOpDst = B.buildIntrinsic(IID, {S32})
+                      .addUse(Src0Valid)
+                      .addUse(Src1)
+                      .addUse(Src2Valid);
+    }
+    }
+
+    Register LaneOpDstReg = LaneOpDst.getReg(0);
+    if (Ty.isScalar())
+      B.buildTrunc(DstReg, LaneOpDstReg);
+    else {
+      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDstReg);
+      B.buildBitcast(DstReg, Trunc);
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if ((Size % 32) == 0) {
+    SmallVector<Register, 2> PartialRes;
+    unsigned NumParts = Size / 32;
+    auto Src0Parts = B.buildUnmerge(S32, Src0);
----------------
arsenm wrote:
For types that are a multiple of `<2 x s16>`, it's a bit nicer to preserve the 16-bit element types

https://github.com/llvm/llvm-project/pull/89217
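A minimal sketch of what "preserve the 16-bit element types" could look like in the `(Size % 32) == 0` path, assuming a 16-bit-element vector type. This is illustrative, not code from the PR: the `V2S16` piece type is a local assumption, and only the `readfirstlane` operand shape is handled to keep it short.

```cpp
// Hypothetical sketch (not from the PR): unmerge a 16-bit-element vector
// into <2 x s16> pieces, so each 32-bit piece fed to the lane op still
// carries its element type, instead of bitcasting the whole value to an
// sN scalar and unmerging raw s32 pieces.
if (Ty.isVector() && Ty.getElementType() == LLT::scalar(16)) {
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto Unmerge = B.buildUnmerge(V2S16, Src0); // Size/32 pieces of <2 x s16>
  SmallVector<Register, 4> PartialRes;
  for (unsigned I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) {
    // Bitcast each <2 x s16> piece to s32 only for the intrinsic itself.
    Register Piece32 = B.buildBitcast(S32, Unmerge.getReg(I)).getReg(0);
    Register LaneOp =
        B.buildIntrinsic(IID, {S32}).addUse(Piece32).getReg(0);
    // Cast back so the merged result is still a vector of s16 elements.
    PartialRes.push_back(B.buildBitcast(V2S16, LaneOp).getReg(0));
  }
  B.buildMergeLikeInstr(DstReg, PartialRes);
  MI.eraseFromParent();
  return true;
}
```

Splitting as `<2 x s16>` keeps the element type visible to later combines, rather than round-tripping the whole value through an anonymous scalar.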