Author: Jeffrey Byrnes
Date: 2022-10-13T10:48:41-07:00
New Revision: ec1747cb71d0db73b268d17367b83652cd4e2ad3
URL: https://github.com/llvm/llvm-project/commit/ec1747cb71d0db73b268d17367b83652cd4e2ad3
DIFF: https://github.com/llvm/llvm-project/commit/ec1747cb71d0db73b268d17367b83652cd4e2ad3.diff

LOG: Able to produce a good initial SelectionDAG for ret; resolved extract_subvector legalization and able to build test.ll

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Removed: 
    

################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index e62f57c536b37..629e7b84cf71d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -971,12 +971,16 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
            TargetLowering::TypeLegal &&
          "Unexpected illegal type!");
 
-  for (const SDValue &Op : Node->op_values())
+  for (const SDValue &Op : Node->op_values()) {
+    if (TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) ==
+        TargetLowering::TypeLegal) errs() << "TargetLowering::TypeLegal\n";
+    if (Op.getOpcode() == ISD::Register) errs() << "Register\n";
     assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) ==
                 TargetLowering::TypeLegal ||
             Op.getOpcode() == ISD::TargetConstant ||
             Op.getOpcode() == ISD::Register) &&
            "Unexpected illegal type!");
+  }
 #endif
 
   // Figure out the correct action; the way to query this varies by opcode
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index b6c66077675ff..523788106db63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -22,28 +22,28 @@ def CC_SI_Gfx : CallingConv<[
   // 32 is reserved for the stack pointer
   // 33 is reserved for the frame pointer
   // 34 is reserved for the base pointer
-  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, v4i8] , CCAssignToReg<[
    SGPR4, SGPR5, SGPR6, SGPR7,
    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29
  ]>>>,
 
-  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, v4i8] , CCAssignToReg<[
    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
  ]>>>,
 
-  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
+  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, v4i8, i1], CCAssignToStack<4, 4>>
 ]>;
 
 def RetCC_SI_Gfx : CallingConv<[
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
-  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, v4i8] , CCAssignToReg<[
    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -66,7 +66,7 @@ def RetCC_SI_Gfx : CallingConv<[
 
 def CC_SI_SHADER : CallingConv<[
 
-  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, v4i8] , CCAssignToReg<[
    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -76,7 +76,7 @@ def CC_SI_SHADER : CallingConv<[
   ]>>>,
 
   // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
-  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, v4i8] , CCAssignToReg<[
    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -99,7 +99,7 @@ def CC_SI_SHADER : CallingConv<[
 
 def RetCC_SI_Shader : CallingConv<[
   CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
-  CCIfType<[i32, i16] , CCAssignToReg<[
+  CCIfType<[i32, i16, v4i8] , CCAssignToReg<[
    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -183,19 +183,19 @@ def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
-  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, v4i8, i1], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
     VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
-  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
+  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, v4i8, i1], CCAssignToStack<4, 4>>
 ]>;
 
 // Calling convention for leaf functions
 def RetCC_AMDGPU_Func : CallingConv<[
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
-  CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, v4i8], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9c2247f336ee1..9980e851f9820 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -322,7 +322,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
        MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v16f16, MVT::v16i16,
        MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64,
        MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64,
-       MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
+       MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64, MVT::v2i8,
+       MVT::v4i8},
       Custom);
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
@@ -1246,6 +1247,15 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
   SmallVector<SDValue, 8> Args;
   EVT VT = Op.getValueType();
 
+  if (VT == MVT::v4i8) {
+    SDLoc SL(Op);
+    SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(0));
+    SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
+    SDValue BV = DAG.getBuildVector(MVT::v2i16, SL, { Lo, Hi });
+    return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+  }
+
+
   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
     SDLoc SL(Op);
     SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
@@ -1270,6 +1280,36 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
   EVT SrcVT = Op.getOperand(0).getValueType();
 
   // For these types, we have some TableGen patterns except if the index is 1
+  if ((SrcVT == MVT::v4i8 && VT == MVT::v2i8) &&
+      Start != 1) {
+    SDLoc SL(Op);
+    SDValue Vec = Op.getOperand(0);
+    SDValue Idx = Op.getOperand(1);
+    unsigned VecSize = SrcVT.getSizeInBits();
+    EVT EltVT = SrcVT.getVectorElementType();
+    unsigned EltSize = EltVT.getSizeInBits();
+    SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
+
+    MVT IntVT = MVT::getIntegerVT(VecSize);
+    MVT RestIntVT = MVT::getIntegerVT(VT.getSizeInBits());
+
+    // Convert vector index to bit-index (* EltSize)
+    SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
+
+    // Convert source vector to corresponding scalar
+    SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+
+    // Shift to get the appropriate bits for subvector
+    SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
+
+    // Trunc to bitsize of result vector of extract_subvector
+    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, RestIntVT, Elt);
+
+    SDValue Recast = DAG.getNode(ISD::BITCAST, SL, VT, Result);
+    return Recast;
+  }
+
+
   if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
        (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
       Start != 1)
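The new v4i8-to-v2i8 EXTRACT_SUBVECTOR path above reduces the extract to plain integer arithmetic: bitcast the source vector to an integer, shift right by the bit index (element index times element size), truncate to the result width, and bitcast back. A minimal standalone sketch of that arithmetic (illustrative only, not part of the patch; it assumes element 0 lives in the low bits, as the DAG lowering above does):

#include <cstdint>
#include <cstdio>

// Extract two consecutive i8 elements starting at index Idx (0 or 2) from a
// packed 4 x i8 value, mirroring the SHL/SRL/TRUNCATE sequence built above.
static uint16_t extractSub2xi8(uint32_t PackedV4i8, unsigned Idx) {
  const unsigned EltSizeBits = 8;      // i8 element width
  unsigned BitIdx = Idx * EltSizeBits; // vector index scaled to a bit index
  return static_cast<uint16_t>(PackedV4i8 >> BitIdx); // shift, then truncate
}

int main() {
  uint32_t V = 0x44332211u;                        // <0x11, 0x22, 0x33, 0x44> as v4i8
  printf("%04x\n", extractSub2xi8(V, 0));          // 2211 -> elements <0x11, 0x22>
  printf("%04x\n", extractSub2xi8(V, 2));          // 4433 -> elements <0x33, 0x44>
  return 0;
}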
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6e0478ed2f166..f962e49418c50 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -83,6 +83,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
 
+  addRegisterClass(MVT::v4i8, &AMDGPU::SReg_32RegClass);
+  addRegisterClass(MVT::v2i8, &AMDGPU::SReg_32RegClass);
+
   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
 
   const SIRegisterInfo *TRI = STI.getRegisterInfo();
@@ -173,20 +176,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
-  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
-  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
+  //setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
+  //setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
-  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
-  setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
+  //setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+  //setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
   setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
   setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
   setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
   setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
   setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
-  setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
+  //setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
   setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
   setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
   setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
@@ -242,7 +245,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
        MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64,
        MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64,
        MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
-       MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) {
+       MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32, MVT::v4i8}) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -607,7 +610,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   if (!Subtarget->hasVOP3PInsts())
     setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom);
-
+
+  setOperationAction(ISD::BUILD_VECTOR, MVT::v2i8, Custom);
+
   setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
   // This isn't really legal, but this avoids the legalizer unrolling it (and
   // allows matching fneg (fabs x) patterns)
@@ -645,7 +650,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::VECTOR_SHUFFLE,
                      {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
-                      MVT::v16f16, MVT::v16i16},
+                      MVT::v16f16, MVT::v16i16, MVT::v4i8},
                      Custom);
 
   for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
@@ -826,8 +831,10 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
         return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
       return VT.isInteger() ? MVT::i32 : MVT::f32;
     }
+    if (Size == 8)
+      return Subtarget->has16BitInsts() ? MVT::v4i8 : MVT::i32;
 
-    if (Size < 16)
+    if (Size < 16 && Size != 8)
       return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
     return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
   }
@@ -850,6 +857,9 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
     unsigned Size = ScalarVT.getSizeInBits();
 
     // FIXME: Should probably promote 8-bit vectors to i16.
+    if (Size == 8 && Subtarget->has16BitInsts())
+      return (NumElts + 1) / 4;
+
     if (Size == 16 && Subtarget->has16BitInsts())
       return (NumElts + 1) / 2;
 
@@ -872,6 +882,12 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
     unsigned NumElts = VT.getVectorNumElements();
     EVT ScalarVT = VT.getScalarType();
     unsigned Size = ScalarVT.getSizeInBits();
+    if (Size == 8 && Subtarget->has16BitInsts()) {
+      RegisterVT = MVT::v4i8;
+      NumIntermediates = (NumElts + 1) / 4;
+      IntermediateVT = RegisterVT;
+      return NumIntermediates;
+    }
     // FIXME: We should fix the ABI to be the same on targets without 16-bit
     // support, but unless we can properly handle 3-vectors, it will be still be
     // inconsistent.
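The calling-convention hooks above make 8-bit vector arguments travel four elements per 32-bit register (v4i8), the same way 16-bit elements travel two per register. A rough standalone sketch of the intended mapping (illustrative only; the helper below is not LLVM API, and its rounding simply mirrors the (NumElts + 1) / 4 formula in the patch):

#include <cstdio>

// Mirrors the patch's getNumRegistersForCallingConv / getVectorTypeBreakdown
// choices for 8-bit and 16-bit scalar element sizes.
static unsigned numArgRegisters(unsigned EltBits, unsigned NumElts,
                                bool Has16BitInsts) {
  if (EltBits == 8 && Has16BitInsts)
    return (NumElts + 1) / 4; // packed into v4i8 registers
  if (EltBits == 16 && Has16BitInsts)
    return (NumElts + 1) / 2; // packed into v2i16/v2f16 registers
  return NumElts;             // otherwise one 32-bit register per element
}

int main() {
  printf("v4i8  -> %u register(s)\n", numArgRegisters(8, 4, true));   // 1
  printf("v8i8  -> %u register(s)\n", numArgRegisters(8, 8, true));   // 2
  printf("v4i16 -> %u register(s)\n", numArgRegisters(16, 4, true));  // 2
  return 0;
}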
@@ -5857,8 +5873,16 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
   EVT ResultVT = Op.getValueType();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
 
-  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
-  EVT EltVT = PackVT.getVectorElementType();
+  EVT PackVT;
+  EVT EltVT;
+  auto ScalarSize = ResultVT.getVectorElementType().getSizeInBits() ;
+  if (ScalarSize == 8) {
+    PackVT = MVT::v2i8;
+  }
+  else {
+    PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+  }
+  EltVT = PackVT.getVectorElementType();
   int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
 
   // vector_shuffle <0,1,6,7> lhs, rhs
@@ -5969,32 +5993,56 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
     return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
   }
 
-  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
-  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
+  if (VT != MVT::v2i8) {
+    assert(VT == MVT::v2f16 || VT == MVT::v2i16);
+    assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
+  }
+
+
+  EVT SrcVT = Op.getOperand(1).getValueType(); // i8, i16
+  EVT BCVT = (SrcVT) == MVT::f16 ? MVT::i16 : SrcVT;
+
+  unsigned VecSize = VT.getSizeInBits(); // 16, 32
+  EVT EltVT = SrcVT.getVectorElementType();
+  unsigned EltSize = EltVT.getSizeInBits();
+
+  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
+  // Convert vector index to bit-index (* EltSize)
+  SDValue ScaledShift = DAG.getNode(ISD::SHL, SL, MVT::i32, DAG.getConstant(1, SL, MVT::i32), ScaleFactor);
+
+  MVT IntVT = MVT::getIntegerVT(VecSize); // i16, i32
+  MVT FloatVT = MVT::getFloatingPointVT(VecSize); // f32
+  MVT RestIntVT = MVT::getIntegerVT(VT.getSizeInBits());
+
+
   SDValue Lo = Op.getOperand(0);
   SDValue Hi = Op.getOperand(1);
 
   // Avoid adding defined bits with the zero_extend.
   if (Hi.isUndef()) {
-    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
-    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
+    Lo = DAG.getNode(ISD::BITCAST, SL, BCVT, Lo);
+    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, IntVT, Lo);
     return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
   }
 
-  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
-  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
+  Hi = DAG.getNode(ISD::BITCAST, SL, BCVT, Hi);
+  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Hi);
 
-  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
-                              DAG.getConstant(16, SL, MVT::i32));
+  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, IntVT, Hi, ScaledShift);
 
   if (Lo.isUndef())
     return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
 
-  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
-  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+  Lo = DAG.getNode(ISD::BITCAST, SL, BCVT, Lo);
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Lo);
 
-  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
-  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
+  SDValue Or = DAG.getNode(ISD::OR, SL, IntVT, Lo, ShlHi);
+  auto temp = DAG.getNode(ISD::BITCAST, SL, VT, Or);
+  errs() << "Build Final node : \n";
+  temp->dump();
+  errs() << "\n";
+  return temp;
 }
 
 bool
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8100d82d21f30..d88272fc485c2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1313,6 +1313,9 @@ def : BitConvert <f16, i16, VGPR_32>;
 def : BitConvert <i16, f16, SReg_32>;
 def : BitConvert <f16, i16, SReg_32>;
 
+def : BitConvert <v2i8, i16, SReg_32>;
+def : BitConvert <i16, v2i8, SReg_32>;
+
 // 32-bit bitcast
 def : BitConvert <i32, f32, VGPR_32>;
 def : BitConvert <f32, i32, VGPR_32>;
@@ -1329,6 +1332,9 @@ def : BitConvert <f32, v2f16, SReg_32>;
 def : BitConvert <v2i16, f32, SReg_32>;
 def : BitConvert <f32, v2i16, SReg_32>;
 
+def : BitConvert <v4i8, i32, SReg_32>;
+def : BitConvert <v4i8, v2i16, SReg_32>;
+
 // 64-bit bitcast
 def : BitConvert <i64, f64, VReg_64>;
 def : BitConvert <f64, i64, VReg_64>;
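The generalized lowerBUILD_VECTOR above packs a two-element vector by zero-extending the low element, shifting the high element left by the element size, and ORing the halves before bitcasting to the vector type. A standalone sketch of that same packing for two i8 elements (illustrative only, not part of the patch):

#include <cstdint>
#include <cstdio>

// Pack two i8 elements into a 16-bit value, mirroring the
// ZERO_EXTEND / SHL / OR sequence built in lowerBUILD_VECTOR.
static uint16_t buildVector2xi8(uint8_t Lo, uint8_t Hi) {
  const unsigned EltSizeBits = 8;                             // element width in bits
  uint16_t ZextLo = Lo;                                       // zero-extend low half
  uint16_t ShlHi = static_cast<uint16_t>(Hi) << EltSizeBits;  // shift high half into place
  return static_cast<uint16_t>(ZextLo | ShlHi);               // combine; bitcast to v2i8
}

int main() {
  printf("%04x\n", buildVector2xi8(0x11, 0x22)); // prints 2211
  return 0;
}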
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index fae76be2b1ddb..c07333b17ff37 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -369,7 +369,7 @@ def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
 }
 
 // SGPR 32-bit registers
-def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
                             (add (sequence "SGPR%u", 0, 105))> {
   // Give all SGPR classes higher priority than VGPR classes, because
   // we want to spill SGPRs to VGPRs.
@@ -406,7 +406,7 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s"
 def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;
 
 // Trap handler TMP 32-bit registers
-def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
+def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16, v4i8, v2i8], 32,
                             (add (sequence "TTMP%u", 0, 15))> {
   let isAllocatable = 0;
   let HasSGPR = 1;
@@ -528,7 +528,7 @@ class RegisterTypes<list<ValueType> reg_types> {
 }
 
 def Reg16Types : RegisterTypes<[i16, f16]>;
-def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
+def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v4i8, v2i8, p2, p3, p5, p6]>;
 
 let HasVGPR = 1 in {
 def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
@@ -600,7 +600,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
 }
 
 // AccVGPR 32-bit registers
-def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
                             (add (sequence "AGPR%u", 0, 255))> {
   let AllocationPriority = 0;
   let Size = 32;
@@ -639,7 +639,7 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
 
 //  Register classes used as source and destination
 //===----------------------------------------------------------------------===//
 
-def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
   (add FP_REG, SP_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
@@ -662,7 +662,7 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
 let GeneratePressureSet = 0, HasSGPR = 1 in {
 // Subset of SReg_32 without M0 for SMRD instructions and alike.
 // See comments in SIInstructions.td for more info.
-def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i1], 32,
   (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
    SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE,
    SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
@@ -680,7 +680,7 @@ def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16,
   let AllocationPriority = 0;
 }
 
-def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i1], 32,
   (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
   let AllocationPriority = 0;
 }
@@ -691,7 +691,7 @@ def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16,
   let AllocationPriority = 0;
 }
 
-def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i1], 32,
   (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
   let AllocationPriority = 0;
 }
@@ -710,20 +710,20 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
 } // End GeneratePressureSet = 0
 
 // Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i1], 32,
   (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
   let AllocationPriority = 0;
   let HasSGPR = 1;
 }
 
 let GeneratePressureSet = 0 in {
-def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
   (add SReg_32, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
   let HasSGPR = 1;
 }
 
-def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
+def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4i8, v2i8], 32,
   (add SGPR_64Regs)> {
   let CopyCost = 1;
   let AllocationPriority = 1;
@@ -807,7 +807,7 @@ defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Re
 defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
 }
 
-def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
                                    (add VGPR_32, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
   let HasVGPR = 1;
@@ -887,14 +887,14 @@ def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> {
   let HasVGPR = 1;
 }
 
-def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
                             (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
   let HasVGPR = 1;
   let HasSGPR = 1;
 }
 
-def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
                             (add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
   let HasVGPR = 1;