On Fri, Jun 07, 2013 at 05:48:05PM -0700, Tom Stellard wrote:
> On Fri, Jun 07, 2013 at 05:24:42PM +0200, Michel Dänzer wrote:
> >
> > The most important difference to the previous version of these is that
> > whole quad mode is now enabled and M0 initialized appropriately for the
> > LDS instructions, which now allows all of the relevant piglit tests to
> > pass.
> >
>
> Hi Michel,
>
> After I gave this series my r-b, I was reviewing your Mesa patches, and
> I suddenly had an idea for a better way to implement this. See my
> comments below:
>
> > From bb5adcd52cc5cadc308e85f635675199f5c02f35 Mon Sep 17 00:00:00 2001
> > From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daen...@amd.com>
> > Date: Thu, 21 Feb 2013 17:56:22 +0100
> > Subject: [PATCH 3/3] R600/SI: Support AMDGPU.ddx/y intrinsics
> > MIME-Version: 1.0
> > Content-Type: text/plain; charset=UTF-8
> > Content-Transfer-Encoding: 8bit
> >
> > Use LDS for calculating the deltas between neighbouring pixels.
> >
> > Signed-off-by: Michel Dänzer <michel.daen...@amd.com>
> > ---
> >  lib/Target/R600/SIISelLowering.cpp | 77 +++++++++++++++++++++++++++++++++++++-
> >  lib/Target/R600/SIISelLowering.h   |  6 +++
> >  lib/Target/R600/SIInstructions.td  | 42 ++++++++++++++++++++-
> >  3 files changed, 121 insertions(+), 4 deletions(-)
> >
> > diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> > index ac6a4c3..7ea226a 100644
> > --- a/lib/Target/R600/SIISelLowering.cpp
> > +++ b/lib/Target/R600/SIISelLowering.cpp
> > @@ -249,7 +249,7 @@ SDValue SITargetLowering::LowerFormalArguments(
> >
> >  MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
> >      MachineInstr * MI, MachineBasicBlock * BB) const {
> > -
> > +  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
> >    MachineBasicBlock::iterator I = *MI;
> >
> >    switch (MI->getOpcode()) {
> > @@ -257,7 +257,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
> >      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
> >    case AMDGPU::BRANCH: return BB;
> >    case AMDGPU::SI_ADDR64_RSRC: {
> > -    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
> >      unsigned SuperReg = MI->getOperand(0).getReg();
> >      unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
> >      unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
> > @@ -282,10 +281,84 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
> >      MI->eraseFromParent();
> >      break;
> >    }
> > +  case AMDGPU::SI_DD:
> > +    LowerSI_DD(MI, *BB, I, MRI);
> > +    break;
> > +  case AMDGPU::SI_TID:
> > +    LowerSI_TID(MI, *BB, I, MRI);
> > +    break;
> >    }
> >    return BB;
> >  }
> >
> > +void SITargetLowering::LowerSI_DD(MachineInstr *MI, MachineBasicBlock &BB,
> > +    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
> > +  unsigned coord0 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> > +  unsigned coord1 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> > +  MachineOperand dst = MI->getOperand(0);
> > +  MachineOperand coord = MI->getOperand(1);
> > +  MachineOperand ldsaddr = MI->getOperand(2);
> > +  MachineOperand ldsaddr0 = MI->getOperand(3);
> > +  MachineOperand ldsdelta = MI->getOperand(4);
> > +
> > +  // Write this thread's coordinate to LDS
> > +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_WRITE_B32))
> > +    .addOperand(coord)
> > +    .addImm(0) // LDS
> > +    .addOperand(ldsaddr)
> > +    .addOperand(coord)
> > +    .addOperand(coord)
> > +    .addImm(0)
> > +    .addImm(0);
> > +
> > +  // Read top right / bottom left thread's coordinate from LDS
> > +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_READ_B32), coord0)
> > +    .addImm(0) // LDS
> > +    .addOperand(ldsaddr0)
> > +    .addOperand(ldsaddr0)
> > +    .addOperand(ldsaddr0)
> > +    .addOperand(ldsdelta)
> > +    .addImm(0);
> > +
> > +  // Read top left thread's coordinate from LDS
> > +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::DS_READ_B32), coord1)
> > +    .addImm(0) // LDS
> > +    .addOperand(ldsaddr0)
> > +    .addOperand(ldsaddr0)
> > +    .addOperand(ldsaddr0)
> > +    .addImm(0)
> > +    .addImm(0);
> > +
> > +  // Subtract top left coordinate from top right / bottom left
> > +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_SUB_F32_e32))
> > +    .addOperand(dst)
> > +    .addReg(coord0)
> > +    .addReg(coord1);
> > +
> > +  MI->eraseFromParent();
> > +}
> > +
> > +void SITargetLowering::LowerSI_TID(MachineInstr *MI, MachineBasicBlock &BB,
> > +    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
> > +  unsigned mbcnt_lo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> > +  MachineOperand dst = MI->getOperand(0);
> > +
> > +  // Get this thread's ID
> > +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64), mbcnt_lo)
> > +    .addImm(0xffffffff)
> > +    .addImm(0)
> > +    .addImm(0)
> > +    .addImm(0)
> > +    .addImm(0)
> > +    .addImm(0);
> > +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e32))
> > +    .addOperand(dst)
> > +    .addImm(0xffffffff)
> > +    .addReg(mbcnt_lo);
> > +
> > +  MI->eraseFromParent();
> > +}
> > +
> >  EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
> >    return MVT::i1;
> >  }
> >
> > diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
> > index 9b263b9..1a6538d 100644
> > --- a/lib/Target/R600/SIISelLowering.h
> > +++ b/lib/Target/R600/SIISelLowering.h
> > @@ -26,6 +26,12 @@ class SITargetLowering : public AMDGPUTargetLowering {
> >
> >    SDValue LowerParameter(SelectionDAG &DAG, EVT VT, SDLoc DL,
> >                           SDValue Chain, unsigned Offset) const;
> > +
> > +  void LowerSI_DD(MachineInstr *MI, MachineBasicBlock &BB,
> > +                  MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
> > +  void LowerSI_TID(MachineInstr *MI, MachineBasicBlock &BB,
> > +                   MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
> > +
> >    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
> >    SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
> >    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
> > diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> > index d8fbf3e..1126729 100644
> > --- a/lib/Target/R600/SIInstructions.td
> > +++ b/lib/Target/R600/SIInstructions.td
> > @@ -903,8 +903,8 @@ defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
> >  defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
> >  defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
> >  //defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
> > -//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
> > -//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
> > +defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
> > +defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
> >
> >  let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
> >  defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
> > @@ -1110,6 +1110,24 @@ def LOAD_CONST : AMDGPUShaderInst <
> >    [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
> >  >;
> >
> > +let usesCustomInserter = 1 in {
> > +
> > +def SI_DD : InstSI <
> > +  (outs VReg_32:$dst),
> > +  (ins VReg_32:$src, VReg_32:$lds_addr, VReg_32:$lds_addr0, i8imm:$ldsdelta),
> > +  "SI_DD $src, $lds_addr, $lds_addr0, $ldsdelta",
> > +  []
> > +>;
> > +
> > +def SI_TID : InstSI <
> > +  (outs VReg_32:$dst),
> > +  (ins),
> > +  "SI_TID",
> > +  []
> > +>;
> > +
> > +} // end usesCustomInserter
> > +
> >  // SI Psuedo instructions. These are used by the CFG structurizer pass
> >  // and should be lowered to ISA instructions prior to codegen.
> >
> > @@ -1544,6 +1562,26 @@ def : Pat <
> >                          sub3)
> >  >;
> >
> > +class DDXY <Intrinsic name, bits<4> ldsdelta> : Pat <
> > +  (name v4f32:$src, imm, imm, imm),
> > +  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
> > +    (SI_DD (EXTRACT_SUBREG $src, sub0), (V_LSHLREV_B32_e32 2, (SI_TID)),
> > +           (V_AND_B32_e32 0xfffffff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
> > +           ldsdelta), sub0),
> > +    (SI_DD (EXTRACT_SUBREG $src, sub1), (V_LSHLREV_B32_e32 2, (SI_TID)),
> > +           (V_AND_B32_e32 0xfffffff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
> > +           ldsdelta), sub1),
> > +    (SI_DD (EXTRACT_SUBREG $src, sub2), (V_LSHLREV_B32_e32 2, (SI_TID)),
> > +           (V_AND_B32_e32 0xfffffff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
> > +           ldsdelta), sub2),
> > +    (SI_DD (EXTRACT_SUBREG $src, sub3), (V_LSHLREV_B32_e32 2, (SI_TID)),
> > +           (V_AND_B32_e32 0xfffffff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
> > +           ldsdelta), sub3)
> > +>;
>
> Based on this pattern, I don't think you need to use a ddx/ddy intrinsic
> here. All of the instructions you are lowering DDX/DDY to have an
> equivalent LLVM IR instruction or LLVM intrinsic.
>
> For the DS_READ and DS_WRITE instructions, all you need to do is emit
> loads/stores to the local address space and then add patterns for those
> in the backend. As an added bonus, this will add support for OpenCL
> local address spaces. I think the rest of the instructions are pretty
> straightforward (unless I've overlooked something). Let me know if you
> have any questions.
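
To make the local-store idea a bit more concrete, here is a rough, untested
IRBuilder sketch of what a frontend could emit instead of a ddx/ddy
intrinsic. The helper name emitDD, the byte-level LDS layout and the use of
address space 3 for LDS are illustrative assumptions rather than anything in
the patch, the lane id (Tid) is taken as an input, and the exact IRBuilder
spellings vary between LLVM versions. Like the patch, it relies on the lanes
of a quad running in lockstep in whole quad mode, so no barrier is needed
between the store and the loads:

// Sketch only: express ddx/ddy as plain local-address-space memory
// operations plus an fsub, so the backend only has to match the
// loads/stores to DS_READ_B32/DS_WRITE_B32.
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// 'Coord' is this lane's value, 'Tid' its lane id within the wavefront,
// 'DeltaBytes' is 4 for ddx or 8 for ddy (mirroring the ldsdelta operand).
// Address space 3 is assumed to be the target's local (LDS) address space.
static Value *emitDD(IRBuilder<> &B, Value *Coord, Value *Tid,
                     unsigned DeltaBytes) {
  Type *F32 = B.getFloatTy();
  PointerType *LDSPtrTy = PointerType::get(F32, /*AddressSpace=*/3);

  // Each lane stores its value at LDS byte offset Tid * 4.
  Value *ByteOff = B.CreateShl(Tid, 2, "lds.off");
  B.CreateStore(Coord, B.CreateIntToPtr(ByteOff, LDSPtrTy, "lds.self"));

  // The quad's top-left lane sits at the offset with the low bits masked
  // off; its ddx/ddy neighbour is DeltaBytes further on.
  Value *QuadBase = B.CreateAnd(ByteOff, 0xfffffff0, "quad.base");
  Value *TopLeft = B.CreateLoad(
      F32, B.CreateIntToPtr(QuadBase, LDSPtrTy), "topleft");
  Value *Neighbour = B.CreateLoad(
      F32,
      B.CreateIntToPtr(B.CreateAdd(QuadBase, B.getInt32(DeltaBytes)),
                       LDSPtrTy),
      "neighbour");

  // ddx/ddy = neighbour - top left, which is what the SI_DD expansion
  // above computes.
  return B.CreateFSub(Neighbour, TopLeft, "dd");
}

The DS instructions would then come from ordinary load/store selection
patterns on that address space, which is also what OpenCL local memory
support needs.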

I did overlook something. You will need to add an intrinsic for thread id
in order to implement ddx/ddy completely in LLVM IR, but I still think it
is the best way.

-Tom

> > +
> > +def : DDXY<int_AMDGPU_ddx, 4>;
> > +def : DDXY<int_AMDGPU_ddy, 8>;
> > +
> >  def : Pat <
> >    (i32 (sext i1:$src0)),
> >    (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
> > --
> > 1.8.3
> >
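
To make that thread-id gap concrete, the sketch below shows one possible
shape for the IR-building side. The intrinsic name llvm.SI.tid is made up
for illustration; a real implementation would register a target intrinsic
and fetch it with Intrinsic::getDeclaration(), and lower it to the
V_MBCNT_LO_U32_B32 / V_MBCNT_HI_U32_B32 pair that the SI_TID expansion
quoted above uses. It would supply the Tid value the earlier emitDD()
sketch takes as a parameter:

// Sketch only: declare and call a hypothetical thread-id intrinsic so that
// ddx/ddy can be expressed entirely in LLVM IR.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static Value *emitThreadID(IRBuilder<> &B, Module &M) {
  // i32 @llvm.SI.tid(): hypothetical name; with a registered target
  // intrinsic this would go through Intrinsic::getDeclaration() instead.
  FunctionType *FTy = FunctionType::get(B.getInt32Ty(), /*isVarArg=*/false);
  FunctionCallee Tid = M.getOrInsertFunction("llvm.SI.tid", FTy);
  return B.CreateCall(Tid, {}, "tid");
}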