https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/74576
>From 23759746b66c33028ad2340b1e98067ebf1f8074 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin <stanislav.mekhanos...@amd.com> Date: Tue, 28 Jun 2022 15:24:24 -0700 Subject: [PATCH 1/2] [AMDGPU] GFX12: select @llvm.prefetch intrinsic --- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 21 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 22 + llvm/lib/Target/AMDGPU/SIISelLowering.h | 2 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 + llvm/lib/Target/AMDGPU/SIInstructions.td | 12 + llvm/lib/Target/AMDGPU/SMInstructions.td | 34 ++ llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 496 ++++++++++++++++++ 8 files changed, 591 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 62996a3b3fb79..f0b3ed7adc294 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 8); // M0 return; + case Intrinsic::prefetch: { + if (!Subtarget.hasPrefetch()) { + MI.eraseFromParent(); + return; + } + unsigned PtrBank = + getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID); + if (PtrBank == AMDGPU::VGPRRegBankID) { + MI.eraseFromParent(); + return; + } + // FIXME: There is currently no support for prefetch in global isel. + // There is no node equivalence and what's worse there is no MMO produced + // for a prefetch on global isel path. + // Prefetch does not affect execution so erase it for now. + MI.eraseFromParent(); + return; + } default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -4830,6 +4848,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1 break; } + case Intrinsic::prefetch: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + break; default: return getInvalidInstructionMapping(); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 94b9e49b765a6..21a9b8147034f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -828,6 +828,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasInstPrefetch() const { return getGeneration() >= GFX10; } + bool hasPrefetch() const { return GFX12Insts; } + // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspective of an arbitrary workitem, this // is 4-byte aligned. diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a7f4d63229b7e..93af38d877c5d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -763,6 +763,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); + if (Subtarget->hasPrefetch()) + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::v2i16, MVT::v2f16, MVT::i128}, @@ -3858,6 +3861,23 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); } +SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { + if (Op->isDivergent()) + return SDValue(); + + switch (cast<MemSDNode>(Op)->getAddressSpace()) { + case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + break; + default: + return SDValue(); + } + + return Op; +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch<Register>(RegName) @@ -5395,6 +5415,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSTACKSAVE(Op, DAG); case ISD::GET_ROUNDING: return lowerGET_ROUNDING(Op, DAG); + case ISD::PREFETCH: + return lowerPREFETCH(Op, DAG); } return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index c9cc149218a99..5bc091d6e84de 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -416,6 +416,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; + Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index b5b456d691254..8e96d5f8abe15 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -483,6 +483,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth( Offset = OffsetOp ? OffsetOp->getImm() : 0; // Get appropriate operand, and compute width accordingly. DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); + if (DataOpIdx == -1) + return false; Width = getOpSize(LdSt, DataOpIdx); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9362fe5d9678b..6d513fb0bdecc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3164,6 +3164,18 @@ def : GCNPat < (as_i1timm $bound_ctrl)) >; +class SMPrefetchGetPcPat<string type, int cache_type> : GCNPat < + (prefetch (i64 imm:$offset), timm, timm, (i32 cache_type)), + (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) (S_ADD_U64_PSEUDO (S_GETPC_B64), $offset), + (i32 20), (i32 SGPR_NULL), (i8 0)) + // Offset 20 should roughly adjust getpc sequence length. + > { + let AddedComplexity = 9; +} + +def : SMPrefetchGetPcPat<"INST", 0>; +def : SMPrefetchGetPcPat<"DATA", 1>; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index c18846483cf95..a77856caae7a6 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -814,6 +814,14 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL }]; } +def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return !N->getOperand(1)->isDivergent();}]> { + let GISelPredicateCode = [{ + return isInstrUniform(MI); + }]; +} + def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">; def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">; def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">; @@ -959,6 +967,32 @@ def : GCNPat < } } // let OtherPredicates = [HasShaderCyclesRegister] +def SIMM24bitPtr : ImmLeaf <iPTR, + [{return isInt<24>(Imm);}] +>; + +multiclass SMPrefetchPat<string type, int cache_type> { + def : GCNPat < + (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)), + (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0)) + >; + + def : GCNPat < + (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)), + (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0)) + >; + + def : GCNPat < + (prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)), + (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type#"_PC_REL") (as_i32timm $offset), (i32 SGPR_NULL), (i8 0)) + > { + let AddedComplexity = 10; + } +} + +defm : SMPrefetchPat<"INST", 0>; +defm : SMPrefetchPat<"DATA", 1>; + //===----------------------------------------------------------------------===// // GFX10. //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll new file mode 100644 index 0000000000000..bca76770953b9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll @@ -0,0 +1,496 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12-SDAG %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12-GISEL %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s + +; Scalar data prefetch + +define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_data_sgpr: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_sgpr: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_sgpr: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_data_sgpr_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x200, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_sgpr_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_sgpr_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr float, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_pc_rel() { +; GFX12-SDAG-LABEL: prefetch_data_pc_rel: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_pc_rel: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_pc_rel: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_pc_rel_offset() { +; GFX12-SDAG-LABEL: prefetch_data_pc_rel_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x200, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_pc_rel_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_pc_rel_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr float, ptr addrspace(4) null, i32 128 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) + ret void +} + +; Check large offsets + +define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_data_sgpr_max_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_sgpr_max_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_sgpr_max_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_data_sgpr_min_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], -0x800000, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_sgpr_min_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_sgpr_min_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_data_sgpr_too_large_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_sgpr_too_large_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_sgpr_too_large_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388608 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_pc_rel_max_offset() { +; GFX12-SDAG-LABEL: prefetch_data_pc_rel_max_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x7fffff, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_pc_rel_max_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_pc_rel_max_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) null, i32 8388607 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_pc_rel_min_offset() { +; GFX12-SDAG-LABEL: prefetch_data_pc_rel_min_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel -0x800000, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_pc_rel_min_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_pc_rel_min_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) null, i32 -8388608 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_pc_rel_too_large_offset() { +; GFX12-SDAG-LABEL: prefetch_data_pc_rel_too_large_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x14, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_pc_rel_too_large_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_pc_rel_too_large_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) null, i32 8388608 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) + ret void +} + +; Check divergent address + +define amdgpu_ps void @prefetch_data_vgpr(ptr addrspace(1) %ptr) { +; GCN-LABEL: prefetch_data_vgpr: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1) + ret void +} + +; Check LDS and Scratch, we cannot prefetch it + +define amdgpu_ps void @prefetch_data_lds(ptr addrspace(3) inreg %ptr) { +; GCN-LABEL: prefetch_data_lds: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p3(ptr addrspace(3) %ptr, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_scratch(ptr addrspace(5) inreg %ptr) { +; GCN-LABEL: prefetch_data_scratch: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p5(ptr addrspace(5) %ptr, i32 0, i32 0, i32 1) + ret void +} + +; Check supported address spaces + +define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_data_sgpr_flat: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_sgpr_flat: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_sgpr_flat: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_data_sgpr_global: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_sgpr_global: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_sgpr_global: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1) + ret void +} + +define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_data_sgpr_constant_32bit: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_mov_b32 s1, 0 +; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_data_sgpr_constant_32bit: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_data_sgpr_constant_32bit: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1) + ret void +} + +; I$ prefetch + +define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_inst_sgpr: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_inst_sgpr: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_sgpr: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_inst_sgpr_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_inst_sgpr_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_sgpr_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @prefetch_inst_pc_rel() { +; GFX12-SDAG-LABEL: prefetch_inst_pc_rel: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_inst_pc_rel: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_pc_rel: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @prefetch_inst_pc_rel_offset() { +; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x80, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_inst_pc_rel_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) null, i32 128 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) + ret void +} + +; Check large offsets + +define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_inst_sgpr_max_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x7fffff, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_inst_sgpr_max_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_sgpr_max_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_inst_sgpr_min_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], -0x800000, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_inst_sgpr_min_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_sgpr_min_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) { +; GFX12-SDAG-LABEL: prefetch_inst_sgpr_too_large_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_inst_sgpr_too_large_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388608 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @prefetch_inst_pc_rel_max_offset() { +; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_max_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x7fffff, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_inst_pc_rel_max_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_max_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) null, i32 8388607 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @prefetch_inst_pc_rel_min_offset() { +; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_min_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel -0x800000, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_inst_pc_rel_min_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_min_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) null, i32 -8388608 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @prefetch_inst_pc_rel_too_large_offset() { +; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_too_large_offset: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x14, null, 0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX11-LABEL: prefetch_inst_pc_rel_too_large_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_too_large_offset: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(4) null, i32 8388608 + tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.prefetch.pf(ptr nocapture readonly, i32, i32, i32) +declare void @llvm.prefetch.p1(ptr addrspace(1) nocapture readonly, i32, i32, i32) +declare void @llvm.prefetch.p3(ptr addrspace(3) nocapture readonly, i32, i32, i32) +declare void @llvm.prefetch.p4(ptr addrspace(4) nocapture readonly, i32, i32, i32) +declare void @llvm.prefetch.p5(ptr addrspace(5) nocapture readonly, i32, i32, i32) +declare void @llvm.prefetch.p6(ptr addrspace(6) nocapture readonly, i32, i32, i32) >From 3d56730e6607ee9e463188b0b936ef703ad85feb Mon Sep 17 00:00:00 2001 From: Mariusz Sikora <mariusz.sik...@amd.com> Date: Thu, 7 Dec 2023 14:18:59 +0100 Subject: [PATCH 2/2] Stop generating _PC_REL form --- llvm/lib/Target/AMDGPU/SMInstructions.td | 11 -- llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll | 150 ---------------------- 2 files changed, 161 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index a77856caae7a6..fb47ee62930fa 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -967,10 +967,6 @@ def : GCNPat < } } // let OtherPredicates = [HasShaderCyclesRegister] -def SIMM24bitPtr : ImmLeaf <iPTR, - [{return isInt<24>(Imm);}] ->; - multiclass SMPrefetchPat<string type, int cache_type> { def : GCNPat < (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)), @@ -981,13 +977,6 @@ multiclass SMPrefetchPat<string type, int cache_type> { (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)), (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0)) >; - - def : GCNPat < - (prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)), - (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type#"_PC_REL") (as_i32timm $offset), (i32 SGPR_NULL), (i8 0)) - > { - let AddedComplexity = 10; - } } defm : SMPrefetchPat<"INST", 0>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll index bca76770953b9..d5dcfef91923e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll @@ -43,43 +43,6 @@ entry: ret void } -define amdgpu_ps void @prefetch_data_pc_rel() { -; GFX12-SDAG-LABEL: prefetch_data_pc_rel: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x0, null, 0 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX11-LABEL: prefetch_data_pc_rel: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_pc_rel: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm -entry: - tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 1) - ret void -} - -define amdgpu_ps void @prefetch_data_pc_rel_offset() { -; GFX12-SDAG-LABEL: prefetch_data_pc_rel_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x200, null, 0 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX11-LABEL: prefetch_data_pc_rel_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_pc_rel_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm -entry: - %gep = getelementptr float, ptr addrspace(4) null, i32 128 - tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) - ret void -} - ; Check large offsets define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) { @@ -141,44 +104,6 @@ entry: ret void } -define amdgpu_ps void @prefetch_data_pc_rel_max_offset() { -; GFX12-SDAG-LABEL: prefetch_data_pc_rel_max_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x7fffff, null, 0 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX11-LABEL: prefetch_data_pc_rel_max_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_pc_rel_max_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm -entry: - %gep = getelementptr i8, ptr addrspace(4) null, i32 8388607 - tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) - ret void -} - -define amdgpu_ps void @prefetch_data_pc_rel_min_offset() { -; GFX12-SDAG-LABEL: prefetch_data_pc_rel_min_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel -0x800000, null, 0 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX11-LABEL: prefetch_data_pc_rel_min_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_data_pc_rel_min_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm -entry: - %gep = getelementptr i8, ptr addrspace(4) null, i32 -8388608 - tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) - ret void -} - define amdgpu_ps void @prefetch_data_pc_rel_too_large_offset() { ; GFX12-SDAG-LABEL: prefetch_data_pc_rel_too_large_offset: ; GFX12-SDAG: ; %bb.0: ; %entry @@ -329,43 +254,6 @@ entry: ret void } -define amdgpu_ps void @prefetch_inst_pc_rel() { -; GFX12-SDAG-LABEL: prefetch_inst_pc_rel: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x0, null, 0 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX11-LABEL: prefetch_inst_pc_rel: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_inst_pc_rel: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm -entry: - tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 0) - ret void -} - -define amdgpu_ps void @prefetch_inst_pc_rel_offset() { -; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x80, null, 0 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX11-LABEL: prefetch_inst_pc_rel_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm -entry: - %gep = getelementptr i8, ptr addrspace(4) null, i32 128 - tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) - ret void -} - ; Check large offsets define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) { @@ -427,44 +315,6 @@ entry: ret void } -define amdgpu_ps void @prefetch_inst_pc_rel_max_offset() { -; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_max_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x7fffff, null, 0 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX11-LABEL: prefetch_inst_pc_rel_max_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_max_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm -entry: - %gep = getelementptr i8, ptr addrspace(4) null, i32 8388607 - tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) - ret void -} - -define amdgpu_ps void @prefetch_inst_pc_rel_min_offset() { -; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_min_offset: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel -0x800000, null, 0 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX11-LABEL: prefetch_inst_pc_rel_min_offset: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: prefetch_inst_pc_rel_min_offset: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_endpgm -entry: - %gep = getelementptr i8, ptr addrspace(4) null, i32 -8388608 - tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) - ret void -} - define amdgpu_ps void @prefetch_inst_pc_rel_too_large_offset() { ; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_too_large_offset: ; GFX12-SDAG: ; %bb.0: ; %entry _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits