[llvm] [clang] [flang] [clang-tools-extra] [openmp] [mlir] [libcxx] [lldb] [libc] GlobalISel: Guide return in llvm::getIConstantSplatVal (PR #71989)
jayfoad wrote: Typo in subject "**Guard** return ..."? https://github.com/llvm/llvm-project/pull/71989 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [AMDGPU] Revert "Preliminary patch for divergence driven instruction selection. Operands Folding 1." (PR #71710)
https://github.com/jayfoad ready_for_review https://github.com/llvm/llvm-project/pull/71710 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [clang] [AMDGPU] Revert "Preliminary patch for divergence driven instruction selection. Operands Folding 1." (PR #71710)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/71710 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [lldb] [mlir] [libcxx] [openmp] [flang] [libcxxabi] [compiler-rt] [clang] [AMDGPU] Add GFX12 encoding for VINTERP instructions (PR #74616)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/74616 >From 69580e5f77514fecf0aabe2a80c98881f9bd7288 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 7 Feb 2023 16:27:27 + Subject: [PATCH 1/2] [AMDGPU] Add GFX12 encoding for VINTERP instructions --- .../Disassembler/AMDGPUDisassembler.cpp | 6 +- llvm/lib/Target/AMDGPU/VINTERPInstructions.td | 38 ++- llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s | 187 ++--- .../AMDGPU/gfx12_dasm_vinterp.txt | 251 ++ 4 files changed, 378 insertions(+), 104 deletions(-) create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vinterp.txt diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 3175f6358a045..c37af739e2019 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -782,9 +782,13 @@ DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 || MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx12 || MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 || - MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) { + MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx12 || + MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx12) { // The MCInst has this field that is not directly encoded in the // instruction. 
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel); diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td index 7d03150bf5b11..fc563b7493adf 100644 --- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td @@ -10,7 +10,7 @@ // VINTERP encoding //===--===// -class VINTERPe_gfx11 op, VOPProfile P> : Enc64 { +class VINTERPe : Enc64 { bits<8> vdst; bits<4> src0_modifiers; bits<9> src0; @@ -31,7 +31,6 @@ class VINTERPe_gfx11 op, VOPProfile P> : Enc64 { let Inst{13}= !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2) let Inst{14}= !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel(3) let Inst{15}= clamp; - let Inst{22-16} = op; let Inst{40-32} = src0; let Inst{49-41} = src1; let Inst{58-50} = src2; @@ -40,6 +39,14 @@ class VINTERPe_gfx11 op, VOPProfile P> : Enc64 { let Inst{63}= src2_modifiers{0}; // neg(2) } +class VINTERPe_gfx11 op, VOPProfile P> : VINTERPe { + let Inst{22-16} = op; +} + +class VINTERPe_gfx12 op, VOPProfile P> : VINTERPe { + let Inst{20-16} = op{4-0}; +} + //===--===// // VOP3 VINTERP //===--===// @@ -171,17 +178,28 @@ defm : VInterpF16Pat op> { +multiclass VINTERP_Real_gfx11 op> { + let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { def _gfx11 : VINTERP_Real(NAME), SIEncodingFamily.GFX11>, VINTERPe_gfx11(NAME).Pfl>; } } -defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11<0x000>; -defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11<0x001>; -defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11<0x002>; -defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11<0x003>; -defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x004>; -defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x005>; +multiclass VINTERP_Real_gfx12 op> { + let AssemblerPredicate = isGFX12Only, DecoderNamespace = "GFX12" in { +def _gfx12 : + VINTERP_Real(NAME), SIEncodingFamily.GFX12>, + VINTERPe_gfx12(NAME).Pfl>; + } +} + +multiclass 
VINTERP_Real_gfx11_gfx12 op> : + VINTERP_Real_gfx11, VINTERP_Real_gfx12; + +defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11_gfx12<0x000>; +defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11_gfx12<0x001>; +defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x002>; +defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x003>; +defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x004>; +defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x005>; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s index e2e53776783f3..fdfbf65c0e3cf 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s @@ -1,277 +1,278 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck -check-p
[flang] [clang] [libcxxabi] [lld] [lldb] [mlir] [llvm] [clang-tools-extra] [openmp] [compiler-rt] [libcxx] [AMDGPU] Add GFX12 encoding for VINTERP instructions (PR #74616)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/74616 >From 69580e5f77514fecf0aabe2a80c98881f9bd7288 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 7 Feb 2023 16:27:27 + Subject: [PATCH 1/2] [AMDGPU] Add GFX12 encoding for VINTERP instructions --- .../Disassembler/AMDGPUDisassembler.cpp | 6 +- llvm/lib/Target/AMDGPU/VINTERPInstructions.td | 38 ++- llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s | 187 ++--- .../AMDGPU/gfx12_dasm_vinterp.txt | 251 ++ 4 files changed, 378 insertions(+), 104 deletions(-) create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vinterp.txt diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 3175f6358a045..c37af739e2019 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -782,9 +782,13 @@ DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 || MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx12 || MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 || - MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) { + MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx12 || + MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx12) { // The MCInst has this field that is not directly encoded in the // instruction. 
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel); diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td index 7d03150bf5b11..fc563b7493adf 100644 --- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td @@ -10,7 +10,7 @@ // VINTERP encoding //===--===// -class VINTERPe_gfx11 op, VOPProfile P> : Enc64 { +class VINTERPe : Enc64 { bits<8> vdst; bits<4> src0_modifiers; bits<9> src0; @@ -31,7 +31,6 @@ class VINTERPe_gfx11 op, VOPProfile P> : Enc64 { let Inst{13}= !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2) let Inst{14}= !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel(3) let Inst{15}= clamp; - let Inst{22-16} = op; let Inst{40-32} = src0; let Inst{49-41} = src1; let Inst{58-50} = src2; @@ -40,6 +39,14 @@ class VINTERPe_gfx11 op, VOPProfile P> : Enc64 { let Inst{63}= src2_modifiers{0}; // neg(2) } +class VINTERPe_gfx11 op, VOPProfile P> : VINTERPe { + let Inst{22-16} = op; +} + +class VINTERPe_gfx12 op, VOPProfile P> : VINTERPe { + let Inst{20-16} = op{4-0}; +} + //===--===// // VOP3 VINTERP //===--===// @@ -171,17 +178,28 @@ defm : VInterpF16Pat op> { +multiclass VINTERP_Real_gfx11 op> { + let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { def _gfx11 : VINTERP_Real(NAME), SIEncodingFamily.GFX11>, VINTERPe_gfx11(NAME).Pfl>; } } -defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11<0x000>; -defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11<0x001>; -defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11<0x002>; -defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11<0x003>; -defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x004>; -defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x005>; +multiclass VINTERP_Real_gfx12 op> { + let AssemblerPredicate = isGFX12Only, DecoderNamespace = "GFX12" in { +def _gfx12 : + VINTERP_Real(NAME), SIEncodingFamily.GFX12>, + VINTERPe_gfx12(NAME).Pfl>; + } +} + +multiclass 
VINTERP_Real_gfx11_gfx12 op> : + VINTERP_Real_gfx11, VINTERP_Real_gfx12; + +defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11_gfx12<0x000>; +defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11_gfx12<0x001>; +defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x002>; +defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x003>; +defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x004>; +defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x005>; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s index e2e53776783f3..fdfbf65c0e3cf 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s @@ -1,277 +1,278 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck -check-p
[lld] [mlir] [clang-tools-extra] [libcxxabi] [lldb] [flang] [compiler-rt] [openmp] [libcxx] [clang] [llvm] [AMDGPU] Add GFX12 encoding for VINTERP instructions (PR #74616)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/74616 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] fb2b907 - [AArch64][SME2] Add REQUIRES to new test
Author: Jay Foad Date: 2023-12-07T17:42:37Z New Revision: fb2b907fbd2c9ac25077dae01d777d884e09a7a4 URL: https://github.com/llvm/llvm-project/commit/fb2b907fbd2c9ac25077dae01d777d884e09a7a4 DIFF: https://github.com/llvm/llvm-project/commit/fb2b907fbd2c9ac25077dae01d777d884e09a7a4.diff LOG: [AArch64][SME2] Add REQUIRES to new test Added: Modified: clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c Removed: diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c index 066d37772ebc2..50cac48887894 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c @@ -1,4 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64][SME2] Add _x2/_x4 svqrshr builtins. (PR #74100)
jayfoad wrote: I committed a fix for builds that do not enable AArch64: fb2b907fbd2c9ac25077dae01d777d884e09a7a4 https://github.com/llvm/llvm-project/pull/74100 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [libcxxabi] [clang-tools-extra] [lldb] [clang] [lld] [compiler-rt] [flang] [libcxx] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)
@@ -959,6 +967,32 @@ def : GCNPat < } } // let OtherPredicates = [HasShaderCyclesRegister] +def SIMM24bitPtr : ImmLeaf (Imm);}] +>; + +multiclass SMPrefetchPat { + def : GCNPat < +(smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)), +(!cast("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0)) + >; + + def : GCNPat < +(smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)), +(!cast("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0)) + >; + + def : GCNPat < +(prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)), +(!cast("S_PREFETCH_"#type#"_PC_REL") (as_i32timm $offset), (i32 SGPR_NULL), (i8 0)) + > { +let AddedComplexity = 10; + } jayfoad wrote: But that is how `llvm.prefetch` is defined: "`address` is the address to be prefetched". A different operation should use a different intrinsic. https://github.com/llvm/llvm-project/pull/74576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[compiler-rt] [flang] [lldb] [lld] [clang] [llvm] [libcxxabi] [libcxx] [clang-tools-extra] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)
@@ -959,6 +967,32 @@ def : GCNPat < } } // let OtherPredicates = [HasShaderCyclesRegister] +def SIMM24bitPtr : ImmLeaf (Imm);}] +>; + +multiclass SMPrefetchPat { + def : GCNPat < +(smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)), +(!cast("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0)) + >; + + def : GCNPat < +(smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)), +(!cast("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0)) + >; + + def : GCNPat < +(prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)), +(!cast("S_PREFETCH_"#type#"_PC_REL") (as_i32timm $offset), (i32 SGPR_NULL), (i8 0)) + > { +let AddedComplexity = 10; + } jayfoad wrote: I really don't know. What would the use cases look like? Maybe it could be a generic intrinsic, if there is consensus that it is useful. For the existing llvm.prefetch intrinsic, the only useful case I can think of for instruction prefetching is: ``` define @f0() { call @llvm.prefetch(@f1, ...) ... call @f1() } define @f1() { ... } ``` to prefetch the code at the start of a function you are going to call. We could codegen that case using the _pc_rel form of the instruction. https://github.com/llvm/llvm-project/pull/74576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[lld] [clang] [compiler-rt] [lldb] [libcxx] [flang] [libc] [clang-tools-extra] [llvm] [GlobalISel] Add G_PREFETCH (PR #74863)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/74863 >From e406c734609d3cd1ae436084c42c1c63d8af2795 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 8 Dec 2023 14:08:09 + Subject: [PATCH 1/2] [GlobalISel] Add G_PREFETCH --- .../CodeGen/GlobalISel/MachineIRBuilder.h | 4 ++ llvm/include/llvm/Support/TargetOpcodes.def | 3 + llvm/include/llvm/Target/GenericOpcodes.td| 9 +++ llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 12 .../CodeGen/GlobalISel/MachineIRBuilder.cpp | 10 +++ llvm/lib/CodeGen/MachineVerifier.cpp | 23 +++ llvm/lib/IR/Verifier.cpp | 2 +- llvm/lib/Target/AArch64/AArch64InstrGISel.td | 4 +- .../AArch64/GISel/AArch64LegalizerInfo.cpp| 55 .../AArch64/GISel/AArch64LegalizerInfo.h | 1 + .../GlobalISel/legalizer-info-validation.mir | 3 + llvm/test/MachineVerifier/test_g_prefetch.mir | 40 .../builtins/match-table-replacerreg.td | 20 +++--- .../match-table-imms.td | 28 - .../match-table-patfrag-root.td | 2 +- .../GlobalISelCombinerEmitter/match-table.td | 62 +-- llvm/test/TableGen/GlobalISelEmitter.td | 2 +- 17 files changed, 195 insertions(+), 85 deletions(-) create mode 100644 llvm/test/MachineVerifier/test_g_prefetch.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index 3d36d06a7e9da..eb846acde3e04 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1529,6 +1529,10 @@ class MachineIRBuilder { /// Build and insert `G_FENCE Ordering, Scope`. 
MachineInstrBuilder buildFence(unsigned Ordering, unsigned Scope); + /// Build and insert G_PREFETCH \p Addr, \p RW, \p Locality, \p CacheType + MachineInstrBuilder buildPrefetch(const SrcOp &Addr, unsigned RW, +unsigned Locality, unsigned CacheType); + /// Build and insert \p Dst = G_FREEZE \p Src MachineInstrBuilder buildFreeze(const DstOp &Dst, const SrcOp &Src) { return buildInstr(TargetOpcode::G_FREEZE, {Dst}, {Src}); diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 941c6d5f8cad8..91d9eb745a48f 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -415,6 +415,9 @@ HANDLE_TARGET_OPCODE_MARKER(GENERIC_ATOMICRMW_OP_END, G_ATOMICRMW_UDEC_WRAP) // Generic atomic fence HANDLE_TARGET_OPCODE(G_FENCE) +/// Generic prefetch +HANDLE_TARGET_OPCODE(G_PREFETCH) + /// Generic conditional branch instruction. HANDLE_TARGET_OPCODE(G_BRCOND) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 9a9c09d3c20d6..73e38b15bf671 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1209,6 +1209,15 @@ def G_FENCE : GenericInstruction { let hasSideEffects = true; } +// Generic opcode equivalent to the llvm.prefetch intrinsic. 
+def G_PREFETCH : GenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins ptype0:$address, i32imm:$rw, i32imm:$locality, i32imm:$cachetype); + let hasSideEffects = true; + let mayLoad = true; + let mayStore = true; +} + //-- // Variadic ops //-- diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 14a4e72152e7c..b2850846bde67 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2435,6 +2435,18 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, MIRBuilder.buildInstr(TargetOpcode::G_RESET_FPMODE, {}, {}); return true; } + case Intrinsic::prefetch: { +Value *Addr = CI.getOperand(0); +ConstantInt *RW = cast(CI.getOperand(1)); +ConstantInt *Locality = cast(CI.getOperand(2)); +ConstantInt *CacheType = cast(CI.getOperand(3)); + +MIRBuilder.buildPrefetch(getOrCreateVReg(*Addr), RW->getZExtValue(), + Locality->getZExtValue(), + CacheType->getZExtValue()); + +return true; + } #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ case Intrinsic::INTRINSIC: #include "llvm/IR/ConstrainedOps.def" diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 80e9c08e850b6..f7febc9357c11 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -1051,6 +1051,16 @@ MachineIRBuilder::buildFence(unsigned Ordering, unsigned Scope) { .addImm(
[llvm] [flang] [clang] [lld] [clang-tools-extra] [libcxx] [lldb] [libc] [compiler-rt] [GlobalISel] Add G_PREFETCH (PR #74863)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/74863 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto builder = jayfoad wrote: Upper case B for Builder. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -388,6 +388,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. DenseSet ReleaseVGPRInsts; + // bool insertWaitcntAfterMemOp(MachineFunction &MF); jayfoad wrote: Remove all the unused code, don't just comment it out. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); jayfoad wrote: On GFX10+ VMEM stores should have S_WAITCNT_VSCNT 0 as well as (or instead of) this. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -0,0 +1,222 @@ +; Testing the -amdgpu-precise-memory-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 jayfoad wrote: What is COM: ? https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Improve selection of ballot.i64 intrinsic in wave32 mode. (PR #71556)
@@ -2314,9 +2314,8 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { SDValue VCMP = Cond->getOperand(0); auto CC = cast(Cond->getOperand(2))->get(); auto *CRHS = dyn_cast(Cond->getOperand(1)); -if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero() && -// TODO: make condition below an assert after fixing ballot bitwidth. -VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) { +if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) { + assert(VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()); jayfoad wrote: You are asserting that instcombine has been run? That seems wrong. What about -O0 compiles? https://github.com/llvm/llvm-project/pull/71556 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1847,6 +1862,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); + jayfoad wrote: Remove this https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto Builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + if (IsGFX10Plus) { jayfoad wrote: I guess this works but it seems a bit wasteful to insert S_WAITCNT after stores and S_WAITCNT_VSCNT after loads. Does anyone care? Stepping back a bit, I think you can probably implement this by calling generateWaitcnt instead of building the instructions yourself. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto Builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + if (IsGFX10Plus) { +Builder = +BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) +.addReg(AMDGPU::SGPR_NULL, RegState::Undef) +.addImm(0); + } + OldWaitcntInstr = Builder.getInstr(); jayfoad wrote: Nit: if you're going to set OldWaitcntInstr then really it ought to point to the first in a sequence of waitcnts, not the last. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Define new targets gfx1200 and gfx1201 (PR #73133)
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/73133 Define target names and ELF numbers for new GFX12 targets gfx1200 and gfx1201. For now they behave identically to GFX11. >From 1011b8e7da174146dfb4c9a4bf54468ea021 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 21 Nov 2023 15:46:04 + Subject: [PATCH] [AMDGPU] Define new targets gfx1200 and gfx1201 Define target names and ELF numbers for new GFX12 targets gfx1200 and gfx1201. For now they behave identically to GFX11. --- clang/include/clang/Basic/Cuda.h | 2 + clang/lib/Basic/Cuda.cpp | 2 + clang/lib/Basic/Targets/NVPTX.cpp | 2 + clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 + clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 + clang/test/Driver/amdgpu-macros.cl| 2 + clang/test/Driver/amdgpu-mcpu.cl | 4 + clang/test/Misc/target-invalid-cpu-note.c | 4 +- llvm/docs/AMDGPUUsage.rst | 18 - llvm/include/llvm/BinaryFormat/ELF.h | 6 +- llvm/include/llvm/TargetParser/TargetParser.h | 5 +- llvm/lib/Object/ELFObjectFile.cpp | 6 ++ llvm/lib/ObjectYAML/ELFYAML.cpp | 2 + llvm/lib/Target/AMDGPU/AMDGPU.td | 75 ++- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +- llvm/lib/Target/AMDGPU/GCNProcessors.td | 12 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 + .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 4 + llvm/lib/Target/AMDGPU/SIDefines.h| 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 4 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 26 ++- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 6 ++ llvm/lib/TargetParser/TargetParser.cpp| 27 +++ .../CodeGen/AMDGPU/directive-amdgcn-target.ll | 4 + .../CodeGen/AMDGPU/elf-header-flags-mach.ll | 4 + .../Object/AMDGPU/elf-header-flags-mach.yaml | 14 .../llvm-objdump/ELF/AMDGPU/subtarget.ll | 12 +++ .../llvm-readobj/ELF/amdgpu-elf-headers.test | 18 + llvm/tools/llvm-readobj/ELFDumper.cpp | 4 + 30 files changed, 272 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h 
index 878f8d70f90c0a9..2d912bdbbd1bc59 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -113,6 +113,8 @@ enum class CudaArch { GFX1103, GFX1150, GFX1151, + GFX1200, + GFX1201, Generic, // A processor model named 'generic' if the target backend defines a // public one. LAST, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 2307352bd3becef..65840b9f20252b6 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -135,6 +135,8 @@ static const CudaArchToStringMap arch_names[] = { GFX(1103), // gfx1103 GFX(1150), // gfx1150 GFX(1151), // gfx1151 +GFX(1200), // gfx1200 +GFX(1201), // gfx1201 {CudaArch::Generic, "generic", ""}, // clang-format on }; diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index a9fc88295700b89..3a4a75b0348f209 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -214,6 +214,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case CudaArch::GFX1103: case CudaArch::GFX1150: case CudaArch::GFX1151: + case CudaArch::GFX1200: + case CudaArch::GFX1201: case CudaArch::Generic: case CudaArch::LAST: break; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 2f7dd83bd2d65c9..9b8fbbdf8046787 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -3540,6 +3540,8 @@ void CGOpenMPRuntimeGPU::processRequiresDirective( case CudaArch::GFX1103: case CudaArch::GFX1150: case CudaArch::GFX1151: + case CudaArch::GFX1200: + case CudaArch::GFX1201: case CudaArch::Generic: case CudaArch::UNUSED: case CudaArch::UNKNOWN: diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 03c20ae46faaa46..8959634572b44e9 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -49,6 +49,8 @@ // RUN: %clang_cc1 -triple 
amdgcn -target-cpu gfx1103 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1150 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1150 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1151 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1151 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1200 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -S -
[clang] [llvm] [AMDGPU] Define new targets gfx1200 and gfx1201 (PR #73133)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/73133 >From 1011b8e7da174146dfb4c9a4bf54468ea021 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 21 Nov 2023 15:46:04 + Subject: [PATCH 1/2] [AMDGPU] Define new targets gfx1200 and gfx1201 Define target names and ELF numbers for new GFX12 targets gfx1200 and gfx1201. For now they behave identically to GFX11. --- clang/include/clang/Basic/Cuda.h | 2 + clang/lib/Basic/Cuda.cpp | 2 + clang/lib/Basic/Targets/NVPTX.cpp | 2 + clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 + clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 + clang/test/Driver/amdgpu-macros.cl| 2 + clang/test/Driver/amdgpu-mcpu.cl | 4 + clang/test/Misc/target-invalid-cpu-note.c | 4 +- llvm/docs/AMDGPUUsage.rst | 18 - llvm/include/llvm/BinaryFormat/ELF.h | 6 +- llvm/include/llvm/TargetParser/TargetParser.h | 5 +- llvm/lib/Object/ELFObjectFile.cpp | 6 ++ llvm/lib/ObjectYAML/ELFYAML.cpp | 2 + llvm/lib/Target/AMDGPU/AMDGPU.td | 75 ++- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +- llvm/lib/Target/AMDGPU/GCNProcessors.td | 12 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 + .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 4 + llvm/lib/Target/AMDGPU/SIDefines.h| 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 4 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 26 ++- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 6 ++ llvm/lib/TargetParser/TargetParser.cpp| 27 +++ .../CodeGen/AMDGPU/directive-amdgcn-target.ll | 4 + .../CodeGen/AMDGPU/elf-header-flags-mach.ll | 4 + .../Object/AMDGPU/elf-header-flags-mach.yaml | 14 .../llvm-objdump/ELF/AMDGPU/subtarget.ll | 12 +++ .../llvm-readobj/ELF/amdgpu-elf-headers.test | 18 + llvm/tools/llvm-readobj/ELFDumper.cpp | 4 + 30 files changed, 272 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 878f8d70f90c0a9..2d912bdbbd1bc59 100644 --- a/clang/include/clang/Basic/Cuda.h +++ 
b/clang/include/clang/Basic/Cuda.h @@ -113,6 +113,8 @@ enum class CudaArch { GFX1103, GFX1150, GFX1151, + GFX1200, + GFX1201, Generic, // A processor model named 'generic' if the target backend defines a // public one. LAST, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 2307352bd3becef..65840b9f20252b6 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -135,6 +135,8 @@ static const CudaArchToStringMap arch_names[] = { GFX(1103), // gfx1103 GFX(1150), // gfx1150 GFX(1151), // gfx1151 +GFX(1200), // gfx1200 +GFX(1201), // gfx1201 {CudaArch::Generic, "generic", ""}, // clang-format on }; diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index a9fc88295700b89..3a4a75b0348f209 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -214,6 +214,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case CudaArch::GFX1103: case CudaArch::GFX1150: case CudaArch::GFX1151: + case CudaArch::GFX1200: + case CudaArch::GFX1201: case CudaArch::Generic: case CudaArch::LAST: break; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 2f7dd83bd2d65c9..9b8fbbdf8046787 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -3540,6 +3540,8 @@ void CGOpenMPRuntimeGPU::processRequiresDirective( case CudaArch::GFX1103: case CudaArch::GFX1150: case CudaArch::GFX1151: + case CudaArch::GFX1200: + case CudaArch::GFX1201: case CudaArch::Generic: case CudaArch::UNUSED: case CudaArch::UNKNOWN: diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 03c20ae46faaa46..8959634572b44e9 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -49,6 +49,8 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103 %s // 
RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1150 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1150 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1151 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1151 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1200 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1201 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -targ
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto Builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + if (IsGFX10Plus) { jayfoad wrote: Yes but why? On GFX10+, why would you put s_waitcnt(0) after a store or s_waitcnt_vscnt(0) after a load? https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [flang] [compiler-rt] [libcxx] [mlir] [clang] [libc] [AMDGPU] Define new targets gfx1200 and gfx1201 (PR #73133)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/73133 >From 1011b8e7da174146dfb4c9a4bf54468ea021 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 21 Nov 2023 15:46:04 + Subject: [PATCH 1/2] [AMDGPU] Define new targets gfx1200 and gfx1201 Define target names and ELF numbers for new GFX12 targets gfx1200 and gfx1201. For now they behave identically to GFX11. --- clang/include/clang/Basic/Cuda.h | 2 + clang/lib/Basic/Cuda.cpp | 2 + clang/lib/Basic/Targets/NVPTX.cpp | 2 + clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 + clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 + clang/test/Driver/amdgpu-macros.cl| 2 + clang/test/Driver/amdgpu-mcpu.cl | 4 + clang/test/Misc/target-invalid-cpu-note.c | 4 +- llvm/docs/AMDGPUUsage.rst | 18 - llvm/include/llvm/BinaryFormat/ELF.h | 6 +- llvm/include/llvm/TargetParser/TargetParser.h | 5 +- llvm/lib/Object/ELFObjectFile.cpp | 6 ++ llvm/lib/ObjectYAML/ELFYAML.cpp | 2 + llvm/lib/Target/AMDGPU/AMDGPU.td | 75 ++- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +- llvm/lib/Target/AMDGPU/GCNProcessors.td | 12 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 + .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 4 + llvm/lib/Target/AMDGPU/SIDefines.h| 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 8 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 4 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 26 ++- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 6 ++ llvm/lib/TargetParser/TargetParser.cpp| 27 +++ .../CodeGen/AMDGPU/directive-amdgcn-target.ll | 4 + .../CodeGen/AMDGPU/elf-header-flags-mach.ll | 4 + .../Object/AMDGPU/elf-header-flags-mach.yaml | 14 .../llvm-objdump/ELF/AMDGPU/subtarget.ll | 12 +++ .../llvm-readobj/ELF/amdgpu-elf-headers.test | 18 + llvm/tools/llvm-readobj/ELFDumper.cpp | 4 + 30 files changed, 272 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 878f8d70f90c0a9..2d912bdbbd1bc59 100644 --- a/clang/include/clang/Basic/Cuda.h +++ 
b/clang/include/clang/Basic/Cuda.h @@ -113,6 +113,8 @@ enum class CudaArch { GFX1103, GFX1150, GFX1151, + GFX1200, + GFX1201, Generic, // A processor model named 'generic' if the target backend defines a // public one. LAST, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 2307352bd3becef..65840b9f20252b6 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -135,6 +135,8 @@ static const CudaArchToStringMap arch_names[] = { GFX(1103), // gfx1103 GFX(1150), // gfx1150 GFX(1151), // gfx1151 +GFX(1200), // gfx1200 +GFX(1201), // gfx1201 {CudaArch::Generic, "generic", ""}, // clang-format on }; diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index a9fc88295700b89..3a4a75b0348f209 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -214,6 +214,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case CudaArch::GFX1103: case CudaArch::GFX1150: case CudaArch::GFX1151: + case CudaArch::GFX1200: + case CudaArch::GFX1201: case CudaArch::Generic: case CudaArch::LAST: break; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 2f7dd83bd2d65c9..9b8fbbdf8046787 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -3540,6 +3540,8 @@ void CGOpenMPRuntimeGPU::processRequiresDirective( case CudaArch::GFX1103: case CudaArch::GFX1150: case CudaArch::GFX1151: + case CudaArch::GFX1200: + case CudaArch::GFX1201: case CudaArch::Generic: case CudaArch::UNUSED: case CudaArch::UNKNOWN: diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 03c20ae46faaa46..8959634572b44e9 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -49,6 +49,8 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103 %s // 
RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1150 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1150 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1151 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1151 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1200 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1201 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -targ
[compiler-rt] [flang] [libc] [libcxx] [llvm] [lldb] [clang-tools-extra] [clang] [AMDGPU] Fix nondeterminism in SIFixSGPRCopies (PR #70644)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/70644 >From bfc7b2041f5a05105808b0b1ee0427d9c9eb9f4b Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 30 Oct 2023 15:23:48 + Subject: [PATCH 1/4] Precommit test --- .../AMDGPU/fix-sgpr-copies-nondeterminism.ll | 52 +++ 1 file changed, 52 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll new file mode 100644 index 000..8b7e691dbddeae5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s + +define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) { +; CHECK-LABEL: f: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT:s_cmp_eq_u32 s0, 0 +; CHECK-NEXT:s_mov_b32 s0, 0 +; CHECK-NEXT:s_cbranch_scc1 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %bb3 +; CHECK-NEXT:v_mov_b32_e32 v4, v1 +; CHECK-NEXT:s_branch .LBB0_3 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT:v_mov_b32_e32 v0, 1 +; CHECK-NEXT:v_mov_b32_e32 v4, 0 +; CHECK-NEXT: .LBB0_3: ; %bb4 +; CHECK-NEXT:v_mov_b32_e32 v1, 0 +; CHECK-NEXT:s_mov_b32 s1, s0 +; CHECK-NEXT:s_mov_b32 s2, s0 +; CHECK-NEXT:s_mov_b32 s3, s0 +; CHECK-NEXT:s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT:v_mov_b32_e32 v2, v1 +; CHECK-NEXT:v_mov_b32_e32 v3, v1 +; CHECK-NEXT:v_mov_b32_e32 v5, v1 +; CHECK-NEXT:v_mov_b32_e32 v6, v1 +; CHECK-NEXT:v_mov_b32_e32 v7, v1 +; CHECK-NEXT:s_clause 0x1 +; CHECK-NEXT:buffer_store_b128 v[0:3], v1, s[0:3], 0 idxen +; CHECK-NEXT:buffer_store_b128 v[4:7], v1, s[0:3], 0 idxen +; CHECK-NEXT:s_nop 0 +; CHECK-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; CHECK-NEXT:s_endpgm +bb: + %i = icmp eq i32 %arg, 0 + br i1 %i, label %bb4, label %bb3 + +bb3: + br label %bb4 + +bb4: + %i5 = phi i32 [ %arg1, 
%bb3 ], [ 1, %bb ] + %i6 = phi i32 [ %arg2, %bb3 ], [ 0, %bb ] + %i7 = insertelement <4 x i32> zeroinitializer, i32 %i5, i64 0 + %i8 = bitcast <4 x i32> %i7 to <4 x float> + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %i8, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0) + %i9 = insertelement <4 x i32> zeroinitializer, i32 %i6, i64 0 + %i10 = bitcast <4 x i32> %i9 to <4 x float> + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %i10, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) >From aa050e8d720150b97d7af18d97d1d7f5d010bedc Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 30 Oct 2023 10:40:22 + Subject: [PATCH 2/4] [AMDGPU] Fix nondeterminism in SIFixSGPRCopies There are a couple of loops that iterate over V2SCopies. The iteration order needs to be deterministic, otherwise we can call moveToVALU in different orders, which causes temporary vregs to be allocated in different orders, which can affect register allocation heuristics. 
--- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp| 8 +++ .../AMDGPU/fix-sgpr-copies-nondeterminism.ll | 22 +-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index b32ed9fef5dd34e..3e6ed2d793ae563 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -125,7 +125,7 @@ class SIFixSGPRCopies : public MachineFunctionPass { SmallVector PHINodes; SmallVector S2VCopies; unsigned NextVGPRToSGPRCopyID; - DenseMap V2SCopies; + MapVector V2SCopies; DenseMap> SiblingPenalty; public: @@ -988,7 +988,7 @@ bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { for (auto J : Info->Siblings) { auto InfoIt = V2SCopies.find(J); if (InfoIt != V2SCopies.end()) { - MachineInstr *SiblingCopy = InfoIt->getSecond().Copy; + MachineInstr *SiblingCopy = InfoIt->second.Copy; if (SiblingCopy->isImplicitDef()) // the COPY has already been MoveToVALUed continue; @@ -1023,12 +1023,12 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { unsigned CurID = LoweringWorklist.pop_back_val(); auto CurInfoIt = V2SCopies.find(CurID); if (CurInfoIt != V2SCopies.end()) { - V2SCopyInfo C = CurInfoIt->getSecond(); + V2SCopyInfo C = CurInfoIt->second; LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump()); for (auto S : C.Siblings) { auto SibInfoIt = V2SCopies.find(S); if (SibInfoIt != V2SCopies.end()) { - V2SCopyInfo &SI = SibInfoIt->getSecond(); + V2SCopyInfo &SI = SibInfoIt->second; L
[llvm] [libc] [libcxx] [lldb] [flang] [compiler-rt] [clang-tools-extra] [clang] [AMDGPU] Fix nondeterminism in SIFixSGPRCopies (PR #70644)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/70644 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [llvm] [AMDGPU] GCNRegPressure printing pass for testing. (PR #70031)
https://github.com/jayfoad approved this pull request. > Should we move on and submit this patch? Yes! > @jayfoad do you have concerns about live-through register set computation or > others? I personally have no interest in the live-through part. You could remove it from this patch, but I don't mind if others want to keep it. > I believe fixing trackers should go to another PR. Agreed. https://github.com/llvm/llvm-project/pull/70031 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [compiler-rt] [flang] [llvm] [AMDGPU] Fold operand after shrinking instruction in SIFoldOperands (PR #68426)
@@ -290,37 +291,40 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { if (Fold.Commuted) TII->commuteInstruction(*Inst32, false); -return true; - } - assert(!Fold.needsShrink() && "not handled"); +Fold.UseMI = Inst32; +Fold.UseOpNo = AMDGPU::getNamedOperandIdx(Fold.UseMI->getOpcode(), + AMDGPU::OpName::src0); jayfoad wrote: Adding the assert showed up some problems to do with knowing whether or not the instruction has been commuted. I need to spend some more time on that. https://github.com/llvm/llvm-project/pull/68426 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [compiler-rt] [flang] [llvm] [AMDGPU] Fold operand after shrinking instruction in SIFoldOperands (PR #68426)
https://github.com/jayfoad converted_to_draft https://github.com/llvm/llvm-project/pull/68426 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [clang] [AMDGPU] New ttracedata intrinsics (PR #70235)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/70235 >From e02640686a8cf0a42cec01da4f32b6888f5de11f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 25 Oct 2023 17:14:40 +0100 Subject: [PATCH 1/2] [AMDGPU] New ttracedata intrinsics Add llvm.amdgcn.s.ttracedata and llvm.amdgcn.s.ttracedata.imm which map directly to the corresponding instructions s_ttracedata and s_ttracedata_imm. These are inherently whole-wave operations so any non-uniform inputs are readfirstlaned. --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 7 +++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 10 llvm/lib/Target/AMDGPU/SOPInstructions.td | 9 +++- .../AMDGPU/llvm.amdgcn.s.ttracedata.ll| 53 +++ 4 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 5f1d1d932f74cbd..a3acfccd00f8e16 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1697,6 +1697,13 @@ def int_amdgcn_s_setprio : DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects]>; +def int_amdgcn_s_ttracedata : + DefaultAttrsIntrinsic<[], [llvm_i32_ty], +[IntrNoMem, IntrHasSideEffects]>; +def int_amdgcn_s_ttracedata_imm : + DefaultAttrsIntrinsic<[], [llvm_i16_ty], +[IntrNoMem, IntrHasSideEffects, ImmArg>]>; + // This is IntrHasSideEffects so it can be used to read cycle counters. 
def int_amdgcn_s_getreg : ClangBuiltin<"__builtin_amdgcn_s_getreg">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 5b056bd9e5dba2c..f117f732cb84ffb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3064,6 +3064,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 2); return; } +case Intrinsic::amdgcn_s_ttracedata: + constrainOpWithReadfirstlane(B, MI, 1); // M0 + return; case Intrinsic::amdgcn_raw_buffer_load_lds: case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { applyDefaultMapping(OpdMapper); @@ -4653,6 +4656,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); break; } +case Intrinsic::amdgcn_s_ttracedata: { + // This must be an SGPR, but accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); + break; +} case Intrinsic::amdgcn_end_cf: { unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 2f3b0ff2f76215e..0ec4f8150bfcc06 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1500,7 +1500,10 @@ def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins i32imm:$simm16), "$simm def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_decperflevel timm:$simm16)]> { } -def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> { + +let Uses = [M0] in +def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins), "", +[(int_amdgcn_s_ttracedata M0)]> { let simm16 = 0; let fixed_imm = 1; } @@ -1544,8 +1547,10 @@ let SubtargetPredicate = 
isGFX10Plus in { [(SIdenorm_mode (i32 timm:$simm16))]>; } + let hasSideEffects = 1 in def S_TTRACEDATA_IMM : -SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16">; +SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16", +[(int_amdgcn_s_ttracedata_imm timm:$simm16)]>; } // End SubtargetPredicate = isGFX10Plus let SubtargetPredicate = isGFX11Plus in { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll new file mode 100644 index 000..37b5357950e648b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s + +declare void @llvm.amdgcn.s.ttracedata(i32) +declare void @llvm.amdgcn.s.ttracedata.imm(i16) + +define
[llvm] [clang] [clang-tools-extra] [AMDGPU] New ttracedata intrinsics (PR #70235)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/70235 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] effd47e - [Clang][AArch64] Add REQUIRES to new test
Author: Jay Foad Date: 2023-12-13T10:49:52Z New Revision: effd47ed45e3badd756103346a7c3b9e1e939e5e URL: https://github.com/llvm/llvm-project/commit/effd47ed45e3badd756103346a7c3b9e1e939e5e DIFF: https://github.com/llvm/llvm-project/commit/effd47ed45e3badd756103346a7c3b9e1e939e5e.diff LOG: [Clang][AArch64] Add REQUIRES to new test Added: Modified: clang/test/CodeGen/arm-vector_type-params-returns.c Removed: diff --git a/clang/test/CodeGen/arm-vector_type-params-returns.c b/clang/test/CodeGen/arm-vector_type-params-returns.c index 61b617083515a7..14c3512ab81a9f 100644 --- a/clang/test/CodeGen/arm-vector_type-params-returns.c +++ b/clang/test/CodeGen/arm-vector_type-params-returns.c @@ -12,6 +12,8 @@ // RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2 -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s // RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s +// REQUIRES: aarch64-registered-target + #ifdef SVE_HEADER #include <arm_sve.h> #endif ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [Clang][AArch64] Add fix vector types to header into SVE (PR #73258)
@@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 + +// RUN: %clang_cc1 -DSVE_HEADER -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s +// RUN: %clang_cc1 -DSVE_HEADER -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s + +// RUN: %clang_cc1 -DNEON_HEADER -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s +// RUN: %clang_cc1 -DNEON_HEADER -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s + +// RUN: %clang_cc1 -DSVE_HEADER -DNEON_HEADER -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s +// RUN: %clang_cc1 -DSVE_HEADER -DNEON_HEADER -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s + +// RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2 -triple aarch64 -target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s +// RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s + jayfoad wrote: I've just added a `REQUIRES` line to this test in effd47ed45e3badd756103346a7c3b9e1e939e5e since it was failing in my AMDGPU-only build. https://github.com/llvm/llvm-project/pull/73258 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [lldb] [libcxx] [compiler-rt] [libc] [flang] [clang] [lld] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)
@@ -0,0 +1,154 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixeses=GCN,GFX9 +; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixeses=GCN,GFX10 jayfoad wrote: > --check-prefixeses That's what happens when you enable `M-x gollum-mode` in Emacs. https://github.com/llvm/llvm-project/pull/74537 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64][SME2] Add SQRSHRN, UQRSHRN, SQRSHRUN builtins for SME2, SVE2p1 (PR #75325)
jayfoad wrote: Please remember to add a suitable `REQUIRES:` line to these new codegen tests, or put them in an `ARM` subdirectory with a suitable `lit.local.cfg`! This new test is failing in non-ARM builds with: ``` FAIL: Clang :: CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c (5567 of 76786) TEST 'Clang :: CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c' FAILED Exit Code: 1 Command Output (stderr): -- RUN: at line 2: /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/clang -cc1 -internal-isystem /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/lib/clang/18/include -nostdsysteminc -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c | /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/opt -S -passes=mem2reg,instcombine,tailcallelim | /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/FileCheck /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c + /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/clang -cc1 -internal-isystem /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/lib/clang/18/include -nostdsysteminc -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c + /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/opt -S -passes=mem2reg,instcombine,tailcallelim + /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/FileCheck /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c 
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c:10:10: fatal error: 'arm_sve.h' file not found 10 | #include <arm_sve.h> | ^~~ 1 error generated. /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c:22:17: error: CHECK-LABEL: expected string not found in input // CHECK-LABEL: @test_svqrshrn_s16_s32_x2( ^ <stdin>:1:1: note: scanning from here ; ModuleID = '<stdin>' ^ <stdin>:1:14: note: possible intended match here ; ModuleID = '<stdin>' ^ Input file: <stdin> Check file: /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c -dump-input=help explains the following input dump. Input was: << 1: ; ModuleID = '<stdin>' label:22'0 X~~ error: no match found label:22'1 ? possible intended match 2: source_filename = "<stdin>" label:22'0 >> ``` https://github.com/llvm/llvm-project/pull/75325 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 50e78de - [AArch64][SME2] Add REQUIRES to new test
Author: Jay Foad Date: 2023-12-14T13:20:37Z New Revision: 50e78de76a5e77e15ddea48dfb520d6bbcbc1c45 URL: https://github.com/llvm/llvm-project/commit/50e78de76a5e77e15ddea48dfb520d6bbcbc1c45 DIFF: https://github.com/llvm/llvm-project/commit/50e78de76a5e77e15ddea48dfb520d6bbcbc1c45.diff LOG: [AArch64][SME2] Add REQUIRES to new test Added: Modified: clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c Removed: diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c index 8e8b7203148934..6ebf224db92377 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c @@ -1,4 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64][SME2] Add SQRSHRN, UQRSHRN, SQRSHRUN builtins for SME2, SVE2p1 (PR #75325)
jayfoad wrote: > Please remember to add a suitable `REQUIRES:` line to these new codegen tests I've added one in 50e78de76a5e77e15ddea48dfb520d6bbcbc1c45 https://github.com/llvm/llvm-project/pull/75325 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[lld] [compiler-rt] [libc] [clang] [libcxx] [lldb] [flang] [mlir] [llvm] [clang-tools-extra] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)
@@ -684,6 +684,51 @@ s_rndne_f16 s5, 0xfe0b s_rndne_f16 s5, 0x3456 // GFX12: encoding: [0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00] +s_barrier_signal -2 jayfoad wrote: Missing `s_get_barrier_state` tests in this file? https://github.com/llvm/llvm-project/pull/74836 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [flang] [compiler-rt] [lld] [libcxx] [clang] [libcxxabi] [clang-tools-extra] [lldb] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)
@@ -3164,6 +3164,18 @@ def : GCNPat < (as_i1timm $bound_ctrl)) >; +class SMPrefetchGetPcPat : GCNPat < jayfoad wrote: This pattern also interprets the "address" argument as being an offset from PC, so it should also be removed from this version of the patch. https://github.com/llvm/llvm-project/pull/74576 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [RISCV] Implement multi-lib reuse rule for RISC-V bare-metal toolchain (PR #73765)
jayfoad wrote: The new test is crashing in my Release+Asserts build: ``` FAIL: Clang :: Driver/riscv-toolchain-gcc-multilib-reuse.c (1081 of 1081) TEST 'Clang :: Driver/riscv-toolchain-gcc-multilib-reuse.c' FAILED Exit Code: 2 Command Output (stderr): -- RUN: at line 1: /home/jayfoad2/llvm-release/bin/clang /home/jayfoad2/git/llvm-project/clang/test/Driver/riscv-toolchain-gcc-multilib-reuse.c -target riscv64-unknown-elf --gcc-toolchain=/home/jayfoad2/git/llvm-project/clang/test/Driver/Inputs/multilib_riscv_elf_sdk --print-multi-directory-march=rv32imc -mabi=ilp32| /home/jayfoad2/llvm-release/bin/FileCheck -check-prefix=GCC-MULTI-LIB-REUSE-RV32IMC-ILP32 /home/jayfoad2/git/llvm-project/clang/test/Driver/riscv-toolchain-gcc-multilib-reuse.c + /home/jayfoad2/llvm-release/bin/FileCheck -check-prefix=GCC-MULTI-LIB-REUSE-RV32IMC-ILP32 /home/jayfoad2/git/llvm-project/clang/test/Driver/riscv-toolchain-gcc-multilib-reuse.c + /home/jayfoad2/llvm-release/bin/clang /home/jayfoad2/git/llvm-project/clang/test/Driver/riscv-toolchain-gcc-multilib-reuse.c -target riscv64-unknown-elf --gcc-toolchain=/home/jayfoad2/git/llvm-project/clang/test/Driver/Inputs/multilib_riscv_elf_sdk --print-multi-directory -march=rv32imc -mabi=ilp32 clang: /home/jayfoad2/git/llvm-project/clang/lib/Driver/ToolChains/CommonArgs.cpp:2189: void clang::driver::tools::addMultilibFlag(bool, const llvm::StringRef, Multilib::flags_list &): Assertion `Flag.front() == '-'' failed. PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script. Stack dump: 0. Program arguments: /home/jayfoad2/llvm-release/bin/clang /home/jayfoad2/git/llvm-project/clang/test/Driver/riscv-toolchain-gcc-multilib-reuse.c -target riscv64-unknown-elf --gcc-toolchain=/home/jayfoad2/git/llvm-project/clang/test/Driver/Inputs/multilib_riscv_elf_sdk --print-multi-directory -march=rv32imc -mabi=ilp32 1. 
Compilation construction #0 0x070bfaf7 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/jayfoad2/llvm-release/bin/clang+0x70bfaf7) #1 0x070bd6ae llvm::sys::RunSignalHandlers() (/home/jayfoad2/llvm-release/bin/clang+0x70bd6ae) #2 0x070c01ca SignalHandler(int) Signals.cpp:0:0 #3 0x7fc909c42520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520) #4 0x7fc909c969fc __pthread_kill_implementation ./nptl/pthread_kill.c:44:76 #5 0x7fc909c969fc __pthread_kill_internal ./nptl/pthread_kill.c:78:10 #6 0x7fc909c969fc pthread_kill ./nptl/pthread_kill.c:89:10 #7 0x7fc909c42476 gsignal ./signal/../sysdeps/posix/raise.c:27:6 #8 0x7fc909c287f3 abort ./stdlib/abort.c:81:7 #9 0x7fc909c2871b _nl_load_domain ./intl/loadmsgcat.c:1177:9 #10 0x7fc909c39e96 (/lib/x86_64-linux-gnu/libc.so.6+0x39e96) #11 0x07b32257 clang::driver::tools::addMultilibFlag(bool, llvm::StringRef, std::vector, std::allocator>, std::allocator, std::allocator>>>&) (/home/jayfoad2/llvm-release/bin/clang+0x7b32257) #12 0x07abb016 clang::driver::MultilibBuilder::flag(llvm::StringRef, bool) (/home/jayfoad2/llvm-release/bin/clang+0x7abb016) #13 0x07b9ddbf findRISCVMultilibs(clang::driver::Driver const&, llvm::Triple const&, llvm::StringRef, llvm::opt::ArgList const&, clang::driver::DetectedMultilibs&) Gnu.cpp:0:0 #14 0x07b95459 clang::driver::toolchains::Generic_GCC::GCCInstallationDetector::ScanGCCForMultilibs(llvm::Triple const&, llvm::opt::ArgList const&, llvm::StringRef, bool) Gnu.cpp:0:0 #15 0x07b9b164 clang::driver::toolchains::Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple(llvm::Triple const&, llvm::opt::ArgList const&, std::__cxx11::basic_string, std::allocator> const&, llvm::StringRef, bool, bool, bool) Gnu.cpp:0:0 #16 0x07b9324c clang::driver::toolchains::Generic_GCC::GCCInstallationDetector::init(llvm::Triple const&, llvm::opt::ArgList const&) Gnu.cpp:0:0 #17 0x07bf34ba clang::driver::toolchains::RISCVToolChain::RISCVToolChain(clang::driver::Driver const&, llvm::Triple const&, llvm::opt::ArgList 
const&) RISCVToolchain.cpp:0:0 #18 0x07a31458 clang::driver::Driver::getToolChain(llvm::opt::ArgList const&, llvm::Triple const&) const (/home/jayfoad2/llvm-release/bin/clang+0x7a31458) #19 0x07a38bbe clang::driver::Driver::BuildCompilation(llvm::ArrayRef) (/home/jayfoad2/llvm-release/bin/clang+0x7a38bbe) #20 0x04a8a25a clang_main(int, char**, llvm::ToolContext const&) (/home/jayfoad2/llvm-release/bin/clang+0x4a8a25a) #21 0x04a9bb61 main (/home/jayfoad2/llvm-release/bin/clang+0x4a9bb61) #22 0x7fc909c29d90 __libc_start_call_main ./csu/../sysdeps/nptl/libc_start_call_main.h:58:16 #23 0x7fc909c29e40 call_init ./csu/../csu/libc-start.c:128:20 #24 0x7fc909c29e40 __libc_start_main ./csu/../csu/libc-start.c:379:5 #25 0x04a875a5 _start (/home/jayfoad2/llvm-relea
[flang] [llvm] [clang-tools-extra] [clang] [libcxx] [libc] [compiler-rt] [AMDGPU] Produce better memoperand for LDS DMA (PR #75247)
jayfoad wrote: > Use PoisonValue instead of nullptr for load memop as a Value. What is the effect of that? I thought nullptr was supposed to represent an unknown value, so you have to conservatively assume it might alias with anything. https://github.com/llvm/llvm-project/pull/75247 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [libcxx] [lldb] [clang] [lld] [flang] [compiler-rt] [clang-tools-extra] [libc] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)
jayfoad wrote: How does this work in a case like this? ``` call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.3, i32 4, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %ptr, i32 4, i32 0, i32 0, i32 0, i32 0) %val.3 = load float, ptr addrspace(3) @lds.3, align 4 ``` i.e. - store to known lds address `@lds.3` (this will use slot 0 and another slot e.g. slot 3?) - store to unknown lds address (this will use slot 0?) - load from known lds address `@lds.3` (this will use slot 3?) https://github.com/llvm/llvm-project/pull/74537 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[flang] [clang-tools-extra] [lld] [llvm] [compiler-rt] [lldb] [libc] [libcxx] [clang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)
jayfoad wrote: > > How does this work in a case like this? > > ``` > > call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr > > addrspace(3) @lds.3, i32 4, i32 0, i32 0, i32 0, i32 0) > > call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr > > addrspace(3) %ptr, i32 4, i32 0, i32 0, i32 0, i32 0) > > %val.3 = load float, ptr addrspace(3) @lds.3, align 4 > > ``` > > i.e. > > ``` > > * store to known lds address `@lds.3` (this will use slot 0 and another > > slot e.g. slot 3?) > > > > * store to unknown lds address (this will use slot 0?) > > > > * load from known lds address `@lds.3` (this will use slot 3?) > > ``` > > It does not know the pointer, so it uses default slot 0 and waits till 0. Test case: ``` @lds.0 = internal addrspace(3) global [64 x float] poison, align 16 @lds.1 = internal addrspace(3) global [64 x float] poison, align 16 declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) define amdgpu_kernel void @f(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr addrspace(1) %out, ptr addrspace(3) %ptr) { main_body: call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %ptr, i32 4, i32 0, i32 0, i32 0, i32 0) %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1 %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2 %val.0 = load volatile float, ptr addrspace(3) %gep.0, align 4 %val.1 = load volatile float, ptr addrspace(3) %gep.1, align 4 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 store float %val.0, ptr addrspace(1) %out store float %val.1, ptr addrspace(1) %out.gep.1 ret void } ``` Generates: ``` s_load_dwordx8 s[4:11], s[0:1], 0x24 s_load_dword s2, s[0:1], 0x44 s_mov_b32 m0, 0 v_mov_b32_e32 v2, 0 s_waitcnt lgkmcnt(0)
buffer_load_dword off, s[4:7], 0 lds s_mov_b32 m0, s2 s_lshl_b32 s0, s8, 2 buffer_load_dword off, s[4:7], 0 lds s_lshl_b32 s1, s9, 2 v_mov_b32_e32 v0, s0 v_mov_b32_e32 v1, s1 s_waitcnt vmcnt(1) ds_read_b32 v0, v0 s_waitcnt vmcnt(0) ds_read_b32 v1, v1 offset:256 s_waitcnt lgkmcnt(0) global_store_dwordx2 v2, v[0:1], s[10:11] s_endpgm ``` The `s_waitcnt vmcnt(1)` seems incorrect, because the second buffer-load-to-lds might clobber `@lds.0`. > I have to tell anyone interested here: before I even wrote this code it > didn't know of the dependency and did not wait for anything at all. Everyone > was happy. I am still happy, because buffer/flat/global-load-to-lds was removed in GFX11. https://github.com/llvm/llvm-project/pull/74537 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[mlir] [llvm] [clang] [libcxx] [libc] [compiler-rt] [flang] [AMDGPU] Define new targets gfx1200 and gfx1201 (PR #73133)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/73133 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][dataflow] Retrieve members from accessors called using member… (PR #73978)
jayfoad wrote: Hi, on my Release+Asserts build this is causing: ``` FAIL: Clang-Unit :: Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests/32/38 (134 of 658) TEST 'Clang-Unit :: Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests/32/38' FAILED Script(shard): -- GTEST_OUTPUT=json:/home/jayfoad2/llvm-release/tools/clang/unittests/Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests-Clang-Unit-2611196-32-38.json GTEST_SHUFFLE=0 GTEST_TOTAL_SHARDS=38 GTEST_SHARD_INDEX=32 /home/jayfoad2/llvm-release/tools/clang/unittests/Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests -- Script: -- /home/jayfoad2/llvm-release/tools/clang/unittests/Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests --gtest_filter=EnvironmentTest.ModelMemberForAccessorUsingMethodPointerThroughTemplate -- /home/jayfoad2/git/llvm-project/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp:362: Failure Value of: DAContext.getModeledFields(QualType(Struct->getTypeForDecl(), 0)) Expected: contains at least one element that is equal to 0x4b29e98 Actual: {} /home/jayfoad2/git/llvm-project/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp:362 Value of: DAContext.getModeledFields(QualType(Struct->getTypeForDecl(), 0)) Expected: contains at least one element that is equal to 0x4b29e98 Actual: {} Failed Tests (1): Clang-Unit :: Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests/EnvironmentTest/ModelMemberForAccessorUsingMethodPointerThroughTemplate ``` https://github.com/llvm/llvm-project/pull/73978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-format] Fix a bug in `git-clang-format --binary` (PR #74293)
https://github.com/jayfoad approved this pull request. LGTM but the commit message should really explain what problem this fixes instead of just saying "rework". https://github.com/llvm/llvm-project/pull/74293 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] Reapply "InstCombine: Introduce SimplifyDemandedUseFPClass"" (PR #74056)
jayfoad wrote: > The referenced issue violates the spec for finite-only math only by > using a return value for a constant infinity. You mean this issue? https://github.com/llvm/llvm-project/commit/5a36904c515b#commitcomment-129847939 Can you explain how your patch "broke" it? If you return infinity from a function marked with `ninf`, I would expect your patch to have no effect, because `DemandedMask & Known.KnownFPClasses` will be empty so `getFPClassConstant` will return `nullptr`. https://github.com/llvm/llvm-project/pull/74056 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[lldb] [llvm] [mlir] [openmp] [libc] [flang] [clang] [AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and intrinsic (PR #76149)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/76149 >From b14a554a15e4de88c9afc428f9c6898090e6eb23 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 21 Dec 2023 12:00:26 + Subject: [PATCH] [AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and intrinsic --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 10 ++- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 1 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 1 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 1 + llvm/lib/Target/AMDGPU/FLATInstructions.td| 11 +++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1 + ...vm.amdgcn.global.atomic.ordered.add.b64.ll | 65 +++ llvm/test/MC/AMDGPU/gfx11_unsupported.s | 3 + llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 24 +++ .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 12 10 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 51bd9b63c127ed..3985c8871e1615 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -10,6 +10,8 @@ // //===--===// +def global_ptr_ty : LLVMQualPointerType<1>; + class AMDGPUReadPreloadRegisterIntrinsic : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; @@ -2353,10 +2355,10 @@ def int_amdgcn_s_get_waveid_in_workgroup : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -class AMDGPUGlobalAtomicRtn : Intrinsic < +class AMDGPUGlobalAtomicRtn : Intrinsic < [vt], - [llvm_anyptr_ty,// vaddr - vt], // vdata(VGPR) + [pt, // vaddr + vt], // vdata(VGPR) [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; @@ -2486,6 +2488,8 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var" [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, 
IntrNoFree]>; +def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUGlobalAtomicRtn; + def int_amdgcn_flat_atomic_fmin_num : AMDGPUGlobalAtomicRtn; def int_amdgcn_flat_atomic_fmax_num : AMDGPUGlobalAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUGlobalAtomicRtn; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index eaf72d7157ee2d..36e07d944c942c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -642,6 +642,7 @@ defm int_amdgcn_global_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op; defm int_amdgcn_ds_fadd_v2bf16 : noret_op; +defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; defm int_amdgcn_global_atomic_fmin_num : noret_op; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c9412f720c62ec..fba060464a6e74 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4690,6 +4690,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: +case Intrinsic::amdgcn_global_atomic_ordered_add_b64: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index beb670669581f1..4cc8871a00fe1f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -243,6 +243,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def 
: SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 0dd2b3f5c2c912..615f8cd54d8f9c 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -926,9 +926,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">; defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">; -} // End is_flat_global = 1 - +let Subt
[clang] [openmp] [flang] [lldb] [libc] [mlir] [llvm] [AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and intrinsic (PR #76149)
jayfoad wrote: Ping! https://github.com/llvm/llvm-project/pull/76149 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[openmp] [clang] [libc] [mlir] [lldb] [flang] [llvm] [AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and intrinsic (PR #76149)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/76149 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [clang] [AMDGPU][GFX12] Default component broadcast store (PR #76212)
https://github.com/jayfoad approved this pull request. LGTM. @arsenm does this address your concerns? https://github.com/llvm/llvm-project/pull/76212 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [clang] [AMDGPU] Flip the default value of maybeAtomic. NFCI. (PR #75220)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/75220 >From 429d0a22cd4208eb0c854ccf98df1ba86fd3b0cb Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 12 Dec 2023 17:15:26 + Subject: [PATCH] [AMDGPU] Flip the default value of maybeAtomic. NFCI. In practice maybeAtomic = 0 is used to prevent SIMemoryLegalizer from interfering with instructions that are mayLoad or mayStore but lack MachineMemOperands. These instructions should be the exception not the rule, so this patch sets maybeAtomic = 1 by default and only overrides it to 0 where necessary. --- llvm/lib/Target/AMDGPU/BUFInstructions.td| 4 llvm/lib/Target/AMDGPU/DSInstructions.td | 1 - llvm/lib/Target/AMDGPU/EXPInstructions.td| 1 + llvm/lib/Target/AMDGPU/FLATInstructions.td | 7 --- llvm/lib/Target/AMDGPU/LDSDIRInstructions.td | 1 + llvm/lib/Target/AMDGPU/SIInstrFormats.td | 2 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +- llvm/lib/Target/AMDGPU/SMInstructions.td | 1 + 8 files changed, 5 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 44fd4ef8641270..4696ea47f9cefd 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -477,7 +477,6 @@ class MUBUF_Load_Pseudo .ret; let mayLoad = 0; let mayStore = 1; - let maybeAtomic = 1; let elements = getMUBUFElements.ret; let tfe = isTFE; } @@ -618,7 +616,6 @@ class MUBUF_Pseudo_Store_Lds let LGKM_CNT = 1; let mayLoad = 1; let mayStore = 1; - let maybeAtomic = 1; let has_vdata = 0; let has_vaddr = 0; @@ -680,7 +677,6 @@ class MUBUF_Atomic_Pseudo patt // Most instruction load and store data, so set this as the default. 
let mayLoad = 1; let mayStore = 1; - let maybeAtomic = 1; let hasSideEffects = 0; let SchedRW = [WriteLDS]; diff --git a/llvm/lib/Target/AMDGPU/EXPInstructions.td b/llvm/lib/Target/AMDGPU/EXPInstructions.td index ff1d661ef6fe1d..4cfee7d013ef1a 100644 --- a/llvm/lib/Target/AMDGPU/EXPInstructions.td +++ b/llvm/lib/Target/AMDGPU/EXPInstructions.td @@ -20,6 +20,7 @@ class EXPCommon : InstSI< let EXP_CNT = 1; let mayLoad = done; let mayStore = 1; + let maybeAtomic = 0; let UseNamedOperandTable = 1; let Uses = !if(row, [EXEC, M0], [EXEC]); let SchedRW = [WriteExport]; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c0251164faee8b..a1ff3af663352e 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -173,7 +173,6 @@ class FLAT_Load_Pseudo { @@ -221,7 +219,6 @@ class FLAT_Global_Load_AddTid_Pseudo { @@ -450,7 +444,6 @@ class FLAT_AtomicNoRet_Pseudo : InstSI< let hasSideEffects = 0; let mayLoad = 1; let mayStore = 0; + let maybeAtomic = 0; string Mnemonic = opName; let UseNamedOperandTable = 1; diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 585a3eb7861878..1b66d163714fbc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -91,7 +91,7 @@ class InstSI { let hasSideEffects = 1; - let maybeAtomic = 1; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { @@ -557,6 +556,7 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), let hasNoSchedulingInfo = 1; let FixedSize = 1; let isMeta = 1; + let maybeAtomic = 0; } // Used as an isel pseudo to directly emit initialization with an diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index c18846483cf95a..323f49ab91f01e 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -29,6 +29,7 @@ class 
SM_Pseudo patt let mayStore = 0; let mayLoad = 1; let hasSideEffects = 0; + let maybeAtomic = 0; let UseNamedOperandTable = 1; let SchedRW = [WriteSMEM]; ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [llvm] [AMDGPU] Flip the default value of maybeAtomic. NFCI. (PR #75220)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/75220 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [AMDGPU] Flip the default value of maybeAtomic. NFCI. (PR #75220)
@@ -29,6 +29,7 @@ class SM_Pseudo patt let mayStore = 0; let mayLoad = 1; let hasSideEffects = 0; + let maybeAtomic = 0; jayfoad wrote: #77443 https://github.com/llvm/llvm-project/pull/75220 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [libclc] [lld] [flang] [mlir] [libcxx] [libunwind] [clang] [lldb] [libc] [llvm] [compiler-rt] [AMDGPU] Fix broken sign-extended subword buffer load combine (PR #77470)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/77470 >From ae231d88c5b5e2e0996edefd45389992f8e97d05 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 9 Jan 2024 13:16:24 + Subject: [PATCH 1/3] [AMDGPU] Precommit tests for broken combine Add tests for sign-extending the result of an unsigned subword buffer load from the wrong width. --- .../llvm.amdgcn.struct.buffer.load.ll | 82 +++ 1 file changed, 82 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll index 81c0f7557e6417..fcd7821a86897e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll @@ -500,6 +500,47 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr ret float %cast } +define amdgpu_ps float @struct_buffer_load_i8_sext_wrong_width(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; GFX8-LABEL: name: struct_buffer_load_i8_sext_wrong_width + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX8-NEXT: [[BUFFER_LOAD_SBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_BOTHEN 
[[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) + ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_BOTHEN]] + ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: struct_buffer_load_i8_sext_wrong_width + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX12-NEXT: [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %trunc = trunc i8 %val to i4 + %ext = sext i4 %trunc to i32 + %cast = bitcast i32 %ext to float + ret float %cast +} + define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX8-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; GFX8: bb.1 (%ir-block.0): @@ -580,6 +621,47 @@ 
define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp ret float %cast } +define amdgpu_ps float @struct_buffer_load_i16_sext_wrong_width(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + ; GFX8-LABEL: name: struct_buffer_load_i16_sext_wrong_width + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; G
[clang-tools-extra] [libc] [mlir] [lld] [libcxx] [libclc] [llvm] [clang] [flang] [libunwind] [lldb] [compiler-rt] [AMDGPU] Fix broken sign-extended subword buffer load combine (PR #77470)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/77470 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 6bec3e9 - [APInt] Remove all uses of zextOrSelf, sextOrSelf and truncOrSelf
Author: Jay Foad Date: 2022-05-19T11:23:13+01:00 New Revision: 6bec3e9303d68b8b264de3a02ca943d9dd752004 URL: https://github.com/llvm/llvm-project/commit/6bec3e9303d68b8b264de3a02ca943d9dd752004 DIFF: https://github.com/llvm/llvm-project/commit/6bec3e9303d68b8b264de3a02ca943d9dd752004.diff LOG: [APInt] Remove all uses of zextOrSelf, sextOrSelf and truncOrSelf Most clients only used these methods because they wanted to be able to extend or truncate to the same bit width (which is a no-op). Now that the standard zext, sext and trunc allow this, there is no reason to use the OrSelf versions. The OrSelf versions additionally have the strange behaviour of allowing extending to a *smaller* width, or truncating to a *larger* width, which are also treated as no-ops. A small amount of client code relied on this (ConstantRange::castOp and MicrosoftCXXNameMangler::mangleNumber) and needed rewriting. Differential Revision: https://reviews.llvm.org/D125557 Added: Modified: clang/lib/AST/ExprConstant.cpp clang/lib/AST/MicrosoftMangle.cpp clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Sema/SemaDecl.cpp clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp llvm/lib/Analysis/BasicAliasAnalysis.cpp llvm/lib/Analysis/ConstantFolding.cpp llvm/lib/Analysis/LazyValueInfo.cpp llvm/lib/Analysis/MemoryBuiltins.cpp llvm/lib/Analysis/ScalarEvolution.cpp llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp llvm/lib/IR/ConstantRange.cpp llvm/lib/Support/APFixedPoint.cpp llvm/lib/Support/APInt.cpp llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp llvm/lib/Target/AArch64/AArch64ISelLowering.cpp llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp llvm/lib/Target/RISCV/RISCVISelLowering.cpp llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86TargetTransformInfo.cpp llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp llvm/test/TableGen/VarLenEncoder.td llvm/utils/TableGen/VarLenCodeEmitterGen.cpp polly/lib/CodeGen/IslExprBuilder.cpp Removed: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 519be84a342b3..f679dba44f001 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -8596,7 +8596,7 @@ static bool getBytesReturnedByAllocSizeCall(const ASTContext &Ctx, Into = ExprResult.Val.getInt(); if (Into.isNegative() || !Into.isIntN(BitsInSizeT)) return false; -Into = Into.zextOrSelf(BitsInSizeT); +Into = Into.zext(BitsInSizeT); return true; }; @@ -9582,8 +9582,8 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const CXXNewExpr *E) { unsigned Bits = std::max(CAT->getSize().getBitWidth(), ArrayBound.getBitWidth()); - llvm::APInt InitBound = CAT->getSize().zextOrSelf(Bits); - llvm::APInt AllocBound = ArrayBound.zextOrSelf(Bits); + llvm::APInt InitBound = CAT->getSize().zext(Bits); + llvm::APInt AllocBound = ArrayBound.zext(Bits); if (InitBound.ugt(AllocBound)) { if (IsNothrow) return ZeroInitialization(E); @@ -10377,9 +10377,9 @@ bool VectorExprEvaluator::VisitCastExpr(const CastExpr *E) { for (unsigned i = 0; i < NElts; i++) { llvm::APInt Elt; if (BigEndian) - Elt = SValInt.rotl(i*EltSize+FloatEltSize).truncOrSelf(FloatEltSize); + Elt = SValInt.rotl(i * EltSize + FloatEltSize).trunc(FloatEltSize); else - Elt = SValInt.rotr(i*EltSize).truncOrSelf(FloatEltSize); + Elt = SValInt.rotr(i * EltSize).trunc(FloatEltSize); Elts.push_back(APValue(APFloat(Sem, Elt))); } } else if (EltTy->isIntegerType()) { diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp 
index abe2b64f57278..e84946d1f21ec 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -808,8 +808,8 @@ void MicrosoftCXXNameMangler::mangleNumber(llvm::APSInt Number) { // to convert every integer to signed 64 bit before mangling (including // unsigned 64 bit values). Do the same, but preserve bits beyond the bottom // 64. - llvm::APInt Value = - Number.isSigned() ? Number.sextOrSelf(64) : Number.zextOrSelf(64); + unsigned Width = std::max(Number.getBitWidth(), 64U); + llvm::APInt Value = Number.extend(Width);
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2326,6 +2326,20 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } #endif +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + AMDGPU::Waitcnt Wait; + if (ST->hasExtendedWaitCounts()) +Wait = AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0); + else +Wait = AMDGPU::Waitcnt(0, 0, 0, 0); + + if (!Inst.mayStore()) +Wait.StoreCnt = ~0u; jayfoad wrote: ```suggestion AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(Inst.mayStore()); ``` However, as a general rule: - loads and atomics-with-return update LOADcnt - stores and atomics-without-return update STOREcnt so it might be more accurate to use the condition `Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)`. Please make sure you have tests for atomics with and without return. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2326,6 +2326,20 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } #endif +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + AMDGPU::Waitcnt Wait; + if (ST->hasExtendedWaitCounts()) +Wait = AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0); + else +Wait = AMDGPU::Waitcnt(0, 0, 0, 0); + + if (!Inst.mayStore()) +Wait.StoreCnt = ~0u; jayfoad wrote: GFX10 introduced a separate counter for **VMEM** stores with the name VScnt. GFX12 just renamed it to STOREcnt. No architecture has a separate store counter for DS or SMEM. So `ds_add_u32 v0, v1` followed by `s_waitcnt lgkmcnt(0)` (pre-GFX12) or `s_wait_dscnt 0` (GFX12) is fine. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2594,12 +2594,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || MOI.getFailureOrdering() == AtomicOrdering::Acquire || MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= CC->insertWait(MI, MOI.getScope(), -MOI.getInstrAddrSpace(), -isAtomicRet(*MI) ? SIMemOp::LOAD : - SIMemOp::STORE, -MOI.getIsCrossAddressSpaceOrdering(), -Position::AFTER); + Changed |= jayfoad wrote: Remove this. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -0,0 +1,1406 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12 + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic (atomic with return) +; +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX9-LABEL: syncscope_workgroup_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:flat_load_dword v4, v[0:1] +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX9-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT:v_mov_b32_e32 v4, v3 +; GFX9-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT:flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT:s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start 
+; GFX90A-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT:v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT:flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT:v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT:v_mov_b32_e32 v5, v3 +; GFX90A-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT:s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: syncscope_workgroup_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:flat_load_dword v4, v[0:1] +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT:s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT:s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT:buffer_gl0_inv +; GFX10-NEXT:v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT:v_mov_b32_e32 v4, v3 +; GFX10-NEXT:s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT:s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT:s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT:s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT:s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:flat_load_dword v4, v[0:1] +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:s_mov_b64 s[0:1], 0 +; GFX9-FLATSCR-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-FLATSCR-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-FLATSCR-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX9-FLATSCR-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4 +; 
GFX9-FLATSCR-NEXT:s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-FLATSCR-NEXT:v_mov_b32_e32 v4, v3 +; GFX9-FLATSCR-NEXT:s_andn2_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT:s_cbranch_execnz .LBB0_1 +; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-FLATSCR-NEXT:s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT:s_setpc_b64 s[30:31] +; +; GFX11-LABEL: syncscope_workgroup_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT:flat_load_b32 v4, v[0:1] +; GFX11-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT:s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT:s_delay_al
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -0,0 +1,1406 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12 + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic (atomic with return) +; +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX9-LABEL: syncscope_workgroup_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:flat_load_dword v4, v[0:1] +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX9-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT:v_mov_b32_e32 v4, v3 +; GFX9-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT:flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT:s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start 
+; GFX90A-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT:v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT:flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT:v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT:v_mov_b32_e32 v5, v3 +; GFX90A-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT:s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: syncscope_workgroup_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:flat_load_dword v4, v[0:1] +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT:s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT:s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT:buffer_gl0_inv +; GFX10-NEXT:v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT:v_mov_b32_e32 v4, v3 +; GFX10-NEXT:s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT:s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT:s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT:s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT:s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:flat_load_dword v4, v[0:1] +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:s_mov_b64 s[0:1], 0 +; GFX9-FLATSCR-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-FLATSCR-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-FLATSCR-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX9-FLATSCR-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4 +; 
GFX9-FLATSCR-NEXT:s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-FLATSCR-NEXT:v_mov_b32_e32 v4, v3 +; GFX9-FLATSCR-NEXT:s_andn2_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT:s_cbranch_execnz .LBB0_1 +; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-FLATSCR-NEXT:s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT:s_setpc_b64 s[30:31] +; +; GFX11-LABEL: syncscope_workgroup_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT:flat_load_b32 v4, v[0:1] +; GFX11-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT:s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT:s_delay_al
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
jayfoad wrote: > This logic would need updating again for GFX12. It seems like it's > duplicating a lot of knowledge which is already implemented in > SIInsertWaitcnts. Just to demonstrate, you could implement this feature in SIInsertWaitcnts for **all** supported architectures with something like: ```diff diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6ecb1c8bf6e1..910cd094f8f2 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2299,6 +2299,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, updateEventWaitcntAfter(Inst, &ScoreBrackets); +AMDGPU::Waitcnt Wait = +AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()); +ScoreBrackets.simplifyWaitcnt(Wait); +Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block, +ScoreBrackets, /*OldWaitcntInstr=*/nullptr); + #if 0 // TODO: implement resource type check controlled by options with ub = LB. // If this instruction generates a S_SETVSKIP because it is an // indexed resource, and we are on Tahiti, then it will also force ``` Handling VSCNT/STORECNT correctly is a little more complicated but not much. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
https://github.com/jayfoad requested changes to this pull request. I've added _some_ inline comments, but really I don't want to spend the time to review this properly (or maintain it, or extend it for new architectures in future). All this logic already exists in SIInsertWaitcnts. Duplicating it here is not a good design. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2378,6 +2409,215 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } +bool SIGfx6CacheControl::handleNonAtomicForPreciseMemory( +MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + AMDGPU::Waitcnt Wait; + + if (TII->isSMRD(Inst)) { // scalar +if (Inst.mayStore()) + return false; +Wait.DsCnt = 0; // LgkmCnt + } else { // vector +if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst))// VMEM load +Wait.LoadCnt = 0; // VmCnt + else if (TII->isFLAT(Inst)) { // Flat load +Wait.LoadCnt = 0; // VmCnt +Wait.DsCnt = 0; // LgkmCnt + } else// LDS load +Wait.DsCnt = 0; // LgkmCnt +} else {// vector store + if (TII->isVMEM(Inst))// VMEM store +Wait.LoadCnt = 0; // VmCnt + else if (TII->isFLAT(Inst)) { // Flat store +Wait.LoadCnt = 0; // VmCnt +Wait.DsCnt = 0; // LgkmCnt + } else +Wait.DsCnt = 0; // LDS store; LgkmCnt +} + } + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + return true; +} + +bool SIGfx6CacheControl::handleAtomicForPreciseMemory( +MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) { + assert(MI->mayLoadOrStore()); + + AMDGPU::Waitcnt Wait; + + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + return true; +} + +bool SIGfx10CacheControl::handleNonAtomicForPreciseMemory( +MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + AMDGPU::Waitcnt Wait; + + bool BuildWaitCnt = true; + bool BuildVsCnt = false; + + if (TII->isSMRD(Inst)) { // scalar +if (Inst.mayStore()) + return false; +Wait.DsCnt = 0; // LgkmCnt + } else { // vector +if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst))// VMEM load +Wait.LoadCnt = 0; 
// VmCnt + else if (TII->isFLAT(Inst)) { // Flat load +Wait.LoadCnt = 0; // VmCnt +Wait.DsCnt = 0; // LgkmCnt + } else// LDS load +Wait.DsCnt = 0; // LgkmCnt +} + +// For some vector instructions, mayLoad() and mayStore() can be both true. +if (Inst.mayStore()) { // vector store; an instruction can be both + // load/store + if (TII->isVMEM(Inst)) { // VMEM store +if (!Inst.mayLoad()) + BuildWaitCnt = false; +BuildVsCnt = true; + } else if (TII->isFLAT(Inst)) { // Flat store +Wait.DsCnt = 0; // LgkmCnt +BuildVsCnt = true; + } else +Wait.DsCnt = 0; // LDS store; LgkmCnt +} + } + + MachineBasicBlock &MBB = *MI->getParent(); + if (BuildWaitCnt) { +unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); +BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); +--MI; + } + + if (BuildVsCnt) { +BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) +.addReg(AMDGPU::SGPR_NULL, RegState::Undef) +.addImm(0); +--MI; + } + return true; +} + +bool SIGfx10CacheControl ::handleAtomicForPreciseMemory( +MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) { + assert(MI->mayLoadOrStore()); + + AMDGPU::Waitcnt Wait; + + Wait.DsCnt = 0; // LgkmCnt + if (IsAtomicWithRet) +Wait.LoadCnt = 0; // VmCnt + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + if (!IsAtomicWithRet) { +BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) +.addReg(AMDGPU::SGPR_NULL, RegState::Undef) +.addImm(0); +--MI; + } + return true; +} + +bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory( +MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + unsigned WaitType = 0; + // For some vector instructions, mayLoad() and mayStore() can be both true. jayfoad wrote: What kind of (non-atomic) instructions is this supposed to handle? 
https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2378,6 +2409,215 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } +bool SIGfx6CacheControl::handleNonAtomicForPreciseMemory( +MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + AMDGPU::Waitcnt Wait; + + if (TII->isSMRD(Inst)) { // scalar +if (Inst.mayStore()) + return false; +Wait.DsCnt = 0; // LgkmCnt + } else { // vector +if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst))// VMEM load +Wait.LoadCnt = 0; // VmCnt + else if (TII->isFLAT(Inst)) { // Flat load +Wait.LoadCnt = 0; // VmCnt +Wait.DsCnt = 0; // LgkmCnt + } else// LDS load +Wait.DsCnt = 0; // LgkmCnt +} else {// vector store + if (TII->isVMEM(Inst))// VMEM store +Wait.LoadCnt = 0; // VmCnt + else if (TII->isFLAT(Inst)) { // Flat store +Wait.LoadCnt = 0; // VmCnt +Wait.DsCnt = 0; // LgkmCnt + } else +Wait.DsCnt = 0; // LDS store; LgkmCnt +} + } + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + return true; +} + +bool SIGfx6CacheControl::handleAtomicForPreciseMemory( +MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) { + assert(MI->mayLoadOrStore()); + + AMDGPU::Waitcnt Wait; + + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + return true; +} + +bool SIGfx10CacheControl::handleNonAtomicForPreciseMemory( +MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + AMDGPU::Waitcnt Wait; + + bool BuildWaitCnt = true; + bool BuildVsCnt = false; + + if (TII->isSMRD(Inst)) { // scalar +if (Inst.mayStore()) + return false; +Wait.DsCnt = 0; // LgkmCnt + } else { // vector +if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst))// VMEM load +Wait.LoadCnt = 0; 
// VmCnt + else if (TII->isFLAT(Inst)) { // Flat load +Wait.LoadCnt = 0; // VmCnt +Wait.DsCnt = 0; // LgkmCnt + } else// LDS load +Wait.DsCnt = 0; // LgkmCnt +} + +// For some vector instructions, mayLoad() and mayStore() can be both true. +if (Inst.mayStore()) { // vector store; an instruction can be both + // load/store + if (TII->isVMEM(Inst)) { // VMEM store +if (!Inst.mayLoad()) + BuildWaitCnt = false; +BuildVsCnt = true; + } else if (TII->isFLAT(Inst)) { // Flat store +Wait.DsCnt = 0; // LgkmCnt +BuildVsCnt = true; + } else +Wait.DsCnt = 0; // LDS store; LgkmCnt +} + } + + MachineBasicBlock &MBB = *MI->getParent(); + if (BuildWaitCnt) { +unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); +BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); +--MI; + } + + if (BuildVsCnt) { +BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) +.addReg(AMDGPU::SGPR_NULL, RegState::Undef) +.addImm(0); +--MI; + } + return true; +} + +bool SIGfx10CacheControl ::handleAtomicForPreciseMemory( +MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) { + assert(MI->mayLoadOrStore()); + + AMDGPU::Waitcnt Wait; + + Wait.DsCnt = 0; // LgkmCnt + if (IsAtomicWithRet) +Wait.LoadCnt = 0; // VmCnt + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + if (!IsAtomicWithRet) { +BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) +.addReg(AMDGPU::SGPR_NULL, RegState::Undef) +.addImm(0); +--MI; + } + return true; +} + +bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory( +MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + unsigned WaitType = 0; + // For some vector instructions, mayLoad() and mayStore() can be both true. 
+ bool LoadAndStore = false; + + if (TII->isSMRD(Inst)) { // scalar +if (Inst.mayStore()) + return false; + +WaitType = AMDGPU::S_WAIT_KMCNT; + } else { // vector +if (Inst.mayLoad() && Inst.mayStore()) { + WaitType = AMDGPU::S_WAIT_LOADCNT; + LoadAndStore = true; +} else if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(In
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
https://github.com/jayfoad edited https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -355,6 +356,18 @@ class SICacheControl { MachineBasicBlock::iterator &MI) const { return false; } + +public: + // The following is for supporting precise memory mode. When the feature + // precise-memory is enabled, an s_waitcnt instruction is inserted + // after each memory instruction. + + virtual bool + handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) = 0; + /// Handles atomic instruction \p MI with \p IsAtomicWithRet indicating + /// whether \p MI returns a result. + virtual bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI, jayfoad wrote: This function is never even called. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU] Allow w64 ballot to be used on w32 targets (PR #80183)
jayfoad wrote: After this change is there any value in having two different builtins? You could just have one that always returns 64 bits. https://github.com/llvm/llvm-project/pull/80183 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU] Check wavefrontsize for GFX11 WMMA builtins (PR #79980)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/79980 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } +bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + IsaVersion IV = getIsaVersion(ST.getCPU()); + + bool Changed = false; + + for (auto &MBB : MF) { +for (auto MI = MBB.begin(); MI != MBB.end();) { + MachineInstr &Inst = *MI; + ++MI; + if (Inst.mayLoadOrStore() == false) +continue; + + // Todo: if next insn is an s_waitcnt + AMDGPU::Waitcnt Wait; + + if (!(Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic)) { +if (TII->isSMRD(Inst)) { // scalar jayfoad wrote: This logic would need updating again for GFX12. It seems like it's duplicating a lot of knowledge which is already implemented in SIInsertWaitcnts. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
https://github.com/jayfoad edited https://github.com/llvm/llvm-project/pull/78729 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
https://github.com/jayfoad approved this pull request. LGTM. https://github.com/llvm/llvm-project/pull/78729 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)
@@ -4,10 +4,114 @@ typedef unsigned int uint; -kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) { +#pragma OPENCL EXTENSION cl_khr_fp64:enable + +typedef float v2f __attribute__((ext_vector_type(2))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef float v16f __attribute__((ext_vector_type(16))); +typedef float v32f __attribute__((ext_vector_type(32))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef half v32h __attribute__((ext_vector_type(32))); +typedef intv2i __attribute__((ext_vector_type(2))); +typedef intv4i __attribute__((ext_vector_type(4))); +typedef intv16i __attribute__((ext_vector_type(16))); +typedef intv32i __attribute__((ext_vector_type(32))); +typedef short v2s __attribute__((ext_vector_type(2))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef short v16s __attribute__((ext_vector_type(16))); +typedef short v32s __attribute__((ext_vector_type(32))); +typedef double v4d __attribute__((ext_vector_type(4))); + +void builtin_test_unsupported(global v32f*out_v32f, + global v16f*out_v16f, + global v4f* out_v4f, + global v32i*out_v32i, + global v16i*out_v16i, + global v4i* out_v4i, + global v4d* out_v4d, + global double* out_double, + double a_double , double b_double , double c_double, jayfoad wrote: Nit: you don't really need separate out/a/b/c versions of all these types. You could just test expressions like: ``` x_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(x_float, x_float, x_v32f, 0, 0, 0); ``` https://github.com/llvm/llvm-project/pull/78729 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [llvm] [AMDGPU] Update SITargetLowering::getAddrModeArguments (PR #78740)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/78740 >From c7636536d65a3792223e083dc5bacd0a8e6ff3d7 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Jan 2024 16:06:00 + Subject: [PATCH] [AMDGPU] Update SITargetLowering::getAddrModeArguments Handle every intrinsic for which getTgtMemIntrinsic returns with Info.ptrVal set to one of the intrinsic's operands. A bunch of these cases were missing. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 36 +++ 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index cc0c4d4e36eaa8e..66ae9222fb50c89 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1406,31 +1406,41 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, SmallVectorImpl &Ops, Type *&AccessTy) const { + Value *Ptr = nullptr; switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_ds_ordered_add: - case Intrinsic::amdgcn_ds_ordered_swap: + case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: case Intrinsic::amdgcn_ds_fadd: - case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: - case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_flat_atomic_fadd: - case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fmax: - case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_global_atomic_csub: + case Intrinsic::amdgcn_global_atomic_fadd: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: - case 
Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_global_atomic_csub: { -Value *Ptr = II->getArgOperand(0); -AccessTy = II->getType(); -Ops.push_back(Ptr); -return true; - } + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fmax_num: + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmin_num: + case Intrinsic::amdgcn_global_atomic_ordered_add_b64: +Ptr = II->getArgOperand(0); +break; + case Intrinsic::amdgcn_global_load_lds: +Ptr = II->getArgOperand(1); +break; default: return false; } + AccessTy = II->getType(); + Ops.push_back(Ptr); + return true; } bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM, ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[mlir] [clang] [llvm] [AMDGPU] Add GFX12 WMMA and SWMMAC instructions (PR #77795)
jayfoad wrote: > Also need to be updated: > > https://github.com/llvm/llvm-project/blob/bb6a4850553dd4140a5bd63187ec1b14d0b731f9/llvm/lib/Target/AMDGPU/SMInstructions.td#L14 What needs to be updated and why? https://github.com/llvm/llvm-project/pull/77795 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [flang] [llvm] [clang] [compiler-rt] [AMDGPU] Fold operand after shrinking instruction in SIFoldOperands (PR #68426)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/68426 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [clang] [AMDGPU] Update SITargetLowering::getAddrModeArguments (PR #78740)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/78740 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (PR #77438)
jayfoad wrote: > @jayfoad, can you link to the documentation where these new registers are > described? Preferably from a comment in the top of the file(s). It would make > it easier to review for correctness. ISA documentation will be linked from https://llvm.org/docs/AMDGPUUsage.html#additional-documentation when it is made public. https://github.com/llvm/llvm-project/pull/77438 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU] Check wavefrontsize for GFX11 WMMA builtins (PR #79980)
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/79980 None >From cace712a8f379df3498dd76bc1f95eb4671e997c Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 30 Jan 2024 11:04:33 + Subject: [PATCH] [AMDGPU] Check wavefrontsize for GFX11 WMMA builtins --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 34 +-- .../builtins-amdgcn-wmma-w32-gfx10-err.cl | 16 - .../builtins-amdgcn-wmma-w64-gfx10-err.cl | 18 +- .../CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl | 2 +- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 74dfd1d214e8..e9dd8dcd0b60 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -292,23 +292,23 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_event_export_ready, "v", "n", "gfx11-inst // Postfix w32 indicates the builtin requires wavefront size of 32. // Postfix w64 indicates the builtin requires wavefront size of 64. 
//===--===// -TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, "V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, "V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts") - -TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, "V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts") -TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, "V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc", "gfx11-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", 
"gfx11-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, "V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, "V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", "nc", "gfx11-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", "nc", "gfx11-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, "V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, "V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_s_sendmsg_rtn, "UiUIi", "n", "gfx11-insts") TARGET_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl, "UWiUIi", "n", "gfx11-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-e
[clang] [AMDGPU] Check wavefrontsize for GFX11 WMMA builtins (PR #79980)
@@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out4f, v16h a16h, v16h b global v8s* out8s, v4i a4i, v4i b4i, v8s c8s, global v4i* out4i, v2i a2i, v2i b2i, v4i c4i) { - *out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature gfx11-insts}} - *out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature gfx11-insts}} - *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature gfx11-insts}} - *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature gfx11-insts}} - *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature gfx11-insts}} - *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature gfx11-insts}} - *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature gfx11-insts}} - *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature gfx11-insts}} + *out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}} + *out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature 
gfx11-insts,wavefrontsize64}} + *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}} + *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}} + *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}} + *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}} + *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature gfx11-insts,wavefrontsize64}} + *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature gfx11-insts,wavefrontsize64}} } -#endif \ No newline at end of file +#endif jayfoad wrote: Yes. My editor did that. Previously there was no newline on the end of the `#endif`. Lots of tools flag that as unusual. https://github.com/llvm/llvm-project/pull/79980 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU] Check wavefrontsize for GFX11 WMMA builtins (PR #79980)
jayfoad wrote: > Do you think it makes sense to add two gfx11 tests where _w32 variant is now > rejected with w64, and _w64 variant rejected with w32? Maybe, but I didn't have the energy to add yet more tests. > Maybe what is being printed in *-gfx10-err.cl test is enough, though. Right, that was my thinking. https://github.com/llvm/llvm-project/pull/79980 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AMDGPU] Add GFX12 __builtin_amdgcn_s_sleep_var (PR #77926)
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/77926 None >From 3d4b8547514f2315130599230e769a8c73be01c3 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 12 Jan 2024 12:43:16 + Subject: [PATCH] [AMDGPU] Add GFX12 __builtin_amdgcn_s_sleep_var --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 15 +++ 2 files changed, 16 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e562ef04a30194..d0c4b664bf0313 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -410,6 +410,7 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts") // GFX12+ only builtins. //===--===// +TARGET_BUILTIN(__builtin_amdgcn_s_sleep_var, "vUi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_permlane16_var, "UiUiUiUiIbIb", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_permlanex16_var, "UiUiUiUiIbIb", "nc", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index 2899d9e5c28898..ebd367bba0cdc1 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -5,6 +5,21 @@ typedef unsigned int uint; +// CHECK-LABEL: @test_s_sleep_var( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT:store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4 +// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4 +// CHECK-NEXT:call void @llvm.amdgcn.s.sleep.var(i32 [[TMP0]]) +// CHECK-NEXT:call void @llvm.amdgcn.s.sleep.var(i32 15) +// CHECK-NEXT:ret void +// +void test_s_sleep_var(int d) +{ + __builtin_amdgcn_s_sleep_var(d); + __builtin_amdgcn_s_sleep_var(15); +} + // CHECK-LABEL: 
@test_permlane16_var( // CHECK-NEXT: entry: // CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)
jayfoad wrote: > Adding support in atomicrmw. This will require adding a new operation to > atomicrmw "cond_sub" Yes, and we have (Matt has) done this in the past, but it will require a wider consensus. I think it's fine to add AMDGPU intrinsics for this in the meantime. https://github.com/llvm/llvm-project/pull/76224 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [AMDGPU] Disable V_MAD_U64_U32/V_MAD_I64_I32 workaround for GFX12 (PR #77927)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/77927 >From 3f3bcdb89adf032e26c95807abf5e3b23ff50e4a Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 12 Jan 2024 12:24:28 + Subject: [PATCH 1/2] Precommit extra GFX12 test coverage --- .../GlobalISel/inst-select-mad_64_32.mir | 21 ++ llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 163 ++ llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 211 ++ 3 files changed, 395 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir index 698281caca245e9..6e33ef37397d6b4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx1030 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX10 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX11 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX12 %s --- name: mad_u64_u32_vvv @@ -18,6 +19,7 @@ body: | ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; GFX10-NEXT: [[V_MAD_U64_U32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_e64_]], implicit [[V_MAD_U64_U32_e64_1]] +; ; GFX11-LABEL: name: mad_u64_u32_vvv ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -26,6 +28,15 @@ body: | ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; 
GFX11-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]] +; +; GFX12-LABEL: name: mad_u64_u32_vvv +; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 +; GFX12-NEXT: {{ $}} +; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 +; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 +; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 +; GFX12-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec +; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -51,6 +62,7 @@ body: | ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; GFX10-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_e64_]], implicit [[V_MAD_I64_I32_e64_1]] +; ; GFX11-LABEL: name: mad_i64_i32_vvv ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -59,6 +71,15 @@ body: | ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; GFX11-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]] +; +; GFX12-LABEL: name: mad_i64_i32_vvv +; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 +; GFX12-NEXT: {{ $}} +; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 +; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY 
$vgpr1 +; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 +; GFX12-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec +; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index 249acec639540b3..b9b03e52ec865c0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -
[llvm] [clang] [clang-tools-extra] [AMDGPU] Disable V_MAD_U64_U32/V_MAD_I64_I32 workaround for GFX12 (PR #77927)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/77927 >From 3f3bcdb89adf032e26c95807abf5e3b23ff50e4a Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 12 Jan 2024 12:24:28 + Subject: [PATCH 1/3] Precommit extra GFX12 test coverage --- .../GlobalISel/inst-select-mad_64_32.mir | 21 ++ llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 163 ++ llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 211 ++ 3 files changed, 395 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir index 698281caca245e..6e33ef37397d6b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx1030 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX10 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX11 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX12 %s --- name: mad_u64_u32_vvv @@ -18,6 +19,7 @@ body: | ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; GFX10-NEXT: [[V_MAD_U64_U32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_e64_]], implicit [[V_MAD_U64_U32_e64_1]] +; ; GFX11-LABEL: name: mad_u64_u32_vvv ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -26,6 +28,15 @@ body: | ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; 
GFX11-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]] +; +; GFX12-LABEL: name: mad_u64_u32_vvv +; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 +; GFX12-NEXT: {{ $}} +; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 +; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 +; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 +; GFX12-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec +; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -51,6 +62,7 @@ body: | ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; GFX10-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_e64_]], implicit [[V_MAD_I64_I32_e64_1]] +; ; GFX11-LABEL: name: mad_i64_i32_vvv ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -59,6 +71,15 @@ body: | ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; GFX11-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]] +; +; GFX12-LABEL: name: mad_i64_i32_vvv +; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 +; GFX12-NEXT: {{ $}} +; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 +; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY 
$vgpr1 +; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 +; GFX12-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec +; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index 249acec639540b..b9b03e52ec865c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu
[libcxx] [clang] [libc] [llvm] [clang-tools-extra] [flang] [compiler-rt] [AMDGPU] Fix llvm.amdgcn.s.wait.event.export.ready for GFX12 (PR #78191)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/78191 >From 9990fbc26ed3dc245a5127345326050acac49d66 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 21 Apr 2023 10:46:43 +0100 Subject: [PATCH] [AMDGPU] Fix llvm.amdgcn.s.wait.event.export.ready for GFX12 The meaning of bit 0 of the immediate operand of S_WAIT_EVENT has been flipped from GFX11. --- llvm/lib/Target/AMDGPU/SOPInstructions.td| 8 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll | 9 ++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 46fa3d57a21cb2..b78d900c9bbf42 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1768,10 +1768,10 @@ def : GCNPat< (S_SEXT_I32_I16 $src) >; -def : GCNPat < - (int_amdgcn_s_wait_event_export_ready), -(S_WAIT_EVENT (i16 0)) ->; +let SubtargetPredicate = isNotGFX12Plus in + def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 0))>; +let SubtargetPredicate = isGFX12Plus in + def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 1))>; // The first 10 bits of the mode register are the core FP mode on all // subtargets. 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll index 3e95e4dec67a2b..25b5ddcf946b35 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll @@ -1,8 +1,11 @@ -; RUN: llc -global-isel=0 -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel=1 -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel=0 -march=amdgcn -verify-machineinstrs -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -global-isel=1 -march=amdgcn -verify-machineinstrs -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s ; GCN-LABEL: {{^}}test_wait_event: -; GCN: s_wait_event 0x0 +; GFX11: s_wait_event 0x0 +; GFX12: s_wait_event 0x1 define amdgpu_ps void @test_wait_event() #0 { entry: ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [clang] [llvm] [AMDGPU] Disable V_MAD_U64_U32/V_MAD_I64_I32 workaround for GFX12 (PR #77927)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/77927 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[flang] [libc] [llvm] [clang-tools-extra] [clang] [compiler-rt] [libcxx] [AMDGPU] Fix llvm.amdgcn.s.wait.event.export.ready for GFX12 (PR #78191)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/78191 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [llvm] [AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on GFX12 (PR #77929)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/77929 >From 4299ba898449f782c642b0c27f0ec9970aee0a1c Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 12 Jan 2024 11:34:02 + Subject: [PATCH 1/2] [AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on GFX12 --- llvm/lib/Target/AMDGPU/AMDGPU.td| 3 ++- llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir | 1 + llvm/test/MC/AMDGPU/gfx12_asm_features.s| 17 + .../Disassembler/AMDGPU/gfx12_dasm_features.txt | 13 + 4 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index b27edb1e9e14bb..682ca6c57c973b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1502,7 +1502,8 @@ def FeatureISAVersion12 : FeatureSet< FeatureHasRestrictedSOffset, FeatureVGPRSingleUseHintInsts, FeatureMADIntraFwdBug, - FeatureScalarDwordx3Loads]>; + FeatureScalarDwordx3Loads, + FeatureDPPSrc1SGPR]>; //===--===// diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir index fe1345e29f133d..7d081a1491da6e 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir @@ -1,5 +1,6 @@ # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100 # RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150 +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150 --- diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_features.s b/llvm/test/MC/AMDGPU/gfx12_asm_features.s index 7e58bdb3b444e1..da4464c6494dbf 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_features.s @@ -1,5 +1,22 @@ 
// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s +// +// Subtargets allow src1 of VOP3 DPP instructions to be SGPR or inlinable +// constant. +// + +v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] + +v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff] + +v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0] +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05] + +v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] +// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05] + // // Elements of CPol operand can be given in any order // diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt new file mode 100644 index 00..2c64522422ad0d --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s + +# GFX12: v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] +0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff + +# GFX12: v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff] +0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff + +# GFX12: v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05] +0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05 + +# GFX12: v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05] +0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05 >From a65834ad3d8aed3e9cb1414d7576d5244a31f8a2 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 17 Jan 2024 14:39:09 + Subject: [PATCH 2/2] More tests --- llvm/test/MC/AMDGPU/gfx1150_asm_features.s | 6 ++ llvm/test/MC/AMDGPU/gfx12_asm_features.s | 6 ++ llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt | 6 ++ llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt | 6 ++ 4 files changed, 24 insertions(+) diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s index a4904c40b40ae7..55c855175a89e0 100644 --- a/llvm/test/MC/AMDGPU/gfx115
[clang-tools-extra] [llvm] [clang] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (PR #77438)
jayfoad wrote: @Pierre-vh @arsen ping! (Sorry, I know it has only been a few days.) https://github.com/llvm/llvm-project/pull/77438 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [llvm] [AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on GFX12 (PR #77929)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/77929 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang-tools-extra] [AMDGPU] Work around s_getpc_b64 zero extending on GFX12 (PR #78186)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/78186 >From d3f4ebf849f6ef1ea373e5c7f93398db6681b2b6 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 15 Jan 2024 15:02:08 + Subject: [PATCH 1/4] Add GFX11/12 test coverage --- llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 103 +- 1 file changed, 77 insertions(+), 26 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll index 598d7a8033c2e54..2c1baeeeda21697 100644 --- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll @@ -1,32 +1,83 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s - +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12 define void @test_remat_s_getpc_b64() { -; CHECK-LABEL: test_remat_s_getpc_b64: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT:s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT:buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT:s_mov_b64 exec, s[4:5] -; CHECK-NEXT:v_writelane_b32 v0, s30, 0 -; CHECK-NEXT:s_getpc_b64 s[4:5] -; CHECK-NEXT:v_writelane_b32 v0, s31, 1 -; CHECK-NEXT:;;#ASMSTART -; CHECK-NEXT:;;#ASMEND -; CHECK-NEXT:;;#ASMSTART -; CHECK-NEXT:;;#ASMEND -; CHECK-NEXT:s_getpc_b64 s[4:5] -; CHECK-NEXT:v_mov_b32_e32 v1, s4 -; CHECK-NEXT:v_mov_b32_e32 v2, s5 -; CHECK-NEXT:global_store_dwordx2 v[1:2], v[1:2], off -; CHECK-NEXT:v_readlane_b32 s31, v0, 1 -; CHECK-NEXT:v_readlane_b32 s30, v0, 
0 -; CHECK-NEXT:s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT:buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT:s_mov_b64 exec, s[4:5] -; CHECK-NEXT:s_waitcnt vmcnt(0) -; CHECK-NEXT:s_setpc_b64 s[30:31] +; GFX9-LABEL: test_remat_s_getpc_b64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT:buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT:s_mov_b64 exec, s[4:5] +; GFX9-NEXT:v_writelane_b32 v0, s30, 0 +; GFX9-NEXT:s_getpc_b64 s[4:5] +; GFX9-NEXT:v_writelane_b32 v0, s31, 1 +; GFX9-NEXT:;;#ASMSTART +; GFX9-NEXT:;;#ASMEND +; GFX9-NEXT:;;#ASMSTART +; GFX9-NEXT:;;#ASMEND +; GFX9-NEXT:s_getpc_b64 s[4:5] +; GFX9-NEXT:v_mov_b32_e32 v1, s4 +; GFX9-NEXT:v_mov_b32_e32 v2, s5 +; GFX9-NEXT:global_store_dwordx2 v[1:2], v[1:2], off +; GFX9-NEXT:v_readlane_b32 s31, v0, 1 +; GFX9-NEXT:v_readlane_b32 s30, v0, 0 +; GFX9-NEXT:s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT:buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT:s_mov_b64 exec, s[4:5] +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_remat_s_getpc_b64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT:s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT:scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GFX11-NEXT:s_mov_b32 exec_lo, s0 +; GFX11-NEXT:v_writelane_b32 v0, s30, 0 +; GFX11-NEXT:s_getpc_b64 s[0:1] +; GFX11-NEXT:;;#ASMSTART +; GFX11-NEXT:;;#ASMEND +; GFX11-NEXT:v_writelane_b32 v0, s31, 1 +; GFX11-NEXT:;;#ASMSTART +; GFX11-NEXT:;;#ASMEND +; GFX11-NEXT:s_getpc_b64 s[0:1] +; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT:v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT:v_readlane_b32 s31, v0, 1 +; GFX11-NEXT:v_readlane_b32 s30, v0, 0 +; GFX11-NEXT:global_store_b64 v[1:2], v[1:2], off +; GFX11-NEXT:s_xor_saveexec_b32 s0, -1 +; 
GFX11-NEXT:scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT:s_mov_b32 exec_lo, s0 +; GFX11-NEXT:s_waitcnt vmcnt(0) +; GFX11-NEXT:s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_remat_s_getpc_b64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT:s_xor_saveexec_b32 s0, -1 +; GFX12-NEXT:scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GFX12-NEXT:s_mov_b32 exec_lo, s0 +; GFX12-NEXT:v_writelane_b32 v0, s30, 0 +; GFX12-NEXT:s_getpc_b64 s[0:1] +; GFX12-NEXT:;;#ASMSTART +; GFX12-NEXT:;;#ASMEND +; GFX12-NEXT:v_writelane_b32 v0, s31, 1 +; GFX12-NEXT:;;#ASMSTART +; GFX12-
[clang] [AMDGPU] Add GFX12 __builtin_amdgcn_s_sleep_var (PR #77926)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/77926 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [llvm] [AMDGPU] Work around s_getpc_b64 zero extending on GFX12 (PR #78186)
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/78186 >From d3f4ebf849f6ef1ea373e5c7f93398db6681b2b6 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 15 Jan 2024 15:02:08 + Subject: [PATCH 1/4] Add GFX11/12 test coverage --- llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 103 +- 1 file changed, 77 insertions(+), 26 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll index 598d7a8033c2e54..2c1baeeeda21697 100644 --- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll @@ -1,32 +1,83 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s - +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12 define void @test_remat_s_getpc_b64() { -; CHECK-LABEL: test_remat_s_getpc_b64: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT:s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT:buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT:s_mov_b64 exec, s[4:5] -; CHECK-NEXT:v_writelane_b32 v0, s30, 0 -; CHECK-NEXT:s_getpc_b64 s[4:5] -; CHECK-NEXT:v_writelane_b32 v0, s31, 1 -; CHECK-NEXT:;;#ASMSTART -; CHECK-NEXT:;;#ASMEND -; CHECK-NEXT:;;#ASMSTART -; CHECK-NEXT:;;#ASMEND -; CHECK-NEXT:s_getpc_b64 s[4:5] -; CHECK-NEXT:v_mov_b32_e32 v1, s4 -; CHECK-NEXT:v_mov_b32_e32 v2, s5 -; CHECK-NEXT:global_store_dwordx2 v[1:2], v[1:2], off -; CHECK-NEXT:v_readlane_b32 s31, v0, 1 -; CHECK-NEXT:v_readlane_b32 s30, v0, 
0 -; CHECK-NEXT:s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT:buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT:s_mov_b64 exec, s[4:5] -; CHECK-NEXT:s_waitcnt vmcnt(0) -; CHECK-NEXT:s_setpc_b64 s[30:31] +; GFX9-LABEL: test_remat_s_getpc_b64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT:buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT:s_mov_b64 exec, s[4:5] +; GFX9-NEXT:v_writelane_b32 v0, s30, 0 +; GFX9-NEXT:s_getpc_b64 s[4:5] +; GFX9-NEXT:v_writelane_b32 v0, s31, 1 +; GFX9-NEXT:;;#ASMSTART +; GFX9-NEXT:;;#ASMEND +; GFX9-NEXT:;;#ASMSTART +; GFX9-NEXT:;;#ASMEND +; GFX9-NEXT:s_getpc_b64 s[4:5] +; GFX9-NEXT:v_mov_b32_e32 v1, s4 +; GFX9-NEXT:v_mov_b32_e32 v2, s5 +; GFX9-NEXT:global_store_dwordx2 v[1:2], v[1:2], off +; GFX9-NEXT:v_readlane_b32 s31, v0, 1 +; GFX9-NEXT:v_readlane_b32 s30, v0, 0 +; GFX9-NEXT:s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT:buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT:s_mov_b64 exec, s[4:5] +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_remat_s_getpc_b64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT:s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT:scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GFX11-NEXT:s_mov_b32 exec_lo, s0 +; GFX11-NEXT:v_writelane_b32 v0, s30, 0 +; GFX11-NEXT:s_getpc_b64 s[0:1] +; GFX11-NEXT:;;#ASMSTART +; GFX11-NEXT:;;#ASMEND +; GFX11-NEXT:v_writelane_b32 v0, s31, 1 +; GFX11-NEXT:;;#ASMSTART +; GFX11-NEXT:;;#ASMEND +; GFX11-NEXT:s_getpc_b64 s[0:1] +; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT:v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT:v_readlane_b32 s31, v0, 1 +; GFX11-NEXT:v_readlane_b32 s30, v0, 0 +; GFX11-NEXT:global_store_b64 v[1:2], v[1:2], off +; GFX11-NEXT:s_xor_saveexec_b32 s0, -1 +; 
GFX11-NEXT:scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT:s_mov_b32 exec_lo, s0 +; GFX11-NEXT:s_waitcnt vmcnt(0) +; GFX11-NEXT:s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_remat_s_getpc_b64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT:s_xor_saveexec_b32 s0, -1 +; GFX12-NEXT:scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GFX12-NEXT:s_mov_b32 exec_lo, s0 +; GFX12-NEXT:v_writelane_b32 v0, s30, 0 +; GFX12-NEXT:s_getpc_b64 s[0:1] +; GFX12-NEXT:;;#ASMSTART +; GFX12-NEXT:;;#ASMEND +; GFX12-NEXT:v_writelane_b32 v0, s31, 1 +; GFX12-NEXT:;;#ASMSTART +; GFX12-
[clang-tools-extra] [clang] [llvm] [AMDGPU] Work around s_getpc_b64 zero extending on GFX12 (PR #78186)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/78186 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [clang] [llvm] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (PR #77438)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/77438 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [clang] [llvm] [AMDGPU] Update uses of new VOP2 pseudos for GFX12 (PR #78155)
@@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s jayfoad wrote: Done as part of a merge from main to fix conflicts. https://github.com/llvm/llvm-project/pull/78155 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang-tools-extra] [AMDGPU] Update uses of new VOP2 pseudos for GFX12 (PR #78155)
https://github.com/jayfoad closed https://github.com/llvm/llvm-project/pull/78155 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Add GFX12 WMMA and SWMMAC instructions (PR #77795)
jayfoad wrote: Some of the tests in this patch need regenerating now that #77438 has been merged. https://github.com/llvm/llvm-project/pull/77795 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits