[llvm] [clang] [flang] [clang-tools-extra] [openmp] [mlir] [libcxx] [lldb] [libc] GlobalISel: Guide return in llvm::getIConstantSplatVal (PR #71989)

2023-11-12 Thread Jay Foad via cfe-commits

jayfoad wrote:

Typo in subject "**Guard** return ..."?

https://github.com/llvm/llvm-project/pull/71989
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [AMDGPU] Revert "Preliminary patch for divergence driven instruction selection. Operands Folding 1." (PR #71710)

2023-11-13 Thread Jay Foad via cfe-commits

https://github.com/jayfoad ready_for_review 
https://github.com/llvm/llvm-project/pull/71710
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [clang] [AMDGPU] Revert "Preliminary patch for divergence driven instruction selection. Operands Folding 1." (PR #71710)

2023-11-13 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/71710
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [lldb] [mlir] [libcxx] [openmp] [flang] [libcxxabi] [compiler-rt] [clang] [AMDGPU] Add GFX12 encoding for VINTERP instructions (PR #74616)

2023-12-06 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/74616

>From 69580e5f77514fecf0aabe2a80c98881f9bd7288 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Tue, 7 Feb 2023 16:27:27 +0000
Subject: [PATCH 1/2] [AMDGPU] Add GFX12 encoding for VINTERP instructions

---
 .../Disassembler/AMDGPUDisassembler.cpp   |   6 +-
 llvm/lib/Target/AMDGPU/VINTERPInstructions.td |  38 ++-
 llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s   | 187 ++---
 .../AMDGPU/gfx12_dasm_vinterp.txt | 251 ++
 4 files changed, 378 insertions(+), 104 deletions(-)
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vinterp.txt

diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp 
b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 3175f6358a045..c37af739e2019 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -782,9 +782,13 @@ DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst 
&MI) const {
 
 DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
   if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
+  MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 ||
   MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
+  MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx12 ||
   MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 ||
-  MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) {
+  MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx12 ||
+  MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11 ||
+  MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx12) {
 // The MCInst has this field that is not directly encoded in the
 // instruction.
 insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td 
b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index 7d03150bf5b11..fc563b7493adf 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -10,7 +10,7 @@
 // VINTERP encoding
 
//===--===//
 
-class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 {
+class VINTERPe <VOPProfile P> : Enc64 {
   bits<8> vdst;
   bits<4> src0_modifiers;
   bits<9> src0;
@@ -31,7 +31,6 @@ class VINTERPe_gfx11  op, VOPProfile P> : Enc64 {
   let Inst{13}= !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2)
   let Inst{14}= !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel(3)
   let Inst{15}= clamp;
-  let Inst{22-16} = op;
   let Inst{40-32} = src0;
   let Inst{49-41} = src1;
   let Inst{58-50} = src2;
@@ -40,6 +39,14 @@ class VINTERPe_gfx11  op, VOPProfile P> : Enc64 {
   let Inst{63}= src2_modifiers{0}; // neg(2)
 }
 
+class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : VINTERPe<P> {
+  let Inst{22-16} = op;
+}
+
+class VINTERPe_gfx12 <bits<7> op, VOPProfile P> : VINTERPe<P> {
+  let Inst{20-16} = op{4-0};
+}
+
 
//===--===//
 // VOP3 VINTERP
 
//===--===//
@@ -171,17 +178,28 @@ defm : VInterpF16Pat op> {
+multiclass VINTERP_Real_gfx11  op> {
+  let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
 def _gfx11 :
   VINTERP_Real(NAME), SIEncodingFamily.GFX11>,
   VINTERPe_gfx11(NAME).Pfl>;
   }
 }
 
-defm V_INTERP_P10_F32_inreg  : VINTERP_Real_gfx11<0x000>;
-defm V_INTERP_P2_F32_inreg  : VINTERP_Real_gfx11<0x001>;
-defm V_INTERP_P10_F16_F32_inreg  : VINTERP_Real_gfx11<0x002>;
-defm V_INTERP_P2_F16_F32_inreg  : VINTERP_Real_gfx11<0x003>;
-defm V_INTERP_P10_RTZ_F16_F32_inreg  : VINTERP_Real_gfx11<0x004>;
-defm V_INTERP_P2_RTZ_F16_F32_inreg  : VINTERP_Real_gfx11<0x005>;
+multiclass VINTERP_Real_gfx12  op> {
+  let AssemblerPredicate = isGFX12Only, DecoderNamespace = "GFX12" in {
+def _gfx12 :
+  VINTERP_Real(NAME), SIEncodingFamily.GFX12>,
+  VINTERPe_gfx12(NAME).Pfl>;
+  }
+}
+
+multiclass VINTERP_Real_gfx11_gfx12  op> :
+  VINTERP_Real_gfx11, VINTERP_Real_gfx12;
+
+defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11_gfx12<0x000>;
+defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11_gfx12<0x001>;
+defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x002>;
+defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x003>;
+defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x004>;
+defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x005>;
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s 
b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s
index e2e53776783f3..fdfbf65c0e3cf 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s
@@ -1,277 +1,278 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck 
-check-p

[flang] [clang] [libcxxabi] [lld] [lldb] [mlir] [llvm] [clang-tools-extra] [openmp] [compiler-rt] [libcxx] [AMDGPU] Add GFX12 encoding for VINTERP instructions (PR #74616)

2023-12-07 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/74616

>From 69580e5f77514fecf0aabe2a80c98881f9bd7288 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Tue, 7 Feb 2023 16:27:27 +0000
Subject: [PATCH 1/2] [AMDGPU] Add GFX12 encoding for VINTERP instructions

---
 .../Disassembler/AMDGPUDisassembler.cpp   |   6 +-
 llvm/lib/Target/AMDGPU/VINTERPInstructions.td |  38 ++-
 llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s   | 187 ++---
 .../AMDGPU/gfx12_dasm_vinterp.txt | 251 ++
 4 files changed, 378 insertions(+), 104 deletions(-)
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vinterp.txt

diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp 
b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 3175f6358a045..c37af739e2019 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -782,9 +782,13 @@ DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst 
&MI) const {
 
 DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
   if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
+  MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 ||
   MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
+  MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx12 ||
   MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 ||
-  MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) {
+  MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx12 ||
+  MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11 ||
+  MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx12) {
 // The MCInst has this field that is not directly encoded in the
 // instruction.
 insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td 
b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index 7d03150bf5b11..fc563b7493adf 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -10,7 +10,7 @@
 // VINTERP encoding
 
//===--===//
 
-class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 {
+class VINTERPe <VOPProfile P> : Enc64 {
   bits<8> vdst;
   bits<4> src0_modifiers;
   bits<9> src0;
@@ -31,7 +31,6 @@ class VINTERPe_gfx11  op, VOPProfile P> : Enc64 {
   let Inst{13}= !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2)
   let Inst{14}= !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel(3)
   let Inst{15}= clamp;
-  let Inst{22-16} = op;
   let Inst{40-32} = src0;
   let Inst{49-41} = src1;
   let Inst{58-50} = src2;
@@ -40,6 +39,14 @@ class VINTERPe_gfx11  op, VOPProfile P> : Enc64 {
   let Inst{63}= src2_modifiers{0}; // neg(2)
 }
 
+class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : VINTERPe<P> {
+  let Inst{22-16} = op;
+}
+
+class VINTERPe_gfx12 <bits<7> op, VOPProfile P> : VINTERPe<P> {
+  let Inst{20-16} = op{4-0};
+}
+
 
//===--===//
 // VOP3 VINTERP
 
//===--===//
@@ -171,17 +178,28 @@ defm : VInterpF16Pat op> {
+multiclass VINTERP_Real_gfx11  op> {
+  let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
 def _gfx11 :
   VINTERP_Real(NAME), SIEncodingFamily.GFX11>,
   VINTERPe_gfx11(NAME).Pfl>;
   }
 }
 
-defm V_INTERP_P10_F32_inreg  : VINTERP_Real_gfx11<0x000>;
-defm V_INTERP_P2_F32_inreg  : VINTERP_Real_gfx11<0x001>;
-defm V_INTERP_P10_F16_F32_inreg  : VINTERP_Real_gfx11<0x002>;
-defm V_INTERP_P2_F16_F32_inreg  : VINTERP_Real_gfx11<0x003>;
-defm V_INTERP_P10_RTZ_F16_F32_inreg  : VINTERP_Real_gfx11<0x004>;
-defm V_INTERP_P2_RTZ_F16_F32_inreg  : VINTERP_Real_gfx11<0x005>;
+multiclass VINTERP_Real_gfx12  op> {
+  let AssemblerPredicate = isGFX12Only, DecoderNamespace = "GFX12" in {
+def _gfx12 :
+  VINTERP_Real(NAME), SIEncodingFamily.GFX12>,
+  VINTERPe_gfx12(NAME).Pfl>;
+  }
+}
+
+multiclass VINTERP_Real_gfx11_gfx12  op> :
+  VINTERP_Real_gfx11, VINTERP_Real_gfx12;
+
+defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11_gfx12<0x000>;
+defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11_gfx12<0x001>;
+defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x002>;
+defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x003>;
+defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x004>;
+defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x005>;
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s 
b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s
index e2e53776783f3..fdfbf65c0e3cf 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s
@@ -1,277 +1,278 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck 
-check-p

[lld] [mlir] [clang-tools-extra] [libcxxabi] [lldb] [flang] [compiler-rt] [openmp] [libcxx] [clang] [llvm] [AMDGPU] Add GFX12 encoding for VINTERP instructions (PR #74616)

2023-12-07 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/74616
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] fb2b907 - [AArch64][SME2] Add REQUIRES to new test

2023-12-07 Thread Jay Foad via cfe-commits

Author: Jay Foad
Date: 2023-12-07T17:42:37Z
New Revision: fb2b907fbd2c9ac25077dae01d777d884e09a7a4

URL: 
https://github.com/llvm/llvm-project/commit/fb2b907fbd2c9ac25077dae01d777d884e09a7a4
DIFF: 
https://github.com/llvm/llvm-project/commit/fb2b907fbd2c9ac25077dae01d777d884e09a7a4.diff

LOG: [AArch64][SME2] Add REQUIRES to new test

Added: 


Modified: 
clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c

Removed: 




diff  --git 
a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c 
b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c
index 066d37772ebc2..50cac48887894 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c
@@ -1,4 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | 
opt -S  -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x 
c++ %s | opt -S  -passes=mem2reg,instcombine,tailcallelim | FileCheck %s 
-check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall 
-emit-llvm -o - %s | opt -S  -passes=mem2reg,instcombine,tailcallelim | 
FileCheck %s



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64][SME2] Add _x2/_x4 svqrshr builtins. (PR #74100)

2023-12-07 Thread Jay Foad via cfe-commits

jayfoad wrote:

I committed a fix for builds that do not enable AArch64: 
fb2b907fbd2c9ac25077dae01d777d884e09a7a4

https://github.com/llvm/llvm-project/pull/74100
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [libcxxabi] [clang-tools-extra] [lldb] [clang] [lld] [compiler-rt] [flang] [libcxx] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-07 Thread Jay Foad via cfe-commits


@@ -959,6 +967,32 @@ def : GCNPat <
 }
 } // let OtherPredicates = [HasShaderCyclesRegister]
 
+def SIMM24bitPtr : ImmLeaf (Imm);}]
+>;
+
+multiclass SMPrefetchPat {
+  def : GCNPat <
+(smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 
cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, $offset, (i32 
SGPR_NULL), (i8 0))
+  >;
+
+  def : GCNPat <
+(smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), 
(i8 0))
+  >;
+
+  def : GCNPat <
+(prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type#"_PC_REL") (as_i32timm 
$offset), (i32 SGPR_NULL), (i8 0))
+  > {
+let AddedComplexity = 10;
+  }

jayfoad wrote:

But that is how `llvm.prefetch` is defined: "`address` is the address to be 
prefetched". A different operation should use a different intrinsic.

https://github.com/llvm/llvm-project/pull/74576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[compiler-rt] [flang] [lldb] [lld] [clang] [llvm] [libcxxabi] [libcxx] [clang-tools-extra] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-08 Thread Jay Foad via cfe-commits


@@ -959,6 +967,32 @@ def : GCNPat <
 }
 } // let OtherPredicates = [HasShaderCyclesRegister]
 
+def SIMM24bitPtr : ImmLeaf (Imm);}]
+>;
+
+multiclass SMPrefetchPat {
+  def : GCNPat <
+(smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 
cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, $offset, (i32 
SGPR_NULL), (i8 0))
+  >;
+
+  def : GCNPat <
+(smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), 
(i8 0))
+  >;
+
+  def : GCNPat <
+(prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
+(!cast("S_PREFETCH_"#type#"_PC_REL") (as_i32timm 
$offset), (i32 SGPR_NULL), (i8 0))
+  > {
+let AddedComplexity = 10;
+  }

jayfoad wrote:

I really don't know. What would the use cases look like? Maybe it could be a 
generic intrinsic, if there is consensus that it is useful.

For the existing llvm.prefetch intrinsic, the only useful case I can think of for 
instruction prefetching is:
```
define @f0() {
  call @llvm.prefetch(@f1, ...) 
  ...
  call @f1()
}
define @f1() { ... }
```
to prefetch the code at the start of a function you are going to call. We could 
codegen that case using the _pc_rel form of the instruction.

https://github.com/llvm/llvm-project/pull/74576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[lld] [clang] [compiler-rt] [lldb] [libcxx] [flang] [libc] [clang-tools-extra] [llvm] [GlobalISel] Add G_PREFETCH (PR #74863)

2023-12-11 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/74863

>From e406c734609d3cd1ae436084c42c1c63d8af2795 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Fri, 8 Dec 2023 14:08:09 +0000
Subject: [PATCH 1/2] [GlobalISel] Add G_PREFETCH

---
 .../CodeGen/GlobalISel/MachineIRBuilder.h |  4 ++
 llvm/include/llvm/Support/TargetOpcodes.def   |  3 +
 llvm/include/llvm/Target/GenericOpcodes.td|  9 +++
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  | 12 
 .../CodeGen/GlobalISel/MachineIRBuilder.cpp   | 10 +++
 llvm/lib/CodeGen/MachineVerifier.cpp  | 23 +++
 llvm/lib/IR/Verifier.cpp  |  2 +-
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  |  4 +-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp| 55 
 .../AArch64/GISel/AArch64LegalizerInfo.h  |  1 +
 .../GlobalISel/legalizer-info-validation.mir  |  3 +
 llvm/test/MachineVerifier/test_g_prefetch.mir | 40 
 .../builtins/match-table-replacerreg.td   | 20 +++---
 .../match-table-imms.td   | 28 -
 .../match-table-patfrag-root.td   |  2 +-
 .../GlobalISelCombinerEmitter/match-table.td  | 62 +--
 llvm/test/TableGen/GlobalISelEmitter.td   |  2 +-
 17 files changed, 195 insertions(+), 85 deletions(-)
 create mode 100644 llvm/test/MachineVerifier/test_g_prefetch.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h 
b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 3d36d06a7e9da..eb846acde3e04 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1529,6 +1529,10 @@ class MachineIRBuilder {
   /// Build and insert `G_FENCE Ordering, Scope`.
   MachineInstrBuilder buildFence(unsigned Ordering, unsigned Scope);
 
+  /// Build and insert G_PREFETCH \p Addr, \p RW, \p Locality, \p CacheType
+  MachineInstrBuilder buildPrefetch(const SrcOp &Addr, unsigned RW,
+unsigned Locality, unsigned CacheType);
+
   /// Build and insert \p Dst = G_FREEZE \p Src
   MachineInstrBuilder buildFreeze(const DstOp &Dst, const SrcOp &Src) {
 return buildInstr(TargetOpcode::G_FREEZE, {Dst}, {Src});
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def 
b/llvm/include/llvm/Support/TargetOpcodes.def
index 941c6d5f8cad8..91d9eb745a48f 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -415,6 +415,9 @@ HANDLE_TARGET_OPCODE_MARKER(GENERIC_ATOMICRMW_OP_END, 
G_ATOMICRMW_UDEC_WRAP)
 // Generic atomic fence
 HANDLE_TARGET_OPCODE(G_FENCE)
 
+/// Generic prefetch
+HANDLE_TARGET_OPCODE(G_PREFETCH)
+
 /// Generic conditional branch instruction.
 HANDLE_TARGET_OPCODE(G_BRCOND)
 
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td 
b/llvm/include/llvm/Target/GenericOpcodes.td
index 9a9c09d3c20d6..73e38b15bf671 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -1209,6 +1209,15 @@ def G_FENCE : GenericInstruction {
   let hasSideEffects = true;
 }
 
+// Generic opcode equivalent to the llvm.prefetch intrinsic.
+def G_PREFETCH : GenericInstruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins ptype0:$address, i32imm:$rw, i32imm:$locality, 
i32imm:$cachetype);
+  let hasSideEffects = true;
+  let mayLoad = true;
+  let mayStore = true;
+}
+
 
//--
 // Variadic ops
 
//--
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp 
b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 14a4e72152e7c..b2850846bde67 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2435,6 +2435,18 @@ bool IRTranslator::translateKnownIntrinsic(const 
CallInst &CI, Intrinsic::ID ID,
 MIRBuilder.buildInstr(TargetOpcode::G_RESET_FPMODE, {}, {});
 return true;
   }
+  case Intrinsic::prefetch: {
+Value *Addr = CI.getOperand(0);
+ConstantInt *RW = cast<ConstantInt>(CI.getOperand(1));
+ConstantInt *Locality = cast<ConstantInt>(CI.getOperand(2));
+ConstantInt *CacheType = cast<ConstantInt>(CI.getOperand(3));
+
+MIRBuilder.buildPrefetch(getOrCreateVReg(*Addr), RW->getZExtValue(),
+ Locality->getZExtValue(),
+ CacheType->getZExtValue());
+
+return true;
+  }
 #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)  \
   case Intrinsic::INTRINSIC:
 #include "llvm/IR/ConstrainedOps.def"
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp 
b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 80e9c08e850b6..f7febc9357c11 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -1051,6 +1051,16 @@ MachineIRBuilder::buildFence(unsigned Ordering, unsigned 
Scope) {
 .addImm(

[llvm] [flang] [clang] [lld] [clang-tools-extra] [libcxx] [lldb] [libc] [compiler-rt] [GlobalISel] Add G_PREFETCH (PR #74863)

2023-12-11 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/74863
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jay Foad via cfe-commits


@@ -1708,6 +1710,13 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto builder =

jayfoad wrote:

Upper case B for Builder.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jay Foad via cfe-commits


@@ -388,6 +388,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // message.
   DenseSet ReleaseVGPRInsts;
 
+  // bool insertWaitcntAfterMemOp(MachineFunction &MF);

jayfoad wrote:

Remove all the unused code, don't just comment it out.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jay Foad via cfe-commits


@@ -1708,6 +1710,13 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto builder =
+  BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  .addImm(0);

jayfoad wrote:

On GFX10+ VMEM stores should have S_WAITCNT_VSCNT 0 as well as (or instead of) 
this.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jay Foad via cfe-commits


@@ -0,0 +1,222 @@
+; Testing the -amdgpu-precise-memory-op option
+; COM: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+amdgpu-precise-memory-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7

jayfoad wrote:

What is COM: ?

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Improve selection of ballot.i64 intrinsic in wave32 mode. (PR #71556)

2023-11-21 Thread Jay Foad via cfe-commits


@@ -2314,9 +2314,8 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
 SDValue VCMP = Cond->getOperand(0);
 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
 auto *CRHS = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
-if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero() &&
-// TODO: make condition below an assert after fixing ballot bitwidth.
-VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
+if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) {
+  assert(VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize());

jayfoad wrote:

You are asserting that instcombine has been run? That seems wrong. What about 
-O0 compiles?

https://github.com/llvm/llvm-project/pull/71556
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-22 Thread Jay Foad via cfe-commits


@@ -1847,6 +1862,7 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
 
   TrackedWaitcntSet.clear();
   BlockInfos.clear();
+

jayfoad wrote:

Remove this

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-22 Thread Jay Foad via cfe-commits


@@ -1708,6 +1710,19 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto Builder =
+  BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  .addImm(0);
+  if (IsGFX10Plus) {

jayfoad wrote:

I guess this works but it seems a bit wasteful to insert S_WAITCNT after stores 
and S_WAITCNT_VSCNT after loads. Does anyone care?

Stepping back a bit, I think you can probably implement this by calling 
generateWaitcnt instead of building the instructions yourself.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-22 Thread Jay Foad via cfe-commits


@@ -1708,6 +1710,19 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto Builder =
+  BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  .addImm(0);
+  if (IsGFX10Plus) {
+Builder =
+BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+.addImm(0);
+  }
+  OldWaitcntInstr = Builder.getInstr();

jayfoad wrote:

Nit: if you're going to set OldWaitcntInstr then really it ought to point to 
the first in a sequence of waitcnts, not the last.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Define new targets gfx1200 and gfx1201 (PR #73133)

2023-11-22 Thread Jay Foad via cfe-commits

https://github.com/jayfoad created 
https://github.com/llvm/llvm-project/pull/73133

Define target names and ELF numbers for new GFX12 targets gfx1200 and
gfx1201. For now they behave identically to GFX11.


>From 1011b8e7da174146dfb4c9a4bf54468ea021 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Tue, 21 Nov 2023 15:46:04 +0000
Subject: [PATCH] [AMDGPU] Define new targets gfx1200 and gfx1201

Define target names and ELF numbers for new GFX12 targets gfx1200 and
gfx1201. For now they behave identically to GFX11.
---
 clang/include/clang/Basic/Cuda.h  |  2 +
 clang/lib/Basic/Cuda.cpp  |  2 +
 clang/lib/Basic/Targets/NVPTX.cpp |  2 +
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp  |  2 +
 clang/test/CodeGenOpenCL/amdgpu-features.cl   |  4 +
 clang/test/Driver/amdgpu-macros.cl|  2 +
 clang/test/Driver/amdgpu-mcpu.cl  |  4 +
 clang/test/Misc/target-invalid-cpu-note.c |  4 +-
 llvm/docs/AMDGPUUsage.rst | 18 -
 llvm/include/llvm/BinaryFormat/ELF.h  |  6 +-
 llvm/include/llvm/TargetParser/TargetParser.h |  5 +-
 llvm/lib/Object/ELFObjectFile.cpp |  6 ++
 llvm/lib/ObjectYAML/ELFYAML.cpp   |  2 +
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 75 ++-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h  |  3 +-
 llvm/lib/Target/AMDGPU/GCNProcessors.td   | 12 +++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  1 +
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp |  4 +
 llvm/lib/Target/AMDGPU/SIDefines.h|  1 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  8 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |  4 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 26 ++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  6 ++
 llvm/lib/TargetParser/TargetParser.cpp| 27 +++
 .../CodeGen/AMDGPU/directive-amdgcn-target.ll |  4 +
 .../CodeGen/AMDGPU/elf-header-flags-mach.ll   |  4 +
 .../Object/AMDGPU/elf-header-flags-mach.yaml  | 14 
 .../llvm-objdump/ELF/AMDGPU/subtarget.ll  | 12 +++
 .../llvm-readobj/ELF/amdgpu-elf-headers.test  | 18 +
 llvm/tools/llvm-readobj/ELFDumper.cpp |  4 +
 30 files changed, 272 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 878f8d70f90c0a9..2d912bdbbd1bc59 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -113,6 +113,8 @@ enum class CudaArch {
   GFX1103,
   GFX1150,
   GFX1151,
+  GFX1200,
+  GFX1201,
   Generic, // A processor model named 'generic' if the target backend defines a
// public one.
   LAST,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 2307352bd3becef..65840b9f20252b6 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -135,6 +135,8 @@ static const CudaArchToStringMap arch_names[] = {
 GFX(1103), // gfx1103
 GFX(1150), // gfx1150
 GFX(1151), // gfx1151
+GFX(1200), // gfx1200
+GFX(1201), // gfx1201
 {CudaArch::Generic, "generic", ""},
 // clang-format on
 };
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index a9fc88295700b89..3a4a75b0348f209 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -214,6 +214,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   case CudaArch::GFX1103:
   case CudaArch::GFX1150:
   case CudaArch::GFX1151:
+  case CudaArch::GFX1200:
+  case CudaArch::GFX1201:
   case CudaArch::Generic:
   case CudaArch::LAST:
 break;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 2f7dd83bd2d65c9..9b8fbbdf8046787 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -3540,6 +3540,8 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
   case CudaArch::GFX1103:
   case CudaArch::GFX1150:
   case CudaArch::GFX1151:
+  case CudaArch::GFX1200:
+  case CudaArch::GFX1201:
   case CudaArch::Generic:
   case CudaArch::UNUSED:
   case CudaArch::UNKNOWN:
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 03c20ae46faaa46..8959634572b44e9 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -49,6 +49,8 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1103 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1150 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1150 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1151 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1151 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1200 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -S -

[clang] [llvm] [AMDGPU] Define new targets gfx1200 and gfx1201 (PR #73133)

2023-11-22 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/73133

>From 1011b8e7da174146dfb4c9a4bf54468ea021 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Tue, 21 Nov 2023 15:46:04 +
Subject: [PATCH 1/2] [AMDGPU] Define new targets gfx1200 and gfx1201

Define target names and ELF numbers for new GFX12 targets gfx1200 and
gfx1201. For now they behave identically to GFX11.
---
 clang/include/clang/Basic/Cuda.h  |  2 +
 clang/lib/Basic/Cuda.cpp  |  2 +
 clang/lib/Basic/Targets/NVPTX.cpp |  2 +
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp  |  2 +
 clang/test/CodeGenOpenCL/amdgpu-features.cl   |  4 +
 clang/test/Driver/amdgpu-macros.cl|  2 +
 clang/test/Driver/amdgpu-mcpu.cl  |  4 +
 clang/test/Misc/target-invalid-cpu-note.c |  4 +-
 llvm/docs/AMDGPUUsage.rst | 18 -
 llvm/include/llvm/BinaryFormat/ELF.h  |  6 +-
 llvm/include/llvm/TargetParser/TargetParser.h |  5 +-
 llvm/lib/Object/ELFObjectFile.cpp |  6 ++
 llvm/lib/ObjectYAML/ELFYAML.cpp   |  2 +
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 75 ++-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h  |  3 +-
 llvm/lib/Target/AMDGPU/GCNProcessors.td   | 12 +++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  1 +
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp |  4 +
 llvm/lib/Target/AMDGPU/SIDefines.h|  1 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  8 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |  4 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 26 ++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  6 ++
 llvm/lib/TargetParser/TargetParser.cpp| 27 +++
 .../CodeGen/AMDGPU/directive-amdgcn-target.ll |  4 +
 .../CodeGen/AMDGPU/elf-header-flags-mach.ll   |  4 +
 .../Object/AMDGPU/elf-header-flags-mach.yaml  | 14 
 .../llvm-objdump/ELF/AMDGPU/subtarget.ll  | 12 +++
 .../llvm-readobj/ELF/amdgpu-elf-headers.test  | 18 +
 llvm/tools/llvm-readobj/ELFDumper.cpp |  4 +
 30 files changed, 272 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 878f8d70f90c0a9..2d912bdbbd1bc59 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -113,6 +113,8 @@ enum class CudaArch {
   GFX1103,
   GFX1150,
   GFX1151,
+  GFX1200,
+  GFX1201,
   Generic, // A processor model named 'generic' if the target backend defines a
// public one.
   LAST,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 2307352bd3becef..65840b9f20252b6 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -135,6 +135,8 @@ static const CudaArchToStringMap arch_names[] = {
 GFX(1103), // gfx1103
 GFX(1150), // gfx1150
 GFX(1151), // gfx1151
+GFX(1200), // gfx1200
+GFX(1201), // gfx1201
 {CudaArch::Generic, "generic", ""},
 // clang-format on
 };
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index a9fc88295700b89..3a4a75b0348f209 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -214,6 +214,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   case CudaArch::GFX1103:
   case CudaArch::GFX1150:
   case CudaArch::GFX1151:
+  case CudaArch::GFX1200:
+  case CudaArch::GFX1201:
   case CudaArch::Generic:
   case CudaArch::LAST:
 break;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 2f7dd83bd2d65c9..9b8fbbdf8046787 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -3540,6 +3540,8 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
   case CudaArch::GFX1103:
   case CudaArch::GFX1150:
   case CudaArch::GFX1151:
+  case CudaArch::GFX1200:
+  case CudaArch::GFX1201:
   case CudaArch::Generic:
   case CudaArch::UNUSED:
   case CudaArch::UNKNOWN:
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 03c20ae46faaa46..8959634572b44e9 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -49,6 +49,8 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1103 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1150 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1150 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1151 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1151 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1200 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1201 %s
 
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -targ

[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-22 Thread Jay Foad via cfe-commits


@@ -1708,6 +1710,19 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto Builder =
+  BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  .addImm(0);
+  if (IsGFX10Plus) {

jayfoad wrote:

Yes but why? On GFX10+, why would you put s_waitcnt(0) after a store or 
s_waitcnt_vscnt(0) after a load?

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [flang] [compiler-rt] [libcxx] [mlir] [clang] [libc] [AMDGPU] Define new targets gfx1200 and gfx1201 (PR #73133)

2023-11-23 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/73133

>From 1011b8e7da174146dfb4c9a4bf54468ea021 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Tue, 21 Nov 2023 15:46:04 +
Subject: [PATCH 1/2] [AMDGPU] Define new targets gfx1200 and gfx1201

Define target names and ELF numbers for new GFX12 targets gfx1200 and
gfx1201. For now they behave identically to GFX11.
---
 clang/include/clang/Basic/Cuda.h  |  2 +
 clang/lib/Basic/Cuda.cpp  |  2 +
 clang/lib/Basic/Targets/NVPTX.cpp |  2 +
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp  |  2 +
 clang/test/CodeGenOpenCL/amdgpu-features.cl   |  4 +
 clang/test/Driver/amdgpu-macros.cl|  2 +
 clang/test/Driver/amdgpu-mcpu.cl  |  4 +
 clang/test/Misc/target-invalid-cpu-note.c |  4 +-
 llvm/docs/AMDGPUUsage.rst | 18 -
 llvm/include/llvm/BinaryFormat/ELF.h  |  6 +-
 llvm/include/llvm/TargetParser/TargetParser.h |  5 +-
 llvm/lib/Object/ELFObjectFile.cpp |  6 ++
 llvm/lib/ObjectYAML/ELFYAML.cpp   |  2 +
 llvm/lib/Target/AMDGPU/AMDGPU.td  | 75 ++-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h  |  3 +-
 llvm/lib/Target/AMDGPU/GCNProcessors.td   | 12 +++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h |  1 +
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp |  4 +
 llvm/lib/Target/AMDGPU/SIDefines.h|  1 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp|  8 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |  4 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 26 ++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  6 ++
 llvm/lib/TargetParser/TargetParser.cpp| 27 +++
 .../CodeGen/AMDGPU/directive-amdgcn-target.ll |  4 +
 .../CodeGen/AMDGPU/elf-header-flags-mach.ll   |  4 +
 .../Object/AMDGPU/elf-header-flags-mach.yaml  | 14 
 .../llvm-objdump/ELF/AMDGPU/subtarget.ll  | 12 +++
 .../llvm-readobj/ELF/amdgpu-elf-headers.test  | 18 +
 llvm/tools/llvm-readobj/ELFDumper.cpp |  4 +
 30 files changed, 272 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 878f8d70f90c0a9..2d912bdbbd1bc59 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -113,6 +113,8 @@ enum class CudaArch {
   GFX1103,
   GFX1150,
   GFX1151,
+  GFX1200,
+  GFX1201,
   Generic, // A processor model named 'generic' if the target backend defines a
// public one.
   LAST,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 2307352bd3becef..65840b9f20252b6 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -135,6 +135,8 @@ static const CudaArchToStringMap arch_names[] = {
 GFX(1103), // gfx1103
 GFX(1150), // gfx1150
 GFX(1151), // gfx1151
+GFX(1200), // gfx1200
+GFX(1201), // gfx1201
 {CudaArch::Generic, "generic", ""},
 // clang-format on
 };
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index a9fc88295700b89..3a4a75b0348f209 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -214,6 +214,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   case CudaArch::GFX1103:
   case CudaArch::GFX1150:
   case CudaArch::GFX1151:
+  case CudaArch::GFX1200:
+  case CudaArch::GFX1201:
   case CudaArch::Generic:
   case CudaArch::LAST:
 break;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 2f7dd83bd2d65c9..9b8fbbdf8046787 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -3540,6 +3540,8 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
   case CudaArch::GFX1103:
   case CudaArch::GFX1150:
   case CudaArch::GFX1151:
+  case CudaArch::GFX1200:
+  case CudaArch::GFX1201:
   case CudaArch::Generic:
   case CudaArch::UNUSED:
   case CudaArch::UNKNOWN:
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 03c20ae46faaa46..8959634572b44e9 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -49,6 +49,8 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1103 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1150 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1150 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1151 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1151 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1200 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -S -emit-llvm -o - %s | 
FileCheck --check-prefix=GFX1201 %s
 
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -targ

[compiler-rt] [flang] [libc] [libcxx] [llvm] [lldb] [clang-tools-extra] [clang] [AMDGPU] Fix nondeterminism in SIFixSGPRCopies (PR #70644)

2023-10-31 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/70644

>From bfc7b2041f5a05105808b0b1ee0427d9c9eb9f4b Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Mon, 30 Oct 2023 15:23:48 +
Subject: [PATCH 1/4] Precommit test

---
 .../AMDGPU/fix-sgpr-copies-nondeterminism.ll  | 52 +++
 1 file changed, 52 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll

diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll 
b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
new file mode 100644
index 000..8b7e691dbddeae5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
+
+define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) {
+; CHECK-LABEL: f:
+; CHECK:   ; %bb.0: ; %bb
+; CHECK-NEXT:s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:s_mov_b32 s0, 0
+; CHECK-NEXT:s_cbranch_scc1 .LBB0_2
+; CHECK-NEXT:  ; %bb.1: ; %bb3
+; CHECK-NEXT:v_mov_b32_e32 v4, v1
+; CHECK-NEXT:s_branch .LBB0_3
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:v_mov_b32_e32 v0, 1
+; CHECK-NEXT:v_mov_b32_e32 v4, 0
+; CHECK-NEXT:  .LBB0_3: ; %bb4
+; CHECK-NEXT:v_mov_b32_e32 v1, 0
+; CHECK-NEXT:s_mov_b32 s1, s0
+; CHECK-NEXT:s_mov_b32 s2, s0
+; CHECK-NEXT:s_mov_b32 s3, s0
+; CHECK-NEXT:s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:v_mov_b32_e32 v2, v1
+; CHECK-NEXT:v_mov_b32_e32 v3, v1
+; CHECK-NEXT:v_mov_b32_e32 v5, v1
+; CHECK-NEXT:v_mov_b32_e32 v6, v1
+; CHECK-NEXT:v_mov_b32_e32 v7, v1
+; CHECK-NEXT:s_clause 0x1
+; CHECK-NEXT:buffer_store_b128 v[0:3], v1, s[0:3], 0 idxen
+; CHECK-NEXT:buffer_store_b128 v[4:7], v1, s[0:3], 0 idxen
+; CHECK-NEXT:s_nop 0
+; CHECK-NEXT:s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; CHECK-NEXT:s_endpgm
+bb:
+  %i = icmp eq i32 %arg, 0
+  br i1 %i, label %bb4, label %bb3
+
+bb3:
+  br label %bb4
+
+bb4:
+  %i5 = phi i32 [ %arg1, %bb3 ], [ 1, %bb ]
+  %i6 = phi i32 [ %arg2, %bb3 ], [ 0, %bb ]
+  %i7 = insertelement <4 x i32> zeroinitializer, i32 %i5, i64 0
+  %i8 = bitcast <4 x i32> %i7 to <4 x float>
+  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %i8, <4 x i32> 
zeroinitializer, i32 0, i32 0, i32 0, i32 0)
+  %i9 = insertelement <4 x i32> zeroinitializer, i32 %i6, i64 0
+  %i10 = bitcast <4 x i32> %i9 to <4 x float>
+  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %i10, <4 x i32> 
zeroinitializer, i32 0, i32 0, i32 0, i32 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, 
i32, i32, i32, i32 immarg)

>From aa050e8d720150b97d7af18d97d1d7f5d010bedc Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Mon, 30 Oct 2023 10:40:22 +
Subject: [PATCH 2/4] [AMDGPU] Fix nondeterminism in SIFixSGPRCopies

There are a couple of loops that iterate over V2SCopies. The iteration
order needs to be deterministic, otherwise we can call moveToVALU in
different orders, which causes temporary vregs to be allocated in
different orders, which can affect register allocation heuristics.
---
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp|  8 +++
 .../AMDGPU/fix-sgpr-copies-nondeterminism.ll  | 22 +--
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp 
b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index b32ed9fef5dd34e..3e6ed2d793ae563 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -125,7 +125,7 @@ class SIFixSGPRCopies : public MachineFunctionPass {
   SmallVector PHINodes;
   SmallVector S2VCopies;
   unsigned NextVGPRToSGPRCopyID;
-  DenseMap<unsigned, V2SCopyInfo> V2SCopies;
+  MapVector<unsigned, V2SCopyInfo> V2SCopies;
   DenseMap> SiblingPenalty;
 
 public:
@@ -988,7 +988,7 @@ bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo 
*Info) {
   for (auto J : Info->Siblings) {
 auto InfoIt = V2SCopies.find(J);
 if (InfoIt != V2SCopies.end()) {
-  MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
+  MachineInstr *SiblingCopy = InfoIt->second.Copy;
   if (SiblingCopy->isImplicitDef())
 // the COPY has already been MoveToVALUed
 continue;
@@ -1023,12 +1023,12 @@ void 
SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
 unsigned CurID = LoweringWorklist.pop_back_val();
 auto CurInfoIt = V2SCopies.find(CurID);
 if (CurInfoIt != V2SCopies.end()) {
-  V2SCopyInfo C = CurInfoIt->getSecond();
+  V2SCopyInfo C = CurInfoIt->second;
   LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
   for (auto S : C.Siblings) {
 auto SibInfoIt = V2SCopies.find(S);
 if (SibInfoIt != V2SCopies.end()) {
-  V2SCopyInfo &SI = SibInfoIt->getSecond();
+  V2SCopyInfo &SI = SibInfoIt->second;
   L

[llvm] [libc] [libcxx] [lldb] [flang] [compiler-rt] [clang-tools-extra] [clang] [AMDGPU] Fix nondeterminism in SIFixSGPRCopies (PR #70644)

2023-10-31 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/70644
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [llvm] [AMDGPU] GCNRegPressure printing pass for testing. (PR #70031)

2023-11-01 Thread Jay Foad via cfe-commits

https://github.com/jayfoad approved this pull request.

> Should we move on and submit this patch?

Yes!

> @jayfoad do you have concerns about live-through register set computation or 
> others?

I personally have no interest in the live-through part. You could remove it 
from this patch, but I don't mind if others want to keep it.

> I believe fixing trackers should go to another PR.

Agreed.

https://github.com/llvm/llvm-project/pull/70031
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [compiler-rt] [flang] [llvm] [AMDGPU] Fold operand after shrinking instruction in SIFoldOperands (PR #68426)

2023-11-02 Thread Jay Foad via cfe-commits


@@ -290,37 +291,40 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) 
const {
 
 if (Fold.Commuted)
   TII->commuteInstruction(*Inst32, false);
-return true;
-  }
 
-  assert(!Fold.needsShrink() && "not handled");
+Fold.UseMI = Inst32;
+Fold.UseOpNo = AMDGPU::getNamedOperandIdx(Fold.UseMI->getOpcode(),
+  AMDGPU::OpName::src0);

jayfoad wrote:

Adding the assert showed up some problems to do with knowing whether or not the 
instruction has been commuted. I need to spend some more time on that.

https://github.com/llvm/llvm-project/pull/68426
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [compiler-rt] [flang] [llvm] [AMDGPU] Fold operand after shrinking instruction in SIFoldOperands (PR #68426)

2023-11-02 Thread Jay Foad via cfe-commits

https://github.com/jayfoad converted_to_draft 
https://github.com/llvm/llvm-project/pull/68426
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [clang] [AMDGPU] New ttracedata intrinsics (PR #70235)

2023-11-02 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/70235

>From e02640686a8cf0a42cec01da4f32b6888f5de11f Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Wed, 25 Oct 2023 17:14:40 +0100
Subject: [PATCH 1/2] [AMDGPU] New ttracedata intrinsics

Add llvm.amdgcn.s.ttracedata and llvm.amdgcn.s.ttracedata.imm which map
directly to the corresponding instructions s_ttracedata and
s_ttracedata_imm. These are inherently whole-wave operations so any
non-uniform inputs are readfirstlaned.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  7 +++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  | 10 
 llvm/lib/Target/AMDGPU/SOPInstructions.td |  9 +++-
 .../AMDGPU/llvm.amdgcn.s.ttracedata.ll| 53 +++
 4 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 5f1d1d932f74cbd..a3acfccd00f8e16 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1697,6 +1697,13 @@ def int_amdgcn_s_setprio :
   DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
 IntrHasSideEffects]>;
 
+def int_amdgcn_s_ttracedata :
+  DefaultAttrsIntrinsic<[], [llvm_i32_ty],
+[IntrNoMem, IntrHasSideEffects]>;
+def int_amdgcn_s_ttracedata_imm :
+  DefaultAttrsIntrinsic<[], [llvm_i16_ty],
+[IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+
 // This is IntrHasSideEffects so it can be used to read cycle counters.
 def int_amdgcn_s_getreg :
   ClangBuiltin<"__builtin_amdgcn_s_getreg">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5b056bd9e5dba2c..f117f732cb84ffb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3064,6 +3064,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
   constrainOpWithReadfirstlane(B, MI, 2);
   return;
 }
+case Intrinsic::amdgcn_s_ttracedata:
+  constrainOpWithReadfirstlane(B, MI, 1); // M0
+  return;
 case Intrinsic::amdgcn_raw_buffer_load_lds:
 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
   applyDefaultMapping(OpdMapper);
@@ -4653,6 +4656,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const 
MachineInstr &MI) const {
   OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
   break;
 }
+case Intrinsic::amdgcn_s_ttracedata: {
+  // This must be an SGPR, but accept a VGPR.
+  unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
+   AMDGPU::SGPRRegBankID);
+  OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
+  break;
+}
 case Intrinsic::amdgcn_end_cf: {
   unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
   OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td 
b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 2f3b0ff2f76215e..0ec4f8150bfcc06 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1500,7 +1500,10 @@ def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins 
i32imm:$simm16), "$simm
 def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), 
"$simm16",
   [(int_amdgcn_s_decperflevel timm:$simm16)]> {
 }
-def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> {
+
+let Uses = [M0] in
+def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins), "",
+[(int_amdgcn_s_ttracedata M0)]> {
   let simm16 = 0;
   let fixed_imm = 1;
 }
@@ -1544,8 +1547,10 @@ let SubtargetPredicate = isGFX10Plus in {
   [(SIdenorm_mode (i32 timm:$simm16))]>;
   }
 
+  let hasSideEffects = 1 in
   def S_TTRACEDATA_IMM :
-SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16">;
+SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16",
+[(int_amdgcn_s_ttracedata_imm timm:$simm16)]>;
 } // End SubtargetPredicate = isGFX10Plus
 
 let SubtargetPredicate = isGFX11Plus in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
new file mode 100644
index 000..37b5357950e648b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 3
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < 
%s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < 
%s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+
+declare void @llvm.amdgcn.s.ttracedata(i32)
+declare void @llvm.amdgcn.s.ttracedata.imm(i16)
+
+define

[llvm] [clang] [clang-tools-extra] [AMDGPU] New ttracedata intrinsics (PR #70235)

2023-11-02 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/70235
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] effd47e - [Clang][AArch64] Add REQUIRES to new test

2023-12-13 Thread Jay Foad via cfe-commits

Author: Jay Foad
Date: 2023-12-13T10:49:52Z
New Revision: effd47ed45e3badd756103346a7c3b9e1e939e5e

URL: 
https://github.com/llvm/llvm-project/commit/effd47ed45e3badd756103346a7c3b9e1e939e5e
DIFF: 
https://github.com/llvm/llvm-project/commit/effd47ed45e3badd756103346a7c3b9e1e939e5e.diff

LOG: [Clang][AArch64] Add REQUIRES to new test

Added: 


Modified: 
clang/test/CodeGen/arm-vector_type-params-returns.c

Removed: 




diff  --git a/clang/test/CodeGen/arm-vector_type-params-returns.c 
b/clang/test/CodeGen/arm-vector_type-params-returns.c
index 61b617083515a7..14c3512ab81a9f 100644
--- a/clang/test/CodeGen/arm-vector_type-params-returns.c
+++ b/clang/test/CodeGen/arm-vector_type-params-returns.c
@@ -12,6 +12,8 @@
 // RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2  -triple aarch64 
-target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | 
FileCheck %s
 // RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2 -triple aarch64-none-linux-gnu 
-target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
 
+// REQUIRES: aarch64-registered-target
+
 #ifdef SVE_HEADER
   #include 
 #endif



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [Clang][AArch64] Add fix vector types to header into SVE (PR #73258)

2023-12-13 Thread Jay Foad via cfe-commits


@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 3
+
+// RUN: %clang_cc1 -DSVE_HEADER -triple aarch64 -target-feature +sve 
-emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -DSVE_HEADER -triple aarch64-none-linux-gnu -target-feature 
+sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
+
+// RUN: %clang_cc1 -DNEON_HEADER -triple aarch64 -target-feature +sve 
-emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -DNEON_HEADER -triple aarch64-none-linux-gnu 
-target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
+
+// RUN: %clang_cc1 -DSVE_HEADER -DNEON_HEADER -triple aarch64 -target-feature 
+sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -DSVE_HEADER -DNEON_HEADER -triple aarch64-none-linux-gnu 
-target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
+
+// RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2  -triple aarch64 
-target-feature +sve -emit-llvm -O2 -o - %s | opt -S -passes=mem2reg,sroa | 
FileCheck %s
+// RUN: %clang_cc1 -DNEON_HEADER -DSVE_HEADER2 -triple aarch64-none-linux-gnu 
-target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o - /dev/null %s
+

jayfoad wrote:

I've just added a `REQUIRES` line to this test in 
effd47ed45e3badd756103346a7c3b9e1e939e5e since it was failing in my AMDGPU-only 
build.

https://github.com/llvm/llvm-project/pull/73258
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [lldb] [libcxx] [compiler-rt] [libc] [flang] [clang] [lld] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-13 Thread Jay Foad via cfe-commits


@@ -0,0 +1,154 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s 
--check-prefixeses=GCN,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s 
--check-prefixeses=GCN,GFX10

jayfoad wrote:

> --check-prefixeses

That's what happens when you enable `M-x gollum-mode` in Emacs.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64][SME2] Add SQRSHRN, UQRSHRN, SQRSHRUN builtins for SME2, SVE2p1 (PR #75325)

2023-12-14 Thread Jay Foad via cfe-commits

jayfoad wrote:

Please remember to add a suitable `REQUIRES:` line to these new codegen tests, 
or put them in an `ARM` subdirectory with a suitable `lit.local.cfg`!

This new test is failing in non-ARM builds with:
```
FAIL: Clang :: CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c (5567 of 
76786)
******************** TEST 'Clang :: 
CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c' FAILED ********************

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: 
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/clang -cc1 
-internal-isystem 
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/lib/clang/18/include
 -nostdsysteminc -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - 
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c
 | /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/opt -S  
-passes=mem2reg,instcombine,tailcallelim | 
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/FileCheck 
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c
+ /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/clang -cc1 
-internal-isystem 
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/lib/clang/18/include
 -nostdsysteminc -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - 
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c
+ /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/opt -S 
-passes=mem2reg,instcombine,tailcallelim
+ /jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/build/bin/FileCheck 
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c:10:10:
 fatal error: 'arm_sve.h' file not found
   10 | #include <arm_sve.h>
      |          ^~~~~~~~~~~
1 error generated.
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c:22:17:
 error: CHECK-LABEL: expected string not found in input
// CHECK-LABEL: @test_svqrshrn_s16_s32_x2(
^
<stdin>:1:1: note: scanning from here
; ModuleID = '<stdin>'
^
<stdin>:1:14: note: possible intended match here
; ModuleID = '<stdin>'
             ^

Input file: <stdin>
Check file: 
/jenkins/workspace/llvm-llpc/llvm-npi-merge/llvm-project/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c

-dump-input=help explains the following input dump.

Input was:
<<
1: ; ModuleID = '<stdin>' 
label:22'0 X~~ error: no match found
label:22'1  ?  possible intended match
2: source_filename = "<stdin>" 
label:22'0 
>>
```

https://github.com/llvm/llvm-project/pull/75325
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] 50e78de - [AArch64][SME2] Add REQUIRES to new test

2023-12-14 Thread Jay Foad via cfe-commits

Author: Jay Foad
Date: 2023-12-14T13:20:37Z
New Revision: 50e78de76a5e77e15ddea48dfb520d6bbcbc1c45

URL: 
https://github.com/llvm/llvm-project/commit/50e78de76a5e77e15ddea48dfb520d6bbcbc1c45
DIFF: 
https://github.com/llvm/llvm-project/commit/50e78de76a5e77e15ddea48dfb520d6bbcbc1c45.diff

LOG: [AArch64][SME2] Add REQUIRES to new test

Added: 


Modified: 
clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c

Removed: 




diff  --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c 
b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c
index 8e8b7203148934..6ebf224db92377 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qrshr.c
@@ -1,4 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | 
opt -S  -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S  
-passes=mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x 
c++ %s | opt -S  -passes=mem2reg,instcombine,tailcallelim | FileCheck %s 
-check-prefix=CPP-CHECK



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64][SME2] Add SQRSHRN, UQRSHRN, SQRSHRUN builtins for SME2, SVE2p1 (PR #75325)

2023-12-14 Thread Jay Foad via cfe-commits

jayfoad wrote:

> Please remember to add a suitable `REQUIRES:` line to these new codegen tests

I've added one in 50e78de76a5e77e15ddea48dfb520d6bbcbc1c45

https://github.com/llvm/llvm-project/pull/75325
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[lld] [compiler-rt] [libc] [clang] [libcxx] [lldb] [flang] [mlir] [llvm] [clang-tools-extra] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)

2023-12-14 Thread Jay Foad via cfe-commits


@@ -684,6 +684,51 @@ s_rndne_f16 s5, 0xfe0b
 s_rndne_f16 s5, 0x3456
 // GFX12: encoding: [0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00]
 
+s_barrier_signal -2

jayfoad wrote:

Missing `s_get_barrier_state` tests in this file?

https://github.com/llvm/llvm-project/pull/74836
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [flang] [compiler-rt] [lld] [libcxx] [clang] [libcxxabi] [clang-tools-extra] [lldb] [AMDGPU] GFX12: select @llvm.prefetch intrinsic (PR #74576)

2023-12-14 Thread Jay Foad via cfe-commits


@@ -3164,6 +3164,18 @@ def : GCNPat <
 (as_i1timm $bound_ctrl))
 >;
 
+class SMPrefetchGetPcPat : GCNPat <

jayfoad wrote:

This pattern also interprets the "address" argument as being an offset from PC, 
so it should also be removed from this version of the patch.

https://github.com/llvm/llvm-project/pull/74576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [RISCV] Implement multi-lib reuse rule for RISC-V bare-metal toolchain (PR #73765)

2023-12-18 Thread Jay Foad via cfe-commits

jayfoad wrote:

The new test is crashing in my Release+Asserts build:
```
FAIL: Clang :: Driver/riscv-toolchain-gcc-multilib-reuse.c (1081 of 1081)
******************** TEST 'Clang :: 
Driver/riscv-toolchain-gcc-multilib-reuse.c' FAILED ********************
Exit Code: 2

Command Output (stderr):
--
RUN: at line 1: /home/jayfoad2/llvm-release/bin/clang 
/home/jayfoad2/git/llvm-project/clang/test/Driver/riscv-toolchain-gcc-multilib-reuse.c
-target riscv64-unknown-elf
--gcc-toolchain=/home/jayfoad2/git/llvm-project/clang/test/Driver/Inputs/multilib_riscv_elf_sdk
--print-multi-directory -march=rv32imc -mabi=ilp32| 
/home/jayfoad2/llvm-release/bin/FileCheck 
-check-prefix=GCC-MULTI-LIB-REUSE-RV32IMC-ILP32 
/home/jayfoad2/git/llvm-project/clang/test/Driver/riscv-toolchain-gcc-multilib-reuse.c
+ /home/jayfoad2/llvm-release/bin/FileCheck 
-check-prefix=GCC-MULTI-LIB-REUSE-RV32IMC-ILP32 
/home/jayfoad2/git/llvm-project/clang/test/Driver/riscv-toolchain-gcc-multilib-reuse.c
+ /home/jayfoad2/llvm-release/bin/clang 
/home/jayfoad2/git/llvm-project/clang/test/Driver/riscv-toolchain-gcc-multilib-reuse.c
 -target riscv64-unknown-elf 
--gcc-toolchain=/home/jayfoad2/git/llvm-project/clang/test/Driver/Inputs/multilib_riscv_elf_sdk
 --print-multi-directory -march=rv32imc -mabi=ilp32
clang: 
/home/jayfoad2/git/llvm-project/clang/lib/Driver/ToolChains/CommonArgs.cpp:2189:
 void clang::driver::tools::addMultilibFlag(bool, const llvm::StringRef, 
Multilib::flags_list &): Assertion `Flag.front() == '-'' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and 
include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.  Program arguments: /home/jayfoad2/llvm-release/bin/clang 
/home/jayfoad2/git/llvm-project/clang/test/Driver/riscv-toolchain-gcc-multilib-reuse.c
 -target riscv64-unknown-elf 
--gcc-toolchain=/home/jayfoad2/git/llvm-project/clang/test/Driver/Inputs/multilib_riscv_elf_sdk
 --print-multi-directory -march=rv32imc -mabi=ilp32
1.  Compilation construction
 #0 0x070bfaf7 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) 
(/home/jayfoad2/llvm-release/bin/clang+0x70bfaf7)
 #1 0x070bd6ae llvm::sys::RunSignalHandlers() 
(/home/jayfoad2/llvm-release/bin/clang+0x70bd6ae)
 #2 0x070c01ca SignalHandler(int) Signals.cpp:0:0
 #3 0x7fc909c42520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520)
 #4 0x7fc909c969fc __pthread_kill_implementation ./nptl/pthread_kill.c:44:76
 #5 0x7fc909c969fc __pthread_kill_internal ./nptl/pthread_kill.c:78:10
 #6 0x7fc909c969fc pthread_kill ./nptl/pthread_kill.c:89:10
 #7 0x7fc909c42476 gsignal ./signal/../sysdeps/posix/raise.c:27:6
 #8 0x7fc909c287f3 abort ./stdlib/abort.c:81:7
 #9 0x7fc909c2871b _nl_load_domain ./intl/loadmsgcat.c:1177:9
#10 0x7fc909c39e96 (/lib/x86_64-linux-gnu/libc.so.6+0x39e96)
#11 0x07b32257 clang::driver::tools::addMultilibFlag(bool, 
llvm::StringRef, std::vector, std::allocator>, 
std::allocator, 
std::allocator>>>&) (/home/jayfoad2/llvm-release/bin/clang+0x7b32257)
#12 0x07abb016 clang::driver::MultilibBuilder::flag(llvm::StringRef, 
bool) (/home/jayfoad2/llvm-release/bin/clang+0x7abb016)
#13 0x07b9ddbf findRISCVMultilibs(clang::driver::Driver const&, 
llvm::Triple const&, llvm::StringRef, llvm::opt::ArgList const&, 
clang::driver::DetectedMultilibs&) Gnu.cpp:0:0
#14 0x07b95459 
clang::driver::toolchains::Generic_GCC::GCCInstallationDetector::ScanGCCForMultilibs(llvm::Triple
 const&, llvm::opt::ArgList const&, llvm::StringRef, bool) Gnu.cpp:0:0
#15 0x07b9b164 
clang::driver::toolchains::Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple(llvm::Triple
 const&, llvm::opt::ArgList const&, std::__cxx11::basic_string, std::allocator> const&, llvm::StringRef, bool, 
bool, bool) Gnu.cpp:0:0
#16 0x07b9324c 
clang::driver::toolchains::Generic_GCC::GCCInstallationDetector::init(llvm::Triple
 const&, llvm::opt::ArgList const&) Gnu.cpp:0:0
#17 0x07bf34ba 
clang::driver::toolchains::RISCVToolChain::RISCVToolChain(clang::driver::Driver 
const&, llvm::Triple const&, llvm::opt::ArgList const&) RISCVToolchain.cpp:0:0
#18 0x07a31458 clang::driver::Driver::getToolChain(llvm::opt::ArgList 
const&, llvm::Triple const&) const 
(/home/jayfoad2/llvm-release/bin/clang+0x7a31458)
#19 0x07a38bbe 
clang::driver::Driver::BuildCompilation(llvm::ArrayRef) 
(/home/jayfoad2/llvm-release/bin/clang+0x7a38bbe)
#20 0x04a8a25a clang_main(int, char**, llvm::ToolContext const&) 
(/home/jayfoad2/llvm-release/bin/clang+0x4a8a25a)
#21 0x04a9bb61 main (/home/jayfoad2/llvm-release/bin/clang+0x4a9bb61)
#22 0x7fc909c29d90 __libc_start_call_main 
./csu/../sysdeps/nptl/libc_start_call_main.h:58:16
#23 0x7fc909c29e40 call_init ./csu/../csu/libc-start.c:128:20
#24 0x7fc909c29e40 __libc_start_main ./csu/../csu/libc-start.c:379:5
#25 0x04a875a5 _start (/home/jayfoad2/llvm-relea

[flang] [llvm] [clang-tools-extra] [clang] [libcxx] [libc] [compiler-rt] [AMDGPU] Produce better memoperand for LDS DMA (PR #75247)

2023-12-19 Thread Jay Foad via cfe-commits

jayfoad wrote:

> Use PoisonValue instead of nullptr for load memop as a Value.

What is the effect of that? I thought nullptr was supposed to represent an 
unknown value, so you have to conservatively assume it might alias with 
anything.

https://github.com/llvm/llvm-project/pull/75247
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [libcxx] [lldb] [clang] [lld] [flang] [compiler-rt] [clang-tools-extra] [libc] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-19 Thread Jay Foad via cfe-commits

jayfoad wrote:

How does this work in a case like this?
```
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) 
@lds.3, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) 
%ptr, i32 4, i32 0, i32 0, i32 0, i32 0)
%val.3 = load float, ptr addrspace(3) @lds.3, align 4
```
i.e.
- store to known lds address `@lds.3` (this will use slot 0 and another slot 
e.g. slot 3?)
- store to unknown lds address (this will use slot 0?)
- load from known lds address `@lds.3` (this will use slot 3?)

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[flang] [clang-tools-extra] [lld] [llvm] [compiler-rt] [lldb] [libc] [libcxx] [clang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

2023-12-19 Thread Jay Foad via cfe-commits

jayfoad wrote:

> > How does this work in a case like this?
> > ```
> > call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr 
> > addrspace(3) @lds.3, i32 4, i32 0, i32 0, i32 0, i32 0)
> > call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr 
> > addrspace(3) %ptr, i32 4, i32 0, i32 0, i32 0, i32 0)
> > %val.3 = load float, ptr addrspace(3) @lds.3, align 4
> > ```
> > 
> > i.e.
> > ```
> > * store to known lds address `@lds.3` (this will use slot 0 and another 
> > slot e.g. slot 3?)
> > 
> > * store to unknown lds address (this will use slot 0?)
> > 
> > * load from known lds address `@lds.3` (this will use slot 3?)
> > ```
> 
> It does not know the pointer, so it uses default slot 0 and waits till 0.

Test case:
```
@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
@lds.1 = internal addrspace(3) global [64 x float] poison, align 16

declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) 
nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)

define amdgpu_kernel void @f(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr 
addrspace(1) %out, ptr addrspace(3) %ptr) {
main_body:
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) 
@lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) 
%ptr, i32 4, i32 0, i32 0, i32 0, i32 0)
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %val.0 = load volatile float, ptr addrspace(3) %gep.0, align 4
  %val.1 = load volatile float, ptr addrspace(3) %gep.1, align 4
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  store float %val.0, ptr addrspace(1) %out
  store float %val.1, ptr addrspace(1) %out.gep.1
  ret void
}
```
Generates:
```
s_load_dwordx8 s[4:11], s[0:1], 0x24
s_load_dword s2, s[0:1], 0x44
s_mov_b32 m0, 0
v_mov_b32_e32 v2, 0
s_waitcnt lgkmcnt(0)
buffer_load_dword off, s[4:7], 0 lds
s_mov_b32 m0, s2
s_lshl_b32 s0, s8, 2
buffer_load_dword off, s[4:7], 0 lds
s_lshl_b32 s1, s9, 2
v_mov_b32_e32 v0, s0
v_mov_b32_e32 v1, s1
s_waitcnt vmcnt(1)
ds_read_b32 v0, v0
s_waitcnt vmcnt(0)
ds_read_b32 v1, v1 offset:256
s_waitcnt lgkmcnt(0)
global_store_dwordx2 v2, v[0:1], s[10:11]
s_endpgm
```
The `s_waitcnt vmcnt(1)` seems incorrect, because the second buffer-load-to-lds 
might clobber `@lds.0`.

> I have to tell anyone interested here: before I even wrote this code it 
> didn't know of the dependency and did not wait for anything at all. Everyone 
> was happy.

I am still happy, because buffer/flat/global-load-to-lds was removed in GFX11.

https://github.com/llvm/llvm-project/pull/74537
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[mlir] [llvm] [clang] [libcxx] [libc] [compiler-rt] [flang] [AMDGPU] Define new targets gfx1200 and gfx1201 (PR #73133)

2023-11-23 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/73133
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][dataflow] Retrieve members from accessors called using member… (PR #73978)

2023-12-04 Thread Jay Foad via cfe-commits

jayfoad wrote:

Hi, on my Release+Asserts build this is causing:
```
FAIL: Clang-Unit :: 
Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests/32/38 (134 of 658)
******************** TEST 'Clang-Unit :: 
Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests/32/38' FAILED ********************

Script(shard):
--
GTEST_OUTPUT=json:/home/jayfoad2/llvm-release/tools/clang/unittests/Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests-Clang-Unit-2611196-32-38.json
 GTEST_SHUFFLE=0 GTEST_TOTAL_SHARDS=38 GTEST_SHARD_INDEX=32 
/home/jayfoad2/llvm-release/tools/clang/unittests/Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests
--

Script:
--
/home/jayfoad2/llvm-release/tools/clang/unittests/Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests
 
--gtest_filter=EnvironmentTest.ModelMemberForAccessorUsingMethodPointerThroughTemplate
--
/home/jayfoad2/git/llvm-project/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp:362:
 Failure
Value of: DAContext.getModeledFields(QualType(Struct->getTypeForDecl(), 0))
Expected: contains at least one element that is equal to 0x4b29e98
  Actual: {}


/home/jayfoad2/git/llvm-project/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp:362
Value of: DAContext.getModeledFields(QualType(Struct->getTypeForDecl(), 0))
Expected: contains at least one element that is equal to 0x4b29e98
  Actual: {}





Failed Tests (1):
  Clang-Unit :: 
Analysis/FlowSensitive/./ClangAnalysisFlowSensitiveTests/EnvironmentTest/ModelMemberForAccessorUsingMethodPointerThroughTemplate
```

https://github.com/llvm/llvm-project/pull/73978
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-format] Fix a bug in `git-clang-format --binary` (PR #74293)

2023-12-04 Thread Jay Foad via cfe-commits

https://github.com/jayfoad approved this pull request.

LGTM but the commit message should really explain what problem this fixes 
instead of just saying "rework".

https://github.com/llvm/llvm-project/pull/74293
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Reapply "InstCombine: Introduce SimplifyDemandedUseFPClass"" (PR #74056)

2023-12-21 Thread Jay Foad via cfe-commits

jayfoad wrote:

> The referenced issue violates the spec for finite-only math only by
> using a return value for a constant infinity.

You mean this issue? 
https://github.com/llvm/llvm-project/commit/5a36904c515b#commitcomment-129847939

Can you explain how your patch "broke" it? If you return infinity from a 
function marked with `ninf`, I would expect your patch to have no effect, 
because `DemandedMask & Known.KnownFPClasses` will be empty so 
`getFPClassConstant` will return `nullptr`.

https://github.com/llvm/llvm-project/pull/74056
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[lldb] [llvm] [mlir] [openmp] [libc] [flang] [clang] [AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and intrinsic (PR #76149)

2023-12-21 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/76149

>From b14a554a15e4de88c9afc428f9c6898090e6eb23 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Thu, 21 Dec 2023 12:00:26 +
Subject: [PATCH] [AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and
 intrinsic

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  | 10 ++-
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |  1 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  1 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |  1 +
 llvm/lib/Target/AMDGPU/FLATInstructions.td| 11 +++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  1 +
 ...vm.amdgcn.global.atomic.ordered.add.b64.ll | 65 +++
 llvm/test/MC/AMDGPU/gfx11_unsupported.s   |  3 +
 llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 24 +++
 .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt  | 12 
 10 files changed, 124 insertions(+), 5 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 51bd9b63c127ed..3985c8871e1615 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -10,6 +10,8 @@
 //
 
//===--===//
 
+def global_ptr_ty : LLVMQualPointerType<1>;
+
 class AMDGPUReadPreloadRegisterIntrinsic
   : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
 
@@ -2353,10 +2355,10 @@ def int_amdgcn_s_get_waveid_in_workgroup :
   Intrinsic<[llvm_i32_ty], [],
 [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, 
IntrNoFree]>;
 
-class AMDGPUGlobalAtomicRtn : Intrinsic <
+class AMDGPUGlobalAtomicRtn : 
Intrinsic <
   [vt],
-  [llvm_anyptr_ty,// vaddr
-   vt],   // vdata(VGPR)
+  [pt,  // vaddr
+   vt], // vdata(VGPR)
   [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, 
IntrNoFree], "",
   [SDNPMemOperand]>;
 
@@ -2486,6 +2488,8 @@ def int_amdgcn_permlanex16_var : 
ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
 [IntrNoMem, IntrConvergent, IntrWillReturn,
  ImmArg>, ImmArg>, IntrNoCallback, 
IntrNoFree]>;
 
+def int_amdgcn_global_atomic_ordered_add_b64 : 
AMDGPUGlobalAtomicRtn;
+
 def int_amdgcn_flat_atomic_fmin_num   : 
AMDGPUGlobalAtomicRtn;
 def int_amdgcn_flat_atomic_fmax_num   : 
AMDGPUGlobalAtomicRtn;
 def int_amdgcn_global_atomic_fmin_num : 
AMDGPUGlobalAtomicRtn;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index eaf72d7157ee2d..36e07d944c942c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -642,6 +642,7 @@ defm int_amdgcn_global_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_csub : noret_op;
 defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
 defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
+defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
 defm int_amdgcn_flat_atomic_fmin_num : noret_op;
 defm int_amdgcn_flat_atomic_fmax_num : noret_op;
 defm int_amdgcn_global_atomic_fmin_num : noret_op;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c9412f720c62ec..fba060464a6e74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4690,6 +4690,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const 
MachineInstr &MI) const {
 case Intrinsic::amdgcn_flat_atomic_fmax_num:
 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
+case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
   return getDefaultMappingAllVGPR(MI);
 case Intrinsic::amdgcn_ds_ordered_add:
 case Intrinsic::amdgcn_ds_ordered_swap:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td 
b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index beb670669581f1..4cc8871a00fe1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -243,6 +243,7 @@ def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 0dd2b3f5c2c912..615f8cd54d8f9c 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -926,9 +926,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_usho
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_sshort">;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Global_Load_LDS_Pseudo 
<"global_load_lds_dword">;
 
-} // End is_flat_global = 1
-
+let Subt

[clang] [openmp] [flang] [lldb] [libc] [mlir] [llvm] [AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and intrinsic (PR #76149)

2024-01-02 Thread Jay Foad via cfe-commits

jayfoad wrote:

Ping!

https://github.com/llvm/llvm-project/pull/76149
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[openmp] [clang] [libc] [mlir] [lldb] [flang] [llvm] [AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and intrinsic (PR #76149)

2024-01-02 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/76149
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [clang] [AMDGPU][GFX12] Default component broadcast store (PR #76212)

2024-01-05 Thread Jay Foad via cfe-commits

https://github.com/jayfoad approved this pull request.

LGTM. @arsenm does this address your concerns?

https://github.com/llvm/llvm-project/pull/76212
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [clang] [AMDGPU] Flip the default value of maybeAtomic. NFCI. (PR #75220)

2024-01-09 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/75220

>From 429d0a22cd4208eb0c854ccf98df1ba86fd3b0cb Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Tue, 12 Dec 2023 17:15:26 +
Subject: [PATCH] [AMDGPU] Flip the default value of maybeAtomic. NFCI.

In practice maybeAtomic = 0 is used to prevent SIMemoryLegalizer from
interfering with instructions that are mayLoad or mayStore but lack
MachineMemOperands. These instructions should be the exception not the
rule, so this patch sets maybeAtomic = 1 by default and only overrides
it to 0 where necessary.
---
 llvm/lib/Target/AMDGPU/BUFInstructions.td| 4 
 llvm/lib/Target/AMDGPU/DSInstructions.td | 1 -
 llvm/lib/Target/AMDGPU/EXPInstructions.td| 1 +
 llvm/lib/Target/AMDGPU/FLATInstructions.td   | 7 ---
 llvm/lib/Target/AMDGPU/LDSDIRInstructions.td | 1 +
 llvm/lib/Target/AMDGPU/SIInstrFormats.td | 2 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +-
 llvm/lib/Target/AMDGPU/SMInstructions.td | 1 +
 8 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 44fd4ef8641270..4696ea47f9cefd 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -477,7 +477,6 @@ class MUBUF_Load_Pseudo .ret;
   let mayLoad = 0;
   let mayStore = 1;
-  let maybeAtomic = 1;
   let elements = getMUBUFElements.ret;
   let tfe = isTFE;
 }
@@ -618,7 +616,6 @@ class MUBUF_Pseudo_Store_Lds
   let LGKM_CNT = 1;
   let mayLoad = 1;
   let mayStore = 1;
-  let maybeAtomic = 1;
 
   let has_vdata = 0;
   let has_vaddr = 0;
@@ -680,7 +677,6 @@ class MUBUF_Atomic_Pseudo patt
   // Most instruction load and store data, so set this as the default.
   let mayLoad = 1;
   let mayStore = 1;
-  let maybeAtomic = 1;
 
   let hasSideEffects = 0;
   let SchedRW = [WriteLDS];
diff --git a/llvm/lib/Target/AMDGPU/EXPInstructions.td 
b/llvm/lib/Target/AMDGPU/EXPInstructions.td
index ff1d661ef6fe1d..4cfee7d013ef1a 100644
--- a/llvm/lib/Target/AMDGPU/EXPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EXPInstructions.td
@@ -20,6 +20,7 @@ class EXPCommon : InstSI<
   let EXP_CNT = 1;
   let mayLoad = done;
   let mayStore = 1;
+  let maybeAtomic = 0;
   let UseNamedOperandTable = 1;
   let Uses = !if(row, [EXEC, M0], [EXEC]);
   let SchedRW = [WriteExport];
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c0251164faee8b..a1ff3af663352e 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -173,7 +173,6 @@ class FLAT_Load_Pseudo  {
@@ -221,7 +219,6 @@ class FLAT_Global_Load_AddTid_Pseudo  {
@@ -450,7 +444,6 @@ class FLAT_AtomicNoRet_Pseudo : InstSI<
   let hasSideEffects = 0;
   let mayLoad = 1;
   let mayStore = 0;
+  let maybeAtomic = 0;
 
   string Mnemonic = opName;
   let UseNamedOperandTable = 1;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td 
b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 585a3eb7861878..1b66d163714fbc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -91,7 +91,7 @@ class InstSI  {
   let hasSideEffects = 1;
-  let maybeAtomic = 1;
 }
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
@@ -557,6 +556,7 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
   let hasNoSchedulingInfo = 1;
   let FixedSize = 1;
   let isMeta = 1;
+  let maybeAtomic = 0;
 }
 
 // Used as an isel pseudo to directly emit initialization with an
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td 
b/llvm/lib/Target/AMDGPU/SMInstructions.td
index c18846483cf95a..323f49ab91f01e 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -29,6 +29,7 @@ class SM_Pseudo  patt
   let mayStore = 0;
   let mayLoad = 1;
   let hasSideEffects = 0;
+  let maybeAtomic = 0;
   let UseNamedOperandTable = 1;
   let SchedRW = [WriteSMEM];
 

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [llvm] [AMDGPU] Flip the default value of maybeAtomic. NFCI. (PR #75220)

2024-01-09 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/75220
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [AMDGPU] Flip the default value of maybeAtomic. NFCI. (PR #75220)

2024-01-09 Thread Jay Foad via cfe-commits


@@ -29,6 +29,7 @@ class SM_Pseudo  patt
   let mayStore = 0;
   let mayLoad = 1;
   let hasSideEffects = 0;
+  let maybeAtomic = 0;

jayfoad wrote:

#77443

https://github.com/llvm/llvm-project/pull/75220
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [libclc] [lld] [flang] [mlir] [libcxx] [libunwind] [clang] [lldb] [libc] [llvm] [compiler-rt] [AMDGPU] Fix broken sign-extended subword buffer load combine (PR #77470)

2024-01-09 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/77470

>From ae231d88c5b5e2e0996edefd45389992f8e97d05 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Tue, 9 Jan 2024 13:16:24 +
Subject: [PATCH 1/3] [AMDGPU] Precommit tests for broken combine

Add tests for sign-extending the result of an unsigned subword buffer
load from the wrong width.
---
 .../llvm.amdgcn.struct.buffer.load.ll | 82 +++
 1 file changed, 82 insertions(+)

diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
index 81c0f7557e6417..fcd7821a86897e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -500,6 +500,47 @@ define amdgpu_ps float 
@struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
   ret float %cast
 }
 
+define amdgpu_ps float @struct_buffer_load_i8_sext_wrong_width(<4 x i32> inreg 
%rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  ; GFX8-LABEL: name: struct_buffer_load_i8_sext_wrong_width
+  ; GFX8: bb.1 (%ir-block.0):
+  ; GFX8-NEXT:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, 
$vgpr1
+  ; GFX8-NEXT: {{  $}}
+  ; GFX8-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX8-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX8-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX8-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], 
%subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3
+  ; GFX8-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX8-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX8-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX8-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], 
%subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX8-NEXT:   [[BUFFER_LOAD_SBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_SBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 
0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+  ; GFX8-NEXT:   $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_BOTHEN]]
+  ; GFX8-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
+  ;
+  ; GFX12-LABEL: name: struct_buffer_load_i8_sext_wrong_width
+  ; GFX12: bb.1 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, 
$vgpr1
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX12-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX12-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX12-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], 
%subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3
+  ; GFX12-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX12-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX12-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX12-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], 
%subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX12-NEXT:   [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 
[[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN]]
+  ; GFX12-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
+  %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 
%vindex, i32 %voffset, i32 %soffset, i32 0)
+  %trunc = trunc i8 %val to i4
+  %ext = sext i4 %trunc to i32
+  %cast = bitcast i32 %ext to float
+  ret float %cast
+}
+
 define amdgpu_ps float 
@struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4
 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
   ; GFX8-LABEL: name: 
struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
   ; GFX8: bb.1 (%ir-block.0):
@@ -580,6 +621,47 @@ define amdgpu_ps float 
@struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
   ret float %cast
 }
 
+define amdgpu_ps float @struct_buffer_load_i16_sext_wrong_width(<4 x i32> 
inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  ; GFX8-LABEL: name: struct_buffer_load_i16_sext_wrong_width
+  ; GFX8: bb.1 (%ir-block.0):
+  ; GFX8-NEXT:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, 
$vgpr1
+  ; GFX8-NEXT: {{  $}}
+  ; GFX8-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX8-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX8-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX8-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], 
%subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3
+  ; G

[clang-tools-extra] [libc] [mlir] [lld] [libcxx] [libclc] [llvm] [clang] [flang] [libunwind] [lldb] [compiler-rt] [AMDGPU] Fix broken sign-extended subword buffer load combine (PR #77470)

2024-01-10 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/77470
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] 6bec3e9 - [APInt] Remove all uses of zextOrSelf, sextOrSelf and truncOrSelf

2022-05-19 Thread Jay Foad via cfe-commits

Author: Jay Foad
Date: 2022-05-19T11:23:13+01:00
New Revision: 6bec3e9303d68b8b264de3a02ca943d9dd752004

URL: 
https://github.com/llvm/llvm-project/commit/6bec3e9303d68b8b264de3a02ca943d9dd752004
DIFF: 
https://github.com/llvm/llvm-project/commit/6bec3e9303d68b8b264de3a02ca943d9dd752004.diff

LOG: [APInt] Remove all uses of zextOrSelf, sextOrSelf and truncOrSelf

Most clients only used these methods because they wanted to be able to
extend or truncate to the same bit width (which is a no-op). Now that
the standard zext, sext and trunc allow this, there is no reason to use
the OrSelf versions.

The OrSelf versions additionally have the strange behaviour of allowing
extending to a *smaller* width, or truncating to a *larger* width, which
are also treated as no-ops. A small amount of client code relied on this
(ConstantRange::castOp and MicrosoftCXXNameMangler::mangleNumber) and
needed rewriting.

Differential Revision: https://reviews.llvm.org/D125557

Added: 


Modified: 
clang/lib/AST/ExprConstant.cpp
clang/lib/AST/MicrosoftMangle.cpp
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/Sema/SemaDecl.cpp
clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp
llvm/lib/Analysis/BasicAliasAnalysis.cpp
llvm/lib/Analysis/ConstantFolding.cpp
llvm/lib/Analysis/LazyValueInfo.cpp
llvm/lib/Analysis/MemoryBuiltins.cpp
llvm/lib/Analysis/ScalarEvolution.cpp
llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/IR/ConstantRange.cpp
llvm/lib/Support/APFixedPoint.cpp
llvm/lib/Support/APInt.cpp
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
llvm/test/TableGen/VarLenEncoder.td
llvm/utils/TableGen/VarLenCodeEmitterGen.cpp
polly/lib/CodeGen/IslExprBuilder.cpp

Removed: 




diff  --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 519be84a342b3..f679dba44f001 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -8596,7 +8596,7 @@ static bool getBytesReturnedByAllocSizeCall(const 
ASTContext &Ctx,
 Into = ExprResult.Val.getInt();
 if (Into.isNegative() || !Into.isIntN(BitsInSizeT))
   return false;
-Into = Into.zextOrSelf(BitsInSizeT);
+Into = Into.zext(BitsInSizeT);
 return true;
   };
 
@@ -9582,8 +9582,8 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const 
CXXNewExpr *E) {
 
   unsigned Bits =
   std::max(CAT->getSize().getBitWidth(), ArrayBound.getBitWidth());
-  llvm::APInt InitBound = CAT->getSize().zextOrSelf(Bits);
-  llvm::APInt AllocBound = ArrayBound.zextOrSelf(Bits);
+  llvm::APInt InitBound = CAT->getSize().zext(Bits);
+  llvm::APInt AllocBound = ArrayBound.zext(Bits);
   if (InitBound.ugt(AllocBound)) {
 if (IsNothrow)
   return ZeroInitialization(E);
@@ -10377,9 +10377,9 @@ bool VectorExprEvaluator::VisitCastExpr(const CastExpr 
*E) {
   for (unsigned i = 0; i < NElts; i++) {
 llvm::APInt Elt;
 if (BigEndian)
-  Elt = SValInt.rotl(i*EltSize+FloatEltSize).truncOrSelf(FloatEltSize);
+  Elt = SValInt.rotl(i * EltSize + FloatEltSize).trunc(FloatEltSize);
 else
-  Elt = SValInt.rotr(i*EltSize).truncOrSelf(FloatEltSize);
+  Elt = SValInt.rotr(i * EltSize).trunc(FloatEltSize);
 Elts.push_back(APValue(APFloat(Sem, Elt)));
   }
 } else if (EltTy->isIntegerType()) {

diff  --git a/clang/lib/AST/MicrosoftMangle.cpp 
b/clang/lib/AST/MicrosoftMangle.cpp
index abe2b64f57278..e84946d1f21ec 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -808,8 +808,8 @@ void MicrosoftCXXNameMangler::mangleNumber(llvm::APSInt 
Number) {
   // to convert every integer to signed 64 bit before mangling (including
   // unsigned 64 bit values). Do the same, but preserve bits beyond the bottom
   // 64.
-  llvm::APInt Value =
-  Number.isSigned() ? Number.sextOrSelf(64) : Number.zextOrSelf(64);
+  unsigned Width = std::max(Number.getBitWidth(), 64U);
+  llvm::APInt Value = Number.extend(Width);

[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-03-26 Thread Jay Foad via cfe-commits


@@ -2326,6 +2326,20 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 }
 #endif
 
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  AMDGPU::Waitcnt Wait;
+  if (ST->hasExtendedWaitCounts())
+Wait = AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0);
+  else
+Wait = AMDGPU::Waitcnt(0, 0, 0, 0);
+
+  if (!Inst.mayStore())
+Wait.StoreCnt = ~0u;

jayfoad wrote:

```suggestion
  AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(Inst.mayStore());
```
However, as a general rule:
- loads and atomics-with-return update LOADcnt
- stores and atomics-without-return update STOREcnt

so it might be more accurate to use the condition `Inst.mayStore() && 
!SIInstrInfo::isAtomicRet(Inst)`.

Please make sure you have tests for atomics with and without return.

https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-03-29 Thread Jay Foad via cfe-commits


@@ -2326,6 +2326,20 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 }
 #endif
 
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  AMDGPU::Waitcnt Wait;
+  if (ST->hasExtendedWaitCounts())
+Wait = AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0);
+  else
+Wait = AMDGPU::Waitcnt(0, 0, 0, 0);
+
+  if (!Inst.mayStore())
+Wait.StoreCnt = ~0u;

jayfoad wrote:

GFX10 introduced a separate counter for **VMEM** stores with the name VScnt. 
GFX12 just renamed it to STOREcnt. No architecture has a separate store counter 
for DS or SMEM. So `ds_add_u32 v0, v1` followed by `s_waitcnt lgkmcnt(0)` 
(pre-GFX12) or `s_wait_dscnt 0` (GFX12) is fine.

https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-04-02 Thread Jay Foad via cfe-commits


@@ -2594,12 +2594,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const 
SIMemOpInfo &MOI,
 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
-  Changed |= CC->insertWait(MI, MOI.getScope(),
-MOI.getInstrAddrSpace(),
-isAtomicRet(*MI) ? SIMemOp::LOAD :
-   SIMemOp::STORE,
-MOI.getIsCrossAddressSpaceOrdering(),
-Position::AFTER);
+  Changed |=

jayfoad wrote:

Remove this.

https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-04-02 Thread Jay Foad via cfe-commits


@@ -0,0 +1,1406 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | 
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | 
FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | 
FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 
-mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck 
--check-prefixes=GFX9-FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | 
FileCheck %s -check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | 
FileCheck %s -check-prefixes=GFX12
+
+; from atomicrmw-expand.ll
+; covers flat_load, flat_atomic (atomic with return)
+;
+define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
+; GFX9-LABEL: syncscope_workgroup_nortn:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:flat_load_dword v4, v[0:1]
+; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-NEXT:; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:v_add_f32_e32 v3, v4, v2
+; GFX9-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT:s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:v_mov_b32_e32 v4, v3
+; GFX9-NEXT:s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:s_cbranch_execnz .LBB0_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A:   ; %bb.0:
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX90A-NEXT:; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:v_add_f32_e32 v4, v5, v2
+; GFX90A-NEXT:flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:s_cbranch_execnz .LBB0_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: syncscope_workgroup_nortn:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:flat_load_dword v4, v[0:1]
+; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX10-NEXT:; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:v_add_f32_e32 v3, v4, v2
+; GFX10-NEXT:s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:buffer_gl0_inv
+; GFX10-NEXT:v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:v_mov_b32_e32 v4, v3
+; GFX10-NEXT:s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:s_cbranch_execnz .LBB0_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn:
+; GFX9-FLATSCR:   ; %bb.0:
+; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:flat_load_dword v4, v[0:1]
+; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:s_mov_b64 s[0:1], 0
+; GFX9-FLATSCR-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-FLATSCR-NEXT:; =>This Inner Loop Header: Depth=1
+; GFX9-FLATSCR-NEXT:v_add_f32_e32 v3, v4, v2
+; GFX9-FLATSCR-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-FLATSCR-NEXT:s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-FLATSCR-NEXT:v_mov_b32_e32 v4, v3
+; GFX9-FLATSCR-NEXT:s_andn2_b64 exec, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:s_cbranch_execnz .LBB0_1
+; GFX9-FLATSCR-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-FLATSCR-NEXT:s_or_b64 exec, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: syncscope_workgroup_nortn:
+; GFX11:   ; %bb.0:
+; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:flat_load_b32 v4, v[0:1]
+; GFX11-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX11-NEXT:; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:s_delay_al

[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-04-02 Thread Jay Foad via cfe-commits


@@ -0,0 +1,1406 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | 
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | 
FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | 
FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 
-mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck 
--check-prefixes=GFX9-FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | 
FileCheck %s -check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | 
FileCheck %s -check-prefixes=GFX12
+
+; from atomicrmw-expand.ll
+; covers flat_load, flat_atomic (atomic with return)
+;
+define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
+; GFX9-LABEL: syncscope_workgroup_nortn:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:flat_load_dword v4, v[0:1]
+; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-NEXT:; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:v_add_f32_e32 v3, v4, v2
+; GFX9-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT:s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:v_mov_b32_e32 v4, v3
+; GFX9-NEXT:s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:s_cbranch_execnz .LBB0_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A:   ; %bb.0:
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX90A-NEXT:; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:v_add_f32_e32 v4, v5, v2
+; GFX90A-NEXT:flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:s_cbranch_execnz .LBB0_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: syncscope_workgroup_nortn:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:flat_load_dword v4, v[0:1]
+; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX10-NEXT:; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:v_add_f32_e32 v3, v4, v2
+; GFX10-NEXT:s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:buffer_gl0_inv
+; GFX10-NEXT:v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:v_mov_b32_e32 v4, v3
+; GFX10-NEXT:s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:s_cbranch_execnz .LBB0_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn:
+; GFX9-FLATSCR:   ; %bb.0:
+; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:flat_load_dword v4, v[0:1]
+; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:s_mov_b64 s[0:1], 0
+; GFX9-FLATSCR-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-FLATSCR-NEXT:; =>This Inner Loop Header: Depth=1
+; GFX9-FLATSCR-NEXT:v_add_f32_e32 v3, v4, v2
+; GFX9-FLATSCR-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-FLATSCR-NEXT:s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-FLATSCR-NEXT:v_mov_b32_e32 v4, v3
+; GFX9-FLATSCR-NEXT:s_andn2_b64 exec, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:s_cbranch_execnz .LBB0_1
+; GFX9-FLATSCR-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-FLATSCR-NEXT:s_or_b64 exec, exec, s[0:1]
+; GFX9-FLATSCR-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: syncscope_workgroup_nortn:
+; GFX11:   ; %bb.0:
+; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:flat_load_b32 v4, v[0:1]
+; GFX11-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX11-NEXT:; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:s_delay_al

[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-02-07 Thread Jay Foad via cfe-commits

jayfoad wrote:

> This logic would need updating again for GFX12. It seems like it's 
> duplicating a lot of knowledge which is already implemented in 
> SIInsertWaitcnts.

Just to demonstrate, you could implement this feature in SIInsertWaitcnts for 
**all** supported architectures with something like:
```diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6ecb1c8bf6e1..910cd094f8f2 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2299,6 +2299,12 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
 updateEventWaitcntAfter(Inst, &ScoreBrackets);
 
+AMDGPU::Waitcnt Wait =
+AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts());
+ScoreBrackets.simplifyWaitcnt(Wait);
+Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
+ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
+
 #if 0 // TODO: implement resource type check controlled by options with ub = 
LB.
 // If this instruction generates a S_SETVSKIP because it is an
 // indexed resource, and we are on Tahiti, then it will also force
```
Handling VSCNT/STORECNT correctly is a little more complicated but not much.

https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-02-27 Thread Jay Foad via cfe-commits

https://github.com/jayfoad requested changes to this pull request.

I've added _some_ inline comments, but really I don't want to spend the time to 
review this properly (or maintain it, or extend it for new architectures in 
future). All this logic already exists in SIInsertWaitcnts. Duplicating it here 
is not a good design.

https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-02-27 Thread Jay Foad via cfe-commits


@@ -2378,6 +2409,215 @@ bool 
SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx6CacheControl::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  AMDGPU::Waitcnt Wait;
+
+  if (TII->isSMRD(Inst)) { // scalar
+if (Inst.mayStore())
+  return false;
+Wait.DsCnt = 0; // LgkmCnt
+  } else {  // vector
+if (Inst.mayLoad()) {   // vector load
+  if (TII->isVMEM(Inst))// VMEM load
+Wait.LoadCnt = 0;   // VmCnt
+  else if (TII->isFLAT(Inst)) { // Flat load
+Wait.LoadCnt = 0;   // VmCnt
+Wait.DsCnt = 0; // LgkmCnt
+  } else// LDS load
+Wait.DsCnt = 0; // LgkmCnt
+} else {// vector store
+  if (TII->isVMEM(Inst))// VMEM store
+Wait.LoadCnt = 0;   // VmCnt
+  else if (TII->isFLAT(Inst)) { // Flat store
+Wait.LoadCnt = 0;   // VmCnt
+Wait.DsCnt = 0; // LgkmCnt
+  } else
+Wait.DsCnt = 0; // LDS store; LgkmCnt
+}
+  }
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  return true;
+}
+
+bool SIGfx6CacheControl::handleAtomicForPreciseMemory(
+MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) {
+  assert(MI->mayLoadOrStore());
+
+  AMDGPU::Waitcnt Wait;
+
+  Wait.LoadCnt = 0; // VmCnt
+  Wait.DsCnt = 0;   // LgkmCnt
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  return true;
+}
+
+bool SIGfx10CacheControl::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  AMDGPU::Waitcnt Wait;
+
+  bool BuildWaitCnt = true;
+  bool BuildVsCnt = false;
+
+  if (TII->isSMRD(Inst)) { // scalar
+if (Inst.mayStore())
+  return false;
+Wait.DsCnt = 0; // LgkmCnt
+  } else {  // vector
+if (Inst.mayLoad()) {   // vector load
+  if (TII->isVMEM(Inst))// VMEM load
+Wait.LoadCnt = 0;   // VmCnt
+  else if (TII->isFLAT(Inst)) { // Flat load
+Wait.LoadCnt = 0;   // VmCnt
+Wait.DsCnt = 0; // LgkmCnt
+  } else// LDS load
+Wait.DsCnt = 0; // LgkmCnt
+}
+
+// For some vector instructions, mayLoad() and mayStore() can be both true.
+if (Inst.mayStore()) { // vector store; an instruction can be both
+   // load/store
+  if (TII->isVMEM(Inst)) { // VMEM store
+if (!Inst.mayLoad())
+  BuildWaitCnt = false;
+BuildVsCnt = true;
+  } else if (TII->isFLAT(Inst)) { // Flat store
+Wait.DsCnt = 0;   // LgkmCnt
+BuildVsCnt = true;
+  } else
+Wait.DsCnt = 0; // LDS store; LgkmCnt
+}
+  }
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  if (BuildWaitCnt) {
+unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+--MI;
+  }
+
+  if (BuildVsCnt) {
+BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+.addImm(0);
+--MI;
+  }
+  return true;
+}
+
+bool SIGfx10CacheControl ::handleAtomicForPreciseMemory(
+MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) {
+  assert(MI->mayLoadOrStore());
+
+  AMDGPU::Waitcnt Wait;
+
+  Wait.DsCnt = 0; // LgkmCnt
+  if (IsAtomicWithRet)
+Wait.LoadCnt = 0; // VmCnt
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  if (!IsAtomicWithRet) {
+BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+.addImm(0);
+--MI;
+  }
+  return true;
+}
+
+bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  unsigned WaitType = 0;
+  // For some vector instructions, mayLoad() and mayStore() can be both true.

jayfoad wrote:

What kind of (non-atomic) instructions is this supposed to handle?

https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-02-27 Thread Jay Foad via cfe-commits


@@ -2378,6 +2409,215 @@ bool 
SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx6CacheControl::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  AMDGPU::Waitcnt Wait;
+
+  if (TII->isSMRD(Inst)) { // scalar
+if (Inst.mayStore())
+  return false;
+Wait.DsCnt = 0; // LgkmCnt
+  } else {  // vector
+if (Inst.mayLoad()) {   // vector load
+  if (TII->isVMEM(Inst))// VMEM load
+Wait.LoadCnt = 0;   // VmCnt
+  else if (TII->isFLAT(Inst)) { // Flat load
+Wait.LoadCnt = 0;   // VmCnt
+Wait.DsCnt = 0; // LgkmCnt
+  } else// LDS load
+Wait.DsCnt = 0; // LgkmCnt
+} else {// vector store
+  if (TII->isVMEM(Inst))// VMEM store
+Wait.LoadCnt = 0;   // VmCnt
+  else if (TII->isFLAT(Inst)) { // Flat store
+Wait.LoadCnt = 0;   // VmCnt
+Wait.DsCnt = 0; // LgkmCnt
+  } else
+Wait.DsCnt = 0; // LDS store; LgkmCnt
+}
+  }
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  return true;
+}
+
+bool SIGfx6CacheControl::handleAtomicForPreciseMemory(
+MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) {
+  assert(MI->mayLoadOrStore());
+
+  AMDGPU::Waitcnt Wait;
+
+  Wait.LoadCnt = 0; // VmCnt
+  Wait.DsCnt = 0;   // LgkmCnt
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  return true;
+}
+
+bool SIGfx10CacheControl::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  AMDGPU::Waitcnt Wait;
+
+  bool BuildWaitCnt = true;
+  bool BuildVsCnt = false;
+
+  if (TII->isSMRD(Inst)) { // scalar
+if (Inst.mayStore())
+  return false;
+Wait.DsCnt = 0; // LgkmCnt
+  } else {  // vector
+if (Inst.mayLoad()) {   // vector load
+  if (TII->isVMEM(Inst))// VMEM load
+Wait.LoadCnt = 0;   // VmCnt
+  else if (TII->isFLAT(Inst)) { // Flat load
+Wait.LoadCnt = 0;   // VmCnt
+Wait.DsCnt = 0; // LgkmCnt
+  } else// LDS load
+Wait.DsCnt = 0; // LgkmCnt
+}
+
+// For some vector instructions, mayLoad() and mayStore() can be both true.
+if (Inst.mayStore()) { // vector store; an instruction can be both
+   // load/store
+  if (TII->isVMEM(Inst)) { // VMEM store
+if (!Inst.mayLoad())
+  BuildWaitCnt = false;
+BuildVsCnt = true;
+  } else if (TII->isFLAT(Inst)) { // Flat store
+Wait.DsCnt = 0;   // LgkmCnt
+BuildVsCnt = true;
+  } else
+Wait.DsCnt = 0; // LDS store; LgkmCnt
+}
+  }
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  if (BuildWaitCnt) {
+unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+--MI;
+  }
+
+  if (BuildVsCnt) {
+BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+.addImm(0);
+--MI;
+  }
+  return true;
+}
+
+bool SIGfx10CacheControl ::handleAtomicForPreciseMemory(
+MachineBasicBlock::iterator &MI, bool IsAtomicWithRet) {
+  assert(MI->mayLoadOrStore());
+
+  AMDGPU::Waitcnt Wait;
+
+  Wait.DsCnt = 0; // LgkmCnt
+  if (IsAtomicWithRet)
+Wait.LoadCnt = 0; // VmCnt
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  if (!IsAtomicWithRet) {
+BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+.addImm(0);
+--MI;
+  }
+  return true;
+}
+
+bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  unsigned WaitType = 0;
+  // For some vector instructions, mayLoad() and mayStore() can be both true.
+  bool LoadAndStore = false;
+
+  if (TII->isSMRD(Inst)) { // scalar
+if (Inst.mayStore())
+  return false;
+
+WaitType = AMDGPU::S_WAIT_KMCNT;
+  } else { // vector
+if (Inst.mayLoad() && Inst.mayStore()) {
+  WaitType = AMDGPU::S_WAIT_LOADCNT;
+  LoadAndStore = true;
+} else if (Inst.mayLoad()) { // vector load
+  if (TII->isVMEM(In

[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-02-27 Thread Jay Foad via cfe-commits

https://github.com/jayfoad edited 
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-02-27 Thread Jay Foad via cfe-commits


@@ -355,6 +356,18 @@ class SICacheControl {
MachineBasicBlock::iterator &MI) const {
 return false;
   }
+
+public:
+  // The following is for supporting precise memory mode. When the feature
+  // precise-memory is enabled, an s_waitcnt instruction is inserted
+  // after each memory instruction.
+
+  virtual bool
+  handleNonAtomicForPreciseMemory(MachineBasicBlock::iterator &MI) = 0;
+  /// Handles atomic instruction \p MI with \p IsAtomicWithRet indicating
+  /// whether \p MI returns a result.
+  virtual bool handleAtomicForPreciseMemory(MachineBasicBlock::iterator &MI,

jayfoad wrote:

This function is never even called.

https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] Allow w64 ballot to be used on w32 targets (PR #80183)

2024-02-01 Thread Jay Foad via cfe-commits

jayfoad wrote:

After this change, is there any value in having two different builtins? You 
could just have one that always returns 64 bits.

https://github.com/llvm/llvm-project/pull/80183
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] Check wavefrontsize for GFX11 WMMA builtins (PR #79980)

2024-02-01 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/79980
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)

2024-01-24 Thread Jay Foad via cfe-commits


@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const 
SIMemOpInfo &MOI,
   return Changed;
 }
 
+bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  IsaVersion IV = getIsaVersion(ST.getCPU());
+
+  bool Changed = false;
+
+  for (auto &MBB : MF) {
+for (auto MI = MBB.begin(); MI != MBB.end();) {
+  MachineInstr &Inst = *MI;
+  ++MI;
+  if (Inst.mayLoadOrStore() == false)
+continue;
+
+  // Todo: if next insn is an s_waitcnt
+  AMDGPU::Waitcnt Wait;
+
+  if (!(Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic)) {
+if (TII->isSMRD(Inst)) {  // scalar

jayfoad wrote:

This logic would need updating again for GFX12. It seems like it's duplicating 
a lot of knowledge which is already implemented in SIInsertWaitcnts.

https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-24 Thread Jay Foad via cfe-commits

https://github.com/jayfoad edited 
https://github.com/llvm/llvm-project/pull/78729
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-24 Thread Jay Foad via cfe-commits

https://github.com/jayfoad approved this pull request.

LGTM.

https://github.com/llvm/llvm-project/pull/78729
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU][GFX12] Add tests for unsupported builtins (PR #78729)

2024-01-24 Thread Jay Foad via cfe-commits


@@ -4,10 +4,114 @@
 
 typedef unsigned int uint;
 
-kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) {
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+
+typedef float  v2f   __attribute__((ext_vector_type(2)));
+typedef float  v4f   __attribute__((ext_vector_type(4)));
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v32f  __attribute__((ext_vector_type(32)));
+typedef half   v4h   __attribute__((ext_vector_type(4)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef int    v2i   __attribute__((ext_vector_type(2)));
+typedef int    v4i   __attribute__((ext_vector_type(4)));
+typedef int    v16i  __attribute__((ext_vector_type(16)));
+typedef int    v32i  __attribute__((ext_vector_type(32)));
+typedef short  v2s   __attribute__((ext_vector_type(2)));
+typedef short  v4s   __attribute__((ext_vector_type(4)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef short  v16s  __attribute__((ext_vector_type(16)));
+typedef short  v32s  __attribute__((ext_vector_type(32)));
+typedef double v4d   __attribute__((ext_vector_type(4)));
+
+void builtin_test_unsupported(global v32f*out_v32f,
+  global v16f*out_v16f,
+  global v4f* out_v4f,
+  global v32i*out_v32i,
+  global v16i*out_v16i,
+  global v4i* out_v4i,
+  global v4d* out_v4d,
+  global double*  out_double,
+  double a_double , double b_double , double 
c_double,

jayfoad wrote:

Nit: you don't really need separate out/a/b/c versions of all these types. You 
could just test expressions like:
```
x_v32f = __builtin_amdgcn_mfma_f32_32x32x1f32(x_float, x_float, x_v32f, 0, 0, 
0);
```

https://github.com/llvm/llvm-project/pull/78729
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [llvm] [AMDGPU] Update SITargetLowering::getAddrModeArguments (PR #78740)

2024-01-24 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/78740

>From c7636536d65a3792223e083dc5bacd0a8e6ff3d7 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Fri, 19 Jan 2024 16:06:00 +
Subject: [PATCH] [AMDGPU] Update SITargetLowering::getAddrModeArguments

Handle every intrinsic for which getTgtMemIntrinsic returns with
Info.ptrVal set to one of the intrinsic's operands. A bunch of these
cases were missing.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 36 +++
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cc0c4d4e36eaa8e..66ae9222fb50c89 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1406,31 +1406,41 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo 
&Info,
 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
 SmallVectorImpl &Ops,
 Type *&AccessTy) const {
+  Value *Ptr = nullptr;
   switch (II->getIntrinsicID()) {
-  case Intrinsic::amdgcn_ds_ordered_add:
-  case Intrinsic::amdgcn_ds_ordered_swap:
+  case Intrinsic::amdgcn_atomic_cond_sub_u32:
   case Intrinsic::amdgcn_ds_append:
   case Intrinsic::amdgcn_ds_consume:
   case Intrinsic::amdgcn_ds_fadd:
-  case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax:
-  case Intrinsic::amdgcn_global_atomic_fadd:
+  case Intrinsic::amdgcn_ds_fmin:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_flat_atomic_fadd:
-  case Intrinsic::amdgcn_flat_atomic_fmin:
+  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
   case Intrinsic::amdgcn_flat_atomic_fmax:
-  case Intrinsic::amdgcn_flat_atomic_fmin_num:
   case Intrinsic::amdgcn_flat_atomic_fmax_num:
+  case Intrinsic::amdgcn_flat_atomic_fmin:
+  case Intrinsic::amdgcn_flat_atomic_fmin_num:
+  case Intrinsic::amdgcn_global_atomic_csub:
+  case Intrinsic::amdgcn_global_atomic_fadd:
   case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
-  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
-  case Intrinsic::amdgcn_global_atomic_csub: {
-Value *Ptr = II->getArgOperand(0);
-AccessTy = II->getType();
-Ops.push_back(Ptr);
-return true;
-  }
+  case Intrinsic::amdgcn_global_atomic_fmax:
+  case Intrinsic::amdgcn_global_atomic_fmax_num:
+  case Intrinsic::amdgcn_global_atomic_fmin:
+  case Intrinsic::amdgcn_global_atomic_fmin_num:
+  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
+Ptr = II->getArgOperand(0);
+break;
+  case Intrinsic::amdgcn_global_load_lds:
+Ptr = II->getArgOperand(1);
+break;
   default:
 return false;
   }
+  AccessTy = II->getType();
+  Ops.push_back(Ptr);
+  return true;
 }
 
 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[mlir] [clang] [llvm] [AMDGPU] Add GFX12 WMMA and SWMMAC instructions (PR #77795)

2024-01-24 Thread Jay Foad via cfe-commits

jayfoad wrote:

> Also need to be updated:
> 
> https://github.com/llvm/llvm-project/blob/bb6a4850553dd4140a5bd63187ec1b14d0b731f9/llvm/lib/Target/AMDGPU/SMInstructions.td#L14

What needs to be updated and why?

https://github.com/llvm/llvm-project/pull/77795
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [flang] [llvm] [clang] [compiler-rt] [AMDGPU] Fold operand after shrinking instruction in SIFoldOperands (PR #68426)

2024-01-29 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/68426
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [clang] [AMDGPU] Update SITargetLowering::getAddrModeArguments (PR #78740)

2024-01-29 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/78740
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (PR #77438)

2024-01-30 Thread Jay Foad via cfe-commits

jayfoad wrote:

> @jayfoad, can you link to the documentation where these new registers are 
> described? Preferably from a comment in the top of the file(s). It would make 
> it easier to review for correctness.

ISA documentation will be linked from 
https://llvm.org/docs/AMDGPUUsage.html#additional-documentation when it is made 
public.

https://github.com/llvm/llvm-project/pull/77438
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] Check wavefrontsize for GFX11 WMMA builtins (PR #79980)

2024-01-30 Thread Jay Foad via cfe-commits

https://github.com/jayfoad created 
https://github.com/llvm/llvm-project/pull/79980

None

>From cace712a8f379df3498dd76bc1f95eb4671e997c Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Tue, 30 Jan 2024 11:04:33 +
Subject: [PATCH] [AMDGPU] Check wavefrontsize for GFX11 WMMA builtins

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  | 34 +--
 .../builtins-amdgcn-wmma-w32-gfx10-err.cl | 16 -
 .../builtins-amdgcn-wmma-w64-gfx10-err.cl | 18 +-
 .../CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl |  2 +-
 4 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 74dfd1d214e8..e9dd8dcd0b60 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -292,23 +292,23 @@ 
TARGET_BUILTIN(__builtin_amdgcn_s_wait_event_export_ready, "v", "n", "gfx11-inst
 // Postfix w32 indicates the builtin requires wavefront size of 32.
 // Postfix w64 indicates the builtin requires wavefront size of 64.
 
//===--===//
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", 
"nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", 
"nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, 
"V16hV16hV16hV16hIb", "nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, 
"V16sV16sV16sV16sIb", "nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, 
"V16hV16hV16hV16hIb", "nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32, 
"V16sV16sV16sV16sIb", "nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, 
"V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, 
"V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts")
-
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", 
"nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", 
"nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", 
"nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, 
"V8sV16sV16sV8sIb", "nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, 
"V8hV16hV16hV8hIb", "nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64, 
"V8sV16sV16sV8sIb", "nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, 
"V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, 
"V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", 
"nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", 
"nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, 
"V16hV16hV16hV16hIb", "nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, 
"V16sV16sV16sV16sIb", "nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, 
"V16hV16hV16hV16hIb", "nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32, 
"V16sV16sV16sV16sIb", "nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, 
"V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, 
"V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", 
"nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", 
"nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", 
"nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, 
"V8sV16sV16sV8sIb", "nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, 
"V8hV16hV16hV8hIb", "nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64, 
"V8sV16sV16sV8sIb", "nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, 
"V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, 
"V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts,wavefrontsize64")
 
 TARGET_BUILTIN(__builtin_amdgcn_s_sendmsg_rtn, "UiUIi", "n", "gfx11-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl, "UWiUIi", "n", "gfx11-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-e

[clang] [AMDGPU] Check wavefrontsize for GFX11 WMMA builtins (PR #79980)

2024-01-30 Thread Jay Foad via cfe-commits


@@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* 
out4f, v16h a16h, v16h b
 global v8s* out8s, v4i a4i, v4i 
b4i, v8s c8s,
 global v4i* out4i, v2i a2i, v2i 
b2i, v4i c4i)
 {
- *out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f);  // 
expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target 
feature gfx11-insts}}
- *out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f);  // 
expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target 
feature gfx11-insts}}
- *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); 
// expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target 
feature gfx11-insts}}
- *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); 
// expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target 
feature gfx11-insts}}
- *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, 
true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' 
needs target feature gfx11-insts}}
- *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, 
true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' 
needs target feature gfx11-insts}}
- *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, 
c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' 
needs target feature gfx11-insts}}
- *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, 
c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' 
needs target feature gfx11-insts}}
+ *out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f);  // 
expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target 
feature gfx11-insts,wavefrontsize64}}
+ *out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f);  // 
expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target 
feature gfx11-insts,wavefrontsize64}}
+ *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); 
// expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target 
feature gfx11-insts,wavefrontsize64}}
+ *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); 
// expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target 
feature gfx11-insts,wavefrontsize64}}
+ *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, 
true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' 
needs target feature gfx11-insts,wavefrontsize64}}
+ *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, 
true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' 
needs target feature gfx11-insts,wavefrontsize64}}
+ *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, 
c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' 
needs target feature gfx11-insts,wavefrontsize64}}
+ *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, 
c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' 
needs target feature gfx11-insts,wavefrontsize64}}
 }
 
-#endif
\ No newline at end of file
+#endif

jayfoad wrote:

Yes. My editor did that. Previously there was no newline on the end of the 
`#endif`. Lots of tools flag that as unusual.

https://github.com/llvm/llvm-project/pull/79980
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] Check wavefrontsize for GFX11 WMMA builtins (PR #79980)

2024-01-30 Thread Jay Foad via cfe-commits

jayfoad wrote:

> Do you think it makes sense to add two gfx11 tests where _w32 variant is now 
> rejected with w64, and _w64 variant rejected with w32?

Maybe, but I didn't have the energy to add yet more tests.

> Maybe what is being printed in *-gfx10-err.cl test is enough, though.

Right, that was my thinking.

https://github.com/llvm/llvm-project/pull/79980
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] Add GFX12 __builtin_amdgcn_s_sleep_var (PR #77926)

2024-01-12 Thread Jay Foad via cfe-commits

https://github.com/jayfoad created 
https://github.com/llvm/llvm-project/pull/77926

None

>From 3d4b8547514f2315130599230e769a8c73be01c3 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Fri, 12 Jan 2024 12:43:16 +
Subject: [PATCH] [AMDGPU] Add GFX12 __builtin_amdgcn_s_sleep_var

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  1 +
 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 15 +++
 2 files changed, 16 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..d0c4b664bf0313 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -410,6 +410,7 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", 
"nc", "fp8-insts")
 // GFX12+ only builtins.
 
//===--===//
 
+TARGET_BUILTIN(__builtin_amdgcn_s_sleep_var, "vUi", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_permlane16_var,  "UiUiUiUiIbIb", "nc", 
"gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_permlanex16_var, "UiUiUiUiIbIb", "nc", 
"gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 2899d9e5c28898..ebd367bba0cdc1 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -5,6 +5,21 @@
 
 typedef unsigned int uint;
 
+// CHECK-LABEL: @test_s_sleep_var(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:call void @llvm.amdgcn.s.sleep.var(i32 [[TMP0]])
+// CHECK-NEXT:call void @llvm.amdgcn.s.sleep.var(i32 15)
+// CHECK-NEXT:ret void
+//
+void test_s_sleep_var(int d)
+{
+  __builtin_amdgcn_s_sleep_var(d);
+  __builtin_amdgcn_s_sleep_var(15);
+}
+
 // CHECK-LABEL: @test_permlane16_var(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5)

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [clang] [AMDGPU][GFX12] Add Atomic cond_sub_u32 (PR #76224)

2024-01-15 Thread Jay Foad via cfe-commits

jayfoad wrote:

> Adding support in atomicrmw. This will require adding a new operation to 
> atomicrmw "cond_sub"

Yes, and we have (Matt has) done this in the past, but it will require a wider 
consensus. I think it's fine to add AMDGPU intrinsics for this in the meantime.

https://github.com/llvm/llvm-project/pull/76224
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [AMDGPU] Disable V_MAD_U64_U32/V_MAD_I64_I32 workaround for GFX12 (PR #77927)

2024-01-17 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/77927

>From 3f3bcdb89adf032e26c95807abf5e3b23ff50e4a Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Fri, 12 Jan 2024 12:24:28 +
Subject: [PATCH 1/2] Precommit extra GFX12 test coverage

---
 .../GlobalISel/inst-select-mad_64_32.mir  |  21 ++
 llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 163 ++
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 211 ++
 3 files changed, 395 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
index 698281caca245e9..6e33ef37397d6b4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
@@ -1,6 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -mcpu=gfx1030 -run-pass=instruction-select 
-global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o 
- 2>%t | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select 
-global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o 
- 2>%t | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select 
-global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o 
- 2>%t | FileCheck -check-prefix=GFX12 %s
 
 ---
 name: mad_u64_u32_vvv
@@ -18,6 +19,7 @@ body: |
 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
 ; GFX10-NEXT: [[V_MAD_U64_U32_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_U64_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_e64 [[COPY]], 
[[COPY1]], [[COPY2]], 0, implicit $exec
 ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_e64_]], implicit 
[[V_MAD_U64_U32_e64_1]]
+;
 ; GFX11-LABEL: name: mad_u64_u32_vvv
 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
 ; GFX11-NEXT: {{  $}}
@@ -26,6 +28,15 @@ body: |
 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
 ; GFX11-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = 
V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
 ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit 
[[V_MAD_U64_U32_gfx11_e64_1]]
+;
+; GFX12-LABEL: name: mad_u64_u32_vvv
+; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+; GFX12-NEXT: {{  $}}
+; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
+; GFX12-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = 
V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
+; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit 
[[V_MAD_U64_U32_gfx11_e64_1]]
 %0:vgpr(s32) = COPY $vgpr0
 %1:vgpr(s32) = COPY $vgpr1
 %2:vgpr(s32) = COPY $vgpr2
@@ -51,6 +62,7 @@ body: |
 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
 ; GFX10-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_e64 [[COPY]], 
[[COPY1]], [[COPY2]], 0, implicit $exec
 ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_e64_]], implicit 
[[V_MAD_I64_I32_e64_1]]
+;
 ; GFX11-LABEL: name: mad_i64_i32_vvv
 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
 ; GFX11-NEXT: {{  $}}
@@ -59,6 +71,15 @@ body: |
 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
 ; GFX11-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = 
V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
 ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit 
[[V_MAD_I64_I32_gfx11_e64_1]]
+;
+; GFX12-LABEL: name: mad_i64_i32_vvv
+; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+; GFX12-NEXT: {{  $}}
+; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
+; GFX12-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = 
V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
+; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit 
[[V_MAD_I64_I32_gfx11_e64_1]]
 %0:vgpr(s32) = COPY $vgpr0
 %1:vgpr(s32) = COPY $vgpr1
 %2:vgpr(s32) = COPY $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 249acec639540b3..b9b03e52ec865c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -march=amdgcn -

[llvm] [clang] [clang-tools-extra] [AMDGPU] Disable V_MAD_U64_U32/V_MAD_I64_I32 workaround for GFX12 (PR #77927)

2024-01-17 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/77927

>From 3f3bcdb89adf032e26c95807abf5e3b23ff50e4a Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Fri, 12 Jan 2024 12:24:28 +
Subject: [PATCH 1/3] Precommit extra GFX12 test coverage

---
 .../GlobalISel/inst-select-mad_64_32.mir  |  21 ++
 llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 163 ++
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 211 ++
 3 files changed, 395 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
index 698281caca245e..6e33ef37397d6b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
@@ -1,6 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -mcpu=gfx1030 -run-pass=instruction-select 
-global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o 
- 2>%t | FileCheck -check-prefix=GFX10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select 
-global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o 
- 2>%t | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select 
-global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o 
- 2>%t | FileCheck -check-prefix=GFX12 %s
 
 ---
 name: mad_u64_u32_vvv
@@ -18,6 +19,7 @@ body: |
 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
 ; GFX10-NEXT: [[V_MAD_U64_U32_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_U64_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_e64 [[COPY]], 
[[COPY1]], [[COPY2]], 0, implicit $exec
 ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_e64_]], implicit 
[[V_MAD_U64_U32_e64_1]]
+;
 ; GFX11-LABEL: name: mad_u64_u32_vvv
 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
 ; GFX11-NEXT: {{  $}}
@@ -26,6 +28,15 @@ body: |
 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
 ; GFX11-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = 
V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
 ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit 
[[V_MAD_U64_U32_gfx11_e64_1]]
+;
+; GFX12-LABEL: name: mad_u64_u32_vvv
+; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+; GFX12-NEXT: {{  $}}
+; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
+; GFX12-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = 
V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
+; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit 
[[V_MAD_U64_U32_gfx11_e64_1]]
 %0:vgpr(s32) = COPY $vgpr0
 %1:vgpr(s32) = COPY $vgpr1
 %2:vgpr(s32) = COPY $vgpr2
@@ -51,6 +62,7 @@ body: |
 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
 ; GFX10-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_e64 [[COPY]], 
[[COPY1]], [[COPY2]], 0, implicit $exec
 ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_e64_]], implicit 
[[V_MAD_I64_I32_e64_1]]
+;
 ; GFX11-LABEL: name: mad_i64_i32_vvv
 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
 ; GFX11-NEXT: {{  $}}
@@ -59,6 +71,15 @@ body: |
 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
 ; GFX11-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = 
V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
 ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit 
[[V_MAD_I64_I32_gfx11_e64_1]]
+;
+; GFX12-LABEL: name: mad_i64_i32_vvv
+; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+; GFX12-NEXT: {{  $}}
+; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
+; GFX12-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, 
[[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = 
V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
+; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit 
[[V_MAD_I64_I32_gfx11_e64_1]]
 %0:vgpr(s32) = COPY $vgpr0
 %1:vgpr(s32) = COPY $vgpr1
 %2:vgpr(s32) = COPY $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 249acec639540b..b9b03e52ec865c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu

[libcxx] [clang] [libc] [llvm] [clang-tools-extra] [flang] [compiler-rt] [AMDGPU] Fix llvm.amdgcn.s.wait.event.export.ready for GFX12 (PR #78191)

2024-01-17 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/78191

>From 9990fbc26ed3dc245a5127345326050acac49d66 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Fri, 21 Apr 2023 10:46:43 +0100
Subject: [PATCH] [AMDGPU] Fix llvm.amdgcn.s.wait.event.export.ready for GFX12

The meaning of bit 0 of the immediate operand of S_WAIT_EVENT has been
flipped from GFX11.
---
 llvm/lib/Target/AMDGPU/SOPInstructions.td| 8 
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll | 9 ++---
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td 
b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 46fa3d57a21cb2..b78d900c9bbf42 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1768,10 +1768,10 @@ def : GCNPat<
   (S_SEXT_I32_I16 $src)
 >;
 
-def : GCNPat <
-  (int_amdgcn_s_wait_event_export_ready),
-(S_WAIT_EVENT (i16 0))
->;
+let SubtargetPredicate = isNotGFX12Plus in
+  def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 
0))>;
+let SubtargetPredicate = isGFX12Plus in
+  def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 
1))>;
 
 // The first 10 bits of the mode register are the core FP mode on all
 // subtargets.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
index 3e95e4dec67a2b..25b5ddcf946b35 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
@@ -1,8 +1,11 @@
-; RUN: llc -global-isel=0 -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 < 
%s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 < %s 
| FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 < 
%s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel=1 -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 < 
%s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel=0 -march=amdgcn -verify-machineinstrs -mcpu=gfx1200 < 
%s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=1 -march=amdgcn -verify-machineinstrs -mcpu=gfx1200 < 
%s | FileCheck -check-prefixes=GCN,GFX12 %s
 
 ; GCN-LABEL: {{^}}test_wait_event:
-; GCN: s_wait_event 0x0
+; GFX11: s_wait_event 0x0
+; GFX12: s_wait_event 0x1
 
 define amdgpu_ps void @test_wait_event() #0 {
 entry:

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [clang] [llvm] [AMDGPU] Disable V_MAD_U64_U32/V_MAD_I64_I32 workaround for GFX12 (PR #77927)

2024-01-17 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/77927
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[flang] [libc] [llvm] [clang-tools-extra] [clang] [compiler-rt] [libcxx] [AMDGPU] Fix llvm.amdgcn.s.wait.event.export.ready for GFX12 (PR #78191)

2024-01-17 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/78191
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [llvm] [AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on GFX12 (PR #77929)

2024-01-17 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/77929

>From 4299ba898449f782c642b0c27f0ec9970aee0a1c Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Fri, 12 Jan 2024 11:34:02 +
Subject: [PATCH 1/2] [AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on
 GFX12

---
 llvm/lib/Target/AMDGPU/AMDGPU.td|  3 ++-
 llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir  |  1 +
 llvm/test/MC/AMDGPU/gfx12_asm_features.s| 17 +
 .../Disassembler/AMDGPU/gfx12_dasm_features.txt | 13 +
 4 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index b27edb1e9e14bb..682ca6c57c973b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1502,7 +1502,8 @@ def FeatureISAVersion12 : FeatureSet<
FeatureHasRestrictedSOffset,
FeatureVGPRSingleUseHintInsts,
FeatureMADIntraFwdBug,
-   FeatureScalarDwordx3Loads]>;
+   FeatureScalarDwordx3Loads,
+   FeatureDPPSrc1SGPR]>;
 
 
//===----------------------------------------------------------------------===//
 
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir 
b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index fe1345e29f133d..7d081a1491da6e 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -1,5 +1,6 @@
 # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine 
-verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
 # RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine 
-verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=gcn-dpp-combine 
-verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
 
 ---
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_features.s 
b/llvm/test/MC/AMDGPU/gfx12_asm_features.s
index 7e58bdb3b444e1..da4464c6494dbf 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_features.s
@@ -1,5 +1,22 @@
 // RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck 
--check-prefix=GFX12 %s
 
+//
+// Subtargets allow src1 of VOP3 DPP instructions to be SGPR or inlinable
+// constant.
+//
+
+v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf
+// GFX1150: encoding: 
[0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf
+// GFX1150: encoding: 
[0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
+v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1150: encoding: 
[0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
+
+v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1150: encoding: 
[0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
+
 //
 // Elements of CPol operand can be given in any order
 //
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
new file mode 100644
index 00..2c64522422ad0d
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | 
FileCheck -check-prefixes=GFX12 %s
+
+# GFX12: v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: 
[0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff
+
+# GFX12: v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: 
[0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff
+
+# GFX12: v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05
+
+# GFX12: v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05

>From a65834ad3d8aed3e9cb1414d7576d5244a31f8a2 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Wed, 17 Jan 2024 14:39:09 +
Subject: [PATCH 2/2] More tests

---
 llvm/test/MC/AMDGPU/gfx1150_asm_features.s | 6 ++
 llvm/test/MC/AMDGPU/gfx12_asm_features.s   | 6 ++
 llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt | 6 ++
 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt   | 6 ++
 4 files changed, 24 insertions(+)

diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s 
b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
index a4904c40b40ae7..55c855175a89e0 100644
--- a/llvm/test/MC/AMDGPU/gfx115

[clang-tools-extra] [llvm] [clang] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (PR #77438)

2024-01-17 Thread Jay Foad via cfe-commits

jayfoad wrote:

@Pierre-vh @arsen ping! (Sorry, I know it has only been a few days.)

https://github.com/llvm/llvm-project/pull/77438
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [llvm] [AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on GFX12 (PR #77929)

2024-01-17 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/77929
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang-tools-extra] [AMDGPU] Work around s_getpc_b64 zero extending on GFX12 (PR #78186)

2024-01-18 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/78186

>From d3f4ebf849f6ef1ea373e5c7f93398db6681b2b6 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Mon, 15 Jan 2024 15:02:08 +
Subject: [PATCH 1/4] Add GFX11/12 test coverage

---
 llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 103 +-
 1 file changed, 77 insertions(+), 26 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll 
b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index 598d7a8033c2e54..2c1baeeeda21697 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -1,32 +1,83 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 
-verify-machineinstrs < %s | FileCheck %s
-
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 
-verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -stress-regalloc=2 
-verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -stress-regalloc=2 
-verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
 
 define void @test_remat_s_getpc_b64() {
-; CHECK-LABEL: test_remat_s_getpc_b64:
-; CHECK:   ; %bb.0: ; %entry
-; CHECK-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:v_writelane_b32 v0, s30, 0
-; CHECK-NEXT:s_getpc_b64 s[4:5]
-; CHECK-NEXT:v_writelane_b32 v0, s31, 1
-; CHECK-NEXT:;;#ASMSTART
-; CHECK-NEXT:;;#ASMEND
-; CHECK-NEXT:;;#ASMSTART
-; CHECK-NEXT:;;#ASMEND
-; CHECK-NEXT:s_getpc_b64 s[4:5]
-; CHECK-NEXT:v_mov_b32_e32 v1, s4
-; CHECK-NEXT:v_mov_b32_e32 v2, s5
-; CHECK-NEXT:global_store_dwordx2 v[1:2], v[1:2], off
-; CHECK-NEXT:v_readlane_b32 s31, v0, 1
-; CHECK-NEXT:v_readlane_b32 s30, v0, 0
-; CHECK-NEXT:s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:s_waitcnt vmcnt(0)
-; CHECK-NEXT:s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_remat_s_getpc_b64:
+; GFX9:   ; %bb.0: ; %entry
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:v_writelane_b32 v0, s30, 0
+; GFX9-NEXT:s_getpc_b64 s[4:5]
+; GFX9-NEXT:v_writelane_b32 v0, s31, 1
+; GFX9-NEXT:;;#ASMSTART
+; GFX9-NEXT:;;#ASMEND
+; GFX9-NEXT:;;#ASMSTART
+; GFX9-NEXT:;;#ASMEND
+; GFX9-NEXT:s_getpc_b64 s[4:5]
+; GFX9-NEXT:v_mov_b32_e32 v1, s4
+; GFX9-NEXT:v_mov_b32_e32 v2, s5
+; GFX9-NEXT:global_store_dwordx2 v[1:2], v[1:2], off
+; GFX9-NEXT:v_readlane_b32 s31, v0, 1
+; GFX9-NEXT:v_readlane_b32 s30, v0, 0
+; GFX9-NEXT:s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:s_waitcnt vmcnt(0)
+; GFX9-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_remat_s_getpc_b64:
+; GFX11:   ; %bb.0: ; %entry
+; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GFX11-NEXT:s_mov_b32 exec_lo, s0
+; GFX11-NEXT:v_writelane_b32 v0, s30, 0
+; GFX11-NEXT:s_getpc_b64 s[0:1]
+; GFX11-NEXT:;;#ASMSTART
+; GFX11-NEXT:;;#ASMEND
+; GFX11-NEXT:v_writelane_b32 v0, s31, 1
+; GFX11-NEXT:;;#ASMSTART
+; GFX11-NEXT:;;#ASMEND
+; GFX11-NEXT:s_getpc_b64 s[0:1]
+; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(VALU_DEP_2)
+; GFX11-NEXT:v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:v_readlane_b32 s31, v0, 1
+; GFX11-NEXT:v_readlane_b32 s30, v0, 0
+; GFX11-NEXT:global_store_b64 v[1:2], v[1:2], off
+; GFX11-NEXT:s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT:s_mov_b32 exec_lo, s0
+; GFX11-NEXT:s_waitcnt vmcnt(0)
+; GFX11-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_remat_s_getpc_b64:
+; GFX12:   ; %bb.0: ; %entry
+; GFX12-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GFX12-NEXT:s_mov_b32 exec_lo, s0
+; GFX12-NEXT:v_writelane_b32 v0, s30, 0
+; GFX12-NEXT:s_getpc_b64 s[0:1]
+; GFX12-NEXT:;;#ASMSTART
+; GFX12-NEXT:;;#ASMEND
+; GFX12-NEXT:v_writelane_b32 v0, s31, 1
+; GFX12-NEXT:;;#ASMSTART
+; GFX12-

[clang] [AMDGPU] Add GFX12 __builtin_amdgcn_s_sleep_var (PR #77926)

2024-01-18 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/77926
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [llvm] [AMDGPU] Work around s_getpc_b64 zero extending on GFX12 (PR #78186)

2024-01-18 Thread Jay Foad via cfe-commits

https://github.com/jayfoad updated 
https://github.com/llvm/llvm-project/pull/78186

>From d3f4ebf849f6ef1ea373e5c7f93398db6681b2b6 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Mon, 15 Jan 2024 15:02:08 +
Subject: [PATCH 1/4] Add GFX11/12 test coverage

---
 llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 103 +-
 1 file changed, 77 insertions(+), 26 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll 
b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index 598d7a8033c2e54..2c1baeeeda21697 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -1,32 +1,83 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 
-verify-machineinstrs < %s | FileCheck %s
-
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 
-verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -stress-regalloc=2 
-verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -stress-regalloc=2 
-verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
 
 define void @test_remat_s_getpc_b64() {
-; CHECK-LABEL: test_remat_s_getpc_b64:
-; CHECK:   ; %bb.0: ; %entry
-; CHECK-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:v_writelane_b32 v0, s30, 0
-; CHECK-NEXT:s_getpc_b64 s[4:5]
-; CHECK-NEXT:v_writelane_b32 v0, s31, 1
-; CHECK-NEXT:;;#ASMSTART
-; CHECK-NEXT:;;#ASMEND
-; CHECK-NEXT:;;#ASMSTART
-; CHECK-NEXT:;;#ASMEND
-; CHECK-NEXT:s_getpc_b64 s[4:5]
-; CHECK-NEXT:v_mov_b32_e32 v1, s4
-; CHECK-NEXT:v_mov_b32_e32 v2, s5
-; CHECK-NEXT:global_store_dwordx2 v[1:2], v[1:2], off
-; CHECK-NEXT:v_readlane_b32 s31, v0, 1
-; CHECK-NEXT:v_readlane_b32 s30, v0, 0
-; CHECK-NEXT:s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:s_waitcnt vmcnt(0)
-; CHECK-NEXT:s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_remat_s_getpc_b64:
+; GFX9:   ; %bb.0: ; %entry
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:v_writelane_b32 v0, s30, 0
+; GFX9-NEXT:s_getpc_b64 s[4:5]
+; GFX9-NEXT:v_writelane_b32 v0, s31, 1
+; GFX9-NEXT:;;#ASMSTART
+; GFX9-NEXT:;;#ASMEND
+; GFX9-NEXT:;;#ASMSTART
+; GFX9-NEXT:;;#ASMEND
+; GFX9-NEXT:s_getpc_b64 s[4:5]
+; GFX9-NEXT:v_mov_b32_e32 v1, s4
+; GFX9-NEXT:v_mov_b32_e32 v2, s5
+; GFX9-NEXT:global_store_dwordx2 v[1:2], v[1:2], off
+; GFX9-NEXT:v_readlane_b32 s31, v0, 1
+; GFX9-NEXT:v_readlane_b32 s30, v0, 0
+; GFX9-NEXT:s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:s_waitcnt vmcnt(0)
+; GFX9-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_remat_s_getpc_b64:
+; GFX11:   ; %bb.0: ; %entry
+; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GFX11-NEXT:s_mov_b32 exec_lo, s0
+; GFX11-NEXT:v_writelane_b32 v0, s30, 0
+; GFX11-NEXT:s_getpc_b64 s[0:1]
+; GFX11-NEXT:;;#ASMSTART
+; GFX11-NEXT:;;#ASMEND
+; GFX11-NEXT:v_writelane_b32 v0, s31, 1
+; GFX11-NEXT:;;#ASMSTART
+; GFX11-NEXT:;;#ASMEND
+; GFX11-NEXT:s_getpc_b64 s[0:1]
+; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(VALU_DEP_2)
+; GFX11-NEXT:v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:v_readlane_b32 s31, v0, 1
+; GFX11-NEXT:v_readlane_b32 s30, v0, 0
+; GFX11-NEXT:global_store_b64 v[1:2], v[1:2], off
+; GFX11-NEXT:s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT:s_mov_b32 exec_lo, s0
+; GFX11-NEXT:s_waitcnt vmcnt(0)
+; GFX11-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_remat_s_getpc_b64:
+; GFX12:   ; %bb.0: ; %entry
+; GFX12-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GFX12-NEXT:s_mov_b32 exec_lo, s0
+; GFX12-NEXT:v_writelane_b32 v0, s30, 0
+; GFX12-NEXT:s_getpc_b64 s[0:1]
+; GFX12-NEXT:;;#ASMSTART
+; GFX12-NEXT:;;#ASMEND
+; GFX12-NEXT:v_writelane_b32 v0, s31, 1
+; GFX12-NEXT:;;#ASMSTART
+; GFX12-

[clang-tools-extra] [clang] [llvm] [AMDGPU] Work around s_getpc_b64 zero extending on GFX12 (PR #78186)

2024-01-18 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/78186
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [clang] [llvm] [AMDGPU] CodeGen for GFX12 S_WAIT_* instructions (PR #77438)

2024-01-18 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/77438
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [clang] [llvm] [AMDGPU] Update uses of new VOP2 pseudos for GFX12 (PR #78155)

2024-01-18 Thread Jay Foad via cfe-commits


@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck 
--check-prefixes=SI %s

jayfoad wrote:

Done as part of a merge from main to fix conflicts.

https://github.com/llvm/llvm-project/pull/78155
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang-tools-extra] [AMDGPU] Update uses of new VOP2 pseudos for GFX12 (PR #78155)

2024-01-18 Thread Jay Foad via cfe-commits

https://github.com/jayfoad closed 
https://github.com/llvm/llvm-project/pull/78155
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Add GFX12 WMMA and SWMMAC instructions (PR #77795)

2024-01-19 Thread Jay Foad via cfe-commits

jayfoad wrote:

Some of the tests in this patch need regenerating now that #77438 has been 
merged.

https://github.com/llvm/llvm-project/pull/77795
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


  1   2   3   >