[clang] AMDGPU: Implement builtins for gfx1250 wmma instructions (PR #148991)

Changpeng Fang via cfe-commits Tue, 15 Jul 2025 16:50:39 -0700

https://github.com/changpeng updated 
https://github.com/llvm/llvm-project/pull/148991


>From 074800e1906bcce1cc0110c759a6d141ce4ea322 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.f...@amd.com>
Date: Tue, 15 Jul 2025 16:37:20 -0700
Subject: [PATCH 1/2] AMDGPU: Implement builtins for gfx1250 wmma instructions

Co-Authored-by: Stanislav Mekhanoshin <stanislav.mekhanos...@amd.com>
Co-Authored-by: Shilei Tian <shilei.t...@amd.com>
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  40 ++
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   | 199 +++++++-
 .../builtins-amdgcn-gfx1250-wmma-w32.cl       | 433 ++++++++++++++++++
 ...ins-amdgcn-error-gfx1250-wmma-w32-param.cl | 242 ++++++++++
 4 files changed, 913 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
 create mode 100644 
clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 71e4b3486167a..29e1e99bba9ef 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -676,5 +676,45 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", 
"gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
 
+// GFX1250 WMMA builtins
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, 
"V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, 
"V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x32_bf16, 
"V8yIbV16yIbV16yIsV8yIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16, 
"V8yIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8, 
"V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8, 
"V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8, 
"V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8, 
"V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8, 
"V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8, 
"V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8, 
"V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8, 
"V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, "V8iIbV8iIbV8iV8iIbIb", 
"nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, 
"V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, 
"V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, 
"V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8, 
"V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8, 
"V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8, 
"V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8, 
"V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8, 
"V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, 
"V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, 
"V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", 
"nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_bf16, 
"V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x64_bf16, 
"V8yIbV16yIbV32yV8yiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16, 
"V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8, 
"V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8, 
"V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8, 
"V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8, 
"V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8, 
"V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8, 
"V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8, 
"V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8, 
"V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, 
"V8iIbV8iIbV16iV8iiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_f16, 
"V8fIbV16hIbV32hV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x64_f16, 
"V8hIbV16hIbV32hV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp 
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 0d8c2ed284994..e1f9cbe7aea26 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -822,7 +822,46 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
   case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
-  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
+  // GFX1250 WMMA builtins
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
 
     // These operations perform a matrix multiplication and accumulation of
     // the form:
@@ -837,6 +876,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
     // "false".
     bool AppendFalseForOpselArg = false;
     unsigned BuiltinWMMAOp;
+    // Need return type when D and C are of different types.
+    bool NeedReturnType = false;
 
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
@@ -975,6 +1016,160 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
       ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
       BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
       break;
+    // GFX1250 WMMA builtins
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
+      ArgsForMatchingMatrixTypes = {5, 1};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
+      ArgsForMatchingMatrixTypes = {5, 1};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
+      ArgsForMatchingMatrixTypes = {5, 1};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
+      ArgsForMatchingMatrixTypes = {5, 1};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
+      ArgsForMatchingMatrixTypes = {5, 1};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
+      NeedReturnType = true;
+      ArgsForMatchingMatrixTypes = {1, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
+      break;
+      case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
+      ArgsForMatchingMatrixTypes = {3, 0};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
+      ArgsForMatchingMatrixTypes = {4, 1};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
+      ArgsForMatchingMatrixTypes = {3, 0, 1};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
+      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
+      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
+      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
+      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
+      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
+      ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
+      ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
+      break;
     }
 
     SmallVector<Value *, 6> Args;
@@ -984,6 +1179,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
       Args.push_back(Builder.getFalse());
 
     SmallVector<llvm::Type *, 6> ArgTypes;
+    if (NeedReturnType)
+      ArgTypes.push_back(ConvertType(E->getType()));
     for (auto ArgIdx : ArgsForMatchingMatrixTypes)
       ArgTypes.push_back(Args[ArgIdx]->getType());
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
new file mode 100644
index 0000000000000..e4ef3defdb341
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
@@ -0,0 +1,433 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 
-target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK-GFX1250
+
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v8f   __attribute__((ext_vector_type(8)));
+typedef float  v2f   __attribute__((ext_vector_type(2)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h  __attribute__((ext_vector_type(32)));
+typedef __bf16 v32bf16 __attribute__((ext_vector_type(32)));
+typedef __bf16 v16bf16 __attribute__((ext_vector_type(16)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
+typedef int    v16i   __attribute__((ext_vector_type(16)));
+typedef int    v8i   __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x4_f32(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 false, <2 x float> [[A:%.*]], 
i1 false, <2 x float> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 
true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x4_f32(global v8f* out, v2f a, v2f b, v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_bf16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 false, <16 x bfloat> 
[[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 
true, i1 false)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, 
v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, true, 
false);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16_16x16x32_bf16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> 
@llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 false, <16 x bfloat> 
[[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x bfloat> [[C:%.*]], i1 
false, i1 false)
+// CHECK-GFX1250-NEXT:    store <8 x bfloat> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, 
v16bf16 b, v8bf16 c)
+{
+  *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, false, 
false);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16f32_16x16x32_bf16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> 
@llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16.v8f32(i1 false, <16 x 
bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x float> 
[[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x bfloat> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_bf16f32_16x16x32_bf16(global v8bf16* out, v16bf16 a, 
v16bf16 b, v8f c)
+{
+  *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, 0, c, false, 
true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_fp8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x 
i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x64_fp8_fp8(global v8f* out, v8i a, v8i b, v8f 
c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, 0, c, true, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_fp8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x 
i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x64_fp8_bf8(global v8f* out, v8i a, v8i b, v8f 
c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_bf8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x 
i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x64_bf8_fp8(global v8f* out, v8i a, v8i b, v8f 
c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_bf8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x 
i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x64_bf8_bf8(global v8f* out, v8i a, v8i b, v8f 
c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_fp8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x 
i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x64_fp8_fp8(global v8h* out, v8i a, v8i b, v8h 
c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_fp8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x 
i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x64_fp8_bf8(global v8h* out, v8i a, v8i b, v8h 
c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_bf8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x 
i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x64_bf8_fp8(global v8h* out, v8i a, v8i b, v8h 
c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_bf8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x 
i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h 
c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], 
i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c)
+{
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_f16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> 
[[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 
false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x32_f16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 false, <16 x half> 
[[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 
false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x32_f16(global v8h* out, v16h a, v16h b, v8h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_fp8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 
x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x128_fp8_fp8(global v8h* out, v16i a, v16i b, 
v8h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_fp8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 
x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x128_fp8_bf8(global v8h* out, v16i a, v16i b, 
v8h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_bf8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 
x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x128_bf8_fp8(global v8h* out, v16i a, v16i b, 
v8h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_bf8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 
x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f16_16x16x128_bf8_bf8(global v8h* out, v16i a, v16i b, 
v8h c)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_fp8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 
x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x128_fp8_fp8(global v8f* out, v16i a, v16i b, 
v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, 0, c, true, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_fp8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 
x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x128_fp8_bf8(global v8f* out, v16i a, v16i b, 
v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_bf8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 
x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x128_bf8_fp8(global v8f* out, v16i a, v16i b, 
v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_bf8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 
x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x128_bf8_bf8(global v8f* out, v16i a, v16i b, 
v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <16 x float> 
@llvm.amdgcn.wmma.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 
x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]])
+// CHECK-GFX1250-NEXT:    store <16 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 64, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i 
b, v16f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, 0, c);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_bf16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x 
bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], 
i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 
b, v8f c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, 0, b, c, index, 
false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_bf16_16x16x64_bf16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> 
@llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i32(i1 false, <16 
x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x bfloat> [[C:%.*]], 
i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x bfloat> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_bf16_16x16x64_bf16(global v8bf16* out, v16bf16 a, 
v32bf16 b, v8bf16 c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, 0, b, c, index, 
false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_bf16f32_16x16x64_bf16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, 
<16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> 
[[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_bf16f32_16x16x64_bf16(global v8f* out, v16bf16 a, 
v32bf16 b, v8f c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, 0, b, c, index, 
false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_fp8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> 
[[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 
false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f32_16x16x128_fp8_fp8(global v8f* out, v8i a, v16i b, 
v8f c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8(a, b, c, index, false, 
true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_fp8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> 
[[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 
false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f32_16x16x128_fp8_bf8(global v8f* out, v8i a, v16i b, 
v8f c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8(a, b, c, index, false, 
true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_bf8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> 
[[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 
false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f32_16x16x128_bf8_fp8(global v8f* out, v8i a, v16i b, 
v8f c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8(a, b, c, index, false, 
true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_bf8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> 
[[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 
false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f32_16x16x128_bf8_bf8(global v8f* out, v8i a, v16i b, 
v8f c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8(a, b, c, index, false, 
true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_fp8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> 
[[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 
false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f16_16x16x128_fp8_fp8(global v8h* out, v8i a, v16i b, 
v8h c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8(a, b, c, index, false, 
true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_fp8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> 
[[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 
false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f16_16x16x128_fp8_bf8(global v8h* out, v8i a, v16i b, 
v8h c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8(a, b, c, index, false, 
true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_bf8_fp8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> 
[[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 
false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f16_16x16x128_bf8_fp8(global v8h* out, v8i a, v16i b, 
v8h c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8(a, b, c, index, false, 
true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_bf8_bf8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> 
[[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 
false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f16_16x16x128_bf8_bf8(global v8h* out, v8i a, v16i b, 
v8h c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8(a, b, c, index, false, 
true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_i32_16x16x128_iu8(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 false, <8 x 
i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 
[[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, v8i a, v16i b, v8i 
c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, 
false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_f16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i32(i1 false, <16 x 
half> [[A:%.*]], i1 false, <32 x half> [[B:%.*]], <8 x float> [[C:%.*]], i32 
[[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f32_16x16x64_f16(global v8f* out, v16h a, v32h b, v8f 
c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, 0, b, c, index, false, 
true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x64_f16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i32(i1 false, <16 x 
half> [[A:%.*]], i1 false, <32 x half> [[B:%.*]], <8 x half> [[C:%.*]], i32 
[[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) 
[[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_swmmac_f16_16x16x64_f16(global v8h* out, v16h a, v32h b, v8h 
c, int index)
+{
+  *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, 0, b, c, index, false, 
true);
+}
diff --git 
a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
new file mode 100644
index 0000000000000..55d705e6ad238
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
@@ -0,0 +1,242 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -verify 
-emit-llvm -o - %s
+
+typedef float  v16f  __attribute__((ext_vector_type(16)));
+typedef float  v8f   __attribute__((ext_vector_type(8)));
+typedef float  v2f   __attribute__((ext_vector_type(2)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef half   v16h  __attribute__((ext_vector_type(16)));
+typedef half   v32h   __attribute__((ext_vector_type(32)));
+typedef __bf16 v32bf16 __attribute__((ext_vector_type(32)));
+typedef __bf16 v16bf16 __attribute__((ext_vector_type(16)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
+typedef int    v16i   __attribute__((ext_vector_type(16)));
+typedef int    v8i   __attribute__((ext_vector_type(8)));
+
+void test_amdgcn_wmma_f32_16x16x4_f32(global v8f* out, v2f a, v2f b, v8f c, 
int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(mod, a, 0, b, 0, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, mod, b, 0, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant 
integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant 
integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, 
v8f c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(mod, a, 0, b, 0, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, mod, b, 0, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, mod, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, false, 
mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, 
v16bf16 b, v8bf16 c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(mod, a, 0, b, 0, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be 
a constant integer}}
+  *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, mod, b, 0, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be 
a constant integer}}
+  *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be 
a constant integer}}
+  *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, mod, 
false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be 
a constant integer}}
+  *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, false, 
mod); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_bf16f32_16x16x32_bf16(global v8bf16* out, v16bf16 a, 
v16bf16 b, v8f c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(mod, a, 0, b, 0, c, 
false, false); // expected-error 
{{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, mod, b, 0, c, 
false, false); // expected-error 
{{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, mod, c, 
false, false); // expected-error 
{{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, 0, c, mod, 
false); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, 0, c, false, 
mod); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be 
a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x64_fp8_fp8(global v8f* out, v8i a, v8i b, v8f 
c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x64_fp8_bf8(global v8f* out, v8i a, v8i b, v8f 
c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x64_bf8_fp8(global v8f* out, v8i a, v8i b, v8f 
c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x64_bf8_bf8(global v8f* out, v8i a, v8i b, v8f 
c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x64_fp8_fp8(global v8h* out, v8i a, v8i b, v8h 
c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x64_fp8_bf8(global v8h* out, v8i a, v8i b, v8h 
c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x64_bf8_fp8(global v8h* out, v8i a, v8i b, v8h 
c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h 
c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c, 
int mod)
+{
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(mod, a, 0, b, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, mod, b, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, mod, false); // 
expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant 
integer}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, mod); // 
expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant 
integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c, 
int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(mod, a, 0, b, 0, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, mod, b, 0, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant 
integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant 
integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x32_f16(global v8h* out, v16h a, v16h b, v8h c, 
int mod)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(mod, a, 0, b, 0, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, mod, b, 0, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant 
integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant 
integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x128_fp8_fp8(global v8h* out, v16i a, v16i b, 
v8h c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x128_fp8_bf8(global v8h* out, v16i a, v16i b, 
v8h c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x128_bf8_fp8(global v8h* out, v16i a, v16i b, 
v8h c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x128_bf8_bf8(global v8h* out, v16i a, v16i b, 
v8h c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x128_fp8_fp8(global v8f* out, v16i a, v16i b, 
v8f c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x128_fp8_bf8(global v8f* out, v16i a, v16i b, 
v8f c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x128_bf8_fp8(global v8f* out, v16i a, v16i b, 
v8f c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x128_bf8_bf8(global v8f* out, v16i a, v16i b, 
v8f c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, mod, c, false, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8' must 
be a constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, 0, c, mod, false); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8' must be a 
constant integer}}
+  *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, 0, c, false, mod); 
// expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8' must be a 
constant integer}}
+}
+
+void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i 
b, v16f c, int mod)
+{
+  *out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, mod, c); // 
expected-error {{'__builtin_amdgcn_wmma_f32_32x16x128_f4' must be a constant 
integer}}
+}
+
+void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 
b, v8f c, int index, int mod)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(mod, a, 0, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, mod, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, 0, b, c, index, mod, 
false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be 
a constant integer}}
+  *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, 0, b, c, index, 
false, mod); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' 
must be a constant integer}}
+}
+
+void test_amdgcn_swmmac_bf16_16x16x64_bf16(global v8bf16* out, v16bf16 a, 
v32bf16 b, v8bf16 c, int index, int mod)
+{
+  *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(mod, a, 0, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, mod, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, 0, b, c, index, mod, 
false); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' must 
be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, 0, b, c, index, 
false, mod); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' 
must be a constant integer}}
+}
+
+void test_amdgcn_swmmac_bf16f32_16x16x64_bf16(global v8f* out, v16bf16 a, 
v32bf16 b, v8f c, int index, int mod)
+{
+  *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(mod, a, 0, b, c, index, 
false, false); // expected-error 
{{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, mod, b, c, index, 
false, false); // expected-error 
{{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, 0, b, c, index, 
mod, false); // expected-error 
{{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, 0, b, c, index, 
false, mod); // expected-error 
{{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}}
+}
+
+void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, v8i a, v16i b, v8i 
c, int index, int mod)
+{
+  *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(mod, a, 0, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, mod, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, mod, 
false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be 
a constant integer}}
+  *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, 
false, mod); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' 
must be a constant integer}}
+}
+
+void test_amdgcn_swmmac_f32_16x16x64_f16(global v8f* out, v16h a, v32h b, v8f 
c, int index, int mod)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(mod, a, 0, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, mod, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, 0, b, c, index, mod, 
false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' must be 
a constant integer}}
+  *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, 0, b, c, index, false, 
mod); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' must be a 
constant integer}}
+}
+
+void test_amdgcn_swmmac_f16_16x16x64_f16(global v8h* out, v16h a, v32h b, v8h 
c, int index, int mod)
+{
+  *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(mod, a, 0, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, mod, b, c, index, 
false, false); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' 
must be a constant integer}}
+  *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, 0, b, c, index, mod, 
false); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' must be 
a constant integer}}
+  *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, 0, b, c, index, false, 
mod); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' must be a 
constant integer}}
+}

>From 16f76445dba1c7b362a1e06dd808e9874d1bac83 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.f...@amd.com>
Date: Tue, 15 Jul 2025 16:50:00 -0700
Subject: [PATCH 2/2] AMDGPU: Implement builtins for gfx1250 wmma instructions

 Fix a format error
---
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp 
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index e1f9cbe7aea26..dcfdea648e93c 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -1090,7 +1090,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
       ArgsForMatchingMatrixTypes = {3, 0};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
       break;
-      case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
       ArgsForMatchingMatrixTypes = {3, 0};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
       break;

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] AMDGPU: Implement builtins for gfx1250 wmma instructions (PR #148991)

Reply via email to