[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)

2024-07-02 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub created 
https://github.com/llvm/llvm-project/pull/97402

Adds support for sm_100 (Blackwell), similar to 
https://github.com/llvm/llvm-project/pull/74895

One important aspect is that `sm_100` is compatible with `sm_90` but not with 
`sm_90a` - note the defines in "BuiltinsNVPTX.def".
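
As a rough illustration (not part of the patch): the SM_90 macro in "BuiltinsNVPTX.def" 
expands to "sm_90|sm_90a|sm_100", so builtins gated on SM_90 remain available when 
compiling for sm_100, while builtins gated on the plain SM_90a string do not. The 
sketch below is a hedged assumption about how user code would observe this; the 
function name and the compile command are illustrative, and __CUDA_ARCH_FEAT_SM90_ALL 
is the macro Clang defines only for sm_90a (visible in the NVPTX.cpp hunks later in 
this thread).

// Hedged CUDA sketch, assuming device compilation with e.g.
//   clang -x cuda --cuda-gpu-arch=sm_100 ...
// __device__ is spelled out as in clang/test/CodeGen/builtins-nvptx.c.
#define __device__ __attribute__((device))

__device__ void arch_probe() {
#if __CUDA_ARCH__ >= 900
  // Builtins gated on SM_90 (e.g. __nvvm_getctarank_shared_cluster)
  // are available on sm_90, sm_90a, and sm_100 device compilations.
#endif
#if defined(__CUDA_ARCH_FEAT_SM90_ALL)
  // sm_90a-only builtins sit behind this feature macro, which is not
  // defined for sm_100 - this is the incompatibility noted above.
#endif
}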

>From 02e1acff6ffa1ddc3a26b0edc7e89923ac38978f Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Tue, 2 Jul 2024 02:44:56 -0700
Subject: [PATCH] [CUDA] Add support for CUDA-12.6 and sm_100

---
 clang/docs/ReleaseNotes.rst |  3 ++-
 clang/include/clang/Basic/BuiltinsNVPTX.def | 10 --
 clang/include/clang/Basic/Cuda.h|  4 +++-
 clang/lib/Basic/Cuda.cpp|  4 
 clang/lib/Basic/Targets/NVPTX.cpp   |  2 ++
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp|  1 +
 clang/lib/Driver/ToolChains/Cuda.cpp|  3 +++
 clang/test/Misc/target-invalid-cpu-note.c   |  2 +-
 llvm/lib/Target/NVPTX/NVPTX.td  |  5 +++--
 9 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c720e47dbe35b..3c10ee51550d9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1057,7 +1057,8 @@ CUDA/HIP Language Changes
 
 CUDA Support
 
-- Clang now supports CUDA SDK up to 12.5
+- Clang now supports CUDA SDK up to 12.6
+- Added support for sm_100
 
 AIX Support
 ^^^
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 504314d8d96e9..3f383bc89ee70 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -27,8 +27,10 @@
 #pragma push_macro("SM_89")
 #pragma push_macro("SM_90")
 #pragma push_macro("SM_90a")
+#pragma push_macro("SM_100")
+#define SM_100 "sm_100"
 #define SM_90a "sm_90a"
-#define SM_90 "sm_90|" SM_90a
+#define SM_90 "sm_90|" SM_90a "|" SM_100
 #define SM_89 "sm_89|" SM_90
 #define SM_87 "sm_87|" SM_89
 #define SM_86 "sm_86|" SM_87
@@ -63,7 +65,9 @@
 #pragma push_macro("PTX83")
 #pragma push_macro("PTX84")
 #pragma push_macro("PTX85")
-#define PTX85 "ptx85"
+#pragma push_macro("PTX86")
+#define PTX86 "ptx86"
+#define PTX85 "ptx85|" PTX86
 #define PTX84 "ptx84|" PTX85
 #define PTX83 "ptx83|" PTX84
 #define PTX82 "ptx82|" PTX83
@@ -1075,6 +1079,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", 
"", AND(SM_90,PTX78))
 #pragma pop_macro("SM_89")
 #pragma pop_macro("SM_90")
 #pragma pop_macro("SM_90a")
+#pragma pop_macro("SM_100")
 #pragma pop_macro("PTX42")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
@@ -1097,3 +1102,4 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", 
"", AND(SM_90,PTX78))
 #pragma pop_macro("PTX83")
 #pragma pop_macro("PTX84")
 #pragma pop_macro("PTX85")
+#pragma pop_macro("PTX86")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 83699f8897f66..a18e62620dd5d 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -43,9 +43,10 @@ enum class CudaVersion {
   CUDA_123,
   CUDA_124,
   CUDA_125,
+  CUDA_126,
   FULLY_SUPPORTED = CUDA_123,
   PARTIALLY_SUPPORTED =
-  CUDA_125, // Partially supported. Proceed with a warning.
+  CUDA_126, // Partially supported. Proceed with a warning.
   NEW = 1,  // Too new. Issue a warning, but allow using it.
 };
 const char *CudaVersionToString(CudaVersion V);
@@ -78,6 +79,7 @@ enum class OffloadArch {
   SM_89,
   SM_90,
   SM_90a,
+  SM_100,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index faf3878f064d2..72d9bd89c36e7 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -43,6 +43,7 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
 CUDA_ENTRY(12, 3),
 CUDA_ENTRY(12, 4),
 CUDA_ENTRY(12, 5),
+CUDA_ENTRY(12, 6),
 {"", CudaVersion::NEW, 
llvm::VersionTuple(std::numeric_limits::max())},
 {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
 };
@@ -96,6 +97,7 @@ static const OffloadArchToStringMap arch_names[] = {
 SM(89),  // Ada Lovelace
 SM(90),  // Hopper
 SM(90a), // Hopper
+SM(100), // Blackwell
 GFX(600),  // gfx600
 GFX(601),  // gfx601
 GFX(602),  // gfx602
@@ -221,6 +223,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
 return CudaVersion::CUDA_118;
   case OffloadArch::SM_90a:
 return CudaVersion::CUDA_120;
+  case OffloadArch::SM_100:
+return CudaVersion::CUDA_126;
   default:
 llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index 43b653dc52ce0..88a0dbde52d52 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -281,6 +281,8 @@ void NVPTXTargetInfo::getTargetDefines(const La

[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)

2024-07-02 Thread Sergey Kozub via cfe-commits

sergey-kozub wrote:

@Artem-B, @andportnoy, you might want to take a look.

https://github.com/llvm/llvm-project/pull/97402


[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)

2024-07-02 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub converted_to_draft 
https://github.com/llvm/llvm-project/pull/97402


[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)

2024-07-02 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/97402

>From 02e1acff6ffa1ddc3a26b0edc7e89923ac38978f Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Tue, 2 Jul 2024 02:44:56 -0700
Subject: [PATCH] [CUDA] Add support for CUDA-12.6 and sm_100

---
 clang/docs/ReleaseNotes.rst |  3 ++-
 clang/include/clang/Basic/BuiltinsNVPTX.def | 10 --
 clang/include/clang/Basic/Cuda.h|  4 +++-
 clang/lib/Basic/Cuda.cpp|  4 
 clang/lib/Basic/Targets/NVPTX.cpp   |  2 ++
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp|  1 +
 clang/lib/Driver/ToolChains/Cuda.cpp|  3 +++
 clang/test/Misc/target-invalid-cpu-note.c   |  2 +-
 llvm/lib/Target/NVPTX/NVPTX.td  |  5 +++--
 9 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c720e47dbe35b..3c10ee51550d9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1057,7 +1057,8 @@ CUDA/HIP Language Changes
 
 CUDA Support
 
-- Clang now supports CUDA SDK up to 12.5
+- Clang now supports CUDA SDK up to 12.6
+- Added support for sm_100
 
 AIX Support
 ^^^
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 504314d8d96e9..3f383bc89ee70 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -27,8 +27,10 @@
 #pragma push_macro("SM_89")
 #pragma push_macro("SM_90")
 #pragma push_macro("SM_90a")
+#pragma push_macro("SM_100")
+#define SM_100 "sm_100"
 #define SM_90a "sm_90a"
-#define SM_90 "sm_90|" SM_90a
+#define SM_90 "sm_90|" SM_90a "|" SM_100
 #define SM_89 "sm_89|" SM_90
 #define SM_87 "sm_87|" SM_89
 #define SM_86 "sm_86|" SM_87
@@ -63,7 +65,9 @@
 #pragma push_macro("PTX83")
 #pragma push_macro("PTX84")
 #pragma push_macro("PTX85")
-#define PTX85 "ptx85"
+#pragma push_macro("PTX86")
+#define PTX86 "ptx86"
+#define PTX85 "ptx85|" PTX86
 #define PTX84 "ptx84|" PTX85
 #define PTX83 "ptx83|" PTX84
 #define PTX82 "ptx82|" PTX83
@@ -1075,6 +1079,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", 
"", AND(SM_90,PTX78))
 #pragma pop_macro("SM_89")
 #pragma pop_macro("SM_90")
 #pragma pop_macro("SM_90a")
+#pragma pop_macro("SM_100")
 #pragma pop_macro("PTX42")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
@@ -1097,3 +1102,4 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", 
"", AND(SM_90,PTX78))
 #pragma pop_macro("PTX83")
 #pragma pop_macro("PTX84")
 #pragma pop_macro("PTX85")
+#pragma pop_macro("PTX86")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 83699f8897f66..a18e62620dd5d 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -43,9 +43,10 @@ enum class CudaVersion {
   CUDA_123,
   CUDA_124,
   CUDA_125,
+  CUDA_126,
   FULLY_SUPPORTED = CUDA_123,
   PARTIALLY_SUPPORTED =
-  CUDA_125, // Partially supported. Proceed with a warning.
+  CUDA_126, // Partially supported. Proceed with a warning.
   NEW = 1,  // Too new. Issue a warning, but allow using it.
 };
 const char *CudaVersionToString(CudaVersion V);
@@ -78,6 +79,7 @@ enum class OffloadArch {
   SM_89,
   SM_90,
   SM_90a,
+  SM_100,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index faf3878f064d2..72d9bd89c36e7 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -43,6 +43,7 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
 CUDA_ENTRY(12, 3),
 CUDA_ENTRY(12, 4),
 CUDA_ENTRY(12, 5),
+CUDA_ENTRY(12, 6),
 {"", CudaVersion::NEW, 
llvm::VersionTuple(std::numeric_limits::max())},
 {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
 };
@@ -96,6 +97,7 @@ static const OffloadArchToStringMap arch_names[] = {
 SM(89),  // Ada Lovelace
 SM(90),  // Hopper
 SM(90a), // Hopper
+SM(100), // Blackwell
 GFX(600),  // gfx600
 GFX(601),  // gfx601
 GFX(602),  // gfx602
@@ -221,6 +223,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
 return CudaVersion::CUDA_118;
   case OffloadArch::SM_90a:
 return CudaVersion::CUDA_120;
+  case OffloadArch::SM_100:
+return CudaVersion::CUDA_126;
   default:
 llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index 43b653dc52ce0..88a0dbde52d52 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -281,6 +281,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   case OffloadArch::SM_90:
   case OffloadArch::SM_90a:
 return "900";
+  case OffloadArch::SM_100:
+return "1000";
   }
   llvm_unreachable("unhandled OffloadArch");
 }();

[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)

2024-07-02 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub closed 
https://github.com/llvm/llvm-project/pull/97402


[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)

2024-07-02 Thread Sergey Kozub via cfe-commits

sergey-kozub wrote:

This PR is redundant, closing.

https://github.com/llvm/llvm-project/pull/97402


[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-12 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub created 
https://github.com/llvm/llvm-project/pull/102969

PTX ISA 8.1 supports FP8 conversions:
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt

This PR adds support for the following conversions (see the usage sketch below):

- cvt.rn.satfinite{.relu}.f8x2type.f32 d, a, b;
- cvt.rn.satfinite{.relu}.f8x2type.f16x2 d, a;
- cvt.rn{.relu}.f16x2.f8x2type d, a;

where .f8x2type = { .e4m3x2, .e5m2x2 };
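
To make the mapping concrete, here is a minimal CUDA sketch (not part of the patch) 
using the builtin names as declared in this revision of the patch; later updates in 
this thread rename them to the __nvvm_*_to_* form. It assumes device compilation for 
sm_89 with PTX 8.1 or newer (e.g. clang -x cuda --cuda-gpu-arch=sm_89), and the 
function names are hypothetical.

// __device__ spelled out as in clang/test/CodeGen/builtins-nvptx.c.
#define __device__ __attribute__((device))

// Pack two f32 values into a pair of e4m3 bytes, returned in a 16-bit
// value (lowers to cvt.rn.satfinite.e4m3x2.f32).
__device__ short to_e4m3_pair(float a, float b) {
  return __nvvm_ff_e4m3x2_rn(a, b);
}

// The .relu variant additionally clamps negative values to zero
// (cvt.rn.satfinite.relu.e5m2x2.f32).
__device__ short to_e5m2_pair_relu(float a, float b) {
  return __nvvm_ff_e5m2x2_rn_relu(a, b);
}

// Unpack a pair of e4m3 bytes back into a packed f16x2 value
// (cvt.rn.f16x2.e4m3x2); the result is a vector of two halves.
__device__ void from_e4m3_pair(short packed) {
  (void)__nvvm_e4m3x2_f16x2_rn(packed);
}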

>From 963d07c0a2ece9616f2e5f4ab7aa22b4d18a5cb8 Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Mon, 12 Aug 2024 12:52:01 -0700
Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3,
 e5m2)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 15 
 clang/test/CodeGen/builtins-nvptx.c | 36 +
 llvm/include/llvm/IR/IntrinsicsNVVM.td  | 27 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++
 llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 +
 6 files changed, 222 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 504314d8d96e91..ecbbb1716e0fc5 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", 
AND(SM_80,PTX70))
 
 TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70))
 
+TARGET_BUILTIN(__nvvm_ff_e4m3x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_e5m2x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_f16x2_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_e4m3x2_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e4m3x2_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+
 // Bitcast
 
 BUILTIN(__nvvm_bitcast_f2i, "if", "")
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 75b9d6d1fe1902..9d9f2f31f57e79 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -22,6 +22,9 @@
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_86 -target-feature +ptx72 \
 // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 
-check-prefix=LP64 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_89 -target-feature +ptx81 \
+// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
 
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() {
   // CHECK: ret void
 }
 
+// CHECK-LABEL: nvvm_cvt_sm89
+__device__ void nvvm_cvt_sm89() {
+#if __CUDA_ARCH__ >= 890
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e4m3x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_e4m3x2_rn(1, 1);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e4m3x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_e4m3x2_rn_relu(1, 1);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e5m2x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_e5m2x2_rn(1, 1);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e5m2x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_e5m2x2_rn_relu(1, 1);
+
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e4m3x2.rn(<2 x half> )
+  __nvvm_f16x2_e4m3x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e4m3x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_e4m3x2_rn_relu({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e5m2x2.rn(<2 x half> )
+  __nvvm_f16x2_e5m2x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e5m2x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_e5m2x2_rn_relu({1.0f16, 1.0f16});
+
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.f16x2.rn(i16 18504)
+  __nvvm_e4m3x2_f16x2_rn(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.f16x2.rn.relu(i16 
18504)
+  __nvvm_e4m3x2_f16x2_rn_relu(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.f16x2.rn(i16 19532)
+  __nvvm_e5m2x2_f16x2_rn(0x4c4c);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.f16x2.rn.relu(i16 
19532)
+  __nvvm_e5m2x2_f16x2_rn_relu(0x4c4c);
+#endif
+  // CHECK: ret void

[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-13 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/102969

>From 72b9a5ff64807bf4722a7168e1210f849bef7071 Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Mon, 12 Aug 2024 12:52:01 -0700
Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3,
 e5m2)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 15 
 clang/test/CodeGen/builtins-nvptx.c | 36 +
 llvm/include/llvm/IR/IntrinsicsNVVM.td  | 27 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++
 llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 +
 6 files changed, 222 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 504314d8d96e91..ecbbb1716e0fc5 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", 
AND(SM_80,PTX70))
 
 TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70))
 
+TARGET_BUILTIN(__nvvm_ff_e4m3x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_e5m2x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_f16x2_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_e4m3x2_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e4m3x2_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+
 // Bitcast
 
 BUILTIN(__nvvm_bitcast_f2i, "if", "")
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 75b9d6d1fe1902..9d9f2f31f57e79 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -22,6 +22,9 @@
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_86 -target-feature +ptx72 \
 // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 
-check-prefix=LP64 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_89 -target-feature +ptx81 \
+// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
 
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() {
   // CHECK: ret void
 }
 
+// CHECK-LABEL: nvvm_cvt_sm89
+__device__ void nvvm_cvt_sm89() {
+#if __CUDA_ARCH__ >= 890
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e4m3x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_e4m3x2_rn(1, 1);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e4m3x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_e4m3x2_rn_relu(1, 1);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e5m2x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_e5m2x2_rn(1, 1);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e5m2x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_e5m2x2_rn_relu(1, 1);
+
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e4m3x2.rn(<2 x half> )
+  __nvvm_f16x2_e4m3x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e4m3x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_e4m3x2_rn_relu({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e5m2x2.rn(<2 x half> )
+  __nvvm_f16x2_e5m2x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e5m2x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_e5m2x2_rn_relu({1.0f16, 1.0f16});
+
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.f16x2.rn(i16 18504)
+  __nvvm_e4m3x2_f16x2_rn(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.f16x2.rn.relu(i16 
18504)
+  __nvvm_e4m3x2_f16x2_rn_relu(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.f16x2.rn(i16 19532)
+  __nvvm_e5m2x2_f16x2_rn(0x4c4c);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.f16x2.rn.relu(i16 
19532)
+  __nvvm_e5m2x2_f16x2_rn_relu(0x4c4c);
+#endif
+  // CHECK: ret void
+}
+
 #define NAN32 0x7FBF
 #define NAN16 (__bf16)0x7FBF
 #define BF16 (__bf16)0.1f
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7caada24dad564..042df62dc0dc28 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1296,6 +1296,33 @@ let Tar

[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-13 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/102969

>From a696f131f97370bd8b9ec264d27555c6ace4d027 Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Mon, 12 Aug 2024 12:52:01 -0700
Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3,
 e5m2)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 15 
 clang/test/CodeGen/builtins-nvptx.c | 36 +
 llvm/include/llvm/IR/IntrinsicsNVVM.td  | 27 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++
 llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 +
 6 files changed, 222 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 504314d8d96e91..c11970c279c4bb 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", 
AND(SM_80,PTX70))
 
 TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70))
 
+TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+
 // Bitcast
 
 BUILTIN(__nvvm_bitcast_f2i, "if", "")
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 75b9d6d1fe1902..957cf7616c0411 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -22,6 +22,9 @@
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_86 -target-feature +ptx72 \
 // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 
-check-prefix=LP64 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_89 -target-feature +ptx81 \
+// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
 
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() {
   // CHECK: ret void
 }
 
+// CHECK-LABEL: nvvm_cvt_sm89
+__device__ void nvvm_cvt_sm89() {
+#if __CUDA_ARCH__ >= 890
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_to_e4m3x2_rn(1, 1);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e4m3x2_rn_relu(1, 1);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_to_e5m2x2_rn(1, 1);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e5m2x2_rn_relu(1, 1);
+
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> )
+  __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> )
+  __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16});
+
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504)
+  __nvvm_e4m3x2_to_f16x2_rn(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 
18504)
+  __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532)
+  __nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 
19532)
+  __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
+#endif
+  // CHECK: ret void
+}
+
 #define NAN32 0x7FBF
 #define NAN16 (__bf16)0x7FBF
 #define BF16 (__bf16)0.1f
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7caada24dad564..42dcf08cc65cac 100644
--- a/llvm/i

[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-13 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/102969

>From 7db2478f59c5a4f46df040ed4799da815b28bc43 Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Mon, 12 Aug 2024 12:52:01 -0700
Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3,
 e5m2)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 15 
 clang/test/CodeGen/builtins-nvptx.c | 36 +
 llvm/include/llvm/IR/IntrinsicsNVVM.td  | 27 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++
 llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 +
 6 files changed, 222 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 504314d8d96e91..c11970c279c4bb 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", 
AND(SM_80,PTX70))
 
 TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70))
 
+TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+
 // Bitcast
 
 BUILTIN(__nvvm_bitcast_f2i, "if", "")
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 75b9d6d1fe1902..20399b73e63757 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -22,6 +22,9 @@
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_86 -target-feature +ptx72 \
 // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 
-check-prefix=LP64 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_89 -target-feature +ptx81 \
+// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
 
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() {
   // CHECK: ret void
 }
 
+// CHECK-LABEL: nvvm_cvt_sm89
+__device__ void nvvm_cvt_sm89() {
+#if __CUDA_ARCH__ >= 890
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f);
+
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> )
+  __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> )
+  __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16});
+
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504)
+  __nvvm_e4m3x2_to_f16x2_rn(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 
18504)
+  __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532)
+  __nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 
19532)
+  __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
+#endif
+  // CHECK: ret void
+}
+
 #define NAN32 0x7FBF
 #define NAN16 (__bf16)0x7FBF
 #define BF16 (__bf16)0.1f
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7caada24dad564..42dcf08cc6

[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-13 Thread Sergey Kozub via cfe-commits


@@ -722,6 +722,37 @@ let hasSideEffects = false in {
 
   defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
   defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
+
+  // FP8 conversions.
+  multiclass CVT_TO_F8X2 {
+def _f32 :
+  NVPTXInst<(outs Int16Regs:$dst),
+(ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
+!strconcat("cvt${mode:base}.satfinite${mode:relu}.",
+F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>,
+  Requires<[hasPTX<81>, hasSM<89>]>;
+def _f16x2 :
+  NVPTXInst<(outs Int16Regs:$dst),
+(ins Int32Regs:$src, CvtMode:$mode),
+!strconcat("cvt${mode:base}.satfinite${mode:relu}.",
+F8Name, "x2.f16x2 \t$dst, $src;"), []>,
+  Requires<[hasPTX<81>, hasSM<89>]>;
+  }
+
+  defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">;
+  defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">;
+
+  multiclass CVT_FROM_F8X2 {
+def x2 :
+  NVPTXInst<(outs Int32Regs:$dst),

sergey-kozub wrote:

This one is confusing: "x2" is only there because (for some reason) I cannot 
specify an empty suffix.
All the classes in this file use the `CVT__` pattern, so I need to define two 
classes: CVT_f16x2_e4m3x2 and CVT_f16x2_e5m2.

You're suggesting renaming to `_f16x2`, but I believe "multiclass" only supports 
appending suffixes, whereas I need "f16x2" to appear in the middle of the class 
name (to conform to the common naming pattern).

Also, "outs" is Int32Regs, and I cannot specify <2xf16> here. This is in line 
with the other class definitions in this file.


https://github.com/llvm/llvm-project/pull/102969


[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-13 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/102969

>From ee028cccb41964223b87edb1db88710bac89080a Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Mon, 12 Aug 2024 12:52:01 -0700
Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3,
 e5m2)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 15 
 clang/test/CodeGen/builtins-nvptx.c | 36 +
 llvm/include/llvm/IR/IntrinsicsNVVM.td  | 27 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++
 llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 +
 6 files changed, 222 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 504314d8d96e91..c11970c279c4bb 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", 
AND(SM_80,PTX70))
 
 TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70))
 
+TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+
 // Bitcast
 
 BUILTIN(__nvvm_bitcast_f2i, "if", "")
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 75b9d6d1fe1902..20399b73e63757 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -22,6 +22,9 @@
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_86 -target-feature +ptx72 \
 // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 
-check-prefix=LP64 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_89 -target-feature +ptx81 \
+// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
 
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() {
   // CHECK: ret void
 }
 
+// CHECK-LABEL: nvvm_cvt_sm89
+__device__ void nvvm_cvt_sm89() {
+#if __CUDA_ARCH__ >= 890
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f);
+
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> )
+  __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> )
+  __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16});
+
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504)
+  __nvvm_e4m3x2_to_f16x2_rn(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 
18504)
+  __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532)
+  __nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 
19532)
+  __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
+#endif
+  // CHECK: ret void
+}
+
 #define NAN32 0x7FBF
 #define NAN16 (__bf16)0x7FBF
 #define BF16 (__bf16)0.1f
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7caada24dad564..42dcf08cc6

[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-13 Thread Sergey Kozub via cfe-commits


@@ -722,6 +722,37 @@ let hasSideEffects = false in {
 
   defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
   defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
+
+  // FP8 conversions.
+  multiclass CVT_TO_F8X2 {
+def _f32 :
+  NVPTXInst<(outs Int16Regs:$dst),
+(ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
+!strconcat("cvt${mode:base}.satfinite${mode:relu}.",
+F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>,
+  Requires<[hasPTX<81>, hasSM<89>]>;
+def _f16x2 :
+  NVPTXInst<(outs Int16Regs:$dst),
+(ins Int32Regs:$src, CvtMode:$mode),
+!strconcat("cvt${mode:base}.satfinite${mode:relu}.",
+F8Name, "x2.f16x2 \t$dst, $src;"), []>,
+  Requires<[hasPTX<81>, hasSM<89>]>;
+  }
+
+  defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">;
+  defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">;
+
+  multiclass CVT_FROM_F8X2 {
+def x2 :
+  NVPTXInst<(outs Int32Regs:$dst),

sergey-kozub wrote:

Neat, wasn't aware of the NAME keyword. Updated the PR.

https://github.com/llvm/llvm-project/pull/102969


[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-14 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/102969

>From e74a0de37302baaf89bfe3230f561684ec5777db Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Mon, 12 Aug 2024 12:52:01 -0700
Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3,
 e5m2)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 15 
 clang/test/CodeGen/builtins-nvptx.c | 36 +
 llvm/include/llvm/IR/IntrinsicsNVVM.td  | 27 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++
 llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 +
 6 files changed, 222 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 504314d8d96e91..c11970c279c4bb 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", 
AND(SM_80,PTX70))
 
 TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70))
 
+TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+
 // Bitcast
 
 BUILTIN(__nvvm_bitcast_f2i, "if", "")
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 75b9d6d1fe1902..20399b73e63757 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -22,6 +22,9 @@
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_86 -target-feature +ptx72 \
 // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 
-check-prefix=LP64 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_89 -target-feature +ptx81 \
+// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
 
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() {
   // CHECK: ret void
 }
 
+// CHECK-LABEL: nvvm_cvt_sm89
+__device__ void nvvm_cvt_sm89() {
+#if __CUDA_ARCH__ >= 890
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f);
+
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> )
+  __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> )
+  __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16});
+
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504)
+  __nvvm_e4m3x2_to_f16x2_rn(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 
18504)
+  __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532)
+  __nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 
19532)
+  __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
+#endif
+  // CHECK: ret void
+}
+
 #define NAN32 0x7FBF
 #define NAN16 (__bf16)0x7FBF
 #define BF16 (__bf16)0.1f
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7caada24dad564..42dcf08cc6

[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-14 Thread Sergey Kozub via cfe-commits


@@ -722,6 +722,37 @@ let hasSideEffects = false in {
 
   defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
   defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
+
+  // FP8 conversions.
+  multiclass CVT_TO_F8X2 {
+def _f32 :
+  NVPTXInst<(outs Int16Regs:$dst),
+(ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
+!strconcat("cvt${mode:base}.satfinite${mode:relu}.",
+F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>,
+  Requires<[hasPTX<81>, hasSM<89>]>;
+def _f16x2 :
+  NVPTXInst<(outs Int16Regs:$dst),
+(ins Int32Regs:$src, CvtMode:$mode),
+!strconcat("cvt${mode:base}.satfinite${mode:relu}.",
+F8Name, "x2.f16x2 \t$dst, $src;"), []>,
+  Requires<[hasPTX<81>, hasSM<89>]>;
+  }
+
+  defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">;
+  defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">;
+
+  multiclass CVT_FROM_F8X2 {

sergey-kozub wrote:

Replaced with "class".

https://github.com/llvm/llvm-project/pull/102969


[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-14 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/102969

>From b74c8bc6009fb2f905089345594b13c8bc75ca36 Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Mon, 12 Aug 2024 12:52:01 -0700
Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3,
 e5m2)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 15 
 clang/test/CodeGen/builtins-nvptx.c | 36 +
 llvm/include/llvm/IR/IntrinsicsNVVM.td  | 27 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 29 +++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++
 llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 +
 6 files changed, 220 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 504314d8d96e91..c11970c279c4bb 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", 
AND(SM_80,PTX70))
 
 TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70))
 
+TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn, "sff", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81))
+
+TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81))
+TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81))
+
 // Bitcast
 
 BUILTIN(__nvvm_bitcast_f2i, "if", "")
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 75b9d6d1fe1902..20399b73e63757 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -22,6 +22,9 @@
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_86 -target-feature +ptx72 \
 // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 
-check-prefix=LP64 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown 
-target-cpu sm_89 -target-feature +ptx81 \
+// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
 
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() {
   // CHECK: ret void
 }
 
+// CHECK-LABEL: nvvm_cvt_sm89
+__device__ void nvvm_cvt_sm89() {
+#if __CUDA_ARCH__ >= 890
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.00e+00, 
float 1.00e+00)
+  __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f);
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f);
+
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> )
+  __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> )
+  __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16});
+  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> 
)
+  __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16});
+
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504)
+  __nvvm_e4m3x2_to_f16x2_rn(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 
18504)
+  __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532)
+  __nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
+  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 
19532)
+  __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
+#endif
+  // CHECK: ret void
+}
+
 #define NAN32 0x7FBF
 #define NAN16 (__bf16)0x7FBF
 #define BF16 (__bf16)0.1f
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7caada24dad564..42dcf08cc65

[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-15 Thread Sergey Kozub via cfe-commits

sergey-kozub wrote:

What do I need to do to push this through? I'm still seeing the "1 workflow 
awaiting approval" message.
Should I add some other reviewer(s) who are authorized to run workflows?

https://github.com/llvm/llvm-project/pull/102969


[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-18 Thread Sergey Kozub via cfe-commits

sergey-kozub wrote:

> Looks like it was the clang-format check that GitHub was waiting on approval
> for. I've just clicked that button.

Now it's all green, thank you.
How do I actually upstream it? I don't see any buttons on this PR page that would
allow me to submit it.


https://github.com/llvm/llvm-project/pull/102969


[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)

2024-08-19 Thread Sergey Kozub via cfe-commits

sergey-kozub wrote:

Thanks for pushing this. I'll follow up by adding support in XLA for using these 
instructions.

https://github.com/llvm/llvm-project/pull/102969


[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-21 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub closed 
https://github.com/llvm/llvm-project/pull/123398


[clang] Remove incorrect CUDA defines (PR #123898)

2025-01-22 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub closed 
https://github.com/llvm/llvm-project/pull/123898


[clang] Remove incorrect CUDA defines (PR #123898)

2025-01-22 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/123898

>From 1c4a581d45b622591f5062830f2ff1e33b159a64 Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Wed, 22 Jan 2025 07:38:55 +
Subject: [PATCH] Remove incorrect CUDA defines

---
 clang/include/clang/Basic/Cuda.h | 4 +---
 clang/lib/Basic/Cuda.cpp | 4 +---
 clang/lib/Driver/ToolChains/Cuda.cpp | 6 --
 3 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 1cdfc8178db843..f33ba46233a7ab 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -44,12 +44,10 @@ enum class CudaVersion {
   CUDA_124,
   CUDA_125,
   CUDA_126,
-  CUDA_127,
   CUDA_128,
-  CUDA_129,
   FULLY_SUPPORTED = CUDA_123,
   PARTIALLY_SUPPORTED =
-  CUDA_129, // Partially supported. Proceed with a warning.
+  CUDA_128, // Partially supported. Proceed with a warning.
   NEW = 1,  // Too new. Issue a warning, but allow using it.
 };
 const char *CudaVersionToString(CudaVersion V);
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index b1461429d4f51a..1bfec0b37c5ee8 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -44,9 +44,7 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
 CUDA_ENTRY(12, 4),
 CUDA_ENTRY(12, 5),
 CUDA_ENTRY(12, 6),
-CUDA_ENTRY(12, 7),
 CUDA_ENTRY(12, 8),
-CUDA_ENTRY(12, 9),
 {"", CudaVersion::NEW, 
llvm::VersionTuple(std::numeric_limits::max())},
 {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
 };
@@ -232,7 +230,7 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
 return CudaVersion::CUDA_120;
   case OffloadArch::SM_100:
   case OffloadArch::SM_100a:
-return CudaVersion::CUDA_127;
+return CudaVersion::CUDA_128;
   default:
 llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp 
b/clang/lib/Driver/ToolChains/Cuda.cpp
index 27e1969dabe551..d4099216c81ba8 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -89,12 +89,8 @@ CudaVersion getCudaVersion(uint32_t raw_version) {
 return CudaVersion::CUDA_125;
   if (raw_version < 12070)
 return CudaVersion::CUDA_126;
-  if (raw_version < 12080)
-return CudaVersion::CUDA_127;
   if (raw_version < 12090)
 return CudaVersion::CUDA_128;
-  if (raw_version < 12100)
-return CudaVersion::CUDA_129;
   return CudaVersion::NEW;
 }
 
@@ -688,9 +684,7 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const 
llvm::Triple &Triple,
   case CudaVersion::CUDA_##CUDA_VER:   
\
 PtxFeature = "+ptx" #PTX_VER;  
\
 break;
-CASE_CUDA_VERSION(129, 87);
 CASE_CUDA_VERSION(128, 87);
-CASE_CUDA_VERSION(127, 86);
 CASE_CUDA_VERSION(126, 85);
 CASE_CUDA_VERSION(125, 85);
 CASE_CUDA_VERSION(124, 84);



[clang] Remove incorrect CUDA defines (PR #123898)

2025-01-22 Thread Sergey Kozub via cfe-commits


@@ -89,12 +89,8 @@ CudaVersion getCudaVersion(uint32_t raw_version) {
 return CudaVersion::CUDA_125;
   if (raw_version < 12070)
 return CudaVersion::CUDA_126;
-  if (raw_version < 12080)
-return CudaVersion::CUDA_127;
   if (raw_version < 12090)
 return CudaVersion::CUDA_128;
-  if (raw_version < 12100)
-return CudaVersion::CUDA_129;

sergey-kozub wrote:

Done.

https://github.com/llvm/llvm-project/pull/123898


[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-21 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/123398

>From dc43fbfbd29c1a088b8261cc2bfc7f6f7e5c7c2f Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Fri, 17 Jan 2025 21:00:49 +
Subject: [PATCH] Add support for PTX 8.6 and CUDA 12.6 (12.8)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 5 -
 clang/include/clang/Basic/Cuda.h| 6 +-
 clang/lib/Basic/Cuda.cpp| 8 ++--
 clang/lib/Basic/Targets/NVPTX.cpp   | 3 +++
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 +
 clang/lib/Driver/ToolChains/Cuda.cpp| 9 +
 clang/test/Misc/target-invalid-cpu-note/nvptx.c | 1 +
 llvm/lib/Target/NVPTX/NVPTX.td  | 2 ++
 8 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 969dd9e41ebfa3..37b4e6ff77fda6 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -28,7 +28,9 @@
 #pragma push_macro("SM_90")
 #pragma push_macro("SM_90a")
 #pragma push_macro("SM_100")
-#define SM_100 "sm_100"
+#pragma push_macro("SM_100a")
+#define SM_100a "sm_100a"
+#define SM_100 "sm_100|" SM_100a
 #define SM_90a "sm_90a"
 #define SM_90 "sm_90|" SM_90a "|" SM_100
 #define SM_89 "sm_89|" SM_90
@@ -1091,6 +1093,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", 
"", AND(SM_90,PTX78))
 #pragma pop_macro("SM_90")
 #pragma pop_macro("SM_90a")
 #pragma pop_macro("SM_100")
+#pragma pop_macro("SM_100a")
 #pragma pop_macro("PTX42")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index c2a4addf488df1..1cdfc8178db843 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -44,9 +44,12 @@ enum class CudaVersion {
   CUDA_124,
   CUDA_125,
   CUDA_126,
+  CUDA_127,
+  CUDA_128,
+  CUDA_129,
   FULLY_SUPPORTED = CUDA_123,
   PARTIALLY_SUPPORTED =
-  CUDA_126, // Partially supported. Proceed with a warning.
+  CUDA_129, // Partially supported. Proceed with a warning.
   NEW = 1,  // Too new. Issue a warning, but allow using it.
 };
 const char *CudaVersionToString(CudaVersion V);
@@ -80,6 +83,7 @@ enum class OffloadArch {
   SM_90,
   SM_90a,
   SM_100,
+  SM_100a,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index d56609a2a8f24a..b1461429d4f51a 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -44,6 +44,9 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
 CUDA_ENTRY(12, 4),
 CUDA_ENTRY(12, 5),
 CUDA_ENTRY(12, 6),
+CUDA_ENTRY(12, 7),
+CUDA_ENTRY(12, 8),
+CUDA_ENTRY(12, 9),
 {"", CudaVersion::NEW, 
llvm::VersionTuple(std::numeric_limits::max())},
 {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
 };
@@ -98,6 +101,7 @@ static const OffloadArchToStringMap arch_names[] = {
 SM(90),  // Hopper
 SM(90a), // Hopper
 SM(100), // Blackwell
+SM(100a),// Blackwell
 GFX(600),  // gfx600
 GFX(601),  // gfx601
 GFX(602),  // gfx602
@@ -227,8 +231,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
   case OffloadArch::SM_90a:
 return CudaVersion::CUDA_120;
   case OffloadArch::SM_100:
-return CudaVersion::NEW; // TODO: use specific CUDA version once it's
- // public.
+  case OffloadArch::SM_100a:
+return CudaVersion::CUDA_127;
   default:
 llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index dbc3fec3657610..56efad90cb7c84 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -285,6 +285,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   case OffloadArch::SM_90a:
 return "900";
   case OffloadArch::SM_100:
+  case OffloadArch::SM_100a:
 return "1000";
   }
   llvm_unreachable("unhandled OffloadArch");
@@ -292,6 +293,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
 Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
 if (GPU == OffloadArch::SM_90a)
   Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
+if (GPU == OffloadArch::SM_100a)
+  Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1");
   }
 }
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 87c3635ed3f70e..c13928f61a7481 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const 
OMPRequiresDecl *D) {
   case OffloadArch::SM_90:
   case OffloadArch::SM_90a:
   case OffloadArch

[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-21 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub edited 
https://github.com/llvm/llvm-project/pull/123398
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-19 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/123398

>From 04313ede76d272ec391361b9828e55d8a27b4bda Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Fri, 17 Jan 2025 21:00:49 +
Subject: [PATCH] Add support for PTX 8.6 and CUDA 12.6 (12.8)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 5 -
 clang/include/clang/Basic/Cuda.h| 6 +-
 clang/lib/Basic/Cuda.cpp| 8 ++--
 clang/lib/Basic/Targets/NVPTX.cpp   | 3 +++
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 +
 clang/lib/Driver/ToolChains/Cuda.cpp| 9 +
 llvm/lib/Target/NVPTX/NVPTX.td  | 2 ++
 7 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 969dd9e41ebfa3..37b4e6ff77fda6 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -28,7 +28,9 @@
 #pragma push_macro("SM_90")
 #pragma push_macro("SM_90a")
 #pragma push_macro("SM_100")
-#define SM_100 "sm_100"
+#pragma push_macro("SM_100a")
+#define SM_100a "sm_100a"
+#define SM_100 "sm_100|" SM_100a
 #define SM_90a "sm_90a"
 #define SM_90 "sm_90|" SM_90a "|" SM_100
 #define SM_89 "sm_89|" SM_90
@@ -1091,6 +1093,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", 
"", AND(SM_90,PTX78))
 #pragma pop_macro("SM_90")
 #pragma pop_macro("SM_90a")
 #pragma pop_macro("SM_100")
+#pragma pop_macro("SM_100a")
 #pragma pop_macro("PTX42")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index c2a4addf488df1..1cdfc8178db843 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -44,9 +44,12 @@ enum class CudaVersion {
   CUDA_124,
   CUDA_125,
   CUDA_126,
+  CUDA_127,
+  CUDA_128,
+  CUDA_129,
   FULLY_SUPPORTED = CUDA_123,
   PARTIALLY_SUPPORTED =
-  CUDA_126, // Partially supported. Proceed with a warning.
+  CUDA_129, // Partially supported. Proceed with a warning.
   NEW = 1,  // Too new. Issue a warning, but allow using it.
 };
 const char *CudaVersionToString(CudaVersion V);
@@ -80,6 +83,7 @@ enum class OffloadArch {
   SM_90,
   SM_90a,
   SM_100,
+  SM_100a,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index d56609a2a8f24a..692ab7c319d8bd 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -44,6 +44,9 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
 CUDA_ENTRY(12, 4),
 CUDA_ENTRY(12, 5),
 CUDA_ENTRY(12, 6),
+CUDA_ENTRY(12, 7),
+CUDA_ENTRY(12, 8),
+CUDA_ENTRY(12, 9),
 {"", CudaVersion::NEW, 
llvm::VersionTuple(std::numeric_limits::max())},
 {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
 };
@@ -98,6 +101,7 @@ static const OffloadArchToStringMap arch_names[] = {
 SM(90),  // Hopper
 SM(90a), // Hopper
 SM(100), // Blackwell
+SM(100a),// Blackwell
 GFX(600),  // gfx600
 GFX(601),  // gfx601
 GFX(602),  // gfx602
@@ -227,8 +231,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
   case OffloadArch::SM_90a:
 return CudaVersion::CUDA_120;
   case OffloadArch::SM_100:
-return CudaVersion::NEW; // TODO: use specific CUDA version once it's
- // public.
+  case OffloadArch::SM_100a:
+return CudaVersion::CUDA_128;
   default:
 llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index dbc3fec3657610..56efad90cb7c84 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -285,6 +285,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   case OffloadArch::SM_90a:
 return "900";
   case OffloadArch::SM_100:
+  case OffloadArch::SM_100a:
 return "1000";
   }
   llvm_unreachable("unhandled OffloadArch");
@@ -292,6 +293,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
 Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
 if (GPU == OffloadArch::SM_90a)
   Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
+if (GPU == OffloadArch::SM_100a)
+  Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1");
   }
 }
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 87c3635ed3f70e..c13928f61a7481 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const 
OMPRequiresDecl *D) {
   case OffloadArch::SM_90:
   case OffloadArch::SM_90a:
   case OffloadArch::SM_100:
+  case OffloadArch::SM_100a:
   case OffloadArch::GFX600:
  

[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-19 Thread Sergey Kozub via cfe-commits


@@ -682,6 +688,9 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const 
llvm::Triple &Triple,
   case CudaVersion::CUDA_##CUDA_VER:   
\
 PtxFeature = "+ptx" #PTX_VER;  
\
 break;
+CASE_CUDA_VERSION(129, 86);
+CASE_CUDA_VERSION(128, 86);
+CASE_CUDA_VERSION(127, 85);

sergey-kozub wrote:

Updated, thank you.
Also, changed line 235 in "clang/lib/Basic/Cuda.cpp", as PTX 8.6 supports sm_100a.

https://github.com/llvm/llvm-project/pull/123398
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-20 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub updated 
https://github.com/llvm/llvm-project/pull/123398

>From 2f909002b89628e2bb83391b2287aa00a7ecaaf3 Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Fri, 17 Jan 2025 21:00:49 +
Subject: [PATCH] Add support for PTX 8.6 and CUDA 12.6 (12.8)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 5 -
 clang/include/clang/Basic/Cuda.h| 6 +-
 clang/lib/Basic/Cuda.cpp| 8 ++--
 clang/lib/Basic/Targets/NVPTX.cpp   | 3 +++
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 +
 clang/lib/Driver/ToolChains/Cuda.cpp| 9 +
 llvm/lib/Target/NVPTX/NVPTX.td  | 2 ++
 7 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 969dd9e41ebfa3..37b4e6ff77fda6 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -28,7 +28,9 @@
 #pragma push_macro("SM_90")
 #pragma push_macro("SM_90a")
 #pragma push_macro("SM_100")
-#define SM_100 "sm_100"
+#pragma push_macro("SM_100a")
+#define SM_100a "sm_100a"
+#define SM_100 "sm_100|" SM_100a
 #define SM_90a "sm_90a"
 #define SM_90 "sm_90|" SM_90a "|" SM_100
 #define SM_89 "sm_89|" SM_90
@@ -1091,6 +1093,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", 
"", AND(SM_90,PTX78))
 #pragma pop_macro("SM_90")
 #pragma pop_macro("SM_90a")
 #pragma pop_macro("SM_100")
+#pragma pop_macro("SM_100a")
 #pragma pop_macro("PTX42")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index c2a4addf488df1..1cdfc8178db843 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -44,9 +44,12 @@ enum class CudaVersion {
   CUDA_124,
   CUDA_125,
   CUDA_126,
+  CUDA_127,
+  CUDA_128,
+  CUDA_129,
   FULLY_SUPPORTED = CUDA_123,
   PARTIALLY_SUPPORTED =
-  CUDA_126, // Partially supported. Proceed with a warning.
+  CUDA_129, // Partially supported. Proceed with a warning.
   NEW = 1,  // Too new. Issue a warning, but allow using it.
 };
 const char *CudaVersionToString(CudaVersion V);
@@ -80,6 +83,7 @@ enum class OffloadArch {
   SM_90,
   SM_90a,
   SM_100,
+  SM_100a,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index d56609a2a8f24a..b1461429d4f51a 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -44,6 +44,9 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
 CUDA_ENTRY(12, 4),
 CUDA_ENTRY(12, 5),
 CUDA_ENTRY(12, 6),
+CUDA_ENTRY(12, 7),
+CUDA_ENTRY(12, 8),
+CUDA_ENTRY(12, 9),
 {"", CudaVersion::NEW, 
llvm::VersionTuple(std::numeric_limits::max())},
 {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
 };
@@ -98,6 +101,7 @@ static const OffloadArchToStringMap arch_names[] = {
 SM(90),  // Hopper
 SM(90a), // Hopper
 SM(100), // Blackwell
+SM(100a),// Blackwell
 GFX(600),  // gfx600
 GFX(601),  // gfx601
 GFX(602),  // gfx602
@@ -227,8 +231,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
   case OffloadArch::SM_90a:
 return CudaVersion::CUDA_120;
   case OffloadArch::SM_100:
-return CudaVersion::NEW; // TODO: use specific CUDA version once it's
- // public.
+  case OffloadArch::SM_100a:
+return CudaVersion::CUDA_127;
   default:
 llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index dbc3fec3657610..56efad90cb7c84 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -285,6 +285,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   case OffloadArch::SM_90a:
 return "900";
   case OffloadArch::SM_100:
+  case OffloadArch::SM_100a:
 return "1000";
   }
   llvm_unreachable("unhandled OffloadArch");
@@ -292,6 +293,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
 Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
 if (GPU == OffloadArch::SM_90a)
   Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
+if (GPU == OffloadArch::SM_100a)
+  Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1");
   }
 }
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 87c3635ed3f70e..c13928f61a7481 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const 
OMPRequiresDecl *D) {
   case OffloadArch::SM_90:
   case OffloadArch::SM_90a:
   case OffloadArch::SM_100:
+  case OffloadArch::SM_100a:
   case OffloadArch::GFX600:
  

[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-20 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub edited 
https://github.com/llvm/llvm-project/pull/123398
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-17 Thread Sergey Kozub via cfe-commits

https://github.com/sergey-kozub created 
https://github.com/llvm/llvm-project/pull/123398

CUDA 12.8 supports PTX 8.6, which enables the "sm_100a" architecture (adding support for 
Blackwell-specific instructions).
CUDA 12.7 technically does not exist, so it is mapped to PTX 8.5 (same as 12.6).
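
For illustration, a minimal device-code sketch of how the new target could be used once 
this lands; the kernel, the host driver, and the compile command in the comment are 
assumptions made for the example, only the sm_100a arch name and the 
__CUDA_ARCH__/__CUDA_ARCH_FEAT_SM100_ALL macros come from the patch:

// Hypothetical example; compile with something like:
//   clang++ -x cuda --offload-arch=sm_100a example.cu -o example
#include <cstdio>
#include <cuda_runtime.h>

__global__ void arch_probe() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
  // Blackwell or newer: the patch defines __CUDA_ARCH__ as 1000 for sm_100/sm_100a.
#if defined(__CUDA_ARCH_FEAT_SM100_ALL)
  // Compiled for sm_100a: architecture-specific Blackwell features are available.
  printf("device arch: sm_100a\n");
#else
  // Compiled for plain sm_100: forward-compatible code path.
  printf("device arch: sm_100\n");
#endif
#endif
}

int main() {
  arch_probe<<<1, 1>>>();
  cudaDeviceSynchronize();
  return 0;
}
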


>From 92e4b10e940e9c08606ae4cf0a94ae77f9a11b58 Mon Sep 17 00:00:00 2001
From: Sergey Kozub 
Date: Fri, 17 Jan 2025 21:00:49 +
Subject: [PATCH] Add support for PTX 8.6 and CUDA 12.6 (12.8)

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 5 -
 clang/include/clang/Basic/Cuda.h| 6 +-
 clang/lib/Basic/Cuda.cpp| 8 ++--
 clang/lib/Basic/Targets/NVPTX.cpp   | 3 +++
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 +
 clang/lib/Driver/ToolChains/Cuda.cpp| 9 +
 llvm/lib/Target/NVPTX/NVPTX.td  | 2 ++
 7 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def 
b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 969dd9e41ebfa3..37b4e6ff77fda6 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -28,7 +28,9 @@
 #pragma push_macro("SM_90")
 #pragma push_macro("SM_90a")
 #pragma push_macro("SM_100")
-#define SM_100 "sm_100"
+#pragma push_macro("SM_100a")
+#define SM_100a "sm_100a"
+#define SM_100 "sm_100|" SM_100a
 #define SM_90a "sm_90a"
 #define SM_90 "sm_90|" SM_90a "|" SM_100
 #define SM_89 "sm_89|" SM_90
@@ -1091,6 +1093,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", 
"", AND(SM_90,PTX78))
 #pragma pop_macro("SM_90")
 #pragma pop_macro("SM_90a")
 #pragma pop_macro("SM_100")
+#pragma pop_macro("SM_100a")
 #pragma pop_macro("PTX42")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index c2a4addf488df1..1cdfc8178db843 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -44,9 +44,12 @@ enum class CudaVersion {
   CUDA_124,
   CUDA_125,
   CUDA_126,
+  CUDA_127,
+  CUDA_128,
+  CUDA_129,
   FULLY_SUPPORTED = CUDA_123,
   PARTIALLY_SUPPORTED =
-  CUDA_126, // Partially supported. Proceed with a warning.
+  CUDA_129, // Partially supported. Proceed with a warning.
   NEW = 1,  // Too new. Issue a warning, but allow using it.
 };
 const char *CudaVersionToString(CudaVersion V);
@@ -80,6 +83,7 @@ enum class OffloadArch {
   SM_90,
   SM_90a,
   SM_100,
+  SM_100a,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index d56609a2a8f24a..692ab7c319d8bd 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -44,6 +44,9 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
 CUDA_ENTRY(12, 4),
 CUDA_ENTRY(12, 5),
 CUDA_ENTRY(12, 6),
+CUDA_ENTRY(12, 7),
+CUDA_ENTRY(12, 8),
+CUDA_ENTRY(12, 9),
 {"", CudaVersion::NEW, 
llvm::VersionTuple(std::numeric_limits::max())},
 {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
 };
@@ -98,6 +101,7 @@ static const OffloadArchToStringMap arch_names[] = {
 SM(90),  // Hopper
 SM(90a), // Hopper
 SM(100), // Blackwell
+SM(100a),// Blackwell
 GFX(600),  // gfx600
 GFX(601),  // gfx601
 GFX(602),  // gfx602
@@ -227,8 +231,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
   case OffloadArch::SM_90a:
 return CudaVersion::CUDA_120;
   case OffloadArch::SM_100:
-return CudaVersion::NEW; // TODO: use specific CUDA version once it's
- // public.
+  case OffloadArch::SM_100a:
+return CudaVersion::CUDA_128;
   default:
 llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp 
b/clang/lib/Basic/Targets/NVPTX.cpp
index dbc3fec3657610..56efad90cb7c84 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -285,6 +285,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
   case OffloadArch::SM_90a:
 return "900";
   case OffloadArch::SM_100:
+  case OffloadArch::SM_100a:
 return "1000";
   }
   llvm_unreachable("unhandled OffloadArch");
@@ -292,6 +293,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions 
&Opts,
 Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
 if (GPU == OffloadArch::SM_90a)
   Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
+if (GPU == OffloadArch::SM_100a)
+  Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1");
   }
 }
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 87c3635ed3f70e..c13928f61a7481 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const 
OMPRequires

[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-20 Thread Sergey Kozub via cfe-commits

sergey-kozub wrote:

It's not clear from the logs why the "buildkite" checks have failed. Running the 
"check-all" target locally succeeds.
Please advise.

https://github.com/llvm/llvm-project/pull/123398
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)

2025-01-21 Thread Sergey Kozub via cfe-commits

sergey-kozub wrote:

This is now resolved by fixing the NVPTX test.

https://github.com/llvm/llvm-project/pull/123398
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits