[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)
https://github.com/sergey-kozub created https://github.com/llvm/llvm-project/pull/97402 Adds support for sm_100 (Blackwell), similar to https://github.com/llvm/llvm-project/pull/74895 One important aspect is that `sm_100` is not compatible with `sm_90a`, only with `sm_90` - note the defines in "BuiltinsNVPTX.def" >From 02e1acff6ffa1ddc3a26b0edc7e89923ac38978f Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Tue, 2 Jul 2024 02:44:56 -0700 Subject: [PATCH] [CUDA] Add support for CUDA-12.6 and sm_100 --- clang/docs/ReleaseNotes.rst | 3 ++- clang/include/clang/Basic/BuiltinsNVPTX.def | 10 -- clang/include/clang/Basic/Cuda.h| 4 +++- clang/lib/Basic/Cuda.cpp| 4 clang/lib/Basic/Targets/NVPTX.cpp | 2 ++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 + clang/lib/Driver/ToolChains/Cuda.cpp| 3 +++ clang/test/Misc/target-invalid-cpu-note.c | 2 +- llvm/lib/Target/NVPTX/NVPTX.td | 5 +++-- 9 files changed, 27 insertions(+), 7 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c720e47dbe35b..3c10ee51550d9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1057,7 +1057,8 @@ CUDA/HIP Language Changes CUDA Support -- Clang now supports CUDA SDK up to 12.5 +- Clang now supports CUDA SDK up to 12.6 +- Added support for sm_100 AIX Support ^^^ diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 504314d8d96e9..3f383bc89ee70 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -27,8 +27,10 @@ #pragma push_macro("SM_89") #pragma push_macro("SM_90") #pragma push_macro("SM_90a") +#pragma push_macro("SM_100") +#define SM_100 "sm_100" #define SM_90a "sm_90a" -#define SM_90 "sm_90|" SM_90a +#define SM_90 "sm_90|" SM_90a "|" SM_100 #define SM_89 "sm_89|" SM_90 #define SM_87 "sm_87|" SM_89 #define SM_86 "sm_86|" SM_87 @@ -63,7 +65,9 @@ #pragma push_macro("PTX83") #pragma push_macro("PTX84") #pragma push_macro("PTX85") 
-#define PTX85 "ptx85" +#pragma push_macro("PTX86") +#define PTX86 "ptx86" +#define PTX85 "ptx85|" PTX86 #define PTX84 "ptx84|" PTX85 #define PTX83 "ptx83|" PTX84 #define PTX82 "ptx82|" PTX83 @@ -1075,6 +1079,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("SM_89") #pragma pop_macro("SM_90") #pragma pop_macro("SM_90a") +#pragma pop_macro("SM_100") #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") @@ -1097,3 +1102,4 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("PTX83") #pragma pop_macro("PTX84") #pragma pop_macro("PTX85") +#pragma pop_macro("PTX86") diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 83699f8897f66..a18e62620dd5d 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -43,9 +43,10 @@ enum class CudaVersion { CUDA_123, CUDA_124, CUDA_125, + CUDA_126, FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_125, // Partially supported. Proceed with a warning. + CUDA_126, // Partially supported. Proceed with a warning. NEW = 1, // Too new. Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); @@ -78,6 +79,7 @@ enum class OffloadArch { SM_89, SM_90, SM_90a, + SM_100, GFX600, GFX601, GFX602, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index faf3878f064d2..72d9bd89c36e7 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -43,6 +43,7 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(12, 3), CUDA_ENTRY(12, 4), CUDA_ENTRY(12, 5), +CUDA_ENTRY(12, 6), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. 
}; @@ -96,6 +97,7 @@ static const OffloadArchToStringMap arch_names[] = { SM(89), // Ada Lovelace SM(90), // Hopper SM(90a), // Hopper +SM(100), // Blackwell GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -221,6 +223,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) { return CudaVersion::CUDA_118; case OffloadArch::SM_90a: return CudaVersion::CUDA_120; + case OffloadArch::SM_100: +return CudaVersion::CUDA_126; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 43b653dc52ce0..88a0dbde52d52 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -281,6 +281,8 @@ void NVPTXTargetInfo::getTargetDefines(const La
[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)
sergey-kozub wrote: @Artem-B, @andportnoy, you might want to take a look https://github.com/llvm/llvm-project/pull/97402 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)
https://github.com/sergey-kozub converted_to_draft https://github.com/llvm/llvm-project/pull/97402 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/97402 >From 02e1acff6ffa1ddc3a26b0edc7e89923ac38978f Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Tue, 2 Jul 2024 02:44:56 -0700 Subject: [PATCH] [CUDA] Add support for CUDA-12.6 and sm_100 --- clang/docs/ReleaseNotes.rst | 3 ++- clang/include/clang/Basic/BuiltinsNVPTX.def | 10 -- clang/include/clang/Basic/Cuda.h| 4 +++- clang/lib/Basic/Cuda.cpp| 4 clang/lib/Basic/Targets/NVPTX.cpp | 2 ++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 + clang/lib/Driver/ToolChains/Cuda.cpp| 3 +++ clang/test/Misc/target-invalid-cpu-note.c | 2 +- llvm/lib/Target/NVPTX/NVPTX.td | 5 +++-- 9 files changed, 27 insertions(+), 7 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c720e47dbe35b..3c10ee51550d9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1057,7 +1057,8 @@ CUDA/HIP Language Changes CUDA Support -- Clang now supports CUDA SDK up to 12.5 +- Clang now supports CUDA SDK up to 12.6 +- Added support for sm_100 AIX Support ^^^ diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 504314d8d96e9..3f383bc89ee70 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -27,8 +27,10 @@ #pragma push_macro("SM_89") #pragma push_macro("SM_90") #pragma push_macro("SM_90a") +#pragma push_macro("SM_100") +#define SM_100 "sm_100" #define SM_90a "sm_90a" -#define SM_90 "sm_90|" SM_90a +#define SM_90 "sm_90|" SM_90a "|" SM_100 #define SM_89 "sm_89|" SM_90 #define SM_87 "sm_87|" SM_89 #define SM_86 "sm_86|" SM_87 @@ -63,7 +65,9 @@ #pragma push_macro("PTX83") #pragma push_macro("PTX84") #pragma push_macro("PTX85") -#define PTX85 "ptx85" +#pragma push_macro("PTX86") +#define PTX86 "ptx86" +#define PTX85 "ptx85|" PTX86 #define PTX84 "ptx84|" PTX85 #define PTX83 "ptx83|" PTX84 #define PTX82 "ptx82|" PTX83 @@ -1075,6 +1079,7 @@ 
TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("SM_89") #pragma pop_macro("SM_90") #pragma pop_macro("SM_90a") +#pragma pop_macro("SM_100") #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") @@ -1097,3 +1102,4 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("PTX83") #pragma pop_macro("PTX84") #pragma pop_macro("PTX85") +#pragma pop_macro("PTX86") diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 83699f8897f66..a18e62620dd5d 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -43,9 +43,10 @@ enum class CudaVersion { CUDA_123, CUDA_124, CUDA_125, + CUDA_126, FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_125, // Partially supported. Proceed with a warning. + CUDA_126, // Partially supported. Proceed with a warning. NEW = 1, // Too new. Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); @@ -78,6 +79,7 @@ enum class OffloadArch { SM_89, SM_90, SM_90a, + SM_100, GFX600, GFX601, GFX602, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index faf3878f064d2..72d9bd89c36e7 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -43,6 +43,7 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(12, 3), CUDA_ENTRY(12, 4), CUDA_ENTRY(12, 5), +CUDA_ENTRY(12, 6), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. 
}; @@ -96,6 +97,7 @@ static const OffloadArchToStringMap arch_names[] = { SM(89), // Ada Lovelace SM(90), // Hopper SM(90a), // Hopper +SM(100), // Blackwell GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -221,6 +223,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) { return CudaVersion::CUDA_118; case OffloadArch::SM_90a: return CudaVersion::CUDA_120; + case OffloadArch::SM_100: +return CudaVersion::CUDA_126; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 43b653dc52ce0..88a0dbde52d52 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -281,6 +281,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::SM_90: case OffloadArch::SM_90a: return "900"; + case OffloadArch::SM_100: +return "1000"; } llvm_unreachable("unhandled OffloadArch"); }();
[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)
https://github.com/sergey-kozub closed https://github.com/llvm/llvm-project/pull/97402 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [CUDA] Add support for CUDA-12.6 and sm_100 (PR #97402)
sergey-kozub wrote: This PR is redundant, closing. https://github.com/llvm/llvm-project/pull/97402 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
https://github.com/sergey-kozub created https://github.com/llvm/llvm-project/pull/102969 PTX ISA 8.1 supports FP8 conversions: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt This PR adds the support for: - cvt.rn.satfinite{.relu}.f8x2type.f32 d, a, b; - cvt.rn.satfinite{.relu}.f8x2type.f16x2 d, a; - cvt.rn{.relu}.f16x2.f8x2type d, a; where .f8x2type = { .e4m3x2, .e5m2x2 }; >From 963d07c0a2ece9616f2e5f4ab7aa22b4d18a5cb8 Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Mon, 12 Aug 2024 12:52:01 -0700 Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 15 clang/test/CodeGen/builtins-nvptx.c | 36 + llvm/include/llvm/IR/IntrinsicsNVVM.td | 27 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++ llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 + 6 files changed, 222 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 504314d8d96e91..ecbbb1716e0fc5 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff_e4m3x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_e5m2x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_f16x2_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) + 
+TARGET_BUILTIN(__nvvm_e4m3x2_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e4m3x2_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) + // Bitcast BUILTIN(__nvvm_bitcast_f2i, "if", "") diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 75b9d6d1fe1902..9d9f2f31f57e79 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -22,6 +22,9 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ +// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() { // CHECK: ret void } +// CHECK-LABEL: nvvm_cvt_sm89 +__device__ void nvvm_cvt_sm89() { +#if __CUDA_ARCH__ >= 890 + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e4m3x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_e4m3x2_rn(1, 1); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e4m3x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_e4m3x2_rn_relu(1, 1); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e5m2x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_e5m2x2_rn(1, 1); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e5m2x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_e5m2x2_rn_relu(1, 1); + + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e4m3x2.rn(<2 x half> ) + __nvvm_f16x2_e4m3x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 
@llvm.nvvm.f16x2.e4m3x2.rn.relu(<2 x half> ) + __nvvm_f16x2_e4m3x2_rn_relu({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e5m2x2.rn(<2 x half> ) + __nvvm_f16x2_e5m2x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e5m2x2.rn.relu(<2 x half> ) + __nvvm_f16x2_e5m2x2_rn_relu({1.0f16, 1.0f16}); + + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.f16x2.rn(i16 18504) + __nvvm_e4m3x2_f16x2_rn(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.f16x2.rn.relu(i16 18504) + __nvvm_e4m3x2_f16x2_rn_relu(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.f16x2.rn(i16 19532) + __nvvm_e5m2x2_f16x2_rn(0x4c4c); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.f16x2.rn.relu(i16 19532) + __nvvm_e5m2x2_f16x2_rn_relu(0x4c4c); +#endif + // CHECK: ret void
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/102969 >From 72b9a5ff64807bf4722a7168e1210f849bef7071 Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Mon, 12 Aug 2024 12:52:01 -0700 Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 15 clang/test/CodeGen/builtins-nvptx.c | 36 + llvm/include/llvm/IR/IntrinsicsNVVM.td | 27 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++ llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 + 6 files changed, 222 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 504314d8d96e91..ecbbb1716e0fc5 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff_e4m3x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_e5m2x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_f16x2_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_e4m3x2_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e4m3x2_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) + // Bitcast BUILTIN(__nvvm_bitcast_f2i, "if", "") diff --git 
a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 75b9d6d1fe1902..9d9f2f31f57e79 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -22,6 +22,9 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ +// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() { // CHECK: ret void } +// CHECK-LABEL: nvvm_cvt_sm89 +__device__ void nvvm_cvt_sm89() { +#if __CUDA_ARCH__ >= 890 + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e4m3x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_e4m3x2_rn(1, 1); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e4m3x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_e4m3x2_rn_relu(1, 1); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e5m2x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_e5m2x2_rn(1, 1); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.e5m2x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_e5m2x2_rn_relu(1, 1); + + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e4m3x2.rn(<2 x half> ) + __nvvm_f16x2_e4m3x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e4m3x2.rn.relu(<2 x half> ) + __nvvm_f16x2_e4m3x2_rn_relu({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e5m2x2.rn(<2 x half> ) + __nvvm_f16x2_e5m2x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.e5m2x2.rn.relu(<2 x half> ) + __nvvm_f16x2_e5m2x2_rn_relu({1.0f16, 1.0f16}); + + // CHECK_PTX81_SM89: call <2 x half> 
@llvm.nvvm.e4m3x2.f16x2.rn(i16 18504) + __nvvm_e4m3x2_f16x2_rn(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.f16x2.rn.relu(i16 18504) + __nvvm_e4m3x2_f16x2_rn_relu(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.f16x2.rn(i16 19532) + __nvvm_e5m2x2_f16x2_rn(0x4c4c); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.f16x2.rn.relu(i16 19532) + __nvvm_e5m2x2_f16x2_rn_relu(0x4c4c); +#endif + // CHECK: ret void +} + #define NAN32 0x7FBF #define NAN16 (__bf16)0x7FBF #define BF16 (__bf16)0.1f diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7caada24dad564..042df62dc0dc28 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1296,6 +1296,33 @@ let Tar
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/102969 >From a696f131f97370bd8b9ec264d27555c6ace4d027 Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Mon, 12 Aug 2024 12:52:01 -0700 Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 15 clang/test/CodeGen/builtins-nvptx.c | 36 + llvm/include/llvm/IR/IntrinsicsNVVM.td | 27 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++ llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 + 6 files changed, 222 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 504314d8d96e91..c11970c279c4bb 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) + // Bitcast 
BUILTIN(__nvvm_bitcast_f2i, "if", "") diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 75b9d6d1fe1902..957cf7616c0411 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -22,6 +22,9 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ +// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() { // CHECK: ret void } +// CHECK-LABEL: nvvm_cvt_sm89 +__device__ void nvvm_cvt_sm89() { +#if __CUDA_ARCH__ >= 890 + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e4m3x2_rn(1, 1); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e4m3x2_rn_relu(1, 1); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e5m2x2_rn(1, 1); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e5m2x2_rn_relu(1, 1); + + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> ) + __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> ) + __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> ) + __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> ) + 
__nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16}); + + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504) + __nvvm_e4m3x2_to_f16x2_rn(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 18504) + __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532) + __nvvm_e5m2x2_to_f16x2_rn(0x4c4c); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532) + __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c); +#endif + // CHECK: ret void +} + #define NAN32 0x7FBF #define NAN16 (__bf16)0x7FBF #define BF16 (__bf16)0.1f diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7caada24dad564..42dcf08cc65cac 100644 --- a/llvm/i
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/102969 >From 7db2478f59c5a4f46df040ed4799da815b28bc43 Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Mon, 12 Aug 2024 12:52:01 -0700 Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 15 clang/test/CodeGen/builtins-nvptx.c | 36 + llvm/include/llvm/IR/IntrinsicsNVVM.td | 27 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++ llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 + 6 files changed, 222 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 504314d8d96e91..c11970c279c4bb 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) + // Bitcast 
BUILTIN(__nvvm_bitcast_f2i, "if", "") diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 75b9d6d1fe1902..20399b73e63757 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -22,6 +22,9 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ +// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() { // CHECK: ret void } +// CHECK-LABEL: nvvm_cvt_sm89 +__device__ void nvvm_cvt_sm89() { +#if __CUDA_ARCH__ >= 890 + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f); + + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> ) + __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> ) + __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> ) + __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 
@llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> ) + __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16}); + + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504) + __nvvm_e4m3x2_to_f16x2_rn(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 18504) + __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532) + __nvvm_e5m2x2_to_f16x2_rn(0x4c4c); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532) + __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c); +#endif + // CHECK: ret void +} + #define NAN32 0x7FBF #define NAN16 (__bf16)0x7FBF #define BF16 (__bf16)0.1f diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7caada24dad564..42dcf08cc6
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
@@ -722,6 +722,37 @@ let hasSideEffects = false in { defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>; defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>; + + // FP8 conversions. + multiclass CVT_TO_F8X2 { +def _f32 : + NVPTXInst<(outs Int16Regs:$dst), +(ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), +!strconcat("cvt${mode:base}.satfinite${mode:relu}.", +F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>, + Requires<[hasPTX<81>, hasSM<89>]>; +def _f16x2 : + NVPTXInst<(outs Int16Regs:$dst), +(ins Int32Regs:$src, CvtMode:$mode), +!strconcat("cvt${mode:base}.satfinite${mode:relu}.", +F8Name, "x2.f16x2 \t$dst, $src;"), []>, + Requires<[hasPTX<81>, hasSM<89>]>; + } + + defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">; + defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">; + + multiclass CVT_FROM_F8X2 { +def x2 : + NVPTXInst<(outs Int32Regs:$dst), sergey-kozub wrote: This one is confusing, "x2" is there only because (for some reason) I cannot specify an empty suffix. All the classes in this file use the `CVT_<dst>_<src>` pattern. I need to define two classes: CVT_f16x2_e4m3x2 and CVT_f16x2_e5m2x2. You're suggesting to rename to `_f16x2` but I guess "multiclass" only supports appending the suffixes, whereas I need "f16x2" to appear in the middle of the class name (in order to conform to the common naming pattern). Also, "outs" is Int32Regs, and I cannot specify <2xf16> here. This is in line with the other class definitions in this file. https://github.com/llvm/llvm-project/pull/102969 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/102969 >From ee028cccb41964223b87edb1db88710bac89080a Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Mon, 12 Aug 2024 12:52:01 -0700 Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 15 clang/test/CodeGen/builtins-nvptx.c | 36 + llvm/include/llvm/IR/IntrinsicsNVVM.td | 27 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++ llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 + 6 files changed, 222 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 504314d8d96e91..c11970c279c4bb 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) + // Bitcast 
BUILTIN(__nvvm_bitcast_f2i, "if", "") diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 75b9d6d1fe1902..20399b73e63757 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -22,6 +22,9 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ +// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() { // CHECK: ret void } +// CHECK-LABEL: nvvm_cvt_sm89 +__device__ void nvvm_cvt_sm89() { +#if __CUDA_ARCH__ >= 890 + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f); + + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> ) + __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> ) + __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> ) + __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 
@llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> ) + __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16}); + + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504) + __nvvm_e4m3x2_to_f16x2_rn(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 18504) + __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532) + __nvvm_e5m2x2_to_f16x2_rn(0x4c4c); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532) + __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c); +#endif + // CHECK: ret void +} + #define NAN32 0x7FBF #define NAN16 (__bf16)0x7FBF #define BF16 (__bf16)0.1f diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7caada24dad564..42dcf08cc6
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
@@ -722,6 +722,37 @@ let hasSideEffects = false in { defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>; defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>; + + // FP8 conversions. + multiclass CVT_TO_F8X2 { +def _f32 : + NVPTXInst<(outs Int16Regs:$dst), +(ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), +!strconcat("cvt${mode:base}.satfinite${mode:relu}.", +F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>, + Requires<[hasPTX<81>, hasSM<89>]>; +def _f16x2 : + NVPTXInst<(outs Int16Regs:$dst), +(ins Int32Regs:$src, CvtMode:$mode), +!strconcat("cvt${mode:base}.satfinite${mode:relu}.", +F8Name, "x2.f16x2 \t$dst, $src;"), []>, + Requires<[hasPTX<81>, hasSM<89>]>; + } + + defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">; + defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">; + + multiclass CVT_FROM_F8X2 { +def x2 : + NVPTXInst<(outs Int32Regs:$dst), sergey-kozub wrote: Neat, wasn't aware of the NAME keyword. Updated the PR. https://github.com/llvm/llvm-project/pull/102969 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/102969 >From e74a0de37302baaf89bfe3230f561684ec5777db Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Mon, 12 Aug 2024 12:52:01 -0700 Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 15 clang/test/CodeGen/builtins-nvptx.c | 36 + llvm/include/llvm/IR/IntrinsicsNVVM.td | 27 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 31 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++ llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 + 6 files changed, 222 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 504314d8d96e91..c11970c279c4bb 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) + // Bitcast 
BUILTIN(__nvvm_bitcast_f2i, "if", "") diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 75b9d6d1fe1902..20399b73e63757 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -22,6 +22,9 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ +// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() { // CHECK: ret void } +// CHECK-LABEL: nvvm_cvt_sm89 +__device__ void nvvm_cvt_sm89() { +#if __CUDA_ARCH__ >= 890 + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f); + + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> ) + __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> ) + __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> ) + __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 
@llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> ) + __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16}); + + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504) + __nvvm_e4m3x2_to_f16x2_rn(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 18504) + __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532) + __nvvm_e5m2x2_to_f16x2_rn(0x4c4c); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532) + __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c); +#endif + // CHECK: ret void +} + #define NAN32 0x7FBF #define NAN16 (__bf16)0x7FBF #define BF16 (__bf16)0.1f diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7caada24dad564..42dcf08cc6
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
@@ -722,6 +722,37 @@ let hasSideEffects = false in { defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>; defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>; + + // FP8 conversions. + multiclass CVT_TO_F8X2 { +def _f32 : + NVPTXInst<(outs Int16Regs:$dst), +(ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), +!strconcat("cvt${mode:base}.satfinite${mode:relu}.", +F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>, + Requires<[hasPTX<81>, hasSM<89>]>; +def _f16x2 : + NVPTXInst<(outs Int16Regs:$dst), +(ins Int32Regs:$src, CvtMode:$mode), +!strconcat("cvt${mode:base}.satfinite${mode:relu}.", +F8Name, "x2.f16x2 \t$dst, $src;"), []>, + Requires<[hasPTX<81>, hasSM<89>]>; + } + + defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">; + defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">; + + multiclass CVT_FROM_F8X2 { sergey-kozub wrote: Replaced with "class" https://github.com/llvm/llvm-project/pull/102969 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/102969 >From b74c8bc6009fb2f905089345594b13c8bc75ca36 Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Mon, 12 Aug 2024 12:52:01 -0700 Subject: [PATCH] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 15 clang/test/CodeGen/builtins-nvptx.c | 36 + llvm/include/llvm/IR/IntrinsicsNVVM.td | 27 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 29 +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td| 27 +++ llvm/test/CodeGen/NVPTX/convert-sm89.ll | 86 + 6 files changed, 220 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm89.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 504314d8d96e91..c11970c279c4bb 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -584,6 +584,21 @@ TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e4m3x2_rn_relu, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn, "sff", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_ff_to_e5m2x2_rn_relu, "sff", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e4m3x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn, "sV2h", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_f16x2_to_e5m2x2_rn_relu, "sV2h", "", AND(SM_89,PTX81)) + +TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e4m3x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn, "V2hs", "", AND(SM_89,PTX81)) +TARGET_BUILTIN(__nvvm_e5m2x2_to_f16x2_rn_relu, "V2hs", "", AND(SM_89,PTX81)) + // Bitcast 
BUILTIN(__nvvm_bitcast_f2i, "if", "") diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 75b9d6d1fe1902..20399b73e63757 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -22,6 +22,9 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ +// RUN:-fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -968,6 +971,39 @@ __device__ void nvvm_cvt_sm80() { // CHECK: ret void } +// CHECK-LABEL: nvvm_cvt_sm89 +__device__ void nvvm_cvt_sm89() { +#if __CUDA_ARCH__ >= 890 + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 1.00e+00, float 1.00e+00) + __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f); + + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> ) + __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> ) + __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> ) + __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16}); + // CHECK_PTX81_SM89: call i16 
@llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> ) + __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16}); + + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504) + __nvvm_e4m3x2_to_f16x2_rn(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 18504) + __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532) + __nvvm_e5m2x2_to_f16x2_rn(0x4c4c); + // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532) + __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c); +#endif + // CHECK: ret void +} + #define NAN32 0x7FBF #define NAN16 (__bf16)0x7FBF #define BF16 (__bf16)0.1f diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7caada24dad564..42dcf08cc65
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
sergey-kozub wrote: What do I need to push this through? I'm still seeing the "1 workflow awaiting approval" message. Should I add some other reviewer(s) who are authorized to run workflows? https://github.com/llvm/llvm-project/pull/102969 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
sergey-kozub wrote: > Looks like that it was clang-format check github was waiting on an approval > for. I've just clicked that button. Now it's all green, thank you. How do I actually upstream it? I don't see any buttons on this PR page that'd allow me to submit it. https://github.com/llvm/llvm-project/pull/102969 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add conversion intrinsics from/to fp8 types (e4m3, e5m2) (PR #102969)
sergey-kozub wrote: Thanks for pushing this, I'll continue by adding the support to XLA to use these instructions. https://github.com/llvm/llvm-project/pull/102969 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)
https://github.com/sergey-kozub closed https://github.com/llvm/llvm-project/pull/123398 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Remove incorrect CUDA defines (PR #123898)
https://github.com/sergey-kozub closed https://github.com/llvm/llvm-project/pull/123898 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Remove incorrect CUDA defines (PR #123898)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/123898 >From 1c4a581d45b622591f5062830f2ff1e33b159a64 Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Wed, 22 Jan 2025 07:38:55 + Subject: [PATCH] Remove incorrect CUDA defines --- clang/include/clang/Basic/Cuda.h | 4 +--- clang/lib/Basic/Cuda.cpp | 4 +--- clang/lib/Driver/ToolChains/Cuda.cpp | 6 -- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 1cdfc8178db843..f33ba46233a7ab 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -44,12 +44,10 @@ enum class CudaVersion { CUDA_124, CUDA_125, CUDA_126, - CUDA_127, CUDA_128, - CUDA_129, FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_129, // Partially supported. Proceed with a warning. + CUDA_128, // Partially supported. Proceed with a warning. NEW = 1, // Too new. Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index b1461429d4f51a..1bfec0b37c5ee8 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -44,9 +44,7 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(12, 4), CUDA_ENTRY(12, 5), CUDA_ENTRY(12, 6), -CUDA_ENTRY(12, 7), CUDA_ENTRY(12, 8), -CUDA_ENTRY(12, 9), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. 
}; @@ -232,7 +230,7 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) { return CudaVersion::CUDA_120; case OffloadArch::SM_100: case OffloadArch::SM_100a: -return CudaVersion::CUDA_127; +return CudaVersion::CUDA_128; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 27e1969dabe551..d4099216c81ba8 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -89,12 +89,8 @@ CudaVersion getCudaVersion(uint32_t raw_version) { return CudaVersion::CUDA_125; if (raw_version < 12070) return CudaVersion::CUDA_126; - if (raw_version < 12080) -return CudaVersion::CUDA_127; if (raw_version < 12090) return CudaVersion::CUDA_128; - if (raw_version < 12100) -return CudaVersion::CUDA_129; return CudaVersion::NEW; } @@ -688,9 +684,7 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple, case CudaVersion::CUDA_##CUDA_VER: \ PtxFeature = "+ptx" #PTX_VER; \ break; -CASE_CUDA_VERSION(129, 87); CASE_CUDA_VERSION(128, 87); -CASE_CUDA_VERSION(127, 86); CASE_CUDA_VERSION(126, 85); CASE_CUDA_VERSION(125, 85); CASE_CUDA_VERSION(124, 84); ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] Remove incorrect CUDA defines (PR #123898)
@@ -89,12 +89,8 @@ CudaVersion getCudaVersion(uint32_t raw_version) { return CudaVersion::CUDA_125; if (raw_version < 12070) return CudaVersion::CUDA_126; - if (raw_version < 12080) -return CudaVersion::CUDA_127; if (raw_version < 12090) return CudaVersion::CUDA_128; - if (raw_version < 12100) -return CudaVersion::CUDA_129; sergey-kozub wrote: Done. https://github.com/llvm/llvm-project/pull/123898 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/123398 >From dc43fbfbd29c1a088b8261cc2bfc7f6f7e5c7c2f Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Fri, 17 Jan 2025 21:00:49 + Subject: [PATCH] Add support for PTX 8.6 and CUDA 12.6 (12.8) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 5 - clang/include/clang/Basic/Cuda.h| 6 +- clang/lib/Basic/Cuda.cpp| 8 ++-- clang/lib/Basic/Targets/NVPTX.cpp | 3 +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 + clang/lib/Driver/ToolChains/Cuda.cpp| 9 + clang/test/Misc/target-invalid-cpu-note/nvptx.c | 1 + llvm/lib/Target/NVPTX/NVPTX.td | 2 ++ 8 files changed, 31 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 969dd9e41ebfa3..37b4e6ff77fda6 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -28,7 +28,9 @@ #pragma push_macro("SM_90") #pragma push_macro("SM_90a") #pragma push_macro("SM_100") -#define SM_100 "sm_100" +#pragma push_macro("SM_100a") +#define SM_100a "sm_100a" +#define SM_100 "sm_100|" SM_100a #define SM_90a "sm_90a" #define SM_90 "sm_90|" SM_90a "|" SM_100 #define SM_89 "sm_89|" SM_90 @@ -1091,6 +1093,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("SM_90") #pragma pop_macro("SM_90a") #pragma pop_macro("SM_100") +#pragma pop_macro("SM_100a") #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index c2a4addf488df1..1cdfc8178db843 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -44,9 +44,12 @@ enum class CudaVersion { CUDA_124, CUDA_125, CUDA_126, + CUDA_127, + CUDA_128, + CUDA_129, FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_126, // Partially supported. Proceed with a warning. + CUDA_129, // Partially supported. 
Proceed with a warning. NEW = 1, // Too new. Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); @@ -80,6 +83,7 @@ enum class OffloadArch { SM_90, SM_90a, SM_100, + SM_100a, GFX600, GFX601, GFX602, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index d56609a2a8f24a..b1461429d4f51a 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -44,6 +44,9 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(12, 4), CUDA_ENTRY(12, 5), CUDA_ENTRY(12, 6), +CUDA_ENTRY(12, 7), +CUDA_ENTRY(12, 8), +CUDA_ENTRY(12, 9), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. }; @@ -98,6 +101,7 @@ static const OffloadArchToStringMap arch_names[] = { SM(90), // Hopper SM(90a), // Hopper SM(100), // Blackwell +SM(100a),// Blackwell GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -227,8 +231,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) { case OffloadArch::SM_90a: return CudaVersion::CUDA_120; case OffloadArch::SM_100: -return CudaVersion::NEW; // TODO: use specific CUDA version once it's - // public. 
+ case OffloadArch::SM_100a: +return CudaVersion::CUDA_127; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index dbc3fec3657610..56efad90cb7c84 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -285,6 +285,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::SM_90a: return "900"; case OffloadArch::SM_100: + case OffloadArch::SM_100a: return "1000"; } llvm_unreachable("unhandled OffloadArch"); @@ -292,6 +293,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); if (GPU == OffloadArch::SM_90a) Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1"); +if (GPU == OffloadArch::SM_100a) + Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1"); } } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 87c3635ed3f70e..c13928f61a7481 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::SM_90: case OffloadArch::SM_90a: case OffloadArch
[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)
https://github.com/sergey-kozub edited https://github.com/llvm/llvm-project/pull/123398 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/123398 >From 04313ede76d272ec391361b9828e55d8a27b4bda Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Fri, 17 Jan 2025 21:00:49 + Subject: [PATCH] Add support for PTX 8.6 and CUDA 12.6 (12.8) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 5 - clang/include/clang/Basic/Cuda.h| 6 +- clang/lib/Basic/Cuda.cpp| 8 ++-- clang/lib/Basic/Targets/NVPTX.cpp | 3 +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 + clang/lib/Driver/ToolChains/Cuda.cpp| 9 + llvm/lib/Target/NVPTX/NVPTX.td | 2 ++ 7 files changed, 30 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 969dd9e41ebfa3..37b4e6ff77fda6 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -28,7 +28,9 @@ #pragma push_macro("SM_90") #pragma push_macro("SM_90a") #pragma push_macro("SM_100") -#define SM_100 "sm_100" +#pragma push_macro("SM_100a") +#define SM_100a "sm_100a" +#define SM_100 "sm_100|" SM_100a #define SM_90a "sm_90a" #define SM_90 "sm_90|" SM_90a "|" SM_100 #define SM_89 "sm_89|" SM_90 @@ -1091,6 +1093,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("SM_90") #pragma pop_macro("SM_90a") #pragma pop_macro("SM_100") +#pragma pop_macro("SM_100a") #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index c2a4addf488df1..1cdfc8178db843 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -44,9 +44,12 @@ enum class CudaVersion { CUDA_124, CUDA_125, CUDA_126, + CUDA_127, + CUDA_128, + CUDA_129, FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_126, // Partially supported. Proceed with a warning. + CUDA_129, // Partially supported. Proceed with a warning. NEW = 1, // Too new. 
Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); @@ -80,6 +83,7 @@ enum class OffloadArch { SM_90, SM_90a, SM_100, + SM_100a, GFX600, GFX601, GFX602, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index d56609a2a8f24a..692ab7c319d8bd 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -44,6 +44,9 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(12, 4), CUDA_ENTRY(12, 5), CUDA_ENTRY(12, 6), +CUDA_ENTRY(12, 7), +CUDA_ENTRY(12, 8), +CUDA_ENTRY(12, 9), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. }; @@ -98,6 +101,7 @@ static const OffloadArchToStringMap arch_names[] = { SM(90), // Hopper SM(90a), // Hopper SM(100), // Blackwell +SM(100a),// Blackwell GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -227,8 +231,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) { case OffloadArch::SM_90a: return CudaVersion::CUDA_120; case OffloadArch::SM_100: -return CudaVersion::NEW; // TODO: use specific CUDA version once it's - // public. 
+ case OffloadArch::SM_100a: +return CudaVersion::CUDA_128; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index dbc3fec3657610..56efad90cb7c84 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -285,6 +285,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::SM_90a: return "900"; case OffloadArch::SM_100: + case OffloadArch::SM_100a: return "1000"; } llvm_unreachable("unhandled OffloadArch"); @@ -292,6 +293,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); if (GPU == OffloadArch::SM_90a) Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1"); +if (GPU == OffloadArch::SM_100a) + Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1"); } } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 87c3635ed3f70e..c13928f61a7481 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::SM_90: case OffloadArch::SM_90a: case OffloadArch::SM_100: + case OffloadArch::SM_100a: case OffloadArch::GFX600:
[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)
@@ -682,6 +688,9 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple, case CudaVersion::CUDA_##CUDA_VER: \ PtxFeature = "+ptx" #PTX_VER; \ break; +CASE_CUDA_VERSION(129, 86); +CASE_CUDA_VERSION(128, 86); +CASE_CUDA_VERSION(127, 85); sergey-kozub wrote: Updated, thank you. Also, changed line 235 in "clang/lib/Basic/Cuda.cpp", as PTX8.6 supports sm100a. https://github.com/llvm/llvm-project/pull/123398 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)
https://github.com/sergey-kozub updated https://github.com/llvm/llvm-project/pull/123398 >From 2f909002b89628e2bb83391b2287aa00a7ecaaf3 Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Fri, 17 Jan 2025 21:00:49 + Subject: [PATCH] Add support for PTX 8.6 and CUDA 12.6 (12.8) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 5 - clang/include/clang/Basic/Cuda.h| 6 +- clang/lib/Basic/Cuda.cpp| 8 ++-- clang/lib/Basic/Targets/NVPTX.cpp | 3 +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 + clang/lib/Driver/ToolChains/Cuda.cpp| 9 + llvm/lib/Target/NVPTX/NVPTX.td | 2 ++ 7 files changed, 30 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 969dd9e41ebfa3..37b4e6ff77fda6 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -28,7 +28,9 @@ #pragma push_macro("SM_90") #pragma push_macro("SM_90a") #pragma push_macro("SM_100") -#define SM_100 "sm_100" +#pragma push_macro("SM_100a") +#define SM_100a "sm_100a" +#define SM_100 "sm_100|" SM_100a #define SM_90a "sm_90a" #define SM_90 "sm_90|" SM_90a "|" SM_100 #define SM_89 "sm_89|" SM_90 @@ -1091,6 +1093,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("SM_90") #pragma pop_macro("SM_90a") #pragma pop_macro("SM_100") +#pragma pop_macro("SM_100a") #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index c2a4addf488df1..1cdfc8178db843 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -44,9 +44,12 @@ enum class CudaVersion { CUDA_124, CUDA_125, CUDA_126, + CUDA_127, + CUDA_128, + CUDA_129, FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_126, // Partially supported. Proceed with a warning. + CUDA_129, // Partially supported. Proceed with a warning. NEW = 1, // Too new. 
Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); @@ -80,6 +83,7 @@ enum class OffloadArch { SM_90, SM_90a, SM_100, + SM_100a, GFX600, GFX601, GFX602, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index d56609a2a8f24a..b1461429d4f51a 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -44,6 +44,9 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(12, 4), CUDA_ENTRY(12, 5), CUDA_ENTRY(12, 6), +CUDA_ENTRY(12, 7), +CUDA_ENTRY(12, 8), +CUDA_ENTRY(12, 9), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. }; @@ -98,6 +101,7 @@ static const OffloadArchToStringMap arch_names[] = { SM(90), // Hopper SM(90a), // Hopper SM(100), // Blackwell +SM(100a),// Blackwell GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -227,8 +231,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) { case OffloadArch::SM_90a: return CudaVersion::CUDA_120; case OffloadArch::SM_100: -return CudaVersion::NEW; // TODO: use specific CUDA version once it's - // public. 
+ case OffloadArch::SM_100a: +return CudaVersion::CUDA_127; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index dbc3fec3657610..56efad90cb7c84 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -285,6 +285,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::SM_90a: return "900"; case OffloadArch::SM_100: + case OffloadArch::SM_100a: return "1000"; } llvm_unreachable("unhandled OffloadArch"); @@ -292,6 +293,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); if (GPU == OffloadArch::SM_90a) Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1"); +if (GPU == OffloadArch::SM_100a) + Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1"); } } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 87c3635ed3f70e..c13928f61a7481 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::SM_90: case OffloadArch::SM_90a: case OffloadArch::SM_100: + case OffloadArch::SM_100a: case OffloadArch::GFX600:
[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)
https://github.com/sergey-kozub edited https://github.com/llvm/llvm-project/pull/123398 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)
https://github.com/sergey-kozub created https://github.com/llvm/llvm-project/pull/123398 CUDA 12.8 supports PTX 8.6 which enables architecture "sm100a" (supports Blackwell-specific instructions). CUDA 12.7 technically does not exist, map it to PTX 8.5 (same as 12.6). >From 92e4b10e940e9c08606ae4cf0a94ae77f9a11b58 Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Fri, 17 Jan 2025 21:00:49 + Subject: [PATCH] Add support for PTX 8.6 and CUDA 12.6 (12.8) --- clang/include/clang/Basic/BuiltinsNVPTX.def | 5 - clang/include/clang/Basic/Cuda.h| 6 +- clang/lib/Basic/Cuda.cpp| 8 ++-- clang/lib/Basic/Targets/NVPTX.cpp | 3 +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 + clang/lib/Driver/ToolChains/Cuda.cpp| 9 + llvm/lib/Target/NVPTX/NVPTX.td | 2 ++ 7 files changed, 30 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 969dd9e41ebfa3..37b4e6ff77fda6 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -28,7 +28,9 @@ #pragma push_macro("SM_90") #pragma push_macro("SM_90a") #pragma push_macro("SM_100") -#define SM_100 "sm_100" +#pragma push_macro("SM_100a") +#define SM_100a "sm_100a" +#define SM_100 "sm_100|" SM_100a #define SM_90a "sm_90a" #define SM_90 "sm_90|" SM_90a "|" SM_100 #define SM_89 "sm_89|" SM_90 @@ -1091,6 +1093,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("SM_90") #pragma pop_macro("SM_90a") #pragma pop_macro("SM_100") +#pragma pop_macro("SM_100a") #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index c2a4addf488df1..1cdfc8178db843 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -44,9 +44,12 @@ enum class CudaVersion { CUDA_124, CUDA_125, CUDA_126, + CUDA_127, + CUDA_128, + CUDA_129, FULLY_SUPPORTED = CUDA_123, 
PARTIALLY_SUPPORTED = - CUDA_126, // Partially supported. Proceed with a warning. + CUDA_129, // Partially supported. Proceed with a warning. NEW = 1, // Too new. Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); @@ -80,6 +83,7 @@ enum class OffloadArch { SM_90, SM_90a, SM_100, + SM_100a, GFX600, GFX601, GFX602, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index d56609a2a8f24a..692ab7c319d8bd 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -44,6 +44,9 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(12, 4), CUDA_ENTRY(12, 5), CUDA_ENTRY(12, 6), +CUDA_ENTRY(12, 7), +CUDA_ENTRY(12, 8), +CUDA_ENTRY(12, 9), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. }; @@ -98,6 +101,7 @@ static const OffloadArchToStringMap arch_names[] = { SM(90), // Hopper SM(90a), // Hopper SM(100), // Blackwell +SM(100a),// Blackwell GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -227,8 +231,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) { case OffloadArch::SM_90a: return CudaVersion::CUDA_120; case OffloadArch::SM_100: -return CudaVersion::NEW; // TODO: use specific CUDA version once it's - // public. 
+ case OffloadArch::SM_100a: +return CudaVersion::CUDA_128; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index dbc3fec3657610..56efad90cb7c84 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -285,6 +285,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::SM_90a: return "900"; case OffloadArch::SM_100: + case OffloadArch::SM_100a: return "1000"; } llvm_unreachable("unhandled OffloadArch"); @@ -292,6 +293,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); if (GPU == OffloadArch::SM_90a) Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1"); +if (GPU == OffloadArch::SM_100a) + Builder.defineMacro("__CUDA_ARCH_FEAT_SM100_ALL", "1"); } } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 87c3635ed3f70e..c13928f61a7481 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequires
[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)
sergey-kozub wrote: It's not clear from the logs why the "buildkite" checks have failed. Running the "check-all" target locally succeeds. Please advise. https://github.com/llvm/llvm-project/pull/123398 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [NVPTX] Add support for PTX 8.6 and CUDA 12.6 (12.8) (PR #123398)
sergey-kozub wrote: This is now resolved by fixing the NVPTX test. https://github.com/llvm/llvm-project/pull/123398 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits