https://github.com/Wolfram70 updated https://github.com/llvm/llvm-project/pull/126664
>From 062a48e73ea1434f3c00ab3c0e717db66aa0f15e Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi <sriniva...@nvidia.com> Date: Mon, 10 Feb 2025 14:13:42 +0530 Subject: [PATCH] [NVPTX] Add intrinsics for redux.sync f32 instructions Adds NVVM intrinsics and NVPTX codegen for redux.sync f32 instructions introduced in ptx8.6 for sm_100a. Tests added in CodeGen/NVPTX/redux-sync-f32.ll and verified through ptxas 12.8.0. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync --- clang/include/clang/Basic/BuiltinsNVPTX.td | 8 ++ clang/test/CodeGenCUDA/redux-f32-builtins.cu | 34 +++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 12 ++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 18 +++ llvm/test/CodeGen/NVPTX/redux-sync-f32.ll | 139 +++++++++++++++++++ 5 files changed, 211 insertions(+) create mode 100644 clang/test/CodeGenCUDA/redux-f32-builtins.cu create mode 100644 llvm/test/CodeGen/NVPTX/redux-sync-f32.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index 9d24a992563a450..327dc88cffdb4e6 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -669,6 +669,14 @@ def __nvvm_redux_sync_umax : NVPTXBuiltinSMAndPTX<"unsigned int(unsigned int, in def __nvvm_redux_sync_and : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, PTX70>; def __nvvm_redux_sync_xor : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, PTX70>; def __nvvm_redux_sync_or : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, PTX70>; +def __nvvm_redux_sync_fmin : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>; +def __nvvm_redux_sync_fmin_abs : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>; +def __nvvm_redux_sync_fmin_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>; +def __nvvm_redux_sync_fmin_abs_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>; +def __nvvm_redux_sync_fmax : 
NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>; +def __nvvm_redux_sync_fmax_abs : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>; +def __nvvm_redux_sync_fmax_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>; +def __nvvm_redux_sync_fmax_abs_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", SM_100a, PTX86>; // Membar diff --git a/clang/test/CodeGenCUDA/redux-f32-builtins.cu b/clang/test/CodeGenCUDA/redux-f32-builtins.cu new file mode 100644 index 000000000000000..7359fb000699169 --- /dev/null +++ b/clang/test/CodeGenCUDA/redux-f32-builtins.cu @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx86" "-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s +// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx86" "-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +// CHECK: define{{.*}} void @_Z6kernelPf(ptr noundef %out_f) +__attribute__((global)) void kernel(float* out_f) { + float a = 3.0; + int i = 0; + + out_f[i++] = __nvvm_redux_sync_fmin(a, 0xFF); + // CHECK: call contract float @llvm.nvvm.redux.sync.fmin + + out_f[i++] = __nvvm_redux_sync_fmin_abs(a, 0xFF); + // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.abs + + out_f[i++] = __nvvm_redux_sync_fmin_NaN(a, 0xF0); + // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.NaN + + out_f[i++] = __nvvm_redux_sync_fmin_abs_NaN(a, 0x0F); + // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.abs.NaN + + out_f[i++] = __nvvm_redux_sync_fmax(a, 0xFF); + // CHECK: call contract float @llvm.nvvm.redux.sync.fmax + + out_f[i++] = __nvvm_redux_sync_fmax_abs(a, 0x01); + // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.abs + + out_f[i++] = __nvvm_redux_sync_fmax_NaN(a, 0xF1); + // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.NaN + + out_f[i++] = __nvvm_redux_sync_fmax_abs_NaN(a, 0x10); + // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.abs.NaN + + // 
CHECK: ret void +} diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index f299a145ac73b12..0ceb64d506243c5 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4823,6 +4823,18 @@ def int_nvvm_redux_sync_xor : ClangBuiltin<"__nvvm_redux_sync_xor">, def int_nvvm_redux_sync_or : ClangBuiltin<"__nvvm_redux_sync_or">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; + +// redux.sync.{min/max}{.abs}{.NaN}.f32 dst, src, membermask; +foreach binOp = ["min", "max"] in { + foreach abs = ["", "_abs"] in { + foreach nan = ["", "_NaN"] in { + def int_nvvm_redux_sync_f # binOp # abs # nan : + ClangBuiltin<!strconcat("__nvvm_redux_sync_f", binOp, abs, nan)>, + Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty], + [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; + } + } +} // // WGMMA fence instructions diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 5331f36ad09997f..059b960abc819f7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -328,6 +328,24 @@ defm REDUX_SYNC_AND : REDUX_SYNC<"and", "b32", int_nvvm_redux_sync_and>; defm REDUX_SYNC_XOR : REDUX_SYNC<"xor", "b32", int_nvvm_redux_sync_xor>; defm REDUX_SYNC_OR : REDUX_SYNC<"or", "b32", int_nvvm_redux_sync_or>; +multiclass REDUX_SYNC_F<string BinOp, string abs, string NaN> { + def : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$src, Int32Regs:$mask), + "redux.sync." 
# BinOp # !subst("_", ".", abs) # !subst("_", ".", NaN) # ".f32 $dst, $src, $mask;", + [(set f32:$dst, (!cast<Intrinsic>("int_nvvm_redux_sync_f" # BinOp # abs # NaN) f32:$src, Int32Regs:$mask))]>, + Requires<[hasPTX<86>, hasSM100a]>; + +} + +defm REDUX_SYNC_FMIN : REDUX_SYNC_F<"min", "", "">; +defm REDUX_SYNC_FMIN_ABS : REDUX_SYNC_F<"min", "_abs", "">; +defm REDUX_SYNC_FMIN_NAN: REDUX_SYNC_F<"min", "", "_NaN">; +defm REDUX_SYNC_FMIN_ABS_NAN: REDUX_SYNC_F<"min", "_abs", "_NaN">; +defm REDUX_SYNC_FMAX : REDUX_SYNC_F<"max", "", "">; +defm REDUX_SYNC_FMAX_ABS : REDUX_SYNC_F<"max", "_abs", "">; +defm REDUX_SYNC_FMAX_NAN: REDUX_SYNC_F<"max", "", "_NaN">; +defm REDUX_SYNC_FMAX_ABS_NAN: REDUX_SYNC_F<"max", "_abs", "_NaN">; + } // isConvergent = true //----------------------------------- diff --git a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll new file mode 100644 index 000000000000000..af113e75fd1435a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} + +declare float @llvm.nvvm.redux.sync.fmin(float, i32) +define float @redux_sync_fmin(float %src, i32 %mask) { +; CHECK-LABEL: redux_sync_fmin( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_param_1]; +; CHECK-NEXT: redux.sync.min.f32 %f2, %f1, %r1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %val = call float @llvm.nvvm.redux.sync.fmin(float %src, i32 %mask) + ret float %val +} + +declare float @llvm.nvvm.redux.sync.fmin.abs(float, i32) +define float 
@redux_sync_fmin_abs(float %src, i32 %mask) { +; CHECK-LABEL: redux_sync_fmin_abs( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_abs_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_abs_param_1]; +; CHECK-NEXT: redux.sync.min.abs.f32 %f2, %f1, %r1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %val = call float @llvm.nvvm.redux.sync.fmin.abs(float %src, i32 %mask) + ret float %val +} + +declare float @llvm.nvvm.redux.sync.fmin.NaN(float, i32) +define float @redux_sync_fmin_NaN(float %src, i32 %mask) { +; CHECK-LABEL: redux_sync_fmin_NaN( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_NaN_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_NaN_param_1]; +; CHECK-NEXT: redux.sync.min.NaN.f32 %f2, %f1, %r1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %val = call float @llvm.nvvm.redux.sync.fmin.NaN(float %src, i32 %mask) + ret float %val +} + +declare float @llvm.nvvm.redux.sync.fmin.abs.NaN(float, i32) +define float @redux_sync_fmin_abs_NaN(float %src, i32 %mask) { +; CHECK-LABEL: redux_sync_fmin_abs_NaN( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_abs_NaN_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_abs_NaN_param_1]; +; CHECK-NEXT: redux.sync.min.abs.NaN.f32 %f2, %f1, %r1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %val = call float @llvm.nvvm.redux.sync.fmin.abs.NaN(float %src, i32 %mask) + ret float %val +} + +declare float @llvm.nvvm.redux.sync.fmax(float, i32) +define float @redux_sync_fmax(float %src, i32 %mask) { +; CHECK-LABEL: redux_sync_fmax( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; 
CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_param_1]; +; CHECK-NEXT: redux.sync.max.f32 %f2, %f1, %r1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %val = call float @llvm.nvvm.redux.sync.fmax(float %src, i32 %mask) + ret float %val +} + +declare float @llvm.nvvm.redux.sync.fmax.abs(float, i32) +define float @redux_sync_fmax_abs(float %src, i32 %mask) { +; CHECK-LABEL: redux_sync_fmax_abs( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_abs_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_abs_param_1]; +; CHECK-NEXT: redux.sync.max.abs.f32 %f2, %f1, %r1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %val = call float @llvm.nvvm.redux.sync.fmax.abs(float %src, i32 %mask) + ret float %val +} + +declare float @llvm.nvvm.redux.sync.fmax.NaN(float, i32) +define float @redux_sync_fmax_NaN(float %src, i32 %mask) { +; CHECK-LABEL: redux_sync_fmax_NaN( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_NaN_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_NaN_param_1]; +; CHECK-NEXT: redux.sync.max.NaN.f32 %f2, %f1, %r1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %val = call float @llvm.nvvm.redux.sync.fmax.NaN(float %src, i32 %mask) + ret float %val +} + +declare float @llvm.nvvm.redux.sync.fmax.abs.NaN(float, i32) +define float @redux_sync_fmax_abs_NaN(float %src, i32 %mask) { +; CHECK-LABEL: redux_sync_fmax_abs_NaN( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_abs_NaN_param_0]; +; 
CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_abs_NaN_param_1]; +; CHECK-NEXT: redux.sync.max.abs.NaN.f32 %f2, %f1, %r1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %val = call float @llvm.nvvm.redux.sync.fmax.abs.NaN(float %src, i32 %mask) + ret float %val +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits