https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/96873
>From 367f6897698f22c30cb7491d90ae0251bfa57af1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Wed, 26 Jun 2024 19:12:59 +0200 Subject: [PATCH] clang/AMDGPU: Emit atomicrmw from {global|flat}_atomic_fadd_v2f16 builtins --- clang/lib/CodeGen/CGBuiltin.cpp | 20 ++++++------------- .../builtins-fp-atomics-gfx12.cl | 9 ++++++--- .../builtins-fp-atomics-gfx90a.cl | 2 +- .../builtins-fp-atomics-gfx940.cl | 3 ++- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ad4cce77221a6..ef4bd9fb4af09 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18777,22 +18777,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() }); return Builder.CreateCall(F, { Src0, Builder.getFalse() }); } - case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64: case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: - case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: - case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: { + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: { Intrinsic::ID IID; llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext()); switch (BuiltinID) { - case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: - ArgTy = llvm::FixedVectorType::get( - llvm::Type::getHalfTy(getLLVMContext()), 2); - IID = Intrinsic::amdgcn_global_atomic_fadd; - break; case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64: IID = Intrinsic::amdgcn_global_atomic_fmin; break; @@ -18812,11 +18805,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, ArgTy = llvm::Type::getFloatTy(getLLVMContext()); IID = Intrinsic::amdgcn_flat_atomic_fadd; break; - case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: - ArgTy = llvm::FixedVectorType::get( - llvm::Type::getHalfTy(getLLVMContext()), 2); - IID = Intrinsic::amdgcn_flat_atomic_fadd; - break; } llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); llvm::Value *Val = EmitScalarExpr(E->getArg(1)); @@ -19217,7 +19205,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_ds_fminf: case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32: - case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: { + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: { llvm::AtomicRMWInst::BinOp BinOp; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_atomic_inc32: @@ -19235,6 +19225,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32: case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: BinOp = llvm::AtomicRMWInst::FAdd; break; case AMDGPU::BI__builtin_amdgcn_ds_fminf: diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl index 6b8a6d14575db..07e63a8711c7f 100644 --- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl @@ -48,7 +48,8 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) { } // CHECK-LABEL: test_flat_add_2f16 -// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %{{.*}}, <2 x half> %{{.*}}) +// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // GFX12-LABEL: test_flat_add_2f16 // GFX12: flat_atomic_pk_add_f16 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) { @@ -64,7 +65,8 @@ short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) { } // CHECK-LABEL: test_global_add_half2 -// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}}) +// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // GFX12-LABEL: test_global_add_half2 // GFX12: global_atomic_pk_add_f16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN void test_global_add_half2(__global half2 *addr, half2 x) { @@ -73,7 +75,8 @@ void test_global_add_half2(__global half2 *addr, half2 x) { } // CHECK-LABEL: test_global_add_half2_noret -// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}}) +// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // GFX12-LABEL: test_global_add_half2_noret // GFX12: global_atomic_pk_add_f16 v[0:1], v2, off void test_global_add_half2_noret(__global half2 *addr, half2 x) { diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl index c525c250c937c..cd10777dbe079 100644 --- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl +++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl @@ -18,7 +18,7 @@ void test_global_add_f64(__global double *addr, double x) { } // CHECK-LABEL: test_global_add_half2 -// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}}) +// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x half> %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // GFX90A-LABEL: test_global_add_half2 // GFX90A: global_atomic_pk_add_f16 v2, v[0:1], v2, off glc void test_global_add_half2(__global half2 *addr, half2 x) { diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl index 5481138b9fee4..589dcd406630d 100644 --- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl +++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl @@ -18,7 +18,8 @@ half2 test_flat_add_f32(__generic float *addr, float x) { } // CHECK-LABEL: test_flat_add_2f16 -// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %{{.*}}, <2 x half> %{{.*}}) +// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // GFX940-LABEL: test_flat_add_2f16 // GFX940: flat_atomic_pk_add_f16 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) { _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits