https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/96872
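With both patches applied, the f32/f64 global fadd builtins lower to a plain
atomicrmw instead of a call to the target intrinsic, roughly as follows
(illustrative sketch taken from the updated gfx90a/gfx11 tests; the %addr/%x
value names and the !0 metadata index are placeholders):

    // OpenCL source
    void test_global_add_f32(float *rtn, global float *addr, float x) {
      *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
    }

    ; approximate IR after this change
    %ret = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0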
>From 2e27b153cf40498f64ef9f13b69e80804c45a6a4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Tue, 11 Jun 2024 10:58:44 +0200
Subject: [PATCH 1/2] clang/AMDGPU: Emit atomicrmw for
 __builtin_amdgcn_global_atomic_fadd_{f32|f64}

Need to emit syncscope and new metadata to get the native instruction,
most of the time.

---
 clang/lib/CodeGen/CGBuiltin.cpp                    | 39 +++++++++++++------
 .../CodeGenOpenCL/builtins-amdgcn-gfx11.cl         |  2 +-
 .../builtins-fp-atomics-gfx12.cl                   |  4 +-
 .../builtins-fp-atomics-gfx90a.cl                  |  4 +-
 .../builtins-fp-atomics-gfx940.cl                  |  4 +-
 5 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 0c2ee446aa303..02f85f340893d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -18776,8 +18777,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18789,18 +18788,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID IID;
     llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
     switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
-      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
       ArgTy = llvm::FixedVectorType::get(
           llvm::Type::getHalfTy(getLLVMContext()), 2);
       IID = Intrinsic::amdgcn_global_atomic_fadd;
       break;
-    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
-      IID = Intrinsic::amdgcn_global_atomic_fadd;
-      break;
     case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
       IID = Intrinsic::amdgcn_global_atomic_fmin;
       break;
@@ -19223,7 +19215,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19239,6 +19233,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
     case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19273,8 +19269,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
                               EmitScalarExpr(E->getArg(3)), AO, SSID);
     } else {
-      // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
-      SSID = llvm::SyncScope::System;
+      // Most of the builtins do not have syncscope/order arguments. For DS
+      // atomics the scope doesn't really matter, as they implicitly operate at
+      // workgroup scope.
+      //
+      // The global/flat cases need to use agent scope to consistently produce
+      // the native instruction instead of a cmpxchg expansion.
+      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
       AO = AtomicOrdering::SequentiallyConsistent;

       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19289,6 +19290,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
     if (Volatile)
       RMW->setVolatile(true);
+
+    unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+    if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+      // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+      // instruction for flat and global operations.
+      llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+      RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+      // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+      // instruction, but this only matters for float fadd.
+      if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+        RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+    }
+
     return Builder.CreateBitCast(RMW, OrigTy);
   }
   case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index f444657463568..e8889f57432f5 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
 }

 // CHECK-LABEL: @test_global_add_f32
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
 void test_global_add_f32(float *rtn, global float *addr, float x) {
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 63381942eaba5..21c1c38bc78dc 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;

 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

 // GFX12-LABEL: test_local_add_2bf16
@@ -22,7 +22,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {

 // CHECK-LABEL: test_local_add_2bf16_noret
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

 // GFX12-LABEL: test_local_add_2bf16_noret
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index e2117f11858f7..3778f65feaad4 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -9,7 +9,7 @@ typedef half __attribute__((ext_vector_type(2))) half2;

 // CHECK-LABEL: test_global_add_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
 // GFX90A-LABEL: test_global_add_f64$local:
 // GFX90A: global_atomic_add_f64
 void test_global_add_f64(__global double *addr, double x) {
@@ -117,7 +117,7 @@ void test_ds_addf_local_f32(__local float *addr, float x){
 }

 // CHECK-LABEL: @test_global_add_f32
-// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
 void test_global_add_f32(float *rtn, global float *addr, float x) {
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index 92a33ceac2290..d4ca32b9b3cd6 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -44,7 +44,7 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {

 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4{{$}}
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

 // GFX940-LABEL: test_local_add_2bf16
@@ -70,7 +70,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }

 // CHECK-LABEL: @test_global_add_f32
-// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
 void test_global_add_f32(float *rtn, global float *addr, float x) {
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }

>From 9352ace1c597c5a6cc827cd8c57a69df9143dc89 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Thu, 27 Jun 2024 16:24:04 +0200
Subject: [PATCH 2/2] Use monotonic ordering

---
 clang/lib/CodeGen/CGBuiltin.cpp                        | 2 +-
 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl      | 2 +-
 clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl  | 8 ++++----
 clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl   | 4 ++--
 clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl | 8 ++++----
 clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl | 8 ++++----
 6 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 02f85f340893d..ad4cce77221a6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19276,7 +19276,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       // The global/flat cases need to use agent scope to consistently produce
       // the native instruction instead of a cmpxchg expansion.
       SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
-      AO = AtomicOrdering::SequentiallyConsistent;
+      AO = AtomicOrdering::Monotonic;

       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
       if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index e8889f57432f5..138616ccca718 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
 }

 // CHECK-LABEL: @test_global_add_f32
-// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
 void test_global_add_f32(float *rtn, global float *addr, float x) {
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 21c1c38bc78dc..6b8a6d14575db 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;

 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

 // GFX12-LABEL: test_local_add_2bf16
@@ -22,7 +22,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {

 // CHECK-LABEL: test_local_add_2bf16_noret
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

 // GFX12-LABEL: test_local_add_2bf16_noret
@@ -32,7 +32,7 @@ void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
 }

 // CHECK-LABEL: test_local_add_2f16
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
 // GFX12-LABEL: test_local_add_2f16
 // GFX12: ds_pk_add_rtn_f16
 half2 test_local_add_2f16(__local half2 *addr, half2 x) {
@@ -40,7 +40,7 @@ half2 test_local_add_2f16(__local half2 *addr, half2 x) {
 }

 // CHECK-LABEL: test_local_add_2f16_noret
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
 // GFX12-LABEL: test_local_add_2f16_noret
 // GFX12: ds_pk_add_f16
 void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
index ad4d0b7af3d4b..2f00977ec6014 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
@@ -6,7 +6,7 @@
 // REQUIRES: amdgpu-registered-target

 // CHECK-LABEL: test_fadd_local
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
 // GFX8-LABEL: test_fadd_local$local:
 // GFX8: ds_add_rtn_f32 v2, v0, v1
 // GFX8: s_endpgm
@@ -16,7 +16,7 @@ kernel void test_fadd_local(__local float *ptr, float val){
 }

 // CHECK-LABEL: test_fadd_local_volatile
-// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
 kernel void test_fadd_local_volatile(volatile __local float *ptr, float val){
   volatile float *res;
   *res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 3778f65feaad4..c525c250c937c 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -9,7 +9,7 @@ typedef half __attribute__((ext_vector_type(2))) half2;

 // CHECK-LABEL: test_global_add_f64
-// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
 // GFX90A-LABEL: test_global_add_f64$local:
 // GFX90A: global_atomic_add_f64
 void test_global_add_f64(__global double *addr, double x) {
@@ -99,7 +99,7 @@ void test_flat_global_max_f64(__global double *addr, double x){
 }

 // CHECK-LABEL: test_ds_add_local_f64
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} seq_cst, align 8
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} monotonic, align 8
 // GFX90A: test_ds_add_local_f64$local
 // GFX90A: ds_add_rtn_f64
 void test_ds_add_local_f64(__local double *addr, double x){
@@ -108,7 +108,7 @@ void test_ds_add_local_f64(__local double *addr, double x){
 }

 // CHECK-LABEL: test_ds_addf_local_f32
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
 // GFX90A-LABEL: test_ds_addf_local_f32$local
 // GFX90A: ds_add_rtn_f32
 void test_ds_addf_local_f32(__local float *addr, float x){
@@ -117,7 +117,7 @@ void test_ds_addf_local_f32(__local float *addr, float x){
 }

 // CHECK-LABEL: @test_global_add_f32
-// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
 void test_global_add_f32(float *rtn, global float *addr, float x) {
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index d4ca32b9b3cd6..5481138b9fee4 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -44,7 +44,7 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {

 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4{{$}}
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4{{$}}
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

 // GFX940-LABEL: test_local_add_2bf16
@@ -54,7 +54,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
 }

 // CHECK-LABEL: test_local_add_2f16
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
 // GFX940-LABEL: test_local_add_2f16
 // GFX940: ds_pk_add_rtn_f16
 half2 test_local_add_2f16(__local half2 *addr, half2 x) {
@@ -62,7 +62,7 @@ half2 test_local_add_2f16(__local half2 *addr, half2 x) {
 }

 // CHECK-LABEL: test_local_add_2f16_noret
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
 // GFX940-LABEL: test_local_add_2f16_noret
 // GFX940: ds_pk_add_f16
 void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
@@ -70,7 +70,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
 }

 // CHECK-LABEL: @test_global_add_f32
-// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
 void test_global_add_f32(float *rtn, global float *addr, float x) {
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits