Author: Wenju He Date: 2025-09-01T11:03:45+08:00 New Revision: a247da4f9363116c54b91a37755edd994c56dbf8
URL: https://github.com/llvm/llvm-project/commit/a247da4f9363116c54b91a37755edd994c56dbf8 DIFF: https://github.com/llvm/llvm-project/commit/a247da4f9363116c54b91a37755edd994c56dbf8.diff LOG: [libclc] update __clc_mem_fence: add MemorySemantic arg and use __builtin_amdgcn_fence for AMDGPU (#152275) A MemorySemantic argument must be added for AMDGPU; it specifies the memory or address space to which the memory ordering is applied. The MemorySemantic argument is also necessary for implementing the SPIR-V MemoryBarrier instruction. Additionally, the implementation of __clc_mem_fence on Intel GPUs requires the MemorySemantic argument. Using __builtin_amdgcn_fence for AMDGPU is a follow-up to https://github.com/llvm/llvm-project/pull/151446#discussion_r2254006508 llvm-diff shows no change to nvptx64--nvidiacl.bc. Added: libclc/clc/include/clc/mem_fence/clc_mem_semantic.h Modified: libclc/clc/include/clc/mem_fence/clc_mem_fence.h libclc/clc/include/clc/synchronization/clc_work_group_barrier.h libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl libclc/opencl/include/clc/opencl/synchronization/utils.h libclc/opencl/lib/amdgcn/mem_fence/fence.cl libclc/opencl/lib/amdgcn/synchronization/barrier.cl libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl Removed: ################################################################################ diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h index 2321634c76842..0776caddde0d5 100644 --- a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h +++ b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h @@ -10,8 +10,10 @@ #define __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ #include <clc/internal/clc.h> +#include <clc/mem_fence/clc_mem_semantic.h> -_CLC_OVERLOAD 
_CLC_DECL void __clc_mem_fence(int memory_scope, - int memory_order); +_CLC_OVERLOAD _CLC_DECL void +__clc_mem_fence(int memory_scope, int memory_order, + __CLC_MemorySemantics memory_semantics); #endif // __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_semantic.h b/libclc/clc/include/clc/mem_fence/clc_mem_semantic.h new file mode 100644 index 0000000000000..4d9f5f1db8ee9 --- /dev/null +++ b/libclc/clc/include/clc/mem_fence/clc_mem_semantic.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__ +#define __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__ + +// The memory or address space to which the memory ordering is applied. 
+typedef enum __CLC_MemorySemantics { + __CLC_MEMORY_PRIVATE = 1 << 0, + __CLC_MEMORY_GLOBAL = 1 << 1, + __CLC_MEMORY_CONSTANT = 1 << 2, + __CLC_MEMORY_LOCAL = 1 << 3, + __CLC_MEMORY_GENERIC = 1 << 4, +} __CLC_MemorySemantics; + +#endif // __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__ diff --git a/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h index 5f864e1057b8b..34745bd47c068 100644 --- a/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h +++ b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h @@ -10,8 +10,10 @@ #define __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__ #include <clc/internal/clc.h> +#include <clc/mem_fence/clc_mem_semantic.h> -_CLC_OVERLOAD _CLC_DECL void __clc_work_group_barrier(int memory_scope, - int memory_order); +_CLC_OVERLOAD _CLC_DECL void +__clc_work_group_barrier(int memory_scope, int memory_order, + __CLC_MemorySemantics memory_semantics); #endif // __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__ diff --git a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl index 9e6460313718e..6d2a0962ba20d 100644 --- a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl +++ b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl @@ -8,30 +8,50 @@ #include <clc/mem_fence/clc_mem_fence.h> -void __clc_amdgcn_s_waitcnt(unsigned flags); +#define BUILTIN_FENCE_ORDER(memory_order, ...) 
\ + switch (memory_order) { \ + case __ATOMIC_ACQUIRE: \ + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, __VA_ARGS__); \ + break; \ + case __ATOMIC_RELEASE: \ + __builtin_amdgcn_fence(__ATOMIC_RELEASE, __VA_ARGS__); \ + break; \ + case __ATOMIC_ACQ_REL: \ + __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, __VA_ARGS__); \ + break; \ + case __ATOMIC_SEQ_CST: \ + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, __VA_ARGS__); \ + break; \ + default: \ + __builtin_unreachable(); \ + } \ + break; -// s_waitcnt takes 16bit argument with a combined number of maximum allowed -// pending operations: -// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages -// [7] -- undefined -// [6:4] -- exports, GDS, and mem write -// [3:0] -- vector memory operations +#define BUILTIN_FENCE(memory_scope, memory_order, ...) \ + switch (memory_scope) { \ + case __MEMORY_SCOPE_DEVICE: \ + BUILTIN_FENCE_ORDER(memory_order, "agent", ##__VA_ARGS__) \ + case __MEMORY_SCOPE_WRKGRP: \ + BUILTIN_FENCE_ORDER(memory_order, "workgroup", ##__VA_ARGS__) \ + case __MEMORY_SCOPE_WVFRNT: \ + BUILTIN_FENCE_ORDER(memory_order, "wavefront", ##__VA_ARGS__) \ + case __MEMORY_SCOPE_SINGLE: \ + BUILTIN_FENCE_ORDER(memory_order, "singlethread", ##__VA_ARGS__) \ + case __MEMORY_SCOPE_SYSTEM: \ + default: \ + BUILTIN_FENCE_ORDER(memory_order, "", ##__VA_ARGS__) \ + } -// Newer clang supports __builtin_amdgcn_s_waitcnt -#if __clang_major__ >= 5 -#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x) -#else -#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x) -_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt"); -#endif - -_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, - int memory_order) { - if (memory_scope & __MEMORY_SCOPE_DEVICE) { - // scalar loads are counted with LGKM but we don't know whether - // the compiler turned any loads to scalar - __waitcnt(0); - } else if (memory_scope & __MEMORY_SCOPE_WRKGRP) - __waitcnt(0xff); // LGKM is [12:8] +_CLC_OVERLOAD _CLC_DEF void +__clc_mem_fence(int memory_scope, 
int memory_order, + __CLC_MemorySemantics memory_semantics) { + if (memory_semantics == __CLC_MEMORY_LOCAL) { + BUILTIN_FENCE(memory_scope, memory_order, "local") + } else if (memory_semantics == __CLC_MEMORY_GLOBAL) { + BUILTIN_FENCE(memory_scope, memory_order, "global") + } else if (memory_semantics == (__CLC_MEMORY_LOCAL | __CLC_MEMORY_GLOBAL)) { + BUILTIN_FENCE(memory_scope, memory_order, "local", "global") + } else { + BUILTIN_FENCE(memory_scope, memory_order) + } } -#undef __waitcnt diff --git a/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl index ff3628fa7c339..034e6e7bd8ed4 100644 --- a/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl +++ b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl @@ -9,8 +9,9 @@ #include <clc/mem_fence/clc_mem_fence.h> #include <clc/synchronization/clc_work_group_barrier.h> -_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope, - int memory_order) { - __clc_mem_fence(memory_scope, memory_order); +_CLC_OVERLOAD _CLC_DEF void +__clc_work_group_barrier(int memory_scope, int memory_order, + __CLC_MemorySemantics memory_semantics) { + __clc_mem_fence(memory_scope, memory_order, memory_semantics); __builtin_amdgcn_s_barrier(); } diff --git a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl index b3e2375e755a2..5f96ef5477642 100644 --- a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl +++ b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl @@ -8,8 +8,9 @@ #include <clc/mem_fence/clc_mem_fence.h> -_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, - int memory_order) { +_CLC_OVERLOAD _CLC_DEF void +__clc_mem_fence(int memory_scope, int memory_order, + __CLC_MemorySemantics memory_semantics) { if (memory_scope & (__MEMORY_SCOPE_DEVICE | __MEMORY_SCOPE_WRKGRP)) __nvvm_membar_cta(); } diff --git 
a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl index 6cb37a38f06ac..349c0f4845132 100644 --- a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl +++ b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl @@ -8,7 +8,8 @@ #include <clc/synchronization/clc_work_group_barrier.h> -_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope, - int memory_order) { +_CLC_OVERLOAD _CLC_DEF void +__clc_work_group_barrier(int memory_scope, int memory_order, + __CLC_MemorySemantics memory_semantics) { __syncthreads(); } diff --git a/libclc/opencl/include/clc/opencl/synchronization/utils.h b/libclc/opencl/include/clc/opencl/synchronization/utils.h index cf3baf28cb5f1..a8841658598c1 100644 --- a/libclc/opencl/include/clc/opencl/synchronization/utils.h +++ b/libclc/opencl/include/clc/opencl/synchronization/utils.h @@ -10,9 +10,10 @@ #define __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ #include <clc/internal/clc.h> +#include <clc/mem_fence/clc_mem_semantic.h> #include <clc/opencl/synchronization/cl_mem_fence_flags.h> -_CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) { +_CLC_INLINE int __opencl_get_memory_scope(cl_mem_fence_flags flag) { int memory_scope = 0; if (flag & CLK_GLOBAL_MEM_FENCE) memory_scope |= __MEMORY_SCOPE_DEVICE; @@ -21,4 +22,15 @@ _CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) { return memory_scope; } +_CLC_INLINE __CLC_MemorySemantics +__opencl_get_memory_semantics(cl_mem_fence_flags flag) { + if ((flag & CLK_LOCAL_MEM_FENCE) && (flag & CLK_GLOBAL_MEM_FENCE)) + return __CLC_MEMORY_LOCAL | __CLC_MEMORY_GLOBAL; + if (flag & CLK_LOCAL_MEM_FENCE) + return __CLC_MEMORY_LOCAL; + if (flag & CLK_GLOBAL_MEM_FENCE) + return __CLC_MEMORY_GLOBAL; + __builtin_unreachable(); +} + #endif // __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ diff --git a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl 
b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl index 81216d6a26cf2..963380761b46c 100644 --- a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl +++ b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl @@ -11,9 +11,10 @@ #include <clc/opencl/synchronization/utils.h> _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) { - int memory_scope = getCLCMemoryScope(flags); + int memory_scope = __opencl_get_memory_scope(flags); int memory_order = __ATOMIC_SEQ_CST; - __clc_mem_fence(memory_scope, memory_order); + __CLC_MemorySemantics memory_semantics = __opencl_get_memory_semantics(flags); + __clc_mem_fence(memory_scope, memory_order, memory_semantics); } // We don't have separate mechanism for read and write fences diff --git a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl index c8322e602302c..dd7d1507f5ad4 100644 --- a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl +++ b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl @@ -11,7 +11,8 @@ #include <clc/synchronization/clc_work_group_barrier.h> _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) { - int memory_scope = getCLCMemoryScope(flags); + int memory_scope = __opencl_get_memory_scope(flags); int memory_order = __ATOMIC_SEQ_CST; - __clc_work_group_barrier(memory_scope, memory_order); + __CLC_MemorySemantics memory_semantics = __opencl_get_memory_semantics(flags); + __clc_work_group_barrier(memory_scope, memory_order, memory_semantics); } diff --git a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl index e22ed870a7e6b..19721574e4053 100644 --- a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl +++ b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl @@ -11,9 +11,10 @@ #include <clc/opencl/synchronization/utils.h> _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) { - int memory_scope = getCLCMemoryScope(flags); + int memory_scope = __opencl_get_memory_scope(flags); int 
memory_order = __ATOMIC_SEQ_CST; - __clc_mem_fence(memory_scope, memory_order); + __CLC_MemorySemantics memory_semantics = __opencl_get_memory_semantics(flags); + __clc_mem_fence(memory_scope, memory_order, memory_semantics); } // We do not have separate mechanism for read and write fences. diff --git a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl index c8322e602302c..dd7d1507f5ad4 100644 --- a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl +++ b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl @@ -11,7 +11,8 @@ #include <clc/synchronization/clc_work_group_barrier.h> _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) { - int memory_scope = getCLCMemoryScope(flags); + int memory_scope = __opencl_get_memory_scope(flags); int memory_order = __ATOMIC_SEQ_CST; - __clc_work_group_barrier(memory_scope, memory_order); + __CLC_MemorySemantics memory_semantics = __opencl_get_memory_semantics(flags); + __clc_work_group_barrier(memory_scope, memory_order, memory_semantics); } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits