[libclc] a247da4 - [libclc] update __clc_mem_fence: add MemorySemantic arg and use __builtin_amdgcn_fence for AMDGPU (#152275)

via cfe-commits Sun, 31 Aug 2025 20:04:09 -0700

Author: Wenju He
Date: 2025-09-01T11:03:45+08:00
New Revision: a247da4f9363116c54b91a37755edd994c56dbf8


URL: 
https://github.com/llvm/llvm-project/commit/a247da4f9363116c54b91a37755edd994c56dbf8
DIFF: 
https://github.com/llvm/llvm-project/commit/a247da4f9363116c54b91a37755edd994c56dbf8.diff

LOG: [libclc] update __clc_mem_fence: add MemorySemantic arg and use 
__builtin_amdgcn_fence for AMDGPU (#152275)

AMDGPU needs a MemorySemantic argument, which specifies the memory or
address space to which the memory ordering is applied.

The MemorySemantic is also necessary for implementing the SPIR-V
MemoryBarrier instruction. Additionally, the implementation of
__clc_mem_fence on Intel GPUs requires the MemorySemantic argument.

Using __builtin_amdgcn_fence for AMDGPU is a follow-up to
https://github.com/llvm/llvm-project/pull/151446#discussion_r2254006508

llvm-diff shows no change to nvptx64--nvidiacl.bc.

Added: 
    libclc/clc/include/clc/mem_fence/clc_mem_semantic.h

Modified: 
    libclc/clc/include/clc/mem_fence/clc_mem_fence.h
    libclc/clc/include/clc/synchronization/clc_work_group_barrier.h
    libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
    libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl
    libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
    libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl
    libclc/opencl/include/clc/opencl/synchronization/utils.h
    libclc/opencl/lib/amdgcn/mem_fence/fence.cl
    libclc/opencl/lib/amdgcn/synchronization/barrier.cl
    libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
    libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl

Removed: 
    


################################################################################
diff  --git a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h 
b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h
index 2321634c76842..0776caddde0d5 100644
--- a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h
+++ b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h
@@ -10,8 +10,10 @@
 #define __CLC_MEM_FENCE_CLC_MEM_FENCE_H__
 
 #include <clc/internal/clc.h>
+#include <clc/mem_fence/clc_mem_semantic.h>
 
-_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(int memory_scope,
-                                             int memory_order);
+_CLC_OVERLOAD _CLC_DECL void
+__clc_mem_fence(int memory_scope, int memory_order,
+                __CLC_MemorySemantics memory_semantics);
 
 #endif // __CLC_MEM_FENCE_CLC_MEM_FENCE_H__

diff  --git a/libclc/clc/include/clc/mem_fence/clc_mem_semantic.h 
b/libclc/clc/include/clc/mem_fence/clc_mem_semantic.h
new file mode 100644
index 0000000000000..4d9f5f1db8ee9
--- /dev/null
+++ b/libclc/clc/include/clc/mem_fence/clc_mem_semantic.h
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__
+#define __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__
+
+// The memory or address space to which the memory ordering is applied.
+typedef enum __CLC_MemorySemantics {
+  __CLC_MEMORY_PRIVATE = 1 << 0,
+  __CLC_MEMORY_GLOBAL = 1 << 1,
+  __CLC_MEMORY_CONSTANT = 1 << 2,
+  __CLC_MEMORY_LOCAL = 1 << 3,
+  __CLC_MEMORY_GENERIC = 1 << 4,
+} __CLC_MemorySemantics;
+
+#endif // __CLC_MEM_FENCE_CLC_MEM_SEMANTIC_H__

diff  --git a/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h 
b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h
index 5f864e1057b8b..34745bd47c068 100644
--- a/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h
+++ b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h
@@ -10,8 +10,10 @@
 #define __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__
 
 #include <clc/internal/clc.h>
+#include <clc/mem_fence/clc_mem_semantic.h>
 
-_CLC_OVERLOAD _CLC_DECL void __clc_work_group_barrier(int memory_scope,
-                                                      int memory_order);
+_CLC_OVERLOAD _CLC_DECL void
+__clc_work_group_barrier(int memory_scope, int memory_order,
+                         __CLC_MemorySemantics memory_semantics);
 
 #endif // __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__

diff  --git a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl 
b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
index 9e6460313718e..6d2a0962ba20d 100644
--- a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
+++ b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
@@ -8,30 +8,50 @@
 
 #include <clc/mem_fence/clc_mem_fence.h>
 
-void __clc_amdgcn_s_waitcnt(unsigned flags);
+#define BUILTIN_FENCE_ORDER(memory_order, ...)                                 
\
+  switch (memory_order) {                                                      
\
+  case __ATOMIC_ACQUIRE:                                                       
\
+    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, __VA_ARGS__);                     
\
+    break;                                                                     
\
+  case __ATOMIC_RELEASE:                                                       
\
+    __builtin_amdgcn_fence(__ATOMIC_RELEASE, __VA_ARGS__);                     
\
+    break;                                                                     
\
+  case __ATOMIC_ACQ_REL:                                                       
\
+    __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, __VA_ARGS__);                     
\
+    break;                                                                     
\
+  case __ATOMIC_SEQ_CST:                                                       
\
+    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, __VA_ARGS__);                     
\
+    break;                                                                     
\
+  default:                                                                     
\
+    __builtin_unreachable();                                                   
\
+  }                                                                            
\
+  break;
 
-// s_waitcnt takes 16bit argument with a combined number of maximum allowed
-// pending operations:
-// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages
-// [7] -- undefined
-// [6:4] -- exports, GDS, and mem write
-// [3:0] -- vector memory operations
+#define BUILTIN_FENCE(memory_scope, memory_order, ...)                         
\
+  switch (memory_scope) {                                                      
\
+  case __MEMORY_SCOPE_DEVICE:                                                  
\
+    BUILTIN_FENCE_ORDER(memory_order, "agent", ##__VA_ARGS__)                  
\
+  case __MEMORY_SCOPE_WRKGRP:                                                  
\
+    BUILTIN_FENCE_ORDER(memory_order, "workgroup", ##__VA_ARGS__)              
\
+  case __MEMORY_SCOPE_WVFRNT:                                                  
\
+    BUILTIN_FENCE_ORDER(memory_order, "wavefront", ##__VA_ARGS__)              
\
+  case __MEMORY_SCOPE_SINGLE:                                                  
\
+    BUILTIN_FENCE_ORDER(memory_order, "singlethread", ##__VA_ARGS__)           
\
+  case __MEMORY_SCOPE_SYSTEM:                                                  
\
+  default:                                                                     
\
+    BUILTIN_FENCE_ORDER(memory_order, "", ##__VA_ARGS__)                       
\
+  }
 
-// Newer clang supports __builtin_amdgcn_s_waitcnt
-#if __clang_major__ >= 5
-#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x)
-#else
-#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x)
-_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt");
-#endif
-
-_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope,
-                                            int memory_order) {
-  if (memory_scope & __MEMORY_SCOPE_DEVICE) {
-    // scalar loads are counted with LGKM but we don't know whether
-    // the compiler turned any loads to scalar
-    __waitcnt(0);
-  } else if (memory_scope & __MEMORY_SCOPE_WRKGRP)
-    __waitcnt(0xff); // LGKM is [12:8]
+_CLC_OVERLOAD _CLC_DEF void
+__clc_mem_fence(int memory_scope, int memory_order,
+                __CLC_MemorySemantics memory_semantics) {
+  if (memory_semantics == __CLC_MEMORY_LOCAL) {
+    BUILTIN_FENCE(memory_scope, memory_order, "local")
+  } else if (memory_semantics == __CLC_MEMORY_GLOBAL) {
+    BUILTIN_FENCE(memory_scope, memory_order, "global")
+  } else if (memory_semantics == (__CLC_MEMORY_LOCAL | __CLC_MEMORY_GLOBAL)) {
+    BUILTIN_FENCE(memory_scope, memory_order, "local", "global")
+  } else {
+    BUILTIN_FENCE(memory_scope, memory_order)
+  }
 }
-#undef __waitcnt

diff  --git a/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl 
b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl
index ff3628fa7c339..034e6e7bd8ed4 100644
--- a/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl
+++ b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl
@@ -9,8 +9,9 @@
 #include <clc/mem_fence/clc_mem_fence.h>
 #include <clc/synchronization/clc_work_group_barrier.h>
 
-_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope,
-                                                     int memory_order) {
-  __clc_mem_fence(memory_scope, memory_order);
+_CLC_OVERLOAD _CLC_DEF void
+__clc_work_group_barrier(int memory_scope, int memory_order,
+                         __CLC_MemorySemantics memory_semantics) {
+  __clc_mem_fence(memory_scope, memory_order, memory_semantics);
   __builtin_amdgcn_s_barrier();
 }

diff  --git a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl 
b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
index b3e2375e755a2..5f96ef5477642 100644
--- a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
+++ b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
@@ -8,8 +8,9 @@
 
 #include <clc/mem_fence/clc_mem_fence.h>
 
-_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope,
-                                            int memory_order) {
+_CLC_OVERLOAD _CLC_DEF void
+__clc_mem_fence(int memory_scope, int memory_order,
+                __CLC_MemorySemantics memory_semantics) {
   if (memory_scope & (__MEMORY_SCOPE_DEVICE | __MEMORY_SCOPE_WRKGRP))
     __nvvm_membar_cta();
 }

diff  --git 
a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl 
b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl
index 6cb37a38f06ac..349c0f4845132 100644
--- a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl
+++ b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl
@@ -8,7 +8,8 @@
 
 #include <clc/synchronization/clc_work_group_barrier.h>
 
-_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope,
-                                                     int memory_order) {
+_CLC_OVERLOAD _CLC_DEF void
+__clc_work_group_barrier(int memory_scope, int memory_order,
+                         __CLC_MemorySemantics memory_semantics) {
   __syncthreads();
 }

diff  --git a/libclc/opencl/include/clc/opencl/synchronization/utils.h 
b/libclc/opencl/include/clc/opencl/synchronization/utils.h
index cf3baf28cb5f1..a8841658598c1 100644
--- a/libclc/opencl/include/clc/opencl/synchronization/utils.h
+++ b/libclc/opencl/include/clc/opencl/synchronization/utils.h
@@ -10,9 +10,10 @@
 #define __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__
 
 #include <clc/internal/clc.h>
+#include <clc/mem_fence/clc_mem_semantic.h>
 #include <clc/opencl/synchronization/cl_mem_fence_flags.h>
 
-_CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) {
+_CLC_INLINE int __opencl_get_memory_scope(cl_mem_fence_flags flag) {
   int memory_scope = 0;
   if (flag & CLK_GLOBAL_MEM_FENCE)
     memory_scope |= __MEMORY_SCOPE_DEVICE;
@@ -21,4 +22,15 @@ _CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) {
   return memory_scope;
 }
 
+_CLC_INLINE __CLC_MemorySemantics
+__opencl_get_memory_semantics(cl_mem_fence_flags flag) {
+  if ((flag & CLK_LOCAL_MEM_FENCE) && (flag & CLK_GLOBAL_MEM_FENCE))
+    return __CLC_MEMORY_LOCAL | __CLC_MEMORY_GLOBAL;
+  if (flag & CLK_LOCAL_MEM_FENCE)
+    return __CLC_MEMORY_LOCAL;
+  if (flag & CLK_GLOBAL_MEM_FENCE)
+    return __CLC_MEMORY_GLOBAL;
+  __builtin_unreachable();
+}
+
 #endif // __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__

diff  --git a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl 
b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl
index 81216d6a26cf2..963380761b46c 100644
--- a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl
+++ b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl
@@ -11,9 +11,10 @@
 #include <clc/opencl/synchronization/utils.h>
 
 _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) {
-  int memory_scope = getCLCMemoryScope(flags);
+  int memory_scope = __opencl_get_memory_scope(flags);
   int memory_order = __ATOMIC_SEQ_CST;
-  __clc_mem_fence(memory_scope, memory_order);
+  __CLC_MemorySemantics memory_semantics = 
__opencl_get_memory_semantics(flags);
+  __clc_mem_fence(memory_scope, memory_order, memory_semantics);
 }
 
 // We don't have separate mechanism for read and write fences

diff  --git a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl 
b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl
index c8322e602302c..dd7d1507f5ad4 100644
--- a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl
+++ b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl
@@ -11,7 +11,8 @@
 #include <clc/synchronization/clc_work_group_barrier.h>
 
 _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) {
-  int memory_scope = getCLCMemoryScope(flags);
+  int memory_scope = __opencl_get_memory_scope(flags);
   int memory_order = __ATOMIC_SEQ_CST;
-  __clc_work_group_barrier(memory_scope, memory_order);
+  __CLC_MemorySemantics memory_semantics = 
__opencl_get_memory_semantics(flags);
+  __clc_work_group_barrier(memory_scope, memory_order, memory_semantics);
 }

diff  --git a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl 
b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
index e22ed870a7e6b..19721574e4053 100644
--- a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
+++ b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
@@ -11,9 +11,10 @@
 #include <clc/opencl/synchronization/utils.h>
 
 _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) {
-  int memory_scope = getCLCMemoryScope(flags);
+  int memory_scope = __opencl_get_memory_scope(flags);
   int memory_order = __ATOMIC_SEQ_CST;
-  __clc_mem_fence(memory_scope, memory_order);
+  __CLC_MemorySemantics memory_semantics = 
__opencl_get_memory_semantics(flags);
+  __clc_mem_fence(memory_scope, memory_order, memory_semantics);
 }
 
 // We do not have separate mechanism for read and write fences.

diff  --git a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl 
b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl
index c8322e602302c..dd7d1507f5ad4 100644
--- a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl
+++ b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl
@@ -11,7 +11,8 @@
 #include <clc/synchronization/clc_work_group_barrier.h>
 
 _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) {
-  int memory_scope = getCLCMemoryScope(flags);
+  int memory_scope = __opencl_get_memory_scope(flags);
   int memory_order = __ATOMIC_SEQ_CST;
-  __clc_work_group_barrier(memory_scope, memory_order);
+  __CLC_MemorySemantics memory_semantics = 
__opencl_get_memory_semantics(flags);
+  __clc_work_group_barrier(memory_scope, memory_order, memory_semantics);
 }


        
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[libclc] a247da4 - [libclc] update __clc_mem_fence: add MemorySemantic arg and use __builtin_amdgcn_fence for AMDGPU (#152275)

Reply via email to