https://github.com/vitor1001 updated https://github.com/llvm/llvm-project/pull/143664
From f84189201a974dca4cdf29541bfa6877de0056e7 Mon Sep 17 00:00:00 2001
From: Vitor Sessak <vses...@google.com>
Date: Wed, 11 Jun 2025 08:21:48 +0000
Subject: [PATCH] Add missing intrinsics to CUDA headers.

LLVM prevents the sm_32_intrinsics.hpp header from being included by
pre-defining its include guard, __SM_32_INTRINSICS_HPP__, and provides
drop-in replacements for the functions defined in that CUDA header.

One issue is that some intrinsics were added to the CUDA header after the
replacements were written, and thus have no replacement, breaking code that
calls them (Raft is one example).

This CL backports the code from sm_32_intrinsics.hpp for the missing
intrinsics.

---
 clang/lib/Headers/__clang_cuda_intrinsics.h | 269 ++++++++++++++++++++
 1 file changed, 269 insertions(+)

diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 8b230af6f6647..cf3f2ceba1e0a 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -479,6 +479,275 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
   return ret;
 }
 
+#define INTRINSIC_LOAD(func_name, asm_op, decl_type, internal_type, asm_type) \
+inline __device__ decl_type func_name(const decl_type *ptr) { \
+  internal_type ret; \
+  asm(asm_op" %0, [%1];" : asm_type(ret) : "l"(ptr)); \
+  return (decl_type)ret; \
+}
+
+#define INTRINSIC_LOAD2(func_name, asm_op, decl_type, internal_type, asm_type) \
+inline __device__ decl_type func_name(const decl_type *ptr) { \
+  decl_type ret; \
+  internal_type tmp; \
+  asm(asm_op" {%0,%1}, [%2];" \
+      : asm_type(tmp.x), asm_type(tmp.y) \
+      : "l"(ptr)); \
+  using element_type = decltype(ret.x); \
+  ret.x = (element_type)tmp.x; \
+  ret.y = (element_type)tmp.y; \
+  return ret; \
+}
+
+#define INTRINSIC_LOAD4(func_name, asm_op, decl_type, internal_type, asm_type) \
+inline __device__ decl_type func_name(const decl_type *ptr) { \
+  decl_type ret; \
+  internal_type tmp; \
+  asm(asm_op" {%0,%1,%2,%3}, [%4];" \
+      : asm_type(tmp.x), asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w) \
+      : "l"(ptr)); \
+  using element_type = decltype(ret.x); \
+  ret.x = (element_type)tmp.x; \
+  ret.y = (element_type)tmp.y; \
+  ret.z = (element_type)tmp.z; \
+  ret.w = (element_type)tmp.w; \
+  return ret; \
+}
+
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long, "=l");
+
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64", longlong2, longlong2, "=l");
+
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
+               "=h");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
+               unsigned long long, "=l");
+
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2, "=l");
+
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f");
+INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f");
+INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f");
+INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d");
+
+inline __device__ long __ldcg(const long *ptr) {
+  unsigned long ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cg.s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
+  } else {
+    asm("ld.global.cg.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
+  }
+  return (long)ret;
+}
+
+#define MINTRINSIC_LOAD(func_name, asm_op, decl_type, internal_type, asm_type) \
+inline __device__ decl_type func_name(const decl_type *ptr) { \
+  internal_type ret; \
+  asm(asm_op" %0, [%1];" : asm_type(ret) : "l"(ptr) : "memory"); \
+  return (decl_type)ret; \
+}
+
+#define MINTRINSIC_LOAD2(func_name, asm_op, decl_type, internal_type, asm_type) \
+inline __device__ decl_type func_name(const decl_type *ptr) { \
+  decl_type ret; \
+  internal_type tmp; \
+  asm(asm_op" {%0,%1}, [%2];" \
+      : asm_type(tmp.x), asm_type(tmp.y) \
+      : "l"(ptr) : "memory"); \
+  using element_type = decltype(ret.x); \
+  ret.x = (element_type)tmp.x; \
+  ret.y = (element_type)tmp.y; \
+  return ret; \
+}
+
+#define MINTRINSIC_LOAD4(func_name, asm_op, decl_type, internal_type, asm_type) \
+inline __device__ decl_type func_name(const decl_type *ptr) { \
+  decl_type ret; \
+  internal_type tmp; \
+  asm(asm_op" {%0,%1,%2,%3}, [%4];" \
+      : asm_type(tmp.x), asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w) \
+      : "l"(ptr) : "memory"); \
+  using element_type = decltype(ret.x); \
+  ret.x = (element_type)tmp.x; \
+  ret.y = (element_type)tmp.y; \
+  ret.z = (element_type)tmp.z; \
+  ret.w = (element_type)tmp.w; \
+  return ret; \
+}
+
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int, "=r");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
+                "=h");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int, "=r");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
+                unsigned long long, "=l");
+
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int, "=r");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int, "=r");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short, "=h");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int, "=r");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
+                "=l");
+
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2, "=r");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4, "=r");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2, "=h");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4, "=h");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2, "=r");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4, "=r");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2, "=l");
+
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2, "=h");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4, "=h");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2, "=l");
+
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f");
+MINTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d");
+
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2, "=f");
+MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4, "=f");
+MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2, "=d");
+
+inline __device__ long __ldcv(const long *ptr) {
+  unsigned long ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cv.s64 %0, [%1];" : "=l"(ret) : "l"(ptr) : "memory");
+  } else {
+    asm("ld.global.cv.s32 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory");
+  }
+  return (long)ret;
+}
+
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long, "=l");
+
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l");
+
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
+               "=h");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int, "=r");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
+               unsigned long long, "=l");
+
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2, "=l");
+
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f");
+INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f");
+INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f");
+INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d");
+
+inline __device__ long __ldcs(const long *ptr) {
+  unsigned long ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cs.s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
+  } else {
+    asm("ld.global.cs.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
+  }
+  return (long)ret;
+}
+
+#define INTRINSIC_STORE(func_name, asm_op, decl_type, internal_type, asm_type) \
+inline __device__ void func_name(decl_type *ptr, decl_type value) { \
+  internal_type tmp = (internal_type)value; \
asm(asm_op" [%0], %1;" ::"l"(ptr), asm_type(tmp) : "memory"); \ +} + +#define INTRINSIC_STORE2(func_name, asm_op, decl_type, internal_type, asm_type) \ +inline __device__ void func_name(decl_type *ptr, decl_type value) { \ + internal_type tmp; \ + using element_type = decltype(tmp.x); \ + tmp.x = (element_type)(value.x); \ + tmp.y = (element_type)(value.y); \ + asm(asm_op" [%0], {%1,%2};" ::"l"(ptr), asm_type(tmp.x), asm_type(tmp.y) \ + : "memory"); \ +} + +#define INTRINSIC_STORE4(func_name, asm_op, decl_type, internal_type, asm_type) \ +inline __device__ void func_name(decl_type *ptr, decl_type value) { \ + internal_type tmp; \ + using element_type = decltype(tmp.x); \ + tmp.x = (element_type)(value.x); \ + tmp.y = (element_type)(value.y); \ + tmp.z = (element_type)(value.z); \ + tmp.w = (element_type)(value.w); \ + asm(asm_op" [%0], {%1,%2,%3,%4};" ::"l"(ptr), asm_type(tmp.x), \ + asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w) \ + : "memory"); \ +} + +INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r"); +INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r"); +INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h"); +INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r"); +INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l"); + +INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r"); +INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r"); +INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h"); +INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h"); +INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r"); +INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r"); +INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l"); + +INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r"); +INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short, + "h"); +INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r"); +INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long, + unsigned long long, "l"); + +INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r"); +INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r"); +INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h"); +INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h"); +INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r"); +INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r"); +INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l"); + +INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f"); +INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d"); +INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f"); +INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f"); +INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d"); + #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 #if CUDA_VERSION >= 11000 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits