llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-clang

Author: None (vitor1001)

<details>
<summary>Changes</summary>

LLVM prevents the sm_32_intrinsics.hpp header from being included with a 
#define __SM_32_INTRINSICS_HPP__. It also provides drop-in replacements of the 
functions defined in the CUDA header.

One issue is that some intrinsics were added after the replacement was written, 
and thus have no replacement, breaking code that calls them (Raft is one 
example).

This CL backport the code from sm_32_intrinsics.hpp for the missing intrinsics.


---
Full diff: https://github.com/llvm/llvm-project/pull/143664.diff


1 Files Affected:

- (modified) clang/lib/Headers/__clang_cuda_intrinsics.h (+434) 


``````````diff
diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h 
b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 8b230af6f6647..96f4f18d99128 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -479,6 +479,440 @@ inline __device__ unsigned __funnelshift_rc(unsigned 
low32, unsigned high32,
   return ret;
 }
 
+inline __device__ char __ldcg(const char *ptr) {
+  unsigned int ret;
+  asm("ld.global.cg.s8 %0, [%1];" : "=r"(ret) : "l"(ptr));
+  return (char)ret;
+}
+inline __device__ signed char __ldcg(const signed char *ptr) {
+  unsigned int ret;
+  asm("ld.global.cg.s8 %0, [%1];" : "=r"(ret) : "l"(ptr));
+  return (signed char)ret;
+}
+inline __device__ short __ldcg(const short *ptr) {
+  unsigned short ret;
+  asm("ld.global.cg.s16 %0, [%1];" : "=h"(ret) : "l"(ptr));
+  return (short)ret;
+}
+inline __device__ int __ldcg(const int *ptr) {
+  unsigned int ret;
+  asm("ld.global.cg.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
+  return (int)ret;
+}
+inline __device__ long long __ldcg(const long long *ptr) {
+  unsigned long long ret;
+  asm("ld.global.cg.s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
+  return (long long)ret;
+}
+inline __device__ char2 __ldcg(const char2 *ptr) {
+  char2 ret;
+  int2 tmp;
+  asm("ld.global.cg.v2.s8 {%0,%1}, [%2];"
+      : "=r"(tmp.x), "=r"(tmp.y)
+      : "l"(ptr));
+  ret.x = (char)tmp.x;
+  ret.y = (char)tmp.y;
+  return ret;
+}
+inline __device__ char4 __ldcg(const char4 *ptr) {
+  char4 ret;
+  int4 tmp;
+  asm("ld.global.cg.v4.s8 {%0,%1,%2,%3}, [%4];"
+      : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w)
+      : "l"(ptr));
+  ret.x = (char)tmp.x;
+  ret.y = (char)tmp.y;
+  ret.z = (char)tmp.z;
+  ret.w = (char)tmp.w;
+  return ret;
+}
+inline __device__ short2 __ldcg(const short2 *ptr) {
+  short2 ret;
+  asm("ld.global.cg.v2.s16 {%0,%1}, [%2];"
+      : "=h"(ret.x), "=h"(ret.y)
+      : "l"(ptr));
+  return ret;
+}
+inline __device__ short4 __ldcg(const short4 *ptr) {
+  short4 ret;
+  asm("ld.global.cg.v4.s16 {%0,%1,%2,%3}, [%4];"
+      : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w)
+      : "l"(ptr));
+  return ret;
+}
+inline __device__ int2 __ldcg(const int2 *ptr) {
+  int2 ret;
+  asm("ld.global.cg.v2.s32 {%0,%1}, [%2];"
+      : "=r"(ret.x), "=r"(ret.y)
+      : "l"(ptr));
+  return ret;
+}
+inline __device__ int4 __ldcg(const int4 *ptr) {
+  int4 ret;
+  asm("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];"
+      : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w)
+      : "l"(ptr));
+  return ret;
+}
+inline __device__ longlong2 __ldcg(const longlong2 *ptr) {
+  longlong2 ret;
+  asm("ld.global.cg.v2.s64 {%0,%1}, [%2];"
+      : "=l"(ret.x), "=l"(ret.y)
+      : "l"(ptr));
+  return ret;
+}
+
+inline __device__ unsigned char __ldcg(const unsigned char *ptr) {
+  unsigned int ret;
+  asm("ld.global.cg.u8 %0, [%1];" : "=r"(ret) : "l"(ptr));
+  return (unsigned char)ret;
+}
+inline __device__ unsigned short __ldcg(const unsigned short *ptr) {
+  unsigned short ret;
+  asm("ld.global.cg.u16 %0, [%1];" : "=h"(ret) : "l"(ptr));
+  return ret;
+}
+inline __device__ unsigned int __ldcg(const unsigned int *ptr) {
+  unsigned int ret;
+  asm("ld.global.cg.u32 %0, [%1];" : "=r"(ret) : "l"(ptr));
+  return ret;
+}
+inline __device__ unsigned long long __ldcg(const unsigned long long *ptr) {
+  unsigned long long ret;
+  asm("ld.global.cg.u64 %0, [%1];" : "=l"(ret) : "l"(ptr));
+  return ret;
+}
+inline __device__ uchar2 __ldcg(const uchar2 *ptr) {
+  uchar2 ret;
+  uint2 tmp;
+  asm("ld.global.cg.v2.u8 {%0,%1}, [%2];"
+      : "=r"(tmp.x), "=r"(tmp.y)
+      : "l"(ptr));
+  ret.x = (unsigned char)tmp.x;
+  ret.y = (unsigned char)tmp.y;
+  return ret;
+}
+inline __device__ uchar4 __ldcg(const uchar4 *ptr) {
+  uchar4 ret;
+  uint4 tmp;
+  asm("ld.global.cg.v4.u8 {%0,%1,%2,%3}, [%4];"
+      : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w)
+      : "l"(ptr));
+  ret.x = (unsigned char)tmp.x;
+  ret.y = (unsigned char)tmp.y;
+  ret.z = (unsigned char)tmp.z;
+  ret.w = (unsigned char)tmp.w;
+  return ret;
+}
+inline __device__ ushort2 __ldcg(const ushort2 *ptr) {
+  ushort2 ret;
+  asm("ld.global.cg.v2.u16 {%0,%1}, [%2];"
+      : "=h"(ret.x), "=h"(ret.y)
+      : "l"(ptr));
+  return ret;
+}
+inline __device__ ushort4 __ldcg(const ushort4 *ptr) {
+  ushort4 ret;
+  asm("ld.global.cg.v4.u16 {%0,%1,%2,%3}, [%4];"
+      : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w)
+      : "l"(ptr));
+  return ret;
+}
+inline __device__ uint2 __ldcg(const uint2 *ptr) {
+  uint2 ret;
+  asm("ld.global.cg.v2.u32 {%0,%1}, [%2];"
+      : "=r"(ret.x), "=r"(ret.y)
+      : "l"(ptr));
+  return ret;
+}
+inline __device__ uint4 __ldcg(const uint4 *ptr) {
+  uint4 ret;
+  asm("ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%4];"
+      : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w)
+      : "l"(ptr));
+  return ret;
+}
+inline __device__ ulonglong2 __ldcg(const ulonglong2 *ptr) {
+  ulonglong2 ret;
+  asm("ld.global.cg.v2.u64 {%0,%1}, [%2];"
+      : "=l"(ret.x), "=l"(ret.y)
+      : "l"(ptr));
+  return ret;
+}
+
+inline __device__ float __ldcg(const float *ptr) {
+  float ret;
+  asm("ld.global.cg.f32 %0, [%1];" : "=f"(ret) : "l"(ptr));
+  return ret;
+}
+inline __device__ double __ldcg(const double *ptr) {
+  double ret;
+  asm("ld.global.cg.f64 %0, [%1];" : "=d"(ret) : "l"(ptr));
+  return ret;
+}
+inline __device__ float2 __ldcg(const float2 *ptr) {
+  float2 ret;
+  asm("ld.global.cg.v2.f32 {%0,%1}, [%2];"
+      : "=f"(ret.x), "=f"(ret.y)
+      : "l"(ptr));
+  return ret;
+}
+inline __device__ float4 __ldcg(const float4 *ptr) {
+  float4 ret;
+  asm("ld.global.cg.v4.f32 {%0,%1,%2,%3}, [%4];"
+      : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w)
+      : "l"(ptr));
+  return ret;
+}
+inline __device__ double2 __ldcg(const double2 *ptr) {
+  double2 ret;
+  asm("ld.global.cg.v2.f64 {%0,%1}, [%2];"
+      : "=d"(ret.x), "=d"(ret.y)
+      : "l"(ptr));
+  return ret;
+}
+
+inline __device__ unsigned char __ldcv(const unsigned char *ptr) {
+  unsigned int ret;
+  asm("ld.global.cv.u8 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory");
+  return (unsigned char)ret;
+}
+inline __device__ unsigned short __ldcv(const unsigned short *ptr) {
+  unsigned short ret;
+  asm("ld.global.cv.u16 %0, [%1];" : "=h"(ret) : "l"(ptr) : "memory");
+  return ret;
+}
+inline __device__ unsigned int __ldcv(const unsigned int *ptr) {
+  unsigned int ret;
+  asm("ld.global.cv.u32 %0, [%1];" : "=r"(ret) : "l"(ptr) : "memory");
+  return ret;
+}
+inline __device__ unsigned long long __ldcv(const unsigned long long *ptr) {
+  unsigned long long ret;
+  asm("ld.global.cv.u64 %0, [%1];" : "=l"(ret) : "l"(ptr) : "memory");
+  return ret;
+}
+inline __device__ uchar2 __ldcv(const uchar2 *ptr) {
+  uchar2 ret;
+  uint2 tmp;
+  asm("ld.global.cv.v2.u8 {%0,%1}, [%2];"
+      : "=r"(tmp.x), "=r"(tmp.y)
+      : "l"(ptr)
+      : "memory");
+  ret.x = (unsigned char)tmp.x;
+  ret.y = (unsigned char)tmp.y;
+  return ret;
+}
+inline __device__ uchar4 __ldcv(const uchar4 *ptr) {
+  uchar4 ret;
+  uint4 tmp;
+  asm("ld.global.cv.v4.u8 {%0,%1,%2,%3}, [%4];"
+      : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w)
+      : "l"(ptr)
+      : "memory");
+  ret.x = (unsigned char)tmp.x;
+  ret.y = (unsigned char)tmp.y;
+  ret.z = (unsigned char)tmp.z;
+  ret.w = (unsigned char)tmp.w;
+  return ret;
+}
+inline __device__ ushort2 __ldcv(const ushort2 *ptr) {
+  ushort2 ret;
+  asm("ld.global.cv.v2.u16 {%0,%1}, [%2];"
+      : "=h"(ret.x), "=h"(ret.y)
+      : "l"(ptr)
+      : "memory");
+  return ret;
+}
+inline __device__ ushort4 __ldcv(const ushort4 *ptr) {
+  ushort4 ret;
+  asm("ld.global.cv.v4.u16 {%0,%1,%2,%3}, [%4];"
+      : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w)
+      : "l"(ptr)
+      : "memory");
+  return ret;
+}
+inline __device__ uint2 __ldcv(const uint2 *ptr) {
+  uint2 ret;
+  asm("ld.global.cv.v2.u32 {%0,%1}, [%2];"
+      : "=r"(ret.x), "=r"(ret.y)
+      : "l"(ptr)
+      : "memory");
+  return ret;
+}
+inline __device__ uint4 __ldcv(const uint4 *ptr) {
+  uint4 ret;
+  asm("ld.global.cv.v4.u32 {%0,%1,%2,%3}, [%4];"
+      : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w)
+      : "l"(ptr)
+      : "memory");
+  return ret;
+}
+inline __device__ ulonglong2 __ldcv(const ulonglong2 *ptr) {
+  ulonglong2 ret;
+  asm("ld.global.cv.v2.u64 {%0,%1}, [%2];"
+      : "=l"(ret.x), "=l"(ret.y)
+      : "l"(ptr)
+      : "memory");
+  return ret;
+}
+inline __device__ float __ldcv(const float *ptr) {
+  float ret;
+  asm("ld.global.cv.f32 %0, [%1];" : "=f"(ret) : "l"(ptr) : "memory");
+  return ret;
+}
+inline __device__ double __ldcv(const double *ptr) {
+  double ret;
+  asm("ld.global.cv.f64 %0, [%1];" : "=d"(ret) : "l"(ptr) : "memory");
+  return ret;
+}
+inline __device__ float2 __ldcv(const float2 *ptr) {
+  float2 ret;
+  asm("ld.global.cv.v2.f32 {%0,%1}, [%2];"
+      : "=f"(ret.x), "=f"(ret.y)
+      : "l"(ptr)
+      : "memory");
+  return ret;
+}
+inline __device__ float4 __ldcv(const float4 *ptr) {
+  float4 ret;
+  asm("ld.global.cv.v4.f32 {%0,%1,%2,%3}, [%4];"
+      : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w)
+      : "l"(ptr)
+      : "memory");
+  return ret;
+}
+inline __device__ double2 __ldcv(const double2 *ptr) {
+  double2 ret;
+  asm("ld.global.cv.v2.f64 {%0,%1}, [%2];"
+      : "=d"(ret.x), "=d"(ret.y)
+      : "l"(ptr)
+      : "memory");
+  return ret;
+}
+
+inline __device__ void __stwt(char *ptr, char value) {
+  asm("st.global.wt.s8 [%0], %1;" ::"l"(ptr), "r"((int)value) : "memory");
+}
+inline __device__ void __stwt(signed char *ptr, signed char value) {
+  asm("st.global.wt.s8 [%0], %1;" ::"l"(ptr), "r"((int)value) : "memory");
+}
+inline __device__ void __stwt(short *ptr, short value) {
+  asm("st.global.wt.s16 [%0], %1;" ::"l"(ptr), "h"(value) : "memory");
+}
+inline __device__ void __stwt(int *ptr, int value) {
+  asm("st.global.wt.s32 [%0], %1;" ::"l"(ptr), "r"(value) : "memory");
+}
+inline __device__ void __stwt(long long *ptr, long long value) {
+  asm("st.global.wt.s64 [%0], %1;" ::"l"(ptr), "l"(value) : "memory");
+}
+inline __device__ void __stwt(char2 *ptr, char2 value) {
+  const int x = value.x, y = value.y;
+  asm("st.global.wt.v2.s8 [%0], {%1,%2};" ::"l"(ptr), "r"(x), "r"(y)
+      : "memory");
+}
+inline __device__ void __stwt(char4 *ptr, char4 value) {
+  const int x = value.x, y = value.y, z = value.z, w = value.w;
+  asm("st.global.wt.v4.s8 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(x), "r"(y),
+      "r"(z), "r"(w)
+      : "memory");
+}
+inline __device__ void __stwt(short2 *ptr, short2 value) {
+  asm("st.global.wt.v2.s16 [%0], {%1,%2};" ::"l"(ptr), "h"(value.x),
+      "h"(value.y)
+      : "memory");
+}
+inline __device__ void __stwt(short4 *ptr, short4 value) {
+  asm("st.global.wt.v4.s16 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "h"(value.x),
+      "h"(value.y), "h"(value.z), "h"(value.w)
+      : "memory");
+}
+inline __device__ void __stwt(int2 *ptr, int2 value) {
+  asm("st.global.wt.v2.s32 [%0], {%1,%2};" ::"l"(ptr), "r"(value.x),
+      "r"(value.y)
+      : "memory");
+}
+inline __device__ void __stwt(int4 *ptr, int4 value) {
+  asm("st.global.wt.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(value.x),
+      "r"(value.y), "r"(value.z), "r"(value.w)
+      : "memory");
+}
+inline __device__ void __stwt(longlong2 *ptr, longlong2 value) {
+  asm("st.global.wt.v2.s64 [%0], {%1,%2};" ::"l"(ptr), "l"(value.x),
+      "l"(value.y)
+      : "memory");
+}
+inline __device__ void __stwt(unsigned char *ptr, unsigned char value) {
+  asm("st.global.wt.u8 [%0], %1;" ::"l"(ptr), "r"((int)value) : "memory");
+}
+inline __device__ void __stwt(unsigned short *ptr, unsigned short value) {
+  asm("st.global.wt.u16 [%0], %1;" ::"l"(ptr), "h"(value) : "memory");
+}
+inline __device__ void __stwt(unsigned int *ptr, unsigned int value) {
+  asm("st.global.wt.u32 [%0], %1;" ::"l"(ptr), "r"(value) : "memory");
+}
+inline __device__ void __stwt(unsigned long long *ptr,
+                              unsigned long long value) {
+  asm("st.global.wt.u64 [%0], %1;" ::"l"(ptr), "l"(value) : "memory");
+}
+inline __device__ void __stwt(uchar2 *ptr, uchar2 value) {
+  const int x = value.x, y = value.y;
+  asm("st.global.wt.v2.u8 [%0], {%1,%2};" ::"l"(ptr), "r"(x), "r"(y)
+      : "memory");
+}
+inline __device__ void __stwt(uchar4 *ptr, uchar4 value) {
+  const int x = value.x, y = value.y, z = value.z, w = value.w;
+  asm("st.global.wt.v4.u8 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(x), "r"(y),
+      "r"(z), "r"(w)
+      : "memory");
+}
+inline __device__ void __stwt(ushort2 *ptr, ushort2 value) {
+  asm("st.global.wt.v2.u16 [%0], {%1,%2};" ::"l"(ptr), "h"(value.x),
+      "h"(value.y)
+      : "memory");
+}
+inline __device__ void __stwt(ushort4 *ptr, ushort4 value) {
+  asm("st.global.wt.v4.u16 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "h"(value.x),
+      "h"(value.y), "h"(value.z), "h"(value.w)
+      : "memory");
+}
+inline __device__ void __stwt(uint2 *ptr, uint2 value) {
+  asm("st.global.wt.v2.u32 [%0], {%1,%2};" ::"l"(ptr), "r"(value.x),
+      "r"(value.y)
+      : "memory");
+}
+inline __device__ void __stwt(uint4 *ptr, uint4 value) {
+  asm("st.global.wt.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(value.x),
+      "r"(value.y), "r"(value.z), "r"(value.w)
+      : "memory");
+}
+inline __device__ void __stwt(ulonglong2 *ptr, ulonglong2 value) {
+  asm("st.global.wt.v2.u64 [%0], {%1,%2};" ::"l"(ptr), "l"(value.x),
+      "l"(value.y)
+      : "memory");
+}
+inline __device__ void __stwt(float *ptr, float value) {
+  asm("st.global.wt.f32 [%0], %1;" ::"l"(ptr), "f"(value) : "memory");
+}
+inline __device__ void __stwt(double *ptr, double value) {
+  asm("st.global.wt.f64 [%0], %1;" ::"l"(ptr), "d"(value) : "memory");
+}
+inline __device__ void __stwt(float2 *ptr, float2 value) {
+  asm("st.global.wt.v2.f32 [%0], {%1,%2};" ::"l"(ptr), "f"(value.x),
+      "f"(value.y)
+      : "memory");
+}
+inline __device__ void __stwt(float4 *ptr, float4 value) {
+  asm("st.global.wt.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "f"(value.x),
+      "f"(value.y), "f"(value.z), "f"(value.w)
+      : "memory");
+}
+inline __device__ void __stwt(double2 *ptr, double2 value) {
+  asm("st.global.wt.v2.f64 [%0], {%1,%2};" ::"l"(ptr), "d"(value.x),
+      "d"(value.y)
+      : "memory");
+}
+
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
 
 #if CUDA_VERSION >= 11000

``````````

</details>


https://github.com/llvm/llvm-project/pull/143664
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to