
The CUDA implementation has long supported the `width` argument on its
shuffle instructions, which makes it more difficult to replace those
uses with this helper. This patch just correctly implements that for
AMDGPU and NVPTX so it's equivalent to `__shfl_sync` in CUDA. This will
ease porting.

>From 1bb963028e1b52c61286a8723475acd5ad0cb9ba Mon Sep 17 00:00:00 2001
From: Joseph Huber <>
Date: Wed, 5 Feb 2025 11:46:45 -0600
Subject: [PATCH] [Clang] Add width handling for <gpuintrin.h> shuffle helper

The CUDA implementation has long supported the `width` argument on its
shuffle instructions, which makes it more difficult to replace those
uses with this helper. This patch just correctly implements that for
AMDGPU and NVPTX so it's equivalent to `__shfl_sync` in CUDA. This will
ease porting.
 clang/lib/Headers/amdgpuintrin.h | 14 +++++++++-----
 clang/lib/Headers/gpuintrin.h    | 24 ++++++++++++++----------
 clang/lib/Headers/nvptxintrin.h  | 15 ++++++++-------
 libc/src/__support/GPU/utils.h   |  5 +++--
 4 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 038605605462f80..9dad99ffe9439ab 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -145,17 +145,21 @@ _DEFAULT_FN_ATTRS static __inline__ void 
__gpu_sync_lane(uint64_t __lane_mask) {
 // Shuffles the the lanes inside the wavefront according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
-  return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
 // Shuffles the the lanes inside the wavefront according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+                      uint32_t __width) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
-         ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
+  return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
+          << 32ull) |
+         ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4c463c333308fce..11c87e85cd49754 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -133,18 +133,21 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double 
__x) {
 // Shuffles the the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ float
-__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
+                      uint32_t __width) {
   return __builtin_bit_cast(
       float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
-                                   __builtin_bit_cast(uint32_t, __x)));
+                                   __builtin_bit_cast(uint32_t, __x), __width));
 // Shuffles the the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ double
-__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
+                      uint32_t __width) {
   return __builtin_bit_cast(
-      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
-                                    __builtin_bit_cast(uint64_t, __x)));
+      double,
+      __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                            __builtin_bit_cast(uint64_t, __x), __width));
 // Gets the sum of all lanes inside the warp or wavefront.
@@ -153,7 +156,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, 
double __x) {
       uint64_t __lane_mask, __type __x) {                                      
     for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   
       uint32_t __index = __step + __gpu_lane_id();                             
-      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x);          
+      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,           
+                                          __gpu_num_lanes());                  
     return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 
@@ -171,10 +175,10 @@ __DO_LANE_SUM(double, f64);   // double 
__gpu_lane_sum_f64(m, x)
       uint32_t __index = __gpu_lane_id() - __step;                             
       __bitmask_type bitmask = __gpu_lane_id() >= __step;                      
       __x += __builtin_bit_cast(                                               
-          __type,                                                              
-          -bitmask & __builtin_bit_cast(__bitmask_type,                        
-                                        __gpu_shuffle_idx_##__suffix(          
-                                            __lane_mask, __index, __x)));      
+          __type, -bitmask & __builtin_bit_cast(__bitmask_type,                
+                                                __gpu_shuffle_idx_##__suffix(  
+                                                    __lane_mask, __index, __x, 
+                                                    __gpu_num_lanes())));      
     return __x;                                                                
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index fb2864eab6a09db..40fa2edebe975cf 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void 
__gpu_sync_lane(uint64_t __lane_mask) {
 // Shuffles the the lanes inside the warp according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
   uint32_t __mask = (uint32_t)__lane_mask;
-  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+                                  ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
 // Shuffles the the lanes inside the warp according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+                      uint32_t __width) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
   uint32_t __mask = (uint32_t)__lane_mask;
-  return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
-                                             __gpu_num_lanes() - 1u)
+  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
           << 32ull) |
-         ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
-                                             __gpu_num_lanes() - 1u));
+         ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
 // Returns true if the flat pointer points to CUDA 'shared' memory.
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index e138c84c0cb22d4..323c003f1ff0741 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -87,8 +87,9 @@ LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
 LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
-LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
-  return __gpu_shuffle_idx_u32(lane_mask, idx, x);
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
+                             uint32_t width = __gpu_num_lanes()) {
+  return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
 [[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }

cfe-commits mailing list

Reply via email to