Issue     | 128408
Summary   | AMDGPU backend doesn't lower __builtin_nontemporal_load correctly
Labels    | new issue
Assignees |
Reporter  | Epliz
Hi,
It looks like __builtin_nontemporal_load (mentioned at https://gpuopen.com/learn/amd-lab-notes/amd-lab-notes-finite-difference-docs-laplacian_part3/) is not lowered correctly for float types, nor for vectors of floats.
Example HIP kernel:
```
#include <hip/hip_runtime.h>
#include <cstdint>

#define FULL_MASK32 0xffffffff
#define FULL_MASK64 0xffffffffffffffff

#ifdef __CUDA_ARCH__
#define __xx_shfl_down(mask, val, offset) __shfl_down_sync(mask, val, offset)
#elif defined(__HIP_PLATFORM_AMD__) // AMD
#define __xx_shfl_down(mask, val, offset) __shfl_down(val, offset)
#else
#error "Unsupported compiler"
#endif

// shuffle-based reduction across the lanes of a warp/wavefront
__device__ float warpReduce(float val) {
  if (warpSize == 32) {
    for (int offset = 16; offset > 0; offset /= 2)
      val += __xx_shfl_down(FULL_MASK32, val, offset);
  }
  if (warpSize == 64) {
    for (int offset = 32; offset > 0; offset /= 2)
      val += __xx_shfl_down(FULL_MASK64, val, offset);
  }
  return val;
}

template <typename T>
static inline __device__ const T* addr(const T* p, unsigned index) {
  // helps the AMDGPU compiler understand it can use the sgpr pair + single vgpr addressing mode
  unsigned byte_offset = sizeof(T) * index;
  const uint8_t* p8 = (const uint8_t*)p;
  return (const T*) (p8 + byte_offset);
}

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define ALIGN_UP(a, b) (DIV_ROUND_UP((a), (b)) * (b))

#define THREADS_PER_BLOCK 256
#define WARPS_PER_BLOCK 4
#define FLOAT8_ELEMENTS_PER_THREAD 8
#define FLOAT8_ELEMENTS_PER_BLOCK ((THREADS_PER_BLOCK) * (FLOAT8_ELEMENTS_PER_THREAD))

// loads 4 consecutive floats with the non-temporal hint
__device__ float4 load_nontemporal_float4(const float* p) {
  float x = __builtin_nontemporal_load(p);
  float y = __builtin_nontemporal_load(p + 1);
  float z = __builtin_nontemporal_load(p + 2);
  float w = __builtin_nontemporal_load(p + 3);
  float4 v = make_float4(x, y, z, w);
  return v;
}

__global__ void float8_nt_bandwidth_kernel(
    const float* __restrict__ A,
    bool* __restrict__ out_flag,
    unsigned N
) {
  int warpCounts = THREADS_PER_BLOCK / warpSize;
  int warpId = threadIdx.x / warpSize;
  int laneId = threadIdx.x % warpSize;

  int tid = blockIdx.x * FLOAT8_ELEMENTS_PER_BLOCK + (4 * threadIdx.x);

  __shared__ float shared_acc;
  if (threadIdx.x == 0) {
    shared_acc = 0.f;
  }
  if (THREADS_PER_BLOCK > warpSize) {
    __syncthreads();
  }

  float r = 0.f;
  if ((blockIdx.x + 1) * FLOAT8_ELEMENTS_PER_BLOCK <= N) {
    unsigned off = tid;
    float4 v0 = load_nontemporal_float4(addr(A, off));
    off += 4 * THREADS_PER_BLOCK;
    float4 v1 = load_nontemporal_float4(addr(A, off));
    float4 v01 = v0 + v1;
    r = (v01.x + v01.y) + (v01.z + v01.w);
  }

  r = warpReduce(r);
  if (laneId == 0) {
    atomicAdd(&shared_acc, r);
  }
  if (THREADS_PER_BLOCK > warpSize) {
    __syncthreads();
  }
  if (threadIdx.x == 0) {
    *out_flag = (shared_acc > 0.f);
  }
}

void float8_nt_bandwidth(
    const void* __restrict__ A,
    bool* __restrict__ out_flag,
    unsigned N
) {
  const int threads_per_block = THREADS_PER_BLOCK;
  const int num_blocks = DIV_ROUND_UP(N, FLOAT8_ELEMENTS_PER_BLOCK);
  float8_nt_bandwidth_kernel<<<num_blocks, threads_per_block>>>((const float*)A, (bool*)out_flag, N);
}
```
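For reference, a reduced sketch of the issue (the kernel names below are made up for illustration, not taken from the code above) puts a float load and an int load through the builtin side by side, so the generated code for the two element types can be compared, e.g. with `hipcc --offload-arch=gfx90a -S`. If the builtin were lowered as expected, both loads should carry the non-temporal hint (the `!nontemporal` metadata at the LLVM IR level, and the corresponding cache-policy bits on the resulting global load instructions).
```
// Minimal sketch, reduced from the kernel above; kernel names are illustrative only.
#include <hip/hip_runtime.h>

__global__ void nt_load_float(const float* __restrict__ in, float* __restrict__ out) {
  // expected: a global load carrying the non-temporal cache-policy bits
  out[threadIdx.x] = __builtin_nontemporal_load(in + threadIdx.x);
}

__global__ void nt_load_int(const int* __restrict__ in, int* __restrict__ out) {
  // same pattern with an integer element type, for comparison
  out[threadIdx.x] = __builtin_nontemporal_load(in + threadIdx.x);
}
```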
HIP version: 6.3.2
AMD clang version: 18.0
Best regards,
Epliz