Issue 128408
Summary AMDGPU backend doesn't lower __builtin_nontemporal_load correctly
Labels new issue
Assignees
Reporter Epliz
    Hi,

It looks like __builtin_nontemporal_load (mentioned at https://gpuopen.com/learn/amd-lab-notes/amd-lab-notes-finite-difference-docs-laplacian_part3/) is not lowered correctly for float types, nor for vectors of floats.

Example HIP kernel:
```

#define FULL_MASK32 0xffffffff
#define FULL_MASK64 0xffffffffffffffff

#ifdef __CUDA_ARCH__
#define __xx_shfl_down(mask, val, offset) __shfl_down_sync(mask, val, offset)
#elif defined(__HIP_PLATFORM_AMD__) // AMD
#define __xx_shfl_down(mask, val, offset) __shfl_down(val, offset)
#else
#error "Unsupported compiler"
#endif

__device__ float warpReduce(float val) {
  if (warpSize == 32) {
    for (int offset = 16; offset > 0; offset /= 2)
      val += __xx_shfl_down(FULL_MASK32, val, offset);
  }
  if (warpSize == 64) {
    for (int offset = 32; offset > 0; offset /= 2)
      val += __xx_shfl_down(FULL_MASK64, val, offset);
  }
  return val;
}

template <typename T>
static inline __device__ const T* addr(const T* p, unsigned index) {
  // helps the AMDGPU compiler understand it can use the sgpr pair + single vgpr addressing mode
  unsigned byte_offset = sizeof(T) * index;
  const uint8_t* p8 = (const uint8_t*)p;
  return (const T*) (p8 + byte_offset);
}

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define ALIGN_UP(a, b) (DIV_ROUND_UP((a), (b)) * (b))

#define THREADS_PER_BLOCK 256
#define WARPS_PER_BLOCK 4

#define FLOAT8_ELEMENTS_PER_THREAD 8
#define FLOAT8_ELEMENTS_PER_BLOCK ((THREADS_PER_BLOCK) * (FLOAT8_ELEMENTS_PER_THREAD))

__device__ float4 load_nontemporal_float4(const float* p) {
  float x = __builtin_nontemporal_load(p);
  float y = __builtin_nontemporal_load(p + 1);
  float z = __builtin_nontemporal_load(p + 2);
  float w = __builtin_nontemporal_load(p + 3);

  return make_float4(x, y, z, w);
}

__global__ void float8_nt_bandwidth_kernel(
  const float* __restrict__ A,
  bool* __restrict__ out_flag,
  unsigned N
) {
  int warpCounts = THREADS_PER_BLOCK / warpSize;
  int warpId = threadIdx.x / warpSize;
  int laneId = threadIdx.x % warpSize;
  int tid = blockIdx.x * FLOAT8_ELEMENTS_PER_BLOCK + (4 * threadIdx.x);

  __shared__ float shared_acc;
  if (threadIdx.x == 0) {
    shared_acc = 0.f;
  }
  if (THREADS_PER_BLOCK > warpSize) {
    __syncthreads();
  }


  float r = 0.f;
  if ((blockIdx.x + 1) * FLOAT8_ELEMENTS_PER_BLOCK <= N) {
    unsigned off = tid;
    float4 v0 = load_nontemporal_float4(addr(A, off));
    off += 4 * THREADS_PER_BLOCK;
    float4 v1 = load_nontemporal_float4(addr(A, off));

    float4 v01 = v0 + v1;

    r = (v01.x + v01.y) + (v01.z + v01.w);
  }

  r = warpReduce(r);

  if (laneId == 0) {
    atomicAdd(&shared_acc, r);
  }

  if (THREADS_PER_BLOCK > warpSize) {
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    *out_flag = (shared_acc > 0.f);
  }
}

void float8_nt_bandwidth(
  const void* __restrict__ A,
  bool* __restrict__ out_flag,
  unsigned N
) {
  const int threads_per_blocks = THREADS_PER_BLOCK;
  const int num_blocks = DIV_ROUND_UP(N, FLOAT8_ELEMENTS_PER_BLOCK);
  float8_nt_bandwidth_kernel<<<num_blocks, threads_per_blocks>>>((const float*)A, (bool*)out_flag, N);
}
```
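For reference, below is a further-reduced sketch (my own reduction, not part of the application above) that isolates the builtin on a scalar float next to an int load for comparison. Inspecting the generated LLVM IR or ISA for the two loads should make any difference in lowering visible.

```

#include <hip/hip_runtime.h>

// Minimal reduction (assumption: simplest case exhibiting the issue).
// The float load is the reported-broken case; the int load is for comparison.
__global__ void nt_minimal(const float* __restrict__ in_f,
                           const int* __restrict__ in_i,
                           float* __restrict__ out_f,
                           int* __restrict__ out_i) {
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
  out_f[i] = __builtin_nontemporal_load(in_f + i);
  out_i[i] = __builtin_nontemporal_load(in_i + i);
}
```

Compiling with something like hipcc --offload-arch=gfx90a -O3 -S -emit-llvm nt_minimal.hip (flags and target architecture are just a typical invocation, adjust as needed) makes it easy to compare how the float and int loads are lowered.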

HIP version 6.3.2
AMD clang version 18.0

Best regards,
Epliz