https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/120063
>From 999d6ffbc6adffcb499842467bec8d07b881af46 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpa...@amd.com>
Date: Mon, 16 Dec 2024 15:25:07 +0530
Subject: [PATCH 1/2] [NFC][AMDGPU] Pre-commit clang and llvm tests for
 dynamic allocas

---
 clang/test/CodeGenHIP/dynamic-alloca.cpp       | 532 ++++++++++++++++++
 .../GlobalISel/dynamic-alloca-divergent.ll     |  10 +
 .../GlobalISel/dynamic-alloca-uniform.ll       |  85 +++
 .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll  |  42 +-
 4 files changed, 667 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGenHIP/dynamic-alloca.cpp

diff --git a/clang/test/CodeGenHIP/dynamic-alloca.cpp b/clang/test/CodeGenHIP/dynamic-alloca.cpp
new file mode 100644
index 00000000000000..4bbc6b2e69917f
--- /dev/null
+++ b/clang/test/CodeGenHIP/dynamic-alloca.cpp
@@ -0,0 +1,532 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+
+#define __global__ __attribute__((global))
+#define __device__ __attribute__((device))
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z34kernel_function_builtin_alloca_immv(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 40, align 8, addrspace(5)
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[TMP0]] to ptr
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 0
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void kernel_function_builtin_alloca_imm(){
+  volatile int *alloca = static_cast<volatile int*>(__builtin_alloca(10*sizeof(int)));
+  static_cast<volatile int*>(alloca)[0] = 10;
+}
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z50kernel_function_non_entry_block_builtin_alloca_immPi(
+// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA2:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: [[ALLOCA2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA2]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+// CHECK: [[IF_THEN]]:
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 40, align 8, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: br label %[[IF_END:.*]]
+// CHECK: [[IF_ELSE]]:
+// CHECK-NEXT: [[TMP5:%.*]] = alloca i8, i64 80, align 8, addrspace(5)
+// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(5) [[TMP5]] to ptr
+// CHECK-NEXT: store ptr [[TMP6]], ptr [[ALLOCA2_ASCAST]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ALLOCA2_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 0
+// CHECK-NEXT: store volatile i32 20, ptr [[ARRAYIDX3]], align 4
+// CHECK-NEXT: br label %[[IF_END]]
+// CHECK: [[IF_END]]:
+// CHECK-NEXT: ret void
+//
+__global__ void kernel_function_non_entry_block_builtin_alloca_imm(int* a){
+  if(*a < 10){
+    volatile void *alloca = __builtin_alloca(10*sizeof(int));
+    static_cast<volatile int*>(alloca)[0] = 10;
+  }
+  else {
+    volatile void *alloca = __builtin_alloca(20*sizeof(int));
+    static_cast<volatile int*>(alloca)[0] = 20;
+  }
+}
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z30kernel_function_builtin_allocaPi(
+// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void kernel_function_builtin_alloca(int* a){
+  volatile void *alloca = __builtin_alloca((*a)*sizeof(int));
+  static_cast<volatile int*>(alloca)[0] = 10;
+}
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z44kernel_function_builtin_alloca_uninitializedPi(
+// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile float 1.000000e+01, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void kernel_function_builtin_alloca_uninitialized(int* a){
+  volatile void *alloca = __builtin_alloca_uninitialized((*a)*sizeof(float));
+  static_cast<volatile float*>(alloca)[0] = 10.0;
+}
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z44kernel_function_builtin_alloca_default_alignPi(
+// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 8
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile i64 10, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: ret void
+//
+__global__ void kernel_function_builtin_alloca_default_align(int* a){
+  volatile void *alloca = __builtin_alloca_with_align((*a)*sizeof(long), 64);
+  static_cast<volatile long*>(alloca)[0] = 10;
+}
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z48kernel_function_builtin_alloca_non_default_alignPi(
+// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 32, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void kernel_function_builtin_alloca_non_default_align(int* a){
+  volatile void *alloca = __builtin_alloca_with_align((*a)*sizeof(unsigned), 256);
+  static_cast<volatile unsigned*>(alloca)[0] = 10;
+}
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z62kernel_function_builtin_alloca_non_default_align_uninitializedPi(
+// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 32, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void kernel_function_builtin_alloca_non_default_align_uninitialized(int* a){
+  volatile void *alloca = __builtin_alloca_with_align_uninitialized((*a)*sizeof(unsigned), 256);
+  static_cast<volatile unsigned*>(alloca)[0] = 10;
+}
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z35kernel_function_variable_size_arrayPi(
+// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SAVED_STACK:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// CHECK-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SAVED_STACK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SAVED_STACK]] to ptr
+// CHECK-NEXT: [[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(5) @llvm.stacksave.p5()
+// CHECK-NEXT: store ptr addrspace(5) [[TMP3]], ptr [[SAVED_STACK_ASCAST]], align 4
+// CHECK-NEXT: [[VLA:%.*]] = alloca i32, i64 [[TMP2]], align 4, addrspace(5)
+// CHECK-NEXT: [[VLA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA]] to ptr
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA_ASCAST]], i64 2
+// CHECK-NEXT: store i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(5), ptr [[SAVED_STACK_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.stackrestore.p5(ptr addrspace(5) [[TMP4]])
+// CHECK-NEXT: ret void
+//
+__global__ void kernel_function_variable_size_array(int* a){
+  int arr[*a];
+  arr[2] = 10;
+}
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z51kernel_function_non_entry_block_static_sized_allocaPi(
+// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA2:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: [[ALLOCA2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA2]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+// CHECK: [[IF_THEN]]:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(5) [[TMP4]] to ptr
+// CHECK-NEXT: store ptr [[TMP5]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 0
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: br label %[[IF_END:.*]]
+// CHECK: [[IF_ELSE]]:
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 2, [[TMP8]]
+// CHECK-NEXT: [[CONV4:%.*]] = sext i32 [[MUL3]] to i64
+// CHECK-NEXT: [[MUL5:%.*]] = mul i64 [[CONV4]], 4
+// CHECK-NEXT: [[TMP9:%.*]] = alloca i8, i64 [[MUL5]], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(5) [[TMP9]] to ptr
+// CHECK-NEXT: store ptr [[TMP10]], ptr [[ALLOCA2_ASCAST]], align 8
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[ALLOCA2_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 0
+// CHECK-NEXT: store volatile i32 20, ptr [[ARRAYIDX6]], align 4
+// CHECK-NEXT: br label %[[IF_END]]
+// CHECK: [[IF_END]]:
+// CHECK-NEXT: ret void
+//
+__global__ void kernel_function_non_entry_block_static_sized_alloca(int* a){
+  if(*a < 10){
+    volatile void *alloca = __builtin_alloca((*a)*sizeof(int));
+    static_cast<volatile int*>(alloca)[0] = 10;
+  }
+  else {
+    volatile void *alloca = __builtin_alloca(2*(*a)*sizeof(int));
+    static_cast<volatile int*>(alloca)[0] = 20;
+  }
+}
+
+// CHECK-LABEL: define dso_local void @_Z50device_function_non_entry_block_builtin_alloca_immv(
+// CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 10, align 8, addrspace(5)
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[TMP0]] to ptr
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 0
+// CHECK-NEXT: store i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void device_function_non_entry_block_builtin_alloca_imm(){
+  int *alloca = static_cast<int *>(__builtin_alloca(10));
+  alloca[0] = 10;
+}
+
+// CHECK-LABEL: define dso_local void @_Z30device_function_builtin_allocaPi(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void device_function_builtin_alloca(int* a){
+  volatile void *alloca = __builtin_alloca((*a)*sizeof(int));
+  static_cast<volatile int*>(alloca)[0] = 10;
+}
+
+// CHECK-LABEL: define dso_local void @_Z44device_function_builtin_alloca_uninitializedPi(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile float 1.000000e+01, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void device_function_builtin_alloca_uninitialized(int* a){
+  volatile void *alloca = __builtin_alloca_uninitialized((*a)*sizeof(float));
+  static_cast<volatile float*>(alloca)[0] = 10.0;
+}
+
+// CHECK-LABEL: define dso_local void @_Z44device_function_builtin_alloca_default_alignPi(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 8
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile i64 10, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: ret void
+//
+__device__ void device_function_builtin_alloca_default_align(int* a){
+  volatile void *alloca = __builtin_alloca_with_align((*a)*sizeof(long), 64);
+  static_cast<volatile long*>(alloca)[0] = 10;
+}
+
+// CHECK-LABEL: define dso_local void @_Z48device_function_builtin_alloca_non_default_alignPi(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 32, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void device_function_builtin_alloca_non_default_align(int* a){
+  volatile void *alloca = __builtin_alloca_with_align((*a)*sizeof(unsigned), 256);
+  static_cast<volatile unsigned*>(alloca)[0] = 10;
+}
+
+// CHECK-LABEL: define dso_local void @_Z62device_function_builtin_alloca_non_default_align_uninitializedPi(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 32, addrspace(5)
+// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void device_function_builtin_alloca_non_default_align_uninitialized(int* a){
+  volatile void *alloca = __builtin_alloca_with_align_uninitialized((*a)*sizeof(unsigned), 256);
+  static_cast<volatile unsigned*>(alloca)[0] = 10;
+}
+
+// CHECK-LABEL: define dso_local void @_Z35device_function_variable_size_arrayPi(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SAVED_STACK:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
+// CHECK-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SAVED_STACK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SAVED_STACK]] to ptr
+// CHECK-NEXT: [[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
+// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(5) @llvm.stacksave.p5()
+// CHECK-NEXT: store ptr addrspace(5) [[TMP3]], ptr [[SAVED_STACK_ASCAST]], align 4
+// CHECK-NEXT: [[VLA:%.*]] = alloca i32, i64 [[TMP2]], align 4, addrspace(5)
+// CHECK-NEXT: [[VLA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA]] to ptr
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA_ASCAST]], i64 2
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(5), ptr [[SAVED_STACK_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.stackrestore.p5(ptr addrspace(5) [[TMP4]])
+// CHECK-NEXT: ret void
+//
+__device__ void device_function_variable_size_array(int* a){
+  volatile int arr[*a];
+  arr[2] = 10;
+}
+
+// CHECK-LABEL: define dso_local void @_Z51device_function_non_entry_block_static_sized_allocaPi(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[ALLOCA1:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+// CHECK-NEXT: [[ALLOCA1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA1]] to ptr
+// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+// CHECK: [[IF_THEN]]:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
+// CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(5) [[TMP4]] to ptr
+// CHECK-NEXT: store ptr [[TMP5]], ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 0
+// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: br label %[[IF_END:.*]]
+// CHECK: [[IF_ELSE]]:
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[MUL2:%.*]] = mul nsw i32 2, [[TMP8]]
+// CHECK-NEXT: [[CONV3:%.*]] = sext i32 [[MUL2]] to i64
+// CHECK-NEXT: [[MUL4:%.*]] = mul i64 [[CONV3]], 4
+// CHECK-NEXT: [[TMP9:%.*]] = alloca i8, i64 [[MUL4]], align 8, addrspace(5)
+// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(5) [[TMP9]] to ptr
+// CHECK-NEXT: store ptr [[TMP10]], ptr [[ALLOCA1_ASCAST]], align 8
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[ALLOCA1_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 0
+// CHECK-NEXT: store volatile i32 20, ptr [[ARRAYIDX5]], align 4
+// CHECK-NEXT: br label %[[IF_END]]
+// CHECK: [[IF_END]]:
+// CHECK-NEXT: ret void
+//
+__device__ void device_function_non_entry_block_static_sized_alloca(int* a){
+  if(*a < 10){
+    volatile void *alloca = __builtin_alloca((*a)*sizeof(int));
+    static_cast<volatile int*>(alloca)[0] = 10;
+  }else {
+    volatile void *alloca = __builtin_alloca(2*(*a)*sizeof(int));
+    static_cast<volatile int*>(alloca)[0] = 20;
+  }
+  /// Check formatting.
+}

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
index 5dae7885f6bfb1..21780805c6978a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
@@ -8,6 +8,10 @@
 ; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
 ; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca
 
+; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 32 (in function: func_dynamic_stackalloc_vgpr_align32)
+; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align32
+; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align32 void (i32): unsupported dynamic alloca
+
 define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1) %ptr) {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
@@ -23,6 +27,12 @@ define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) {
   ret void
 }
 
+define void @func_dynamic_stackalloc_vgpr_align32(i32 %n) {
+  %alloca = alloca i32, i32 %n, align 32, addrspace(5)
+  store volatile ptr addrspace(5) %alloca, ptr addrspace(1) undef
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index 741323a201d02e..a44cea1da4573a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -418,3 +418,88 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
   store i32 0, ptr addrspace(5) %alloca
   ret void
 }
+
+define amdgpu_kernel void @kernel_non_entry_block_static_alloca(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
+; GFX9-LABEL: kernel_non_entry_block_static_alloca:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX9-NEXT: s_add_u32 s0, s0, s17
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_mov_b32 s33, 0
+; GFX9-NEXT: s_movk_i32 s32, 0x1000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-NEXT: s_cbranch_scc0 .LBB6_2
+; GFX9-NEXT: ; %bb.1: ; %bb.1
+; GFX9-NEXT: s_endpgm
+; GFX9-NEXT: .LBB6_2: ; %bb.0
+; GFX9-NEXT: s_add_u32 s4, s32, 0x400
+; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: kernel_non_entry_block_static_alloca:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX10-NEXT: s_add_u32 s0, s0, s17
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_mov_b32 s33, 0
+; GFX10-NEXT: s_movk_i32 s32, 0x800
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10-NEXT: s_cbranch_scc0 .LBB6_2
+; GFX10-NEXT: ; %bb.1: ; %bb.1
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: .LBB6_2: ; %bb.0
+; GFX10-NEXT: s_add_u32 s4, s32, 0x200
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_and_b32 s4, s4, 0xfffff800
+; GFX10-NEXT: v_mov_b32_e32 v2, 1
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: kernel_non_entry_block_static_alloca:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX11-NEXT: s_mov_b32 s33, 0
+; GFX11-NEXT: s_mov_b32 s32, 64
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_cbranch_scc0 .LBB6_2
+; GFX11-NEXT: ; %bb.1: ; %bb.1
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: .LBB6_2: ; %bb.0
+; GFX11-NEXT: s_add_u32 s0, s32, 0x200
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
+; GFX11-NEXT: s_and_b32 s0, s0, 0xfffff800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s1, s0, 4
+; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
+ entry:
+ %cond = icmp eq i32 %arg.cond, 0
+ br i1 %cond, label %bb.0, label %bb.1
+
+ bb.0:
+ %alloca = alloca i32, i32 4, align 64, addrspace(5)
+ %gep1 = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+ store volatile i32 0, ptr addrspace(5) %alloca
+ store volatile i32 1, ptr addrspace(5) %gep1
+ br label %bb.1
+
+ bb.1:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 1c093bf31ea75f..32aff6ffa481fc 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -5,8 +5,46 @@ target datalayout = "A5"
 
 ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
 
-define amdgpu_kernel void @test_dynamic_stackalloc(ptr addrspace(1) %out, i32 %n) {
+define amdgpu_kernel void @test_dynamic_stackalloc(i32 %n) {
   %alloca = alloca i32, i32 %n, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %alloca
+  store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define amdgpu_kernel void @test_dynamic_stackalloc_multiple_allocas(i32 %n) {
+  %alloca1 = alloca i32, i32 8, addrspace(5)
+  %alloca2 = alloca i32, i32 %n, addrspace(5)
+  %alloca3 = alloca i32, i32 10, addrspace(5)
+  store volatile i32 1, ptr addrspace(5) %alloca1
+  store volatile i32 2, ptr addrspace(5) %alloca2
+  store volatile i32 3, ptr addrspace(5) %alloca3
+  ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define amdgpu_kernel void @test_dynamic_stackalloc_custom_alignment(i32 %n) {
+  %alloca = alloca i32, i32 %n, align 128, addrspace(5)
+  store volatile i32 1, ptr addrspace(5) %alloca
+  ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define amdgpu_kernel void @test_dynamic_stackalloc_non_entry_block(i32 %n) {
+ entry:
+ %cond = icmp eq i32 %n, 0
+ br i1 %cond, label %bb.0, label %bb.1
+
+ bb.0:
+ %alloca = alloca i32, i32 %n, align 64, addrspace(5)
+ %gep1 = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+ store volatile i32 0, ptr addrspace(5) %alloca
+ store volatile i32 1, ptr addrspace(5) %gep1
+ br label %bb.1
+
+ bb.1:
+ ret void
+}

>From ef99f48808f4d2ea1e239f61dac6403ea8bc6786 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpa...@amd.com>
Date: Tue, 17 Dec 2024 10:32:33 +0530
Subject: [PATCH 2/2] Review Comments

---
 clang/test/CodeGenHIP/dynamic-alloca.cpp       | 532 ------------------
 .../GlobalISel/dynamic-alloca-divergent.ll     |  52 +-
 .../GlobalISel/dynamic-alloca-uniform.ll       |  85 ---
 .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll  | 180 +++++-
 4 files changed, 205 insertions(+), 644 deletions(-)
 delete mode 100644 clang/test/CodeGenHIP/dynamic-alloca.cpp

diff --git a/clang/test/CodeGenHIP/dynamic-alloca.cpp b/clang/test/CodeGenHIP/dynamic-alloca.cpp
deleted file mode 100644
index 4bbc6b2e69917f..00000000000000
--- a/clang/test/CodeGenHIP/dynamic-alloca.cpp
+++ /dev/null
@@ -1,532 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
-
-
-#define __global__ __attribute__((global))
-#define __device__ __attribute__((device))
-
-// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z34kernel_function_builtin_alloca_immv(
-// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 40, align 8, addrspace(5)
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[TMP0]] to ptr
-// CHECK-NEXT: store ptr [[TMP1]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 0
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: ret void
-//
-__global__ void kernel_function_builtin_alloca_imm(){
-  volatile int *alloca = static_cast<volatile int*>(__builtin_alloca(10*sizeof(int)));
-  static_cast<volatile int*>(alloca)[0] = 10;
-}
-
-// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z50kernel_function_non_entry_block_builtin_alloca_immPi(
-// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA2:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: [[ALLOCA2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA2]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
-// CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
-// CHECK: [[IF_THEN]]:
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 40, align 8, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: br label %[[IF_END:.*]]
-// CHECK: [[IF_ELSE]]:
-// CHECK-NEXT: [[TMP5:%.*]] = alloca i8, i64 80, align 8, addrspace(5)
-// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(5) [[TMP5]] to ptr
-// CHECK-NEXT: store ptr [[TMP6]], ptr [[ALLOCA2_ASCAST]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ALLOCA2_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 0
-// CHECK-NEXT: store volatile i32 20, ptr [[ARRAYIDX3]], align 4
-// CHECK-NEXT: br label %[[IF_END]]
-// CHECK: [[IF_END]]:
-// CHECK-NEXT: ret void
-//
-__global__ void kernel_function_non_entry_block_builtin_alloca_imm(int* a){
-  if(*a < 10){
-    volatile void *alloca = __builtin_alloca(10*sizeof(int));
-    static_cast<volatile int*>(alloca)[0] = 10;
-  }
-  else {
-    volatile void *alloca = __builtin_alloca(20*sizeof(int));
-    static_cast<volatile int*>(alloca)[0] = 20;
-  }
-}
-
-// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z30kernel_function_builtin_allocaPi(
-// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: ret void
-//
-__global__ void kernel_function_builtin_alloca(int* a){
-  volatile void *alloca = __builtin_alloca((*a)*sizeof(int));
-  static_cast<volatile int*>(alloca)[0] = 10;
-}
-
-// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z44kernel_function_builtin_alloca_uninitializedPi(
-// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile float 1.000000e+01, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: ret void
-//
-__global__ void kernel_function_builtin_alloca_uninitialized(int* a){
-  volatile void *alloca = __builtin_alloca_uninitialized((*a)*sizeof(float));
-  static_cast<volatile float*>(alloca)[0] = 10.0;
-}
-
-// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z44kernel_function_builtin_alloca_default_alignPi(
-// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 8
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile i64 10, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: ret void
-//
-__global__ void kernel_function_builtin_alloca_default_align(int* a){
-  volatile void *alloca = __builtin_alloca_with_align((*a)*sizeof(long), 64);
-  static_cast<volatile long*>(alloca)[0] = 10;
-}
-
-// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z48kernel_function_builtin_alloca_non_default_alignPi(
-// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 32, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: ret void
-//
-__global__ void kernel_function_builtin_alloca_non_default_align(int* a){
-  volatile void *alloca = __builtin_alloca_with_align((*a)*sizeof(unsigned), 256);
-  static_cast<volatile unsigned*>(alloca)[0] = 10;
-}
-
-// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z62kernel_function_builtin_alloca_non_default_align_uninitializedPi(
-// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 32, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: ret void
-//
-__global__ void kernel_function_builtin_alloca_non_default_align_uninitialized(int* a){
-  volatile void *alloca = __builtin_alloca_with_align_uninitialized((*a)*sizeof(unsigned), 256);
-  static_cast<volatile unsigned*>(alloca)[0] = 10;
-}
-
-// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z35kernel_function_variable_size_arrayPi(
-// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[SAVED_STACK:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// CHECK-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[SAVED_STACK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SAVED_STACK]] to ptr
-// CHECK-NEXT: [[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(5) @llvm.stacksave.p5()
-// CHECK-NEXT: store ptr addrspace(5) [[TMP3]], ptr [[SAVED_STACK_ASCAST]], align 4
-// CHECK-NEXT: [[VLA:%.*]] = alloca i32, i64 [[TMP2]], align 4, addrspace(5)
-// CHECK-NEXT: [[VLA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA]] to ptr
-// CHECK-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA_ASCAST]], i64 2
-// CHECK-NEXT: store i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(5), ptr [[SAVED_STACK_ASCAST]], align 4
-// CHECK-NEXT: call void @llvm.stackrestore.p5(ptr addrspace(5) [[TMP4]])
-// CHECK-NEXT: ret void
-//
-__global__ void kernel_function_variable_size_array(int* a){
-  int arr[*a];
-  arr[2] = 10;
-}
-
-// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z51kernel_function_non_entry_block_static_sized_allocaPi(
-// CHECK-SAME: ptr addrspace(1) noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA2:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: [[ALLOCA2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA2]] to ptr
-// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: [[A1:%.*]] = load ptr, ptr [[A_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[A1]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
-// CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
-// CHECK: [[IF_THEN]]:
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP3]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
-// CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(5) [[TMP4]] to ptr
-// CHECK-NEXT: store ptr [[TMP5]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 0
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: br label %[[IF_END:.*]]
-// CHECK: [[IF_ELSE]]:
-// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
-// CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 2, [[TMP8]]
-// CHECK-NEXT: [[CONV4:%.*]] = sext i32 [[MUL3]] to i64
-// CHECK-NEXT: [[MUL5:%.*]] = mul i64 [[CONV4]], 4
-// CHECK-NEXT: [[TMP9:%.*]] = alloca i8, i64 [[MUL5]], align 8, addrspace(5)
-// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(5) [[TMP9]] to ptr
-// CHECK-NEXT: store ptr [[TMP10]], ptr [[ALLOCA2_ASCAST]], align 8
-// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[ALLOCA2_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 0
-// CHECK-NEXT: store volatile i32 20, ptr [[ARRAYIDX6]], align 4
-// CHECK-NEXT: br label %[[IF_END]]
-// CHECK: [[IF_END]]:
-// CHECK-NEXT: ret void
-//
-__global__ void kernel_function_non_entry_block_static_sized_alloca(int* a){
- if(*a < 10){
- volatile void *alloca = __builtin_alloca((*a)*sizeof(int));
- static_cast<volatile int*>(alloca)[0] = 10;
- }
- else {
- volatile void *alloca = __builtin_alloca(2*(*a)*sizeof(int));
- static_cast<volatile int*>(alloca)[0] = 20;
- }
-}
-
-// CHECK-LABEL: define dso_local void @_Z50device_function_non_entry_block_builtin_alloca_immv(
-// CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 10, align 8, addrspace(5)
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[TMP0]] to ptr
-// CHECK-NEXT: store ptr [[TMP1]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 0
-// CHECK-NEXT: store i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: ret void
-//
-__device__ void device_function_non_entry_block_builtin_alloca_imm(){
- int *alloca = static_cast<int *>(__builtin_alloca(10));
- alloca[0] = 10;
-}
-
-// CHECK-LABEL: define dso_local void @_Z30device_function_builtin_allocaPi(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: ret void
-//
-__device__ void device_function_builtin_alloca(int* a){
- volatile void *alloca = __builtin_alloca((*a)*sizeof(int));
- static_cast<volatile int*>(alloca)[0] = 10;
-}
-
-// CHECK-LABEL: define dso_local void @_Z44device_function_builtin_alloca_uninitializedPi(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile float 1.000000e+01, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: ret void
-//
-__device__ void device_function_builtin_alloca_uninitialized(int* a){
- volatile void *alloca = __builtin_alloca_uninitialized((*a)*sizeof(float));
- static_cast<volatile float*>(alloca)[0] = 10.0;
-}
-
-// CHECK-LABEL: define dso_local void @_Z44device_function_builtin_alloca_default_alignPi(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 8
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile i64 10, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: ret void
-//
-__device__ void device_function_builtin_alloca_default_align(int* a){
- volatile void *alloca = __builtin_alloca_with_align((*a)*sizeof(long), 64);
- static_cast<volatile long*>(alloca)[0] = 10;
-}
-
-// CHECK-LABEL: define dso_local void @_Z48device_function_builtin_alloca_non_default_alignPi(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 32, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: ret void
-//
-__device__ void device_function_builtin_alloca_non_default_align(int* a){
- volatile void *alloca = __builtin_alloca_with_align((*a)*sizeof(unsigned), 256);
- static_cast<volatile unsigned*>(alloca)[0] = 10;
-}
-
-// CHECK-LABEL: define dso_local void @_Z62device_function_builtin_alloca_non_default_align_uninitializedPi(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
-// CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[MUL]], align 32, addrspace(5)
-// CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: ret void
-//
-__device__ void device_function_builtin_alloca_non_default_align_uninitialized(int* a){
- volatile void *alloca = __builtin_alloca_with_align_uninitialized((*a)*sizeof(unsigned), 256);
- static_cast<volatile unsigned*>(alloca)[0] = 10;
-}
-
-// CHECK-LABEL: define dso_local void @_Z35device_function_variable_size_arrayPi(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[SAVED_STACK:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
-// CHECK-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[SAVED_STACK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SAVED_STACK]] to ptr
-// CHECK-NEXT: [[__VLA_EXPR0_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__VLA_EXPR0]] to ptr
-// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(5) @llvm.stacksave.p5()
-// CHECK-NEXT: store ptr addrspace(5) [[TMP3]], ptr [[SAVED_STACK_ASCAST]], align 4
-// CHECK-NEXT: [[VLA:%.*]] = alloca i32, i64 [[TMP2]], align 4, addrspace(5)
-// CHECK-NEXT: [[VLA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA]] to ptr
-// CHECK-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA_ASCAST]], i64 2
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(5), ptr [[SAVED_STACK_ASCAST]], align 4
-// CHECK-NEXT: call void @llvm.stackrestore.p5(ptr addrspace(5) [[TMP4]])
-// CHECK-NEXT: ret void
-//
-__device__ void device_function_variable_size_array(int* a){
- volatile int arr[*a];
- arr[2] = 10;
-}
-
-// CHECK-LABEL: define dso_local void @_Z51device_function_non_entry_block_static_sized_allocaPi(
-// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR2]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[ALLOCA1:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[ALLOCA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-// CHECK-NEXT: [[ALLOCA1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA1]] to ptr
-// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
-// CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
-// CHECK: [[IF_THEN]]:
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP3]] to i64
-// CHECK-NEXT: [[MUL:%.*]] = mul i64 [[CONV]], 4
-// CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[MUL]], align 8, addrspace(5)
-// CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(5) [[TMP4]] to ptr
-// CHECK-NEXT: store ptr [[TMP5]], ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ALLOCA_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 0
-// CHECK-NEXT: store volatile i32 10, ptr [[ARRAYIDX]], align 4
-// CHECK-NEXT: br label %[[IF_END:.*]]
-// CHECK: [[IF_ELSE]]:
-// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
-// CHECK-NEXT: [[MUL2:%.*]] = mul nsw i32 2, [[TMP8]]
-// CHECK-NEXT: [[CONV3:%.*]] = sext i32 [[MUL2]] to i64
-// CHECK-NEXT: [[MUL4:%.*]] = mul i64 [[CONV3]], 4
-// CHECK-NEXT: [[TMP9:%.*]] = alloca i8, i64 [[MUL4]], align 8, addrspace(5)
-// CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(5) [[TMP9]] to ptr
-// CHECK-NEXT: store ptr [[TMP10]], ptr [[ALLOCA1_ASCAST]], align 8
-// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[ALLOCA1_ASCAST]], align 8
-// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 0
-// CHECK-NEXT: store volatile i32 20, ptr [[ARRAYIDX5]], align 4
-// CHECK-NEXT: br label %[[IF_END]]
-// CHECK: [[IF_END]]:
-// CHECK-NEXT: ret void
-//
-__device__ void device_function_non_entry_block_static_sized_alloca(int* a){
- if(*a < 10){
- volatile void *alloca = __builtin_alloca((*a)*sizeof(int));
- static_cast<volatile int*>(alloca)[0] = 10;
- }else {
- volatile void *alloca = __builtin_alloca(2*(*a)*sizeof(int));
- static_cast<volatile int*>(alloca)[0] = 20;
- }
- /// Check formatting.
-}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
index 21780805c6978a..13416bf8935ab3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
@@ -4,14 +4,6 @@
 ; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align4
 ; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align4 void (ptr addrspace(1)): unsupported dynamic alloca

-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 32 (in function: func_dynamic_stackalloc_vgpr_align32)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align32
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align32 void (i32): unsupported dynamic alloca
-
 define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1) %ptr) {
 %id = call i32 @llvm.amdgcn.workitem.id.x()
 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
@@ -21,12 +13,56 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1
 ret void
 }

+; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_default_align)
+; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_default_align
+; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_default_align void (ptr addrspace(1)): unsupported dynamic alloca
+
+define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_default_align(ptr addrspace(1) %ptr) {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
+ %n = load i32, ptr addrspace(1) %gep
+ %alloca = alloca i32, i32 %n, addrspace(5)
+ store volatile ptr addrspace(5) %alloca, ptr addrspace(1) undef
+ ret void
+}
+
+; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: kernel_dynamic_stackalloc_vgpr_align64)
+; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align64
+; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align64 void (ptr addrspace(1)): unsupported dynamic alloca
+
+define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align64(ptr addrspace(1) %ptr) {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
+ %n = load i32, ptr addrspace(1) %gep
+ %alloca = alloca i32, i32 %n, align 64, addrspace(5)
+ store volatile ptr addrspace(5) %alloca, ptr addrspace(1) undef
+ ret void
+}
+
+; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4)
+; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
+; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca
+
 define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) {
 %alloca = alloca i32, i32 %n, align 4, addrspace(5)
 store volatile ptr addrspace(5) %alloca, ptr addrspace(1) undef
 ret void
 }

+; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_default_align)
+; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_default_align
+; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_default_align void (i32): unsupported dynamic alloca
+
+define void @func_dynamic_stackalloc_vgpr_default_align(i32 %n) {
+ %alloca = alloca i32, i32 %n, addrspace(5)
+ store volatile ptr addrspace(5) %alloca, ptr addrspace(1) undef
+ ret void
+}
+
+; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 32 (in function: func_dynamic_stackalloc_vgpr_align32)
+; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align32
+; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align32 void (i32): unsupported dynamic alloca
+
 define void @func_dynamic_stackalloc_vgpr_align32(i32 %n) {
 %alloca = alloca i32, i32 %n, align 32, addrspace(5)
 store volatile ptr addrspace(5) %alloca, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index a44cea1da4573a..741323a201d02e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -418,88 +418,3 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 store i32 0, ptr addrspace(5) %alloca
 ret void
 }
-
-define amdgpu_kernel void @kernel_non_entry_block_static_alloca(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
-; GFX9-LABEL: kernel_non_entry_block_static_alloca:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s4, s[8:9], 0x8
-; GFX9-NEXT: s_add_u32 s0, s0, s17
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_mov_b32 s33, 0
-; GFX9-NEXT: s_movk_i32 s32, 0x1000
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s4, 0
-; GFX9-NEXT: s_cbranch_scc0 .LBB6_2
-; GFX9-NEXT: ; %bb.1: ; %bb.1
-; GFX9-NEXT: s_endpgm
-; GFX9-NEXT: .LBB6_2: ; %bb.0
-; GFX9-NEXT: s_add_u32 s4, s32, 0x400
-; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: kernel_non_entry_block_static_alloca:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dword s4, s[8:9], 0x8
-; GFX10-NEXT: s_add_u32 s0, s0, s17
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_mov_b32 s33, 0
-; GFX10-NEXT: s_movk_i32 s32, 0x800
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-NEXT: s_cbranch_scc0 .LBB6_2
-; GFX10-NEXT: ; %bb.1: ; %bb.1
-; GFX10-NEXT: s_endpgm
-; GFX10-NEXT: .LBB6_2: ; %bb.0
-; GFX10-NEXT: s_add_u32 s4, s32, 0x200
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_and_b32 s4, s4, 0xfffff800
-; GFX10-NEXT: v_mov_b32_e32 v2, 1
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:4
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: kernel_non_entry_block_static_alloca:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT: s_mov_b32 s33, 0
-; GFX11-NEXT: s_mov_b32 s32, 64
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB6_2
-; GFX11-NEXT: ; %bb.1: ; %bb.1
-; GFX11-NEXT: s_endpgm
-; GFX11-NEXT: .LBB6_2: ; %bb.0
-; GFX11-NEXT: s_add_u32 s0, s32, 0x200
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
-; GFX11-NEXT: s_and_b32 s0, s0, 0xfffff800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_u32 s1, s0, 4
-; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_store_b32 off, v1, s1 dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_endpgm
- entry:
- %cond = icmp eq i32 %arg.cond, 0
- br i1 %cond, label %bb.0, label %bb.1
-
- bb.0:
- %alloca = alloca i32, i32 4, align 64, addrspace(5)
- %gep1 = getelementptr i32, ptr addrspace(5) %alloca, i32 1
- store volatile i32 0, ptr addrspace(5) %alloca
- store volatile i32 1, ptr addrspace(5) %gep1
- br label %bb.1
-
- bb.1:
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 32aff6ffa481fc..73aa87e5c55d20 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -5,7 +5,7 @@ target datalayout = "A5"

 ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca

-define amdgpu_kernel void @test_dynamic_stackalloc(i32 %n) {
+define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
 %alloca = alloca i32, i32 %n, addrspace(5)
 store volatile i32 123, ptr addrspace(5) %alloca
 ret void
@@ -13,38 +13,180 @@

 ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca

-define amdgpu_kernel void @test_dynamic_stackalloc_multiple_allocas(i32 %n) {
+define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i32 %n) {
+ %alloca = alloca i32, i32 %n, align 128, addrspace(5)
+ store volatile i32 10, ptr addrspace(5) %alloca
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(i32 %n) {
+ %alloca = alloca i32, i32 %n, align 2, addrspace(5)
+ store volatile i32 22, ptr addrspace(5) %alloca
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %alloca = alloca float, i32 %idx, addrspace(5)
+ store volatile i32 123, ptr addrspace(5) %alloca
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() {
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
+ store volatile i32 444, ptr addrspace(5) %alloca
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligned() {
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %alloca = alloca i128, i32 %idx, align 2, addrspace(5)
+ store volatile i32 666, ptr addrspace(5) %alloca
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %n, i32 %m) {
+entry:
+ %cond = icmp eq i32 %n, 0
 %alloca1 = alloca i32, i32 8, addrspace(5)
- %alloca2 = alloca i32, i32 %n, addrspace(5)
- %alloca3 = alloca i32, i32 10, addrspace(5)
+ %alloca2 = alloca i17, i32 %n, addrspace(5)
+ br i1 %cond, label %bb.0, label %bb.1
+bb.0:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %alloca3 = alloca i32, i32 %m, align 64, addrspace(5)
+ %alloca4 = alloca i32, i32 %idx, align 4, addrspace(5)
+ store volatile i32 3, ptr addrspace(5) %alloca3
+ store volatile i32 4, ptr addrspace(5) %alloca4
+ br label %bb.1
+bb.1:
 store volatile i32 1, ptr addrspace(5) %alloca1
 store volatile i32 2, ptr addrspace(5) %alloca2
- store volatile i32 3, ptr addrspace(5) %alloca3
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i32 %m) {
+entry:
+ %cond = icmp eq i32 %n, 0
+ br i1 %cond, label %bb.0, label %bb.1
+bb.0:
+ %alloca2 = alloca i32, i32 %m, align 64, addrspace(5)
+ store volatile i32 2, ptr addrspace(5) %alloca2
+ br label %bb.2
+bb.1:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %alloca1 = alloca i32, i32 %idx, align 4, addrspace(5)
+ store volatile i32 1, ptr addrspace(5) %alloca1
+ br label %bb.2
+bb.2:
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
+ %alloca = alloca i32, i32 %n, addrspace(5)
+ store volatile i32 123, ptr addrspace(5) %alloca
 ret void
 }

 ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca

-define amdgpu_kernel void @test_dynamic_stackalloc_custom_alignment(i32 %n) {
+define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 %alloca = alloca i32, i32 %n, align 128, addrspace(5)
- store volatile i32 1, ptr addrspace(5) %alloca
+ store volatile i32 10, ptr addrspace(5) %alloca
 ret void
 }

 ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca

-define amdgpu_kernel void @test_dynamic_stackalloc_non_entry_block(i32 %n) {
- entry:
- %cond = icmp eq i32 %n, 0
- br i1 %cond, label %bb.0, label %bb.1
+define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
+ %alloca = alloca i32, i32 %n, align 2, addrspace(5)
+ store volatile i32 22, ptr addrspace(5) %alloca
+ ret void
+}

- bb.0:
- %alloca = alloca i32, i32 %n, align 64, addrspace(5)
- %gep1 = getelementptr i32, ptr addrspace(5) %alloca, i32 1
- store volatile i32 0, ptr addrspace(5) %alloca
- store volatile i32 1, ptr addrspace(5) %gep1
- br label %bb.1
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca

- bb.1:
- ret void
+define void @test_dynamic_stackalloc_device_divergent() {
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %alloca = alloca i32, i32 %idx, addrspace(5)
+ store volatile i32 123, ptr addrspace(5) %alloca
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
+ store volatile i32 444, ptr addrspace(5) %alloca
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %alloca = alloca i32, i32 %idx, align 2, addrspace(5)
+ store volatile i32 666, ptr addrspace(5) %alloca
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
+entry:
+ %cond = icmp eq i32 %n, 0
+ %alloca1 = alloca i32, i32 8, addrspace(5)
+ %alloca2 = alloca i32, i32 %n, addrspace(5)
+ br i1 %cond, label %bb.0, label %bb.1
+bb.0:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %alloca3 = alloca i32, i32 %m, align 64, addrspace(5)
+ %alloca4 = alloca i32, i32 %idx, align 4, addrspace(5)
+ store volatile i32 3, ptr addrspace(5) %alloca3
+ store volatile i32 4, ptr addrspace(5) %alloca4
+ br label %bb.1
+bb.1:
+ store volatile i32 1, ptr addrspace(5) %alloca1
+ store volatile i32 2, ptr addrspace(5) %alloca2
+ ret void
+}
+
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
+
+define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
+entry:
+ %cond = icmp eq i32 %n, 0
+ br i1 %cond, label %bb.0, label %bb.1
+bb.0:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %alloca1 = alloca i32, i32 %idx, align 4, addrspace(5)
+ store volatile i32 1, ptr addrspace(5) %alloca1
+ br label %bb.2
+bb.1:
+ %alloca2 = alloca i32, i32 %m, align 64, addrspace(5)
+ store volatile i32 2, ptr addrspace(5) %alloca2
+ br label %bb.2
+bb.2:
+ ret void
 }

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits