https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/114144
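The pass previously refused to promote an alloca whose user had a non-scalar-pointer type, so a vector-typed getelementptr on a scalar alloca base blocked promotion to LDS. As a minimal sketch of the case this patch handles (the function name here is invented; the IR shape is lifted from the tests added below):

  define amdgpu_kernel void @vector_gep_example() {
    ; One scalar alloca base plus a vector of offsets yields a vector of
    ; private (addrspace(5)) pointers into the alloca.
    %alloca = alloca [4 x i32], align 4, addrspace(5)
    %gep = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
    %elt = extractelement <4 x ptr addrspace(5)> %gep, i64 0
    store i32 0, ptr addrspace(5) %elt
    ret void
  }

When the alloca is promoted to LDS, each worklist value is retyped with Type::getWithNewType, so a <4 x ptr addrspace(5)> becomes a <4 x ptr addrspace(3)>, and null pointer constants feeding selects, phis, and icmps (including vector zeroinitializer) are rewritten with Constant::getNullValue of the new type.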
>From 5fb1bed79e2a51343d9d0dbd5b295a1eefb04756 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Tue, 29 Oct 2024 14:44:53 -0700
Subject: [PATCH] AMDGPU: Improve vector of pointer handling in
 amdgpu-promote-alloca

---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp |  73 +++---
 .../promote-alloca-invalid-vector-gep.ll      |  44 ----
 .../AMDGPU/promote-alloca-vector-gep.ll       | 243 ++++++++++++++++++
 3 files changed, 277 insertions(+), 83 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 7dd7388376f474..9834b0f7f4b2f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -1115,9 +1115,10 @@ bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
   if (Val == OtherOp)
     OtherOp = Inst->getOperand(OpIdx1);
 
-  if (isa<ConstantPointerNull>(OtherOp))
+  if (isa<ConstantPointerNull>(OtherOp) || isa<ConstantAggregateZero>(OtherOp))
     return true;
 
+  // TODO: getUnderlyingObject will not work on a vector getelementptr
   Value *OtherObj = getUnderlyingObject(OtherOp);
   if (!isa<AllocaInst>(OtherObj))
     return false;
@@ -1195,36 +1196,19 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
       continue;
     }
 
-    // TODO: If we know the address is only observed through flat pointers, we
-    // could still promote.
-    if (UseInst->getOpcode() == Instruction::AddrSpaceCast)
-      return false;
-
-    // Do not promote vector/aggregate type instructions. It is hard to track
-    // their users.
-    if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
-      return false;
-
-    // TODO: Handle vectors of pointers.
-    if (!User->getType()->isPointerTy())
-      return false;
-
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
       // Be conservative if an address could be computed outside the bounds of
       // the alloca.
       if (!GEP->isInBounds())
         return false;
-    }
-
-    // Only promote a select if we know that the other select operand is from
-    // another pointer that will also be promoted.
-    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+    } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+      // Only promote a select if we know that the other select operand is from
+      // another pointer that will also be promoted.
       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
         return false;
-    }
+    } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
+      // Repeat for phis.
 
-    // Repeat for phis.
-    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
       // TODO: Handle more complex cases. We should be able to replace loops
       // over arrays.
       switch (Phi->getNumIncomingValues()) {
@@ -1237,6 +1221,15 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
       default:
         return false;
       }
+    } else if (!isa<ExtractElementInst>(User)) {
+      // Do not promote vector/aggregate type instructions. It is hard to track
+      // their users.
+
+      // Do not promote addrspacecast.
+      //
+      // TODO: If we know the address is only observed through flat pointers, we
+      // could still promote.
+      return false;
     }
 
     WorkList.push_back(User);
@@ -1490,17 +1483,21 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
 
   SmallVector<IntrinsicInst *> DeferredIntrs;
 
+  PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
+
   for (Value *V : WorkList) {
     CallInst *Call = dyn_cast<CallInst>(V);
     if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
-        PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
+        Value *LHS = CI->getOperand(0);
+        Value *RHS = CI->getOperand(1);
 
-        if (isa<ConstantPointerNull>(CI->getOperand(0)))
-          CI->setOperand(0, ConstantPointerNull::get(NewTy));
+        Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
+        if (isa<ConstantPointerNull, ConstantAggregateZero>(LHS))
+          CI->setOperand(0, Constant::getNullValue(NewTy));
 
-        if (isa<ConstantPointerNull>(CI->getOperand(1)))
-          CI->setOperand(1, ConstantPointerNull::get(NewTy));
+        if (isa<ConstantPointerNull, ConstantAggregateZero>(RHS))
+          CI->setOperand(1, Constant::getNullValue(NewTy));
 
         continue;
       }
@@ -1510,25 +1507,23 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
     if (isa<AddrSpaceCastInst>(V))
       continue;
 
-    PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
-
-    assert(isa<PointerType>(V->getType()));
+    assert(V->getType()->isPtrOrPtrVectorTy());
 
-    // FIXME: It doesn't really make sense to try to do this for all
-    // instructions.
+    Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
     V->mutateType(NewTy);
 
     // Adjust the types of any constant operands.
     if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
-      if (isa<ConstantPointerNull>(SI->getOperand(1)))
-        SI->setOperand(1, ConstantPointerNull::get(NewTy));
+      if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(1)))
+        SI->setOperand(1, Constant::getNullValue(NewTy));
 
-      if (isa<ConstantPointerNull>(SI->getOperand(2)))
-        SI->setOperand(2, ConstantPointerNull::get(NewTy));
+      if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(2)))
+        SI->setOperand(2, Constant::getNullValue(NewTy));
     } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
       for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
-        if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
-          Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
+        if (isa<ConstantPointerNull, ConstantAggregateZero>(
+                Phi->getIncomingValue(I)))
+          Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
       }
     }
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll
deleted file mode 100644
index b0d578e421e280..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
-
-; Check that invalid IR is not produced on a vector typed
-; getelementptr with a scalar alloca pointer base.
-
-define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
-; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
-; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(5)> [[GETELEMENTPTR]], i64 0
-; CHECK-NEXT:    store i32 0, ptr addrspace(5) [[EXTRACTELEMENT]], align 4
-; CHECK-NEXT:    ret void
-;
-bb:
-  %alloca = alloca i32, align 4, addrspace(5)
-  %getelementptr = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-  %extractelement = extractelement <4 x ptr addrspace(5)> %getelementptr, i64 0
-  store i32 0, ptr addrspace(5) %extractelement
-  ret void
-}
-
-define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(i1 %cond) {
-; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(
-; CHECK-SAME: i1 [[COND:%.*]]) {
-; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT:    [[GETELEMENTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 3, i64 2, i64 1, i64 0>
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(5)> [[GETELEMENTPTR0]], <4 x ptr addrspace(5)> [[GETELEMENTPTR1]]
-; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(5)> [[SELECT]], i64 1
-; CHECK-NEXT:    store i32 0, ptr addrspace(5) [[EXTRACTELEMENT]], align 4
-; CHECK-NEXT:    ret void
-;
-bb:
-  %alloca = alloca i32, align 4, addrspace(5)
-  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-  %getelementptr1 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 3, i64 2, i64 1, i64 0>
-  %select = select i1 %cond, <4 x ptr addrspace(5)> %getelementptr0, <4 x ptr addrspace(5)> %getelementptr1
-  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
-  store i32 0, ptr addrspace(5) %extractelement
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
new file mode 100644
index 00000000000000..355a2b8796b24d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Check that invalid IR is not produced on a vector typed
+; getelementptr with a scalar alloca pointer base.
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[GETELEMENTPTR]], i64 0
+; CHECK-NEXT:    store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %extractelement = extractelement <4 x ptr addrspace(5)> %getelementptr, i64 0
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+; TODO: Should be able to promote this
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[GETELEMENTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 3, i64 2, i64 1, i64 0>
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(5)> [[GETELEMENTPTR0]], <4 x ptr addrspace(5)> [[GETELEMENTPTR1]]
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(5)> [[SELECT]], i64 1
+; CHECK-NEXT:    store i32 0, ptr addrspace(5) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %getelementptr1 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 3, i64 2, i64 1, i64 0>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> %getelementptr0, <4 x ptr addrspace(5)> %getelementptr1
+  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr0(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr0(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr0.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> zeroinitializer, <4 x ptr addrspace(3)> [[GETELEMENTPTR0]]
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[SELECT]], i64 1
+; CHECK-NEXT:    store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> zeroinitializer, <4 x ptr addrspace(5)> %getelementptr0
+  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr1(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr1(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr1.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> [[GETELEMENTPTR0]], <4 x ptr addrspace(3)> zeroinitializer
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[SELECT]], i64 1
+; CHECK-NEXT:    store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> %getelementptr0, <4 x ptr addrspace(5)> zeroinitializer
+  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr0(i1 %cond, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr0(
+; CHECK-SAME: i1 [[COND:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[BB0:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr0.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> zeroinitializer, <4 x ptr addrspace(3)> [[GETELEMENTPTR0]]
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq <4 x ptr addrspace(3)> [[SELECT]], zeroinitializer
+; CHECK-NEXT:    store <4 x i1> [[ICMP]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NEXT:    ret void
+;
+bb0:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> zeroinitializer, <4 x ptr addrspace(5)> %getelementptr0
+  %icmp = icmp eq <4 x ptr addrspace(5)> %select, zeroinitializer
+  store <4 x i1> %icmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr1(i1 %cond, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr1(
+; CHECK-SAME: i1 [[COND:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[BB0:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr1.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> zeroinitializer, <4 x ptr addrspace(3)> [[GETELEMENTPTR0]]
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq <4 x ptr addrspace(3)> zeroinitializer, [[SELECT]]
+; CHECK-NEXT:    store <4 x i1> [[ICMP]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NEXT:    ret void
+;
+bb0:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> zeroinitializer, <4 x ptr addrspace(5)> %getelementptr0
+  %icmp = icmp eq <4 x ptr addrspace(5)> zeroinitializer, %select
+  store <4 x i1> %icmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_phi_nullptr(i1 %cond, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_phi_nullptr(
+; CHECK-SAME: i1 [[COND:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[BB0:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_phi_nullptr.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[BB1:.*]], label %[[BB2:.*]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    br label %[[BB2]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi <4 x ptr addrspace(3)> [ [[GETELEMENTPTR0]], %[[BB1]] ], [ zeroinitializer, %[[BB0]] ]
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[PHI]], i64 2
+; CHECK-NEXT:    store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb0:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  br i1 %cond, label %bb1, label %bb2
+
+bb1:
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  br label %bb2
+
+bb2:
+  %phi = phi <4 x ptr addrspace(5)> [ %getelementptr0, %bb1 ], [ zeroinitializer, %bb0 ]
+  %extractelement = extractelement <4 x ptr addrspace(5)> %phi, i64 2
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+;.
+; CHECK: [[META0]] = !{}
+; CHECK: [[RNG1]] = !{i32 0, i32 1025}
+;.

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits