https://github.com/actinks updated https://github.com/llvm/llvm-project/pull/171019
>From f1ca224d2ca9f0cabfdbf636cee4ed5cda23b82a Mon Sep 17 00:00:00 2001 From: actink <[email protected]> Date: Sun, 7 Dec 2025 11:25:08 +0800 Subject: [PATCH 1/2] precommit --- .../InferAddressSpaces/NVPTX/load-ptr.ll | 240 ++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll new file mode 100644 index 0000000000000..fb3f55ab89497 --- /dev/null +++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=infer-address-spaces %s | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +define ptx_kernel void @globalmem_flat_ptr_with_global(ptr %a, ptr %b){ +; CHECK-LABEL: define ptx_kernel void @globalmem_flat_ptr_with_global( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[TMP0]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]] +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = load ptr, ptr %a, align 8 + %1 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %idxprom = zext nneg i32 %1 to i64 + %arrayidx = getelementptr inbounds nuw i32, ptr %0, i64 %idxprom + %2 = load i32, ptr %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds nuw i32, ptr %b, i64 %idxprom + store i32 %2, ptr %arrayidx3, align 4 + ret void +} + +@shared_ptrs = internal unnamed_addr addrspace(3) global [32 x ptr] undef, align 8 + +define ptx_kernel void @sharedmem_flat_ptr_with_global(ptr %a, ptr %b) { +; CHECK-LABEL: define ptx_kernel void @sharedmem_flat_ptr_with_global( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr addrspace(3) @shared_ptrs, i64 [[IDXPROM]] +; CHECK-NEXT: store ptr [[ARRAYIDX]], ptr addrspace(3) [[ARRAYIDX3]], align 8 +; CHECK-NEXT: tail call void @llvm.nvvm.bar.warp.sync(i32 -1) +; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(3) [[ARRAYIDX3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP2]], i64 [[IDXPROM]] +; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX9]], align 4 +; CHECK-NEXT: ret void +; 
+entry: + %0 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %idxprom = zext nneg i32 %0 to i64 + %arrayidx = getelementptr inbounds nuw i32, ptr %a, i64 %idxprom + %arrayidx3 = getelementptr inbounds nuw ptr, ptr addrspacecast (ptr addrspace(3) @shared_ptrs to ptr), i64 %idxprom + store ptr %arrayidx, ptr %arrayidx3, align 8 + tail call void @llvm.nvvm.bar.warp.sync(i32 -1) + %1 = load ptr, ptr %arrayidx3, align 8 + %2 = load i32, ptr %1, align 4 + %arrayidx9 = getelementptr inbounds nuw i32, ptr %b, i64 %idxprom + store i32 %2, ptr %arrayidx9, align 4 + ret void +} + +@a = dso_local addrspace(1) externally_initialized global ptr null, align 8 [email protected] = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @a to ptr)], section "llvm.metadata" + +define dso_local ptx_kernel void @device_var_with_global(ptr %b) { +; CHECK-LABEL: define dso_local ptx_kernel void @device_var_with_global( +; CHECK-SAME: ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(1) @a, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP0]], i64 [[IDXPROM]] +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = load ptr, ptr addrspacecast (ptr addrspace(1) @a to ptr), align 8 + %1 = load ptr, ptr %0, align 8 + %2 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %idxprom = zext nneg i32 %2 to i64 + %arrayidx = getelementptr inbounds nuw i32, ptr %1, i64 %idxprom + %3 = load i32, ptr %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds nuw i32, ptr %b, i64 %idxprom + store i32 %3, ptr %arrayidx3, align 4 + ret void +} + + +define ptx_kernel void @globalmem_flat_ptr_with_global_clobber(ptr %a, ptr %b) { +; CHECK-LABEL: define ptx_kernel void @globalmem_flat_ptr_with_global_clobber( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[TMP0]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP4]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]] +; CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(1) [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 4 +; CHECK-NEXT: store i32 [[TMP5]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = load ptr, ptr %a, align 8 + %1 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %idxprom = zext nneg i32 %1 to i64 + %arrayidx = getelementptr inbounds nuw ptr, ptr %b, i64 %idxprom + ; 1 = MemoryDef(liveOnEntry) + store ptr %0, ptr %arrayidx, align 8 + ; MemoryUse(1) + %2 = load i32, ptr %0, align 4 + %arrayidx4 = getelementptr inbounds nuw 
i8, ptr %0, i64 4 + ; 2 = MemoryDef(1) + store i32 %2, ptr %arrayidx4, align 4 + ret void +} + + +@s_int2 = internal addrspace(3) global [2 x i32] undef, align 4 + +; Function Attrs: convergent mustprogress noinline norecurse nounwind +define dso_local ptx_kernel void @phi_clobber_with_diff_as(ptr %a, ptr %b) { +; CHECK-LABEL: define dso_local ptx_kernel void @phi_clobber_with_diff_as( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1) +; CHECK-NEXT: store i32 0, ptr addrspace(3) @s_int2, align 4 +; CHECK-NEXT: store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @s_int2, i64 4), align 4 +; CHECK-NEXT: tail call void @llvm.nvvm.bar.warp.sync(i32 -1) +; CHECK-NEXT: [[TMP2:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[TMP2]], 15 +; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP2]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[TMP0]], i64 [[IDXPROM]] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[ENTRY_IF_END_CRIT_EDGE:.*]] +; CHECK: [[ENTRY_IF_END_CRIT_EDGE]]: +; CHECK-NEXT: [[DOTPRE:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX]], align 8 +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), ptr addrspace(1) [[ARRAYIDX]], align 8 +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[DOTPRE]], %[[ENTRY_IF_END_CRIT_EDGE]] ], [ addrspacecast (ptr addrspace(3) @s_int2 to ptr), %[[IF_THEN]] ] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]] +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX7]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 0, ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), align 4 + store i32 0, ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), i64 4), align 4 + tail call void @llvm.nvvm.bar.warp.sync(i32 -1) + %0 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cmp = icmp samesign ugt i32 %0, 15 + %idxprom = zext nneg i32 %0 to i64 + %arrayidx = getelementptr inbounds nuw ptr, ptr %a, i64 %idxprom + br i1 %cmp, label %if.then, label %entry.if.end_crit_edge + +entry.if.end_crit_edge: ; preds = %entry + %.pre = load ptr, ptr %arrayidx, align 8 + br label %if.end + +if.then: ; preds = %entry + store ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), ptr %arrayidx, align 8 + br label %if.end + +if.end: ; preds = %entry.if.end_crit_edge, %if.then + %1 = phi ptr [ %.pre, %entry.if.end_crit_edge ], [ addrspacecast (ptr addrspace(3) @s_int2 to ptr), %if.then ] + %2 = load i32, ptr %1, align 4 + %arrayidx7 = getelementptr inbounds nuw i32, ptr %b, i64 %idxprom + store i32 %2, ptr %arrayidx7, align 4 + ret void +} + +define ptx_kernel void @phi_clobber_with_same_as(ptr %a, ptr %b) { +; CHECK-LABEL: define ptx_kernel void @phi_clobber_with_same_as( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1) +; CHECK-NEXT: store i32 0, ptr addrspace(3) @s_int2, align 4 +; CHECK-NEXT: 
store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @s_int2, i64 4), align 4 +; CHECK-NEXT: tail call void @llvm.nvvm.bar.warp.sync(i32 -1) +; CHECK-NEXT: [[TMP2:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[TMP2]], 15 +; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP2]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[TMP0]], i64 [[IDXPROM]] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[ENTRY_IF_END_CRIT_EDGE:.*]] +; CHECK: [[ENTRY_IF_END_CRIT_EDGE]]: +; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), ptr addrspace(1) [[ARRAYIDX]], align 8 +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @s_int2, i64 4) to ptr), ptr addrspace(1) [[ARRAYIDX]], align 8 +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]] +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX7]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 0, ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), align 4 + store i32 0, ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), i64 4), align 4 + tail call void @llvm.nvvm.bar.warp.sync(i32 -1) + %0 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cmp = icmp samesign ugt i32 %0, 15 + %idxprom = zext nneg i32 %0 to i64 + %arrayidx = getelementptr inbounds nuw ptr, ptr %a, i64 %idxprom + br i1 %cmp, label %if.then, label %entry.if.end_crit_edge + +entry.if.end_crit_edge: ; preds = %entry + store ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), ptr %arrayidx, align 8 + br label %if.end + +if.then: ; preds = %entry + store ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), i64 4), ptr %arrayidx, align 8 + br label %if.end + +if.end: ; preds = %entry.if.end_crit_edge, %if.then + %1 = load ptr, ptr %arrayidx, align 8 + %2 = load i32, ptr %1, align 4 + %arrayidx7 = getelementptr inbounds nuw i32, ptr %b, i64 %idxprom + store i32 %2, ptr %arrayidx7, align 4 + ret void +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare void @llvm.nvvm.bar.warp.sync(i32) >From fbc43717f0d2753ee7e3a2bfc0b4e0d3a9968bdc Mon Sep 17 00:00:00 2001 From: actink <[email protected]> Date: Wed, 3 Dec 2025 11:21:43 +0800 Subject: [PATCH 2/2] [InferAddressSpaces] Support address space inference from load values --- .../amdgpu-kernel-arg-pointer-type.cu | 23 ++- .../llvm/Analysis/TargetTransformInfo.h | 6 + .../llvm/Analysis/TargetTransformInfoImpl.h | 6 + llvm/include/llvm/CodeGen/BasicTTIImpl.h | 6 + llvm/include/llvm/Target/TargetMachine.h | 8 + llvm/lib/Analysis/TargetTransformInfo.cpp | 9 + .../AMDGPU/AMDGPUPromoteKernelArguments.cpp | 37 +--- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 31 +++- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 2 + .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 22 +++ .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 2 + .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 48 +++++ .../Target/NVPTX/NVPTXTargetTransformInfo.h | 5 + .../Transforms/Scalar/InferAddressSpaces.cpp | 168 +++++++++++++++++- .../AMDGPU/promote-kernel-arguments.ll | 2 +- 
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 24 ++- .../AMDGPU/infer-address-space.ll | 4 +- .../InferAddressSpaces/NVPTX/load-ptr.ll | 37 ++-- 18 files changed, 354 insertions(+), 86 deletions(-) diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu index a48affaec3c8a..5afe3e7f28242 100644 --- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -95,7 +95,7 @@ __global__ void kernel1(int *x) { // CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8 // CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 // CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8, !nonnull [[META4:![0-9]+]], !align [[META5:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 // CHECK-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4 @@ -111,7 +111,7 @@ __global__ void kernel1(int *x) { // CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8 // CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 // CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 -// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8, !align [[META6:![0-9]+]] // CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4 // CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 // CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP0]], align 4 @@ -435,14 +435,15 @@ __global__ void kernel4(struct S s) { // OPT-SAME: ptr addrspace(1) noundef readonly captures(none) [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { // OPT-NEXT: [[ENTRY:.*:]] // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(1) [[S_COERCE]], align 8 -// OPT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 -// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// OPT-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4 +// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(1) +// OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4 +// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// OPT-NEXT: store i32 [[INC]], ptr addrspace(1) [[TMP1]], align 4 // OPT-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[S_COERCE]], i64 8 -// OPT-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[Y]], align 8 -// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4 -// OPT-NEXT: [[ADD:%.*]] = fadd contract float [[TMP3]], 1.000000e+00 -// OPT-NEXT: store float [[ADD]], ptr [[TMP2]], align 4 +// OPT-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(1) [[Y]], align 8 +// OPT-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4 +// OPT-NEXT: [[ADD:%.*]] = fadd contract float [[TMP4]], 1.000000e+00 +// OPT-NEXT: store float [[ADD]], ptr [[TMP3]], align 4 // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S( @@ -727,7 +728,11 @@ __global__ void kernel8(struct SS a) { *a.x += 3.f; } //. +// CHECK: [[META4]] = !{} +// CHECK: [[META5]] = !{i64 4} +//. 
// CHECK-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1} +// CHECK-SPIRV: [[META6]] = !{i64 4} //. // OPT: [[META4]] = !{} //. diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 99525607f744a..efb352018fbe4 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -563,6 +563,8 @@ class TargetTransformInfo { LLVM_ABI unsigned getAssumedAddrSpace(const Value *V) const; + LLVM_ABI unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const; + LLVM_ABI bool isSingleThreaded() const; LLVM_ABI std::pair<const Value *, unsigned> @@ -577,6 +579,10 @@ class TargetTransformInfo { Value *OldV, Value *NewV) const; + /// Return true if \p IID only performs an artificial clobber to facilitate + /// ordering constraints. + LLVM_ABI bool isArtificialClobber(Intrinsic::ID IID) const; + /// Test whether calls to a function lower to actual program function /// calls. /// diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 835eb7701ccfa..0130b5225ce3f 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -158,6 +158,10 @@ class TargetTransformInfoImplBase { virtual unsigned getAssumedAddrSpace(const Value *V) const { return -1; } + virtual unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const { + return -1; + } + virtual bool isSingleThreaded() const { return false; } virtual std::pair<const Value *, unsigned> @@ -171,6 +175,8 @@ class TargetTransformInfoImplBase { return nullptr; } + virtual bool isArtificialClobber(Intrinsic::ID IID) const { return false; } + virtual bool isLoweredToCall(const Function *F) const { assert(F && "A concrete function must be provided to this routine."); diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 494199835a19c..10708245b1180 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -440,6 +440,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { return getTLI()->getTargetMachine().getAssumedAddrSpace(V); } + unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const override { + return getTLI()->getTargetMachine().getAssumedLiveOnEntryDefAddrSpace(V); + } + bool isSingleThreaded() const override { return getTLI()->getTargetMachine().Options.ThreadModel == ThreadModel::Single; @@ -455,6 +459,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { return nullptr; } + bool isArtificialClobber(Intrinsic::ID IID) const override { return false; } + bool isLegalAddImmediate(int64_t imm) const override { return getTLI()->isLegalAddImmediate(imm); } diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index d0fd483a8ddaa..03e0b43686cd4 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -378,6 +378,14 @@ class LLVM_ABI TargetMachine { /// properties. virtual unsigned getAssumedAddrSpace(const Value *V) const { return -1; } + /// LiveOnEntryDef same as MemorySSA's concept. + /// Loads and stores from pointer arguments and other global values may be + /// defined by memory operations that do not occur in the current function. + /// Return the assumed address space for such memory operations. 
+ virtual unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const { + return -1; + } + /// If the specified predicate checks whether a generic pointer falls within /// a specified address space, return that generic pointer and the address /// space being queried. diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c529d87502acd..d943c2171d6a8 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -339,6 +339,11 @@ unsigned TargetTransformInfo::getAssumedAddrSpace(const Value *V) const { return TTIImpl->getAssumedAddrSpace(V); } +unsigned +TargetTransformInfo::getAssumedLiveOnEntryDefAddrSpace(const Value *V) const { + return TTIImpl->getAssumedLiveOnEntryDefAddrSpace(V); +} + bool TargetTransformInfo::isSingleThreaded() const { return TTIImpl->isSingleThreaded(); } @@ -353,6 +358,10 @@ Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace( return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV); } +bool TargetTransformInfo::isArtificialClobber(Intrinsic::ID IID) const { + return TTIImpl->isArtificialClobber(IID); +} + bool TargetTransformInfo::isLoweredToCall(const Function *F) const { return TTIImpl->isLoweredToCall(F); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp index 06819d05b4be6..8ec9a1b15e6a5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp @@ -34,8 +34,6 @@ class AMDGPUPromoteKernelArguments : public FunctionPass { AliasAnalysis *AA; - Instruction *ArgCastInsertPt; - SmallVector<Value *> Ptrs; void enqueueUsers(Value *Ptr); @@ -107,24 +105,7 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) { PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) enqueueUsers(Ptr); - if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) - return Changed; - - IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator()) - : ArgCastInsertPt); - - // Cast pointer to global address space and back to flat and let - // Infer Address Spaces pass to do all necessary rewriting. - PointerType *NewPT = - PointerType::get(PT->getContext(), AMDGPUAS::GLOBAL_ADDRESS); - Value *Cast = - B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global")); - Value *CastBack = - B.CreateAddrSpaceCast(Cast, PT, Twine(Ptr->getName(), ".flat")); - Ptr->replaceUsesWithIf(CastBack, - [Cast](Use &U) { return U.getUser() != Cast; }); - - return true; + return Changed; } bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) { @@ -135,21 +116,6 @@ bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) { return true; } -// skip allocas -static BasicBlock::iterator getInsertPt(BasicBlock &BB) { - BasicBlock::iterator InsPt = BB.getFirstInsertionPt(); - for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) { - AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt); - - // If this is a dynamic alloca, the value may depend on the loaded kernargs, - // so loads will need to be inserted before it. 
- if (!AI || !AI->isStaticAlloca()) - break; - } - - return InsPt; -} - bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA, AliasAnalysis &AA) { if (skipFunction(F)) @@ -159,7 +125,6 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA, if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty()) return false; - ArgCastInsertPt = &*getInsertPt(*F.begin()); this->MSSA = &MSSA; this->AA = &AA; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e5a35abe6da6b..fbda0196b4617 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1042,13 +1042,38 @@ unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS); const auto *Ptr = LD->getPointerOperand(); - if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) - return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; + // For a generic pointer loaded from the constant memory, it could be assumed // as a global pointer since the constant memory is only populated on the // host side. As implied by the offload programming model, only global // pointers could be referenced on the host side. - return AMDGPUAS::GLOBAL_ADDRESS; + if (Ptr->getType()->getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + return AMDGPUAS::GLOBAL_ADDRESS; + + // For a generic pointer loaded from the readonly and noalias arg, same as + // above. + if (const Argument *Arg = dyn_cast<Argument>(getUnderlyingObject(Ptr))) + if (AMDGPU::isModuleEntryFunctionCC(Arg->getParent()->getCallingConv()) && + Arg->onlyReadsMemory() && Arg->hasNoAliasAttr()) + return AMDGPUAS::GLOBAL_ADDRESS; + + return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; +} + +unsigned +AMDGPUTargetMachine::getAssumedLiveOnEntryDefAddrSpace(const Value *V) const { + if (const Instruction *I = dyn_cast<Instruction>(V)) { + if (AMDGPU::isModuleEntryFunctionCC( + I->getParent()->getParent()->getCallingConv())) + return AMDGPUAS::GLOBAL_ADDRESS; + } + if (const LoadInst *LD = dyn_cast<LoadInst>(V)) { + // same as getAssumedAddrSpace + if (LD->getPointerOperandType()->getPointerAddressSpace() == + AMDGPUAS::CONSTANT_ADDRESS) + return AMDGPUAS::GLOBAL_ADDRESS; + } + return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; } std::pair<const Value *, unsigned> diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 06a3047196b8a..ea21c095faf75 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -62,6 +62,8 @@ class AMDGPUTargetMachine : public CodeGenTargetMachineImpl { unsigned getAssumedAddrSpace(const Value *V) const override; + unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const override; + std::pair<const Value *, unsigned> getPredicatedAddrSpace(const Value *V) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index dfa21515838ff..a151b0c3989fa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1223,6 +1223,28 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, } } +bool GCNTTIImpl::isArtificialClobber(Intrinsic::ID IID) const { + switch (IID) { + case Intrinsic::amdgcn_s_barrier: + case Intrinsic::amdgcn_s_cluster_barrier: + case Intrinsic::amdgcn_s_barrier_signal: + case 
Intrinsic::amdgcn_s_barrier_signal_var: + case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_init: + case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_barrier_wait: + case Intrinsic::amdgcn_s_barrier_leave: + case Intrinsic::amdgcn_s_get_barrier_state: + case Intrinsic::amdgcn_wave_barrier: + case Intrinsic::amdgcn_sched_barrier: + case Intrinsic::amdgcn_sched_group_barrier: + case Intrinsic::amdgcn_iglp_opt: + return true; + default: + return false; + } +} + InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef<int> Mask, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 20da8344c9d37..12be42c16d025 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -210,6 +210,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override; + bool isArtificialClobber(Intrinsic::ID IID) const override; + bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 5d5553c573b0f..c61aae8335aa4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -592,6 +592,32 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, return nullptr; } +bool NVPTXTTIImpl::isArtificialClobber(Intrinsic::ID IID) const { + switch (IID) { + case Intrinsic::nvvm_bar_warp_sync: + case Intrinsic::nvvm_barrier_cluster_arrive: + case Intrinsic::nvvm_barrier_cluster_arrive_aligned: + case Intrinsic::nvvm_barrier_cluster_arrive_relaxed: + case Intrinsic::nvvm_barrier_cluster_arrive_relaxed_aligned: + case Intrinsic::nvvm_barrier_cluster_wait: + case Intrinsic::nvvm_barrier_cluster_wait_aligned: + case Intrinsic::nvvm_barrier_cta_arrive_aligned_count: + case Intrinsic::nvvm_barrier_cta_arrive_count: + case Intrinsic::nvvm_barrier_cta_sync_aligned_all: + case Intrinsic::nvvm_barrier_cta_sync_aligned_count: + case Intrinsic::nvvm_barrier_cta_sync_all: + case Intrinsic::nvvm_barrier_cta_sync_count: + case Intrinsic::nvvm_barrier0_and: + case Intrinsic::nvvm_barrier0_or: + case Intrinsic::nvvm_barrier0_popc: + case Intrinsic::nvvm_membar_cta: + case Intrinsic::nvvm_membar_gl: + case Intrinsic::nvvm_membar_sys: + return true; + default: + return false; + } +} bool NVPTXTTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment, unsigned AddrSpace, TTI::MaskKind MaskKind) const { @@ -657,6 +683,28 @@ unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const { } } + if (const auto *LD = dyn_cast<LoadInst>(V)) { + // It must be a generic pointer loaded. + assert(V->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GENERIC); + + // For a generic pointer loaded from the readonly and noalias arg, it could + // be assumed as a global pointer since the readonly memory is only + // populated on the host side. 
+ if (const Argument *Arg = + dyn_cast<Argument>(getUnderlyingObject(LD->getPointerOperand()))) + if (isKernelFunction(*Arg->getParent()) && Arg->onlyReadsMemory() && + Arg->hasNoAliasAttr()) + return ADDRESS_SPACE_GLOBAL; + } + return -1; +} + +unsigned NVPTXTTIImpl::getAssumedLiveOnEntryDefAddrSpace(const Value *V) const { + if (const Instruction *I = dyn_cast<Instruction>(V)) { + if (isKernelFunction(*I->getParent()->getParent())) { + return ADDRESS_SPACE_GLOBAL; + } + } return -1; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index d7f4e1da4073b..e1cab29df4c1d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -191,8 +191,13 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> { Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override; + + bool isArtificialClobber(Intrinsic::ID IID) const override; + unsigned getAssumedAddrSpace(const Value *V) const override; + unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const override; + void collectKernelLaunchBounds( const Function &F, SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override; diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 352a1b331001a..1f8e81fe100d7 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -94,7 +94,9 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" @@ -176,6 +178,8 @@ class InferAddressSpaces : public FunctionPass { AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<MemorySSAWrapperPass>(); } bool runOnFunction(Function &F) override; @@ -186,8 +190,9 @@ class InferAddressSpacesImpl { Function *F = nullptr; const DominatorTree *DT = nullptr; const TargetTransformInfo *TTI = nullptr; + MemorySSA *MSSA = nullptr; + mutable BatchAAResults BatchAA; const DataLayout *DL = nullptr; - /// Target specific address space which uses of should be replaced if /// possible. 
unsigned FlatAddrSpace = 0; @@ -245,11 +250,19 @@ class InferAddressSpacesImpl { unsigned getPredicatedAddrSpace(const Value &PtrV, const Value *UserCtx) const; + unsigned + getLoadPtrAddrSpaceImpl(const LoadInst *LI, unsigned NewAS, MemoryAccess *MA, + ValueToAddrSpaceMapTy &InferredAddrSpace, + SmallPtrSet<MemoryAccess *, 8> Visited) const; + unsigned getLoadPtrAddrSpace(const LoadInst *LI, + ValueToAddrSpaceMapTy &InferredAddrSpace) const; public: InferAddressSpacesImpl(AssumptionCache &AC, const DominatorTree *DT, - const TargetTransformInfo *TTI, unsigned FlatAddrSpace) - : AC(AC), DT(DT), TTI(TTI), FlatAddrSpace(FlatAddrSpace) {} + const TargetTransformInfo *TTI, MemorySSA *MSSA, + AliasAnalysis *AA, unsigned FlatAddrSpace) + : AC(AC), DT(DT), TTI(TTI), MSSA(MSSA), BatchAA(*AA), + FlatAddrSpace(FlatAddrSpace) {} bool run(Function &F); }; @@ -261,6 +274,8 @@ INITIALIZE_PASS_BEGIN(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_END(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", false, false) @@ -327,6 +342,9 @@ static bool isAddressExpression(const Value &V, const DataLayout &DL, case Instruction::AddrSpaceCast: case Instruction::GetElementPtr: return true; + case Instruction::Load: + return TTI->getAssumedLiveOnEntryDefAddrSpace(&V) != + UninitializedAddressSpace; case Instruction::Select: return Op->getType()->isPtrOrPtrVectorTy(); case Instruction::Call: { @@ -360,6 +378,8 @@ getPointerOperands(const Value &V, const DataLayout &DL, case Instruction::AddrSpaceCast: case Instruction::GetElementPtr: return {Op.getOperand(0)}; + case Instruction::Load: + return {}; case Instruction::Select: return {Op.getOperand(1), Op.getOperand(2)}; case Instruction::Call: { @@ -561,9 +581,11 @@ InferAddressSpacesImpl::collectFlatAddressExpressions(Function &F) const { PushPtrOperand(GEP->getPointerOperand()); } else if (auto *LI = dyn_cast<LoadInst>(&I)) PushPtrOperand(LI->getPointerOperand()); - else if (auto *SI = dyn_cast<StoreInst>(&I)) + else if (auto *SI = dyn_cast<StoreInst>(&I)) { PushPtrOperand(SI->getPointerOperand()); - else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) + if (SI->getValueOperand()->getType()->isPtrOrPtrVectorTy()) + PushPtrOperand(SI->getValueOperand()); + } else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) PushPtrOperand(RMW->getPointerOperand()); else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I)) PushPtrOperand(CmpX->getPointerOperand()); @@ -900,6 +922,14 @@ Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace( return NewI; } + if (auto *LD = dyn_cast<LoadInst>(V)) { + Type *NewPtrTy = getPtrOrVecOfPtrsWithNewAS(LD->getType(), NewAddrSpace); + auto *NewI = new AddrSpaceCastInst(V, NewPtrTy); + NewI->insertAfter(LD->getIterator()); + NewI->setDebugLoc(LD->getDebugLoc()); + return NewI; + } + if (Instruction *I = dyn_cast<Instruction>(V)) { Value *NewV = cloneInstructionWithNewAddressSpace( I, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS, PoisonUsesToFix); @@ -1027,6 +1057,117 @@ InferAddressSpacesImpl::getPredicatedAddrSpace(const Value &Ptr, return UninitializedAddressSpace; } +static bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, + BatchAAResults *AA, + const TargetTransformInfo *TTI) { + Instruction *DI = Def->getMemoryInst(); + + if (auto *CB = dyn_cast<CallBase>(DI); + CB && 
CB->onlyAccessesInaccessibleMemory()) + return false; + + if (isa<FenceInst>(DI)) + return false; + + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DI); + II && TTI->isArtificialClobber(II->getIntrinsicID())) { + return false; + } + + // Ignore atomics not aliasing with the original load, any atomic is a + // universal MemoryDef from MSSA's point of view too, just like a fence. + const auto checkNoAlias = [AA, Ptr](auto I) -> bool { + return I && AA->isNoAlias(MemoryLocation::get(dyn_cast<Instruction>( + I->getPointerOperand())), + MemoryLocation::get(dyn_cast<LoadInst>(Ptr))); + }; + + if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DI)) || + checkNoAlias(dyn_cast<AtomicRMWInst>(DI))) + return false; + + return true; +} + +unsigned InferAddressSpacesImpl::getLoadPtrAddrSpaceImpl( + const LoadInst *LI, unsigned AS, MemoryAccess *MA, + ValueToAddrSpaceMapTy &InferredAddrSpace, + SmallPtrSet<MemoryAccess *, 8> Visited) const { + MemorySSAWalker *Walker = MSSA->getWalker(); + MemoryLocation Loc(MemoryLocation::get(LI)); + + if (MSSA->isLiveOnEntryDef(MA)) + return TTI->getAssumedLiveOnEntryDefAddrSpace(LI); + + if (!Visited.insert(MA).second) + return AS; + + if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) { + LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n'); + + if (!isReallyAClobber(LI->getPointerOperand(), Def, &BatchAA, TTI)) + return getLoadPtrAddrSpaceImpl( + LI, AS, + Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc), + InferredAddrSpace, Visited); + + LLVM_DEBUG(dbgs() << " -> load is clobbered\n"); + Instruction *DI = Def->getMemoryInst(); + + StoreInst *SI = dyn_cast<StoreInst>(DI); + + // TODO: handle other memory writing instructions + if (!SI) + return FlatAddrSpace; + + Type *ValType = SI->getValueOperand()->getType(); + unsigned ValAS = FlatAddrSpace; + auto I = InferredAddrSpace.find(SI->getValueOperand()); + if (I != InferredAddrSpace.end()) + ValAS = I->second; + else if (ValType->isPtrOrPtrVectorTy()) + ValAS = ValType->getPointerAddressSpace(); + + AS = joinAddressSpaces(AS, ValAS); + + if (AS == FlatAddrSpace) + return FlatAddrSpace; + + if (BatchAA.isMustAlias(Loc, MemoryLocation::get(SI))) { + LLVM_DEBUG(dbgs() << " -> must alias with store: " << *SI << "\n"); + return AS; + } + + return getLoadPtrAddrSpaceImpl( + LI, AS, + Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc), + InferredAddrSpace, Visited); + } + + const MemoryPhi *Phi = cast<MemoryPhi>(MA); + for (const auto &Use : Phi->incoming_values()) { + AS = getLoadPtrAddrSpaceImpl(LI, AS, cast<MemoryAccess>(&Use), + InferredAddrSpace, Visited); + if (AS == FlatAddrSpace) + return FlatAddrSpace; + } + + return AS; +} + +unsigned InferAddressSpacesImpl::getLoadPtrAddrSpace( + const LoadInst *LI, ValueToAddrSpaceMapTy &InferredAddrSpace) const { + if (TTI->getAssumedLiveOnEntryDefAddrSpace(LI) == UninitializedAddressSpace) + return UninitializedAddressSpace; + + SmallPtrSet<MemoryAccess *, 8> Visited; + LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *LI << '\n'); + return getLoadPtrAddrSpaceImpl( + LI, UninitializedAddressSpace, + MSSA->getWalker()->getClobberingMemoryAccess(LI), InferredAddrSpace, + Visited); +} + bool InferAddressSpacesImpl::updateAddressSpace( const Value &V, ValueToAddrSpaceMapTy &InferredAddrSpace, PredicatedAddrSpaceMapTy &PredicatedAS) const { @@ -1045,6 +1186,8 @@ bool InferAddressSpacesImpl::updateAddressSpace( if (AS != UninitializedAddressSpace) { // Use the assumed address space directly. 
NewAS = AS; + } else if (auto *LD = dyn_cast<LoadInst>(&V)) { + NewAS = getLoadPtrAddrSpace(LD, InferredAddrSpace); } else { // Otherwise, infer the address space from its pointer operands. SmallVector<Constant *, 2> ConstantPtrOps; @@ -1307,6 +1450,14 @@ void InferAddressSpacesImpl::performPointerReplacement( } } + if (StoreInst *SI = dyn_cast<StoreInst>(CurUserI)) { + // replace store value operand + if (V == SI->getValueOperand()) { + U.set(NewV); + return; + } + } + // Otherwise, replaces the use with flat(NewV). if (isa<Instruction>(V) || isa<Instruction>(NewV)) { // Don't create a copy of the original addrspacecast. @@ -1455,7 +1606,8 @@ bool InferAddressSpaces::runOnFunction(Function &F) { return InferAddressSpacesImpl( getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), DT, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F), - FlatAddrSpace) + &getAnalysis<MemorySSAWrapperPass>().getMSSA(), + &getAnalysis<AAResultsWrapperPass>().getAAResults(), FlatAddrSpace) .run(F); } @@ -1473,7 +1625,9 @@ PreservedAnalyses InferAddressSpacesPass::run(Function &F, bool Changed = InferAddressSpacesImpl(AM.getResult<AssumptionAnalysis>(F), AM.getCachedResult<DominatorTreeAnalysis>(F), - &AM.getResult<TargetIRAnalysis>(F), FlatAddrSpace) + &AM.getResult<TargetIRAnalysis>(F), + &AM.getResult<MemorySSAAnalysis>(F).getMSSA(), + &AM.getResult<AAManager>(F), FlatAddrSpace) .run(F); if (Changed) { PreservedAnalyses PA; diff --git a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll index 0696cbe5aa891..f68964a96d67a 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll @@ -81,8 +81,8 @@ entry: define amdgpu_kernel void @flat_ptr_arg(ptr nocapture readonly noalias %Arg, ptr nocapture noalias %Out, i32 %X) { ; CHECK-LABEL: @flat_ptr_arg( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OUT_GLOBAL:%.*]] = addrspacecast ptr [[OUT:%.*]] to ptr addrspace(1) ; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1) +; CHECK-NEXT: [[OUT_GLOBAL:%.*]] = addrspacecast ptr [[OUT:%.*]] to ptr addrspace(1) ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i64 [[IDXPROM]] diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index ca2914a2e8043..1570a2006c47e 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -220,19 +220,17 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-NEXT: .reg .b64 %rd<4>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %SPL, __local_depot4; -; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.b64 %rd1, [escape_ptr_store_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTX-NEXT: add.u64 %rd3, %SP, 0; -; PTX-NEXT: add.u64 %rd4, %SPL, 0; +; PTX-NEXT: add.u64 %rd3, %SPL, 0; ; PTX-NEXT: ld.param.b32 %r1, [escape_ptr_store_param_1+4]; -; PTX-NEXT: st.local.b32 [%rd4+4], %r1; +; PTX-NEXT: st.local.b32 [%rd3+4], %r1; ; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_store_param_1]; -; PTX-NEXT: st.local.b32 [%rd4], %r2; +; PTX-NEXT: st.local.b32 [%rd3], %r2; ; PTX-NEXT: st.global.b64 [%rd2], %rd3; ; 
PTX-NEXT: ret; entry: @@ -258,21 +256,19 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %SPL, __local_depot5; -; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.b64 %rd1, [escape_ptr_gep_store_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTX-NEXT: add.u64 %rd3, %SP, 0; -; PTX-NEXT: add.u64 %rd4, %SPL, 0; +; PTX-NEXT: add.u64 %rd3, %SPL, 0; ; PTX-NEXT: ld.param.b32 %r1, [escape_ptr_gep_store_param_1+4]; -; PTX-NEXT: st.local.b32 [%rd4+4], %r1; +; PTX-NEXT: st.local.b32 [%rd3+4], %r1; ; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_gep_store_param_1]; -; PTX-NEXT: st.local.b32 [%rd4], %r2; -; PTX-NEXT: add.s64 %rd5, %rd3, 4; -; PTX-NEXT: st.global.b64 [%rd2], %rd5; +; PTX-NEXT: st.local.b32 [%rd3], %r2; +; PTX-NEXT: add.s64 %rd4, %rd3, 4; +; PTX-NEXT: st.global.b64 [%rd2], %rd4; ; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll index a08110defc8b3..caf7c7abbeab7 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll @@ -159,6 +159,8 @@ define amdgpu_kernel void @loop_with_generic_bound() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[END:%.*]] = load ptr, ptr addrspace(1) @generic_end, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[END]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP2]] to ptr ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[I:%.*]] = phi ptr addrspace(3) [ @array, %[[ENTRY]] ], [ [[I2:%.*]], %[[LOOP]] ] @@ -166,7 +168,7 @@ define amdgpu_kernel void @loop_with_generic_bound() #0 { ; CHECK-NEXT: call void @use(float [[V]]) ; CHECK-NEXT: [[I2]] = getelementptr float, ptr addrspace(3) [[I]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq ptr [[TMP0]], [[END]] +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] ; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll index fb3f55ab89497..96c92bb1bb443 100644 --- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll +++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll @@ -10,10 +10,11 @@ define ptx_kernel void @globalmem_flat_ptr_with_global(ptr %a, ptr %b){ ; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1) ; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[TMP0]], align 8 +; CHECK-NEXT: [[DOTGLOBAL:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(1) ; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr 
inbounds i32, ptr addrspace(1) [[DOTGLOBAL]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]] ; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 4 ; CHECK-NEXT: ret void @@ -29,23 +30,24 @@ entry: ret void } -@shared_ptrs = internal unnamed_addr addrspace(3) global [32 x ptr] undef, align 8 +@shared_ptrs = internal unnamed_addr addrspace(3) global [32 x ptr] poison, align 8 define ptx_kernel void @sharedmem_flat_ptr_with_global(ptr %a, ptr %b) { ; CHECK-LABEL: define ptx_kernel void @sharedmem_flat_ptr_with_global( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) -; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr ; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1) ; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP0]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = addrspacecast ptr addrspace(1) [[ARRAYIDX1]] to ptr ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr addrspace(3) @shared_ptrs, i64 [[IDXPROM]] ; CHECK-NEXT: store ptr [[ARRAYIDX]], ptr addrspace(3) [[ARRAYIDX3]], align 8 ; CHECK-NEXT: tail call void @llvm.nvvm.bar.warp.sync(i32 -1) ; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(3) [[ARRAYIDX3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[DOTGLOBAL:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[DOTGLOBAL]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP2]], i64 [[IDXPROM]] ; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX9]], align 4 ; CHECK-NEXT: ret void @@ -73,11 +75,13 @@ define dso_local ptx_kernel void @device_var_with_global(ptr %b) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1) ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(1) @a, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[DOTGLOBAL1:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[DOTGLOBAL1]], align 8 +; CHECK-NEXT: [[DOTGLOBAL:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(1) ; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DOTGLOBAL]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP0]], i64 [[IDXPROM]] ; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 4 ; CHECK-NEXT: ret void @@ -102,13 +106,15 @@ define ptx_kernel void @globalmem_flat_ptr_with_global_clobber(ptr %a, ptr %b) { ; 
CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
 ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
 ; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[TMP0]], align 8
+; CHECK-NEXT: [[DOTGLOBAL:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(1) [[DOTGLOBAL]] to ptr
 ; CHECK-NEXT: [[TMP4:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP4]] to i64
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]]
-; CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(1) [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 4
-; CHECK-NEXT: store i32 [[TMP5]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(1) [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[DOTGLOBAL]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DOTGLOBAL]], i64 4
+; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX4]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -127,7 +133,7 @@ entry:
 }


-@s_int2 = internal addrspace(3) global [2 x i32] undef, align 4
+@s_int2 = internal addrspace(3) global [2 x i32] poison, align 4

 ; Function Attrs: convergent mustprogress noinline norecurse nounwind
 define dso_local ptx_kernel void @phi_clobber_with_diff_as(ptr %a, ptr %b) {
@@ -205,7 +211,8 @@ define ptx_kernel void @phi_clobber_with_same_as(ptr %a, ptr %b) {
 ; CHECK-NEXT: br label %[[IF_END]]
 ; CHECK: [[IF_END]]:
 ; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[DOTGLOBAL:%.*]] = addrspacecast ptr [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[DOTGLOBAL]], align 4
 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]]
 ; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX7]], align 4
 ; CHECK-NEXT: ret void
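For context, a minimal CUDA sketch of the source pattern the new inference targets; the kernel and variable names below are illustrative only and are not taken from the patch. A generic pointer loaded from a readonly, noalias kernel argument is assumed to point to global memory, so accesses through it can be rewritten to addrspace(1) loads and stores, which is roughly the shape of the @globalmem_flat_ptr_with_global test above.

__global__ void copy_through_table(const int *const *__restrict__ table,
                                    int *__restrict__ out) {
  unsigned tid = threadIdx.x;
  // 'p' is a generic pointer loaded from the readonly, noalias kernel
  // argument; with this change InferAddressSpaces can assume it addresses
  // global memory rather than leaving it in the generic address space.
  const int *p = *table;
  out[tid] = p[tid];
}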
