Author: macurtis-amd Date: 2025-08-21T04:38:55-05:00 New Revision: 0c480dd4b61e285bfda4de99c77da28922e64b94
URL: https://github.com/llvm/llvm-project/commit/0c480dd4b61e285bfda4de99c77da28922e64b94 DIFF: https://github.com/llvm/llvm-project/commit/0c480dd4b61e285bfda4de99c77da28922e64b94.diff LOG: [clang][CodeGen] cast addr space of ReturnValue if needed (#154380) Fixes a bug on AMDGPU targets where a pointer was stored as address space 5, but then loaded as address space 0. Issue found as part of [Kokkos](https://github.com/kokkos/kokkos) testing, specifically `hip.atomics` (see [core/unit_test/TestAtomics.hpp](https://github.com/kokkos/kokkos/blob/develop/core/unit_test/TestAtomics.hpp)). Issue was introduced by commit [39ec9de7c230](https://github.com/llvm/llvm-project/commit/39ec9de7c230) - [clang][CodeGen] sret args should always point to the alloca AS, so use that (https://github.com/llvm/llvm-project/pull/114062). Added: clang/test/CodeGenHIP/store-addr-space.hip Modified: clang/lib/CodeGen/CGDecl.cpp clang/lib/CodeGen/CGExpr.cpp clang/lib/CodeGen/CodeGenFunction.h clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp clang/test/CodeGenOpenCL/addr-space-struct-arg.cl clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl Removed: ################################################################################ diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 9df1220c78623..8a1675848e13c 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1563,11 +1563,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) { // The named return value optimization: allocate this variable in the // return slot, so that we can elide the copy when returning this // variable (C++0x [class.copy]p34). 
- address = ReturnValue; AllocaAddr = RawAddress(ReturnValue.emitRawPointer(*this), ReturnValue.getElementType(), ReturnValue.getAlignment()); - ; + address = MaybeCastStackAddressSpace(AllocaAddr, Ty.getAddressSpace()); if (const RecordType *RecordTy = Ty->getAs<RecordType>()) { const auto *RD = RecordTy->getOriginalDecl()->getDefinitionOrSelf(); diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index d229d81d6b934..2329fa20a2530 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -108,13 +108,10 @@ CodeGenFunction::CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits Align, return RawAddress(Alloca, Ty, Align, KnownNonNull); } -RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS, - CharUnits Align, const Twine &Name, - llvm::Value *ArraySize, - RawAddress *AllocaAddr) { - RawAddress Alloca = CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize); - if (AllocaAddr) - *AllocaAddr = Alloca; +RawAddress CodeGenFunction::MaybeCastStackAddressSpace(RawAddress Alloca, + LangAS DestLangAS, + llvm::Value *ArraySize) { + llvm::Value *V = Alloca.getPointer(); // Alloca always returns a pointer in alloca address space, which may // be different from the type defined by the language. 
For example, @@ -134,7 +131,18 @@ RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS, /*IsNonNull=*/true); } - return RawAddress(V, Ty, Align, KnownNonNull); + return RawAddress(V, Alloca.getElementType(), Alloca.getAlignment(), + KnownNonNull); +} + +RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS, + CharUnits Align, const Twine &Name, + llvm::Value *ArraySize, + RawAddress *AllocaAddr) { + RawAddress Alloca = CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize); + if (AllocaAddr) + *AllocaAddr = Alloca; + return MaybeCastStackAddressSpace(Alloca, DestLangAS, ArraySize); } /// CreateTempAlloca - This creates an alloca and inserts it into the entry diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index ad318f289ee83..fc65199a0f154 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -2804,6 +2804,13 @@ class CodeGenFunction : public CodeGenTypeCache { AllocaTracker Tracker; }; +private: + /// If \p Alloca is not in the same address space as \p DestLangAS, insert an + /// address space cast and return a new RawAddress based on this value. + RawAddress MaybeCastStackAddressSpace(RawAddress Alloca, LangAS DestLangAS, + llvm::Value *ArraySize = nullptr); + +public: /// CreateTempAlloca - This creates an alloca and inserts it into the entry /// block if \p ArraySize is nullptr, otherwise inserts it at the current /// insertion point of the builder. 
The caller is responsible for setting an diff --git a/clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp b/clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp index 320c712b665de..a0ee54dc16ba3 100644 --- a/clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp +++ b/clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp @@ -10,16 +10,15 @@ struct X { int z[17]; }; // CHECK-NEXT: [[Y_ADDR:%.*]] = alloca i8, align 1, addrspace(5) // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr // CHECK-NEXT: [[Y_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[Y_ADDR]] to ptr +// CHECK-NEXT: [[AGG_RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_RESULT]] to ptr // CHECK-NEXT: store i8 [[X]], ptr [[X_ADDR_ASCAST]], align 1 // CHECK-NEXT: store i8 [[Y]], ptr [[Y_ADDR_ASCAST]], align 1 // CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[X_ADDR_ASCAST]], align 1 -// CHECK-NEXT: [[AGG_RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_RESULT]] to ptr // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT_ASCAST]], i64 1 // CHECK-NEXT: store i8 [[TMP0]], ptr [[ADD_PTR]], align 1 // CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[Y_ADDR_ASCAST]], align 1 -// CHECK-NEXT: [[AGG_RESULT_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[AGG_RESULT]] to ptr -// CHECK-NEXT: [[ADD_PTR2:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT_ASCAST1]], i64 2 -// CHECK-NEXT: store i8 [[TMP1]], ptr [[ADD_PTR2]], align 1 +// CHECK-NEXT: [[ADD_PTR1:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT_ASCAST]], i64 2 +// CHECK-NEXT: store i8 [[TMP1]], ptr [[ADD_PTR1]], align 1 // CHECK-NEXT: ret void // X foo(char x, char y) { diff --git a/clang/test/CodeGenHIP/store-addr-space.hip b/clang/test/CodeGenHIP/store-addr-space.hip new file mode 100644 index 0000000000000..6103edba46274 --- /dev/null +++ b/clang/test/CodeGenHIP/store-addr-space.hip @@ -0,0 +1,46 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py UTC_ARGS: --functions "bar" --version 5 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -fcuda-is-device \ +// RUN: -o - %s | FileCheck --check-prefix=AMDGCN --enable-var-scope %s + +struct Foo { + unsigned long long val; +// + __attribute__((device)) inline Foo() { val = 0; } + __attribute__((device)) inline Foo(const Foo &src) { val = src.val; } + __attribute__((device)) inline Foo(const volatile Foo &src) { val = src.val; } +}; + +// AMDGCN-LABEL: define dso_local void @_Z3barPK3Foo( +// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_FOO:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[SRC_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[RESULT_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +// AMDGCN-NEXT: [[SRC_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[DST:%.*]] = alloca [[UNION_ANON:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[RESULT_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_PTR]] to ptr +// AMDGCN-NEXT: [[SRC_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_PTR_ADDR]] to ptr +// AMDGCN-NEXT: [[AGG_RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_RESULT]] to ptr +// AMDGCN-NEXT: [[DST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST]] to ptr +// AMDGCN-NEXT: store ptr addrspace(5) [[AGG_RESULT]], ptr [[RESULT_PTR_ASCAST]], align 4 +// AMDGCN-NEXT: store ptr [[SRC_PTR]], ptr [[SRC_PTR_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: call void @_ZN3FooC1Ev(ptr noundef nonnull align 8 dereferenceable(8) [[AGG_RESULT_ASCAST]]) #[[ATTR1:[0-9]+]] +// AMDGCN-NEXT: store ptr [[AGG_RESULT_ASCAST]], ptr [[DST_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_PTR_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FOO]], ptr [[TMP0]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load i64, ptr 
[[VAL]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DST_ASCAST]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 0 +// AMDGCN-NEXT: store i64 [[TMP1]], ptr [[ARRAYIDX]], align 8 +// AMDGCN-NEXT: ret void +// +__attribute__((device)) Foo bar(const Foo *const src_ptr) { + Foo result; + + union { + Foo* const ptr; + unsigned long long * const ptr64; + } dst = {&result}; + + dst.ptr64[0] = src_ptr->val; + return result; +} diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl index a70e9af75fa38..85157bdcf43f9 100644 --- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -647,6 +647,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) // AMDGCN20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr // AMDGCN20-NEXT: [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr +// AMDGCN20-NEXT: [[RETVAL_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[RETVAL_ASCAST]] to ptr addrspace(5) // AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0 // AMDGCN20-NEXT: store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4 // AMDGCN20-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4 diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl index a1a114ef129a1..bc65788c17352 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl @@ -121,6 +121,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[RETVAL]] to ptr // AMDGCN-NEXT: [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr +// AMDGCN-NEXT: [[RETVAL_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[RETVAL_ASCAST]] to ptr addrspace(5) // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0 // AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4 // AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits