Author: macurtis-amd
Date: 2025-08-21T04:38:55-05:00
New Revision: 0c480dd4b61e285bfda4de99c77da28922e64b94

URL: 
https://github.com/llvm/llvm-project/commit/0c480dd4b61e285bfda4de99c77da28922e64b94
DIFF: 
https://github.com/llvm/llvm-project/commit/0c480dd4b61e285bfda4de99c77da28922e64b94.diff

LOG: [clang][CodeGen] cast addr space of ReturnValue if needed (#154380)

Fixes a bug on AMDGPU targets where a pointer was stored as address
space 5, but then loaded as address space 0.

Issue found as part of [Kokkos](https://github.com/kokkos/kokkos)
testing, specifically `hip.atomics` (see
[core/unit_test/TestAtomics.hpp](https://github.com/kokkos/kokkos/blob/develop/core/unit_test/TestAtomics.hpp)).

Issue was introduced by commit
[39ec9de7c230](https://github.com/llvm/llvm-project/commit/39ec9de7c230)
- [clang][CodeGen] sret args should always point to the alloca AS, so
use that (https://github.com/llvm/llvm-project/pull/114062).

Added: 
    clang/test/CodeGenHIP/store-addr-space.hip

Modified: 
    clang/lib/CodeGen/CGDecl.cpp
    clang/lib/CodeGen/CGExpr.cpp
    clang/lib/CodeGen/CodeGenFunction.h
    clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp
    clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
    clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl

Removed: 
    


################################################################################
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 9df1220c78623..8a1675848e13c 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -1563,11 +1563,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
       // The named return value optimization: allocate this variable in the
       // return slot, so that we can elide the copy when returning this
       // variable (C++0x [class.copy]p34).
-      address = ReturnValue;
       AllocaAddr =
           RawAddress(ReturnValue.emitRawPointer(*this),
                      ReturnValue.getElementType(), ReturnValue.getAlignment());
-      ;
+      address = MaybeCastStackAddressSpace(AllocaAddr, Ty.getAddressSpace());
 
       if (const RecordType *RecordTy = Ty->getAs<RecordType>()) {
         const auto *RD = RecordTy->getOriginalDecl()->getDefinitionOrSelf();

diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index d229d81d6b934..2329fa20a2530 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -108,13 +108,10 @@ CodeGenFunction::CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits Align,
   return RawAddress(Alloca, Ty, Align, KnownNonNull);
 }
 
-RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS,
-                                             CharUnits Align, const Twine &Name,
-                                             llvm::Value *ArraySize,
-                                             RawAddress *AllocaAddr) {
-  RawAddress Alloca = CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize);
-  if (AllocaAddr)
-    *AllocaAddr = Alloca;
+RawAddress CodeGenFunction::MaybeCastStackAddressSpace(RawAddress Alloca,
+                                                       LangAS DestLangAS,
+                                                       llvm::Value *ArraySize) {
+
   llvm::Value *V = Alloca.getPointer();
   // Alloca always returns a pointer in alloca address space, which may
   // be different from the type defined by the language. For example,
@@ -134,7 +131,18 @@ RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS,
         /*IsNonNull=*/true);
   }
 
-  return RawAddress(V, Ty, Align, KnownNonNull);
+  return RawAddress(V, Alloca.getElementType(), Alloca.getAlignment(),
+                    KnownNonNull);
+}
+
+RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS,
+                                             CharUnits Align, const Twine &Name,
+                                             llvm::Value *ArraySize,
+                                             RawAddress *AllocaAddr) {
+  RawAddress Alloca = CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize);
+  if (AllocaAddr)
+    *AllocaAddr = Alloca;
+  return MaybeCastStackAddressSpace(Alloca, DestLangAS, ArraySize);
 }
 
 /// CreateTempAlloca - This creates an alloca and inserts it into the entry

diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index ad318f289ee83..fc65199a0f154 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -2804,6 +2804,13 @@ class CodeGenFunction : public CodeGenTypeCache {
     AllocaTracker Tracker;
   };
 
+private:
+  /// If \p Alloca is not in the same address space as \p DestLangAS, insert an
+  /// address space cast and return a new RawAddress based on this value.
+  RawAddress MaybeCastStackAddressSpace(RawAddress Alloca, LangAS DestLangAS,
+                                        llvm::Value *ArraySize = nullptr);
+
+public:
   /// CreateTempAlloca - This creates an alloca and inserts it into the entry
   /// block if \p ArraySize is nullptr, otherwise inserts it at the current
   /// insertion point of the builder. The caller is responsible for setting an

diff --git a/clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp b/clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp
index 320c712b665de..a0ee54dc16ba3 100644
--- a/clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp
+++ b/clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp
@@ -10,16 +10,15 @@ struct X { int z[17]; };
 // CHECK-NEXT:    [[Y_ADDR:%.*]] = alloca i8, align 1, addrspace(5)
 // CHECK-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[X_ADDR]] to ptr
 // CHECK-NEXT:    [[Y_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[Y_ADDR]] to ptr
+// CHECK-NEXT:    [[AGG_RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[AGG_RESULT]] to ptr
 // CHECK-NEXT:    store i8 [[X]], ptr [[X_ADDR_ASCAST]], align 1
 // CHECK-NEXT:    store i8 [[Y]], ptr [[Y_ADDR_ASCAST]], align 1
 // CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[X_ADDR_ASCAST]], align 1
-// CHECK-NEXT:    [[AGG_RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[AGG_RESULT]] to ptr
 // CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr 
[[AGG_RESULT_ASCAST]], i64 1
 // CHECK-NEXT:    store i8 [[TMP0]], ptr [[ADD_PTR]], align 1
 // CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[Y_ADDR_ASCAST]], align 1
-// CHECK-NEXT:    [[AGG_RESULT_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) 
[[AGG_RESULT]] to ptr
-// CHECK-NEXT:    [[ADD_PTR2:%.*]] = getelementptr inbounds i8, ptr 
[[AGG_RESULT_ASCAST1]], i64 2
-// CHECK-NEXT:    store i8 [[TMP1]], ptr [[ADD_PTR2]], align 1
+// CHECK-NEXT:    [[ADD_PTR1:%.*]] = getelementptr inbounds i8, ptr 
[[AGG_RESULT_ASCAST]], i64 2
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[ADD_PTR1]], align 1
 // CHECK-NEXT:    ret void
 //
 X foo(char x, char y) {

diff --git a/clang/test/CodeGenHIP/store-addr-space.hip b/clang/test/CodeGenHIP/store-addr-space.hip
new file mode 100644
index 0000000000000..6103edba46274
--- /dev/null
+++ b/clang/test/CodeGenHIP/store-addr-space.hip
@@ -0,0 +1,46 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --functions "bar" --version 5
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm 
-fcuda-is-device \
+// RUN:   -o - %s | FileCheck --check-prefix=AMDGCN --enable-var-scope %s
+
+struct Foo {
+  unsigned long long val;
+//
+  __attribute__((device)) inline Foo() { val = 0; }
+  __attribute__((device)) inline Foo(const Foo &src) { val = src.val; }
+  __attribute__((device)) inline Foo(const volatile Foo &src) { val = src.val; 
}
+};
+
+// AMDGCN-LABEL: define dso_local void @_Z3barPK3Foo(
+// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable 
sret([[STRUCT_FOO:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef 
[[SRC_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr addrspace(5), align 4, 
addrspace(5)
+// AMDGCN-NEXT:    [[SRC_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT:    [[DST:%.*]] = alloca [[UNION_ANON:%.*]], align 8, 
addrspace(5)
+// AMDGCN-NEXT:    [[RESULT_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[RESULT_PTR]] to ptr
+// AMDGCN-NEXT:    [[SRC_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[SRC_PTR_ADDR]] to ptr
+// AMDGCN-NEXT:    [[AGG_RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[AGG_RESULT]] to ptr
+// AMDGCN-NEXT:    [[DST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST]] 
to ptr
+// AMDGCN-NEXT:    store ptr addrspace(5) [[AGG_RESULT]], ptr 
[[RESULT_PTR_ASCAST]], align 4
+// AMDGCN-NEXT:    store ptr [[SRC_PTR]], ptr [[SRC_PTR_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT:    call void @_ZN3FooC1Ev(ptr noundef nonnull align 8 
dereferenceable(8) [[AGG_RESULT_ASCAST]]) #[[ATTR1:[0-9]+]]
+// AMDGCN-NEXT:    store ptr [[AGG_RESULT_ASCAST]], ptr [[DST_ASCAST]], align 8
+// AMDGCN-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SRC_PTR_ADDR_ASCAST]], align 
8
+// AMDGCN-NEXT:    [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FOO]], 
ptr [[TMP0]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load i64, ptr [[VAL]], align 8
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DST_ASCAST]], align 8
+// AMDGCN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr 
[[TMP2]], i64 0
+// AMDGCN-NEXT:    store i64 [[TMP1]], ptr [[ARRAYIDX]], align 8
+// AMDGCN-NEXT:    ret void
+//
+__attribute__((device)) Foo bar(const Foo *const src_ptr) {
+  Foo result;
+
+  union {
+    Foo* const ptr;
+    unsigned long long * const ptr64;
+  } dst = {&result};
+
+  dst.ptr64[0] = src_ptr->val;
+  return result;
+}

diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index a70e9af75fa38..85157bdcf43f9 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -647,6 +647,7 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 // AMDGCN20-NEXT:    [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, 
addrspace(5)
 // AMDGCN20-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[RETVAL]] to ptr
 // AMDGCN20-NEXT:    [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr
+// AMDGCN20-NEXT:    [[RETVAL_ASCAST_ASCAST:%.*]] = addrspacecast ptr 
[[RETVAL_ASCAST]] to ptr addrspace(5)
 // AMDGCN20-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw 
[[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0
 // AMDGCN20-NEXT:    store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 
4
 // AMDGCN20-NEXT:    [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr 
[[RETVAL_ASCAST]], align 4

diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index a1a114ef129a1..bc65788c17352 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -121,6 +121,7 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 // AMDGCN-NEXT:    [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, 
addrspace(5)
 // AMDGCN-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[RETVAL]] to ptr
 // AMDGCN-NEXT:    [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr
+// AMDGCN-NEXT:    [[RETVAL_ASCAST_ASCAST:%.*]] = addrspacecast ptr 
[[RETVAL_ASCAST]] to ptr addrspace(5)
 // AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw 
[[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0
 // AMDGCN-NEXT:    store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4
 // AMDGCN-NEXT:    [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr 
[[RETVAL_ASCAST]], align 4


        
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to