https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/123437

>From 4414706b8ced9048a572fb78544a7e637c4946a0 Mon Sep 17 00:00:00 2001
From: Joseph Huber <hube...@outlook.com>
Date: Fri, 17 Jan 2025 19:56:18 -0600
Subject: [PATCH 1/3] [HIP] Support managed variables using the new driver

Summary:
Previously, managed variables didn't work in rdc mode using the new
driver because we just didn't register them. This was previously ignored
because we didn't have enough space in the current struct format. This
patch amends that by just emitting a struct pair for the two variables
and using the single pointer.

In the future, a more extensible entry format would be nice, but that
can be done later.
---
 clang/lib/CodeGen/CGCUDANV.cpp                | 31 ++++++--
 clang/test/CodeGenCUDA/offloading-entries.cu  | 78 +++++++++----------
 clang/test/Driver/linker-wrapper-image.c      | 28 ++++---
 .../llvm/Frontend/Offloading/Utility.h        |  4 +
 .../Frontend/Offloading/OffloadWrapper.cpp    | 16 ++++
 llvm/lib/Frontend/Offloading/Utility.cpp      | 10 +++
 6 files changed, 109 insertions(+), 58 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index ae14d74f2d91511..0fc81491c40855d 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1221,12 +1221,31 @@ void CGNVCUDARuntime::createOffloadingEntries() {
              ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
              : 0);
     if (I.Flags.getKind() == DeviceVarFlags::Variable) {
-      llvm::offloading::emitOffloadingEntry(
-          M, I.Var, getDeviceSideName(I.D), VarSize,
-          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
-                               : llvm::offloading::OffloadGlobalEntry) |
-              Flags,
-          /*Data=*/0, Section);
+      if (I.Flags.isManaged()) {
+        assert(I.Var->getName().ends_with(".managed") &&
+               "HIP managed variables not transformed");
+
+        // Create a struct to contain the two variables.
+        auto *ManagedVar = M.getNamedGlobal(
+            I.Var->getName().drop_back(StringRef(".managed").size()));
+        llvm::Constant *StructData[] = {ManagedVar, I.Var};
+        llvm::Constant *Initializer = llvm::ConstantStruct::get(
+            llvm::offloading::getManagedTy(M), StructData);
+        auto *Struct = new llvm::GlobalVariable(
+            M, llvm::offloading::getManagedTy(M),
+            /*IsConstant=*/true, llvm::GlobalValue::PrivateLinkage, 
Initializer,
+            I.Var->getName());
+
+        llvm::offloading::emitOffloadingEntry(
+            M, Struct, getDeviceSideName(I.D), VarSize,
+            llvm::offloading::OffloadGlobalManagedEntry | Flags,
+            /*Data=*/static_cast<uint32_t>(I.Var->getAlignment()), Section);
+      } else {
+        llvm::offloading::emitOffloadingEntry(
+            M, I.Var, getDeviceSideName(I.D), VarSize,
+            llvm::offloading::OffloadGlobalEntry | Flags,
+            /*Data=*/0, Section);
+      }
     } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
       llvm::offloading::emitOffloadingEntry(
           M, I.Var, getDeviceSideName(I.D), VarSize,
diff --git a/clang/test/CodeGenCUDA/offloading-entries.cu 
b/clang/test/CodeGenCUDA/offloading-entries.cu
index 259e3324e8ac94f..d46a25969e3ecd7 100644
--- a/clang/test/CodeGenCUDA/offloading-entries.cu
+++ b/clang/test/CodeGenCUDA/offloading-entries.cu
@@ -1,4 +1,4 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --check-globals --global-value-regex ".offloading.entry.*"
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --check-globals --global-value-regex ".offloading.entry.*" "managed.*"
 // RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -fgpu-rdc \
 // RUN:   --offload-new-driver -emit-llvm -o - -x cuda  %s | FileCheck \
 // RUN:   --check-prefix=CUDA %s
@@ -14,50 +14,68 @@
 
 #include "Inputs/cuda.h"
 
+#define __managed__ __attribute__((managed))
+
 //.
+// CUDA: @managed = global i32 undef, align 4
 // CUDA: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] 
c"_Z3foov\00", section ".llvm.rodata.offloading", align 1
 // CUDA: @.offloading.entry._Z3foov = weak constant 
%struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr 
@.offloading.entry_name, i64 0, i32 0, i32 0 }, section 
"cuda_offloading_entries", align 1
 // CUDA: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] 
c"_Z6kernelv\00", section ".llvm.rodata.offloading", align 1
 // CUDA: @.offloading.entry._Z6kernelv = weak constant 
%struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr 
@.offloading.entry_name.1, i64 0, i32 0, i32 0 }, section 
"cuda_offloading_entries", align 1
 // CUDA: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] 
c"var\00", section ".llvm.rodata.offloading", align 1
 // CUDA: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { 
ptr @var, ptr @.offloading.entry_name.2, i64 4, i32 0, i32 0 }, section 
"cuda_offloading_entries", align 1
-// CUDA: @.offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] 
c"surf\00", section ".llvm.rodata.offloading", align 1
-// CUDA: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { 
ptr @surf, ptr @.offloading.entry_name.3, i64 4, i32 2, i32 1 }, section 
"cuda_offloading_entries", align 1
-// CUDA: @.offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] 
c"tex\00", section ".llvm.rodata.offloading", align 1
-// CUDA: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { 
ptr @tex, ptr @.offloading.entry_name.4, i64 4, i32 3, i32 1 }, section 
"cuda_offloading_entries", align 1
+// CUDA: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] 
c"managed\00", section ".llvm.rodata.offloading", align 1
+// CUDA: @.offloading.entry.managed = weak constant 
%struct.__tgt_offload_entry { ptr @managed, ptr @.offloading.entry_name.3, i64 
4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
+// CUDA: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x i8] 
c"surf\00", section ".llvm.rodata.offloading", align 1
+// CUDA: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { 
ptr @surf, ptr @.offloading.entry_name.4, i64 4, i32 2, i32 1 }, section 
"cuda_offloading_entries", align 1
+// CUDA: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] 
c"tex\00", section ".llvm.rodata.offloading", align 1
+// CUDA: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { 
ptr @tex, ptr @.offloading.entry_name.5, i64 4, i32 3, i32 1 }, section 
"cuda_offloading_entries", align 1
 //.
+// HIP: @managed.managed = global i32 0, align 4
+// HIP: @managed = externally_initialized global ptr null
 // HIP: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] 
c"_Z3foov\00", section ".llvm.rodata.offloading", align 1
 // HIP: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry 
{ ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, i32 0, i32 0 }, section 
"hip_offloading_entries", align 1
 // HIP: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] 
c"_Z6kernelv\00", section ".llvm.rodata.offloading", align 1
 // HIP: @.offloading.entry._Z6kernelv = weak constant 
%struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.offloading.entry_name.1, 
i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
 // HIP: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] 
c"var\00", section ".llvm.rodata.offloading", align 1
 // HIP: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { 
ptr @var, ptr @.offloading.entry_name.2, i64 4, i32 0, i32 0 }, section 
"hip_offloading_entries", align 1
-// HIP: @.offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] 
c"surf\00", section ".llvm.rodata.offloading", align 1
-// HIP: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { 
ptr @surf, ptr @.offloading.entry_name.3, i64 4, i32 2, i32 1 }, section 
"hip_offloading_entries", align 1
-// HIP: @.offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] 
c"tex\00", section ".llvm.rodata.offloading", align 1
-// HIP: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { 
ptr @tex, ptr @.offloading.entry_name.4, i64 4, i32 3, i32 1 }, section 
"hip_offloading_entries", align 1
+// HIP: @managed.managed.3 = private constant %struct.__managed_var { ptr 
@managed, ptr @managed.managed }
+// HIP: @.offloading.entry_name.4 = internal unnamed_addr constant [8 x i8] 
c"managed\00", section ".llvm.rodata.offloading", align 1
+// HIP: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry 
{ ptr @managed.managed.3, ptr @.offloading.entry_name.4, i64 4, i32 1, i32 4 }, 
section "hip_offloading_entries", align 1
+// HIP: @.offloading.entry_name.5 = internal unnamed_addr constant [5 x i8] 
c"surf\00", section ".llvm.rodata.offloading", align 1
+// HIP: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { 
ptr @surf, ptr @.offloading.entry_name.5, i64 4, i32 2, i32 1 }, section 
"hip_offloading_entries", align 1
+// HIP: @.offloading.entry_name.6 = internal unnamed_addr constant [4 x i8] 
c"tex\00", section ".llvm.rodata.offloading", align 1
+// HIP: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { 
ptr @tex, ptr @.offloading.entry_name.6, i64 4, i32 3, i32 1 }, section 
"hip_offloading_entries", align 1
 //.
+// CUDA-COFF: @managed = dso_local global i32 undef, align 4
 // CUDA-COFF: @.offloading.entry_name = internal unnamed_addr constant [8 x 
i8] c"_Z3foov\00", section ".llvm.rodata.offloading", align 1
 // CUDA-COFF: @.offloading.entry._Z3foov = weak constant 
%struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr 
@.offloading.entry_name, i64 0, i32 0, i32 0 }, section 
"cuda_offloading_entries$OE", align 1
 // CUDA-COFF: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x 
i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading", align 1
 // CUDA-COFF: @.offloading.entry._Z6kernelv = weak constant 
%struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr 
@.offloading.entry_name.1, i64 0, i32 0, i32 0 }, section 
"cuda_offloading_entries$OE", align 1
 // CUDA-COFF: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x 
i8] c"var\00", section ".llvm.rodata.offloading", align 1
 // CUDA-COFF: @.offloading.entry.var = weak constant 
%struct.__tgt_offload_entry { ptr @var, ptr @.offloading.entry_name.2, i64 4, 
i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
-// CUDA-COFF: @.offloading.entry_name.3 = internal unnamed_addr constant [5 x 
i8] c"surf\00", section ".llvm.rodata.offloading", align 1
-// CUDA-COFF: @.offloading.entry.surf = weak constant 
%struct.__tgt_offload_entry { ptr @surf, ptr @.offloading.entry_name.3, i64 4, 
i32 2, i32 1 }, section "cuda_offloading_entries$OE", align 1
-// CUDA-COFF: @.offloading.entry_name.4 = internal unnamed_addr constant [4 x 
i8] c"tex\00", section ".llvm.rodata.offloading", align 1
-// CUDA-COFF: @.offloading.entry.tex = weak constant 
%struct.__tgt_offload_entry { ptr @tex, ptr @.offloading.entry_name.4, i64 4, 
i32 3, i32 1 }, section "cuda_offloading_entries$OE", align 1
+// CUDA-COFF: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x 
i8] c"managed\00", section ".llvm.rodata.offloading", align 1
+// CUDA-COFF: @.offloading.entry.managed = weak constant 
%struct.__tgt_offload_entry { ptr @managed, ptr @.offloading.entry_name.3, i64 
4, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
+// CUDA-COFF: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x 
i8] c"surf\00", section ".llvm.rodata.offloading", align 1
+// CUDA-COFF: @.offloading.entry.surf = weak constant 
%struct.__tgt_offload_entry { ptr @surf, ptr @.offloading.entry_name.4, i64 4, 
i32 2, i32 1 }, section "cuda_offloading_entries$OE", align 1
+// CUDA-COFF: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x 
i8] c"tex\00", section ".llvm.rodata.offloading", align 1
+// CUDA-COFF: @.offloading.entry.tex = weak constant 
%struct.__tgt_offload_entry { ptr @tex, ptr @.offloading.entry_name.5, i64 4, 
i32 3, i32 1 }, section "cuda_offloading_entries$OE", align 1
 //.
+// HIP-COFF: @managed.managed = dso_local global i32 0, align 4
+// HIP-COFF: @managed = dso_local externally_initialized global ptr null
 // HIP-COFF: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] 
c"_Z3foov\00", section ".llvm.rodata.offloading", align 1
 // HIP-COFF: @.offloading.entry._Z3foov = weak constant 
%struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, 
i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
 // HIP-COFF: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x 
i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading", align 1
 // HIP-COFF: @.offloading.entry._Z6kernelv = weak constant 
%struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.offloading.entry_name.1, 
i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
 // HIP-COFF: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x 
i8] c"var\00", section ".llvm.rodata.offloading", align 1
 // HIP-COFF: @.offloading.entry.var = weak constant 
%struct.__tgt_offload_entry { ptr @var, ptr @.offloading.entry_name.2, i64 4, 
i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
-// HIP-COFF: @.offloading.entry_name.3 = internal unnamed_addr constant [5 x 
i8] c"surf\00", section ".llvm.rodata.offloading", align 1
-// HIP-COFF: @.offloading.entry.surf = weak constant 
%struct.__tgt_offload_entry { ptr @surf, ptr @.offloading.entry_name.3, i64 4, 
i32 2, i32 1 }, section "hip_offloading_entries$OE", align 1
-// HIP-COFF: @.offloading.entry_name.4 = internal unnamed_addr constant [4 x 
i8] c"tex\00", section ".llvm.rodata.offloading", align 1
-// HIP-COFF: @.offloading.entry.tex = weak constant 
%struct.__tgt_offload_entry { ptr @tex, ptr @.offloading.entry_name.4, i64 4, 
i32 3, i32 1 }, section "hip_offloading_entries$OE", align 1
+// HIP-COFF: @managed.managed.3 = private constant %struct.__managed_var { ptr 
@managed, ptr @managed.managed }
+// HIP-COFF: @.offloading.entry_name.4 = internal unnamed_addr constant [8 x 
i8] c"managed\00", section ".llvm.rodata.offloading", align 1
+// HIP-COFF: @.offloading.entry.managed = weak constant 
%struct.__tgt_offload_entry { ptr @managed.managed.3, ptr 
@.offloading.entry_name.4, i64 4, i32 1, i32 4 }, section 
"hip_offloading_entries$OE", align 1
+// HIP-COFF: @.offloading.entry_name.5 = internal unnamed_addr constant [5 x 
i8] c"surf\00", section ".llvm.rodata.offloading", align 1
+// HIP-COFF: @.offloading.entry.surf = weak constant 
%struct.__tgt_offload_entry { ptr @surf, ptr @.offloading.entry_name.5, i64 4, 
i32 2, i32 1 }, section "hip_offloading_entries$OE", align 1
+// HIP-COFF: @.offloading.entry_name.6 = internal unnamed_addr constant [4 x 
i8] c"tex\00", section ".llvm.rodata.offloading", align 1
+// HIP-COFF: @.offloading.entry.tex = weak constant 
%struct.__tgt_offload_entry { ptr @tex, ptr @.offloading.entry_name.6, i64 4, 
i32 3, i32 1 }, section "hip_offloading_entries$OE", align 1
 //.
 // CUDA-LABEL: @_Z18__device_stub__foov(
 // CUDA-NEXT:  entry:
@@ -91,6 +109,7 @@ __global__ void foo() {}
 __device__ int var = 1;
 const __device__ int constant = 1;
 extern __device__ int external;
+__device__ __managed__ int managed = 0;
 
 // CUDA-LABEL: @_Z21__device_stub__kernelv(
 // CUDA-NEXT:  entry:
@@ -137,28 +156,3 @@ template <typename T, int dim = 1, int mode = 0>
 struct __attribute__((device_builtin_texture_type)) texture : public 
textureReference {};
 
 texture<void> tex;
-//.
-// CUDA: [[META0:![0-9]+]] = !{ptr @.offloading.entry_name}
-// CUDA: [[META1:![0-9]+]] = !{ptr @.offloading.entry_name.1}
-// CUDA: [[META2:![0-9]+]] = !{ptr @.offloading.entry_name.2}
-// CUDA: [[META3:![0-9]+]] = !{ptr @.offloading.entry_name.3}
-// CUDA: [[META4:![0-9]+]] = !{ptr @.offloading.entry_name.4}
-//.
-// HIP: [[META0:![0-9]+]] = !{ptr @.offloading.entry_name}
-// HIP: [[META1:![0-9]+]] = !{ptr @.offloading.entry_name.1}
-// HIP: [[META2:![0-9]+]] = !{ptr @.offloading.entry_name.2}
-// HIP: [[META3:![0-9]+]] = !{ptr @.offloading.entry_name.3}
-// HIP: [[META4:![0-9]+]] = !{ptr @.offloading.entry_name.4}
-//.
-// CUDA-COFF: [[META0:![0-9]+]] = !{ptr @.offloading.entry_name}
-// CUDA-COFF: [[META1:![0-9]+]] = !{ptr @.offloading.entry_name.1}
-// CUDA-COFF: [[META2:![0-9]+]] = !{ptr @.offloading.entry_name.2}
-// CUDA-COFF: [[META3:![0-9]+]] = !{ptr @.offloading.entry_name.3}
-// CUDA-COFF: [[META4:![0-9]+]] = !{ptr @.offloading.entry_name.4}
-//.
-// HIP-COFF: [[META0:![0-9]+]] = !{ptr @.offloading.entry_name}
-// HIP-COFF: [[META1:![0-9]+]] = !{ptr @.offloading.entry_name.1}
-// HIP-COFF: [[META2:![0-9]+]] = !{ptr @.offloading.entry_name.2}
-// HIP-COFF: [[META3:![0-9]+]] = !{ptr @.offloading.entry_name.3}
-// HIP-COFF: [[META4:![0-9]+]] = !{ptr @.offloading.entry_name.4}
-//.
diff --git a/clang/test/Driver/linker-wrapper-image.c 
b/clang/test/Driver/linker-wrapper-image.c
index f553f20f7ee897c..7f96f629e912728 100644
--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -87,7 +87,7 @@
 // CUDA-NEXT:   br i1 %1, label %while.entry, label %while.end
 
 //      CUDA: while.entry:
-// CUDA-NEXT:   %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry 
], [ %12, %if.end ]
+// CUDA-NEXT:   %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry 
], [ %13, %if.end ]
 // CUDA-NEXT:   %2 = getelementptr inbounds %struct.__tgt_offload_entry, ptr 
%entry1, i64 0, i32 0
 // CUDA-NEXT:   %addr = load ptr, ptr %2, align 8
 // CUDA-NEXT:   %3 = getelementptr inbounds %struct.__tgt_offload_entry, ptr 
%entry1, i64 0, i32 1
@@ -125,7 +125,11 @@
 // CUDA-NEXT:   br label %if.end
 
 //      CUDA: sw.managed:
-// CUDA-NEXT:   br label %if.end
+// CUDA-NEXT:  %managed.addr = load ptr, ptr %addr, align 8
+// CUDA-NEXT:  %12 = getelementptr inbounds ptr, ptr %addr, i64 1
+// CUDA-NEXT:  %managed.addr2 = load ptr, ptr %12, align 8
+// CUDA-NEXT:  call void @__cudaRegisterManagedVar(ptr %0, ptr %managed.addr, 
ptr %managed.addr2, ptr %name, i64 %size, i32 %textype)
+// CUDA-NEXT:  br label %if.end
 
 //      CUDA: sw.surface:
 // CUDA-NEXT:   br label %if.end
@@ -134,9 +138,9 @@
 // CUDA-NEXT:   br label %if.end
 
 //      CUDA: if.end:
-// CUDA-NEXT:   %12 = getelementptr inbounds %struct.__tgt_offload_entry, ptr 
%entry1, i64 1
-// CUDA-NEXT:   %13 = icmp eq ptr %12, @__stop_cuda_offloading_entries
-// CUDA-NEXT:   br i1 %13, label %while.end, label %while.entry
+// CUDA-NEXT:   %13 = getelementptr inbounds %struct.__tgt_offload_entry, ptr 
%entry1, i64 1
+// CUDA-NEXT:   %14 = icmp eq ptr %13, @__stop_cuda_offloading_entries
+// CUDA-NEXT:   br i1 %14, label %while.end, label %while.entry
 
 //      CUDA: while.end:
 // CUDA-NEXT:   ret void
@@ -187,7 +191,7 @@
 // HIP-NEXT:   br i1 %1, label %while.entry, label %while.end
 
 //      HIP: while.entry:
-// HIP-NEXT:   %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], 
[ %12, %if.end ]
+// HIP-NEXT:   %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], 
[ %13, %if.end ]
 // HIP-NEXT:   %2 = getelementptr inbounds %struct.__tgt_offload_entry, ptr 
%entry1, i64 0, i32 0
 // HIP-NEXT:   %addr = load ptr, ptr %2, align 8
 // HIP-NEXT:   %3 = getelementptr inbounds %struct.__tgt_offload_entry, ptr 
%entry1, i64 0, i32 1
@@ -225,7 +229,11 @@
 // HIP-NEXT:   br label %if.end
 
 //      HIP: sw.managed:
-// HIP-NEXT:   br label %if.end
+// HIP-NEXT:  %managed.addr = load ptr, ptr %addr, align 8
+// HIP-NEXT:  %12 = getelementptr inbounds ptr, ptr %addr, i64 1
+// HIP-NEXT:  %managed.addr2 = load ptr, ptr %12, align 8
+// HIP-NEXT:  call void @__hipRegisterManagedVar(ptr %0, ptr %managed.addr, 
ptr %managed.addr2, ptr %name, i64 %size, i32 %textype)
+// HIP-NEXT:  br label %if.end
 
 //      HIP: sw.surface:
 // HIP-NEXT:   call void @__hipRegisterSurface(ptr %0, ptr %addr, ptr %name, 
ptr %name, i32 %textype, i32 %extern)
@@ -236,9 +244,9 @@
 // HIP-NEXT:   br label %if.end
 
 //      HIP: if.end:
-// HIP-NEXT:   %12 = getelementptr inbounds %struct.__tgt_offload_entry, ptr 
%entry1, i64 1
-// HIP-NEXT:   %13 = icmp eq ptr %12, @__stop_hip_offloading_entries
-// HIP-NEXT:   br i1 %13, label %while.end, label %while.entry
+// HIP-NEXT:   %13 = getelementptr inbounds %struct.__tgt_offload_entry, ptr 
%entry1, i64 1
+// HIP-NEXT:   %14 = icmp eq ptr %13, @__stop_hip_offloading_entries
+// HIP-NEXT:   br i1 %14, label %while.end, label %while.entry
 
 //      HIP: while.end:
 // HIP-NEXT:   ret void
diff --git a/llvm/include/llvm/Frontend/Offloading/Utility.h 
b/llvm/include/llvm/Frontend/Offloading/Utility.h
index f0bde5d81ef6ded..ddcf0a946d635de 100644
--- a/llvm/include/llvm/Frontend/Offloading/Utility.h
+++ b/llvm/include/llvm/Frontend/Offloading/Utility.h
@@ -55,6 +55,10 @@ enum OffloadEntryKindFlag : uint32_t {
 /// globals that will be registered with the offloading runtime.
 StructType *getEntryTy(Module &M);
 
+/// Returns the struct type we store the two pointers for CUDA / HIP managed
+/// variables in. Necessary until we widen the offload entry struct.
+StructType *getManagedTy(Module &M);
+
 /// Create an offloading section struct used to register this global at
 /// runtime.
 ///
diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp 
b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
index d616b4058b7bb09..d3cb5346f4ba5d1 100644
--- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -353,6 +353,16 @@ Function *createRegisterGlobalsFunction(Module &M, bool 
IsHIP,
   FunctionCallee RegVar = M.getOrInsertFunction(
       IsHIP ? "__hipRegisterVar" : "__cudaRegisterVar", RegVarTy);
 
+  // Get the __cudaRegisterSurface function declaration.
+  FunctionType *RegManagedVarTy =
+      FunctionType::get(Type::getVoidTy(C),
+                        {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy,
+                         getSizeTTy(M), Type::getInt32Ty(C)},
+                        /*isVarArg=*/false);
+  FunctionCallee RegManagedVar = M.getOrInsertFunction(
+      IsHIP ? "__hipRegisterManagedVar" : "__cudaRegisterManagedVar",
+      RegManagedVarTy);
+
   // Get the __cudaRegisterSurface function declaration.
   FunctionType *RegSurfaceTy =
       FunctionType::get(Type::getVoidTy(C),
@@ -466,6 +476,12 @@ Function *createRegisterGlobalsFunction(Module &M, bool 
IsHIP,
 
   // Create managed variable registration code.
   Builder.SetInsertPoint(SwManagedBB);
+  auto *ManagedVar = Builder.CreateLoad(Int8PtrTy, Addr, "managed.addr");
+  auto *ManagedAddr = Builder.CreateInBoundsGEP(
+      Int8PtrTy, Addr, {ConstantInt::get(Builder.getInt64Ty(), 1)});
+  auto *Managed = Builder.CreateLoad(Int8PtrTy, ManagedAddr, "managed.addr");
+  Builder.CreateCall(RegManagedVar, {RegGlobalsFn->arg_begin(), ManagedVar,
+                                     Managed, Name, Size, Data});
   Builder.CreateBr(IfEndBB);
   
Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalManagedEntry),
                   SwManagedBB);
diff --git a/llvm/lib/Frontend/Offloading/Utility.cpp 
b/llvm/lib/Frontend/Offloading/Utility.cpp
index 9e85ffbfe22d70b..26dab0b22fa12b5 100644
--- a/llvm/lib/Frontend/Offloading/Utility.cpp
+++ b/llvm/lib/Frontend/Offloading/Utility.cpp
@@ -33,6 +33,16 @@ StructType *offloading::getEntryTy(Module &M) {
   return EntryTy;
 }
 
+StructType *offloading::getManagedTy(Module &M) {
+  LLVMContext &C = M.getContext();
+  StructType *StructTy = StructType::getTypeByName(C, "struct.__managed_var");
+  if (!StructTy)
+    StructTy = llvm::StructType::create("struct.__managed_var",
+                                        PointerType::getUnqual(M.getContext()),
+                                        
PointerType::getUnqual(M.getContext()));
+  return StructTy;
+}
+
 // TODO: Rework this interface to be more generic.
 std::pair<Constant *, GlobalVariable *>
 offloading::getOffloadingEntryInitializer(Module &M, Constant *Addr,

>From 98229f419b0cefa221de3ff4636f728f5723360a Mon Sep 17 00:00:00 2001
From: Joseph Huber <hube...@outlook.com>
Date: Sat, 18 Jan 2025 08:10:54 -0600
Subject: [PATCH 2/3] Comments

---
 clang/lib/CodeGen/CGCUDANV.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 0fc81491c40855d..48c9f78b53a8960 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1221,6 +1221,7 @@ void CGNVCUDARuntime::createOffloadingEntries() {
              ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
              : 0);
     if (I.Flags.getKind() == DeviceVarFlags::Variable) {
+      // TODO: Update the offloading entries struct to avoid this indirection.
       if (I.Flags.isManaged()) {
         assert(I.Var->getName().ends_with(".managed") &&
                "HIP managed variables not transformed");
@@ -1234,7 +1235,9 @@ void CGNVCUDARuntime::createOffloadingEntries() {
         auto *Struct = new llvm::GlobalVariable(
             M, llvm::offloading::getManagedTy(M),
             /*IsConstant=*/true, llvm::GlobalValue::PrivateLinkage, 
Initializer,
-            I.Var->getName());
+            I.Var->getName(), /*InsertBefore=*/nullptr,
+            llvm::GlobalVariable::NotThreadLocal,
+            CGM.getContext().getTargetAddressSpace(LangAS::Default));
 
         llvm::offloading::emitOffloadingEntry(
             M, Struct, getDeviceSideName(I.D), VarSize,

>From 753a4d8dc23d9241fcd7c8544a111df19295f693 Mon Sep 17 00:00:00 2001
From: Joseph Huber <hube...@outlook.com>
Date: Wed, 22 Jan 2025 07:54:47 -0600
Subject: [PATCH 3/3] Addrspace

---
 clang/lib/CodeGen/CGCUDANV.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 48c9f78b53a8960..23a40b8f7c32a39 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1237,7 +1237,7 @@ void CGNVCUDARuntime::createOffloadingEntries() {
             /*IsConstant=*/true, llvm::GlobalValue::PrivateLinkage, 
Initializer,
             I.Var->getName(), /*InsertBefore=*/nullptr,
             llvm::GlobalVariable::NotThreadLocal,
-            CGM.getContext().getTargetAddressSpace(LangAS::Default));
+            M.getDataLayout().getDefaultGlobalsAddressSpace());
 
         llvm::offloading::emitOffloadingEntry(
             M, Struct, getDeviceSideName(I.D), VarSize,

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to