jhuber6 created this revision.
jhuber6 added reviewers: tra, jdoerfert, tianshilei1992, JonChesterfield,
yaxunl, rnk.
Herald added a subscriber: hiraditya.
Herald added a project: All.
jhuber6 requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, sstefan1.
Herald added projects: clang, LLVM.
The new driver registers all the offloading entries by first storing a
structure containing the necessary information into a special section
and then iterating the section at runtime. This is done in ELF targets
using the linker defined `__start` and `__stop` sections. However for
COFF targets these are not provided. This is instead done by generating
sections as described here
<https://learn.microsoft.com/en-us/cpp/preprocessor/init-seg?redirectedfrom=MSDN&view=msvc-170>.
This patch adds the initial support required to offloadon COFF targets
by implementing this for the new driver. We use the `.<kind>$Ox` section
for COFF now.
NOTE: I have not tested the runtime functionality of patch as I do not
have a Windows machine set up yet.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D137470
Files:
clang/lib/CodeGen/CGCUDANV.cpp
clang/test/CodeGenCUDA/offloading-entries.cu
clang/test/Driver/linker-wrapper-image.c
clang/test/OpenMP/declare_target_link_codegen.cpp
clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
===================================================================
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4697,7 +4697,10 @@
int32_t Flags,
GlobalValue::LinkageTypes) {
if (!IsTargetCodegen) {
- emitOffloadingEntry(ID, Addr->getName(), Size, Flags);
+ llvm::Triple Triple(M.getTargetTriple());
+ emitOffloadingEntry(ID, Addr->getName(), Size, Flags,
+ Triple.isOSBinFormatCOFF() ? ".omp$OE"
+ : "omp_offloading_entries");
return;
}
// TODO: Add support for global variables on the device after declare target
Index: llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -867,8 +867,7 @@
/// \param Flags Flags associated with the entry.
/// \param SectionName The section this entry will be placed at.
void emitOffloadingEntry(Constant *Addr, StringRef Name, uint64_t Size,
- int32_t Flags,
- StringRef SectionName = "omp_offloading_entries");
+ int32_t Flags, StringRef SectionName);
/// Generate control flow and cleanup for cancellation.
///
Index: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
===================================================================
--- clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -110,6 +110,64 @@
return PointerType::getUnqual(getBinDescTy(M));
}
+std::pair<Constant *, Constant *> getELFEntriesArray(Module &M,
+ StringRef Kind) {
+ auto *EntriesB = new GlobalVariable(
+ M, ArrayType::get(getEntryTy(M), 0), /*isConstant*/ true,
+ GlobalValue::ExternalLinkage,
+ /*Initializer*/ nullptr, "__start_" + Kind + "_offloading_entries");
+ EntriesB->setVisibility(GlobalValue::HiddenVisibility);
+ auto *EntriesE = new GlobalVariable(
+ M, ArrayType::get(getEntryTy(M), 0), /*isConstant*/ true,
+ GlobalValue::ExternalLinkage,
+ /*Initializer*/ nullptr, "__stop_" + Kind + "_offloading_entries");
+ EntriesE->setVisibility(GlobalValue::HiddenVisibility);
+
+ // We assume that external begin/end symbols that we have created above will
+ // be defined by the linker. But linker will do that only if linker inputs
+ // have section with "omp_offloading_entries" name which is not guaranteed.
+ // So, we just create dummy zero sized object in the offload entries section
+ // to force linker to define those symbols.
+ auto *DummyInit =
+ ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+ auto *DummyEntry = new GlobalVariable(
+ M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
+ "__dummy." + Kind + "_offloading.entry");
+ DummyEntry->setSection((Kind + "_offloading_entries").str());
+ DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+
+ return std::make_pair(EntriesB, EntriesE);
+}
+
+std::pair<Constant *, Constant *> getCOFFEntriesArray(Module &M,
+ StringRef Kind) {
+ // For COFF targets, sections with 8 or fewer characters containing a '$' will
+ // be merged into the same section at runtime. The order is determined by the
+ // alphebetical ordering of the text after the '$' character. Here we generate
+ // two dummy variables that will be placed at the start and end of that
+ // section respectively that can be used to iterate the section at runtime.
+ auto *EntriesInit =
+ ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+ auto *EntriesB =
+ new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0), true,
+ GlobalVariable::ExternalLinkage, EntriesInit,
+ "__start." + Kind + "_offloading.entry");
+ EntriesB->setSection(("." + Kind + "$OA").str());
+ EntriesB->setVisibility(GlobalValue::HiddenVisibility);
+ auto *EntriesE =
+ new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0), true,
+ GlobalVariable::ExternalLinkage, EntriesInit,
+ "__stop." + Kind + "_offloading.entry");
+ EntriesE->setSection(("." + Kind + "$OZ").str());
+ EntriesE->setVisibility(GlobalValue::HiddenVisibility);
+
+ Constant *ZeroOne[] = {ConstantInt::get(getSizeTTy(M), 0u),
+ ConstantInt::get(getSizeTTy(M), 1u)};
+ return std::make_pair(ConstantExpr::getGetElementPtr(EntriesB->getValueType(),
+ EntriesB, ZeroOne),
+ EntriesE);
+}
+
/// Creates binary descriptor for the given device images. Binary descriptor
/// is an object that is passed to the offloading runtime at program startup
/// and it describes all device images available in the executable or shared
@@ -150,28 +208,13 @@
/// Global variable that represents BinDesc is returned.
GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs) {
LLVMContext &C = M.getContext();
- // Create external begin/end symbols for the offload entries table.
- auto *EntriesB = new GlobalVariable(
- M, getEntryTy(M), /*isConstant*/ true, GlobalValue::ExternalLinkage,
- /*Initializer*/ nullptr, "__start_omp_offloading_entries");
- EntriesB->setVisibility(GlobalValue::HiddenVisibility);
- auto *EntriesE = new GlobalVariable(
- M, getEntryTy(M), /*isConstant*/ true, GlobalValue::ExternalLinkage,
- /*Initializer*/ nullptr, "__stop_omp_offloading_entries");
- EntriesE->setVisibility(GlobalValue::HiddenVisibility);
+ llvm::Triple Triple(M.getTargetTriple());
- // We assume that external begin/end symbols that we have created above will
- // be defined by the linker. But linker will do that only if linker inputs
- // have section with "omp_offloading_entries" name which is not guaranteed.
- // So, we just create dummy zero sized object in the offload entries section
- // to force linker to define those symbols.
- auto *DummyInit =
- ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
- auto *DummyEntry = new GlobalVariable(
- M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
- "__dummy.omp_offloading.entry");
- DummyEntry->setSection("omp_offloading_entries");
- DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+ Constant *EntriesB, *EntriesE;
+ if (Triple.isOSBinFormatCOFF())
+ std::tie(EntriesB, EntriesE) = getCOFFEntriesArray(M, "omp");
+ else
+ std::tie(EntriesB, EntriesE) = getELFEntriesArray(M, "omp");
auto *Zero = ConstantInt::get(getSizeTTy(M), 0u);
Constant *ZeroZero[] = {Zero, Zero};
@@ -327,18 +370,6 @@
FatbinDesc->setSection(FatbinWrapperSection);
FatbinDesc->setAlignment(Align(8));
- // We create a dummy entry to ensure the linker will define the begin / end
- // symbols. The CUDA runtime should ignore the null address if we attempt to
- // register it.
- auto *DummyInit =
- ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
- auto *DummyEntry = new GlobalVariable(
- M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
- IsHIP ? "__dummy.hip_offloading.entry" : "__dummy.cuda_offloading.entry");
- DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
- DummyEntry->setSection(IsHIP ? "hip_offloading_entries"
- : "cuda_offloading_entries");
-
return FatbinDesc;
}
@@ -367,6 +398,7 @@
/// }
Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) {
LLVMContext &C = M.getContext();
+ llvm::Triple Triple(M.getTargetTriple());
// Get the __cudaRegisterFunction function declaration.
auto *RegFuncTy = FunctionType::get(
Type::getInt32Ty(C),
@@ -388,21 +420,13 @@
FunctionCallee RegVar = M.getOrInsertFunction(
IsHIP ? "__hipRegisterVar" : "__cudaRegisterVar", RegVarTy);
- // Create the references to the start / stop symbols defined by the linker.
- auto *EntriesB =
- new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0),
- /*isConstant*/ true, GlobalValue::ExternalLinkage,
- /*Initializer*/ nullptr,
- IsHIP ? "__start_hip_offloading_entries"
- : "__start_cuda_offloading_entries");
- EntriesB->setVisibility(GlobalValue::HiddenVisibility);
- auto *EntriesE =
- new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0),
- /*isConstant*/ true, GlobalValue::ExternalLinkage,
- /*Initializer*/ nullptr,
- IsHIP ? "__stop_hip_offloading_entries"
- : "__stop_cuda_offloading_entries");
- EntriesE->setVisibility(GlobalValue::HiddenVisibility);
+ Constant *EntriesB, *EntriesE;
+ if (Triple.isOSBinFormatCOFF())
+ std::tie(EntriesB, EntriesE) =
+ getCOFFEntriesArray(M, IsHIP ? "hip" : "cuda");
+ else
+ std::tie(EntriesB, EntriesE) =
+ getELFEntriesArray(M, IsHIP ? "hip" : "cuda");
auto *RegGlobalsTy = FunctionType::get(Type::getVoidTy(C),
Type::getInt8PtrTy(C)->getPointerTo(),
Index: clang/test/OpenMP/declare_target_link_codegen.cpp
===================================================================
--- clang/test/OpenMP/declare_target_link_codegen.cpp
+++ clang/test/OpenMP/declare_target_link_codegen.cpp
@@ -10,6 +10,8 @@
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify -o - | FileCheck %s --check-prefix SIMD-ONLY
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple x86_64-win32-gnu -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix HOST-COFF --check-prefix CHECK
+
// expected-no-diagnostics
// SIMD-ONLY-NOT: {{__kmpc|__tgt}}
@@ -27,6 +29,7 @@
// HOST: [[MAPTYPES:@.+]] = private unnamed_addr constant [3 x i64] [i64 35, i64 531, i64 531]
// HOST: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [{{[0-9]+}} x i8] c"c_decl_tgt_ref_ptr\00"
// HOST: @.omp_offloading.entry.c_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { i8* bitcast (i32** @c_decl_tgt_ref_ptr to i8*), i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 8, i32 1, i32 0 }, section "omp_offloading_entries", align 1
+// HOST-COFF: @.omp_offloading.entry.c_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { i8* bitcast (i32** @c_decl_tgt_ref_ptr to i8*), i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 8, i32 1, i32 0 }, section ".omp$OE", align 1
// DEVICE-NOT: internal unnamed_addr constant [{{[0-9]+}} x i8] c"c_{{.*}}_decl_tgt_ref_ptr\00"
// HOST: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [{{[0-9]+}} x i8] c"_{{.*}}d_{{.*}}_decl_tgt_ref_ptr\00"
// HOST: @.omp_offloading.entry.[[D_PTR]] = weak constant %struct.__tgt_offload_entry { i8* bitcast (i32** @[[D_PTR]] to i8*), i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @.omp_offloading.entry_name{{.*}}, i32 0, i32 0
@@ -50,7 +53,7 @@
return 0;
}
-// DEVICE: define weak_odr protected void @__omp_offloading_{{.*}}_{{.*}}maini1{{.*}}_l42(i32* noundef nonnull align {{[0-9]+}} dereferenceable{{[^,]*}}
+// DEVICE: define weak_odr protected void @__omp_offloading_{{.*}}_{{.*}}maini1{{.*}}_l45(i32* noundef nonnull align {{[0-9]+}} dereferenceable{{[^,]*}}
// DEVICE: [[C_REF:%.+]] = load i32*, i32** @c_decl_tgt_ref_ptr,
// DEVICE: [[C:%.+]] = load i32, i32* [[C_REF]],
// DEVICE: store i32 [[C]], i32* %
@@ -78,10 +81,10 @@
// HOST: [[BP0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASEPTRS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// HOST: [[P0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// HOST: call i32 @__tgt_target_kernel(%struct.ident_t* @{{.+}}, i64 -1, i32 -1, i32 0, i8* @.{{.+}}.region_id, %struct.__tgt_kernel_arguments* %{{.+}})
-// HOST: call void @__omp_offloading_{{.*}}_{{.*}}_{{.*}}maini1{{.*}}_l42(i32* %{{[^,]+}})
+// HOST: call void @__omp_offloading_{{.*}}_{{.*}}_{{.*}}maini1{{.*}}_l45(i32* %{{[^,]+}})
// HOST: call i32 @__tgt_target_kernel(%struct.ident_t* @{{.+}}, i64 -1, i32 0, i32 0, i8* @.{{.+}}.region_id, %struct.__tgt_kernel_arguments* %{{.+}})
-// HOST: define internal void @__omp_offloading_{{.*}}_{{.*}}maini1{{.*}}_l42(i32* noundef nonnull align {{[0-9]+}} dereferenceable{{.*}})
+// HOST: define internal void @__omp_offloading_{{.*}}_{{.*}}maini1{{.*}}_l45(i32* noundef nonnull align {{[0-9]+}} dereferenceable{{.*}})
// HOST: [[C:%.*]] = load i32, i32* @c,
// HOST: store i32 [[C]], i32* %
Index: clang/test/Driver/linker-wrapper-image.c
===================================================================
--- clang/test/Driver/linker-wrapper-image.c
+++ clang/test/Driver/linker-wrapper-image.c
@@ -6,16 +6,26 @@
// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
// RUN: -fembed-offload-object=%t.out
// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \
-// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=OPENMP
-
-// OPENMP: @__start_omp_offloading_entries = external hidden constant %__tgt_offload_entry
-// OPENMP-NEXT: @__stop_omp_offloading_entries = external hidden constant %__tgt_offload_entry
-// OPENMP-NEXT: @__dummy.omp_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "omp_offloading_entries"
-// OPENMP-NEXT: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}"
-// OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr @.omp_offloading.device_image, ptr getelementptr inbounds ([[[SIZE]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
-// OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
-// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
-// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
+// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=OPENMP,OPENMP-ELF
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \
+// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=OPENMP,OPENMP-COFF
+
+// OPENMP-ELF: @__start_omp_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// OPENMP-ELF-NEXT: @__stop_omp_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// OPENMP-ELF-NEXT: @__dummy.omp_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "omp_offloading_entries"
+// OPENMP-ELF-NEXT: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}"
+// OPENMP-ELF-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr @.omp_offloading.device_image, ptr getelementptr inbounds ([[[SIZE]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
+// OPENMP-ELF-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
+// OPENMP-ELF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
+// OPENMP-ELF-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
+
+// OPENMP-COFF: @__start.omp_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".omp$OA"
+// OPENMP-COFF-NEXT: @__stop.omp_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".omp$OZ"
+// OPENMP-COFF-NEXT: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}"
+// OPENMP-COFF-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr @.omp_offloading.device_image, ptr getelementptr inbounds ([[[SIZE]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([0 x %__tgt_offload_entry], ptr @__start.omp_offloading.entry, i64 0, i64 1), ptr @__stop.omp_offloading.entry }]
+// OPENMP-COFF-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr getelementptr inbounds ([0 x %__tgt_offload_entry], ptr @__start.omp_offloading.entry, i64 0, i64 1), ptr @__stop.omp_offloading.entry }
+// OPENMP-COFF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
+// OPENMP-COFF-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
// OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" {
// OPENMP-NEXT: entry:
@@ -33,15 +43,24 @@
// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
// RUN: -fembed-offload-object=%t.out
// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \
-// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA
-
-// CUDA: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin"
-// CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
-// CUDA-NEXT: @__dummy.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries"
-// CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
-// CUDA-NEXT: @__start_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
-// CUDA-NEXT: @__stop_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
-// CUDA-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
+// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-ELF
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \
+// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-COFF
+
+// CUDA-ELF: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin"
+// CUDA-ELF-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
+// CUDA-ELF-NEXT: @.cuda.binary_handle = internal global ptr null
+// CUDA-ELF-NEXT: @__start_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// CUDA-ELF-NEXT: @__stop_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// CUDA-ELF-NEXT: @__dummy.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries"
+// CUDA-ELF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
+
+// CUDA-COFF: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin"
+// CUDA-COFF-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
+// CUDA-COFF-NEXT: @.cuda.binary_handle = internal global ptr null
+// CUDA-COFF-NEXT: @__start.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".cuda$OA"
+// CUDA-COFF-NEXT: @__stop.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".cuda$OZ"
+// CUDA-COFF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
// CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" {
// CUDA-NEXT: entry:
@@ -62,10 +81,10 @@
// CUDA: define internal void @.cuda.globals_reg(ptr %0) section ".text.startup" {
// CUDA-NEXT: entry:
-// CUDA-NEXT: br i1 icmp ne (ptr @__start_cuda_offloading_entries, ptr @__stop_cuda_offloading_entries), label %while.entry, label %while.end
+// CUDA-NEXT: br i1 icmp ne (ptr [[START_ENTRIES:.+]], ptr [[STOP_ENTRIES:.+]]), label %while.entry, label %while.end
// CUDA: while.entry:
-// CUDA-NEXT: %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry ], [ %7, %if.end ]
+// CUDA-NEXT: %entry1 = phi ptr [ [[START_ENTRIES]], %entry ], [ %7, %if.end ]
// CUDA-NEXT: %1 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 0
// CUDA-NEXT: %addr = load ptr, ptr %1, align 8
// CUDA-NEXT: %2 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 1
@@ -104,7 +123,7 @@
// CUDA: if.end:
// CUDA-NEXT: %7 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 1
-// CUDA-NEXT: %8 = icmp eq ptr %7, @__stop_cuda_offloading_entries
+// CUDA-NEXT: %8 = icmp eq ptr %7, [[STOP_ENTRIES]]
// CUDA-NEXT: br i1 %8, label %while.end, label %while.entry
// CUDA: while.end:
@@ -115,15 +134,24 @@
// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
// RUN: -fembed-offload-object=%t.out
// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \
-// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP
-
-// HIP: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin"
-// HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
-// HIP-NEXT: @__dummy.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "hip_offloading_entries"
-// HIP-NEXT: @.hip.binary_handle = internal global ptr null
-// HIP-NEXT: @__start_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
-// HIP-NEXT: @__stop_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
-// HIP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
+// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-ELF
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \
+// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-COFF
+
+// HIP-ELF: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin"
+// HIP-ELF-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
+// HIP-ELF-NEXT: @.hip.binary_handle = internal global ptr null
+// HIP-ELF-NEXT: @__start_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// HIP-ELF-NEXT: @__stop_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// HIP-ELF-NEXT: @__dummy.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "hip_offloading_entries"
+// HIP-ELF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
+
+// HIP-COFF: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin"
+// HIP-COFF-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
+// HIP-COFF-NEXT: @.hip.binary_handle = internal global ptr null
+// HIP-COFF-NEXT: @__start.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".hip$OA"
+// HIP-COFF-NEXT: @__stop.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".hip$OZ"
+// HIP-COFF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
// HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
// HIP-NEXT: entry:
@@ -143,10 +171,10 @@
// HIP: define internal void @.hip.globals_reg(ptr %0) section ".text.startup" {
// HIP-NEXT: entry:
-// HIP-NEXT: br i1 icmp ne (ptr @__start_hip_offloading_entries, ptr @__stop_hip_offloading_entries), label %while.entry, label %while.end
+// HIP-NEXT: br i1 icmp ne (ptr [[START_ENTRIES:.+]], ptr [[STOP_ENTRIES:.+]]), label %while.entry, label %while.end
// HIP: while.entry:
-// HIP-NEXT: %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], [ %7, %if.end ]
+// HIP-NEXT: %entry1 = phi ptr [ [[START_ENTRIES]], %entry ], [ %7, %if.end ]
// HIP-NEXT: %1 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 0
// HIP-NEXT: %addr = load ptr, ptr %1, align 8
// HIP-NEXT: %2 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 1
@@ -185,7 +213,7 @@
// HIP: if.end:
// HIP-NEXT: %7 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 1
-// HIP-NEXT: %8 = icmp eq ptr %7, @__stop_hip_offloading_entries
+// HIP-NEXT: %8 = icmp eq ptr %7, [[STOP_ENTRIES]]
// HIP-NEXT: br i1 %8, label %while.end, label %while.entry
// HIP: while.end:
Index: clang/test/CodeGenCUDA/offloading-entries.cu
===================================================================
--- clang/test/CodeGenCUDA/offloading-entries.cu
+++ clang/test/CodeGenCUDA/offloading-entries.cu
@@ -5,6 +5,12 @@
// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -fgpu-rdc \
// RUN: --offload-new-driver -emit-llvm -o - -x hip %s | FileCheck \
// RUN: --check-prefix=HIP %s
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-win32-gnu -fgpu-rdc \
+// RUN: --offload-new-driver -emit-llvm -o - -x cuda %s | FileCheck \
+// RUN: --check-prefix=CUDA-COFF %s
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-win32-gnu -fgpu-rdc \
+// RUN: --offload-new-driver -emit-llvm -o - -x hip %s | FileCheck \
+// RUN: --check-prefix=HIP-COFF %s
#include "Inputs/cuda.h"
@@ -23,6 +29,20 @@
// HIP: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
// HIP: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries", align 1
//.
+// CUDA-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
+// CUDA-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section ".cuda$OE", align 1
+// CUDA-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
+// CUDA-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section ".cuda$OE", align 1
+// CUDA-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
+// CUDA-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section ".cuda$OE", align 1
+//.
+// HIP-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
+// HIP-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section ".hip$OE", align 1
+// HIP-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
+// HIP-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z3barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section ".hip$OE", align 1
+// HIP-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
+// HIP-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section ".hip$OE", align 1
+//.
// CUDA-LABEL: @_Z18__device_stub__foov(
// CUDA-NEXT: entry:
// CUDA-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__foov)
@@ -37,6 +57,20 @@
// HIP: setup.end:
// HIP-NEXT: ret void
//
+// CUDA-COFF-LABEL: @_Z18__device_stub__foov(
+// CUDA-COFF-NEXT: entry:
+// CUDA-COFF-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__foov)
+// CUDA-COFF-NEXT: br label [[SETUP_END:%.*]]
+// CUDA-COFF: setup.end:
+// CUDA-COFF-NEXT: ret void
+//
+// HIP-COFF-LABEL: @_Z18__device_stub__foov(
+// HIP-COFF-NEXT: entry:
+// HIP-COFF-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3foov)
+// HIP-COFF-NEXT: br label [[SETUP_END:%.*]]
+// HIP-COFF: setup.end:
+// HIP-COFF-NEXT: ret void
+//
__global__ void foo() {}
// CUDA-LABEL: @_Z18__device_stub__barv(
@@ -53,5 +87,19 @@
// HIP: setup.end:
// HIP-NEXT: ret void
//
+// CUDA-COFF-LABEL: @_Z18__device_stub__barv(
+// CUDA-COFF-NEXT: entry:
+// CUDA-COFF-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv)
+// CUDA-COFF-NEXT: br label [[SETUP_END:%.*]]
+// CUDA-COFF: setup.end:
+// CUDA-COFF-NEXT: ret void
+//
+// HIP-COFF-LABEL: @_Z18__device_stub__barv(
+// HIP-COFF-NEXT: entry:
+// HIP-COFF-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3barv)
+// HIP-COFF-NEXT: br label [[SETUP_END:%.*]]
+// HIP-COFF: setup.end:
+// HIP-COFF-NEXT: ret void
+//
__global__ void bar() {}
__device__ int x = 1;
Index: clang/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- clang/lib/CodeGen/CGCUDANV.cpp
+++ clang/lib/CodeGen/CGCUDANV.cpp
@@ -1127,8 +1127,13 @@
llvm::OpenMPIRBuilder OMPBuilder(CGM.getModule());
OMPBuilder.initialize();
- StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
- : "cuda_offloading_entries";
+ StringRef Section;
+ if (CGM.getTriple().isOSBinFormatCOFF())
+ Section = CGM.getLangOpts().HIP ? ".hip$OE" : ".cuda$OE";
+ else
+ Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
+ : "cuda_offloading_entries";
+
for (KernelInfo &I : EmittedKernels)
OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel],
getDeviceSideName(cast<NamedDecl>(I.D)), 0,
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits