jhuber6 created this revision.
jhuber6 added reviewers: tra, jdoerfert, tianshilei1992, JonChesterfield, 
yaxunl, rnk.
Herald added a subscriber: hiraditya.
Herald added a project: All.
jhuber6 requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, sstefan1.
Herald added projects: clang, LLVM.

The new driver registers all the offloading entries by first storing a
structure containing the necessary information into a special section
and then iterating the section at runtime. This is done in ELF targets
using the linker defined `__start` and `__stop` sections. However for
COFF targets these are not provided. This is instead done by generating
sections as described here 
<https://learn.microsoft.com/en-us/cpp/preprocessor/init-seg?redirectedfrom=MSDN&view=msvc-170>.

This patch adds the initial support required to offloadon COFF targets
by implementing this for the new driver. We use the `.<kind>$Ox` section
for COFF now.

NOTE: I have not tested the runtime functionality of patch as I do not
have a Windows machine set up yet.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D137470

Files:
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/test/CodeGenCUDA/offloading-entries.cu
  clang/test/Driver/linker-wrapper-image.c
  clang/test/OpenMP/declare_target_link_codegen.cpp
  clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
  llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
===================================================================
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4697,7 +4697,10 @@
                                          int32_t Flags,
                                          GlobalValue::LinkageTypes) {
   if (!IsTargetCodegen) {
-    emitOffloadingEntry(ID, Addr->getName(), Size, Flags);
+    llvm::Triple Triple(M.getTargetTriple());
+    emitOffloadingEntry(ID, Addr->getName(), Size, Flags,
+                        Triple.isOSBinFormatCOFF() ? ".omp$OE"
+                                                   : "omp_offloading_entries");
     return;
   }
   // TODO: Add support for global variables on the device after declare target
Index: llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -867,8 +867,7 @@
   /// \param Flags Flags associated with the entry.
   /// \param SectionName The section this entry will be placed at.
   void emitOffloadingEntry(Constant *Addr, StringRef Name, uint64_t Size,
-                           int32_t Flags,
-                           StringRef SectionName = "omp_offloading_entries");
+                           int32_t Flags, StringRef SectionName);
 
   /// Generate control flow and cleanup for cancellation.
   ///
Index: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
===================================================================
--- clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -110,6 +110,64 @@
   return PointerType::getUnqual(getBinDescTy(M));
 }
 
+std::pair<Constant *, Constant *> getELFEntriesArray(Module &M,
+                                                     StringRef Kind) {
+  auto *EntriesB = new GlobalVariable(
+      M, ArrayType::get(getEntryTy(M), 0), /*isConstant*/ true,
+      GlobalValue::ExternalLinkage,
+      /*Initializer*/ nullptr, "__start_" + Kind + "_offloading_entries");
+  EntriesB->setVisibility(GlobalValue::HiddenVisibility);
+  auto *EntriesE = new GlobalVariable(
+      M, ArrayType::get(getEntryTy(M), 0), /*isConstant*/ true,
+      GlobalValue::ExternalLinkage,
+      /*Initializer*/ nullptr, "__stop_" + Kind + "_offloading_entries");
+  EntriesE->setVisibility(GlobalValue::HiddenVisibility);
+
+  // We assume that external begin/end symbols that we have created above will
+  // be defined by the linker. But linker will do that only if linker inputs
+  // have section with "omp_offloading_entries" name which is not guaranteed.
+  // So, we just create dummy zero sized object in the offload entries section
+  // to force linker to define those symbols.
+  auto *DummyInit =
+      ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+  auto *DummyEntry = new GlobalVariable(
+      M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
+      "__dummy." + Kind + "_offloading.entry");
+  DummyEntry->setSection((Kind + "_offloading_entries").str());
+  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+
+  return std::make_pair(EntriesB, EntriesE);
+}
+
+std::pair<Constant *, Constant *> getCOFFEntriesArray(Module &M,
+                                                      StringRef Kind) {
+  // For COFF targets, sections with 8 or fewer characters containing a '$' will
+  // be merged into the same section at runtime. The order is determined by the
+  // alphebetical ordering of the text after the '$' character. Here we generate
+  // two dummy variables that will be placed at the start and end of that
+  // section respectively that can be used to iterate the section at runtime.
+  auto *EntriesInit =
+      ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
+  auto *EntriesB =
+      new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0), true,
+                         GlobalVariable::ExternalLinkage, EntriesInit,
+                         "__start." + Kind + "_offloading.entry");
+  EntriesB->setSection(("." + Kind + "$OA").str());
+  EntriesB->setVisibility(GlobalValue::HiddenVisibility);
+  auto *EntriesE =
+      new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0), true,
+                         GlobalVariable::ExternalLinkage, EntriesInit,
+                         "__stop." + Kind + "_offloading.entry");
+  EntriesE->setSection(("." + Kind + "$OZ").str());
+  EntriesE->setVisibility(GlobalValue::HiddenVisibility);
+
+  Constant *ZeroOne[] = {ConstantInt::get(getSizeTTy(M), 0u),
+                         ConstantInt::get(getSizeTTy(M), 1u)};
+  return std::make_pair(ConstantExpr::getGetElementPtr(EntriesB->getValueType(),
+                                                       EntriesB, ZeroOne),
+                        EntriesE);
+}
+
 /// Creates binary descriptor for the given device images. Binary descriptor
 /// is an object that is passed to the offloading runtime at program startup
 /// and it describes all device images available in the executable or shared
@@ -150,28 +208,13 @@
 /// Global variable that represents BinDesc is returned.
 GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs) {
   LLVMContext &C = M.getContext();
-  // Create external begin/end symbols for the offload entries table.
-  auto *EntriesB = new GlobalVariable(
-      M, getEntryTy(M), /*isConstant*/ true, GlobalValue::ExternalLinkage,
-      /*Initializer*/ nullptr, "__start_omp_offloading_entries");
-  EntriesB->setVisibility(GlobalValue::HiddenVisibility);
-  auto *EntriesE = new GlobalVariable(
-      M, getEntryTy(M), /*isConstant*/ true, GlobalValue::ExternalLinkage,
-      /*Initializer*/ nullptr, "__stop_omp_offloading_entries");
-  EntriesE->setVisibility(GlobalValue::HiddenVisibility);
+  llvm::Triple Triple(M.getTargetTriple());
 
-  // We assume that external begin/end symbols that we have created above will
-  // be defined by the linker. But linker will do that only if linker inputs
-  // have section with "omp_offloading_entries" name which is not guaranteed.
-  // So, we just create dummy zero sized object in the offload entries section
-  // to force linker to define those symbols.
-  auto *DummyInit =
-      ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
-  auto *DummyEntry = new GlobalVariable(
-      M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
-      "__dummy.omp_offloading.entry");
-  DummyEntry->setSection("omp_offloading_entries");
-  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
+  Constant *EntriesB, *EntriesE;
+  if (Triple.isOSBinFormatCOFF())
+    std::tie(EntriesB, EntriesE) = getCOFFEntriesArray(M, "omp");
+  else
+    std::tie(EntriesB, EntriesE) = getELFEntriesArray(M, "omp");
 
   auto *Zero = ConstantInt::get(getSizeTTy(M), 0u);
   Constant *ZeroZero[] = {Zero, Zero};
@@ -327,18 +370,6 @@
   FatbinDesc->setSection(FatbinWrapperSection);
   FatbinDesc->setAlignment(Align(8));
 
-  // We create a dummy entry to ensure the linker will define the begin / end
-  // symbols. The CUDA runtime should ignore the null address if we attempt to
-  // register it.
-  auto *DummyInit =
-      ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
-  auto *DummyEntry = new GlobalVariable(
-      M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
-      IsHIP ? "__dummy.hip_offloading.entry" : "__dummy.cuda_offloading.entry");
-  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
-  DummyEntry->setSection(IsHIP ? "hip_offloading_entries"
-                               : "cuda_offloading_entries");
-
   return FatbinDesc;
 }
 
@@ -367,6 +398,7 @@
 /// }
 Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) {
   LLVMContext &C = M.getContext();
+  llvm::Triple Triple(M.getTargetTriple());
   // Get the __cudaRegisterFunction function declaration.
   auto *RegFuncTy = FunctionType::get(
       Type::getInt32Ty(C),
@@ -388,21 +420,13 @@
   FunctionCallee RegVar = M.getOrInsertFunction(
       IsHIP ? "__hipRegisterVar" : "__cudaRegisterVar", RegVarTy);
 
-  // Create the references to the start / stop symbols defined by the linker.
-  auto *EntriesB =
-      new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0),
-                         /*isConstant*/ true, GlobalValue::ExternalLinkage,
-                         /*Initializer*/ nullptr,
-                         IsHIP ? "__start_hip_offloading_entries"
-                               : "__start_cuda_offloading_entries");
-  EntriesB->setVisibility(GlobalValue::HiddenVisibility);
-  auto *EntriesE =
-      new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0),
-                         /*isConstant*/ true, GlobalValue::ExternalLinkage,
-                         /*Initializer*/ nullptr,
-                         IsHIP ? "__stop_hip_offloading_entries"
-                               : "__stop_cuda_offloading_entries");
-  EntriesE->setVisibility(GlobalValue::HiddenVisibility);
+  Constant *EntriesB, *EntriesE;
+  if (Triple.isOSBinFormatCOFF())
+    std::tie(EntriesB, EntriesE) =
+        getCOFFEntriesArray(M, IsHIP ? "hip" : "cuda");
+  else
+    std::tie(EntriesB, EntriesE) =
+        getELFEntriesArray(M, IsHIP ? "hip" : "cuda");
 
   auto *RegGlobalsTy = FunctionType::get(Type::getVoidTy(C),
                                          Type::getInt8PtrTy(C)->getPointerTo(),
Index: clang/test/OpenMP/declare_target_link_codegen.cpp
===================================================================
--- clang/test/OpenMP/declare_target_link_codegen.cpp
+++ clang/test/OpenMP/declare_target_link_codegen.cpp
@@ -10,6 +10,8 @@
 // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -emit-pch -o %t
 // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify -o - | FileCheck %s --check-prefix SIMD-ONLY
 
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple x86_64-win32-gnu -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix HOST-COFF --check-prefix CHECK
+
 // expected-no-diagnostics
 
 // SIMD-ONLY-NOT: {{__kmpc|__tgt}}
@@ -27,6 +29,7 @@
 // HOST: [[MAPTYPES:@.+]] = private unnamed_addr constant [3 x i64] [i64 35, i64 531, i64 531]
 // HOST: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [{{[0-9]+}} x i8] c"c_decl_tgt_ref_ptr\00"
 // HOST: @.omp_offloading.entry.c_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { i8* bitcast (i32** @c_decl_tgt_ref_ptr to i8*), i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 8, i32 1, i32 0 }, section "omp_offloading_entries", align 1
+// HOST-COFF: @.omp_offloading.entry.c_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { i8* bitcast (i32** @c_decl_tgt_ref_ptr to i8*), i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 8, i32 1, i32 0 }, section ".omp$OE", align 1
 // DEVICE-NOT: internal unnamed_addr constant [{{[0-9]+}} x i8] c"c_{{.*}}_decl_tgt_ref_ptr\00"
 // HOST: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [{{[0-9]+}} x i8] c"_{{.*}}d_{{.*}}_decl_tgt_ref_ptr\00"
 // HOST: @.omp_offloading.entry.[[D_PTR]] = weak constant %struct.__tgt_offload_entry { i8* bitcast (i32** @[[D_PTR]] to i8*), i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* @.omp_offloading.entry_name{{.*}}, i32 0, i32 0
@@ -50,7 +53,7 @@
   return 0;
 }
 
-// DEVICE: define weak_odr protected void @__omp_offloading_{{.*}}_{{.*}}maini1{{.*}}_l42(i32* noundef nonnull align {{[0-9]+}} dereferenceable{{[^,]*}}
+// DEVICE: define weak_odr protected void @__omp_offloading_{{.*}}_{{.*}}maini1{{.*}}_l45(i32* noundef nonnull align {{[0-9]+}} dereferenceable{{[^,]*}}
 // DEVICE: [[C_REF:%.+]] = load i32*, i32** @c_decl_tgt_ref_ptr,
 // DEVICE: [[C:%.+]] = load i32, i32* [[C_REF]],
 // DEVICE: store i32 [[C]], i32* %
@@ -78,10 +81,10 @@
 // HOST: [[BP0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASEPTRS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // HOST: [[P0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // HOST: call i32 @__tgt_target_kernel(%struct.ident_t* @{{.+}}, i64 -1, i32 -1, i32 0, i8* @.{{.+}}.region_id, %struct.__tgt_kernel_arguments* %{{.+}})
-// HOST: call void @__omp_offloading_{{.*}}_{{.*}}_{{.*}}maini1{{.*}}_l42(i32* %{{[^,]+}})
+// HOST: call void @__omp_offloading_{{.*}}_{{.*}}_{{.*}}maini1{{.*}}_l45(i32* %{{[^,]+}})
 // HOST: call i32 @__tgt_target_kernel(%struct.ident_t* @{{.+}}, i64 -1, i32 0, i32 0, i8* @.{{.+}}.region_id, %struct.__tgt_kernel_arguments* %{{.+}})
 
-// HOST: define internal void @__omp_offloading_{{.*}}_{{.*}}maini1{{.*}}_l42(i32* noundef nonnull align {{[0-9]+}} dereferenceable{{.*}})
+// HOST: define internal void @__omp_offloading_{{.*}}_{{.*}}maini1{{.*}}_l45(i32* noundef nonnull align {{[0-9]+}} dereferenceable{{.*}})
 // HOST: [[C:%.*]] = load i32, i32* @c,
 // HOST: store i32 [[C]], i32* %
 
Index: clang/test/Driver/linker-wrapper-image.c
===================================================================
--- clang/test/Driver/linker-wrapper-image.c
+++ clang/test/Driver/linker-wrapper-image.c
@@ -6,16 +6,26 @@
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
 // RUN:   -fembed-offload-object=%t.out
 // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \
-// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=OPENMP
-
-//      OPENMP: @__start_omp_offloading_entries = external hidden constant %__tgt_offload_entry
-// OPENMP-NEXT: @__stop_omp_offloading_entries = external hidden constant %__tgt_offload_entry
-// OPENMP-NEXT: @__dummy.omp_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "omp_offloading_entries"
-// OPENMP-NEXT: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}"
-// OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr @.omp_offloading.device_image, ptr getelementptr inbounds ([[[SIZE]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
-// OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
-// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
-// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=OPENMP,OPENMP-ELF
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=OPENMP,OPENMP-COFF
+
+//      OPENMP-ELF: @__start_omp_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// OPENMP-ELF-NEXT: @__stop_omp_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// OPENMP-ELF-NEXT: @__dummy.omp_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "omp_offloading_entries"
+// OPENMP-ELF-NEXT: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}"
+// OPENMP-ELF-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr @.omp_offloading.device_image, ptr getelementptr inbounds ([[[SIZE]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
+// OPENMP-ELF-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
+// OPENMP-ELF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
+// OPENMP-ELF-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
+
+//      OPENMP-COFF: @__start.omp_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".omp$OA"
+// OPENMP-COFF-NEXT: @__stop.omp_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".omp$OZ"
+// OPENMP-COFF-NEXT: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}"
+// OPENMP-COFF-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr @.omp_offloading.device_image, ptr getelementptr inbounds ([[[SIZE]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([0 x %__tgt_offload_entry], ptr @__start.omp_offloading.entry, i64 0, i64 1), ptr @__stop.omp_offloading.entry }]
+// OPENMP-COFF-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr getelementptr inbounds ([0 x %__tgt_offload_entry], ptr @__start.omp_offloading.entry, i64 0, i64 1), ptr @__stop.omp_offloading.entry }
+// OPENMP-COFF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
+// OPENMP-COFF-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
 
 //      OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" {
 // OPENMP-NEXT: entry:
@@ -33,15 +43,24 @@
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
 // RUN:   -fembed-offload-object=%t.out
 // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \
-// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA
-
-//      CUDA: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin"
-// CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
-// CUDA-NEXT: @__dummy.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries"
-// CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
-// CUDA-NEXT: @__start_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
-// CUDA-NEXT: @__stop_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
-// CUDA-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-ELF
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-COFF
+
+//      CUDA-ELF: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin"
+// CUDA-ELF-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
+// CUDA-ELF-NEXT: @.cuda.binary_handle = internal global ptr null
+// CUDA-ELF-NEXT: @__start_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// CUDA-ELF-NEXT: @__stop_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// CUDA-ELF-NEXT: @__dummy.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries"
+// CUDA-ELF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
+
+//      CUDA-COFF: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin"
+// CUDA-COFF-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
+// CUDA-COFF-NEXT: @.cuda.binary_handle = internal global ptr null
+// CUDA-COFF-NEXT: @__start.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".cuda$OA"
+// CUDA-COFF-NEXT: @__stop.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".cuda$OZ"
+// CUDA-COFF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
 
 //      CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" {
 // CUDA-NEXT: entry:
@@ -62,10 +81,10 @@
 
 //      CUDA: define internal void @.cuda.globals_reg(ptr %0) section ".text.startup" {
 // CUDA-NEXT: entry:
-// CUDA-NEXT:   br i1 icmp ne (ptr @__start_cuda_offloading_entries, ptr @__stop_cuda_offloading_entries), label %while.entry, label %while.end
+// CUDA-NEXT:   br i1 icmp ne (ptr [[START_ENTRIES:.+]], ptr [[STOP_ENTRIES:.+]]), label %while.entry, label %while.end
 
 //      CUDA: while.entry:
-// CUDA-NEXT:  %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry ], [ %7, %if.end ]
+// CUDA-NEXT:  %entry1 = phi ptr [ [[START_ENTRIES]], %entry ], [ %7, %if.end ]
 // CUDA-NEXT:  %1 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 0
 // CUDA-NEXT:  %addr = load ptr, ptr %1, align 8
 // CUDA-NEXT:  %2 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 1
@@ -104,7 +123,7 @@
 
 //      CUDA: if.end:
 // CUDA-NEXT:   %7 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 1
-// CUDA-NEXT:   %8 = icmp eq ptr %7, @__stop_cuda_offloading_entries
+// CUDA-NEXT:   %8 = icmp eq ptr %7, [[STOP_ENTRIES]]
 // CUDA-NEXT:   br i1 %8, label %while.end, label %while.entry
 
 //      CUDA: while.end:
@@ -115,15 +134,24 @@
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
 // RUN:   -fembed-offload-object=%t.out
 // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \
-// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP
-
-//      HIP: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin"
-// HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
-// HIP-NEXT: @__dummy.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "hip_offloading_entries"
-// HIP-NEXT: @.hip.binary_handle = internal global ptr null
-// HIP-NEXT: @__start_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
-// HIP-NEXT: @__stop_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
-// HIP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-ELF
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-COFF
+
+//      HIP-ELF: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin"
+// HIP-ELF-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
+// HIP-ELF-NEXT: @.hip.binary_handle = internal global ptr null
+// HIP-ELF-NEXT: @__start_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// HIP-ELF-NEXT: @__stop_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
+// HIP-ELF-NEXT: @__dummy.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "hip_offloading_entries"
+// HIP-ELF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
+
+//      HIP-COFF: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin"
+// HIP-COFF-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
+// HIP-COFF-NEXT: @.hip.binary_handle = internal global ptr null
+// HIP-COFF-NEXT: @__start.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".hip$OA"
+// HIP-COFF-NEXT: @__stop.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section ".hip$OZ"
+// HIP-COFF-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
 
 //      HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
 // HIP-NEXT: entry:
@@ -143,10 +171,10 @@
 
 //      HIP: define internal void @.hip.globals_reg(ptr %0) section ".text.startup" {
 // HIP-NEXT: entry:
-// HIP-NEXT:   br i1 icmp ne (ptr @__start_hip_offloading_entries, ptr @__stop_hip_offloading_entries), label %while.entry, label %while.end
+// HIP-NEXT:   br i1 icmp ne (ptr [[START_ENTRIES:.+]], ptr [[STOP_ENTRIES:.+]]), label %while.entry, label %while.end
 
 //      HIP: while.entry:
-// HIP-NEXT:   %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], [ %7, %if.end ]
+// HIP-NEXT:   %entry1 = phi ptr [ [[START_ENTRIES]], %entry ], [ %7, %if.end ]
 // HIP-NEXT:   %1 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 0
 // HIP-NEXT:   %addr = load ptr, ptr %1, align 8
 // HIP-NEXT:   %2 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 1
@@ -185,7 +213,7 @@
 
 //      HIP: if.end:
 // HIP-NEXT:   %7 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 1
-// HIP-NEXT:   %8 = icmp eq ptr %7, @__stop_hip_offloading_entries
+// HIP-NEXT:   %8 = icmp eq ptr %7, [[STOP_ENTRIES]]
 // HIP-NEXT:   br i1 %8, label %while.end, label %while.entry
 
 //      HIP: while.end:
Index: clang/test/CodeGenCUDA/offloading-entries.cu
===================================================================
--- clang/test/CodeGenCUDA/offloading-entries.cu
+++ clang/test/CodeGenCUDA/offloading-entries.cu
@@ -5,6 +5,12 @@
 // RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -fgpu-rdc \
 // RUN:   --offload-new-driver -emit-llvm -o - -x hip  %s | FileCheck \
 // RUN:   --check-prefix=HIP %s
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-win32-gnu -fgpu-rdc \
+// RUN:   --offload-new-driver -emit-llvm -o - -x cuda  %s | FileCheck \
+// RUN:   --check-prefix=CUDA-COFF %s
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-win32-gnu -fgpu-rdc \
+// RUN:   --offload-new-driver -emit-llvm -o - -x hip  %s | FileCheck \
+// RUN:   --check-prefix=HIP-COFF %s
 
 #include "Inputs/cuda.h"
 
@@ -23,6 +29,20 @@
 // HIP: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
 // HIP: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries", align 1
 //.
+// CUDA-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
+// CUDA-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section ".cuda$OE", align 1
+// CUDA-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
+// CUDA-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section ".cuda$OE", align 1
+// CUDA-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
+// CUDA-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section ".cuda$OE", align 1
+//.
+// HIP-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
+// HIP-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section ".hip$OE", align 1
+// HIP-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
+// HIP-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z3barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section ".hip$OE", align 1
+// HIP-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
+// HIP-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section ".hip$OE", align 1
+//.
 // CUDA-LABEL: @_Z18__device_stub__foov(
 // CUDA-NEXT:  entry:
 // CUDA-NEXT:    [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__foov)
@@ -37,6 +57,20 @@
 // HIP:       setup.end:
 // HIP-NEXT:    ret void
 //
+// CUDA-COFF-LABEL: @_Z18__device_stub__foov(
+// CUDA-COFF-NEXT:  entry:
+// CUDA-COFF-NEXT:    [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__foov)
+// CUDA-COFF-NEXT:    br label [[SETUP_END:%.*]]
+// CUDA-COFF:       setup.end:
+// CUDA-COFF-NEXT:    ret void
+//
+// HIP-COFF-LABEL: @_Z18__device_stub__foov(
+// HIP-COFF-NEXT:  entry:
+// HIP-COFF-NEXT:    [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3foov)
+// HIP-COFF-NEXT:    br label [[SETUP_END:%.*]]
+// HIP-COFF:       setup.end:
+// HIP-COFF-NEXT:    ret void
+//
 __global__ void foo() {}
 
 // CUDA-LABEL: @_Z18__device_stub__barv(
@@ -53,5 +87,19 @@
 // HIP:       setup.end:
 // HIP-NEXT:    ret void
 //
+// CUDA-COFF-LABEL: @_Z18__device_stub__barv(
+// CUDA-COFF-NEXT:  entry:
+// CUDA-COFF-NEXT:    [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv)
+// CUDA-COFF-NEXT:    br label [[SETUP_END:%.*]]
+// CUDA-COFF:       setup.end:
+// CUDA-COFF-NEXT:    ret void
+//
+// HIP-COFF-LABEL: @_Z18__device_stub__barv(
+// HIP-COFF-NEXT:  entry:
+// HIP-COFF-NEXT:    [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3barv)
+// HIP-COFF-NEXT:    br label [[SETUP_END:%.*]]
+// HIP-COFF:       setup.end:
+// HIP-COFF-NEXT:    ret void
+//
 __global__ void bar() {}
 __device__ int x = 1;
Index: clang/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- clang/lib/CodeGen/CGCUDANV.cpp
+++ clang/lib/CodeGen/CGCUDANV.cpp
@@ -1127,8 +1127,13 @@
   llvm::OpenMPIRBuilder OMPBuilder(CGM.getModule());
   OMPBuilder.initialize();
 
-  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
-                                            : "cuda_offloading_entries";
+  StringRef Section;
+  if (CGM.getTriple().isOSBinFormatCOFF())
+    Section = CGM.getLangOpts().HIP ? ".hip$OE" : ".cuda$OE";
+  else
+    Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
+                                    : "cuda_offloading_entries";
+
   for (KernelInfo &I : EmittedKernels)
     OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel],
                                    getDeviceSideName(cast<NamedDecl>(I.D)), 0,
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to