jhuber6 created this revision.
jhuber6 added reviewers: jdoerfert, gregrodgers, JonChesterfield, ronlieb.
Herald added subscribers: ormris, dexonsmith, dang, guansong, hiraditya, 
yaxunl, mgorny.
jhuber6 requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, sstefan1.
Herald added projects: clang, LLVM.

This patch introduces the `-fopenmp-new-driver` option which instructs
the compiler to use a new driver scheme for producing offloading code.
In this scheme we create a complete offloading object file and then pass
it as input to the host compilation phase. This will allow us to embed
the object code in the backend phase.

[OpenMP] Add a flag for embedding a file into the module

This patch adds support for a flag `-fembed-offload-binary` to embed a
file as an ELF section in the output by placing it in a global variable.
This can be used to bundle offloading files with the host binary so it
can be accessed by the linker. The section is named using the
`-fembed-offload-section` option.

[OpenMP] Embed device files into the host IR

This patch adds support for embedding the device object files into the
host IR to create a fat binary. Each offloading file will be inserted
into a section with the following naming format
`.llvm.offloading.<triple>.<arch>`.

[Clang] Introduce Clang Linker Wrapper Tool

This patch introduces a linker wrapper tool that allows us to preprocess
files before they are sent to the linker. This adds a dummy action and
job to the driver stage that builds the linker command as usual and then
replaces the command line with the wrapper tool.

[OpenMP] Add support for extracting device code in linker wrapper

This patch adds support for extracting device offloading code from the
linker's input files. If the file contains a section with the name
`.llvm.offloading.<triple>.<arch>` it will be extracted to a new
temporary file to be linked. Additionally, the host file containing it
will have the section stripped so it does not remain in the executable
once linked.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D116541

Files:
  clang/include/clang/Basic/CodeGenOptions.h
  clang/include/clang/CodeGen/BackendUtil.h
  clang/include/clang/Driver/Action.h
  clang/include/clang/Driver/Driver.h
  clang/include/clang/Driver/Job.h
  clang/include/clang/Driver/Options.td
  clang/include/clang/Driver/ToolChain.h
  clang/lib/CodeGen/BackendUtil.cpp
  clang/lib/CodeGen/CodeGenAction.cpp
  clang/lib/Driver/Action.cpp
  clang/lib/Driver/Driver.cpp
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Driver/ToolChains/Clang.h
  clang/test/Driver/openmp-offload-gpu.c
  clang/test/Frontend/embed-object.ll
  clang/tools/CMakeLists.txt
  clang/tools/clang-linker-wrapper/CMakeLists.txt
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  llvm/include/llvm/Bitcode/BitcodeWriter.h
  llvm/lib/Bitcode/Writer/BitcodeWriter.cpp

Index: llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
===================================================================
--- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -4971,3 +4971,42 @@
       llvm::ConstantArray::get(ATy, UsedArray), "llvm.compiler.used");
   NewUsed->setSection("llvm.metadata");
 }
+
+void llvm::EmbedObjectInModule(llvm::Module &M, llvm::MemoryBufferRef Buf,
+                               StringRef SectionName) {
+  // Save llvm.compiler.used and remove it.
+  SmallVector<Constant *, 2> UsedArray;
+  SmallVector<GlobalValue *, 4> UsedGlobals;
+  Type *UsedElementType = Type::getInt8Ty(M.getContext())->getPointerTo(0);
+  GlobalVariable *Used = collectUsedGlobalVariables(M, UsedGlobals, true);
+  for (auto *GV : UsedGlobals) {
+    if (!GV->getName().startswith("llvm.embedded.object"))
+      UsedArray.push_back(
+          ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, UsedElementType));
+  }
+  if (Used)
+    Used->eraseFromParent();
+
+  ArrayRef<uint8_t> ModuleData = ArrayRef<uint8_t>(
+      (const uint8_t *)Buf.getBufferStart(), Buf.getBufferSize());
+
+  // Embed the data in the module as a constant global in section SectionName.
+  llvm::Constant *ModuleConstant =
+      llvm::ConstantDataArray::get(M.getContext(), ModuleData);
+  llvm::GlobalVariable *GV = new llvm::GlobalVariable(
+      M, ModuleConstant->getType(), true, llvm::GlobalValue::PrivateLinkage,
+      ModuleConstant, "llvm.embedded.object");
+  GV->setSection(SectionName);
+  // Set alignment to 1 to prevent padding between two contributions from input
+  // sections after linking.
+  GV->setAlignment(Align(1));
+  UsedArray.push_back(
+      ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, UsedElementType));
+
+  // Recreate llvm.compiler.used.
+  ArrayType *ATy = ArrayType::get(UsedElementType, UsedArray.size());
+  auto *NewUsed = new GlobalVariable(
+      M, ATy, false, llvm::GlobalValue::AppendingLinkage,
+      llvm::ConstantArray::get(ATy, UsedArray), "llvm.compiler.used");
+  NewUsed->setSection("llvm.metadata");
+}
Index: llvm/include/llvm/Bitcode/BitcodeWriter.h
===================================================================
--- llvm/include/llvm/Bitcode/BitcodeWriter.h
+++ llvm/include/llvm/Bitcode/BitcodeWriter.h
@@ -165,6 +165,11 @@
                             bool EmbedCmdline,
                             const std::vector<uint8_t> &CmdArgs);
 
+  /// Embeds the memory buffer \p Buf into the module \p M as a global using the
+  /// section name \p SectionName.
+  void EmbedObjectInModule(Module &M, MemoryBufferRef Buf,
+                           StringRef SectionName);
+
 } // end namespace llvm
 
 #endif // LLVM_BITCODE_BITCODEWRITER_H
Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
===================================================================
--- /dev/null
+++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -0,0 +1,469 @@
+//===-- clang-linker-wrapper/ClangLinkerWrapper.cpp - wrapper over linker-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This tool works as a wrapper over a linking job. This tool is used to create
+// linked device images for offloading. It scans the linker's input for embedded
+// device offloading data stored in sections `.llvm.offloading.<triple>.<arch>`
+// and extracts it as a temporary file. The extracted device files will then be
+// passed to a device linking job to create a final device image.
+//
+//===---------------------------------------------------------------------===//
+
+#include "clang/Basic/Version.h"
+#include "llvm/BinaryFormat/Magic.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/ArchiveWriter.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+static cl::opt<bool> Help("h", cl::desc("Alias for -help"), cl::Hidden);
+
+// Mark all our options with this category, everything else (except for -help)
+// will be hidden.
+static cl::OptionCategory
+    ClangLinkerWrapperCategory("clang-linker-wrapper options");
+
+static cl::opt<bool> StripSections(
+    "strip-sections", cl::ZeroOrMore,
+    cl::desc("Strip offloading sections from the host object file."),
+    cl::init(true), cl::cat(ClangLinkerWrapperCategory));
+
+static cl::opt<std::string> LinkerUserPath("linker-path",
+                                           cl::desc("Path of linker binary"),
+                                           cl::cat(ClangLinkerWrapperCategory));
+
+// Do not parse linker options.
+static cl::list<std::string>
+    LinkerArgs(cl::Sink, cl::desc("<options to be passed to linker>..."));
+
+/// Path of the current binary.
+static std::string LinkerExecutable;
+
+/// Magic section string that marks the existence of offloading data. The
+/// section string will be formatted as `.llvm.offloading.<triple>.<arch>`.
+#define OFFLOAD_SECTION_MAGIC_STR ".llvm.offloading"
+
+struct DeviceFile {
+  DeviceFile(StringRef TheTriple, StringRef Arch, StringRef Filename)
+      : TheTriple(TheTriple), Arch(Arch), Filename(Filename) {}
+
+  const Triple TheTriple;
+  const std::string Arch;
+  const std::string Filename;
+};
+
+namespace {
+
+Expected<Optional<std::string>>
+extractFromBuffer(std::unique_ptr<MemoryBuffer> Buffer,
+                  SmallVectorImpl<DeviceFile> &DeviceFiles);
+
+Error runLinker(std::string &LinkerPath, SmallVectorImpl<std::string> &Args) {
+  std::vector<StringRef> LinkerArgs;
+  LinkerArgs.push_back(LinkerPath);
+  for (auto &Arg : Args)
+    LinkerArgs.push_back(Arg);
+
+  if (sys::ExecuteAndWait(LinkerPath, LinkerArgs))
+    return createStringError(inconvertibleErrorCode(), "'linker' failed");
+  return Error::success();
+}
+
+void PrintVersion(raw_ostream &OS) {
+  OS << clang::getClangToolFullVersion("clang-linker-wrapper") << '\n';
+}
+
+void removeFromCompilerUsed(Module &M, GlobalValue &Value) {
+  GlobalVariable *GV = M.getGlobalVariable("llvm.compiler.used");
+  Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+  Constant *ValueToRemove =
+      ConstantExpr::getPointerBitCastOrAddrSpaceCast(&Value, Int8PtrTy);
+  SmallPtrSet<Constant *, 16> InitAsSet;
+  SmallVector<Constant *, 16> Init;
+  if (GV) {
+    if (GV->hasInitializer()) {
+      auto *CA = cast<ConstantArray>(GV->getInitializer());
+      for (auto &Op : CA->operands()) {
+        Constant *C = cast_or_null<Constant>(Op);
+        if (C != ValueToRemove && InitAsSet.insert(C).second)
+          Init.push_back(C);
+      }
+    }
+    GV->eraseFromParent();
+  }
+
+  if (Init.empty())
+    return;
+
+  ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
+  GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+                                ConstantArray::get(ATy, Init),
+                                "llvm.compiler.used");
+  GV->setSection("llvm.metadata");
+}
+
+Expected<Optional<std::string>>
+extractFromBinary(const ObjectFile &Obj,
+                  SmallVectorImpl<DeviceFile> &DeviceFiles) {
+
+  StringRef Extension = sys::path::extension(Obj.getFileName()).drop_front();
+  StringRef Prefix = sys::path::stem(Obj.getFileName()).take_until([](char C) {
+    return C == '-';
+  });
+  SmallVector<StringRef, 4> ToBeStripped;
+
+  // Extract data from sections of the form `.llvm.offloading.<triple>.<arch>`.
+  for (const SectionRef &Sec : Obj.sections()) {
+    Expected<StringRef> Name = Sec.getName();
+    if (!Name || !Name->startswith(OFFLOAD_SECTION_MAGIC_STR))
+      continue;
+
+    SmallVector<StringRef, 4> SectionFields;
+    Name->split(SectionFields, '.', -1, false);
+
+    assert(SectionFields.size() == 4 &&
+           "Offloading section name is missing required fields");
+
+    const StringRef DeviceTriple = SectionFields[2];
+    const StringRef Arch = SectionFields[3];
+
+    if (Expected<StringRef> Contents = Sec.getContents()) {
+      SmallString<128> TempFile;
+      if (std::error_code EC = sys::fs::createTemporaryFile(
+              Prefix + "-device-" + DeviceTriple, Extension, TempFile))
+        return createFileError(TempFile, EC);
+
+      Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
+          FileOutputBuffer::create(TempFile, Sec.getSize());
+      if (!OutputOrErr)
+        return OutputOrErr.takeError();
+      std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
+      std::copy(Contents->begin(), Contents->end(), Output->getBufferStart());
+      if (Error E = Output->commit())
+        return E;
+
+      DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile);
+      ToBeStripped.push_back(*Name);
+    }
+  }
+
+  if (ToBeStripped.empty())
+    return None;
+
+  // We will use llvm-strip to remove the now unneeded section containing the
+  // offloading code.
+  ErrorOr<std::string> StripPath = sys::findProgramByName(
+      "llvm-strip", sys::path::parent_path(LinkerExecutable));
+  if (!StripPath)
+    StripPath = sys::findProgramByName("llvm-strip");
+  if (!StripPath)
+    return createStringError(StripPath.getError(),
+                             "Unable to find 'llvm-strip' in path");
+
+  SmallString<128> TempFile;
+  if (std::error_code EC =
+          sys::fs::createTemporaryFile(Prefix + "-host", Extension, TempFile))
+    return createFileError(TempFile, EC);
+
+  SmallVector<StringRef, 8> StripArgs;
+  StripArgs.push_back(*StripPath);
+  StripArgs.push_back("--no-strip-all");
+  StripArgs.push_back(Obj.getFileName());
+  for (auto &Section : ToBeStripped) {
+    StripArgs.push_back("--remove-section");
+    StripArgs.push_back(Section);
+  }
+  StripArgs.push_back("-o");
+  StripArgs.push_back(TempFile);
+
+  if (sys::ExecuteAndWait(*StripPath, StripArgs))
+    return createStringError(inconvertibleErrorCode(), "'llvm-strip' failed");
+
+  return static_cast<std::string>(TempFile);
+}
+
+Expected<Optional<std::string>>
+extractFromBitcode(std::unique_ptr<MemoryBuffer> Buffer,
+                   SmallVectorImpl<DeviceFile> &DeviceFiles) {
+  LLVMContext Context;
+  SMDiagnostic Err;
+  std::unique_ptr<Module> M = getLazyIRModule(std::move(Buffer), Err, Context);
+  if (!M)
+    return createStringError(inconvertibleErrorCode(),
+                             "Failed to create module");
+
+  StringRef Extension = sys::path::extension(M->getName()).drop_front();
+  StringRef Prefix =
+      sys::path::stem(M->getName()).take_until([](char C) { return C == '-'; });
+
+  SmallVector<GlobalVariable *, 4> ToBeDeleted;
+
+  // Extract data from the global string containing a section of the form
+  // `.llvm.offloading.<triple>.<arch>`.
+  for (GlobalVariable &GV : M->globals()) {
+    if (!GV.hasSection() ||
+        !GV.getSection().startswith(OFFLOAD_SECTION_MAGIC_STR))
+      continue;
+
+    auto *CDS = dyn_cast<ConstantDataSequential>(GV.getInitializer());
+    if (!CDS)
+      continue;
+
+    SmallVector<StringRef, 4> SectionFields;
+    GV.getSection().split(SectionFields, '.', -1, false);
+
+    assert(SectionFields.size() == 4 &&
+           "Offloading section name is missing required fields");
+
+    const StringRef DeviceTriple = SectionFields[2];
+    const StringRef Arch = SectionFields[3];
+
+    StringRef Contents = CDS->getAsString();
+    SmallString<128> TempFile;
+    if (std::error_code EC = sys::fs::createTemporaryFile(
+            Prefix + "-device-" + DeviceTriple, Extension, TempFile))
+      return createFileError(TempFile, EC);
+
+    Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
+        FileOutputBuffer::create(TempFile, Contents.size());
+    if (!OutputOrErr)
+      return OutputOrErr.takeError();
+    std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
+    std::copy(Contents.begin(), Contents.end(), Output->getBufferStart());
+    if (Error E = Output->commit())
+      return E;
+
+    DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile);
+    ToBeDeleted.push_back(&GV);
+  }
+
+  if (ToBeDeleted.empty())
+    return None;
+
+  // We need to materialize the lazy module before we make any changes.
+  if (Error Err = M->materializeAll())
+    return Err;
+
+  // Remove the global from the module and write it to a new file.
+  for (GlobalVariable *GV : ToBeDeleted) {
+    removeFromCompilerUsed(*M, *GV);
+    GV->eraseFromParent();
+  }
+
+  SmallString<128> TempFile;
+  if (std::error_code EC =
+          sys::fs::createTemporaryFile(Prefix + "-host", Extension, TempFile))
+    return createFileError(TempFile, EC);
+  std::error_code EC;
+  raw_fd_ostream HostOutput(TempFile, EC, sys::fs::OF_None);
+  if (EC)
+    return createFileError(TempFile, EC);
+  WriteBitcodeToFile(*M, HostOutput);
+  return static_cast<std::string>(TempFile);
+}
+
+Expected<Optional<std::string>>
+extractFromArchive(const Archive &Library,
+                   SmallVectorImpl<DeviceFile> &DeviceFiles) {
+
+  StringRef Extension =
+      sys::path::extension(Library.getFileName()).drop_front();
+  StringRef Prefix =
+      sys::path::stem(Library.getFileName()).take_until([](char C) {
+        return C == '-';
+      });
+
+  bool NewMembers = false;
+  SmallVector<NewArchiveMember, 8> Members;
+
+  // Try to extract device code from each file stored in the static archive.
+  // Save the stripped archive members to create a new host archive with the
+  // offloading code removed.
+  Error Err = Error::success();
+  for (auto Child : Library.children(Err)) {
+    auto ChildBufferRefOrErr = Child.getMemoryBufferRef();
+    if (!ChildBufferRefOrErr)
+      return ChildBufferRefOrErr.takeError();
+    std::unique_ptr<MemoryBuffer> ChildBuffer =
+        MemoryBuffer::getMemBuffer(*ChildBufferRefOrErr, false);
+
+    auto FileOrErr = extractFromBuffer(std::move(ChildBuffer), DeviceFiles);
+    if (!FileOrErr)
+      return FileOrErr.takeError();
+
+    // If we created a new stripped host file, use it to create a new archive
+    // member, otherwise use the old member.
+    if (!FileOrErr->hasValue()) {
+      Expected<NewArchiveMember> NewMember =
+          NewArchiveMember::getOldMember(Child, true);
+      if (!NewMember)
+        return NewMember.takeError();
+      Members.push_back(std::move(*NewMember));
+    } else {
+      Expected<NewArchiveMember> NewMember =
+          NewArchiveMember::getFile(**FileOrErr, true);
+      if (!NewMember)
+        return NewMember.takeError();
+      Members.push_back(std::move(*NewMember));
+      NewMembers = true;
+
+      // We no longer need the stripped file, remove it.
+      if (std::error_code EC = sys::fs::remove(**FileOrErr))
+        return createFileError(**FileOrErr, EC);
+    }
+  }
+
+  if (Err)
+    return Err;
+
+  if (!NewMembers)
+    return None;
+
+  // Create a new static library using the stripped host files.
+  SmallString<128> TempFile;
+  if (std::error_code EC =
+          sys::fs::createTemporaryFile(Prefix + "-host", Extension, TempFile))
+    return createFileError(TempFile, EC);
+
+  std::unique_ptr<MemoryBuffer> Buffer =
+      MemoryBuffer::getMemBuffer(Library.getMemoryBufferRef(), false);
+  if (Error WriteErr = writeArchive(TempFile, Members, true, Library.kind(),
+                                    true, Library.isThin(), std::move(Buffer)))
+    return WriteErr;
+
+  return static_cast<std::string>(TempFile);
+}
+
+/// Extracts embedded device offloading code from a memory \p Buffer to a list
+/// of \p DeviceFiles. If device code was extracted a new file with the embedded
+/// device code stripped from the buffer will be returned.
+Expected<Optional<std::string>>
+extractFromBuffer(std::unique_ptr<MemoryBuffer> Buffer,
+                  SmallVectorImpl<DeviceFile> &DeviceFiles) {
+  file_magic Type = identify_magic(Buffer->getBuffer());
+  switch (Type) {
+  case file_magic::bitcode:
+    return extractFromBitcode(std::move(Buffer), DeviceFiles);
+  case file_magic::elf_relocatable:
+  case file_magic::macho_object:
+  case file_magic::coff_object: {
+    Expected<std::unique_ptr<ObjectFile>> ObjFile =
+        ObjectFile::createObjectFile(*Buffer, Type);
+    if (!ObjFile)
+      return ObjFile.takeError();
+    return extractFromBinary(*ObjFile->get(), DeviceFiles);
+  }
+  case file_magic::archive: {
+    Expected<std::unique_ptr<llvm::object::Archive>> LibFile =
+        object::Archive::create(*Buffer);
+    if (!LibFile)
+      return LibFile.takeError();
+    return extractFromArchive(*LibFile->get(), DeviceFiles);
+  }
+  default:
+    return errorCodeToError(object_error::invalid_file_type);
+  }
+
+  return None;
+}
+
+} // namespace
+
+int main(int argc, const char **argv) {
+  InitLLVM X(argc, argv);
+
+  sys::PrintStackTraceOnErrorSignal(argv[0]);
+  cl::SetVersionPrinter(PrintVersion);
+  cl::HideUnrelatedOptions(ClangLinkerWrapperCategory);
+  cl::ParseCommandLineOptions(
+      argc, argv,
+      "A wrapper utility over the host linker. It scans the input files for\n"
+      "sections that require additional processing prior to linking. The tool\n"
+      "will then transparently pass all arguments and input to the specified\n"
+      "host linker to create the final binary.\n");
+
+  if (Help) {
+    cl::PrintHelpMessage();
+    return EXIT_SUCCESS;
+  }
+  LinkerExecutable = argv[0];
+
+  SmallVector<std::string, 4> TempFiles;
+  SmallVector<DeviceFile, 4> DeviceFiles;
+
+  auto reportError = [argv](Error E) {
+    logAllUnhandledErrors(std::move(E), WithColor::error(errs(), argv[0]));
+    exit(EXIT_FAILURE);
+  };
+
+  // Try to extract device code from the linker input and replace the linker
+  // input with a new file that has the device section stripped.
+  for (std::string &Arg : LinkerArgs) {
+    if (sys::path::extension(Arg) == ".o" ||
+        sys::path::extension(Arg) == ".a") {
+      ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+          MemoryBuffer::getFileOrSTDIN(Arg);
+      if (std::error_code EC = BufferOrErr.getError())
+        reportError(createFileError(Arg, EC));
+
+      auto NewFileOrErr =
+          extractFromBuffer(std::move(*BufferOrErr), DeviceFiles);
+
+      if (!NewFileOrErr)
+        reportError(NewFileOrErr.takeError());
+
+      if (NewFileOrErr->hasValue()) {
+        TempFiles.push_back(**NewFileOrErr);
+        Arg = **NewFileOrErr;
+      }
+    }
+  }
+
+  // Add the newly extracted device files to the temporary list.
+  for (const auto &DeviceFile : DeviceFiles)
+    TempFiles.push_back(DeviceFile.Filename);
+
+  // TODO: Perform appropriate device linking action.
+  // TODO: Wrap device image in a host binary and pass it to the linker.
+  WithColor::warning(errs(), argv[0]) << "Offload linking not yet supported.\n";
+
+  SmallVector<std::string, 16> LinkerArgv;
+  for (const std::string &Arg : LinkerArgs)
+    LinkerArgv.push_back(Arg);
+
+  // Run the host linking job.
+  if (Error Err = runLinker(LinkerUserPath, LinkerArgv))
+    reportError(std::move(Err));
+
+  for (const auto &TempFile : TempFiles) {
+    if (std::error_code EC = sys::fs::remove(TempFile))
+      reportError(createFileError(TempFile, EC));
+  }
+
+  return EXIT_SUCCESS;
+}
Index: clang/tools/clang-linker-wrapper/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang/tools/clang-linker-wrapper/CMakeLists.txt
@@ -0,0 +1,25 @@
+set(LLVM_LINK_COMPONENTS BitWriter Core BinaryFormat IRReader Object Support)
+
+if(NOT CLANG_BUILT_STANDALONE)
+  set(tablegen_deps intrinsics_gen)
+endif()
+
+add_clang_executable(clang-linker-wrapper
+  ClangLinkerWrapper.cpp
+
+  DEPENDS
+  ${tablegen_deps}
+  )
+
+set(CLANG_LINKER_WRAPPER_LIB_DEPS
+  clangBasic
+  )
+
+add_dependencies(clang clang-linker-wrapper)
+
+target_link_libraries(clang-linker-wrapper
+  PRIVATE
+  ${CLANG_LINKER_WRAPPER_LIB_DEPS}
+  )
+
+install(TARGETS clang-linker-wrapper RUNTIME DESTINATION bin)
Index: clang/tools/CMakeLists.txt
===================================================================
--- clang/tools/CMakeLists.txt
+++ clang/tools/CMakeLists.txt
@@ -9,6 +9,7 @@
 add_clang_subdirectory(clang-fuzzer)
 add_clang_subdirectory(clang-import-test)
 add_clang_subdirectory(clang-nvlink-wrapper)
+add_clang_subdirectory(clang-linker-wrapper)
 add_clang_subdirectory(clang-offload-bundler)
 add_clang_subdirectory(clang-offload-wrapper)
 add_clang_subdirectory(clang-scan-deps)
Index: clang/test/Frontend/embed-object.ll
===================================================================
--- /dev/null
+++ clang/test/Frontend/embed-object.ll
@@ -0,0 +1,13 @@
+; RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
+; RUN:    -fembed-offload-binary=%S/Inputs/empty.h -fembed-offload-section=section -x ir %s -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK
+
+; CHECK: @llvm.embedded.object = private constant [0 x i8] zeroinitializer, section ".llvm.offloading.section", align 1
+; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* @x, i8* getelementptr inbounds ([0 x i8], [0 x i8]* @llvm.embedded.object, i32 0, i32 0)], section "llvm.metadata"
+
+@x = private constant i8 1
+@llvm.compiler.used = appending global [1 x i8*] [i8* @x], section "llvm.metadata"
+
+define i32 @foo() {
+  ret i32 0
+}
Index: clang/test/Driver/openmp-offload-gpu.c
===================================================================
--- clang/test/Driver/openmp-offload-gpu.c
+++ clang/test/Driver/openmp-offload-gpu.c
@@ -343,3 +343,13 @@
 // RUN:   | FileCheck -check-prefix=SAVE_TEMPS_NAMES %s
 
 // SAVE_TEMPS_NAMES-NOT: "GNU::Linker"{{.*}}["[[SAVE_TEMPS_INPUT1:.*\.o]]", "[[SAVE_TEMPS_INPUT1]]"]
+
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:          -fopenmp-new-driver -no-canonical-prefixes -ccc-print-bindings %s -o openmp-offload-gpu 2>&1 \
+// RUN:   | FileCheck -check-prefix=NEW_DRIVER %s
+
+// NEW_DRIVER: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_INPUT:.+]]"], output: "[[HOST_BC:.+]]" 
+// NEW_DRIVER: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[DEVICE_INPUT:.+]]", "[[HOST_BC]]"], output: "[[DEVICE_ASM:.+]]"
+// NEW_DRIVER: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[DEVICE_ASM]]"], output: "[[DEVICE_OBJ:.+]]" 
+// NEW_DRIVER: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_OBJ]]"], output: "[[HOST_OBJ:.+]]" 
+// NEW_DRIVER: "x86_64-unknown-linux-gnu" - "[[LINKER:.+]]", inputs: ["[[HOST_OBJ]]"], output: "openmp-offload-gpu"
Index: clang/lib/Driver/ToolChains/Clang.h
===================================================================
--- clang/lib/Driver/ToolChains/Clang.h
+++ clang/lib/Driver/ToolChains/Clang.h
@@ -170,6 +170,21 @@
                     const char *LinkingOutput) const override;
 };
 
+/// Linker wrapper tool.
+class LLVM_LIBRARY_VISIBILITY LinkerWrapper final : public Tool {
+  const Tool *Linker;
+
+public:
+  LinkerWrapper(const ToolChain &TC, const Tool *Linker)
+      : Tool("Offload::Linker", "linker", TC), Linker(Linker) {}
+
+  bool hasIntegratedCPP() const override { return false; }
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
 } // end namespace tools
 
 } // end namespace driver
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -4338,6 +4338,7 @@
   bool IsHIP = JA.isOffloading(Action::OFK_HIP);
   bool IsHIPDevice = JA.isDeviceOffloading(Action::OFK_HIP);
   bool IsOpenMPDevice = JA.isDeviceOffloading(Action::OFK_OpenMP);
+  bool IsOpenMPHost = JA.isHostOffloading(Action::OFK_OpenMP);
   bool IsHeaderModulePrecompile = isa<HeaderModulePrecompileJobAction>(JA);
   bool IsDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) ||
                                  JA.isDeviceOffloading(Action::OFK_Host));
@@ -4356,6 +4357,7 @@
       IsHeaderModulePrecompile ? HeaderModuleInput : Inputs[0];
 
   InputInfoList ModuleHeaderInputs;
+  InputInfoList OpenMPHostInputs;
   const InputInfo *CudaDeviceInput = nullptr;
   const InputInfo *OpenMPDeviceInput = nullptr;
   for (const InputInfo &I : Inputs) {
@@ -4374,6 +4376,8 @@
       CudaDeviceInput = &I;
     } else if (IsOpenMPDevice && !OpenMPDeviceInput) {
       OpenMPDeviceInput = &I;
+    } else if (IsOpenMPHost) {
+      OpenMPHostInputs.push_back(I);
     } else {
       llvm_unreachable("unexpectedly given multiple inputs");
     }
@@ -6866,6 +6870,32 @@
     }
   }
 
+  // Host-side OpenMP offloading receives the device object files and embeds it
+  // in a named section including the associated target triple and architecture.
+  if (IsOpenMPHost && !OpenMPHostInputs.empty()) {
+    SmallString<128> InputFiles("-fembed-offload-binary=");
+    SmallString<128> InputSections("-fembed-offload-section=");
+
+    auto InputFile = OpenMPHostInputs.begin();
+    auto OpenMPTCs = C.getOffloadToolChains<Action::OFK_OpenMP>();
+    for (auto TI = OpenMPTCs.first, TE = OpenMPTCs.second; TI != TE;
+         ++TI, ++InputFile) {
+      const ToolChain *TC = TI->second;
+      const ArgList &TCArgs = C.getArgsForToolChain(TC, "", Action::OFK_OpenMP);
+      InputSections += TC->getTripleString() + ".";
+      InputSections += TCArgs.getLastArgValue(options::OPT_march_EQ);
+      InputSections += ",";
+
+      InputFiles += C.getArgs().MakeArgString(TC->getInputFilename(*InputFile));
+      InputFiles += ",";
+    }
+    InputSections.pop_back();
+    InputFiles.pop_back();
+
+    CmdArgs.push_back(Args.MakeArgString(InputFiles.str()));
+    CmdArgs.push_back(Args.MakeArgString(InputSections.str()));
+  }
+
   if (Triple.isAMDGPU()) {
     handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs);
 
@@ -8092,3 +8122,28 @@
       Args.MakeArgString(getToolChain().GetProgramPath(getShortName())),
       CmdArgs, Inputs, Output));
 }
+
+void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const ArgList &Args,
+                                 const char *LinkingOutput) const {
+  ArgStringList CmdArgs;
+
+  // Construct the link job so we can wrap around it.
+  Linker->ConstructJob(C, JA, Output, Inputs, Args, LinkingOutput);
+  const auto &LinkCommand = C.getJobs().getJobs().back();
+
+  CmdArgs.push_back("-linker-path");
+  CmdArgs.push_back(LinkCommand->getExecutable());
+  for (const char *LinkArg : LinkCommand->getArguments())
+    CmdArgs.push_back(LinkArg);
+
+  const char *Exec =
+      Args.MakeArgString(getToolChain().GetProgramPath("clang-linker-wrapper"));
+
+  // Replace the executable and arguments associated with the link job to the
+  // wrapper.
+  LinkCommand->replaceExecutable(Exec);
+  LinkCommand->replaceArguments(CmdArgs);
+}
Index: clang/lib/Driver/ToolChain.cpp
===================================================================
--- clang/lib/Driver/ToolChain.cpp
+++ clang/lib/Driver/ToolChain.cpp
@@ -324,6 +324,12 @@
   return OffloadWrapper.get();
 }
 
+Tool *ToolChain::getLinkerWrapper() const {
+  if (!LinkerWrapper)
+    LinkerWrapper.reset(new tools::LinkerWrapper(*this, getLink()));
+  return LinkerWrapper.get();
+}
+
 Tool *ToolChain::getTool(Action::ActionClass AC) const {
   switch (AC) {
   case Action::AssembleJobClass:
@@ -362,6 +368,8 @@
 
   case Action::OffloadWrapperJobClass:
     return getOffloadWrapper();
+  case Action::LinkerWrapperJobClass:
+    return getLinkerWrapper();
   }
 
   llvm_unreachable("Invalid tool kind.");
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -3788,6 +3788,11 @@
   // Builder to be used to build offloading actions.
   OffloadingActionBuilder OffloadBuilder(C, Args, Inputs);
 
+  // Offload kinds active for this compilation.
+  unsigned OffloadKinds = Action::OFK_None;
+  if (C.hasOffloadToolChain<Action::OFK_OpenMP>())
+    OffloadKinds |= Action::OFK_OpenMP;
+
   // Construct the actions to perform.
   HeaderModulePrecompileJobAction *HeaderModuleAction = nullptr;
   ActionList LinkerInputs;
@@ -3808,14 +3813,16 @@
 
     // Use the current host action in any of the offloading actions, if
     // required.
-    if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
-      break;
+    if (!Args.hasArg(options::OPT_fopenmp_new_driver))
+      if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
+        break;
 
     for (phases::ID Phase : PL) {
 
       // Add any offload action the host action depends on.
-      Current = OffloadBuilder.addDeviceDependencesToHostAction(
-          Current, InputArg, Phase, PL.back(), FullPL);
+      if (!Args.hasArg(options::OPT_fopenmp_new_driver))
+        Current = OffloadBuilder.addDeviceDependencesToHostAction(
+            Current, InputArg, Phase, PL.back(), FullPL);
       if (!Current)
         break;
 
@@ -3855,6 +3862,11 @@
         break;
       }
 
+      // Try to build the offloading actions and add the result as a dependency
+      // to the host.
+      if (Args.hasArg(options::OPT_fopenmp_new_driver))
+        Current = BuildOffloadingActions(C, Args, I, Current);
+
       // FIXME: Should we include any prior module file outputs as inputs of
       // later actions in the same command line?
 
@@ -3872,8 +3884,9 @@
 
       // Use the current host action in any of the offloading actions, if
       // required.
-      if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
-        break;
+      if (!Args.hasArg(options::OPT_fopenmp_new_driver))
+        if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
+          break;
 
       if (Current->getType() == types::TY_Nothing)
         break;
@@ -3884,21 +3897,32 @@
       Actions.push_back(Current);
 
     // Add any top level actions generated for offloading.
-    OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg);
+    if (!Args.hasArg(options::OPT_fopenmp_new_driver))
+      OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg);
+    else if (Current)
+      Current->propagateHostOffloadInfo(OffloadKinds,
+                                        /*BoundArch=*/nullptr);
   }
 
   // Add a link action if necessary.
   if (!LinkerInputs.empty()) {
-    if (Action *Wrapper = OffloadBuilder.makeHostLinkAction())
-      LinkerInputs.push_back(Wrapper);
+    if (!Args.hasArg(options::OPT_fopenmp_new_driver))
+      if (Action *Wrapper = OffloadBuilder.makeHostLinkAction())
+        LinkerInputs.push_back(Wrapper);
     Action *LA;
     // Check if this Linker Job should emit a static library.
     if (ShouldEmitStaticLibrary(Args)) {
       LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
+    } else if (Args.hasArg(options::OPT_fopenmp_new_driver) &&
+               OffloadKinds != Action::OFK_None) {
+      LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
+      LA->propagateHostOffloadInfo(OffloadKinds,
+                                   /*BoundArch=*/nullptr);
     } else {
       LA = C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image);
     }
-    LA = OffloadBuilder.processHostLinkAction(LA);
+    if (!Args.hasArg(options::OPT_fopenmp_new_driver))
+      LA = OffloadBuilder.processHostLinkAction(LA);
     Actions.push_back(LA);
   }
 
@@ -3984,6 +4008,68 @@
   Args.ClaimAllArgs(options::OPT_cuda_compile_host_device);
 }
 
+Action *Driver::BuildOffloadingActions(Compilation &C,
+                                       llvm::opt::DerivedArgList &Args,
+                                       const InputTy &Input,
+                                       Action *HostAction) const {
+  if (!isa<CompileJobAction>(HostAction))
+    return HostAction;
+
+  SmallVector<const ToolChain *, 2> ToolChains;
+  ActionList DeviceActions;
+
+  types::ID InputType = Input.first;
+  const Arg *InputArg = Input.second;
+
+  auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
+  for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE; ++TI)
+    ToolChains.push_back(TI->second);
+
+  for (unsigned I = 0; I < ToolChains.size(); ++I)
+    DeviceActions.push_back(C.MakeAction<InputAction>(*InputArg, InputType));
+
+  if (DeviceActions.empty())
+    return HostAction;
+
+  auto PL = types::getCompilationPhases(*this, Args, InputType);
+
+  for (phases::ID Phase : PL) {
+    if (Phase == phases::Link) {
+      assert(Phase == PL.back() && "linking must be final compilation step.");
+      break;
+    }
+
+    auto TC = ToolChains.begin();
+    for (Action *&A : DeviceActions) {
+      A = ConstructPhaseAction(C, Args, Phase, A);
+
+      if (isa<CompileJobAction>(A)) {
+        HostAction->setCannotBeCollapsedWithNextDependentAction();
+        OffloadAction::HostDependence HDep(
+            *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+            /*BoundArch=*/nullptr, Action::OFK_OpenMP);
+        OffloadAction::DeviceDependences DDep;
+        DDep.add(*A, **TC, /*BoundArch=*/nullptr, Action::OFK_OpenMP);
+        A = C.MakeAction<OffloadAction>(HDep, DDep);
+      }
+      ++TC;
+    }
+  }
+
+  OffloadAction::DeviceDependences DDeps;
+
+  auto TC = ToolChains.begin();
+  for (Action *A : DeviceActions) {
+    DDeps.add(*A, **TC, /*BoundArch=*/nullptr, Action::OFK_OpenMP);
+    TC++;
+  }
+
+  OffloadAction::HostDependence HDep(
+      *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+      /*BoundArch=*/nullptr, DDeps);
+  return C.MakeAction<OffloadAction>(HDep, DDeps);
+}
+
 Action *Driver::ConstructPhaseAction(
     Compilation &C, const ArgList &Args, phases::ID Phase, Action *Input,
     Action::OffloadKind TargetDeviceOffloadKind) const {
@@ -4143,7 +4229,7 @@
         ArchNames.insert(A->getValue());
 
   // Set of (Action, canonical ToolChain triple) pairs we've built jobs for.
-  std::map<std::pair<const Action *, std::string>, InputInfo> CachedResults;
+  std::map<std::pair<const Action *, std::string>, InputInfoList> CachedResults;
   for (Action *A : C.getActions()) {
     // If we are linking an image for multiple archs then the linker wants
     // -arch_multiple and -final_output <final image name>. Unfortunately, this
@@ -4600,10 +4686,11 @@
   return TriplePlusArch;
 }
 
-InputInfo Driver::BuildJobsForAction(
+InputInfoList Driver::BuildJobsForAction(
     Compilation &C, const Action *A, const ToolChain *TC, StringRef BoundArch,
     bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
-    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults,
+    std::map<std::pair<const Action *, std::string>, InputInfoList>
+        &CachedResults,
     Action::OffloadKind TargetDeviceOffloadKind) const {
   std::pair<const Action *, std::string> ActionTC = {
       A, GetTriplePlusArchString(TC, BoundArch, TargetDeviceOffloadKind)};
@@ -4611,17 +4698,18 @@
   if (CachedResult != CachedResults.end()) {
     return CachedResult->second;
   }
-  InputInfo Result = BuildJobsForActionNoCache(
+  InputInfoList Result = BuildJobsForActionNoCache(
       C, A, TC, BoundArch, AtTopLevel, MultipleArchs, LinkingOutput,
       CachedResults, TargetDeviceOffloadKind);
   CachedResults[ActionTC] = Result;
   return Result;
 }
 
-InputInfo Driver::BuildJobsForActionNoCache(
+InputInfoList Driver::BuildJobsForActionNoCache(
     Compilation &C, const Action *A, const ToolChain *TC, StringRef BoundArch,
     bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
-    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults,
+    std::map<std::pair<const Action *, std::string>, InputInfoList>
+        &CachedResults,
     Action::OffloadKind TargetDeviceOffloadKind) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
 
@@ -4659,7 +4747,7 @@
 
     // If there is a single device option, just generate the job for it.
     if (OA->hasSingleDeviceDependence()) {
-      InputInfo DevA;
+      InputInfoList DevA;
       OA->doOnEachDeviceDependence([&](Action *DepA, const ToolChain *DepTC,
                                        const char *DepBoundArch) {
         DevA =
@@ -4677,7 +4765,7 @@
     OA->doOnEachDependence(
         /*IsHostDependence=*/BuildingForOffloadDevice,
         [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
-          OffloadDependencesInputInfo.push_back(BuildJobsForAction(
+          OffloadDependencesInputInfo.append(BuildJobsForAction(
               C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false,
               /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults,
               DepA->getOffloadingDeviceKind()));
@@ -4686,6 +4774,17 @@
     A = BuildingForOffloadDevice
             ? OA->getSingleDeviceDependence(/*DoNotConsiderHostActions=*/true)
             : OA->getHostDependence();
+
+    // We may have already built this action as a part of the offloading
+    // toolchain, return the cached input if so.
+    std::pair<const Action *, std::string> ActionTC = {
+        OA->getHostDependence(),
+        GetTriplePlusArchString(TC, BoundArch, TargetDeviceOffloadKind)};
+    if (CachedResults.find(ActionTC) != CachedResults.end()) {
+      InputInfoList Inputs = CachedResults[ActionTC];
+      Inputs.append(OffloadDependencesInputInfo);
+      return Inputs;
+    }
   }
 
   if (const InputAction *IA = dyn_cast<InputAction>(A)) {
@@ -4695,9 +4794,9 @@
     Input.claim();
     if (Input.getOption().matches(options::OPT_INPUT)) {
       const char *Name = Input.getValue();
-      return InputInfo(A, Name, /* _BaseInput = */ Name);
+      return {InputInfo(A, Name, /* _BaseInput = */ Name)};
     }
-    return InputInfo(A, &Input, /* _BaseInput = */ "");
+    return {InputInfo(A, &Input, /* _BaseInput = */ "")};
   }
 
   if (const BindArchAction *BAA = dyn_cast<BindArchAction>(A)) {
@@ -4727,7 +4826,7 @@
   const Tool *T = TS.getTool(Inputs, CollapsedOffloadActions);
 
   if (!T)
-    return InputInfo();
+    return {InputInfo()};
 
   if (BuildingForOffloadDevice &&
       A->getOffloadingDeviceKind() == Action::OFK_OpenMP) {
@@ -4754,7 +4853,7 @@
     cast<OffloadAction>(OA)->doOnEachDependence(
         /*IsHostDependence=*/BuildingForOffloadDevice,
         [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
-          OffloadDependencesInputInfo.push_back(BuildJobsForAction(
+          OffloadDependencesInputInfo.append(BuildJobsForAction(
               C, DepA, DepTC, DepBoundArch, /* AtTopLevel */ false,
               /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults,
               DepA->getOffloadingDeviceKind()));
@@ -4768,7 +4867,7 @@
     // FIXME: Clean this up.
     bool SubJobAtTopLevel =
         AtTopLevel && (isa<DsymutilJobAction>(A) || isa<VerifyJobAction>(A));
-    InputInfos.push_back(BuildJobsForAction(
+    InputInfos.append(BuildJobsForAction(
         C, Input, TC, BoundArch, SubJobAtTopLevel, MultipleArchs, LinkingOutput,
         CachedResults, A->getOffloadingDeviceKind()));
   }
@@ -4852,8 +4951,8 @@
         Arch = BoundArch;
 
       CachedResults[{A, GetTriplePlusArchString(UI.DependentToolChain, Arch,
-                                                UI.DependentOffloadKind)}] =
-          CurI;
+                                                UI.DependentOffloadKind)}] = {
+          CurI};
     }
 
     // Now that we have all the results generated, select the one that should be
@@ -4862,9 +4961,9 @@
         A, GetTriplePlusArchString(TC, BoundArch, TargetDeviceOffloadKind)};
     assert(CachedResults.find(ActionTC) != CachedResults.end() &&
            "Result does not exist??");
-    Result = CachedResults[ActionTC];
+    Result = CachedResults[ActionTC].front();
   } else if (JA->getType() == types::TY_Nothing)
-    Result = InputInfo(A, BaseInput);
+    Result = {InputInfo(A, BaseInput)};
   else {
     // We only have to generate a prefix for the host if this is not a top-level
     // action.
@@ -4917,7 +5016,7 @@
           C.getArgsForToolChain(TC, BoundArch, JA->getOffloadingDeviceKind()),
           LinkingOutput);
   }
-  return Result;
+  return {Result};
 }
 
 const char *Driver::getDefaultImageName() const {
Index: clang/lib/Driver/Action.cpp
===================================================================
--- clang/lib/Driver/Action.cpp
+++ clang/lib/Driver/Action.cpp
@@ -43,6 +43,8 @@
     return "clang-offload-unbundler";
   case OffloadWrapperJobClass:
     return "clang-offload-wrapper";
+  case LinkerWrapperJobClass:
+    return "clang-linker-wrapper";
   case StaticLibJobClass:
     return "static-lib-linker";
   }
@@ -418,6 +420,12 @@
                                                  types::ID Type)
   : JobAction(OffloadWrapperJobClass, Inputs, Type) {}
 
+void LinkerWrapperJobAction::anchor() {}
+
+LinkerWrapperJobAction::LinkerWrapperJobAction(ActionList &Inputs,
+                                               types::ID Type)
+    : JobAction(LinkerWrapperJobClass, Inputs, Type) {}
+
 void StaticLibJobAction::anchor() {}
 
 StaticLibJobAction::StaticLibJobAction(ActionList &Inputs, types::ID Type)
Index: clang/lib/CodeGen/CodeGenAction.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenAction.cpp
+++ clang/lib/CodeGen/CodeGenAction.cpp
@@ -1134,6 +1134,7 @@
     TheModule->setTargetTriple(TargetOpts.Triple);
   }
 
+  EmbedBinary(TheModule.get(), CodeGenOpts, Diagnostics);
   EmbedBitcode(TheModule.get(), CodeGenOpts, *MainFile);
 
   LLVMContext &Ctx = TheModule->getContext();
Index: clang/lib/CodeGen/BackendUtil.cpp
===================================================================
--- clang/lib/CodeGen/BackendUtil.cpp
+++ clang/lib/CodeGen/BackendUtil.cpp
@@ -1738,8 +1738,43 @@
                          llvm::MemoryBufferRef Buf) {
   if (CGOpts.getEmbedBitcode() == CodeGenOptions::Embed_Off)
     return;
+
   llvm::EmbedBitcodeInModule(
       *M, Buf, CGOpts.getEmbedBitcode() != CodeGenOptions::Embed_Marker,
       CGOpts.getEmbedBitcode() != CodeGenOptions::Embed_Bitcode,
       CGOpts.CmdArgs);
 }
+
+void clang::EmbedBinary(llvm::Module *M, const CodeGenOptions &CGOpts,
+    DiagnosticsEngine &Diags) {
+  if (CGOpts.OffloadBinaryString.empty())
+    return;
+
+  SmallVector<StringRef, 4> BinaryFilenames;
+  SmallVector<StringRef, 4> BinarySections;
+  StringRef(CGOpts.OffloadBinaryString).split(BinaryFilenames, ",");
+  StringRef(CGOpts.OffloadSectionString).split(BinarySections, ",");
+
+  assert(BinaryFilenames.size() == BinarySections.size() &&
+         "Different number of filenames and section names in embedding");
+
+  auto BinarySection = BinarySections.begin();
+  for (StringRef BinaryFilename : BinaryFilenames) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> BinaryOrErr =
+      llvm::MemoryBuffer::getFileOrSTDIN(BinaryFilename);
+    if (std::error_code EC = BinaryOrErr.getError()) {
+      auto DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
+          "could not open '%0' for embedding");
+      Diags.Report(DiagID) << BinaryFilename;
+      return;
+    }
+
+    SmallString<128> SectionName(".llvm.offloading");
+    if (!BinarySection->empty()) {
+      SectionName += ".";
+      SectionName += *BinarySection;
+    }
+    llvm::EmbedObjectInModule(*M, **BinaryOrErr, SectionName);
+    ++BinarySection;
+  }
+}
Index: clang/include/clang/Driver/ToolChain.h
===================================================================
--- clang/include/clang/Driver/ToolChain.h
+++ clang/include/clang/Driver/ToolChain.h
@@ -151,6 +151,7 @@
   mutable std::unique_ptr<Tool> IfsMerge;
   mutable std::unique_ptr<Tool> OffloadBundler;
   mutable std::unique_ptr<Tool> OffloadWrapper;
+  mutable std::unique_ptr<Tool> LinkerWrapper;
 
   Tool *getClang() const;
   Tool *getFlang() const;
@@ -161,6 +162,7 @@
   Tool *getClangAs() const;
   Tool *getOffloadBundler() const;
   Tool *getOffloadWrapper() const;
+  Tool *getLinkerWrapper() const;
 
   mutable bool SanitizerArgsChecked = false;
   mutable std::unique_ptr<XRayArgs> XRayArguments;
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -1148,6 +1148,14 @@
   PosFlag<SetTrue, [CC1Option], "Enable support for the C++ Coroutines TS">,
   NegFlag<SetFalse>>;
 
+def fembed_offload_binary_EQ : Joined<["-"], "fembed-offload-binary=">,
+  Group<f_Group>, Flags<[NoXarchOption, CC1Option]>,
+  HelpText<"Embed Offloading device-side binary into host object file.">,
+  MarshallingInfoString<CodeGenOpts<"OffloadBinaryString">>;
+def fembed_offload_section_EQ : Joined<["-"], "fembed-offload-section=">,
+  Group<f_Group>, Flags<[NoXarchOption, CC1Option]>,
+  HelpText<"Section name to use for the embedded device binary.">,
+  MarshallingInfoString<CodeGenOpts<"OffloadSectionString">>;
 def fembed_bitcode_EQ : Joined<["-"], "fembed-bitcode=">,
     Group<f_Group>, Flags<[NoXarchOption, CC1Option, CC1AsOption]>, MetaVarName<"<option>">,
     HelpText<"Embed LLVM bitcode (option: off, all, bitcode, marker)">,
@@ -2461,6 +2469,8 @@
   PosFlag<SetTrue, [CC1Option]>, NegFlag<SetFalse>, BothFlags<[NoArgumentUnused, HelpHidden]>>;
 def static_openmp: Flag<["-"], "static-openmp">,
   HelpText<"Use the static host OpenMP runtime while linking.">;
+def fopenmp_new_driver : Flag<["-"], "fopenmp-new-driver">, Flags<[CC1Option]>, Group<Action_Group>,
+  HelpText<"Use the new driver for OpenMP offloading.">;
 def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
 def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group<f_Group>;
 defm escaping_block_tail_calls : BoolFOption<"escaping-block-tail-calls",
Index: clang/include/clang/Driver/Job.h
===================================================================
--- clang/include/clang/Driver/Job.h
+++ clang/include/clang/Driver/Job.h
@@ -208,6 +208,8 @@
     Arguments = std::move(List);
   }
 
+  void replaceExecutable(const char *Exe) { Executable = Exe; }
+
   const char *getExecutable() const { return Executable; }
 
   const llvm::opt::ArgStringList &getArguments() const { return Arguments; }
Index: clang/include/clang/Driver/Driver.h
===================================================================
--- clang/include/clang/Driver/Driver.h
+++ clang/include/clang/Driver/Driver.h
@@ -12,6 +12,7 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Driver/Action.h"
+#include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Options.h"
 #include "clang/Driver/Phases.h"
 #include "clang/Driver/ToolChain.h"
@@ -38,13 +39,14 @@
 
 namespace driver {
 
-  class Command;
-  class Compilation;
-  class InputInfo;
-  class JobList;
-  class JobAction;
-  class SanitizerArgs;
-  class ToolChain;
+typedef SmallVector<InputInfo, 4> InputInfoList;
+
+class Command;
+class Compilation;
+class JobList;
+class JobAction;
+class SanitizerArgs;
+class ToolChain;
 
 /// Describes the kind of LTO mode selected via -f(no-)?lto(=.*)? options.
 enum LTOKind {
@@ -171,9 +173,11 @@
   /// The file to log CC_LOG_DIAGNOSTICS output to, if enabled.
   std::string CCLogDiagnosticsFilename;
 
+  /// An input type and its arguments.
+  using InputTy = std::pair<types::ID, const llvm::opt::Arg *>;
+
   /// A list of inputs and their types for the given arguments.
-  typedef SmallVector<std::pair<types::ID, const llvm::opt::Arg *>, 16>
-      InputList;
+  using InputList = SmallVector<InputTy, 16>;
 
   /// Whether the driver should follow g++ like behavior.
   bool CCCIsCXX() const { return Mode == GXXMode; }
@@ -413,6 +417,18 @@
   void BuildUniversalActions(Compilation &C, const ToolChain &TC,
                              const InputList &BAInputs) const;
 
+  /// BuildOffloadingActions - Construct the list of actions to perform for the
+  /// offloading toolchain that will be embedded in the host.
+  ///
+  /// \param C - The compilation that is being built.
+  /// \param Args - The input arguments.
+  /// \param Input - The input type and its argument.
+  /// \param HostAction - The host action used in the offloading toolchain.
+  Action *BuildOffloadingActions(Compilation &C,
+                                 llvm::opt::DerivedArgList &Args,
+                                 const InputTy &Input,
+                                 Action *HostAction) const;
+
   /// Check that the file referenced by Value exists. If it doesn't,
   /// issue a diagnostic and return false.
   /// If TypoCorrect is true and the file does not exist, see if it looks
@@ -503,13 +519,12 @@
   /// BuildJobsForAction - Construct the jobs to perform for the action \p A and
   /// return an InputInfo for the result of running \p A.  Will only construct
   /// jobs for a given (Action, ToolChain, BoundArch, DeviceKind) tuple once.
-  InputInfo
-  BuildJobsForAction(Compilation &C, const Action *A, const ToolChain *TC,
-                     StringRef BoundArch, bool AtTopLevel, bool MultipleArchs,
-                     const char *LinkingOutput,
-                     std::map<std::pair<const Action *, std::string>, InputInfo>
-                         &CachedResults,
-                     Action::OffloadKind TargetDeviceOffloadKind) const;
+  InputInfoList BuildJobsForAction(
+      Compilation &C, const Action *A, const ToolChain *TC, StringRef BoundArch,
+      bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
+      std::map<std::pair<const Action *, std::string>, InputInfoList>
+          &CachedResults,
+      Action::OffloadKind TargetDeviceOffloadKind) const;
 
   /// Returns the default name for linked images (e.g., "a.out").
   const char *getDefaultImageName() const;
@@ -617,10 +632,10 @@
   /// Helper used in BuildJobsForAction.  Doesn't use the cache when building
   /// jobs specifically for the given action, but will use the cache when
   /// building jobs for the Action's inputs.
-  InputInfo BuildJobsForActionNoCache(
+  InputInfoList BuildJobsForActionNoCache(
       Compilation &C, const Action *A, const ToolChain *TC, StringRef BoundArch,
       bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
-      std::map<std::pair<const Action *, std::string>, InputInfo>
+      std::map<std::pair<const Action *, std::string>, InputInfoList>
           &CachedResults,
       Action::OffloadKind TargetDeviceOffloadKind) const;
 
Index: clang/include/clang/Driver/Action.h
===================================================================
--- clang/include/clang/Driver/Action.h
+++ clang/include/clang/Driver/Action.h
@@ -73,6 +73,7 @@
     OffloadBundlingJobClass,
     OffloadUnbundlingJobClass,
     OffloadWrapperJobClass,
+    LinkerWrapperJobClass,
     StaticLibJobClass,
 
     JobClassFirst = PreprocessJobClass,
@@ -642,6 +643,17 @@
   }
 };
 
+class LinkerWrapperJobAction : public JobAction {
+  void anchor() override;
+
+public:
+  LinkerWrapperJobAction(ActionList &Inputs, types::ID Type);
+
+  static bool classof(const Action *A) {
+    return A->getKind() == LinkerWrapperJobClass;
+  }
+};
+
 class StaticLibJobAction : public JobAction {
   void anchor() override;
 
Index: clang/include/clang/CodeGen/BackendUtil.h
===================================================================
--- clang/include/clang/CodeGen/BackendUtil.h
+++ clang/include/clang/CodeGen/BackendUtil.h
@@ -44,6 +44,9 @@
 
   void EmbedBitcode(llvm::Module *M, const CodeGenOptions &CGOpts,
                     llvm::MemoryBufferRef Buf);
+
+  void EmbedBinary(llvm::Module *M, const CodeGenOptions &CGOpts,
+                   DiagnosticsEngine &Diags);
 }
 
 #endif
Index: clang/include/clang/Basic/CodeGenOptions.h
===================================================================
--- clang/include/clang/Basic/CodeGenOptions.h
+++ clang/include/clang/Basic/CodeGenOptions.h
@@ -276,6 +276,14 @@
   /// CUDA runtime back-end for incorporating them into host-side object file.
   std::string CudaGpuBinaryFileName;
 
+  /// List of files passed with the -fembed-offload-binary option to embed
+  /// device-side offloading binaries in the host object file.
+  std::string OffloadBinaryString;
+
+  /// List of section names passed with -fembed-offload-section to use when
+  /// embedding files passed with -fembed-offload-binary.
+  std::string OffloadSectionString;
+
   /// The name of the file to which the backend should save YAML optimization
   /// records.
   std::string OptRecordFile;
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
  • [PATCH] D116541: [OpenMP] Int... Joseph Huber via Phabricator via cfe-commits

Reply via email to