jhuber6 created this revision.
jhuber6 added reviewers: tra, yaxunl, JonChesterfield.
Herald added subscribers: mattd, gchakrabarti, carlosgalvezp, asavonic.
Herald added a project: All.
jhuber6 requested review of this revision.
Herald added subscribers: cfe-commits, sstefan1, MaskRay.
Herald added a reviewer: jdoerfert.
Herald added a project: clang.

Currently, the NVPTX compilation toolchain can only be invoked through
CUDA, or through OpenMP with `--offload-device-only`. This is because we
cannot build a CUDA toolchain without an accompanying host toolchain for
the offloading. When using `--target=nvptx64-nvidia-cuda` directly, the
driver instead generates calls to the GNU assembler and linker, leading
to errors.
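
For example, an invocation such as the following (file name and `sm_61`
architecture chosen purely for illustration) would previously hand the
generated PTX off to the host's GNU assembler, which cannot consume it:

  $ clang --target=nvptx64-nvidia-cuda -march=sm_61 foo.c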

This patch abstracts the portions of the CUDA toolchain that are
independent of the host toolchain or offloading kind into a new base
class called `NVPTXToolChain`. We still need a host triple to locate the
CUDA installation, so if one is not present we assume it matches the
host system for now; alternatively, the user can provide the
installation path explicitly.

This should allow the compiler driver to create NVPTX device images
directly from C/C++ code.
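
For example, with this patch an invocation such as

  $ clang --target=nvptx64-nvidia-cuda -march=sm_61 foo.c -o foo

should preprocess and compile with clang, assemble the resulting PTX with
ptxas, and link the object file with nvlink, as exercised by the new test
below.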


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D140158

Files:
  clang/lib/Driver/Driver.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/lib/Driver/ToolChains/Cuda.h
  clang/test/Driver/cuda-cross-compiling.c

Index: clang/test/Driver/cuda-cross-compiling.c
===================================================================
--- /dev/null
+++ clang/test/Driver/cuda-cross-compiling.c
@@ -0,0 +1,48 @@
+// Tests the driver when targeting the NVPTX architecture directly without a
+// host toolchain to perform CUDA mappings.
+
+// REQUIRES: nvptx-registered-target
+
+//
+// Test the generated phases when targeting NVPTX.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -ccc-print-phases %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=PHASES %s
+
+//      PHASES: 0: input, "[[INPUT:.+]]", c
+// PHASES-NEXT: 1: preprocessor, {0}, cpp-output
+// PHASES-NEXT: 2: compiler, {1}, ir
+// PHASES-NEXT: 3: backend, {2}, assembler
+// PHASES-NEXT: 4: assembler, {3}, object
+// PHASES-NEXT: 5: linker, {4}, image
+
+//
+// Test the generated bindings when targeting NVPTX.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -ccc-print-bindings %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=BINDINGS %s
+
+//      BINDINGS: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]].s"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]].s"], output: "[[CUBIN:.+]].o"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: ["[[CUBIN]].o"], output: "a.out"
+
+//
+// Test the generated arguments to the CUDA binary utils when targeting NVPTX. 
+// Ensure that the '.o' files are converted to '.cubin' as well.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=ARGS %s
+
+//      ARGS: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_61" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
+// ARGS-NEXT: ptxas" "-m64" "-O0" "--gpu-name" "sm_61" "--output-file" "[[CUBIN:.+]].o" "[[PTX]].s" "-c"
+// ARGS-NEXT: nvlink" "-o" "a.out" "-arch" "sm_61" {{.*}} "{{.*}}.cubin"
+
+//
+// Test that the generated arguments use the default architecture when none is given.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=DEFAULT %s
+
+//      DEFAULT: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_35" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
+// DEFAULT-NEXT: ptxas" "-m64" "-O0" "--gpu-name" "sm_35" "--output-file" "[[CUBIN:.+]].o" "[[PTX]].s" "-c"
+// DEFAULT-NEXT: nvlink" "-o" "a.out" "-arch" "sm_35" {{.*}} "{{.*}}.cubin"
Index: clang/lib/Driver/ToolChains/Cuda.h
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.h
+++ clang/lib/Driver/ToolChains/Cuda.h
@@ -96,11 +96,25 @@
                      const char *LinkingOutput) const override;
 };
 
+// Runs fatbinary, which combines GPU object files ("cubin" files) and/or PTX
+// assembly into a single output file.
+class LLVM_LIBRARY_VISIBILITY FatBinary : public Tool {
+ public:
+   FatBinary(const ToolChain &TC) : Tool("NVPTX::Linker", "fatbinary", TC) {}
+
+   bool hasIntegratedCPP() const override { return false; }
+
+   void ConstructJob(Compilation &C, const JobAction &JA,
+                     const InputInfo &Output, const InputInfoList &Inputs,
+                     const llvm::opt::ArgList &TCArgs,
+                     const char *LinkingOutput) const override;
+};
+
 // Runs nvlink, which links GPU object files ("cubin" files) into a single
 // output file.
 class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
  public:
-   Linker(const ToolChain &TC) : Tool("NVPTX::Linker", "fatbinary", TC) {}
+   Linker(const ToolChain &TC) : Tool("NVPTX::Linker", "nvlink", TC) {}
 
    bool hasIntegratedCPP() const override { return false; }
 
@@ -119,43 +133,68 @@
 
 namespace toolchains {
 
-class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain {
-public:
-  CudaToolChain(const Driver &D, const llvm::Triple &Triple,
-                const ToolChain &HostTC, const llvm::opt::ArgList &Args);
+class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
+ public:
+   NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                  const llvm::Triple &HostTriple,
+                  const llvm::opt::ArgList &Args);
+
+   NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                  const llvm::opt::ArgList &Args);
+
+   llvm::opt::DerivedArgList *
+   TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch,
+                 Action::OffloadKind DeviceOffloadKind) const override;
+
+   // Never try to use the integrated assembler with NVPTX; always fork out to
+   // ptxas.
+   bool useIntegratedAs() const override { return false; }
+   bool isCrossCompiling() const override { return true; }
+   bool isPICDefault() const override { return false; }
+   bool isPIEDefault(const llvm::opt::ArgList &Args) const override {
+    return false;
+   }
+   bool isPICDefaultForced() const override { return false; }
+   bool SupportsProfiling() const override { return false; }
 
-  const llvm::Triple *getAuxTriple() const override {
+   bool IsMathErrnoDefault() const override { return false; }
+
+   bool supportsDebugInfoOption(const llvm::opt::Arg *A) const override;
+   void adjustDebugInfoKind(codegenoptions::DebugInfoKind &DebugInfoKind,
+                            const llvm::opt::ArgList &Args) const override;
+
+   // NVPTX supports only DWARF2.
+   unsigned GetDefaultDwarfVersion() const override { return 2; }
+   unsigned getMaxDwarfVersion() const override { return 2; }
+
+   CudaInstallationDetector CudaInstallation;
+
+ protected:
+   Tool *buildAssembler() const override; // ptxas.
+   Tool *buildLinker() const override;    // nvlink.
+};
+
+class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
+ public:
+   CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+                 const ToolChain &HostTC, const llvm::opt::ArgList &Args);
+
+   const llvm::Triple *getAuxTriple() const override {
     return &HostTC.getTriple();
   }
 
-  std::string getInputFilename(const InputInfo &Input) const override;
-
   llvm::opt::DerivedArgList *
   TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch,
                 Action::OffloadKind DeviceOffloadKind) const override;
-  void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
-                             llvm::opt::ArgStringList &CC1Args,
-                             Action::OffloadKind DeviceOffloadKind) const override;
+  void
+  addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                        llvm::opt::ArgStringList &CC1Args,
+                        Action::OffloadKind DeviceOffloadKind) const override;
 
   llvm::DenormalMode getDefaultDenormalModeForType(
       const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
       const llvm::fltSemantics *FPType = nullptr) const override;
 
-  // Never try to use the integrated assembler with CUDA; always fork out to
-  // ptxas.
-  bool useIntegratedAs() const override { return false; }
-  bool isCrossCompiling() const override { return true; }
-  bool isPICDefault() const override { return false; }
-  bool isPIEDefault(const llvm::opt::ArgList &Args) const override {
-    return false;
-  }
-  bool isPICDefaultForced() const override { return false; }
-  bool SupportsProfiling() const override { return false; }
-  bool supportsDebugInfoOption(const llvm::opt::Arg *A) const override;
-  void adjustDebugInfoKind(codegenoptions::DebugInfoKind &DebugInfoKind,
-                           const llvm::opt::ArgList &Args) const override;
-  bool IsMathErrnoDefault() const override { return false; }
-
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
 
@@ -176,12 +215,7 @@
   computeMSVCVersion(const Driver *D,
                      const llvm::opt::ArgList &Args) const override;
 
-  unsigned GetDefaultDwarfVersion() const override { return 2; }
-  // NVPTX supports only DWARF2.
-  unsigned getMaxDwarfVersion() const override { return 2; }
-
   const ToolChain &HostTC;
-  CudaInstallationDetector CudaInstallation;
 
 protected:
   Tool *buildAssembler() const override;  // ptxas
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -381,18 +381,20 @@
                                     const ArgList &Args,
                                     const char *LinkingOutput) const {
   const auto &TC =
-      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+      static_cast<const toolchains::NVPTXToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
 
   StringRef GPUArchName;
-  // If this is an OpenMP action we need to extract the device architecture
-  // from the -march=arch option. This option may come from -Xopenmp-target
-  // flag or the default value.
-  if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
+  // If this is a CUDA action we need to extract the device architecture
+  // from the job's associated architecture; otherwise we use the -march=arch
+  // option, which may come from the -Xopenmp-target flag or the default
+  // value.
+  if (JA.isDeviceOffloading(Action::OFK_Cuda)) {
+    GPUArchName = JA.getOffloadingArch();
+  } else {
     GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
     assert(!GPUArchName.empty() && "Must have an architecture passed in.");
-  } else
-    GPUArchName = JA.getOffloadingArch();
+  }
 
   // Obtain architecture from the action.
   CudaArch gpu_arch = StringToCudaArch(GPUArchName);
@@ -463,13 +465,14 @@
   for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
     CmdArgs.push_back(Args.MakeArgString(A));
 
-  bool Relocatable = false;
+  bool Relocatable = true;
   if (JA.isOffloading(Action::OFK_OpenMP))
     // In OpenMP we need to generate relocatable code.
     Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
                                options::OPT_fnoopenmp_relocatable_target,
                                /*Default=*/true);
   else if (JA.isOffloading(Action::OFK_Cuda))
+    // In CUDA we generate non-relocatable code by default.
     Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
                                options::OPT_fno_gpu_rdc, /*Default=*/false);
 
@@ -507,11 +510,11 @@
 // All inputs to this linker must be from CudaDeviceActions, as we need to look
 // at the Inputs' Actions in order to figure out which GPU architecture they
 // correspond to.
-void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
-                                 const InputInfo &Output,
-                                 const InputInfoList &Inputs,
-                                 const ArgList &Args,
-                                 const char *LinkingOutput) const {
+void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
   const auto &TC =
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
@@ -558,6 +561,82 @@
       Exec, CmdArgs, Inputs, Output));
 }
 
+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const ArgList &Args,
+                                 const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::NVPTXToolChain &>(getToolChain());
+  assert(TC.getTriple().isNVPTX() && "Wrong platform");
+
+  ArgStringList CmdArgs;
+  if (Output.isFilename()) {
+    CmdArgs.push_back("-o");
+    CmdArgs.push_back(Output.getFilename());
+  } else {
+    assert(Output.isNothing() && "Invalid output.");
+  }
+
+  if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)
+    CmdArgs.push_back("-g");
+
+  if (Args.hasArg(options::OPT_v))
+    CmdArgs.push_back("-v");
+
+  StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);
+  assert(!GPUArch.empty() && "At least one GPU Arch required for nvlink.");
+
+  CmdArgs.push_back("-arch");
+  CmdArgs.push_back(Args.MakeArgString(GPUArch));
+
+  // Add paths specified in LIBRARY_PATH environment variable as -L options.
+  addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
+
+  // Add paths for the default clang library path.
+  SmallString<256> DefaultLibPath =
+      llvm::sys::path::parent_path(TC.getDriver().Dir);
+  llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
+  CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
+
+  for (const auto &II : Inputs) {
+    if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR ||
+        II.getType() == types::TY_LTO_BC || II.getType() == types::TY_LLVM_BC) {
+      C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
+          << getToolChain().getTripleString();
+      continue;
+    }
+
+    // Currently, we only pass the input files to the linker, we do not pass
+    // any libraries that may be valid only for the host.
+    if (!II.isFilename())
+      continue;
+
+    // The 'nvlink' application performs RDC-mode linking when given a '.o'
+    // file and device linking when given a '.cubin' file. We always want to
+    // perform device linking, so just rename any '.o' files.
+    auto InputFile = getToolChain().getInputFilename(II);
+    if (llvm::sys::path::extension(InputFile) != ".cubin") {
+      const char *CubinF =
+          Args.MakeArgString(getToolChain().getDriver().GetTemporaryPath(
+              llvm::sys::path::stem(InputFile), "cubin"));
+      llvm::sys::fs::copy_file(InputFile, C.addTempFile(CubinF));
+
+      CmdArgs.push_back(CubinF);
+    } else {
+      CmdArgs.push_back(Args.MakeArgString(InputFile));
+    }
+  }
+
+  const char *Exec =
+      Args.MakeArgString(getToolChain().GetProgramPath("nvlink"));
+  C.addCommand(std::make_unique<Command>(
+      JA, *this,
+      ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
+                          "--options-file"},
+      Exec, CmdArgs, Inputs, Output));
+}
+
 void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
                                    const llvm::opt::ArgList &Args,
                                    std::vector<StringRef> &Features) {
@@ -600,14 +679,13 @@
   Features.push_back(PtxFeature);
 }
 
-/// CUDA toolchain.  Our assembler is ptxas, and our "linker" is fatbinary,
-/// which isn't properly a linker but nonetheless performs the step of stitching
-/// together object files from the assembler into a single blob.
-
-CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
-                             const ToolChain &HostTC, const ArgList &Args)
-    : ToolChain(D, Triple, Args), HostTC(HostTC),
-      CudaInstallation(D, HostTC.getTriple(), Args) {
+/// NVPTX toolchain. Our assembler is ptxas, and our linker is nvlink. This
+/// operates as a stand-alone version of the NVPTX tools without the host
+/// toolchain.
+NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                               const llvm::Triple &HostTriple,
+                               const ArgList &Args)
+    : ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args) {
   if (CudaInstallation.isValid()) {
     CudaInstallation.WarnIfUnsupportedVersion();
     getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
@@ -617,22 +695,72 @@
   getProgramPaths().push_back(getDriver().Dir);
 }
 
-std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
-  // Only object files are changed, for example assembly files keep their .s
-  // extensions. If the user requested device-only compilation don't change it.
-  if (Input.getType() != types::TY_Object || getDriver().offloadDeviceOnly())
-    return ToolChain::getInputFilename(Input);
+/// We only need the host triple to locate the CUDA binary utilities; if one
+/// is not provided, use the system's default triple.
+NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                               const ArgList &Args)
+    : NVPTXToolChain(D, Triple,
+                     llvm::Triple(llvm::sys::getDefaultTargetTriple()), Args) {}
+
+llvm::opt::DerivedArgList *
+NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                              StringRef BoundArch,
+                              Action::OffloadKind DeviceOffloadKind) const {
+  DerivedArgList *DAL =
+      ToolChain::TranslateArgs(Args, BoundArch, DeviceOffloadKind);
+  if (!DAL)
+    DAL = new DerivedArgList(Args.getBaseArgs());
+
+  const OptTable &Opts = getDriver().getOpts();
+
+  for (Arg *A : Args)
+    if (!llvm::is_contained(*DAL, A))
+      DAL->append(A);
+
+  if (!DAL->hasArg(options::OPT_march_EQ))
+    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
+                      CudaArchToString(CudaArch::CudaDefault));
+
+  return DAL;
+}
+
+bool NVPTXToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
+  const Option &O = A->getOption();
+  return (O.matches(options::OPT_gN_Group) &&
+          !O.matches(options::OPT_gmodules)) ||
+         O.matches(options::OPT_g_Flag) ||
+         O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||
+         O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||
+         O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||
+         O.matches(options::OPT_gdwarf_5) ||
+         O.matches(options::OPT_gcolumn_info);
+}
 
-  // Replace extension for object files with cubin because nvlink relies on
-  // these particular file names.
-  SmallString<256> Filename(ToolChain::getInputFilename(Input));
-  llvm::sys::path::replace_extension(Filename, "cubin");
-  return std::string(Filename.str());
+void NVPTXToolChain::adjustDebugInfoKind(
+    codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const {
+  switch (mustEmitDebugInfo(Args)) {
+  case DisableDebugInfo:
+    DebugInfoKind = codegenoptions::NoDebugInfo;
+    break;
+  case DebugDirectivesOnly:
+    DebugInfoKind = codegenoptions::DebugDirectivesOnly;
+    break;
+  case EmitSameDebugInfoAsHost:
+    // Use same debug info level as the host.
+    break;
+  }
 }
 
+/// CUDA toolchain.  Our assembler is ptxas, and our "linker" is fatbinary,
+/// which isn't properly a linker but nonetheless performs the step of stitching
+/// together object files from the assembler into a single blob.
+
+CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+                             const ToolChain &HostTC, const ArgList &Args)
+    : NVPTXToolChain(D, Triple, HostTC.getTriple(), Args), HostTC(HostTC) {}
+
 void CudaToolChain::addClangTargetOptions(
-    const llvm::opt::ArgList &DriverArgs,
-    llvm::opt::ArgStringList &CC1Args,
+    const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
     Action::OffloadKind DeviceOffloadingKind) const {
   HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
 
@@ -709,33 +837,6 @@
   return llvm::DenormalMode::getIEEE();
 }
 
-bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
-  const Option &O = A->getOption();
-  return (O.matches(options::OPT_gN_Group) &&
-          !O.matches(options::OPT_gmodules)) ||
-         O.matches(options::OPT_g_Flag) ||
-         O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||
-         O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||
-         O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||
-         O.matches(options::OPT_gdwarf_5) ||
-         O.matches(options::OPT_gcolumn_info);
-}
-
-void CudaToolChain::adjustDebugInfoKind(
-    codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const {
-  switch (mustEmitDebugInfo(Args)) {
-  case DisableDebugInfo:
-    DebugInfoKind = codegenoptions::NoDebugInfo;
-    break;
-  case DebugDirectivesOnly:
-    DebugInfoKind = codegenoptions::DebugDirectivesOnly;
-    break;
-  case EmitSameDebugInfoAsHost:
-    // Use same debug info level as the host.
-    break;
-  }
-}
-
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                        ArgStringList &CC1Args) const {
   // Check our CUDA version if we're going to include the CUDA headers.
@@ -786,12 +887,20 @@
   return DAL;
 }
 
+Tool *NVPTXToolChain::buildAssembler() const {
+  return new tools::NVPTX::Assembler(*this);
+}
+
+Tool *NVPTXToolChain::buildLinker() const {
+  return new tools::NVPTX::Linker(*this);
+}
+
 Tool *CudaToolChain::buildAssembler() const {
   return new tools::NVPTX::Assembler(*this);
 }
 
 Tool *CudaToolChain::buildLinker() const {
-  return new tools::NVPTX::Linker(*this);
+  return new tools::NVPTX::FatBinary(*this);
 }
 
 void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -5926,6 +5926,9 @@
     case llvm::Triple::Solaris:
       TC = std::make_unique<toolchains::Solaris>(*this, Target, Args);
       break;
+    case llvm::Triple::CUDA:
+      TC = std::make_unique<toolchains::NVPTXToolChain>(*this, Target, Args);
+      break;
     case llvm::Triple::AMDHSA:
       TC = std::make_unique<toolchains::ROCMToolChain>(*this, Target, Args);
       break;
@@ -6045,11 +6048,6 @@
     }
   }
 
-  // Intentionally omitted from the switch above: llvm::Triple::CUDA.  CUDA
-  // compiles always need two toolchains, the CUDA toolchain and the host
-  // toolchain.  So the only valid way to create a CUDA toolchain is via
-  // CreateOffloadingDeviceToolChains.
-
   return *TC;
 }
 