https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/96561
>From 6c70e542bbb355160b833ede6f86be0366953b88 Mon Sep 17 00:00:00 2001 From: Joseph Huber <hube...@outlook.com> Date: Mon, 24 Jun 2024 15:14:52 -0500 Subject: [PATCH] [Clang] Introduce 'clang-nvlink-wrapper' to work around 'nvlink' Summary: The `clang-nvlink-wrapper` is a utility that I removed a while back during the transition to the new driver. This patch adds back in a new, upgraded version that does LTO + archive linking. It's not an easy choice to reintroduce something I happily deleted, but this is the only way to move forward with improving GPU support in LLVM. While NVIDIA provides a linker called 'nvlink', its main interface is very difficult to work with. It does not provide LTO or static linking, requires all files to be named with a non-standard `.cubin` extension, and rejects link jobs that other linkers would be fine with (i.e. empty). I have spent a great deal of time hacking around this in the GPU `libc` implementation, where I deliberately avoid LTO and static linking and have about 100 lines of hacky CMake dedicated to storing these files in a format that the clang-linker-wrapper accepts to avoid this limitation. The main reason I want to re-introduce this tool is because I am planning on creating a more standard C/C++ toolchain for GPUs to use. This will install files like the following. ``` <install>/lib/nvptx64-nvidia-cuda/libc.a <install>/lib/nvptx64-nvidia-cuda/libc++.a <install>/lib/nvptx64-nvidia-cuda/libomp.a <install>/lib/clang/19/lib/nvptx64-nvidia-cuda/libclang_rt.builtins.a ``` Linking in these libraries will then simply require passing `-lc` as is already done for non-GPU toolchains. However, this doesn't work with the currently deficient `nvlink` linker, so I consider this a blocking issue to massively improving the state of building GPU libraries. 
In the future we may be able to convince NVIDIA to port their linker to `ld.lld`, but for now this is the only workable solution that allows us to hack around the weird behavior of their closed-source software. --- clang/lib/Driver/ToolChains/Cuda.cpp | 61 +- clang/lib/Driver/ToolChains/Cuda.h | 3 + clang/test/Driver/cuda-cross-compiling.c | 8 +- clang/test/Driver/nvlink-wrapper.c | 65 ++ clang/test/lit.cfg.py | 1 + clang/tools/CMakeLists.txt | 1 + .../tools/clang-nvlink-wrapper/CMakeLists.txt | 44 ++ .../ClangNVLinkWrapper.cpp | 671 ++++++++++++++++++ .../tools/clang-nvlink-wrapper/NVLinkOpts.td | 68 ++ 9 files changed, 865 insertions(+), 57 deletions(-) create mode 100644 clang/test/Driver/nvlink-wrapper.c create mode 100644 clang/tools/clang-nvlink-wrapper/CMakeLists.txt create mode 100644 clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp create mode 100644 clang/tools/clang-nvlink-wrapper/NVLinkOpts.td diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 2dfc7457b0ac7..54724cc1ad08e 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -461,13 +461,6 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("--output-file"); std::string OutputFileName = TC.getInputFilename(Output); - // If we are invoking `nvlink` internally we need to output a `.cubin` file. - // FIXME: This should hopefully be removed if NVIDIA updates their tooling. - if (!C.getInputArgs().getLastArg(options::OPT_c)) { - SmallString<256> Filename(Output.getFilename()); - llvm::sys::path::replace_extension(Filename, "cubin"); - OutputFileName = Filename.str(); - } if (Output.isFilename() && OutputFileName != Output.getFilename()) C.addTempFile(Args.MakeArgString(OutputFileName)); @@ -618,6 +611,11 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, // Add standard library search paths passed on the command line. 
Args.AddAllArgs(CmdArgs, options::OPT_L); getToolChain().AddFilePathLibArgs(Args, CmdArgs); + AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); + + if (C.getDriver().isUsingLTO()) + addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], + C.getDriver().getLTOMode() == LTOK_Thin); // Add paths for the default clang library path. SmallString<256> DefaultLibPath = @@ -625,51 +623,12 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME); CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath)); - for (const auto &II : Inputs) { - if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR || - II.getType() == types::TY_LTO_BC || II.getType() == types::TY_LLVM_BC) { - C.getDriver().Diag(diag::err_drv_no_linker_llvm_support) - << getToolChain().getTripleString(); - continue; - } - - // The 'nvlink' application performs RDC-mode linking when given a '.o' - // file and device linking when given a '.cubin' file. We always want to - // perform device linking, so just rename any '.o' files. - // FIXME: This should hopefully be removed if NVIDIA updates their tooling. - if (II.isFilename()) { - auto InputFile = getToolChain().getInputFilename(II); - if (llvm::sys::path::extension(InputFile) != ".cubin") { - // If there are no actions above this one then this is direct input and - // we can copy it. Otherwise the input is internal so a `.cubin` file - // should exist. 
- if (II.getAction() && II.getAction()->getInputs().size() == 0) { - const char *CubinF = - Args.MakeArgString(getToolChain().getDriver().GetTemporaryPath( - llvm::sys::path::stem(InputFile), "cubin")); - if (llvm::sys::fs::copy_file(InputFile, C.addTempFile(CubinF))) - continue; - - CmdArgs.push_back(CubinF); - } else { - SmallString<256> Filename(InputFile); - llvm::sys::path::replace_extension(Filename, "cubin"); - CmdArgs.push_back(Args.MakeArgString(Filename)); - } - } else { - CmdArgs.push_back(Args.MakeArgString(InputFile)); - } - } else if (!II.isNothing()) { - II.getInputArg().renderAsInput(Args, CmdArgs); - } - } - C.addCommand(std::make_unique<Command>( JA, *this, ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8, "--options-file"}, - Args.MakeArgString(getToolChain().GetProgramPath("nvlink")), CmdArgs, - Inputs, Output)); + Args.MakeArgString(getToolChain().GetProgramPath("clang-nvlink-wrapper")), + CmdArgs, Inputs, Output)); } void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple, @@ -949,11 +908,7 @@ std::string CudaToolChain::getInputFilename(const InputInfo &Input) const { if (Input.getType() != types::TY_Object || getDriver().offloadDeviceOnly()) return ToolChain::getInputFilename(Input); - // Replace extension for object files with cubin because nvlink relies on - // these particular file names. 
- SmallString<256> Filename(ToolChain::getInputFilename(Input)); - llvm::sys::path::replace_extension(Filename, "cubin"); - return std::string(Filename); + return ToolChain::getInputFilename(Input); } llvm::opt::DerivedArgList * diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h index 43c17ba7c0ba0..0735c36f116bc 100644 --- a/clang/lib/Driver/ToolChains/Cuda.h +++ b/clang/lib/Driver/ToolChains/Cuda.h @@ -155,6 +155,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain { bool isPIEDefault(const llvm::opt::ArgList &Args) const override { return false; } + bool HasNativeLLVMSupport() const override { return true; } bool isPICDefaultForced() const override { return false; } bool SupportsProfiling() const override { return false; } @@ -192,6 +193,8 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain { return &HostTC.getTriple(); } + bool HasNativeLLVMSupport() const override { return false; } + std::string getInputFilename(const InputInfo &Input) const override; llvm::opt::DerivedArgList * diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c index 1dc4520f485db..9a1a68f2aad5d 100644 --- a/clang/test/Driver/cuda-cross-compiling.c +++ b/clang/test/Driver/cuda-cross-compiling.c @@ -32,8 +32,8 @@ // RUN: | FileCheck -check-prefix=ARGS %s // ARGS: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_61" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s" -// ARGS-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_61" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c" -// ARGS-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "[[CUBIN]].cubin" +// ARGS-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_61" "--output-file" "[[CUBIN:.+]].o" "[[PTX]].s" "-c" +// ARGS-NEXT: clang-nvlink-wrapper{{.*}}"-o" "a.out" "-arch" "sm_61"{{.*}}"[[CUBIN]].o" // // Test the generated arguments to the CUDA binary utils when targeting NVPTX. 
@@ -55,7 +55,7 @@ // RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -### %t.o 2>&1 \ // RUN: | FileCheck -check-prefix=LINK %s -// LINK: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "{{.*}}.cubin" +// LINK: clang-nvlink-wrapper{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "{{.*}}.o" // // Test to ensure that we enable handling global constructors in a freestanding @@ -72,7 +72,7 @@ // RUN: %clang -target nvptx64-nvidia-cuda -Wl,-v -Wl,a,b -march=sm_52 -### %s 2>&1 \ // RUN: | FileCheck -check-prefix=LINKER-ARGS %s -// LINKER-ARGS: nvlink{{.*}}"-v"{{.*}}"a" "b" +// LINKER-ARGS: clang-nvlink-wrapper{{.*}}"-v"{{.*}}"a" "b" // Tests for handling a missing architecture. // diff --git a/clang/test/Driver/nvlink-wrapper.c b/clang/test/Driver/nvlink-wrapper.c new file mode 100644 index 0000000000000..fdda93f1f9cdc --- /dev/null +++ b/clang/test/Driver/nvlink-wrapper.c @@ -0,0 +1,65 @@ +// REQUIRES: x86-registered-target +// REQUIRES: nvptx-registered-target + +#if defined(X) +extern int y; +int foo() { return y; } + +int x = 0; +#elif defined(Y) +int y = 42; +#elif defined(Z) +int z = 42; +#elif defined(W) +int w = 42; +#elif defined(U) +extern int x; +extern int __attribute__((weak)) w; + +int bar() { + return x + w; +} +#else +extern int y; +int __attribute__((visibility("hidden"))) x = 999; +int baz() { return y + x; } +#endif + +// Create various inputs to test basic linking and LTO capabilities. Creating a +// CUDA binary requires access to the `ptxas` executable, so we just use x64. 
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DX -o %t-x.o +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DY -o %t-y.o +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DZ -o %t-z.o +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DW -o %t-w.o +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DU -o %t-u.o +// RUN: llvm-ar rcs %t-x.a %t-x.o +// RUN: llvm-ar rcs %t-y.a %t-y.o +// RUN: llvm-ar rcs %t-z.a %t-z.o +// RUN: llvm-ar rcs %t-w.a %t-w.o + +// +// Check that we forward any unrecognized argument to 'nvlink'. +// +// RUN: clang-nvlink-wrapper --dry-run -arch sm_52 %t-u.o -foo -o a.out 2>&1 \ +// RUN: | FileCheck %s --check-prefix=ARGS +// ARGS: nvlink{{.*}} -arch sm_52 -foo -o a.out [[INPUT:.+]].cubin + +// +// Check the symbol resolution for static archives. We expect to only link +// `libx.a` and `liby.a` because extern weak symbols do not extract and `libz.a` +// is not used at all. +// +// RUN: clang-nvlink-wrapper --dry-run %t-x.a %t-u.o %t-y.a %t-z.a %t-w.a \ +// RUN: -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=LINK +// LINK: nvlink{{.*}} -arch sm_52 -o a.out [[INPUT:.+]].cubin {{.*}}-x-{{.*}}.cubin{{.*}}-y-{{.*}}.cubin + +// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.o + +// +// Check that the LTO interface works and properly preserves symbols used in a +// regular object file. 
+// +// RUN: clang-nvlink-wrapper --dry-run %t.o %t-u.o %t-y.a \ +// RUN: -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=LTO +// LTO: ptxas{{.*}} -m64 -c [[PTX:.+]].s -O3 -arch sm_52 -o [[CUBIN:.+]].cubin +// LTO: nvlink{{.*}} -arch sm_52 -o a.out [[CUBIN]].cubin {{.*}}-u-{{.*}}.cubin {{.*}}-y-{{.*}}.cubin diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index e5630a07424c7..92a3361ce672e 100644 --- a/clang/test/lit.cfg.py +++ b/clang/test/lit.cfg.py @@ -95,6 +95,7 @@ "llvm-ifs", "yaml2obj", "clang-linker-wrapper", + "clang-nvlink-wrapper", "llvm-lto", "llvm-lto2", "llvm-profdata", diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt index bdd8004be3e02..4885afc1584d0 100644 --- a/clang/tools/CMakeLists.txt +++ b/clang/tools/CMakeLists.txt @@ -9,6 +9,7 @@ add_clang_subdirectory(clang-format-vs) add_clang_subdirectory(clang-fuzzer) add_clang_subdirectory(clang-import-test) add_clang_subdirectory(clang-linker-wrapper) +add_clang_subdirectory(clang-nvlink-wrapper) add_clang_subdirectory(clang-offload-packager) add_clang_subdirectory(clang-offload-bundler) add_clang_subdirectory(clang-scan-deps) diff --git a/clang/tools/clang-nvlink-wrapper/CMakeLists.txt b/clang/tools/clang-nvlink-wrapper/CMakeLists.txt new file mode 100644 index 0000000000000..d46f66994cf39 --- /dev/null +++ b/clang/tools/clang-nvlink-wrapper/CMakeLists.txt @@ -0,0 +1,44 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + BitWriter + Core + BinaryFormat + MC + Target + TransformUtils + Analysis + Passes + IRReader + Object + Option + Support + TargetParser + CodeGen + LTO + ) + +set(LLVM_TARGET_DEFINITIONS NVLinkOpts.td) +tablegen(LLVM NVLinkOpts.inc -gen-opt-parser-defs) +add_public_tablegen_target(NVLinkWrapperOpts) + +if(NOT CLANG_BUILT_STANDALONE) + set(tablegen_deps intrinsics_gen NVLinkWrapperOpts) +endif() + +add_clang_tool(clang-nvlink-wrapper + ClangNVLinkWrapper.cpp + + DEPENDS + ${tablegen_deps} + ) + +set(CLANG_NVLINK_WRAPPER_LIB_DEPS + clangBasic 
+ ) + +target_compile_options(clang-nvlink-wrapper PRIVATE "-g" "-O0") + +target_link_libraries(clang-nvlink-wrapper + PRIVATE + ${CLANG_NVLINK_WRAPPER_LIB_DEPS} + ) diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp new file mode 100644 index 0000000000000..e94d2a9016954 --- /dev/null +++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp @@ -0,0 +1,671 @@ +//===-- clang-nvlink-wrapper/ClangNVLinkWrapper.cpp - NVIDIA linker util --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// This tool wraps around the NVIDIA linker called 'nvlink'. The NVIDIA linker +// is required to create NVPTX applications, but does not support common +// features like LTO or archives. This utility wraps around the tool to cover +// its deficiencies. This tool can be removed once NVIDIA improves their linker +// or ports it to `ld.lld`. 
+// +//===---------------------------------------------------------------------===// + +#include "clang/Basic/Version.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/BinaryFormat/Magic.h" +#include "llvm/CodeGen/CommandFlags.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/LTO/LTO.h" +#include "llvm/Object/Archive.h" +#include "llvm/Object/ArchiveWriter.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/IRObjectFile.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Object/OffloadBinary.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/OptTable.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/WithColor.h" + +using namespace llvm; +using namespace llvm::opt; +using namespace llvm::object; + +static void printVersion(raw_ostream &OS) { + OS << clang::getClangToolFullVersion("clang-nvlink-wrapper") << '\n'; +} + +/// The value of `argv[0]` when run. +static const char *Executable; + +/// Temporary files to be cleaned up. +static SmallVector<SmallString<128>> TempFiles; + +/// Codegen flags for LTO backend. +static codegen::RegisterCodeGenFlags CodeGenFlags; + +namespace { +/// Must not overlap with llvm::opt::DriverFlag. +enum WrapperFlags { + WrapperOnlyOption = (1 << 4), // Options only used by the linker wrapper. + DeviceOnlyOption = (1 << 5), // Options only used for device linking. +}; + +enum ID { + OPT_INVALID = 0, // This is not an option ID. +#define OPTION(...) 
LLVM_MAKE_OPT_ID(__VA_ARGS__), +#include "NVLinkOpts.inc" + LastOption +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) \ + static constexpr StringLiteral NAME##_init[] = VALUE; \ + static constexpr ArrayRef<StringLiteral> NAME(NAME##_init, \ + std::size(NAME##_init) - 1); +#include "NVLinkOpts.inc" +#undef PREFIX + +static constexpr OptTable::Info InfoTable[] = { +#define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__), +#include "NVLinkOpts.inc" +#undef OPTION +}; + +class WrapperOptTable : public opt::GenericOptTable { +public: + WrapperOptTable() : opt::GenericOptTable(InfoTable) {} +}; + +const OptTable &getOptTable() { + static const WrapperOptTable *Table = []() { + auto Result = std::make_unique<WrapperOptTable>(); + return Result.release(); + }(); + return *Table; +} + +[[noreturn]] void reportError(Error E) { + outs().flush(); + logAllUnhandledErrors(std::move(E), WithColor::error(errs(), Executable)); + exit(EXIT_FAILURE); +} + +void diagnosticHandler(const DiagnosticInfo &DI) { + std::string ErrStorage; + raw_string_ostream OS(ErrStorage); + DiagnosticPrinterRawOStream DP(OS); + DI.print(DP); + + switch (DI.getSeverity()) { + case DS_Error: + WithColor::error(errs(), Executable) << ErrStorage << "\n"; + break; + case DS_Warning: + WithColor::warning(errs(), Executable) << ErrStorage << "\n"; + break; + case DS_Note: + WithColor::note(errs(), Executable) << ErrStorage << "\n"; + break; + case DS_Remark: + WithColor::remark(errs()) << ErrStorage << "\n"; + break; + } +} + +Expected<StringRef> createTempFile(const ArgList &Args, const Twine &Prefix, + StringRef Extension) { + SmallString<128> OutputFile; + if (Args.hasArg(OPT_save_temps)) { + (Prefix + "." 
+ Extension).toNullTerminatedStringRef(OutputFile); + } else { + if (std::error_code EC = + sys::fs::createTemporaryFile(Prefix, Extension, OutputFile)) + return createFileError(OutputFile, EC); + } + + TempFiles.emplace_back(std::move(OutputFile)); + return TempFiles.back(); +} + +Expected<std::string> findProgram(StringRef Name, ArrayRef<StringRef> Paths) { + ErrorOr<std::string> Path = sys::findProgramByName(Name, Paths); + if (!Path) + Path = sys::findProgramByName(Name); + if (!Path) + return createStringError(Path.getError(), + "Unable to find '" + Name + "' in path"); + return *Path; +} + +std::optional<std::string> findFile(StringRef Dir, StringRef Root, + const Twine &Name) { + SmallString<128> Path; + if (Dir.starts_with("=")) + sys::path::append(Path, Root, Dir.substr(1), Name); + else + sys::path::append(Path, Dir, Name); + + if (sys::fs::exists(Path)) + return static_cast<std::string>(Path); + return std::nullopt; +} + +std::optional<std::string> +findFromSearchPaths(StringRef Name, StringRef Root, + ArrayRef<StringRef> SearchPaths) { + for (StringRef Dir : SearchPaths) + if (std::optional<std::string> File = findFile(Dir, Root, Name)) + return File; + return std::nullopt; +} + +std::optional<std::string> +searchLibraryBaseName(StringRef Name, StringRef Root, + ArrayRef<StringRef> SearchPaths) { + for (StringRef Dir : SearchPaths) + if (std::optional<std::string> File = + findFile(Dir, Root, "lib" + Name + ".a")) + return File; + return std::nullopt; +} + +/// Search for static libraries in the linker's library path given input like +/// `-lfoo` or `-l:libfoo.a`. 
+std::optional<std::string> searchLibrary(StringRef Input, StringRef Root, + ArrayRef<StringRef> SearchPaths) { + if (Input.starts_with(":") || Input.ends_with(".lib")) + return findFromSearchPaths(Input.drop_front(), Root, SearchPaths); + return searchLibraryBaseName(Input, Root, SearchPaths); +} + +void printCommands(ArrayRef<StringRef> CmdArgs) { + if (CmdArgs.empty()) + return; + + llvm::errs() << " \"" << CmdArgs.front() << "\" "; + for (auto IC = std::next(CmdArgs.begin()), IE = CmdArgs.end(); IC != IE; ++IC) + llvm::errs() << *IC << (std::next(IC) != IE ? " " : "\n"); +} + +/// A minimum symbol interface that provides the necessary information to +/// extract archive members and resolve LTO symbols. +struct Symbol { + enum Flags { + None = 0, + Undefined = 1 << 0, + Weak = 1 << 1, + }; + + Symbol() + : File(), Flags(Undefined), Name(), UsedInRegularObj(false), Lazy(false) { + } + + Symbol(MemoryBufferRef File, const irsymtab::Reader::SymbolRef Sym, bool Lazy) + : File(File), Flags(0), UsedInRegularObj(false), Lazy(Lazy) { + if (Sym.isUndefined()) + Flags |= Undefined; + if (Sym.isWeak()) + Flags |= Weak; + Name = Sym.getName(); + } + + Symbol(MemoryBufferRef File, const SymbolRef Sym, bool Lazy) + : File(File), Flags(0), UsedInRegularObj(false), Lazy(Lazy) { + auto FlagsOrErr = Sym.getFlags(); + if (!FlagsOrErr) + reportError(FlagsOrErr.takeError()); + if (*FlagsOrErr & SymbolRef::SF_Undefined) + Flags |= Undefined; + if (*FlagsOrErr & SymbolRef::SF_Weak) + Flags |= Weak; + + auto NameOrErr = Sym.getName(); + if (!NameOrErr) + reportError(NameOrErr.takeError()); + Name = *NameOrErr; + } + + Symbol Resolve(Symbol Other) { + if (File.getBuffer().empty()) + return Other.Lazy ? 
*this : Other; + if (Other.isUndefined()) + return *this; + if (isWeak() && isUndefined() && Other.Lazy) + return *this; + if (isWeak() && !Other.isWeak()) + return Other; + if (isUndefined() && !Other.isUndefined()) + return Other; + return *this; + } + + bool isWeak() const { return Flags & Weak; } + bool isUndefined() const { return Flags & Undefined; } + + MemoryBufferRef File; + uint32_t Flags; + StringRef Name; + bool UsedInRegularObj; + bool Lazy; +}; + +Expected<StringRef> runPTXAs(StringRef File, const ArgList &Args) { + std::string CudaPath = Args.getLastArgValue(OPT_cuda_path_EQ).str(); + Expected<std::string> PTXAsPath = findProgram("ptxas", {CudaPath + "/bin"}); + if (!PTXAsPath) + return PTXAsPath.takeError(); + + auto TempFileOrErr = createTempFile( + Args, sys::path::stem(Args.getLastArgValue(OPT_o, "a.out")), "cubin"); + if (!TempFileOrErr) + return TempFileOrErr.takeError(); + + SmallVector<StringRef> AssemblerArgs({*PTXAsPath, "-m64", "-c", File}); + if (Args.hasArg(OPT_g)) + AssemblerArgs.push_back("-O0"); + else + AssemblerArgs.push_back( + Args.MakeArgString("-O" + Args.getLastArgValue(OPT_O, "3"))); + AssemblerArgs.append({"-arch", Args.getLastArgValue(OPT_arch)}); + AssemblerArgs.append({"-o", *TempFileOrErr}); + + if (Args.hasArg(OPT_dry_run)) + printCommands(AssemblerArgs); + else if (sys::ExecuteAndWait(*PTXAsPath, AssemblerArgs)) + return createStringError("'" + sys::path::filename(*PTXAsPath) + "'" + + " failed"); + return Args.MakeArgString(*TempFileOrErr); +} + +Expected<std::unique_ptr<lto::LTO>> createLTO(const ArgList &Args) { + const llvm::Triple Triple("nvptx64-nvidia-cuda"); + // We need to remove AMD's target-id from the processor if present. 
+ lto::Config Conf; + lto::ThinBackend Backend; + Backend = + lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency()); + + Conf.CPU = Args.getLastArgValue(OPT_arch); + Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple); + + Conf.MAttrs = {Args.getLastArgValue(OPT_feature, "+ptx60").str()}; + std::optional<CodeGenOptLevel> CGOptLevelOrNone = + CodeGenOpt::parseLevel(Args.getLastArgValue(OPT_O, "2")[0]); + assert(CGOptLevelOrNone && "Invalid optimization level"); + Conf.CGOptLevel = *CGOptLevelOrNone; + Conf.OptLevel = Args.getLastArgValue(OPT_O, "2")[0] - '0'; + Conf.DefaultTriple = Triple.getTriple(); + + Conf.DiagHandler = diagnosticHandler; + Conf.CGFileType = CodeGenFileType::AssemblyFile; + + if (Args.hasArg(OPT_save_temps)) + if (Error Err = Conf.addSaveTemps( + (Args.getLastArgValue(OPT_o, "a.out") + ".").str())) + return Err; + + return std::make_unique<lto::LTO>(std::move(Conf), Backend); +} + +Expected<SmallVector<StringRef>> getInput(const ArgList &Args) { + SmallVector<StringRef> LibraryPaths; + for (const opt::Arg *Arg : Args.filtered(OPT_library_path)) + LibraryPaths.push_back(Arg->getValue()); + + bool WholeArchive = false; + SmallVector<std::pair<std::unique_ptr<MemoryBuffer>, bool>> InputFiles; + for (const opt::Arg *Arg : Args.filtered( + OPT_INPUT, OPT_library, OPT_whole_archive, OPT_no_whole_archive)) { + if (Arg->getOption().matches(OPT_whole_archive) || + Arg->getOption().matches(OPT_no_whole_archive)) { + WholeArchive = Arg->getOption().matches(OPT_whole_archive); + continue; + } + + std::optional<std::string> Filename = + Arg->getOption().matches(OPT_library) + ? 
searchLibrary(Arg->getValue(), /*Root=*/"", LibraryPaths) + : std::string(Arg->getValue()); + + if (!Filename && Arg->getOption().matches(OPT_library)) + return createStringError("unable to find library -l%s", Arg->getValue()); + + if (!Filename || !sys::fs::exists(*Filename) || + sys::fs::is_directory(*Filename)) + continue; + + ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = + MemoryBuffer::getFileOrSTDIN(*Filename); + if (std::error_code EC = BufferOrErr.getError()) + return createFileError(*Filename, EC); + + MemoryBufferRef Buffer = **BufferOrErr; + switch (identify_magic(Buffer.getBuffer())) { + case file_magic::bitcode: + case file_magic::elf_relocatable: + InputFiles.emplace_back(std::move(*BufferOrErr), /*IsLazy=*/false); + break; + case file_magic::archive: { + Expected<std::unique_ptr<llvm::object::Archive>> LibFile = + object::Archive::create(Buffer); + if (!LibFile) + return LibFile.takeError(); + Error Err = Error::success(); + for (auto Child : (*LibFile)->children(Err)) { + auto ChildBufferOrErr = Child.getMemoryBufferRef(); + if (!ChildBufferOrErr) + return ChildBufferOrErr.takeError(); + std::unique_ptr<MemoryBuffer> ChildBuffer = + MemoryBuffer::getMemBufferCopy( + ChildBufferOrErr->getBuffer(), + ChildBufferOrErr->getBufferIdentifier()); + InputFiles.emplace_back(std::move(ChildBuffer), !WholeArchive); + } + if (Err) + return Err; + break; + } + default: + return createStringError("Unsupported file type"); + } + } + + StringMap<Symbol> SymTab; + SmallVector<std::unique_ptr<MemoryBuffer>> LinkerInput; + + bool Extracted = true; + while (Extracted) { + Extracted = false; + for (auto &[Buffer, IsLazy] : InputFiles) { + if (!Buffer) + continue; + + Extracted = !IsLazy; + MemoryBufferRef File = *Buffer; + switch (identify_magic(File.getBuffer())) { + case file_magic::bitcode: { + Expected<IRSymtabFile> IRSymtabOrErr = readIRSymtab(File); + if (!IRSymtabOrErr) + return IRSymtabOrErr.takeError(); + bool CheckSymbolsAgain; + do { + 
CheckSymbolsAgain = false; + for (unsigned I = 0; I != IRSymtabOrErr->Mods.size(); ++I) { + for (const auto &IRSym : + IRSymtabOrErr->TheReader.module_symbols(I)) { + if (IRSym.isFormatSpecific() || !IRSym.isGlobal()) + continue; + + Symbol &Sym = SymTab[IRSym.getName()]; + Sym = Sym.Resolve(Symbol(File, IRSym, IsLazy)); + if (Sym.File == File) { + Extracted = true; + if (IsLazy) { + IsLazy = false; + CheckSymbolsAgain = true; + } + } + } + } + } while (CheckSymbolsAgain); + break; + } + case file_magic::elf_relocatable: { + Expected<std::unique_ptr<ObjectFile>> ObjFile = + ObjectFile::createObjectFile(File); + if (!ObjFile) + return ObjFile.takeError(); + bool CheckSymbolsAgaing; + do { + CheckSymbolsAgaing = false; + for (SymbolRef ObjSum : (*ObjFile)->symbols()) { + auto NameOrErr = ObjSum.getName(); + if (!NameOrErr) + return NameOrErr.takeError(); + + Symbol &Sym = SymTab[*NameOrErr]; + Sym = Sym.Resolve(Symbol(File, ObjSum, IsLazy)); + Sym.UsedInRegularObj = true; + if (Sym.File == File) { + Extracted = true; + if (IsLazy) { + IsLazy = false; + CheckSymbolsAgaing = true; + } + } + } + } while (CheckSymbolsAgaing); + break; + } + default: + return createStringError("Unsupported file type"); + } + + // If we extracted any files we need to check all the symbols again. 
+ if (Extracted) { + LinkerInput.emplace_back(std::move(Buffer)); + break; + } + } + } + + SmallVector<std::unique_ptr<MemoryBuffer>> BitcodeFiles; + for (auto &Input : LinkerInput) + if (identify_magic(Input->getBuffer()) == file_magic::bitcode) + BitcodeFiles.emplace_back(std::move(Input)); + + SmallVector<StringRef> Files; + if (!BitcodeFiles.empty()) { + auto LTOBackendOrErr = createLTO(Args); + if (!LTOBackendOrErr) + return LTOBackendOrErr.takeError(); + lto::LTO <OBackend = **LTOBackendOrErr; + for (auto &BitcodeFile : BitcodeFiles) { + Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr = + llvm::lto::InputFile::create(*BitcodeFile); + if (!BitcodeFileOrErr) + return BitcodeFileOrErr.takeError(); + + const auto Symbols = (*BitcodeFileOrErr)->symbols(); + SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size()); + size_t Idx = 0; + for (auto &Sym : Symbols) { + lto::SymbolResolution &Res = Resolutions[Idx++]; + Symbol ObjSym = SymTab[Sym.getName()]; + + // We will use this as the prevailing symbol in LTO if it is not + // undefined and it is from the file that contained the canonical + // definition. + Res.Prevailing = !Sym.isUndefined() && ObjSym.File == *BitcodeFile; + + // We need LTO to preseve the following global symbols: + // 1) Symbols used in regular objects. + // 2) Prevailing symbols that are needed visible to the gpu runtime. + Res.VisibleToRegularObj = + ObjSym.UsedInRegularObj || + (Res.Prevailing && + (Sym.getVisibility() != GlobalValue::HiddenVisibility && + !Sym.canBeOmittedFromSymbolTable())); + + // Identify symbols that must be exported dynamically and can be + // referenced by other files, (i.e. the runtime). + Res.ExportDynamic = + Sym.getVisibility() != GlobalValue::HiddenVisibility && + !Sym.canBeOmittedFromSymbolTable(); + + // The final definition will reside in this linkage unit if the symbol + // is defined and local to the module. 
This only checks for bitcode + // files, full assertion will require complete symbol resolution. + Res.FinalDefinitionInLinkageUnit = + Sym.getVisibility() != GlobalValue::DefaultVisibility && + (!Sym.isUndefined() && !Sym.isCommon()); + + // We do not support linker redefined symbols (e.g. --wrap) for device + // image linking, so the symbols will not be changed after LTO. + Res.LinkerRedefined = false; + } + + // Add the bitcode file with its resolved symbols to the LTO job. + if (Error Err = LTOBackend.add(std::move(*BitcodeFileOrErr), Resolutions)) + return Err; + } + + // Run the LTO job to compile the bitcode. + size_t MaxTasks = LTOBackend.getMaxTasks(); + SmallVector<StringRef> LTOFiles(MaxTasks); + auto AddStream = + [&](size_t Task, + const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> { + int FD = -1; + auto &TempFile = LTOFiles[Task]; + auto TempFileOrErr = createTempFile( + Args, sys::path::stem(Args.getLastArgValue(OPT_o, "a.out")), "s"); + if (!TempFileOrErr) + reportError(TempFileOrErr.takeError()); + TempFile = Args.MakeArgString(*TempFileOrErr); + if (std::error_code EC = sys::fs::openFileForWrite(TempFile, FD)) + reportError(errorCodeToError(EC)); + return std::make_unique<CachedFileStream>( + std::make_unique<llvm::raw_fd_ostream>(FD, true)); + }; + + if (Error Err = LTOBackend.run(AddStream)) + return Err; + + for (StringRef LTOFile : LTOFiles) { + auto FileOrErr = runPTXAs(LTOFile, Args); + if (!FileOrErr) + return FileOrErr.takeError(); + Files.emplace_back(*FileOrErr); + } + } + + for (auto &Input : LinkerInput) { + if (!Input) + continue; + + auto TempFileOrErr = createTempFile( + Args, sys::path::stem(Input->getBufferIdentifier()), "cubin"); + if (!TempFileOrErr) + return TempFileOrErr.takeError(); + Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr = + FileOutputBuffer::create(*TempFileOrErr, Input->getBuffer().size()); + if (!OutputOrErr) + return OutputOrErr.takeError(); + std::unique_ptr<FileOutputBuffer> Output = 
std::move(*OutputOrErr); + llvm::copy(Input->getBuffer(), Output->getBufferStart()); + if (Error E = Output->commit()) + return E; + Files.emplace_back(Args.MakeArgString(*TempFileOrErr)); + } + + return Files; +} + +Error runNVLink(ArrayRef<StringRef> Files, const ArgList &Args) { + std::string CudaPath = Args.getLastArgValue(OPT_cuda_path_EQ).str(); + Expected<std::string> NVLinkPath = findProgram("nvlink", {CudaPath + "/bin"}); + if (!NVLinkPath) + return NVLinkPath.takeError(); + + ArgStringList NewLinkerArgs; + for (const opt::Arg *Arg : Args) { + // Do not forward arguments only intended for the linker wrapper. + if (Arg->getOption().hasFlag(WrapperOnlyOption)) + continue; + + // Do not forward any inputs that we have processed. + if (Arg->getOption().matches(OPT_INPUT) || + Arg->getOption().matches(OPT_library)) + continue; + + Arg->render(Args, NewLinkerArgs); + } + + llvm::transform(Files, std::back_inserter(NewLinkerArgs), + [&](StringRef Arg) { return Args.MakeArgString(Arg); }); + + SmallVector<StringRef> LinkerArgs({*NVLinkPath}); + if (!Args.hasArg(OPT_o)) + LinkerArgs.append({"-o", "a.out"}); + for (StringRef Arg : NewLinkerArgs) + LinkerArgs.push_back(Arg); + + if (Args.hasArg(OPT_dry_run)) + printCommands(LinkerArgs); + else if (sys::ExecuteAndWait(*NVLinkPath, LinkerArgs)) + return createStringError("'" + sys::path::filename(*NVLinkPath) + "'" + + " failed"); + return Error::success(); +} + +} // namespace + +int main(int argc, char **argv) { + InitLLVM X(argc, argv); + InitializeAllTargetInfos(); + InitializeAllTargets(); + InitializeAllTargetMCs(); + InitializeAllAsmParsers(); + InitializeAllAsmPrinters(); + + Executable = argv[0]; + sys::PrintStackTraceOnErrorSignal(argv[0]); + + const OptTable &Tbl = getOptTable(); + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + auto Args = Tbl.parseArgs(argc, argv, OPT_INVALID, Saver, [&](StringRef Err) { + reportError(createStringError(inconvertibleErrorCode(), Err)); + }); + + if 
(Args.hasArg(OPT_help) || Args.hasArg(OPT_help_hidden)) { + Tbl.printHelp( + outs(), "clang-nvlink-wrapper [options] <options to passed to nvlink>", + "A utility that wraps around the NVIDIA 'nvlink' linker.\n" + "This enables static linking and LTO handling for NVPTX targets.\n", + Args.hasArg(OPT_help_hidden), Args.hasArg(OPT_help_hidden)); + return EXIT_SUCCESS; + } + + if (Args.hasArg(OPT_version)) + printVersion(outs()); + + // This forwards '-mllvm' arguments to LLVM if present. + SmallVector<const char *> NewArgv = {argv[0]}; + for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) + NewArgv.push_back(Arg->getValue()); + cl::ParseCommandLineOptions(NewArgv.size(), &NewArgv[0]); + + auto FilesOrErr = getInput(Args); + if (!FilesOrErr) + reportError(FilesOrErr.takeError()); + + if (Error Err = runNVLink(*FilesOrErr, Args)) + reportError(std::move(Err)); + + // Remove the temporary files created. + if (!Args.hasArg(OPT_save_temps)) + for (const auto &TempFile : TempFiles) + if (std::error_code EC = sys::fs::remove(TempFile)) + reportError(createFileError(TempFile, EC)); + + return EXIT_SUCCESS; +} diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td new file mode 100644 index 0000000000000..55e93109ddadc --- /dev/null +++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td @@ -0,0 +1,68 @@ +include "llvm/Option/OptParser.td" + +def WrapperOnlyOption : OptionFlag; + +def help : Flag<["-", "--"], "help">, + HelpText<"Display available options (--help-hidden for more)">; + +def help_hidden : Flag<["-", "--"], "help-hidden">, + HelpText<"Display all available options">; + +def version : Flag<["--"], "version">, + HelpText<"Display the version number and exit">; + +def cuda_path_EQ : Joined<["--"], "cuda-path=">, + MetaVarName<"<dir>">, HelpText<"Set the system CUDA path">; + +def o : JoinedOrSeparate<["-"], "o">, MetaVarName<"<path>">, + HelpText<"Path to file to write output">; +def output : Separate<["--"], 
"output-file">, Alias<o>, Flags<[HelpHidden]>, + HelpText<"Alias for -o">; + +def library_path : JoinedOrSeparate<["-"], "L">, MetaVarName<"<dir>">, + HelpText<"Add <dir> to the library search path">; +def library_path_S : Separate<["--", "-"], "library-path">, Flags<[HelpHidden]>, + Alias<library_path>; +def library_path_EQ : Joined<["--", "-"], "library-path=">, Flags<[HelpHidden]>, + Alias<library_path>; + +def library : JoinedOrSeparate<["-"], "l">, MetaVarName<"<libname>">, + HelpText<"Search for library <libname>">; +def library_S : Separate<["--", "-"], "library">, Flags<[HelpHidden]>, + Alias<library_path>; +def library_EQ : Joined<["--", "-"], "library=">, Flags<[HelpHidden]>, + Alias<library_path>; + +def arch : Separate<["--", "-"], "arch">, + HelpText<"Specify the 'sm_' name of the target architecture.">; +def : Joined<["--", "-"], "plugin-opt=mcpu=">, + Flags<[HelpHidden, WrapperOnlyOption]>, Alias<arch>; + +def feature : Separate<["--", "-"], "feature">, Flags<[WrapperOnlyOption]>, + HelpText<"Specify the '+ptx' freature to use for LTO.">; + +def g : Flag<["-"], "g">, HelpText<"Specify that this was a debug compile.">; +def debug : Flag<["--"], "debug">, Alias<g>; + +def O : Joined<["--", "-"], "plugin-opt=O">, + Flags<[WrapperOnlyOption]>, MetaVarName<"<O0, O1, O2, or O3>">, + HelpText<"Optimization level for LTO">; + +def plugin_opt : Joined<["--", "-"], "plugin-opt">, + Flags<[WrapperOnlyOption, HelpHidden]>; + +def save_temps : Flag<["--", "-"], "save-temps">, + Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">; + +def whole_archive : Flag<["--", "-"], "whole-archive">, + Flags<[WrapperOnlyOption, HelpHidden]>; +def no_whole_archive : Flag<["--", "-"], "no-whole-archive">, + Flags<[WrapperOnlyOption, HelpHidden]>; + +def mllvm : Separate<["-"], "mllvm">, Flags<[WrapperOnlyOption]>, + MetaVarName<"<arg>">, HelpText<"Arguments passed to the LLVM invocation">; +def mllvm_EQ : Joined<["-"], "mllvm=">, Flags<[HelpHidden]>, + 
Alias<mllvm>; + +def dry_run : Flag<["--", "-"], "dry-run">, Flags<[WrapperOnlyOption]>, + HelpText<"Print generated commands without running.">; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits