[PATCH] D124292: [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility

Joseph Huber via Phabricator via cfe-commits Fri, 22 Apr 2022 12:34:51 -0700

jhuber6 created this revision.
jhuber6 added reviewers: jdoerfert, tianshilei1992, tra, JonChesterfield.
Herald added subscribers: mattd, guansong, inglorion, yaxunl.
Herald added a project: All.
jhuber6 requested review of this revision.
Herald added subscribers: cfe-commits, sstefan1.
Herald added a project: clang.


When we do LTO we consider ourselves to have whole program visibility if
every single input file we have contains LLVM bitcode. If we have whole
program visibliity then we can create a single image and utilize CUDA's
non-RDC mode by not passing `-c` to `ptxas` and ignoring the `nvlink`
job. This should be faster for some situations and also saves us the
time executing `nvlink`.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D124292

Files:
  clang/test/Driver/linker-wrapper.c
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp


Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
===================================================================
--- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -595,7 +595,7 @@
 // TODO: Move these to a separate file.
 namespace nvptx {
 Expected<std::string> assemble(StringRef InputFile, Triple TheTriple,
-                               StringRef Arch) {
+                               StringRef Arch, bool RDC = true) {
   // NVPTX uses the ptxas binary to create device object files.
   Expected<std::string> PtxasPath = findProgram("ptxas", {CudaBinaryPath});
   if (!PtxasPath)
@@ -626,7 +626,8 @@
   CmdArgs.push_back(Opt);
   CmdArgs.push_back("--gpu-name");
   CmdArgs.push_back(Arch);
-  CmdArgs.push_back("-c");
+  if (RDC)
+    CmdArgs.push_back("-c");
 
   CmdArgs.push_back(InputFile);
 
@@ -933,7 +934,8 @@
 }
 
 Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
-                       const Triple &TheTriple, StringRef Arch) {
+                       const Triple &TheTriple, StringRef Arch,
+                       bool &WholeProgram) {
   SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;
   SmallVector<std::unique_ptr<lto::InputFile>, 4> BitcodeFiles;
   SmallVector<std::string, 4> NewInputFiles;
@@ -1009,7 +1011,7 @@
   };
 
   // We assume visibility of the whole program if every input file was bitcode.
-  bool WholeProgram = BitcodeFiles.size() == InputFiles.size();
+  WholeProgram = BitcodeFiles.size() == InputFiles.size();
   auto LTOBackend =
       (EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode)
                      : createLTO(TheTriple, Arch, WholeProgram);
@@ -1089,7 +1091,7 @@
   // Is we are compiling for NVPTX we need to run the assembler first.
   if (TheTriple.isNVPTX() && !EmbedBitcode) {
     for (auto &File : Files) {
-      auto FileOrErr = nvptx::assemble(File, TheTriple, Arch);
+      auto FileOrErr = nvptx::assemble(File, TheTriple, Arch, !WholeProgram);
       if (!FileOrErr)
         return FileOrErr.takeError();
       File = *FileOrErr;
@@ -1117,10 +1119,11 @@
   for (auto &LinkerInput : LinkerInputMap) {
     DeviceFile &File = LinkerInput.getFirst();
     Triple TheTriple = Triple(File.TheTriple);
+    bool WholeProgram = false;
 
     // Run LTO on any bitcode files and replace the input with the result.
-    if (Error Err =
-            linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch))
+    if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple,
+                                     File.Arch, WholeProgram))
       return Err;
 
     // If we are embedding bitcode for JIT, skip the final device linking.
@@ -1130,6 +1133,14 @@
       continue;
     }
 
+    // If we performed LTO on NVPTX and had whole program visibility, we can 
use
+    // CUDA in non-RDC mode.
+    if (WholeProgram && TheTriple.isNVPTX()) {
+      assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed");
+      LinkedImages.push_back(LinkerInput.getSecond().front());
+      continue;
+    }
+
     auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, 
File.Arch);
     if (!ImageOrErr)
       return ImageOrErr.takeError();
Index: clang/test/Driver/linker-wrapper.c
===================================================================
--- clang/test/Driver/linker-wrapper.c
+++ clang/test/Driver/linker-wrapper.c
@@ -38,5 +38,4 @@
 // RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run 
-linker-path \
 // RUN:   /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LTO
 
-// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 -c {{.*}}.s
-// LTO: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.cubin
+// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 {{.*}}.s

Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
===================================================================
--- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -595,7 +595,7 @@
 // TODO: Move these to a separate file.
 namespace nvptx {
 Expected<std::string> assemble(StringRef InputFile, Triple TheTriple,
-                               StringRef Arch) {
+                               StringRef Arch, bool RDC = true) {
   // NVPTX uses the ptxas binary to create device object files.
   Expected<std::string> PtxasPath = findProgram("ptxas", {CudaBinaryPath});
   if (!PtxasPath)
@@ -626,7 +626,8 @@
   CmdArgs.push_back(Opt);
   CmdArgs.push_back("--gpu-name");
   CmdArgs.push_back(Arch);
-  CmdArgs.push_back("-c");
+  if (RDC)
+    CmdArgs.push_back("-c");
 
   CmdArgs.push_back(InputFile);
 
@@ -933,7 +934,8 @@
 }
 
 Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
-                       const Triple &TheTriple, StringRef Arch) {
+                       const Triple &TheTriple, StringRef Arch,
+                       bool &WholeProgram) {
   SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;
   SmallVector<std::unique_ptr<lto::InputFile>, 4> BitcodeFiles;
   SmallVector<std::string, 4> NewInputFiles;
@@ -1009,7 +1011,7 @@
   };
 
   // We assume visibility of the whole program if every input file was bitcode.
-  bool WholeProgram = BitcodeFiles.size() == InputFiles.size();
+  WholeProgram = BitcodeFiles.size() == InputFiles.size();
   auto LTOBackend =
       (EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode)
                      : createLTO(TheTriple, Arch, WholeProgram);
@@ -1089,7 +1091,7 @@
   // Is we are compiling for NVPTX we need to run the assembler first.
   if (TheTriple.isNVPTX() && !EmbedBitcode) {
     for (auto &File : Files) {
-      auto FileOrErr = nvptx::assemble(File, TheTriple, Arch);
+      auto FileOrErr = nvptx::assemble(File, TheTriple, Arch, !WholeProgram);
       if (!FileOrErr)
         return FileOrErr.takeError();
       File = *FileOrErr;
@@ -1117,10 +1119,11 @@
   for (auto &LinkerInput : LinkerInputMap) {
     DeviceFile &File = LinkerInput.getFirst();
     Triple TheTriple = Triple(File.TheTriple);
+    bool WholeProgram = false;
 
     // Run LTO on any bitcode files and replace the input with the result.
-    if (Error Err =
-            linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch))
+    if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple,
+                                     File.Arch, WholeProgram))
       return Err;
 
     // If we are embedding bitcode for JIT, skip the final device linking.
@@ -1130,6 +1133,14 @@
       continue;
     }
 
+    // If we performed LTO on NVPTX and had whole program visibility, we can use
+    // CUDA in non-RDC mode.
+    if (WholeProgram && TheTriple.isNVPTX()) {
+      assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed");
+      LinkedImages.push_back(LinkerInput.getSecond().front());
+      continue;
+    }
+
     auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch);
     if (!ImageOrErr)
       return ImageOrErr.takeError();
Index: clang/test/Driver/linker-wrapper.c
===================================================================
--- clang/test/Driver/linker-wrapper.c
+++ clang/test/Driver/linker-wrapper.c
@@ -38,5 +38,4 @@
 // RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \
 // RUN:   /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LTO
 
-// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 -c {{.*}}.s
-// LTO: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.cubin
+// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 {{.*}}.s

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D124292: [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility

Reply via email to