https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/90933
>From 411fc459e58a65d9599c917f220ba68bb799baac Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Fri, 13 Sep 2024 08:51:00 -0700 Subject: [PATCH 1/5] [CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds --- llvm/include/llvm/CGData/CodeGenData.h | 16 +++ llvm/lib/CGData/CodeGenData.cpp | 81 +++++++++++++- llvm/lib/LTO/CMakeLists.txt | 1 + llvm/lib/LTO/LTO.cpp | 103 +++++++++++++++++- llvm/lib/LTO/LTOBackend.cpp | 11 ++ .../test/ThinLTO/AArch64/cgdata-two-rounds.ll | 94 ++++++++++++++++ llvm/test/ThinLTO/AArch64/lit.local.cfg | 2 + 7 files changed, 302 insertions(+), 6 deletions(-) create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll create mode 100644 llvm/test/ThinLTO/AArch64/lit.local.cfg diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 84133a433170fe..1e1afe99327650 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -164,6 +164,22 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } +/// Initialize the two-codegen rounds. +void initializeTwoCodegenRounds(); + +/// Save the current module before the first codegen round. +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); + +/// Load the current module before the second codegen round. +std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, + unsigned Task, + LLVMContext &Context); + +/// Merge the codegen data from the input files in scratch vector in ThinLTO +/// two-codegen rounds. 
+Error mergeCodeGenData( + const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles); + void warn(Error E, StringRef Whence = ""); void warn(Twine Message, std::string Whence = "", std::string Hint = ""); diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 55d2504231c744..ff8e5dd7c75790 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -17,6 +17,7 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" #include "llvm/Support/WithColor.h" #define DEBUG_TYPE "cg-data" @@ -30,6 +31,14 @@ cl::opt<bool> cl::opt<std::string> CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden, cl::desc("File path to where .cgdata file is read")); +cl::opt<bool> CodeGenDataThinLTOTwoRounds( + "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden, + cl::desc("Enable two-round ThinLTO code generation. The first round " + "emits codegen data, while the second round uses the emitted " + "codegen data for further optimizations.")); + +// Path to where the optimized bitcodes are saved and restored for ThinLTO. +static SmallString<128> CodeGenDataThinLTOTwoRoundsPath; static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { @@ -139,7 +148,7 @@ CodeGenData &CodeGenData::getInstance() { std::call_once(CodeGenData::OnceFlag, []() { Instance = std::unique_ptr<CodeGenData>(new CodeGenData()); - if (CodeGenDataGenerate) + if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds) Instance->EmitCGData = true; else if (!CodeGenDataUsePath.empty()) { // Initialize the global CGData if the input file name is given. 
@@ -215,6 +224,76 @@ void warn(Error E, StringRef Whence) { } } +static std::string getPath(StringRef Dir, unsigned Task) { + return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str(); +} + +void initializeTwoCodegenRounds() { + assert(CodeGenDataThinLTOTwoRounds); + if (auto EC = llvm::sys::fs::createUniqueDirectory( + "cgdata", CodeGenDataThinLTOTwoRoundsPath)) + report_fatal_error(Twine("Failed to create directory: ") + EC.message()); +} + +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { + assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); + std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); + std::error_code EC; + raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None); + if (EC) + report_fatal_error(Twine("Failed to open ") + Path + + " to save optimized bitcode: " + EC.message()); + WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); +} + +std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, + unsigned Task, + LLVMContext &Context) { + assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); + std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); + auto FileOrError = MemoryBuffer::getFile(Path); + if (auto EC = FileOrError.getError()) + report_fatal_error(Twine("Failed to open ") + Path + + " to load optimized bitcode: " + EC.message()); + + std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); + auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context); + if (!RestoredModule) + report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + + Path + "\n"); + + // Restore the original module identifier. 
+ (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier()); + return std::move(*RestoredModule); +} + +Error mergeCodeGenData( + const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) { + + OutlinedHashTreeRecord GlobalOutlineRecord; + for (auto &InputFile : *(InputFiles)) { + if (InputFile.empty()) + continue; + StringRef File = StringRef(InputFile.data(), InputFile.size()); + std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer( + File, "in-memory object file", /*RequiresNullTerminator=*/false); + Expected<std::unique_ptr<object::ObjectFile>> BinOrErr = + object::ObjectFile::createObjectFile(Buffer->getMemBufferRef()); + if (!BinOrErr) + return BinOrErr.takeError(); + + std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get(); + if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(), + GlobalOutlineRecord)) + return E; + } + + if (!GlobalOutlineRecord.empty()) + cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree)); + + return Error::success(); +} + } // end namespace cgdata } // end namespace llvm diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt index 69ff08e1f374c4..057d73b6349cf1 100644 --- a/llvm/lib/LTO/CMakeLists.txt +++ b/llvm/lib/LTO/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_component_library(LLVMLTO BinaryFormat BitReader BitWriter + CGData CodeGen CodeGenTypes Core diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index e1714b29399298..83c832ec033d3d 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/AutoUpgrade.h" @@ -70,6 +71,8 @@ static cl::opt<bool> DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden, cl::desc("Dump the SCCs in the ThinLTO index's callgraph")); 
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; + namespace llvm { /// Enable global value internalization in LTO. cl::opt<bool> EnableLTOInternalization( @@ -1424,7 +1427,7 @@ class InProcessThinBackend : public ThinBackendProc { GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name))); } - Error runThinLTOBackendThread( + virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, @@ -1513,6 +1516,60 @@ class InProcessThinBackend : public ThinBackendProc { return Error::success(); } }; + +/// This Backend will run ThinBackend process but throw away all the output from +/// the codegen. This class facilitates the first codegen round. +class NoOutputThinBackend : public InProcessThinBackend { +public: + NoOutputThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch) + : InProcessThinBackend( + Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, + // Allocate a scratch buffer for each task to write output to. + [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) { + return std::make_unique<CachedFileStream>( + std::make_unique<raw_svector_ostream>((*Allocation)[Task])); + }, + FileCache(), nullptr, false, false), + Scratch(std::move(Scratch)) {} + + /// Scratch space for writing output during the codegen. + std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; +}; + +/// This Backend performs codegen on bitcode that was previously saved after +/// going through optimization. This class facilitates the second codegen round. 
+class OptimizedBitcodeThinBackend : public InProcessThinBackend { +public: + OptimizedBitcodeThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + AddStreamFn AddStream) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, AddStream, FileCache(), + nullptr, false, false) {} + + virtual Error runThinLTOBackendThread( + AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, + ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + LTOLLVMContext BackendContext(Conf); + std::unique_ptr<Module> LoadedModule = + cgdata::loadModuleForTwoRounds(BM, Task, BackendContext); + + return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + /*CodeGenOnly=*/true); + } +}; } // end anonymous namespace ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism, @@ -1855,10 +1912,46 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, return BackendProcess->wait(); }; - std::unique_ptr<ThinBackendProc> BackendProc = - ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - AddStream, Cache); - return RunBackends(BackendProc.get()); + if (!CodeGenDataThinLTOTwoRounds) { + std::unique_ptr<ThinBackendProc> BackendProc = + ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, + AddStream, Cache); + return RunBackends(BackendProc.get()); + } + + // Perform two rounds of code generation for ThinLTO: + // 1. First round: Run optimization and code generation with a scratch output. + // 2. 
Merge codegen data extracted from the scratch output. + // 3. Second round: Run code generation again using the merged data. + LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n"); + + // Initialize a temporary path to store and retrieve optimized IRs for + // two-round code generation. + cgdata::initializeTwoCodegenRounds(); + + // Create a scratch output to hold intermediate results. + auto Outputs = + std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); + auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>( + Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), + ModuleToDefinedGVSummaries, std::move(Outputs)); + // First round: Run optimization and code generation with a scratch output. + // Before code generation, serialize modules. + if (Error E = RunBackends(FirstRoundLTO.get())) + return E; + + // Merge codegen data extracted from the scratch output. + if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch))) + return E; + + // Second round: Run code generation by reading IRs. 
+ std::unique_ptr<ThinBackendProc> SecondRoundLTO = + std::make_unique<OptimizedBitcodeThinBackend>( + Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), + ModuleToDefinedGVSummaries, AddStream); + Error E = RunBackends(SecondRoundLTO.get()); + + return E; } Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks( diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 06eeed3e1bc41f..3e3b5b316d4125 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/PassManager.h" @@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged( cl::desc("Assume the input has already undergone ThinLTO function " "importing and the other pre-optimization pipeline changes.")); +extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; + namespace llvm { extern cl::opt<bool> NoPGOWarnMismatch; } @@ -599,11 +602,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, auto OptimizeAndCodegen = [&](Module &Mod, TargetMachine *TM, std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) { + // Perform optimization and code generation for ThinLTO. if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true, /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex, CmdArgs)) return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + // Save the current module before the first codegen round. + // Note that the second codegen round runs only `codegen()` without + // running `opt()`. We're not reaching here as it's bailed out earlier + // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`. 
+ if (CodeGenDataThinLTOTwoRounds) + cgdata::saveModuleForTwoRounds(Mod, Task); + codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); }; diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll new file mode 100644 index 00000000000000..0e082cf4e55e54 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll @@ -0,0 +1,94 @@ +; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat) +; by running two codegen rounds. + +; RUN: split-file %s %t + +; Verify each outlining instance is singleton with the global outlining for thinlto. +; They will be identical, which can be folded by the linker with ICF. +; RUN: opt -module-summary %t/thin-one.ll -o %t/thin-one.bc +; RUN: opt -module-summary %t/thin-two.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1 +; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: b + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2 +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b + +; Now add a lto module to the above thinlto modules. +; Verify the lto module is optimized independent of the global outlining for thinlto. 
+; RUN: opt %t/lto.ll -o %t/lto.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc %t/lto.bc -o %t/out \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; lto.ll will have one outlining instance within the lto module itself (no global outlining). +; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0 +; LTO-0: _OUTLINED_FUNCTION{{.*}}>: +; LTO-0-NEXT: mov +; LTO-0-NEXT: b + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1 + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/out.2 | FileCheck %s --check-prefix=THINLTO-2 + +;--- thin-one.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- lto.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f4() minsize { + %1 = call i32 @g(i32 10, i32 30, i32 2); + ret i32 %1 +} +define i32 @f5() minsize { + %1 = call i32 @g(i32 20, i32 40, i32 2); + ret i32 %1 +} +define i32 @f6() minsize { + %1 = call i32 @g(i32 50, i32 60, i32 2); + ret i32 %1 +} diff --git a/llvm/test/ThinLTO/AArch64/lit.local.cfg 
b/llvm/test/ThinLTO/AArch64/lit.local.cfg new file mode 100644 index 00000000000000..10d4a0e953ed47 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AArch64" in config.root.targets: + config.unsupported = True >From 873bc99693b76cab36750c347d502a99594240b3 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Tue, 17 Sep 2024 18:07:49 -0700 Subject: [PATCH 2/5] Address comments from ellishg --- llvm/include/llvm/CGData/CodeGenData.h | 7 ++++--- llvm/lib/CGData/CodeGenData.cpp | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 1e1afe99327650..72b52e6e9b8fd1 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -164,13 +164,14 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } -/// Initialize the two-codegen rounds. void initializeTwoCodegenRounds(); -/// Save the current module before the first codegen round. +/// Save \p TheModule before the first codegen round. +/// \p Task represents the partition number in the parallel code generation +/// process. void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); -/// Load the current module before the second codegen round. +/// Load the optimized module before the second codegen round. 
std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, unsigned Task, LLVMContext &Context); diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index ff8e5dd7c75790..58b92b7262957a 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -225,7 +225,9 @@ void warn(Error E, StringRef Whence) { } static std::string getPath(StringRef Dir, unsigned Task) { - return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str(); + llvm::SmallString<128> Path(Dir); + llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc"); + return std::string(Path); } void initializeTwoCodegenRounds() { >From 56569a7c4cc9a4b0dead845a0a99c475fb3f4b8b Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Tue, 17 Sep 2024 23:37:51 -0700 Subject: [PATCH 3/5] Address comments from NuriAmari --- llvm/lib/CGData/CodeGenData.cpp | 4 ++-- llvm/lib/LTO/LTO.cpp | 33 +++++++++++++++++++++------------ llvm/lib/LTO/LTOBackend.cpp | 2 +- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 58b92b7262957a..4e21045a67cba6 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -245,7 +245,7 @@ void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { if (EC) report_fatal_error(Twine("Failed to open ") + Path + " to save optimized bitcode: " + EC.message()); - WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); + WriteBitcodeToFile(TheModule, OS, /*ShouldPreserveUseListOrder=*/true); } std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, @@ -259,7 +259,7 @@ std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, " to load optimized bitcode: " + EC.message()); std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); - auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context); + auto RestoredModule = 
parseBitcodeFile(*FileBuffer, Context); if (!RestoredModule) report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + Path + "\n"); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 83c832ec033d3d..978193815b4d5b 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1517,11 +1517,14 @@ class InProcessThinBackend : public ThinBackendProc { } }; -/// This Backend will run ThinBackend process but throw away all the output from -/// the codegen. This class facilitates the first codegen round. -class NoOutputThinBackend : public InProcessThinBackend { +/// This backend is utilized in the first round of a two-codegen round process. +/// It first saves optimized bitcode files to disk before the codegen process +/// begins. After codegen, it stores the resulting object files in a scratch +/// buffer. Note the codegen data stored in the scratch buffer will be extracted +/// and merged in the subsequent step. +class FirstRoundThinBackend : public InProcessThinBackend { public: - NoOutputThinBackend( + FirstRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, @@ -1533,25 +1536,31 @@ class NoOutputThinBackend : public InProcessThinBackend { return std::make_unique<CachedFileStream>( std::make_unique<raw_svector_ostream>((*Allocation)[Task])); }, - FileCache(), nullptr, false, false), + FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), Scratch(std::move(Scratch)) {} /// Scratch space for writing output during the codegen. std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; }; -/// This Backend performs codegen on bitcode that was previously saved after -/// going through optimization. This class facilitates the second codegen round. 
-class OptimizedBitcodeThinBackend : public InProcessThinBackend { +/// This backend operates in the second round of a two-codegen round process. +/// It starts by reading the optimized bitcode files that were saved during the +/// first round. The backend then executes the codegen only to further optimize +/// the code, utilizing the codegen data merged from the first round. Finally, +/// it writes the resulting object files as usual. +class SecondRoundThinBackend : public InProcessThinBackend { public: - OptimizedBitcodeThinBackend( + SecondRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, AddStreamFn AddStream) : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, AddStream, FileCache(), - nullptr, false, false) {} + /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false) {} virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, @@ -1932,7 +1941,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // Create a scratch output to hold intermediate results. auto Outputs = std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); - auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>( + auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), ModuleToDefinedGVSummaries, std::move(Outputs)); // First round: Run optimization and code generation with a scratch output. @@ -1946,7 +1955,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // Second round: Run code generation by reading IRs. 
std::unique_ptr<ThinBackendProc> SecondRoundLTO = - std::make_unique<OptimizedBitcodeThinBackend>( + std::make_unique<SecondRoundThinBackend>( Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), ModuleToDefinedGVSummaries, AddStream); Error E = RunBackends(SecondRoundLTO.get()); diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 3e3b5b316d4125..b66989fe520b42 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -611,7 +611,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, // Save the current module before the first codegen round. // Note that the second codegen round runs only `codegen()` without // running `opt()`. We're not reaching here as it's bailed out earlier - // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`. + // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`. if (CodeGenDataThinLTOTwoRounds) cgdata::saveModuleForTwoRounds(Mod, Task); >From 6ab166409a0bee70ac3a6f362a9ebfa65160cdd1 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Sun, 29 Sep 2024 10:38:46 -0700 Subject: [PATCH 4/5] Address comments from teresajohnson --- clang/lib/CodeGen/BackendUtil.cpp | 9 +- llvm/include/llvm/CGData/CodeGenData.h | 73 +++++- llvm/include/llvm/CGData/CodeGenDataReader.h | 5 +- llvm/include/llvm/LTO/LTO.h | 6 +- llvm/include/llvm/LTO/LTOBackend.h | 6 +- llvm/lib/CGData/CMakeLists.txt | 2 + llvm/lib/CGData/CodeGenData.cpp | 74 +++--- llvm/lib/CGData/CodeGenDataReader.cpp | 7 +- llvm/lib/LTO/LTO.cpp | 214 +++++++++++++----- llvm/lib/LTO/LTOBackend.cpp | 7 +- .../AArch64/cgdata-two-rounds-caching.ll | 174 ++++++++++++++ 11 files changed, 457 insertions(+), 120 deletions(-) create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index abc936f2c686dd..f018130807519d 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp 
+++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1321,10 +1321,11 @@ static void runThinLTOBackend( Conf.CGFileType = getCodeGenFileType(Action); break; } - if (Error E = thinBackend( - Conf, -1, AddStream, *M, *CombinedIndex, ImportList, - ModuleToDefinedGVSummaries[M->getModuleIdentifier()], - /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) { + if (Error E = + thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList, + ModuleToDefinedGVSummaries[M->getModuleIdentifier()], + /*ModuleMap=*/nullptr, Conf.CodeGenOnly, + /*IRAddStream=*/nullptr, CGOpts.CmdArgs)) { handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) { errs() << "Error running ThinLTO backend: " << EIB.message() << '\n'; }); diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 72b52e6e9b8fd1..53550beeae1f83 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -15,11 +15,13 @@ #define LLVM_CGDATA_CODEGENDATA_H #include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/StableHashing.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/CGData/OutlinedHashTree.h" #include "llvm/CGData/OutlinedHashTreeRecord.h" #include "llvm/IR/Module.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include <mutex> @@ -164,22 +166,73 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } -void initializeTwoCodegenRounds(); +struct StreamCacheData { + /// Backing buffer for serialized data stream. + SmallVector<SmallString<0>> Outputs; + /// Callback function to add serialized data to the stream. + AddStreamFn AddStream; + /// Backing buffer for cached data. + SmallVector<std::unique_ptr<MemoryBuffer>> Files; + /// Cache mechanism for storing data. 
+ FileCache Cache; + + StreamCacheData(unsigned Size, const FileCache &OrigCache, + const Twine &CachePrefix) + : Outputs(Size), Files(Size) { + AddStream = [&](size_t Task, const Twine &ModuleName) { + return std::make_unique<CachedFileStream>( + std::make_unique<raw_svector_ostream>(Outputs[Task])); + }; + + if (OrigCache.isValid()) { + auto CGCacheOrErr = + localCache("ThinLTO", CachePrefix, OrigCache.getCacheDirectoryPath(), + [&](size_t Task, const Twine &ModuleName, + std::unique_ptr<MemoryBuffer> MB) { + Files[Task] = std::move(MB); + }); + if (Error Err = CGCacheOrErr.takeError()) + report_fatal_error(std::move(Err)); + Cache = std::move(*CGCacheOrErr); + } + } + StreamCacheData() = delete; + + /// Retrieve results from either the cache or the stream. + std::unique_ptr<SmallVector<StringRef>> getResult() { + unsigned NumOutputs = Outputs.size(); + auto Result = std::make_unique<SmallVector<StringRef>>(NumOutputs); + for (unsigned I = 0; I < NumOutputs; ++I) + if (Files[I]) + (*Result)[I] = Files[I]->getBuffer(); + else + (*Result)[I] = Outputs[I]; + return Result; + } +}; /// Save \p TheModule before the first codegen round. /// \p Task represents the partition number in the parallel code generation -/// process. -void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); - -/// Load the optimized module before the second codegen round. +/// process. \p AddStream is the callback used to add the serialized module to +/// the stream. +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task, + AddStreamFn AddStream); + +/// Load the optimized bitcode module for the second codegen round. +/// \p OrigModule is the original bitcode module. +/// \p Task identifies the partition number in the parallel code generation +/// process. \p Context provides the environment settings for module operations. +/// \p IRFiles contains optimized bitcode module files needed for loading. 
+/// \return A unique_ptr to the loaded Module, or nullptr if loading fails. std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, unsigned Task, - LLVMContext &Context); + LLVMContext &Context, + ArrayRef<StringRef> IRFiles); -/// Merge the codegen data from the input files in scratch vector in ThinLTO -/// two-codegen rounds. -Error mergeCodeGenData( - const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles); +/// Merge the codegen data from the scratch objects \p ObjectFiles from the +/// first codegen round. +/// \return the combined hash of the merged codegen data. +Expected<stable_hash> mergeCodeGenData(ArrayRef<StringRef> ObjectFiles); void warn(Error E, StringRef Whence = ""); void warn(Twine Message, std::string Whence = "", std::string Hint = ""); diff --git a/llvm/include/llvm/CGData/CodeGenDataReader.h b/llvm/include/llvm/CGData/CodeGenDataReader.h index 1ee4bfbe480233..7e4882df2116e2 100644 --- a/llvm/include/llvm/CGData/CodeGenDataReader.h +++ b/llvm/include/llvm/CGData/CodeGenDataReader.h @@ -54,8 +54,11 @@ class CodeGenDataReader { /// Extract the cgdata embedded in sections from the given object file and /// merge them into the GlobalOutlineRecord. This is a static helper that /// is used by `llvm-cgdata --merge` or ThinLTO's two-codegen rounds. + /// Optionally, \p CombinedHash can be used to compute the combined hash of + /// the merged data. static Error mergeFromObjectFile(const object::ObjectFile *Obj, - OutlinedHashTreeRecord &GlobalOutlineRecord); + OutlinedHashTreeRecord &GlobalOutlineRecord, + stable_hash *CombinedHash = nullptr); protected: /// The outlined hash tree that has been read. 
When it's released by diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 66d8ca63a206f6..d7aa31fe30db08 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -65,7 +65,8 @@ void thinLTOInternalizeAndPromoteInIndex( isPrevailing); /// Computes a unique hash for the Module considering the current list of -/// export/import and other global analysis results. +/// export/import and other global analysis results. Optionally, \p ExtraID +/// can be used to add an extra identifier to the hash. std::string computeLTOCacheKey( const lto::Config &Conf, const ModuleSummaryIndex &Index, StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList, @@ -73,7 +74,8 @@ std::string computeLTOCacheKey( const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, const DenseSet<GlobalValue::GUID> &CfiFunctionDefs = {}, - const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {}); + const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {}, + StringRef ExtraID = {}); namespace lto { diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h index 098c0491dfe70a..2769e58f249053 100644 --- a/llvm/include/llvm/LTO/LTOBackend.h +++ b/llvm/include/llvm/LTO/LTOBackend.h @@ -51,13 +51,15 @@ Error backend(const Config &C, AddStreamFn AddStream, /// are saved in the ModuleMap. If \p ModuleMap is nullptr, module files will /// be mapped to memory on demand and at any given time during importing, only /// one source module will be kept open at the most. If \p CodeGenOnly is true, -/// the backend will skip optimization and only perform code generation. +/// the backend will skip optimization and only perform code generation. If +/// \p IRAddStream is not nullptr, it will be called just before code generation +/// to serialize the optimized IR. 
Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream, Module &M, const ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, - bool CodeGenOnly, + bool CodeGenOnly, AddStreamFn IRAddStream = nullptr, const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>()); Error finalizeOptimizationRemarks( diff --git a/llvm/lib/CGData/CMakeLists.txt b/llvm/lib/CGData/CMakeLists.txt index ff1aab920e7a8c..157b0dfb7f9fcf 100644 --- a/llvm/lib/CGData/CMakeLists.txt +++ b/llvm/lib/CGData/CMakeLists.txt @@ -12,6 +12,8 @@ add_llvm_component_library(LLVMCGData intrinsics_gen LINK_COMPONENTS + BitReader + BitWriter Core Support Object diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 4e21045a67cba6..c56a8b77a52319 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -15,6 +15,7 @@ #include "llvm/CGData/CodeGenDataReader.h" #include "llvm/CGData/OutlinedHashTreeRecord.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -37,9 +38,6 @@ cl::opt<bool> CodeGenDataThinLTOTwoRounds( "emits codegen data, while the second round uses the emitted " "codegen data for further optimizations.")); -// Path to where the optimized bitcodes are saved and restored for ThinLTO. 
-static SmallString<128> CodeGenDataThinLTOTwoRoundsPath; - static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { std::string Msg; @@ -224,59 +222,45 @@ void warn(Error E, StringRef Whence) { } } -static std::string getPath(StringRef Dir, unsigned Task) { - llvm::SmallString<128> Path(Dir); - llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc"); - return std::string(Path); -} - -void initializeTwoCodegenRounds() { - assert(CodeGenDataThinLTOTwoRounds); - if (auto EC = llvm::sys::fs::createUniqueDirectory( - "cgdata", CodeGenDataThinLTOTwoRoundsPath)) - report_fatal_error(Twine("Failed to create directory: ") + EC.message()); -} - -void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { - assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); - std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); - std::error_code EC; - raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None); - if (EC) - report_fatal_error(Twine("Failed to open ") + Path + - " to save optimized bitcode: " + EC.message()); - WriteBitcodeToFile(TheModule, OS, /*ShouldPreserveUseListOrder=*/true); +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task, + AddStreamFn AddStream) { + LLVM_DEBUG(dbgs() << "Saving module: " << TheModule.getModuleIdentifier() + << " in Task " << Task << "\n"); + Expected<std::unique_ptr<CachedFileStream>> StreamOrErr = + AddStream(Task, TheModule.getModuleIdentifier()); + if (Error Err = StreamOrErr.takeError()) + report_fatal_error(std::move(Err)); + std::unique_ptr<CachedFileStream> &Stream = *StreamOrErr; + + WriteBitcodeToFile(TheModule, *Stream->OS, + /*ShouldPreserveUseListOrder=*/true); } std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, unsigned Task, - LLVMContext &Context) { - assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); - std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); - auto FileOrError = 
MemoryBuffer::getFile(Path); - if (auto EC = FileOrError.getError()) - report_fatal_error(Twine("Failed to open ") + Path + - " to load optimized bitcode: " + EC.message()); - - std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); + LLVMContext &Context, + ArrayRef<StringRef> IRFiles) { + LLVM_DEBUG(dbgs() << "Loading module: " << OrigModule.getModuleIdentifier() + << " in Task " << Task << "\n"); + auto FileBuffer = MemoryBuffer::getMemBuffer( + IRFiles[Task], "in-memory IR file", /*RequiresNullTerminator=*/false); auto RestoredModule = parseBitcodeFile(*FileBuffer, Context); if (!RestoredModule) - report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + - Path + "\n"); + report_fatal_error( + Twine("Failed to parse optimized bitcode loaded for Task: ") + + Twine(Task) + "\n"); // Restore the original module identifier. (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier()); return std::move(*RestoredModule); } -Error mergeCodeGenData( - const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) { - +Expected<stable_hash> mergeCodeGenData(ArrayRef<StringRef> ObjFiles) { OutlinedHashTreeRecord GlobalOutlineRecord; - for (auto &InputFile : *(InputFiles)) { - if (InputFile.empty()) + stable_hash CombinedHash = 0; + for (auto File : ObjFiles) { + if (File.empty()) continue; - StringRef File = StringRef(InputFile.data(), InputFile.size()); std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer( File, "in-memory object file", /*RequiresNullTerminator=*/false); Expected<std::unique_ptr<object::ObjectFile>> BinOrErr = @@ -285,15 +269,15 @@ Error mergeCodeGenData( return BinOrErr.takeError(); std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get(); - if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(), - GlobalOutlineRecord)) + if (auto E = CodeGenDataReader::mergeFromObjectFile( + Obj.get(), GlobalOutlineRecord, &CombinedHash)) return E; } if (!GlobalOutlineRecord.empty()) 
cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree)); - return Error::success(); + return CombinedHash; } } // end namespace cgdata diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp index f7f3a8f42af7e1..2f2481ea60f822 100644 --- a/llvm/lib/CGData/CodeGenDataReader.cpp +++ b/llvm/lib/CGData/CodeGenDataReader.cpp @@ -31,8 +31,8 @@ setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) { } Error CodeGenDataReader::mergeFromObjectFile( - const object::ObjectFile *Obj, - OutlinedHashTreeRecord &GlobalOutlineRecord) { + const object::ObjectFile *Obj, OutlinedHashTreeRecord &GlobalOutlineRecord, + stable_hash *CombinedHash) { Triple TT = Obj->makeTriple(); auto CGOutLineName = getCodeGenDataSectionName(CG_outline, TT.getObjectFormat(), false); @@ -48,6 +48,9 @@ Error CodeGenDataReader::mergeFromObjectFile( auto *EndData = Data + ContentsOrErr->size(); if (*NameOrErr == CGOutLineName) { + if (CombinedHash) + *CombinedHash = + stable_hash_combine(*CombinedHash, xxh3_64bits(*ContentsOrErr)); // In case dealing with an executable that has concatenated cgdata, // we want to merge them into a single cgdata. // Although it's not a typical workflow, we support this scenario. 
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 978193815b4d5b..29338d10ca0c0b 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -13,6 +13,7 @@ #include "llvm/LTO/LTO.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StableHashing.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -36,6 +37,7 @@ #include "llvm/Linker/IRMover.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Object/IRObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" @@ -101,7 +103,7 @@ std::string llvm::computeLTOCacheKey( const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, const DenseSet<GlobalValue::GUID> &CfiFunctionDefs, - const DenseSet<GlobalValue::GUID> &CfiFunctionDecls) { + const DenseSet<GlobalValue::GUID> &CfiFunctionDecls, StringRef ExtraID) { // Compute the unique hash for this entry. // This is based on the current compiler version, the module itself, the // export list, the hash for every single module in the import list, the @@ -341,6 +343,9 @@ std::string llvm::computeLTOCacheKey( } } + if (!ExtraID.empty()) + AddString(ExtraID); + return toHex(Hasher.result()); } @@ -1401,6 +1406,7 @@ Error ThinBackendProc::emitFiles( namespace { class InProcessThinBackend : public ThinBackendProc { +protected: AddStreamFn AddStream; FileCache Cache; DenseSet<GlobalValue::GUID> CfiFunctionDefs; @@ -1523,25 +1529,91 @@ class InProcessThinBackend : public ThinBackendProc { /// buffer. Note the codegen data stored in the scratch buffer will be extracted /// and merged in the subsequent step. 
class FirstRoundThinBackend : public InProcessThinBackend { + AddStreamFn IRAddStream; + FileCache IRCache; + public: FirstRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, - std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch) - : InProcessThinBackend( - Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, - // Allocate a scratch buffer for each task to write output to. - [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) { - return std::make_unique<CachedFileStream>( - std::make_unique<raw_svector_ostream>((*Allocation)[Task])); - }, - FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false, - /*ShouldEmitImportsFiles=*/false), - Scratch(std::move(Scratch)) {} - - /// Scratch space for writing output during the codegen. - std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; + AddStreamFn CGAddStream, FileCache CGCache, AddStreamFn IRAddStream, + FileCache IRCache) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, std::move(CGAddStream), + std::move(CGCache), /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), + IRAddStream(std::move(IRAddStream)), IRCache(std::move(IRCache)) {} + + Error runThinLTOBackendThread( + AddStreamFn CGAddStream, FileCache CGCache, unsigned Task, + BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + auto RunThinBackend = [&](AddStreamFn CGAddStream, + AddStreamFn IRAddStream) { + LTOLLVMContext BackendContext(Conf); + Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext); + 
if (!MOrErr) + return MOrErr.takeError(); + + return thinBackend(Conf, Task, CGAddStream, **MOrErr, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + Conf.CodeGenOnly, IRAddStream); + }; + + auto ModuleID = BM.getModuleIdentifier(); + // Like InProcessThinBackend, we produce index files as needed for + // FirstRoundThinBackend. However, these files are not generated for + // SecondRoundThinBackend. + if (ShouldEmitIndexFiles) { + if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str())) + return E; + } + + assert((CGCache.isValid() == IRCache.isValid()) && + "Both caches for CG and IR should have matching availability"); + if (!CGCache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) || + all_of(CombinedIndex.getModuleHash(ModuleID), + [](uint32_t V) { return V == 0; })) + // Cache disabled or no entry for this module in the combined index or + // no module hash. + return RunThinBackend(CGAddStream, IRAddStream); + + // Get CGKey for caching object in CGCache. + std::string CGKey = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls); + Expected<AddStreamFn> CacheCGAddStreamOrErr = + CGCache(Task, CGKey, ModuleID); + if (Error Err = CacheCGAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheCGAddStream = *CacheCGAddStreamOrErr; + + // Get IRKey for caching (optimized) IR in IRCache with an extra ID. 
+ std::string IRKey = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls, /*ExtraID=*/"IR"); + Expected<AddStreamFn> CacheIRAddStreamOrErr = + IRCache(Task, IRKey, ModuleID); + if (Error Err = CacheIRAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheIRAddStream = *CacheIRAddStreamOrErr; + + assert((CacheCGAddStream == nullptr) == (CacheIRAddStream == nullptr) && + "Both CG and IR caching should be matched"); + if (CacheIRAddStream) { + LLVM_DEBUG(dbgs() << "[FirstRound] Cache Miss for " + << BM.getModuleIdentifier() << "\n"); + return RunThinBackend(CacheCGAddStream, CacheIRAddStream); + } + + return Error::success(); + } }; /// This backend operates in the second round of a two-codegen round process. @@ -1550,17 +1622,24 @@ class FirstRoundThinBackend : public InProcessThinBackend { /// the code, utilizing the codegen data merged from the first round. Finally, /// it writes the resulting object files as usual. 
class SecondRoundThinBackend : public InProcessThinBackend { + std::unique_ptr<SmallVector<StringRef>> IRFiles; + stable_hash CombinedCGDataHash; + public: SecondRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, - AddStreamFn AddStream) + AddStreamFn AddStream, FileCache Cache, + std::unique_ptr<SmallVector<StringRef>> IRFiles, + stable_hash CombinedCGDataHash) : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, - ModuleToDefinedGVSummaries, AddStream, FileCache(), + ModuleToDefinedGVSummaries, std::move(AddStream), + std::move(Cache), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false, - /*ShouldEmitImportsFiles=*/false) {} + /*ShouldEmitImportsFiles=*/false), + IRFiles(std::move(IRFiles)), CombinedCGDataHash(CombinedCGDataHash) {} virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, @@ -1570,13 +1649,42 @@ class SecondRoundThinBackend : public InProcessThinBackend { const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> &ModuleMap) override { - LTOLLVMContext BackendContext(Conf); - std::unique_ptr<Module> LoadedModule = - cgdata::loadModuleForTwoRounds(BM, Task, BackendContext); + auto RunThinBackend = [&](AddStreamFn AddStream) { + LTOLLVMContext BackendContext(Conf); + std::unique_ptr<Module> LoadedModule = + cgdata::loadModuleForTwoRounds(BM, Task, BackendContext, *IRFiles); - return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, - ImportList, DefinedGlobals, &ModuleMap, - /*CodeGenOnly=*/true); + return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + /*CodeGenOnly=*/true); + }; + + auto ModuleID = BM.getModuleIdentifier(); + if (!Cache.isValid() || 
!CombinedIndex.modulePaths().count(ModuleID) || + all_of(CombinedIndex.getModuleHash(ModuleID), + [](uint32_t V) { return V == 0; })) + // Cache disabled or no entry for this module in the combined index or + // no module hash. + return RunThinBackend(AddStream); + + // Get Key for caching the final object file in Cache with the combined + // CGData hash. + std::string Key = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls, + /*ExtraID=*/std::to_string(CombinedCGDataHash)); + Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, Key, ModuleID); + if (Error Err = CacheAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheAddStream = *CacheAddStreamOrErr; + + if (CacheAddStream) { + LLVM_DEBUG(dbgs() << "[SecondRound] Cache Miss for " + << BM.getModuleIdentifier() << "\n"); + return RunThinBackend(CacheAddStream); + } + + return Error::success(); } }; } // end anonymous namespace @@ -1929,38 +2037,42 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, } // Perform two rounds of code generation for ThinLTO: - // 1. First round: Run optimization and code generation with a scratch output. - // 2. Merge codegen data extracted from the scratch output. - // 3. Second round: Run code generation again using the merged data. - LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n"); - - // Initialize a temporary path to store and retrieve optimized IRs for - // two-round code generation. - cgdata::initializeTwoCodegenRounds(); - - // Create a scratch output to hold intermediate results. - auto Outputs = - std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); + // 1. First round: Perform optimization and code generation, outputting to + // temporary scratch objects. + // 2. Merge code generation data extracted from the temporary scratch objects. + // 3. Second round: Execute code generation again using the merged data. 
+ LLVM_DEBUG(dbgs() << "[TwoRounds] Initializing ThinLTO two-codegen rounds\n"); + + unsigned MaxTasks = getMaxTasks(); + auto Parallelism = ThinLTO.Backend.getParallelism(); + // Set up two additional streams and caches for storing temporary scratch + // objects and optimized IRs, using the same cache directory as the original. + cgdata::StreamCacheData CG(MaxTasks, Cache, "CG"), IR(MaxTasks, Cache, "IR"); + + // First round: Execute optimization and code generation, outputting to + // temporary scratch objects. Serialize the optimized IRs before initiating + // code generation. + LLVM_DEBUG(dbgs() << "[TwoRounds] Running the first round of codegen\n"); auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( - Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), - ModuleToDefinedGVSummaries, std::move(Outputs)); - // First round: Run optimization and code generation with a scratch output. - // Before code generation, serialize modules. + Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, + CG.AddStream, CG.Cache, IR.AddStream, IR.Cache); if (Error E = RunBackends(FirstRoundLTO.get())) return E; - // Merge codegen data extracted from the scratch output. - if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch))) + LLVM_DEBUG(dbgs() << "[TwoRounds] Merging codegen data\n"); + auto CombinedHashOrErr = cgdata::mergeCodeGenData(*CG.getResult()); + if (Error E = CombinedHashOrErr.takeError()) return E; - - // Second round: Run code generation by reading IRs. 
- std::unique_ptr<ThinBackendProc> SecondRoundLTO = - std::make_unique<SecondRoundThinBackend>( - Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), - ModuleToDefinedGVSummaries, AddStream); - Error E = RunBackends(SecondRoundLTO.get()); - - return E; + auto CombinedHash = *CombinedHashOrErr; + LLVM_DEBUG(dbgs() << "[TwoRounds] CGData hash: " << CombinedHash << "\n"); + + // Second round: Read the optimized IRs and execute code generation using the + // merged data. + LLVM_DEBUG(dbgs() << "[TwoRounds] Running the second round of codegen\n"); + auto SecondRoundLTO = std::make_unique<SecondRoundThinBackend>( + Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, + AddStream, Cache, std::move(IR.getResult()), CombinedHash); + return RunBackends(SecondRoundLTO.get()); } Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks( diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index b66989fe520b42..fd2e9c9169514c 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -568,7 +568,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, - bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) { + bool CodeGenOnly, AddStreamFn IRAddStream, + const std::vector<uint8_t> &CmdArgs) { Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod); if (!TOrErr) return TOrErr.takeError(); @@ -612,8 +613,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, // Note that the second codegen round runs only `codegen()` without // running `opt()`. We're not reaching here as it's bailed out earlier // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`. 
- if (CodeGenDataThinLTOTwoRounds) - cgdata::saveModuleForTwoRounds(Mod, Task); + if (IRAddStream) + cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream); codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll new file mode 100644 index 00000000000000..0809a9c1ab0821 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll @@ -0,0 +1,174 @@ +; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat) +; by running two codegen rounds. +; This test also verifies if caches for the two-round codegens are correctly working. + +; REQUIRES: asserts +; RUN: rm -rf %t +; RUN: split-file %s %t + +; 0. Base case without a cache. +; Verify each outlining instance is singleton with the global outlining for thinlto. +; They will be identical, which can be folded by the linker with ICF. 
+; RUN: opt -module-hash -module-summary %t/thin-one.ll -o %t/thin-one.bc +; RUN: opt -module-hash -module-summary %t/thin-two.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1 +; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: b + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2 +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b + +; 1. Run this with a cache for the first time. +; RUN: rm -rf %t.cache +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-cold \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-cold.txt 2>&1 +; RUN: cat %t.log-cold.txt | FileCheck %s --check-prefix=COLD +; diff %t/thinlto.1 %t/thinlto-cold.1 +; diff %t/thinlto.2 %t/thinlto-cold.2 + +; COLD: [FirstRound] Cache Miss for {{.*}}thin-one.bc +; COLD: [FirstRound] Cache Miss for {{.*}}thin-two.bc +; COLD: [SecondRound] Cache Miss for {{.*}}thin-one.bc +; COLD: [SecondRound] Cache Miss for {{.*}}thin-two.bc + +; 2. Without any changes, simply re-running it will hit the cache. 
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm.txt 2>&1 +; RUN: cat %t.log-warm.txt | FileCheck %s --check-prefix=WARM +; diff %t/thinlto.1 %t/thinlto-warm.1 +; diff %t/thinlto.2 %t/thinlto-warm.2 + +; WARM-NOT: Cache Miss + +; 3. Assume thin-one.ll has been modified to thin-one-modified.ll. +; The merged CG data remains unchanged as this modification does not affect the hash tree built from thin-two.bc. +; Therefore, both the first and second round runs update only this module. +; RUN: opt -module-hash -module-summary %t/thin-one-modified.ll -o %t/thin-one.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-modified \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-modified.txt 2>&1 +; RUN: cat %t.log-warm-modified.txt | FileCheck %s --check-prefix=WARM-MODIFIED +; diff %t/thinlto.1 %t/thinlto-warm-modified.1 +; diff %t/thinlto.2 %t/thinlto-warm-modified.2 + +; WARM-MODIFIED: [FirstRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-NOT: [FirstRound] Cache Miss for {{.*}}thin-two.bc +; WARM-MODIFIED: [SecondRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-NOT: [SecondRound] Cache Miss for {{.*}}thin-two.bc + +; 4. Additionally, thin-two.ll has been modified to thin-two-modified.ll. +; In this case, the merged CG data, which is global, is updated. +; Although the first round run updates only the thin-two.bc module, +; as the module thin-one.bc remains the same as in step 3 above, +; the second round run will update all modules, resulting in different binaries. 
+; RUN: opt -module-hash -module-summary %t/thin-one-modified.ll -o %t/thin-one.bc +; RUN: opt -module-hash -module-summary %t/thin-two-modified.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-modified-all \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-modified-all.txt 2>&1 +; RUN: cat %t.log-warm-modified-all.txt | FileCheck %s --check-prefix=WARM-MODIFIED-ALL +; RUN: not diff %t/thinlto.1 %t/thinlto-warm-modified-all.1 +; RUN: not diff %t/thinlto.2 %t/thinlto-warm-modified-all.2 + +; WARM-MODIFIED-ALL-NOT: [FirstRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-ALL: [FirstRound] Cache Miss for {{.*}}thin-two.bc +; WARM-MODIFIED-ALL: [SecondRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-ALL: [SecondRound] Cache Miss for {{.*}}thin-two.bc + +; thin-one-modified.ll won't be outlined. +; RUN: llvm-objdump -d %t/thinlto-warm-modified-all.1 | FileCheck %s --check-prefix=THINLTO-1-MODIFIED-ALL +; THINLTO-1-MODIFIED-ALL-NOT: _OUTLINED_FUNCTION{{.*}}>: + +; thin-two-modified.ll will have two (longer) outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto-warm-modified-all.2| FileCheck %s --check-prefix=THINLTO-2-MODIFIED-ALL +; THINLTO-2-MODIFIED-ALL: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: b +; THINLTO-2-MODIFIED-ALL: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: b + +; 5. Re-running it will hit the cache. 
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-again \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-again.txt 2>&1 +; RUN: cat %t.log-warm-again.txt | FileCheck %s --check-prefix=WARM-AGAIN +; RUN: diff %t/thinlto-warm-modified-all.1 %t/thinlto-warm-again.1 +; RUN: diff %t/thinlto-warm-modified-all.2 %t/thinlto-warm-again.2 + +; WARM-AGAIN-NOT: Cache Miss + +;--- thin-one.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-one-modified.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 31, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two-modified.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} >From dc4145e530f8805fb79f5961a2307d640ee7b5d4 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Tue, 8 Oct 2024 23:46:58 -0700 Subject: [PATCH 5/5] Address 2nd comments from teresajohnson --- llvm/include/llvm/LTO/LTO.h | 9 +++-- llvm/lib/LTO/LTO.cpp | 37 
+++++++++++++------ llvm/lib/LTO/LTOBackend.cpp | 2 - .../AArch64/cgdata-two-rounds-caching.ll | 5 +++ .../test/ThinLTO/AArch64/cgdata-two-rounds.ll | 21 ++++++++--- 5 files changed, 51 insertions(+), 23 deletions(-) diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index d7aa31fe30db08..242a05f7d32c02 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -65,8 +65,7 @@ void thinLTOInternalizeAndPromoteInIndex( isPrevailing); /// Computes a unique hash for the Module considering the current list of -/// export/import and other global analysis results. Optionally, \p ExtraID -/// can be used to add an extra identifier to the hash. +/// export/import and other global analysis results. std::string computeLTOCacheKey( const lto::Config &Conf, const ModuleSummaryIndex &Index, StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList, @@ -74,8 +73,10 @@ std::string computeLTOCacheKey( const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, const DenseSet<GlobalValue::GUID> &CfiFunctionDefs = {}, - const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {}, - StringRef ExtraID = {}); + const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {}); + +/// Recomputes the LTO cache key for a given key with an extra identifier. +std::string recomputeLTOCacheKey(const std::string &Key, StringRef ExtraID); namespace lto { diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 29338d10ca0c0b..8e7675f442567a 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -103,7 +103,7 @@ std::string llvm::computeLTOCacheKey( const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, const DenseSet<GlobalValue::GUID> &CfiFunctionDefs, - const DenseSet<GlobalValue::GUID> &CfiFunctionDecls, StringRef ExtraID) { + const DenseSet<GlobalValue::GUID> &CfiFunctionDecls) { // Compute the unique hash for this entry. 
// This is based on the current compiler version, the module itself, the // export list, the hash for every single module in the import list, the @@ -343,8 +343,19 @@ std::string llvm::computeLTOCacheKey( } } - if (!ExtraID.empty()) - AddString(ExtraID); + return toHex(Hasher.result()); +} + +std::string llvm::recomputeLTOCacheKey(const std::string &Key, + StringRef ExtraID) { + SHA1 Hasher; + + auto AddString = [&](StringRef Str) { + Hasher.update(Str); + Hasher.update(ArrayRef<uint8_t>{0}); + }; + AddString(Key); + AddString(ExtraID); return toHex(Hasher.result()); } @@ -1595,21 +1606,22 @@ class FirstRoundThinBackend : public InProcessThinBackend { AddStreamFn &CacheCGAddStream = *CacheCGAddStreamOrErr; // Get IRKey for caching (optimized) IR in IRCache with an extra ID. - std::string IRKey = computeLTOCacheKey( - Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, - DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls, /*ExtraID=*/"IR"); + std::string IRKey = recomputeLTOCacheKey(CGKey, /*ExtraID=*/"IR"); Expected<AddStreamFn> CacheIRAddStreamOrErr = IRCache(Task, IRKey, ModuleID); if (Error Err = CacheIRAddStreamOrErr.takeError()) return Err; AddStreamFn &CacheIRAddStream = *CacheIRAddStreamOrErr; - assert((CacheCGAddStream == nullptr) == (CacheIRAddStream == nullptr) && - "Both CG and IR caching should be matched"); - if (CacheIRAddStream) { + // Ideally, both CG and IR caching should be synchronized. However, in + // practice, their availability may differ due to different expiration + // times. Therefore, if either cache is missing, the backend process is + // triggered. + if (CacheCGAddStream || CacheIRAddStream) { LLVM_DEBUG(dbgs() << "[FirstRound] Cache Miss for " << BM.getModuleIdentifier() << "\n"); - return RunThinBackend(CacheCGAddStream, CacheIRAddStream); + return RunThinBackend(CacheCGAddStream ? CacheCGAddStream : CGAddStream, + CacheIRAddStream ? 
CacheIRAddStream : IRAddStream); } return Error::success(); @@ -1671,8 +1683,9 @@ class SecondRoundThinBackend : public InProcessThinBackend { // CGData hash. std::string Key = computeLTOCacheKey( Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, - DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls, - /*ExtraID=*/std::to_string(CombinedCGDataHash)); + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls); + Key = recomputeLTOCacheKey(Key, + /*ExtraID=*/std::to_string(CombinedCGDataHash)); Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, Key, ModuleID); if (Error Err = CacheAddStreamOrErr.takeError()) return Err; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index fd2e9c9169514c..ad332d25d9c082 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -75,8 +75,6 @@ static cl::opt<bool> ThinLTOAssumeMerged( cl::desc("Assume the input has already undergone ThinLTO function " "importing and the other pre-optimization pipeline changes.")); -extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; - namespace llvm { extern cl::opt<bool> NoPGOWarnMismatch; } diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll index 0809a9c1ab0821..e11903bf0f3bf8 100644 --- a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll +++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll @@ -49,6 +49,11 @@ ; COLD: [SecondRound] Cache Miss for {{.*}}thin-one.bc ; COLD: [SecondRound] Cache Miss for {{.*}}thin-two.bc +; There are two input bitcode files and each one uses 3 caches: +; CG/IR caches for the first round and the second round CG cache. +; So the total number of files is 2 * 3 = 6. +; RUN: ls %t.cache | count 6 + ; 2. Without any changes, simply re-running it will hit the cache. 
; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm \ ; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll index 0e082cf4e55e54..ada1c6bb9a8421 100644 --- a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll +++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll @@ -1,5 +1,15 @@ -; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat) -; by running two codegen rounds. +; This test checks if we can outline a singleton instance (i.e., an instance that +; does not repeat) through two codegen rounds. The first round identifies a local +; outlining instance within thin-two.ll, which is then encoded in the resulting +; object file and merged into the codegen data summary. +; The second round utilizes the merged codegen data to optimistically outline a +; singleton instance in thin-one.ll. +; Note that this global outlining creates a unique instance for each sequence +; without directly sharing identical functions for correctness. +; Actual code size reductions occur at link time through identical code folding. +; When both thinlto and lto modules are compiled, the lto module is processed +; independently, without relying on the merged codegen data. In this case, +; the identical code sequences are directly replaced by a common outlined function. 
; RUN: split-file %s %t @@ -12,14 +22,14 @@ ; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ ; RUN: -codegen-data-thinlto-two-rounds -; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; thin-one.ll will have one outlining instance itself (matched in the global outlined hash tree) ; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1 ; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>: ; THINLTO-1-NEXT: mov ; THINLTO-1-NEXT: mov ; THINLTO-1-NEXT: b -; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; thin-two.ll will have two respective outlining instances (matched in the global outlined hash tree) ; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2 ; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: ; THINLTO-2-NEXT: mov @@ -39,11 +49,12 @@ ; RUN: -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \ ; RUN: -codegen-data-thinlto-two-rounds -; lto.ll will have one outlining instance within the lto module itself (no global outlining). +; lto.ll will have one shared outlining instance within the lto module itself (no global outlining). ; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0 ; LTO-0: _OUTLINED_FUNCTION{{.*}}>: ; LTO-0-NEXT: mov ; LTO-0-NEXT: b +; LTO-0-NOT: _OUTLINED_FUNCTION{{.*}}>: ; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) ; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits