https://github.com/kyulee-com created https://github.com/llvm/llvm-project/pull/110431
None >From c1a0219457a3c162d7fa6b9d70750ba7a040d9f2 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Fri, 26 Apr 2024 20:02:52 -0700 Subject: [PATCH 1/5] [ThinLTO][NFC] Prep for two-codegen rounds --- clang/lib/CodeGen/BackendUtil.cpp | 8 ++-- llvm/include/llvm/LTO/LTOBackend.h | 1 + llvm/lib/LTO/LTO.cpp | 75 ++++++++++++++++-------------- llvm/lib/LTO/LTOBackend.cpp | 6 ++- 4 files changed, 49 insertions(+), 41 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 7fa69420298160..a1909d45b4d944 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1286,10 +1286,10 @@ static void runThinLTOBackend( Conf.CGFileType = getCodeGenFileType(Action); break; } - if (Error E = - thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList, - ModuleToDefinedGVSummaries[M->getModuleIdentifier()], - /* ModuleMap */ nullptr, CGOpts.CmdArgs)) { + if (Error E = thinBackend( + Conf, -1, AddStream, *M, *CombinedIndex, ImportList, + ModuleToDefinedGVSummaries[M->getModuleIdentifier()], + /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) { handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) { errs() << "Error running ThinLTO backend: " << EIB.message() << '\n'; }); diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h index de89f4bb10dff2..8516398510d4b8 100644 --- a/llvm/include/llvm/LTO/LTOBackend.h +++ b/llvm/include/llvm/LTO/LTOBackend.h @@ -56,6 +56,7 @@ Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, + bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>()); Error finalizeOptimizationRemarks( diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index a88124dacfaefd..f4c25f80811a85 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ 
-1473,7 +1473,8 @@ class InProcessThinBackend : public ThinBackendProc { return MOrErr.takeError(); return thinBackend(Conf, Task, AddStream, **MOrErr, CombinedIndex, - ImportList, DefinedGlobals, &ModuleMap); + ImportList, DefinedGlobals, &ModuleMap, + Conf.CodeGenOnly); }; auto ModuleID = BM.getModuleIdentifier(); @@ -1839,45 +1840,49 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, TimeTraceScopeExit.release(); - std::unique_ptr<ThinBackendProc> BackendProc = - ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - AddStream, Cache); - auto &ModuleMap = ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap; - auto ProcessOneModule = [&](int I) -> Error { - auto &Mod = *(ModuleMap.begin() + I); - // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for - // combined module and parallel code generation partitions. - return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I, - Mod.second, ImportLists[Mod.first], - ExportLists[Mod.first], ResolvedODR[Mod.first], - ThinLTO.ModuleMap); + auto RunBackends = [&](ThinBackendProc *BackendProcess) -> Error { + auto ProcessOneModule = [&](int I) -> Error { + auto &Mod = *(ModuleMap.begin() + I); + // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for + // combined module and parallel code generation partitions. + return BackendProcess->start( + RegularLTO.ParallelCodeGenParallelismLevel + I, Mod.second, + ImportLists[Mod.first], ExportLists[Mod.first], + ResolvedODR[Mod.first], ThinLTO.ModuleMap); + }; + + if (BackendProcess->getThreadCount() == 1) { + // Process the modules in the order they were provided on the + // command-line. It is important for this codepath to be used for + // WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists + // ThinLTO objects in the same order as the inputs, which otherwise would + // affect the final link order. 
+ for (int I = 0, E = ModuleMap.size(); I != E; ++I) + if (Error E = ProcessOneModule(I)) + return E; + } else { + // When executing in parallel, process largest bitsize modules first to + // improve parallelism, and avoid starving the thread pool near the end. + // This saves about 15 sec on a 36-core machine while link `clang.exe` + // (out of 100 sec). + std::vector<BitcodeModule *> ModulesVec; + ModulesVec.reserve(ModuleMap.size()); + for (auto &Mod : ModuleMap) + ModulesVec.push_back(&Mod.second); + for (int I : generateModulesOrdering(ModulesVec)) + if (Error E = ProcessOneModule(I)) + return E; + } + return BackendProcess->wait(); }; - if (BackendProc->getThreadCount() == 1) { - // Process the modules in the order they were provided on the command-line. - // It is important for this codepath to be used for WriteIndexesThinBackend, - // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same - // order as the inputs, which otherwise would affect the final link order. - for (int I = 0, E = ModuleMap.size(); I != E; ++I) - if (Error E = ProcessOneModule(I)) - return E; - } else { - // When executing in parallel, process largest bitsize modules first to - // improve parallelism, and avoid starving the thread pool near the end. - // This saves about 15 sec on a 36-core machine while link `clang.exe` (out - // of 100 sec). 
- std::vector<BitcodeModule *> ModulesVec; - ModulesVec.reserve(ModuleMap.size()); - for (auto &Mod : ModuleMap) - ModulesVec.push_back(&Mod.second); - for (int I : generateModulesOrdering(ModulesVec)) - if (Error E = ProcessOneModule(I)) - return E; - } - return BackendProc->wait(); + std::unique_ptr<ThinBackendProc> BackendProc = + ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, + AddStream, Cache); + return RunBackends(BackendProc.get()); } Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks( diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 4e58cd369c3ac9..880567989baffb 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -565,7 +565,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, - const std::vector<uint8_t> &CmdArgs) { + bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) { Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod); if (!TOrErr) return TOrErr.takeError(); @@ -586,7 +586,9 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, Mod.setPartialSampleProfileRatio(CombinedIndex); LLVM_DEBUG(dbgs() << "Running ThinLTO\n"); - if (Conf.CodeGenOnly) { + if (CodeGenOnly) { + // If CodeGenOnly is set, we only perform code generation and skip + // optimization. 
codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); } >From a566ab01485da1446431f449bee88ab0b8d558f1 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Fri, 13 Sep 2024 08:51:00 -0700 Subject: [PATCH 2/5] [CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds --- llvm/include/llvm/CGData/CodeGenData.h | 16 +++ llvm/lib/CGData/CodeGenData.cpp | 81 +++++++++++++- llvm/lib/LTO/CMakeLists.txt | 1 + llvm/lib/LTO/LTO.cpp | 103 +++++++++++++++++- llvm/lib/LTO/LTOBackend.cpp | 11 ++ .../test/ThinLTO/AArch64/cgdata-two-rounds.ll | 94 ++++++++++++++++ llvm/test/ThinLTO/AArch64/lit.local.cfg | 2 + 7 files changed, 302 insertions(+), 6 deletions(-) create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll create mode 100644 llvm/test/ThinLTO/AArch64/lit.local.cfg diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 84133a433170fe..1e1afe99327650 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -164,6 +164,22 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } +/// Initialize the two-codegen rounds. +void initializeTwoCodegenRounds(); + +/// Save the current module before the first codegen round. +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); + +/// Load the current module before the second codegen round. +std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, + unsigned Task, + LLVMContext &Context); + +/// Merge the codegen data from the input files in scratch vector in ThinLTO +/// two-codegen rounds. 
+Error mergeCodeGenData( + const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles); + void warn(Error E, StringRef Whence = ""); void warn(Twine Message, std::string Whence = "", std::string Hint = ""); diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 55d2504231c744..ff8e5dd7c75790 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -17,6 +17,7 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" #include "llvm/Support/WithColor.h" #define DEBUG_TYPE "cg-data" @@ -30,6 +31,14 @@ cl::opt<bool> cl::opt<std::string> CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden, cl::desc("File path to where .cgdata file is read")); +cl::opt<bool> CodeGenDataThinLTOTwoRounds( + "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden, + cl::desc("Enable two-round ThinLTO code generation. The first round " + "emits codegen data, while the second round uses the emitted " + "codegen data for further optimizations.")); + +// Path to where the optimized bitcodes are saved and restored for ThinLTO. +static SmallString<128> CodeGenDataThinLTOTwoRoundsPath; static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { @@ -139,7 +148,7 @@ CodeGenData &CodeGenData::getInstance() { std::call_once(CodeGenData::OnceFlag, []() { Instance = std::unique_ptr<CodeGenData>(new CodeGenData()); - if (CodeGenDataGenerate) + if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds) Instance->EmitCGData = true; else if (!CodeGenDataUsePath.empty()) { // Initialize the global CGData if the input file name is given. 
@@ -215,6 +224,76 @@ void warn(Error E, StringRef Whence) { } } +static std::string getPath(StringRef Dir, unsigned Task) { + return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str(); +} + +void initializeTwoCodegenRounds() { + assert(CodeGenDataThinLTOTwoRounds); + if (auto EC = llvm::sys::fs::createUniqueDirectory( + "cgdata", CodeGenDataThinLTOTwoRoundsPath)) + report_fatal_error(Twine("Failed to create directory: ") + EC.message()); +} + +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { + assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); + std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); + std::error_code EC; + raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None); + if (EC) + report_fatal_error(Twine("Failed to open ") + Path + + " to save optimized bitcode: " + EC.message()); + WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); +} + +std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, + unsigned Task, + LLVMContext &Context) { + assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); + std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); + auto FileOrError = MemoryBuffer::getFile(Path); + if (auto EC = FileOrError.getError()) + report_fatal_error(Twine("Failed to open ") + Path + + " to load optimized bitcode: " + EC.message()); + + std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); + auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context); + if (!RestoredModule) + report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + + Path + "\n"); + + // Restore the original module identifier. 
+ (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier()); + return std::move(*RestoredModule); +} + +Error mergeCodeGenData( + const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) { + + OutlinedHashTreeRecord GlobalOutlineRecord; + for (auto &InputFile : *(InputFiles)) { + if (InputFile.empty()) + continue; + StringRef File = StringRef(InputFile.data(), InputFile.size()); + std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer( + File, "in-memory object file", /*RequiresNullTerminator=*/false); + Expected<std::unique_ptr<object::ObjectFile>> BinOrErr = + object::ObjectFile::createObjectFile(Buffer->getMemBufferRef()); + if (!BinOrErr) + return BinOrErr.takeError(); + + std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get(); + if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(), + GlobalOutlineRecord)) + return E; + } + + if (!GlobalOutlineRecord.empty()) + cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree)); + + return Error::success(); +} + } // end namespace cgdata } // end namespace llvm diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt index 69ff08e1f374c4..057d73b6349cf1 100644 --- a/llvm/lib/LTO/CMakeLists.txt +++ b/llvm/lib/LTO/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_component_library(LLVMLTO BinaryFormat BitReader BitWriter + CGData CodeGen CodeGenTypes Core diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index f4c25f80811a85..945f8c859365ea 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/AutoUpgrade.h" @@ -70,6 +71,8 @@ static cl::opt<bool> DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden, cl::desc("Dump the SCCs in the ThinLTO index's callgraph")); 
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; + namespace llvm { /// Enable global value internalization in LTO. cl::opt<bool> EnableLTOInternalization( @@ -1458,7 +1461,7 @@ class InProcessThinBackend : public ThinBackendProc { GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name))); } - Error runThinLTOBackendThread( + virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, @@ -1559,6 +1562,60 @@ class InProcessThinBackend : public ThinBackendProc { return BackendThreadPool.getMaxConcurrency(); } }; + +/// This Backend will run ThinBackend process but throw away all the output from +/// the codegen. This class facilitates the first codegen round. +class NoOutputThinBackend : public InProcessThinBackend { +public: + NoOutputThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch) + : InProcessThinBackend( + Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, + // Allocate a scratch buffer for each task to write output to. + [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) { + return std::make_unique<CachedFileStream>( + std::make_unique<raw_svector_ostream>((*Allocation)[Task])); + }, + FileCache(), nullptr, false, false), + Scratch(std::move(Scratch)) {} + + /// Scratch space for writing output during the codegen. + std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; +}; + +/// This Backend performs codegen on bitcode that was previously saved after +/// going through optimization. This class facilitates the second codegen round. 
+class OptimizedBitcodeThinBackend : public InProcessThinBackend { +public: + OptimizedBitcodeThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + AddStreamFn AddStream) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, AddStream, FileCache(), + nullptr, false, false) {} + + virtual Error runThinLTOBackendThread( + AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, + ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + LTOLLVMContext BackendContext(Conf); + std::unique_ptr<Module> LoadedModule = + cgdata::loadModuleForTwoRounds(BM, Task, BackendContext); + + return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + /*CodeGenOnly=*/true); + } +}; } // end anonymous namespace ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism, @@ -1879,10 +1936,46 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, return BackendProcess->wait(); }; - std::unique_ptr<ThinBackendProc> BackendProc = - ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - AddStream, Cache); - return RunBackends(BackendProc.get()); + if (!CodeGenDataThinLTOTwoRounds) { + std::unique_ptr<ThinBackendProc> BackendProc = + ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, + AddStream, Cache); + return RunBackends(BackendProc.get()); + } + + // Perform two rounds of code generation for ThinLTO: + // 1. First round: Run optimization and code generation with a scratch output. + // 2. 
Merge codegen data extracted from the scratch output. + // 3. Second round: Run code generation again using the merged data. + LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n"); + + // Initialize a temporary path to store and retrieve optimized IRs for + // two-round code generation. + cgdata::initializeTwoCodegenRounds(); + + // Create a scratch output to hold intermediate results. + auto Outputs = + std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); + auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>( + Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), + ModuleToDefinedGVSummaries, std::move(Outputs)); + // First round: Run optimization and code generation with a scratch output. + // Before code generation, serialize modules. + if (Error E = RunBackends(FirstRoundLTO.get())) + return E; + + // Merge codegen data extracted from the scratch output. + if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch))) + return E; + + // Second round: Run code generation by reading IRs. 
+ std::unique_ptr<ThinBackendProc> SecondRoundLTO = + std::make_unique<OptimizedBitcodeThinBackend>( + Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), + ModuleToDefinedGVSummaries, AddStream); + Error E = RunBackends(SecondRoundLTO.get()); + + return E; } Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks( diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 880567989baffb..d198e8e5102009 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/PassManager.h" @@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged( cl::desc("Assume the input has already undergone ThinLTO function " "importing and the other pre-optimization pipeline changes.")); +extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; + namespace llvm { extern cl::opt<bool> NoPGOWarnMismatch; } @@ -599,11 +602,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, auto OptimizeAndCodegen = [&](Module &Mod, TargetMachine *TM, std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) { + // Perform optimization and code generation for ThinLTO. if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true, /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex, CmdArgs)) return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + // Save the current module before the first codegen round. + // Note that the second codegen round runs only `codegen()` without + // running `opt()`. We're not reaching here as it's bailed out earlier + // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`. 
+ if (CodeGenDataThinLTOTwoRounds) + cgdata::saveModuleForTwoRounds(Mod, Task); + codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); }; diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll new file mode 100644 index 00000000000000..0e082cf4e55e54 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll @@ -0,0 +1,94 @@ +; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat) +; by running two codegen rounds. + +; RUN: split-file %s %t + +; Verify each outlining instance is singleton with the global outlining for thinlto. +; They will be identical, which can be folded by the linker with ICF. +; RUN: opt -module-summary %t/thin-one.ll -o %t/thin-one.bc +; RUN: opt -module-summary %t/thin-two.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1 +; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: b + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2 +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b + +; Now add a lto module to the above thinlto modules. +; Verify the lto module is optimized independent of the global outlining for thinlto. 
+; RUN: opt %t/lto.ll -o %t/lto.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc %t/lto.bc -o %t/out \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; lto.ll will have one outlining instance within the lto module itself (no global outlining). +; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0 +; LTO-0: _OUTLINED_FUNCTION{{.*}}>: +; LTO-0-NEXT: mov +; LTO-0-NEXT: b + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1 + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/out.2 | FileCheck %s --check-prefix=THINLTO-2 + +;--- thin-one.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- lto.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f4() minsize { + %1 = call i32 @g(i32 10, i32 30, i32 2); + ret i32 %1 +} +define i32 @f5() minsize { + %1 = call i32 @g(i32 20, i32 40, i32 2); + ret i32 %1 +} +define i32 @f6() minsize { + %1 = call i32 @g(i32 50, i32 60, i32 2); + ret i32 %1 +} diff --git a/llvm/test/ThinLTO/AArch64/lit.local.cfg 
b/llvm/test/ThinLTO/AArch64/lit.local.cfg new file mode 100644 index 00000000000000..10d4a0e953ed47 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AArch64" in config.root.targets: + config.unsupported = True >From 011d4c112bebc1a93fa31e40b2ee5ccb3b785077 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Tue, 17 Sep 2024 18:07:49 -0700 Subject: [PATCH 3/5] Address comments from ellishg --- llvm/include/llvm/CGData/CodeGenData.h | 7 ++++--- llvm/include/llvm/LTO/LTOBackend.h | 3 ++- llvm/lib/CGData/CodeGenData.cpp | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 1e1afe99327650..72b52e6e9b8fd1 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -164,13 +164,14 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } -/// Initialize the two-codegen rounds. void initializeTwoCodegenRounds(); -/// Save the current module before the first codegen round. +/// Save \p TheModule before the first codegen round. +/// \p Task represents the partition number in the parallel code generation +/// process. void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); -/// Load the current module before the second codegen round. +/// Load the optimized module before the second codegen round. 
std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, unsigned Task, LLVMContext &Context); diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h index 8516398510d4b8..098c0491dfe70a 100644 --- a/llvm/include/llvm/LTO/LTOBackend.h +++ b/llvm/include/llvm/LTO/LTOBackend.h @@ -50,7 +50,8 @@ Error backend(const Config &C, AddStreamFn AddStream, /// already been mapped to memory and the corresponding BitcodeModule objects /// are saved in the ModuleMap. If \p ModuleMap is nullptr, module files will /// be mapped to memory on demand and at any given time during importing, only -/// one source module will be kept open at the most. +/// one source module will be kept open at the most. If \p CodeGenOnly is true, +/// the backend will skip optimization and only perform code generation. Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream, Module &M, const ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index ff8e5dd7c75790..58b92b7262957a 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -225,7 +225,9 @@ void warn(Error E, StringRef Whence) { } static std::string getPath(StringRef Dir, unsigned Task) { - return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str(); + llvm::SmallString<128> Path(Dir); + llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc"); + return std::string(Path); } void initializeTwoCodegenRounds() { >From e402d60c6206c585495123dd327b2a5ab85982b4 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Tue, 17 Sep 2024 23:37:51 -0700 Subject: [PATCH 4/5] Address comments from NuriAmari --- llvm/lib/CGData/CodeGenData.cpp | 4 ++-- llvm/lib/LTO/LTO.cpp | 33 +++++++++++++++++++++------------ llvm/lib/LTO/LTOBackend.cpp | 2 +- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git 
a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 58b92b7262957a..4e21045a67cba6 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -245,7 +245,7 @@ void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { if (EC) report_fatal_error(Twine("Failed to open ") + Path + " to save optimized bitcode: " + EC.message()); - WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); + WriteBitcodeToFile(TheModule, OS, /*ShouldPreserveUseListOrder=*/true); } std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, @@ -259,7 +259,7 @@ std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, " to load optimized bitcode: " + EC.message()); std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); - auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context); + auto RestoredModule = parseBitcodeFile(*FileBuffer, Context); if (!RestoredModule) report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + Path + "\n"); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 945f8c859365ea..b51b908fb28760 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1563,11 +1563,14 @@ class InProcessThinBackend : public ThinBackendProc { } }; -/// This Backend will run ThinBackend process but throw away all the output from -/// the codegen. This class facilitates the first codegen round. -class NoOutputThinBackend : public InProcessThinBackend { +/// This backend is utilized in the first round of a two-codegen round process. +/// It first saves optimized bitcode files to disk before the codegen process +/// begins. After codegen, it stores the resulting object files in a scratch +/// buffer. Note the codegen data stored in the scratch buffer will be extracted +/// and merged in the subsequent step. 
+class FirstRoundThinBackend : public InProcessThinBackend { public: - NoOutputThinBackend( + FirstRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, @@ -1579,25 +1582,31 @@ class NoOutputThinBackend : public InProcessThinBackend { return std::make_unique<CachedFileStream>( std::make_unique<raw_svector_ostream>((*Allocation)[Task])); }, - FileCache(), nullptr, false, false), + FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), Scratch(std::move(Scratch)) {} /// Scratch space for writing output during the codegen. std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; }; -/// This Backend performs codegen on bitcode that was previously saved after -/// going through optimization. This class facilitates the second codegen round. -class OptimizedBitcodeThinBackend : public InProcessThinBackend { +/// This backend operates in the second round of a two-codegen round process. +/// It starts by reading the optimized bitcode files that were saved during the +/// first round. The backend then executes the codegen only to further optimize +/// the code, utilizing the codegen data merged from the first round. Finally, +/// it writes the resulting object files as usual. 
+class SecondRoundThinBackend : public InProcessThinBackend { public: - OptimizedBitcodeThinBackend( + SecondRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, AddStreamFn AddStream) : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, AddStream, FileCache(), - nullptr, false, false) {} + /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false) {} virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, @@ -1956,7 +1965,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // Create a scratch output to hold intermediate results. auto Outputs = std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); - auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>( + auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), ModuleToDefinedGVSummaries, std::move(Outputs)); // First round: Run optimization and code generation with a scratch output. @@ -1970,7 +1979,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // Second round: Run code generation by reading IRs. std::unique_ptr<ThinBackendProc> SecondRoundLTO = - std::make_unique<OptimizedBitcodeThinBackend>( + std::make_unique<SecondRoundThinBackend>( Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), ModuleToDefinedGVSummaries, AddStream); Error E = RunBackends(SecondRoundLTO.get()); diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index d198e8e5102009..cf69f4add53a79 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -611,7 +611,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, // Save the current module before the first codegen round. 
// Note that the second codegen round runs only `codegen()` without // running `opt()`. We're not reaching here as it's bailed out earlier - // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`. + // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`. if (CodeGenDataThinLTOTwoRounds) cgdata::saveModuleForTwoRounds(Mod, Task); >From 569401b2a18fb8d5cca6b2b1b96c3bd4235fa43e Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Sun, 29 Sep 2024 10:38:46 -0700 Subject: [PATCH 5/5] Address comments from teresajohnson --- llvm/include/llvm/CGData/CodeGenData.h | 25 +++- llvm/include/llvm/LTO/LTO.h | 3 +- llvm/include/llvm/LTO/LTOBackend.h | 4 +- llvm/lib/CGData/CodeGenData.cpp | 97 +++++++++------ llvm/lib/LTO/LTO.cpp | 163 +++++++++++++++++++------ llvm/lib/LTO/LTOBackend.cpp | 7 +- 6 files changed, 214 insertions(+), 85 deletions(-) diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 72b52e6e9b8fd1..65abd97fd26206 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -20,6 +20,7 @@ #include "llvm/CGData/OutlinedHashTreeRecord.h" #include "llvm/IR/Module.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include <mutex> @@ -164,22 +165,36 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } -void initializeTwoCodegenRounds(); +struct StreamCacheData { + /// Backing buffer for serialized data streams. + SmallVector<SmallString<0>> &Outputs; + /// Callback function to add serialized data to the stream. + AddStreamFn &AddStream; + /// Backing buffer for cached data. + SmallVector<std::unique_ptr<MemoryBuffer>> &Files; + /// Cache mechanism for storing and retrieving data. 
+ FileCache &Cache; +}; + +void initializeTwoCodegenRounds(StreamCacheData &CG, StreamCacheData &IR); /// Save \p TheModule before the first codegen round. /// \p Task represents the partition number in the parallel code generation /// process. -void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); +/// \p AddStream is the callback used to add the serialized module to the +/// stream. +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task, + AddStreamFn AddStream); /// Load the optimized module before the second codegen round. std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, unsigned Task, - LLVMContext &Context); + LLVMContext &Context, + ArrayRef<StringRef> IRBuffer); /// Merge the codegen data from the input files in scratch vector in ThinLTO /// two-codegen rounds. -Error mergeCodeGenData( - const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles); +Error mergeCodeGenData(ArrayRef<StringRef> InputFiles); void warn(Error E, StringRef Whence = ""); void warn(Twine Message, std::string Whence = "", std::string Hint = ""); diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 214aa4e1c562dc..3059ad775aeb2f 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -72,7 +72,8 @@ std::string computeLTOCacheKey( const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, const DenseSet<GlobalValue::GUID> &CfiFunctionDefs = {}, - const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {}); + const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {}, + StringRef ExtraID = StringRef()); namespace lto { diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h index 098c0491dfe70a..123768aeb98ef0 100644 --- a/llvm/include/llvm/LTO/LTOBackend.h +++ b/llvm/include/llvm/LTO/LTOBackend.h @@ -52,12 +52,14 @@ Error backend(const Config &C, AddStreamFn AddStream, /// be mapped to memory on demand 
and at any given time during importing, only /// one source module will be kept open at the most. If \p CodeGenOnly is true, /// the backend will skip optimization and only perform code generation. +/// If \p IRAddStream is not nullptr, it will be invoked just before +/// code generation to serialize the optimized IR. Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream, Module &M, const ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, - bool CodeGenOnly, + bool CodeGenOnly, AddStreamFn IRAddStream = nullptr, const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>()); Error finalizeOptimizationRemarks( diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 4e21045a67cba6..b6398aa66a884c 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -15,6 +15,7 @@ #include "llvm/CGData/CodeGenDataReader.h" #include "llvm/CGData/OutlinedHashTreeRecord.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -36,9 +37,17 @@ cl::opt<bool> CodeGenDataThinLTOTwoRounds( cl::desc("Enable two-round ThinLTO code generation. The first round " "emits codegen data, while the second round uses the emitted " "codegen data for further optimizations.")); - -// Path to where the optimized bitcodes are saved and restored for ThinLTO. -static SmallString<128> CodeGenDataThinLTOTwoRoundsPath; +cl::opt<std::string> ThinLTOTwoRoundsCachePath( + "thinlto-two-rounds-cache-path", cl::init(""), cl::Hidden, + cl::desc("Specify the cache path for storing intermediate object files and " + "optimized IRs during the first round of two-round ThinLTO code " + "generation. This option supports faster iteration of the first " + "code generation in incremental builds.
However, regardless of " + "this option, the second round of code generation always runs to " + "ensure correctness, as merged codegen data can alter global " + "states. So that standard cache options supplied by the linker, " + "such as `-cache_path_lto`, are ineffective when " + "`-codegen-data-thinlto-two-rounds` is enabled.")); static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { @@ -224,59 +233,78 @@ void warn(Error E, StringRef Whence) { } } -static std::string getPath(StringRef Dir, unsigned Task) { - llvm::SmallString<128> Path(Dir); - llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc"); - return std::string(Path); -} - -void initializeTwoCodegenRounds() { +void initializeTwoCodegenRounds(StreamCacheData &CG, StreamCacheData &IR) { assert(CodeGenDataThinLTOTwoRounds); - if (auto EC = llvm::sys::fs::createUniqueDirectory( - "cgdata", CodeGenDataThinLTOTwoRoundsPath)) - report_fatal_error(Twine("Failed to create directory: ") + EC.message()); + CG.AddStream = [&](size_t Task, const Twine &ModuleName) { + return std::make_unique<CachedFileStream>( + std::make_unique<raw_svector_ostream>(CG.Outputs[Task])); + }; + IR.AddStream = [&](size_t Task, const Twine &ModuleName) { + return std::make_unique<CachedFileStream>( + std::make_unique<raw_svector_ostream>(IR.Outputs[Task])); + }; + + if (!ThinLTOTwoRoundsCachePath.empty()) { + // localCache() returns Expected<FileCache>; check it before use. + Expected<FileCache> CGCacheOrErr = + localCache("ThinLTO", "CG", ThinLTOTwoRoundsCachePath, + [&](size_t Task, const Twine &ModuleName, + std::unique_ptr<MemoryBuffer> MB) { + CG.Files[Task] = std::move(MB); + }); + if (Error Err = CGCacheOrErr.takeError()) + report_fatal_error(std::move(Err)); + CG.Cache = std::move(*CGCacheOrErr); + Expected<FileCache> IRCacheOrErr = + localCache("ThinLTO", "IR", ThinLTOTwoRoundsCachePath, + [&](size_t Task, const Twine &ModuleName, + std::unique_ptr<MemoryBuffer> MB) { + IR.Files[Task] = std::move(MB); + }); + if (Error Err = IRCacheOrErr.takeError()) + report_fatal_error(std::move(Err)); + IR.Cache = std::move(*IRCacheOrErr); + } } -void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { - assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); - std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); -
std::error_code EC; - raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None); - if (EC) - report_fatal_error(Twine("Failed to open ") + Path + - " to save optimized bitcode: " + EC.message()); - WriteBitcodeToFile(TheModule, OS, /*ShouldPreserveUseListOrder=*/true); +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task, + AddStreamFn AddStream) { + LLVM_DEBUG(dbgs() << "Saving module: " << TheModule.getModuleIdentifier() + << " in Task " << Task << "\n"); + Expected<std::unique_ptr<CachedFileStream>> StreamOrErr = + AddStream(Task, TheModule.getModuleIdentifier()); + if (Error Err = StreamOrErr.takeError()) + report_fatal_error(std::move(Err)); + std::unique_ptr<CachedFileStream> &Stream = *StreamOrErr; + + WriteBitcodeToFile(TheModule, *Stream->OS, + /*ShouldPreserveUseListOrder=*/true); } std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, unsigned Task, - LLVMContext &Context) { - assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); - std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); - auto FileOrError = MemoryBuffer::getFile(Path); - if (auto EC = FileOrError.getError()) - report_fatal_error(Twine("Failed to open ") + Path + - " to load optimized bitcode: " + EC.message()); - - std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); + LLVMContext &Context, + ArrayRef<StringRef> IRBuffer) { + LLVM_DEBUG(dbgs() << "Loading module: " << OrigModule.getModuleIdentifier() + << " in Task " << Task << "\n"); + std::unique_ptr<MemoryBuffer> FileBuffer = MemoryBuffer::getMemBuffer( + IRBuffer[Task], "in-memory IR file", /*RequiresNullTerminator=*/false); auto RestoredModule = parseBitcodeFile(*FileBuffer, Context); if (!RestoredModule) - report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + - Path + "\n"); + report_fatal_error( + Twine("Failed to parse optimized bitcode loaded for Task: ") + + Twine(Task) + "\n"); // Restore the original module identifier. 
(*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier()); return std::move(*RestoredModule); } -Error mergeCodeGenData( - const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) { - +Error mergeCodeGenData(ArrayRef<StringRef> InputFiles) { OutlinedHashTreeRecord GlobalOutlineRecord; - for (auto &InputFile : *(InputFiles)) { - if (InputFile.empty()) + for (auto File : InputFiles) { + if (File.empty()) continue; - StringRef File = StringRef(InputFile.data(), InputFile.size()); std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer( File, "in-memory object file", /*RequiresNullTerminator=*/false); Expected<std::unique_ptr<object::ObjectFile>> BinOrErr = diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index b51b908fb28760..06bf06608b29ab 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -36,6 +36,7 @@ #include "llvm/Linker/IRMover.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Object/IRObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" @@ -101,7 +102,7 @@ std::string llvm::computeLTOCacheKey( const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, const DenseSet<GlobalValue::GUID> &CfiFunctionDefs, - const DenseSet<GlobalValue::GUID> &CfiFunctionDecls) { + const DenseSet<GlobalValue::GUID> &CfiFunctionDecls, StringRef ExtraID) { // Compute the unique hash for this entry. 
// This is based on the current compiler version, the module itself, the // export list, the hash for every single module in the import list, the @@ -341,6 +342,9 @@ std::string llvm::computeLTOCacheKey( } } + if (!ExtraID.empty()) + AddString(ExtraID); // Distinguishes e.g. the IR key from the CG key. + return toHex(Hasher.result()); } @@ -1431,6 +1435,7 @@ class lto::ThinBackendProc { namespace { class InProcessThinBackend : public ThinBackendProc { +protected: DefaultThreadPool BackendThreadPool; AddStreamFn AddStream; FileCache Cache; @@ -1569,25 +1574,86 @@ class InProcessThinBackend : public ThinBackendProc { /// buffer. Note the codegen data stored in the scratch buffer will be extracted /// and merged in the subsequent step. class FirstRoundThinBackend : public InProcessThinBackend { + AddStreamFn IRAddStream; + FileCache IRCache; + public: FirstRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, - std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch) - : InProcessThinBackend( - Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, - // Allocate a scratch buffer for each task to write output to. - [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) { - return std::make_unique<CachedFileStream>( - std::make_unique<raw_svector_ostream>((*Allocation)[Task])); - }, - FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false, - /*ShouldEmitImportsFiles=*/false), - Scratch(std::move(Scratch)) {} - - /// Scratch space for writing output during the codegen.
- std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; + AddStreamFn CGAddStream, FileCache CGCache, AddStreamFn IRAddStream, + FileCache IRCache) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, std::move(CGAddStream), + std::move(CGCache), /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), + IRAddStream(std::move(IRAddStream)), IRCache(std::move(IRCache)) {} + + Error runThinLTOBackendThread( + AddStreamFn CGAddStream, FileCache CGCache, unsigned Task, + BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + auto RunThinBackend = [&](AddStreamFn CGAddStream, + AddStreamFn IRAddStream) { + LTOLLVMContext BackendContext(Conf); + Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext); + if (!MOrErr) + return MOrErr.takeError(); + + return thinBackend(Conf, Task, CGAddStream, **MOrErr, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + Conf.CodeGenOnly, IRAddStream); + }; + + auto ModuleID = BM.getModuleIdentifier(); + + if (ShouldEmitIndexFiles) { + if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str())) + return E; + } + + assert((CGCache == nullptr) == (IRCache == nullptr) && + "Both caches for CG and IR should have matching availability"); + if (!CGCache || !CombinedIndex.modulePaths().count(ModuleID) || + all_of(CombinedIndex.getModuleHash(ModuleID), + [](uint32_t V) { return V == 0; })) + // Cache disabled or no entry for this module in the combined index or + // no module hash. + return RunThinBackend(CGAddStream, IRAddStream); + + // Get CGKey for caching object in CGCache. 
+ std::string CGKey = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls); + Expected<AddStreamFn> CacheCGAddStreamOrErr = + CGCache(Task, CGKey, ModuleID); + if (Error Err = CacheCGAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheCGAddStream = *CacheCGAddStreamOrErr; + + // Get IRKey for caching (optimized) IR in IRCache. + std::string IRKey = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls, /*ExtraID=*/"IR"); + Expected<AddStreamFn> CacheIRAddStreamOrErr = + IRCache(Task, IRKey, ModuleID); + if (Error Err = CacheIRAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheIRAddStream = *CacheIRAddStreamOrErr; + + assert((CacheCGAddStream == nullptr) == (CacheIRAddStream == nullptr) && + "Both object and ir caching should be matched"); + if (CacheIRAddStream) + return RunThinBackend(CacheCGAddStream, CacheIRAddStream); + + return Error::success(); + } }; /// This backend operates in the second round of a two-codegen round process. @@ -1596,17 +1662,20 @@ class FirstRoundThinBackend : public InProcessThinBackend { /// the code, utilizing the codegen data merged from the first round. Finally, /// it writes the resulting object files as usual.
class SecondRoundThinBackend : public InProcessThinBackend { + SmallVector<StringRef> IRBuffer; // Owned copy: callers may pass a temporary. + public: SecondRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, - AddStreamFn AddStream) + AddStreamFn AddStream, ArrayRef<StringRef> IRBuffer) : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, AddStream, FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false, - /*ShouldEmitImportsFiles=*/false) {} + /*ShouldEmitImportsFiles=*/false), + IRBuffer(IRBuffer.begin(), IRBuffer.end()) {} virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, @@ -1618,7 +1687,7 @@ class SecondRoundThinBackend : public InProcessThinBackend { MapVector<StringRef, BitcodeModule> &ModuleMap) override { LTOLLVMContext BackendContext(Conf); std::unique_ptr<Module> LoadedModule = - cgdata::loadModuleForTwoRounds(BM, Task, BackendContext); + cgdata::loadModuleForTwoRounds(BM, Task, BackendContext, IRBuffer); return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, ImportList, DefinedGlobals, &ModuleMap, @@ -1956,32 +2025,54 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // 1. First round: Run optimization and code generation with a scratch output. // 2. Merge codegen data extracted from the scratch output. // 3. Second round: Run code generation again using the merged data. - LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n"); - - // Initialize a temporary path to store and retrieve optimized IRs for - // two-round code generation.
- cgdata::initializeTwoCodegenRounds(); + LLVM_DEBUG(dbgs() << "Initializing ThinLTO two-codegen rounds\n"); + + unsigned MaxTasks = getMaxTasks(); + AddStreamFn CGAddStream, IRAddStream; + FileCache CGCache, IRCache; + SmallVector<SmallString<0>> CGOutputs(MaxTasks); + SmallVector<SmallString<0>> IROutputs(MaxTasks); + SmallVector<std::unique_ptr<MemoryBuffer>> CGFiles(MaxTasks); + SmallVector<std::unique_ptr<MemoryBuffer>> IRFiles(MaxTasks); + + // Set up output streams and caches for storing and retrieving intermediate + // objects and optimized IRs in a two-round code generation process. + cgdata::StreamCacheData CG{CGOutputs, CGAddStream, CGFiles, CGCache}; + cgdata::StreamCacheData IR{IROutputs, IRAddStream, IRFiles, IRCache}; + cgdata::initializeTwoCodegenRounds(CG, IR); + + // Retrieve results from either the cache or the stream. + auto ResultBuf = [&](ArrayRef<SmallString<0>> Outputs, + ArrayRef<std::unique_ptr<MemoryBuffer>> Files) { + SmallVector<StringRef> Result(MaxTasks); + for (unsigned I = 0; I < MaxTasks; ++I) + if (Files[I]) + Result[I] = Files[I]->getBuffer(); + else + Result[I] = Outputs[I]; + return Result; + }; - // Create a scratch output to hold intermediate results. - auto Outputs = - std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); - auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( - Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), - ModuleToDefinedGVSummaries, std::move(Outputs)); // First round: Run optimization and code generation with a scratch output. - // Before code generation, serialize modules. + // Before code generation, serialize the optimized IR modules. 
+ LLVM_DEBUG(dbgs() << "Running the first round of codegen\n"); + auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( + Conf, ThinLTO.CombinedIndex, heavyweight_hardware_concurrency(), + ModuleToDefinedGVSummaries, CGAddStream, CGCache, IRAddStream, IRCache); if (Error E = RunBackends(FirstRoundLTO.get())) return E; - // Merge codegen data extracted from the scratch output. - if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch))) + LLVM_DEBUG(dbgs() << "Merging codegen data\n"); + if (Error E = cgdata::mergeCodeGenData(ResultBuf(CGOutputs, CGFiles))) return E; // Second round: Run code generation by reading IRs. - std::unique_ptr<ThinBackendProc> SecondRoundLTO = - std::make_unique<SecondRoundThinBackend>( - Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), - ModuleToDefinedGVSummaries, AddStream); + // Caching is disabled in this round because the merged code generation data + // may modify the output. + LLVM_DEBUG(dbgs() << "Running the second round of codegen\n"); + auto SecondRoundLTO = std::make_unique<SecondRoundThinBackend>( + Conf, ThinLTO.CombinedIndex, heavyweight_hardware_concurrency(), + ModuleToDefinedGVSummaries, AddStream, ResultBuf(IROutputs, IRFiles)); Error E = RunBackends(SecondRoundLTO.get()); return E; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index cf69f4add53a79..e414411ed64a01 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -568,7 +568,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, - bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) { + bool CodeGenOnly, AddStreamFn IRAddStream, + const std::vector<uint8_t> &CmdArgs) { Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod); if (!TOrErr) return TOrErr.takeError(); @@ -612,8 +613,8 @@ Error 
lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, // Note that the second codegen round runs only `codegen()` without // running `opt()`. We're not reaching here as it's bailed out earlier // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`. - if (CodeGenDataThinLTOTwoRounds) - cgdata::saveModuleForTwoRounds(Mod, Task); + if (IRAddStream) + cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream); codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits