https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/109081
>From c1a0219457a3c162d7fa6b9d70750ba7a040d9f2 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Fri, 26 Apr 2024 20:02:52 -0700 Subject: [PATCH 1/4] [ThinLTO][NFC] Prep for two-codegen rounds --- clang/lib/CodeGen/BackendUtil.cpp | 8 ++-- llvm/include/llvm/LTO/LTOBackend.h | 1 + llvm/lib/LTO/LTO.cpp | 75 ++++++++++++++++-------------- llvm/lib/LTO/LTOBackend.cpp | 6 ++- 4 files changed, 49 insertions(+), 41 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 7fa69420298160..a1909d45b4d944 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1286,10 +1286,10 @@ static void runThinLTOBackend( Conf.CGFileType = getCodeGenFileType(Action); break; } - if (Error E = - thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList, - ModuleToDefinedGVSummaries[M->getModuleIdentifier()], - /* ModuleMap */ nullptr, CGOpts.CmdArgs)) { + if (Error E = thinBackend( + Conf, -1, AddStream, *M, *CombinedIndex, ImportList, + ModuleToDefinedGVSummaries[M->getModuleIdentifier()], + /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) { handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) { errs() << "Error running ThinLTO backend: " << EIB.message() << '\n'; }); diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h index de89f4bb10dff2..8516398510d4b8 100644 --- a/llvm/include/llvm/LTO/LTOBackend.h +++ b/llvm/include/llvm/LTO/LTOBackend.h @@ -56,6 +56,7 @@ Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, + bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>()); Error finalizeOptimizationRemarks( diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index a88124dacfaefd..f4c25f80811a85 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1473,7 +1473,8 @@ class InProcessThinBackend : public ThinBackendProc { return MOrErr.takeError(); return thinBackend(Conf, Task, AddStream, **MOrErr, CombinedIndex, - ImportList, DefinedGlobals, &ModuleMap); + ImportList, DefinedGlobals, &ModuleMap, + Conf.CodeGenOnly); }; auto ModuleID = BM.getModuleIdentifier(); @@ -1839,45 +1840,49 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, TimeTraceScopeExit.release(); - std::unique_ptr<ThinBackendProc> BackendProc = - ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - AddStream, Cache); - auto &ModuleMap = ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap; - auto ProcessOneModule = [&](int I) -> Error { - auto &Mod = *(ModuleMap.begin() + I); - // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for - // combined module and parallel code generation partitions. - return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I, - Mod.second, ImportLists[Mod.first], - ExportLists[Mod.first], ResolvedODR[Mod.first], - ThinLTO.ModuleMap); + auto RunBackends = [&](ThinBackendProc *BackendProcess) -> Error { + auto ProcessOneModule = [&](int I) -> Error { + auto &Mod = *(ModuleMap.begin() + I); + // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for + // combined module and parallel code generation partitions. + return BackendProcess->start( + RegularLTO.ParallelCodeGenParallelismLevel + I, Mod.second, + ImportLists[Mod.first], ExportLists[Mod.first], + ResolvedODR[Mod.first], ThinLTO.ModuleMap); + }; + + if (BackendProcess->getThreadCount() == 1) { + // Process the modules in the order they were provided on the + // command-line. It is important for this codepath to be used for + // WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists + // ThinLTO objects in the same order as the inputs, which otherwise would + // affect the final link order. + for (int I = 0, E = ModuleMap.size(); I != E; ++I) + if (Error E = ProcessOneModule(I)) + return E; + } else { + // When executing in parallel, process largest bitsize modules first to + // improve parallelism, and avoid starving the thread pool near the end. + // This saves about 15 sec on a 36-core machine while link `clang.exe` + // (out of 100 sec). + std::vector<BitcodeModule *> ModulesVec; + ModulesVec.reserve(ModuleMap.size()); + for (auto &Mod : ModuleMap) + ModulesVec.push_back(&Mod.second); + for (int I : generateModulesOrdering(ModulesVec)) + if (Error E = ProcessOneModule(I)) + return E; + } + return BackendProcess->wait(); }; - if (BackendProc->getThreadCount() == 1) { - // Process the modules in the order they were provided on the command-line. - // It is important for this codepath to be used for WriteIndexesThinBackend, - // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same - // order as the inputs, which otherwise would affect the final link order. - for (int I = 0, E = ModuleMap.size(); I != E; ++I) - if (Error E = ProcessOneModule(I)) - return E; - } else { - // When executing in parallel, process largest bitsize modules first to - // improve parallelism, and avoid starving the thread pool near the end. - // This saves about 15 sec on a 36-core machine while link `clang.exe` (out - // of 100 sec). - std::vector<BitcodeModule *> ModulesVec; - ModulesVec.reserve(ModuleMap.size()); - for (auto &Mod : ModuleMap) - ModulesVec.push_back(&Mod.second); - for (int I : generateModulesOrdering(ModulesVec)) - if (Error E = ProcessOneModule(I)) - return E; - } - return BackendProc->wait(); + std::unique_ptr<ThinBackendProc> BackendProc = + ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, + AddStream, Cache); + return RunBackends(BackendProc.get()); } Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks( diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 4e58cd369c3ac9..880567989baffb 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -565,7 +565,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, - const std::vector<uint8_t> &CmdArgs) { + bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) { Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod); if (!TOrErr) return TOrErr.takeError(); @@ -586,7 +586,9 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, Mod.setPartialSampleProfileRatio(CombinedIndex); LLVM_DEBUG(dbgs() << "Running ThinLTO\n"); - if (Conf.CodeGenOnly) { + if (CodeGenOnly) { + // If CodeGenOnly is set, we only perform code generation and skip + // optimization. codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); } >From a566ab01485da1446431f449bee88ab0b8d558f1 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Fri, 13 Sep 2024 08:51:00 -0700 Subject: [PATCH 2/4] [CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds --- llvm/include/llvm/CGData/CodeGenData.h | 16 +++ llvm/lib/CGData/CodeGenData.cpp | 81 +++++++++++++- llvm/lib/LTO/CMakeLists.txt | 1 + llvm/lib/LTO/LTO.cpp | 103 +++++++++++++++++- llvm/lib/LTO/LTOBackend.cpp | 11 ++ .../test/ThinLTO/AArch64/cgdata-two-rounds.ll | 94 ++++++++++++++++ llvm/test/ThinLTO/AArch64/lit.local.cfg | 2 + 7 files changed, 302 insertions(+), 6 deletions(-) create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll create mode 100644 llvm/test/ThinLTO/AArch64/lit.local.cfg diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 84133a433170fe..1e1afe99327650 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -164,6 +164,22 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } +/// Initialize the two-codegen rounds. +void initializeTwoCodegenRounds(); + +/// Save the current module before the first codegen round. +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); + +/// Load the current module before the second codegen round. +std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, + unsigned Task, + LLVMContext &Context); + +/// Merge the codegen data from the input files in scratch vector in ThinLTO +/// two-codegen rounds. +Error mergeCodeGenData( + const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles); + void warn(Error E, StringRef Whence = ""); void warn(Twine Message, std::string Whence = "", std::string Hint = ""); diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 55d2504231c744..ff8e5dd7c75790 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -17,6 +17,7 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" #include "llvm/Support/WithColor.h" #define DEBUG_TYPE "cg-data" @@ -30,6 +31,14 @@ cl::opt<bool> cl::opt<std::string> CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden, cl::desc("File path to where .cgdata file is read")); +cl::opt<bool> CodeGenDataThinLTOTwoRounds( + "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden, + cl::desc("Enable two-round ThinLTO code generation. The first round " + "emits codegen data, while the second round uses the emitted " + "codegen data for further optimizations.")); + +// Path to where the optimized bitcodes are saved and restored for ThinLTO. +static SmallString<128> CodeGenDataThinLTOTwoRoundsPath; static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { @@ -139,7 +148,7 @@ CodeGenData &CodeGenData::getInstance() { std::call_once(CodeGenData::OnceFlag, []() { Instance = std::unique_ptr<CodeGenData>(new CodeGenData()); - if (CodeGenDataGenerate) + if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds) Instance->EmitCGData = true; else if (!CodeGenDataUsePath.empty()) { // Initialize the global CGData if the input file name is given. @@ -215,6 +224,76 @@ void warn(Error E, StringRef Whence) { } } +static std::string getPath(StringRef Dir, unsigned Task) { + return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str(); +} + +void initializeTwoCodegenRounds() { + assert(CodeGenDataThinLTOTwoRounds); + if (auto EC = llvm::sys::fs::createUniqueDirectory( + "cgdata", CodeGenDataThinLTOTwoRoundsPath)) + report_fatal_error(Twine("Failed to create directory: ") + EC.message()); +} + +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { + assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); + std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); + std::error_code EC; + raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None); + if (EC) + report_fatal_error(Twine("Failed to open ") + Path + + " to save optimized bitcode: " + EC.message()); + WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); +} + +std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, + unsigned Task, + LLVMContext &Context) { + assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); + std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); + auto FileOrError = MemoryBuffer::getFile(Path); + if (auto EC = FileOrError.getError()) + report_fatal_error(Twine("Failed to open ") + Path + + " to load optimized bitcode: " + EC.message()); + + std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); + auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context); + if (!RestoredModule) + report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + + Path + "\n"); + + // Restore the original module identifier. + (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier()); + return std::move(*RestoredModule); +} + +Error mergeCodeGenData( + const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) { + + OutlinedHashTreeRecord GlobalOutlineRecord; + for (auto &InputFile : *(InputFiles)) { + if (InputFile.empty()) + continue; + StringRef File = StringRef(InputFile.data(), InputFile.size()); + std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer( + File, "in-memory object file", /*RequiresNullTerminator=*/false); + Expected<std::unique_ptr<object::ObjectFile>> BinOrErr = + object::ObjectFile::createObjectFile(Buffer->getMemBufferRef()); + if (!BinOrErr) + return BinOrErr.takeError(); + + std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get(); + if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(), + GlobalOutlineRecord)) + return E; + } + + if (!GlobalOutlineRecord.empty()) + cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree)); + + return Error::success(); +} + } // end namespace cgdata } // end namespace llvm diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt index 69ff08e1f374c4..057d73b6349cf1 100644 --- a/llvm/lib/LTO/CMakeLists.txt +++ b/llvm/lib/LTO/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_component_library(LLVMLTO BinaryFormat BitReader BitWriter + CGData CodeGen CodeGenTypes Core diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index f4c25f80811a85..945f8c859365ea 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/AutoUpgrade.h" @@ -70,6 +71,8 @@ static cl::opt<bool> DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden, cl::desc("Dump the SCCs in the ThinLTO index's callgraph")); +extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; + namespace llvm { /// Enable global value internalization in LTO. cl::opt<bool> EnableLTOInternalization( @@ -1458,7 +1461,7 @@ class InProcessThinBackend : public ThinBackendProc { GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name))); } - Error runThinLTOBackendThread( + virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, @@ -1559,6 +1562,60 @@ class InProcessThinBackend : public ThinBackendProc { return BackendThreadPool.getMaxConcurrency(); } }; + +/// This Backend will run ThinBackend process but throw away all the output from +/// the codegen. This class facilitates the first codegen round. +class NoOutputThinBackend : public InProcessThinBackend { +public: + NoOutputThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch) + : InProcessThinBackend( + Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, + // Allocate a scratch buffer for each task to write output to. + [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) { + return std::make_unique<CachedFileStream>( + std::make_unique<raw_svector_ostream>((*Allocation)[Task])); + }, + FileCache(), nullptr, false, false), + Scratch(std::move(Scratch)) {} + + /// Scratch space for writing output during the codegen. + std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; +}; + +/// This Backend performs codegen on bitcode that was previously saved after +/// going through optimization. This class facilitates the second codegen round. +class OptimizedBitcodeThinBackend : public InProcessThinBackend { +public: + OptimizedBitcodeThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + AddStreamFn AddStream) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, AddStream, FileCache(), + nullptr, false, false) {} + + virtual Error runThinLTOBackendThread( + AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, + ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + LTOLLVMContext BackendContext(Conf); + std::unique_ptr<Module> LoadedModule = + cgdata::loadModuleForTwoRounds(BM, Task, BackendContext); + + return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + /*CodeGenOnly=*/true); + } +}; } // end anonymous namespace ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism, @@ -1879,10 +1936,46 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, return BackendProcess->wait(); }; - std::unique_ptr<ThinBackendProc> BackendProc = - ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - AddStream, Cache); - return RunBackends(BackendProc.get()); + if (!CodeGenDataThinLTOTwoRounds) { + std::unique_ptr<ThinBackendProc> BackendProc = + ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, + AddStream, Cache); + return RunBackends(BackendProc.get()); + } + + // Perform two rounds of code generation for ThinLTO: + // 1. First round: Run optimization and code generation with a scratch output. + // 2. Merge codegen data extracted from the scratch output. + // 3. Second round: Run code generation again using the merged data. + LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n"); + + // Initialize a temporary path to store and retrieve optimized IRs for + // two-round code generation. + cgdata::initializeTwoCodegenRounds(); + + // Create a scratch output to hold intermediate results. + auto Outputs = + std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); + auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>( + Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), + ModuleToDefinedGVSummaries, std::move(Outputs)); + // First round: Run optimization and code generation with a scratch output. + // Before code generation, serialize modules. + if (Error E = RunBackends(FirstRoundLTO.get())) + return E; + + // Merge codegen data extracted from the scratch output. + if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch))) + return E; + + // Second round: Run code generation by reading IRs. + std::unique_ptr<ThinBackendProc> SecondRoundLTO = + std::make_unique<OptimizedBitcodeThinBackend>( + Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), + ModuleToDefinedGVSummaries, AddStream); + Error E = RunBackends(SecondRoundLTO.get()); + + return E; } Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks( diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 880567989baffb..d198e8e5102009 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/PassManager.h" @@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged( cl::desc("Assume the input has already undergone ThinLTO function " "importing and the other pre-optimization pipeline changes.")); +extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; + namespace llvm { extern cl::opt<bool> NoPGOWarnMismatch; } @@ -599,11 +602,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, auto OptimizeAndCodegen = [&](Module &Mod, TargetMachine *TM, std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) { + // Perform optimization and code generation for ThinLTO. if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true, /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex, CmdArgs)) return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + // Save the current module before the first codegen round. + // Note that the second codegen round runs only `codegen()` without + // running `opt()`. We're not reaching here as it's bailed out earlier + // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`. + if (CodeGenDataThinLTOTwoRounds) + cgdata::saveModuleForTwoRounds(Mod, Task); + codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); }; diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll new file mode 100644 index 00000000000000..0e082cf4e55e54 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll @@ -0,0 +1,94 @@ +; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat) +; by running two codegen rounds. + +; RUN: split-file %s %t + +; Verify each outlining instance is singleton with the global outlining for thinlto. +; They will be identical, which can be folded by the linker with ICF. +; RUN: opt -module-summary %t/thin-one.ll -o %t/thin-one.bc +; RUN: opt -module-summary %t/thin-two.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1 +; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: b + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2 +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b + +; Now add a lto module to the above thinlto modules. +; Verify the lto module is optimized independent of the global outlining for thinlto. +; RUN: opt %t/lto.ll -o %t/lto.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc %t/lto.bc -o %t/out \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; lto.ll will have one outlining instance within the lto module itself (no global outlining). +; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0 +; LTO-0: _OUTLINED_FUNCTION{{.*}}>: +; LTO-0-NEXT: mov +; LTO-0-NEXT: b + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1 + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/out.2 | FileCheck %s --check-prefix=THINLTO-2 + +;--- thin-one.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- lto.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f4() minsize { + %1 = call i32 @g(i32 10, i32 30, i32 2); + ret i32 %1 +} +define i32 @f5() minsize { + %1 = call i32 @g(i32 20, i32 40, i32 2); + ret i32 %1 +} +define i32 @f6() minsize { + %1 = call i32 @g(i32 50, i32 60, i32 2); + ret i32 %1 +} diff --git a/llvm/test/ThinLTO/AArch64/lit.local.cfg b/llvm/test/ThinLTO/AArch64/lit.local.cfg new file mode 100644 index 00000000000000..10d4a0e953ed47 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AArch64" in config.root.targets: + config.unsupported = True >From 011d4c112bebc1a93fa31e40b2ee5ccb3b785077 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Tue, 17 Sep 2024 18:07:49 -0700 Subject: [PATCH 3/4] Address comments from ellishg --- llvm/include/llvm/CGData/CodeGenData.h | 7 ++++--- llvm/include/llvm/LTO/LTOBackend.h | 3 ++- llvm/lib/CGData/CodeGenData.cpp | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 1e1afe99327650..72b52e6e9b8fd1 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -164,13 +164,14 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } -/// Initialize the two-codegen rounds. void initializeTwoCodegenRounds(); -/// Save the current module before the first codegen round. +/// Save \p TheModule before the first codegen round. +/// \p Task represents the partition number in the parallel code generation +/// process. void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); -/// Load the current module before the second codegen round. +/// Load the optimized module before the second codegen round. std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, unsigned Task, LLVMContext &Context); diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h index 8516398510d4b8..098c0491dfe70a 100644 --- a/llvm/include/llvm/LTO/LTOBackend.h +++ b/llvm/include/llvm/LTO/LTOBackend.h @@ -50,7 +50,8 @@ Error backend(const Config &C, AddStreamFn AddStream, /// already been mapped to memory and the corresponding BitcodeModule objects /// are saved in the ModuleMap. If \p ModuleMap is nullptr, module files will /// be mapped to memory on demand and at any given time during importing, only -/// one source module will be kept open at the most. +/// one source module will be kept open at the most. If \p CodeGenOnly is true, +/// the backend will skip optimization and only perform code generation. Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream, Module &M, const ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index ff8e5dd7c75790..58b92b7262957a 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -225,7 +225,9 @@ void warn(Error E, StringRef Whence) { } static std::string getPath(StringRef Dir, unsigned Task) { - return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str(); + llvm::SmallString<128> Path(Dir); + llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc"); + return std::string(Path); } void initializeTwoCodegenRounds() { >From e402d60c6206c585495123dd327b2a5ab85982b4 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Tue, 17 Sep 2024 23:37:51 -0700 Subject: [PATCH 4/4] Address comments from NuriAmari --- llvm/lib/CGData/CodeGenData.cpp | 4 ++-- llvm/lib/LTO/LTO.cpp | 33 +++++++++++++++++++++------------ llvm/lib/LTO/LTOBackend.cpp | 2 +- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 58b92b7262957a..4e21045a67cba6 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -245,7 +245,7 @@ void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { if (EC) report_fatal_error(Twine("Failed to open ") + Path + " to save optimized bitcode: " + EC.message()); - WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); + WriteBitcodeToFile(TheModule, OS, /*ShouldPreserveUseListOrder=*/true); } std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, @@ -259,7 +259,7 @@ std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, " to load optimized bitcode: " + EC.message()); std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); - auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context); + auto RestoredModule = parseBitcodeFile(*FileBuffer, Context); if (!RestoredModule) report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + Path + "\n"); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 945f8c859365ea..b51b908fb28760 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1563,11 +1563,14 @@ class InProcessThinBackend : public ThinBackendProc { } }; -/// This Backend will run ThinBackend process but throw away all the output from -/// the codegen. This class facilitates the first codegen round. -class NoOutputThinBackend : public InProcessThinBackend { +/// This backend is utilized in the first round of a two-codegen round process. +/// It first saves optimized bitcode files to disk before the codegen process +/// begins. After codegen, it stores the resulting object files in a scratch +/// buffer. Note the codegen data stored in the scratch buffer will be extracted +/// and merged in the subsequent step. +class FirstRoundThinBackend : public InProcessThinBackend { public: - NoOutputThinBackend( + FirstRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, @@ -1579,25 +1582,31 @@ class NoOutputThinBackend : public InProcessThinBackend { return std::make_unique<CachedFileStream>( std::make_unique<raw_svector_ostream>((*Allocation)[Task])); }, - FileCache(), nullptr, false, false), + FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), Scratch(std::move(Scratch)) {} /// Scratch space for writing output during the codegen. std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; }; -/// This Backend performs codegen on bitcode that was previously saved after -/// going through optimization. This class facilitates the second codegen round. -class OptimizedBitcodeThinBackend : public InProcessThinBackend { +/// This backend operates in the second round of a two-codegen round process. +/// It starts by reading the optimized bitcode files that were saved during the +/// first round. The backend then executes the codegen only to further optimize +/// the code, utilizing the codegen data merged from the first round. Finally, +/// it writes the resulting object files as usual. +class SecondRoundThinBackend : public InProcessThinBackend { public: - OptimizedBitcodeThinBackend( + SecondRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, AddStreamFn AddStream) : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, AddStream, FileCache(), - nullptr, false, false) {} + /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false) {} virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, @@ -1956,7 +1965,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // Create a scratch output to hold intermediate results. auto Outputs = std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); - auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>( + auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), ModuleToDefinedGVSummaries, std::move(Outputs)); // First round: Run optimization and code generation with a scratch output. @@ -1970,7 +1979,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // Second round: Run code generation by reading IRs. std::unique_ptr<ThinBackendProc> SecondRoundLTO = - std::make_unique<OptimizedBitcodeThinBackend>( + std::make_unique<SecondRoundThinBackend>( Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), ModuleToDefinedGVSummaries, AddStream); Error E = RunBackends(SecondRoundLTO.get()); diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index d198e8e5102009..cf69f4add53a79 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -611,7 +611,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, // Save the current module before the first codegen round. // Note that the second codegen round runs only `codegen()` without // running `opt()`. We're not reaching here as it's bailed out earlier - // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`. + // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`. if (CodeGenDataThinLTOTwoRounds) cgdata::saveModuleForTwoRounds(Mod, Task); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits