Author: Kyungwoo Lee Date: 2024-10-09T15:37:41-07:00 New Revision: dc85d5263ed5e416cb4ddf405611472f4ef12fd3
URL: https://github.com/llvm/llvm-project/commit/dc85d5263ed5e416cb4ddf405611472f4ef12fd3 DIFF: https://github.com/llvm/llvm-project/commit/dc85d5263ed5e416cb4ddf405611472f4ef12fd3.diff LOG: [CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds (#90933) This feature is enabled by `-codegen-data-thinlto-two-rounds`, which effectively runs the `-codegen-data-generate` and `-codegen-data-use` in two rounds to enable global outlining with ThinLTO. 1. The first round: Run both optimization + codegen with a scratch output. Before running codegen, we serialize the optimized bitcode modules to a temporary path. 2. From the scratch object files, we merge them into the codegen data. 3. The second round: Read the optimized bitcode modules and start the codegen only this time. Using the codegen data, the machine outliner effectively performs the global outlining. Depends on #90934, #110461 and #110463. This is a patch for https://discourse.llvm.org/t/rfc-enhanced-machine-outliner-part-2-thinlto-nolto/78753. 
Added: llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll llvm/test/ThinLTO/AArch64/lit.local.cfg Modified: clang/lib/CodeGen/BackendUtil.cpp llvm/include/llvm/CGData/CodeGenData.h llvm/include/llvm/CGData/CodeGenDataReader.h llvm/include/llvm/LTO/LTO.h llvm/include/llvm/LTO/LTOBackend.h llvm/lib/CGData/CMakeLists.txt llvm/lib/CGData/CodeGenData.cpp llvm/lib/CGData/CodeGenDataReader.cpp llvm/lib/LTO/CMakeLists.txt llvm/lib/LTO/LTO.cpp llvm/lib/LTO/LTOBackend.cpp Removed: ################################################################################ diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index abc936f2c686dd..f018130807519d 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1321,10 +1321,11 @@ static void runThinLTOBackend( Conf.CGFileType = getCodeGenFileType(Action); break; } - if (Error E = thinBackend( - Conf, -1, AddStream, *M, *CombinedIndex, ImportList, - ModuleToDefinedGVSummaries[M->getModuleIdentifier()], - /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) { + if (Error E = + thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList, + ModuleToDefinedGVSummaries[M->getModuleIdentifier()], + /*ModuleMap=*/nullptr, Conf.CodeGenOnly, + /*IRAddStream=*/nullptr, CGOpts.CmdArgs)) { handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) { errs() << "Error running ThinLTO backend: " << EIB.message() << '\n'; }); diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 84133a433170fe..53550beeae1f83 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -15,11 +15,13 @@ #define LLVM_CGDATA_CODEGENDATA_H #include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/StableHashing.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/CGData/OutlinedHashTree.h" #include "llvm/CGData/OutlinedHashTreeRecord.h" #include "llvm/IR/Module.h" 
#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include <mutex> @@ -164,6 +166,74 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } +struct StreamCacheData { + /// Backing buffer for serialized data stream. + SmallVector<SmallString<0>> Outputs; + /// Callback function to add serialized data to the stream. + AddStreamFn AddStream; + /// Backing buffer for cached data. + SmallVector<std::unique_ptr<MemoryBuffer>> Files; + /// Cache mechanism for storing data. + FileCache Cache; + + StreamCacheData(unsigned Size, const FileCache &OrigCache, + const Twine &CachePrefix) + : Outputs(Size), Files(Size) { + AddStream = [&](size_t Task, const Twine &ModuleName) { + return std::make_unique<CachedFileStream>( + std::make_unique<raw_svector_ostream>(Outputs[Task])); + }; + + if (OrigCache.isValid()) { + auto CGCacheOrErr = + localCache("ThinLTO", CachePrefix, OrigCache.getCacheDirectoryPath(), + [&](size_t Task, const Twine &ModuleName, + std::unique_ptr<MemoryBuffer> MB) { + Files[Task] = std::move(MB); + }); + if (Error Err = CGCacheOrErr.takeError()) + report_fatal_error(std::move(Err)); + Cache = std::move(*CGCacheOrErr); + } + } + StreamCacheData() = delete; + + /// Retrieve results from either the cache or the stream. + std::unique_ptr<SmallVector<StringRef>> getResult() { + unsigned NumOutputs = Outputs.size(); + auto Result = std::make_unique<SmallVector<StringRef>>(NumOutputs); + for (unsigned I = 0; I < NumOutputs; ++I) + if (Files[I]) + (*Result)[I] = Files[I]->getBuffer(); + else + (*Result)[I] = Outputs[I]; + return Result; + } +}; + +/// Save \p TheModule before the first codegen round. +/// \p Task represents the partition number in the parallel code generation +/// process. \p AddStream is the callback used to add the serialized module to +/// the stream. 
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task, + AddStreamFn AddStream); + +/// Load the optimized bitcode module for the second codegen round. +/// \p OrigModule is the original bitcode module. +/// \p Task identifies the partition number in the parallel code generation +/// process. \p Context provides the environment settings for module operations. +/// \p IRFiles contains optimized bitcode module files needed for loading. +/// \return A unique_ptr to the loaded Module, or nullptr if loading fails. +std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, + unsigned Task, + LLVMContext &Context, + ArrayRef<StringRef> IRFiles); + +/// Merge the codegen data from the scratch objects \p ObjectFiles from the +/// first codegen round. +/// \return the combined hash of the merged codegen data. +Expected<stable_hash> mergeCodeGenData(ArrayRef<StringRef> ObjectFiles); + void warn(Error E, StringRef Whence = ""); void warn(Twine Message, std::string Whence = "", std::string Hint = ""); diff --git a/llvm/include/llvm/CGData/CodeGenDataReader.h b/llvm/include/llvm/CGData/CodeGenDataReader.h index 1ee4bfbe480233..7e4882df2116e2 100644 --- a/llvm/include/llvm/CGData/CodeGenDataReader.h +++ b/llvm/include/llvm/CGData/CodeGenDataReader.h @@ -54,8 +54,11 @@ class CodeGenDataReader { /// Extract the cgdata embedded in sections from the given object file and /// merge them into the GlobalOutlineRecord. This is a static helper that /// is used by `llvm-cgdata --merge` or ThinLTO's two-codegen rounds. + /// Optionally, \p CombinedHash can be used to compute the combined hash of + /// the merged data. static Error mergeFromObjectFile(const object::ObjectFile *Obj, - OutlinedHashTreeRecord &GlobalOutlineRecord); + OutlinedHashTreeRecord &GlobalOutlineRecord, + stable_hash *CombinedHash = nullptr); protected: /// The outlined hash tree that has been read. 
When it's released by diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 66d8ca63a206f6..242a05f7d32c02 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -75,6 +75,9 @@ std::string computeLTOCacheKey( const DenseSet<GlobalValue::GUID> &CfiFunctionDefs = {}, const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {}); +/// Recomputes the LTO cache key for a given key with an extra identifier. +std::string recomputeLTOCacheKey(const std::string &Key, StringRef ExtraID); + namespace lto { StringLiteral getThinLTODefaultCPU(const Triple &TheTriple); diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h index 098c0491dfe70a..2769e58f249053 100644 --- a/llvm/include/llvm/LTO/LTOBackend.h +++ b/llvm/include/llvm/LTO/LTOBackend.h @@ -51,13 +51,15 @@ Error backend(const Config &C, AddStreamFn AddStream, /// are saved in the ModuleMap. If \p ModuleMap is nullptr, module files will /// be mapped to memory on demand and at any given time during importing, only /// one source module will be kept open at the most. If \p CodeGenOnly is true, -/// the backend will skip optimization and only perform code generation. +/// the backend will skip optimization and only perform code generation. If +/// \p IRAddStream is not nullptr, it will be called just before code generation +/// to serialize the optimized IR. 
Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream, Module &M, const ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, - bool CodeGenOnly, + bool CodeGenOnly, AddStreamFn IRAddStream = nullptr, const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>()); Error finalizeOptimizationRemarks( diff --git a/llvm/lib/CGData/CMakeLists.txt b/llvm/lib/CGData/CMakeLists.txt index ff1aab920e7a8c..157b0dfb7f9fcf 100644 --- a/llvm/lib/CGData/CMakeLists.txt +++ b/llvm/lib/CGData/CMakeLists.txt @@ -12,6 +12,8 @@ add_llvm_component_library(LLVMCGData intrinsics_gen LINK_COMPONENTS + BitReader + BitWriter Core Support Object diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 55d2504231c744..c56a8b77a52319 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -15,8 +15,10 @@ #include "llvm/CGData/CodeGenDataReader.h" #include "llvm/CGData/OutlinedHashTreeRecord.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" #include "llvm/Support/WithColor.h" #define DEBUG_TYPE "cg-data" @@ -30,6 +32,11 @@ cl::opt<bool> cl::opt<std::string> CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden, cl::desc("File path to where .cgdata file is read")); +cl::opt<bool> CodeGenDataThinLTOTwoRounds( + "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden, + cl::desc("Enable two-round ThinLTO code generation. 
The first round " + "emits codegen data, while the second round uses the emitted " + "codegen data for further optimizations.")); static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { @@ -139,7 +146,7 @@ CodeGenData &CodeGenData::getInstance() { std::call_once(CodeGenData::OnceFlag, []() { Instance = std::unique_ptr<CodeGenData>(new CodeGenData()); - if (CodeGenDataGenerate) + if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds) Instance->EmitCGData = true; else if (!CodeGenDataUsePath.empty()) { // Initialize the global CGData if the input file name is given. @@ -215,6 +222,64 @@ void warn(Error E, StringRef Whence) { } } +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task, + AddStreamFn AddStream) { + LLVM_DEBUG(dbgs() << "Saving module: " << TheModule.getModuleIdentifier() + << " in Task " << Task << "\n"); + Expected<std::unique_ptr<CachedFileStream>> StreamOrErr = + AddStream(Task, TheModule.getModuleIdentifier()); + if (Error Err = StreamOrErr.takeError()) + report_fatal_error(std::move(Err)); + std::unique_ptr<CachedFileStream> &Stream = *StreamOrErr; + + WriteBitcodeToFile(TheModule, *Stream->OS, + /*ShouldPreserveUseListOrder=*/true); +} + +std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, + unsigned Task, + LLVMContext &Context, + ArrayRef<StringRef> IRFiles) { + LLVM_DEBUG(dbgs() << "Loading module: " << OrigModule.getModuleIdentifier() + << " in Task " << Task << "\n"); + auto FileBuffer = MemoryBuffer::getMemBuffer( + IRFiles[Task], "in-memory IR file", /*RequiresNullTerminator=*/false); + auto RestoredModule = parseBitcodeFile(*FileBuffer, Context); + if (!RestoredModule) + report_fatal_error( + Twine("Failed to parse optimized bitcode loaded for Task: ") + + Twine(Task) + "\n"); + + // Restore the original module identifier. 
+ (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier()); + return std::move(*RestoredModule); +} + +Expected<stable_hash> mergeCodeGenData(ArrayRef<StringRef> ObjFiles) { + OutlinedHashTreeRecord GlobalOutlineRecord; + stable_hash CombinedHash = 0; + for (auto File : ObjFiles) { + if (File.empty()) + continue; + std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer( + File, "in-memory object file", /*RequiresNullTerminator=*/false); + Expected<std::unique_ptr<object::ObjectFile>> BinOrErr = + object::ObjectFile::createObjectFile(Buffer->getMemBufferRef()); + if (!BinOrErr) + return BinOrErr.takeError(); + + std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get(); + if (auto E = CodeGenDataReader::mergeFromObjectFile( + Obj.get(), GlobalOutlineRecord, &CombinedHash)) + return E; + } + + if (!GlobalOutlineRecord.empty()) + cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree)); + + return CombinedHash; +} + } // end namespace cgdata } // end namespace llvm diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp index f7f3a8f42af7e1..2f2481ea60f822 100644 --- a/llvm/lib/CGData/CodeGenDataReader.cpp +++ b/llvm/lib/CGData/CodeGenDataReader.cpp @@ -31,8 +31,8 @@ setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) { } Error CodeGenDataReader::mergeFromObjectFile( - const object::ObjectFile *Obj, - OutlinedHashTreeRecord &GlobalOutlineRecord) { + const object::ObjectFile *Obj, OutlinedHashTreeRecord &GlobalOutlineRecord, + stable_hash *CombinedHash) { Triple TT = Obj->makeTriple(); auto CGOutLineName = getCodeGenDataSectionName(CG_outline, TT.getObjectFormat(), false); @@ -48,6 +48,9 @@ Error CodeGenDataReader::mergeFromObjectFile( auto *EndData = Data + ContentsOrErr->size(); if (*NameOrErr == CGOutLineName) { + if (CombinedHash) + *CombinedHash = + stable_hash_combine(*CombinedHash, xxh3_64bits(*ContentsOrErr)); // In case dealing with an executable that has concatenated 
cgdata, // we want to merge them into a single cgdata. // Although it's not a typical workflow, we support this scenario. diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt index 69ff08e1f374c4..057d73b6349cf1 100644 --- a/llvm/lib/LTO/CMakeLists.txt +++ b/llvm/lib/LTO/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_component_library(LLVMLTO BinaryFormat BitReader BitWriter + CGData CodeGen CodeGenTypes Core diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index e1714b29399298..8e7675f442567a 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -13,6 +13,7 @@ #include "llvm/LTO/LTO.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StableHashing.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -21,6 +22,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/AutoUpgrade.h" @@ -35,6 +37,7 @@ #include "llvm/Linker/IRMover.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Object/IRObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" @@ -70,6 +73,8 @@ static cl::opt<bool> DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden, cl::desc("Dump the SCCs in the ThinLTO index's callgraph")); +extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; + namespace llvm { /// Enable global value internalization in LTO. 
cl::opt<bool> EnableLTOInternalization( @@ -341,6 +346,20 @@ std::string llvm::computeLTOCacheKey( return toHex(Hasher.result()); } +std::string llvm::recomputeLTOCacheKey(const std::string &Key, + StringRef ExtraID) { + SHA1 Hasher; + + auto AddString = [&](StringRef Str) { + Hasher.update(Str); + Hasher.update(ArrayRef<uint8_t>{0}); + }; + AddString(Key); + AddString(ExtraID); + + return toHex(Hasher.result()); +} + static void thinLTOResolvePrevailingGUID( const Config &C, ValueInfo VI, DenseSet<GlobalValueSummary *> &GlobalInvolvedWithAlias, @@ -1398,6 +1417,7 @@ Error ThinBackendProc::emitFiles( namespace { class InProcessThinBackend : public ThinBackendProc { +protected: AddStreamFn AddStream; FileCache Cache; DenseSet<GlobalValue::GUID> CfiFunctionDefs; @@ -1424,7 +1444,7 @@ class InProcessThinBackend : public ThinBackendProc { GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name))); } - Error runThinLTOBackendThread( + virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, @@ -1513,6 +1533,173 @@ class InProcessThinBackend : public ThinBackendProc { return Error::success(); } }; + +/// This backend is utilized in the first round of a two-codegen round process. +/// It first saves optimized bitcode files to disk before the codegen process +/// begins. After codegen, it stores the resulting object files in a scratch +/// buffer. Note the codegen data stored in the scratch buffer will be extracted +/// and merged in the subsequent step. 
+class FirstRoundThinBackend : public InProcessThinBackend { + AddStreamFn IRAddStream; + FileCache IRCache; + +public: + FirstRoundThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + AddStreamFn CGAddStream, FileCache CGCache, AddStreamFn IRAddStream, + FileCache IRCache) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, std::move(CGAddStream), + std::move(CGCache), /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), + IRAddStream(std::move(IRAddStream)), IRCache(std::move(IRCache)) {} + + Error runThinLTOBackendThread( + AddStreamFn CGAddStream, FileCache CGCache, unsigned Task, + BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + auto RunThinBackend = [&](AddStreamFn CGAddStream, + AddStreamFn IRAddStream) { + LTOLLVMContext BackendContext(Conf); + Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext); + if (!MOrErr) + return MOrErr.takeError(); + + return thinBackend(Conf, Task, CGAddStream, **MOrErr, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + Conf.CodeGenOnly, IRAddStream); + }; + + auto ModuleID = BM.getModuleIdentifier(); + // Like InProcessThinBackend, we produce index files as needed for + // FirstRoundThinBackend. However, these files are not generated for + // SecondRoundThinBackend. 
+ if (ShouldEmitIndexFiles) { + if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str())) + return E; + } + + assert((CGCache.isValid() == IRCache.isValid()) && + "Both caches for CG and IR should have matching availability"); + if (!CGCache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) || + all_of(CombinedIndex.getModuleHash(ModuleID), + [](uint32_t V) { return V == 0; })) + // Cache disabled or no entry for this module in the combined index or + // no module hash. + return RunThinBackend(CGAddStream, IRAddStream); + + // Get CGKey for caching object in CGCache. + std::string CGKey = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls); + Expected<AddStreamFn> CacheCGAddStreamOrErr = + CGCache(Task, CGKey, ModuleID); + if (Error Err = CacheCGAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheCGAddStream = *CacheCGAddStreamOrErr; + + // Get IRKey for caching (optimized) IR in IRCache with an extra ID. + std::string IRKey = recomputeLTOCacheKey(CGKey, /*ExtraID=*/"IR"); + Expected<AddStreamFn> CacheIRAddStreamOrErr = + IRCache(Task, IRKey, ModuleID); + if (Error Err = CacheIRAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheIRAddStream = *CacheIRAddStreamOrErr; + + // Ideally, both CG and IR caching should be synchronized. However, in + // practice, their availability may differ due to different expiration + // times. Therefore, if either cache is missing, the backend process is + // triggered. + if (CacheCGAddStream || CacheIRAddStream) { + LLVM_DEBUG(dbgs() << "[FirstRound] Cache Miss for " + << BM.getModuleIdentifier() << "\n"); + return RunThinBackend(CacheCGAddStream ? CacheCGAddStream : CGAddStream, + CacheIRAddStream ? CacheIRAddStream : IRAddStream); + } + + return Error::success(); + } +}; + +/// This backend operates in the second round of a two-codegen round process. 
+/// It starts by reading the optimized bitcode files that were saved during the +/// first round. The backend then executes the codegen only to further optimize +/// the code, utilizing the codegen data merged from the first round. Finally, +/// it writes the resulting object files as usual. +class SecondRoundThinBackend : public InProcessThinBackend { + std::unique_ptr<SmallVector<StringRef>> IRFiles; + stable_hash CombinedCGDataHash; + +public: + SecondRoundThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + AddStreamFn AddStream, FileCache Cache, + std::unique_ptr<SmallVector<StringRef>> IRFiles, + stable_hash CombinedCGDataHash) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, std::move(AddStream), + std::move(Cache), + /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), + IRFiles(std::move(IRFiles)), CombinedCGDataHash(CombinedCGDataHash) {} + + virtual Error runThinLTOBackendThread( + AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, + ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + auto RunThinBackend = [&](AddStreamFn AddStream) { + LTOLLVMContext BackendContext(Conf); + std::unique_ptr<Module> LoadedModule = + cgdata::loadModuleForTwoRounds(BM, Task, BackendContext, *IRFiles); + + return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + /*CodeGenOnly=*/true); + }; + + auto ModuleID = BM.getModuleIdentifier(); + if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) || + 
all_of(CombinedIndex.getModuleHash(ModuleID), + [](uint32_t V) { return V == 0; })) + // Cache disabled or no entry for this module in the combined index or + // no module hash. + return RunThinBackend(AddStream); + + // Get Key for caching the final object file in Cache with the combined + // CGData hash. + std::string Key = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls); + Key = recomputeLTOCacheKey(Key, + /*ExtraID=*/std::to_string(CombinedCGDataHash)); + Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, Key, ModuleID); + if (Error Err = CacheAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheAddStream = *CacheAddStreamOrErr; + + if (CacheAddStream) { + LLVM_DEBUG(dbgs() << "[SecondRound] Cache Miss for " + << BM.getModuleIdentifier() << "\n"); + return RunThinBackend(CacheAddStream); + } + + return Error::success(); + } +}; } // end anonymous namespace ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism, @@ -1855,10 +2042,50 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, return BackendProcess->wait(); }; - std::unique_ptr<ThinBackendProc> BackendProc = - ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - AddStream, Cache); - return RunBackends(BackendProc.get()); + if (!CodeGenDataThinLTOTwoRounds) { + std::unique_ptr<ThinBackendProc> BackendProc = + ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, + AddStream, Cache); + return RunBackends(BackendProc.get()); + } + + // Perform two rounds of code generation for ThinLTO: + // 1. First round: Perform optimization and code generation, outputting to + // temporary scratch objects. + // 2. Merge code generation data extracted from the temporary scratch objects. + // 3. Second round: Execute code generation again using the merged data. 
+ LLVM_DEBUG(dbgs() << "[TwoRounds] Initializing ThinLTO two-codegen rounds\n"); + + unsigned MaxTasks = getMaxTasks(); + auto Parallelism = ThinLTO.Backend.getParallelism(); + // Set up two additional streams and caches for storing temporary scratch + // objects and optimized IRs, using the same cache directory as the original. + cgdata::StreamCacheData CG(MaxTasks, Cache, "CG"), IR(MaxTasks, Cache, "IR"); + + // First round: Execute optimization and code generation, outputting to + // temporary scratch objects. Serialize the optimized IRs before initiating + // code generation. + LLVM_DEBUG(dbgs() << "[TwoRounds] Running the first round of codegen\n"); + auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( + Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, + CG.AddStream, CG.Cache, IR.AddStream, IR.Cache); + if (Error E = RunBackends(FirstRoundLTO.get())) + return E; + + LLVM_DEBUG(dbgs() << "[TwoRounds] Merging codegen data\n"); + auto CombinedHashOrErr = cgdata::mergeCodeGenData(*CG.getResult()); + if (Error E = CombinedHashOrErr.takeError()) + return E; + auto CombinedHash = *CombinedHashOrErr; + LLVM_DEBUG(dbgs() << "[TwoRounds] CGData hash: " << CombinedHash << "\n"); + + // Second round: Read the optimized IRs and execute code generation using the + // merged data. 
+ LLVM_DEBUG(dbgs() << "[TwoRounds] Running the second round of codegen\n"); + auto SecondRoundLTO = std::make_unique<SecondRoundThinBackend>( + Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, + AddStream, Cache, std::move(IR.getResult()), CombinedHash); + return RunBackends(SecondRoundLTO.get()); } Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks( diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 06eeed3e1bc41f..ad332d25d9c082 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/PassManager.h" @@ -565,7 +566,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, - bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) { + bool CodeGenOnly, AddStreamFn IRAddStream, + const std::vector<uint8_t> &CmdArgs) { Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod); if (!TOrErr) return TOrErr.takeError(); @@ -599,11 +601,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, auto OptimizeAndCodegen = [&](Module &Mod, TargetMachine *TM, std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) { + // Perform optimization and code generation for ThinLTO. if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true, /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex, CmdArgs)) return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + // Save the current module before the first codegen round. + // Note that the second codegen round runs only `codegen()` without + // running `opt()`. 
We're not reaching here as it's bailed out earlier + // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`. + if (IRAddStream) + cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream); + codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); }; diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll new file mode 100644 index 00000000000000..e11903bf0f3bf8 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll @@ -0,0 +1,179 @@ +; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat) +; by running two codegen rounds. +; This test also verifies if caches for the two-round codegens are correctly working. + +; REQUIRES: asserts +; RUN: rm -rf %t +; RUN: split-file %s %t + +; 0. Base case without a cache. +; Verify each outlining instance is singleton with the global outlining for thinlto. +; They will be identical, which can be folded by the linker with ICF. 
+; RUN: opt -module-hash -module-summary %t/thin-one.ll -o %t/thin-one.bc +; RUN: opt -module-hash -module-summary %t/thin-two.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1 +; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: b + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2 +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b + +; 1. Run this with a cache for the first time. +; RUN: rm -rf %t.cache +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-cold \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-cold.txt 2>&1 +; RUN: cat %t.log-cold.txt | FileCheck %s --check-prefix=COLD +; diff %t/thinlto.1 %t/thinlto-cold.1 +; diff %t/thinlto.2 %t/thinlto-cold.2 + +; COLD: [FirstRound] Cache Miss for {{.*}}thin-one.bc +; COLD: [FirstRound] Cache Miss for {{.*}}thin-two.bc +; COLD: [SecondRound] Cache Miss for {{.*}}thin-one.bc +; COLD: [SecondRound] Cache Miss for {{.*}}thin-two.bc + +; There are two input bitcode files and each one is operated with 3 caches: +; CG/IR caches for the first round and the second round CG cache. 
+; So the total number of files are 2 * 3 = 6. +; RUN: ls %t.cache | count 6 + +; 2. Without any changes, simply re-running it will hit the cache. +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm.txt 2>&1 +; RUN: cat %t.log-warm.txt | FileCheck %s --check-prefix=WARM +; diff %t/thinlto.1 %t/thinlto-warm.1 +; diff %t/thinlto.2 %t/thinlto-warm.2 + +; WARM-NOT: Cache Miss + +; 3. Assume thin-one.ll has been modified to thin-one-modified.ll. +; The merged CG data remains unchanged as this modification does not affect the hash tree built from thin-two.bc. +; Therefore, both the first and second round runs update only this module. +; RUN: opt -module-hash -module-summary %t/thin-one-modified.ll -o %t/thin-one.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-modified \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-modified.txt 2>&1 +; RUN: cat %t.log-warm-modified.txt | FileCheck %s --check-prefix=WARM-MODIFIED +; diff %t/thinlto.1 %t/thinlto-warm-modified.1 +; diff %t/thinlto.2 %t/thinlto-warm-modified.2 + +; WARM-MODIFIED: [FirstRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-NOT: [FirstRound] Cache Miss for {{.*}}thin-two.bc +; WARM-MODIFIED: [SecondRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-NOT: [SecondRound] Cache Miss for {{.*}}thin-two.bc + +; 4. Additionally, thin-two.ll has been modified to thin-two-modified.ll. +; In this case, the merged CG data, which is global, is updated. 
+; Although the first round run updates only the thin-two.bc module, +; as the module thin-one.bc remains the same as in step 3 above, +; the second round run will update all modules, resulting in different binaries. +; RUN: opt -module-hash -module-summary %t/thin-one-modified.ll -o %t/thin-one.bc +; RUN: opt -module-hash -module-summary %t/thin-two-modified.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-modified-all \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-modified-all.txt 2>&1 +; RUN: cat %t.log-warm-modified-all.txt | FileCheck %s --check-prefix=WARM-MODIFIED-ALL +; RUN: not diff %t/thinlto.1 %t/thinlto-warm-modified-all.1 +; RUN: not diff %t/thinlto.2 %t/thinlto-warm-modified-all.2 + +; WARM-MODIFIED-ALL-NOT: [FirstRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-ALL: [FirstRound] Cache Miss for {{.*}}thin-two.bc +; WARM-MODIFIED-ALL: [SecondRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-ALL: [SecondRound] Cache Miss for {{.*}}thin-two.bc + +; thin-one-modified.ll won't be outlined. 
+; RUN: llvm-objdump -d %t/thinlto-warm-modified-all.1 | FileCheck %s --check-prefix=THINLTO-1-MODIFIED-ALL +; THINLTO-1-MODIFIED-ALL-NOT: _OUTLINED_FUNCTION{{.*}}>: + +; thin-two-modified.ll will have two (longer) outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto-warm-modified-all.2| FileCheck %s --check-prefix=THINLTO-2-MODIFIED-ALL +; THINLTO-2-MODIFIED-ALL: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: b +; THINLTO-2-MODIFIED-ALL: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: b + +; 5. Re-running it will hit the cache. +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-again \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-again.txt 2>&1 +; RUN: cat %t.log-warm-again.txt | FileCheck %s --check-prefix=WARM-AGAIN +; RUN: diff %t/thinlto-warm-modified-all.1 %t/thinlto-warm-again.1 +; RUN: diff %t/thinlto-warm-modified-all.2 %t/thinlto-warm-again.2 + +; WARM-AGAIN-NOT: Cache Miss + +;--- thin-one.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-one-modified.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 31, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define 
i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two-modified.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll new file mode 100644 index 00000000000000..ada1c6bb9a8421 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll @@ -0,0 +1,105 @@ +; This test checks if we can outline a singleton instance (i.e., an instance that +; does not repeat) through two codegen rounds. The first round identifies a local +; outlining instance within thin-two.ll, which is then encoded in the resulting +; object file and merged into the codegen data summary. +; The second round utilizes the merged codegen data to optimistically outline a +; singleton instance in thin-one.ll. +; Note that this global outlining creates a unique instance for each sequence +; without directly sharing identical functions for correctness. +; Actual code size reductions occur at link time through identical code folding. +; When both thinlto and lto modules are compiled, the lto module is processed +; independently, without relying on the merged codegen data. In this case, +; the identical code sequences are directly replaced by a common outlined function. + +; RUN: split-file %s %t + +; Verify each outlining instance is singleton with the global outlining for thinlto. +; They will be identical, which can be folded by the linker with ICF. 
+; RUN: opt -module-summary %t/thin-one.ll -o %t/thin-one.bc +; RUN: opt -module-summary %t/thin-two.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; thin-one.ll will have one outlining instance itself (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1 +; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: b + +; thin-two.ll will have two respective outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2 +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b + +; Now add a lto module to the above thinlto modules. +; Verify the lto module is optimized independent of the global outlining for thinlto. +; RUN: opt %t/lto.ll -o %t/lto.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc %t/lto.bc -o %t/out \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; lto.ll will have one shared outlining instance within the lto module itself (no global outlining). 
+; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0 +; LTO-0: _OUTLINED_FUNCTION{{.*}}>: +; LTO-0-NEXT: mov +; LTO-0-NEXT: b +; LTO-0-NOT: _OUTLINED_FUNCTION{{.*}}>: + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1 + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/out.2 | FileCheck %s --check-prefix=THINLTO-2 + +;--- thin-one.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- lto.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f4() minsize { + %1 = call i32 @g(i32 10, i32 30, i32 2); + ret i32 %1 +} +define i32 @f5() minsize { + %1 = call i32 @g(i32 20, i32 40, i32 2); + ret i32 %1 +} +define i32 @f6() minsize { + %1 = call i32 @g(i32 50, i32 60, i32 2); + ret i32 %1 +} diff --git a/llvm/test/ThinLTO/AArch64/lit.local.cfg b/llvm/test/ThinLTO/AArch64/lit.local.cfg new file mode 100644 index 00000000000000..10d4a0e953ed47 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AArch64" in config.root.targets: + config.unsupported = True _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits