Author: Dmitry Polukhin Date: 2020-04-17T06:17:33-07:00 New Revision: a7afb211dc460bd4cfb2542ad1f9b05876b57ba1
URL: https://github.com/llvm/llvm-project/commit/a7afb211dc460bd4cfb2542ad1f9b05876b57ba1 DIFF: https://github.com/llvm/llvm-project/commit/a7afb211dc460bd4cfb2542ad1f9b05876b57ba1.diff LOG: [clang][AST] Support AST files larger than 512M Summary: Clang uses 32-bit integers for storing bit offsets from the beginning of the file, which results in a 512M limit on the AST file size. This diff replaces absolute offsets with offsets relative to the beginning of the corresponding data structure where possible. It also uses 64-bit offsets for DeclOffsets and TypeOffsets because the corresponding AST section may easily exceed 512M on its own. This diff breaks AST file format compatibility, so VERSION_MAJOR is bumped. Test Plan: Existing clang AST serialization tests Tested on clangd with ~700M and ~900M preamble files check-clang with ubsan Reviewers: rsmith, dexonsmith Subscribers: ilya-biryukov, kadircet, usaxena95, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D76594 Added: Modified: clang/include/clang/Serialization/ASTBitCodes.h clang/include/clang/Serialization/ASTReader.h clang/include/clang/Serialization/ASTWriter.h clang/include/clang/Serialization/ModuleFile.h clang/lib/Serialization/ASTReader.cpp clang/lib/Serialization/ASTReaderDecl.cpp clang/lib/Serialization/ASTWriter.cpp clang/lib/Serialization/ASTWriterDecl.cpp Removed: ################################################################################ diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 323edfbf8126..d5a27f487fa9 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -41,7 +41,7 @@ namespace serialization { /// Version 4 of AST files also requires that the version control branch and /// revision match exactly, since there is no backward compatibility of /// AST files at this time. 
- const unsigned VERSION_MAJOR = 9; + const unsigned VERSION_MAJOR = 10; /// AST file minor version number supported by this version of /// Clang. @@ -181,7 +181,7 @@ namespace serialization { /// Raw source location of end of range. unsigned End; - /// Offset in the AST file. + /// Offset in the AST file relative to ModuleFile::MacroOffsetsBase. uint32_t BitOffset; PPEntityOffset(SourceRange R, uint32_t BitOffset) @@ -216,17 +216,41 @@ namespace serialization { } }; - /// Source range/offset of a preprocessed entity. + /// Offset in the AST file. Use splitted 64-bit integer into low/high + /// parts to keep structure alignment 32-bit (it is important because + /// blobs in bitstream are 32-bit aligned). This structure is serialized + /// "as is" to the AST file. + struct UnderalignedInt64 { + uint32_t BitOffsetLow = 0; + uint32_t BitOffsetHigh = 0; + + UnderalignedInt64() = default; + UnderalignedInt64(uint64_t BitOffset) { setBitOffset(BitOffset); } + + void setBitOffset(uint64_t Offset) { + BitOffsetLow = Offset; + BitOffsetHigh = Offset >> 32; + } + + uint64_t getBitOffset() const { + return BitOffsetLow | (uint64_t(BitOffsetHigh) << 32); + } + }; + + /// Source location and bit offset of a declaration. struct DeclOffset { /// Raw source location. unsigned Loc = 0; - /// Offset in the AST file. - uint32_t BitOffset = 0; + /// Offset in the AST file. Keep structure alignment 32-bit and avoid + /// padding gap because undefined value in the padding affects AST hash. 
+ UnderalignedInt64 BitOffset; DeclOffset() = default; - DeclOffset(SourceLocation Loc, uint32_t BitOffset) - : Loc(Loc.getRawEncoding()), BitOffset(BitOffset) {} + DeclOffset(SourceLocation Loc, uint64_t BitOffset) { + setLocation(Loc); + setBitOffset(BitOffset); + } void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); @@ -235,6 +259,14 @@ namespace serialization { SourceLocation getLocation() const { return SourceLocation::getFromRawEncoding(Loc); } + + void setBitOffset(uint64_t Offset) { + BitOffset.setBitOffset(Offset); + } + + uint64_t getBitOffset() const { + return BitOffset.getBitOffset(); + } }; /// The number of predefined preprocessed entity IDs. diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 94645fff9f93..11a537fad5d5 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -723,9 +723,10 @@ class ASTReader struct PendingMacroInfo { ModuleFile *M; - uint64_t MacroDirectivesOffset; + /// Offset relative to ModuleFile::MacroOffsetsBase. + uint32_t MacroDirectivesOffset; - PendingMacroInfo(ModuleFile *M, uint64_t MacroDirectivesOffset) + PendingMacroInfo(ModuleFile *M, uint32_t MacroDirectivesOffset) : M(M), MacroDirectivesOffset(MacroDirectivesOffset) {} }; @@ -2205,7 +2206,7 @@ class ASTReader /// \param MacroDirectivesOffset Offset of the serialized macro directive /// history. void addPendingMacro(IdentifierInfo *II, ModuleFile *M, - uint64_t MacroDirectivesOffset); + uint32_t MacroDirectivesOffset); /// Read the set of macros defined by this external macro source. 
void ReadDefinedMacros() override; diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index c0a943adf2c7..8dc4889e3ae8 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -243,7 +243,7 @@ class ASTWriter : public ASTDeserializationListener, /// Offset of each type in the bitstream, indexed by /// the type's ID. - std::vector<uint32_t> TypeOffsets; + std::vector<serialization::UnderalignedInt64> TypeOffsets; /// The first ID number we can use for our own identifiers. serialization::IdentID FirstIdentID = serialization::NUM_PREDEF_IDENT_IDS; @@ -277,7 +277,8 @@ class ASTWriter : public ASTDeserializationListener, /// The macro infos to emit. std::vector<MacroInfoToEmitData> MacroInfosToEmit; - llvm::DenseMap<const IdentifierInfo *, uint64_t> IdentMacroDirectivesOffsetMap; + llvm::DenseMap<const IdentifierInfo *, uint32_t> + IdentMacroDirectivesOffsetMap; /// @name FlushStmt Caches /// @{ @@ -464,7 +465,8 @@ class ASTWriter : public ASTDeserializationListener, const Preprocessor &PP); void WritePreprocessor(const Preprocessor &PP, bool IsModule); void WriteHeaderSearch(const HeaderSearch &HS); - void WritePreprocessorDetail(PreprocessingRecord &PPRec); + void WritePreprocessorDetail(PreprocessingRecord &PPRec, + uint64_t MacroOffsetsBase); void WriteSubmodules(Module *WritingModule); void WritePragmaDiagnosticMappings(const DiagnosticsEngine &Diag, @@ -588,7 +590,7 @@ class ASTWriter : public ASTDeserializationListener, /// Determine the ID of an already-emitted macro. serialization::MacroID getMacroID(MacroInfo *MI); - uint64_t getMacroDirectivesOffset(const IdentifierInfo *Name); + uint32_t getMacroDirectivesOffset(const IdentifierInfo *Name); /// Emit a reference to a type. 
void AddTypeRef(QualType T, RecordDataImpl &Record); diff --git a/clang/include/clang/Serialization/ModuleFile.h b/clang/include/clang/Serialization/ModuleFile.h index 90d2745e080c..0cbfb2a14cd6 100644 --- a/clang/include/clang/Serialization/ModuleFile.h +++ b/clang/include/clang/Serialization/ModuleFile.h @@ -251,6 +251,10 @@ class ModuleFile { /// The base offset in the source manager's view of this module. unsigned SLocEntryBaseOffset = 0; + /// Base file offset for the offsets in SLocEntryOffsets. Real file offset + /// for the entry is SLocEntryOffsetsBase + SLocEntryOffsets[i]. + uint64_t SLocEntryOffsetsBase = 0; + /// Offsets for all of the source location entries in the /// AST file. const uint32_t *SLocEntryOffsets = nullptr; @@ -302,6 +306,10 @@ class ModuleFile { /// The number of macros in this AST file. unsigned LocalNumMacros = 0; + /// Base file offset for the offsets in MacroOffsets. Real file offset for + /// the entry is MacroOffsetsBase + MacroOffsets[i]. + uint64_t MacroOffsetsBase = 0; + /// Offsets of macros in the preprocessor block. /// /// This array is indexed by the macro ID (-1), and provides @@ -450,7 +458,7 @@ class ModuleFile { /// Offset of each type within the bitstream, indexed by the /// type ID, or the representation of a Type*. - const uint32_t *TypeOffsets = nullptr; + const UnderalignedInt64 *TypeOffsets = nullptr; /// Base type ID for types local to this module as represented in /// the global type ID space. 
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 7f114c069586..f0e9bbd4dcea 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1470,6 +1470,7 @@ bool ASTReader::ReadSLocEntry(int ID) { ModuleFile *F = GlobalSLocEntryMap.find(-ID)->second; if (llvm::Error Err = F->SLocEntryCursor.JumpToBit( + F->SLocEntryOffsetsBase + F->SLocEntryOffsets[ID - F->SLocEntryBaseID])) { Error(std::move(Err)); return true; @@ -1932,9 +1933,8 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d, return HFI; } -void ASTReader::addPendingMacro(IdentifierInfo *II, - ModuleFile *M, - uint64_t MacroDirectivesOffset) { +void ASTReader::addPendingMacro(IdentifierInfo *II, ModuleFile *M, + uint32_t MacroDirectivesOffset) { assert(NumCurrentElementsDeserializing > 0 &&"Missing deserialization guard"); PendingMacroIDs[II].push_back(PendingMacroInfo(M, MacroDirectivesOffset)); } @@ -2099,7 +2099,8 @@ void ASTReader::resolvePendingMacro(IdentifierInfo *II, BitstreamCursor &Cursor = M.MacroCursor; SavedStreamPosition SavedPosition(Cursor); - if (llvm::Error Err = Cursor.JumpToBit(PMInfo.MacroDirectivesOffset)) { + if (llvm::Error Err = + Cursor.JumpToBit(M.MacroOffsetsBase + PMInfo.MacroDirectivesOffset)) { Error(std::move(Err)); return; } @@ -3098,7 +3099,7 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) { Error("duplicate TYPE_OFFSET record in AST file"); return Failure; } - F.TypeOffsets = (const uint32_t *)Blob.data(); + F.TypeOffsets = reinterpret_cast<const UnderalignedInt64 *>(Blob.data()); F.LocalNumTypes = Record[0]; unsigned LocalBaseTypeIndex = Record[1]; F.BaseTypeIndex = getTotalNumTypes(); @@ -3376,6 +3377,7 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) { F.SLocEntryOffsets = (const uint32_t *)Blob.data(); F.LocalNumSLocEntries = Record[0]; unsigned SLocSpaceSize = Record[1]; + F.SLocEntryOffsetsBase = Record[2]; 
std::tie(F.SLocEntryBaseID, F.SLocEntryBaseOffset) = SourceMgr.AllocateLoadedSLocEntries(F.LocalNumSLocEntries, SLocSpaceSize); @@ -3694,6 +3696,7 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) { F.MacroOffsets = (const uint32_t *)Blob.data(); F.LocalNumMacros = Record[0]; unsigned LocalBaseMacroID = Record[1]; + F.MacroOffsetsBase = Record[2]; F.BaseMacroID = getTotalNumMacros(); if (F.LocalNumMacros > 0) { @@ -5907,8 +5910,8 @@ PreprocessedEntity *ASTReader::ReadPreprocessedEntity(unsigned Index) { } SavedStreamPosition SavedPosition(M.PreprocessorDetailCursor); - if (llvm::Error Err = - M.PreprocessorDetailCursor.JumpToBit(PPOffs.BitOffset)) { + if (llvm::Error Err = M.PreprocessorDetailCursor.JumpToBit( + M.MacroOffsetsBase + PPOffs.BitOffset)) { Error(std::move(Err)); return nullptr; } @@ -6321,7 +6324,8 @@ ASTReader::RecordLocation ASTReader::TypeCursorForIndex(unsigned Index) { GlobalTypeMapType::iterator I = GlobalTypeMap.find(Index); assert(I != GlobalTypeMap.end() && "Corrupted global type map"); ModuleFile *M = I->second; - return RecordLocation(M, M->TypeOffsets[Index - M->BaseTypeIndex]); + return RecordLocation( + M, M->TypeOffsets[Index - M->BaseTypeIndex].getBitOffset()); } static llvm::Optional<Type::TypeClass> getTypeClassForCode(TypeCode code) { @@ -8427,7 +8431,8 @@ MacroInfo *ASTReader::getMacro(MacroID ID) { assert(I != GlobalMacroMap.end() && "Corrupted global macro map"); ModuleFile *M = I->second; unsigned Index = ID - M->BaseMacroID; - MacrosLoaded[ID] = ReadMacroRecord(*M, M->MacroOffsets[Index]); + MacrosLoaded[ID] = + ReadMacroRecord(*M, M->MacroOffsetsBase + M->MacroOffsets[Index]); if (DeserializationListener) DeserializationListener->MacroRead(ID + NUM_PREDEF_MACRO_IDS, diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index fce4be133220..0a278c7506e1 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ 
-2870,7 +2870,7 @@ ASTReader::DeclCursorForID(DeclID ID, SourceLocation &Loc) { const DeclOffset &DOffs = M->DeclOffsets[ID - M->BaseDeclID - NUM_PREDEF_DECL_IDS]; Loc = TranslateSourceLocation(*M, DOffs.getLocation()); - return RecordLocation(M, DOffs.BitOffset); + return RecordLocation(M, DOffs.getBitOffset()); } ASTReader::RecordLocation ASTReader::getLocalBitOffset(uint64_t GlobalOffset) { diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index de59dd280ba8..c8ce3edda60b 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -1893,6 +1893,7 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr, // Write out the source location entry table. We skip the first // entry, which is always the same dummy entry. std::vector<uint32_t> SLocEntryOffsets; + uint64_t SLocEntryOffsetsBase = Stream.GetCurrentBitNo(); RecordData PreloadSLocs; SLocEntryOffsets.reserve(SourceMgr.local_sloc_entry_size() - 1); for (unsigned I = 1, N = SourceMgr.local_sloc_entry_size(); @@ -1903,7 +1904,9 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr, assert(&SourceMgr.getSLocEntry(FID) == SLoc); // Record the offset of this source-location entry. - SLocEntryOffsets.push_back(Stream.GetCurrentBitNo()); + uint64_t Offset = Stream.GetCurrentBitNo() - SLocEntryOffsetsBase; + assert((Offset >> 32) == 0 && "SLocEntry offset too large"); + SLocEntryOffsets.push_back(Offset); // Figure out which record code to use. 
unsigned Code; @@ -2011,12 +2014,14 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr, Abbrev->Add(BitCodeAbbrevOp(SOURCE_LOCATION_OFFSETS)); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 16)); // # of slocs Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 16)); // total size + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 32)); // base offset Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // offsets unsigned SLocOffsetsAbbrev = Stream.EmitAbbrev(std::move(Abbrev)); { RecordData::value_type Record[] = { SOURCE_LOCATION_OFFSETS, SLocEntryOffsets.size(), - SourceMgr.getNextLocalOffset() - 1 /* skip dummy */}; + SourceMgr.getNextLocalOffset() - 1 /* skip dummy */, + SLocEntryOffsetsBase}; Stream.EmitRecordWithBlob(SLocOffsetsAbbrev, Record, bytes(SLocEntryOffsets)); } @@ -2093,9 +2098,11 @@ static bool shouldIgnoreMacro(MacroDirective *MD, bool IsModule, /// Writes the block containing the serialized form of the /// preprocessor. void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) { + uint64_t MacroOffsetsBase = Stream.GetCurrentBitNo(); + PreprocessingRecord *PPRec = PP.getPreprocessingRecord(); if (PPRec) - WritePreprocessorDetail(*PPRec); + WritePreprocessorDetail(*PPRec, MacroOffsetsBase); RecordData Record; RecordData ModuleMacroRecord; @@ -2156,7 +2163,8 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) { // identifier they belong to. for (const IdentifierInfo *Name : MacroIdentifiers) { MacroDirective *MD = PP.getLocalMacroDirectiveHistory(Name); - auto StartOffset = Stream.GetCurrentBitNo(); + uint64_t StartOffset = Stream.GetCurrentBitNo() - MacroOffsetsBase; + assert((StartOffset >> 32) == 0 && "Macro identifiers offset too large"); // Emit the macro directives in reverse source order. for (; MD; MD = MD->getPrevious()) { @@ -2229,14 +2237,12 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) { // Record the local offset of this macro. 
unsigned Index = ID - FirstMacroID; - if (Index == MacroOffsets.size()) - MacroOffsets.push_back(Stream.GetCurrentBitNo()); - else { - if (Index > MacroOffsets.size()) - MacroOffsets.resize(Index + 1); + if (Index >= MacroOffsets.size()) + MacroOffsets.resize(Index + 1); - MacroOffsets[Index] = Stream.GetCurrentBitNo(); - } + uint64_t Offset = Stream.GetCurrentBitNo() - MacroOffsetsBase; + assert((Offset >> 32) == 0 && "Macro offset too large"); + MacroOffsets[Index] = Offset; AddIdentifierRef(Name, Record); AddSourceLocation(MI->getDefinitionLoc(), Record); @@ -2287,17 +2293,20 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) { Abbrev->Add(BitCodeAbbrevOp(MACRO_OFFSET)); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // # of macros Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // first ID + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 32)); // base offset Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); unsigned MacroOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev)); { RecordData::value_type Record[] = {MACRO_OFFSET, MacroOffsets.size(), - FirstMacroID - NUM_PREDEF_MACRO_IDS}; + FirstMacroID - NUM_PREDEF_MACRO_IDS, + MacroOffsetsBase}; Stream.EmitRecordWithBlob(MacroOffsetAbbrev, Record, bytes(MacroOffsets)); } } -void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec) { +void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec, + uint64_t MacroOffsetsBase) { if (PPRec.local_begin() == PPRec.local_end()) return; @@ -2334,8 +2343,10 @@ void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec) { (void)++E, ++NumPreprocessingRecords, ++NextPreprocessorEntityID) { Record.clear(); + uint64_t Offset = Stream.GetCurrentBitNo() - MacroOffsetsBase; + assert((Offset >> 32) == 0 && "Preprocessed entity offset too large"); PreprocessedEntityOffsets.push_back( - PPEntityOffset((*E)->getSourceRange(), Stream.GetCurrentBitNo())); + PPEntityOffset((*E)->getSourceRange(), Offset)); if 
(auto *MD = dyn_cast<MacroDefinitionRecord>(*E)) { // Record this macro definition's ID. @@ -2808,10 +2819,10 @@ void ASTWriter::WriteType(QualType T) { // Record the offset for this type. unsigned Index = Idx.getIndex() - FirstTypeID; if (TypeOffsets.size() == Index) - TypeOffsets.push_back(Offset); + TypeOffsets.emplace_back(Offset); else if (TypeOffsets.size() < Index) { TypeOffsets.resize(Index + 1); - TypeOffsets[Index] = Offset; + TypeOffsets[Index].setBitOffset(Offset); } else { llvm_unreachable("Types emitted in wrong order"); } @@ -5144,7 +5155,7 @@ MacroID ASTWriter::getMacroID(MacroInfo *MI) { return MacroIDs[MI]; } -uint64_t ASTWriter::getMacroDirectivesOffset(const IdentifierInfo *Name) { +uint32_t ASTWriter::getMacroDirectivesOffset(const IdentifierInfo *Name) { return IdentMacroDirectivesOffsetMap.lookup(Name); } diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index e847180435ec..8c5be6cacac0 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2434,12 +2434,12 @@ void ASTWriter::WriteDecl(ASTContext &Context, Decl *D) { SourceLocation Loc = D->getLocation(); unsigned Index = ID - FirstDeclID; if (DeclOffsets.size() == Index) - DeclOffsets.push_back(DeclOffset(Loc, Offset)); + DeclOffsets.emplace_back(Loc, Offset); else if (DeclOffsets.size() < Index) { // FIXME: Can/should this happen? DeclOffsets.resize(Index+1); DeclOffsets[Index].setLocation(Loc); - DeclOffsets[Index].BitOffset = Offset; + DeclOffsets[Index].setBitOffset(Offset); } else { llvm_unreachable("declarations should be emitted in ID order"); } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits