https://github.com/ilovepi updated https://github.com/llvm/llvm-project/pull/182621
>From 92a078b388681bbd0f785005519b4120ee2007f1 Mon Sep 17 00:00:00 2001 From: Paul Kirth <[email protected]> Date: Thu, 12 Jun 2025 10:50:39 -0700 Subject: [PATCH 1/2] [clang-doc] Improve complexity of Index construction The existing implementation ends up with an O(N^2) algorithm due to repeated linear scans during index construction. Switching to a StringMap allows us to reduce this to O(N), since we no longer need to search the vector. The `BM_Index_Insertion` benchmark measures the time taken to insert N unique records into the index. | Scale (N Items) | Baseline (ns) | Patched (ns) | Speedup | Change | |----------------:|--------------:|-------------:|--------:|-------:| | 10 | 9,977 | 11,004 | 0.91x | +10.3% | | 64 | 69,249 | 69,166 | 1.00x | -0.1% | | 512 | 1,932,714 | 525,877 | 3.68x | -72.8% | | 4,096 | 92,411,535 | 4,589,030 | 20.1x | -95.0% | | 10,000 | 577,384,945 | 12,998,039 | 44.4x | -97.7% | The patch delivers significant improvements to scalability. At 10,000 items, index construction is **~44 times faster**, confirming the complexity reduction from O(N^2) to O(N). The crossover point where the new map-based approach beats the vector-based approach appears to be around N=64. Since the index is typically larger than 64 for files of non trivial complexity, and users will typically be building documentation for an entire project with many files, all normal usage should benefit from this change. Other benchmarks show minor regressions, though in a typical build of LLVM documentation index construction takes up a larger amount of runtime than any of these other components. --- clang-tools-extra/clang-doc/Generators.cpp | 24 ++--- clang-tools-extra/clang-doc/JSONGenerator.cpp | 16 ++-- clang-tools-extra/clang-doc/MDGenerator.cpp | 31 ++++--- .../clang-doc/Representation.cpp | 3 +- clang-tools-extra/clang-doc/Representation.h | 3 +- clang-tools-extra/clang-doc/YAMLGenerator.cpp | 6 +- .../unittests/clang-doc/ClangDocTest.cpp | 4 +- .../unittests/clang-doc/GeneratorTest.cpp | 88 +++++++++++++++---- 8 files changed, 124 insertions(+), 51 deletions(-) diff --git a/clang-tools-extra/clang-doc/Generators.cpp b/clang-tools-extra/clang-doc/Generators.cpp index eca1f288d5ba1..f69d666757b18 100644 --- a/clang-tools-extra/clang-doc/Generators.cpp +++ b/clang-tools-extra/clang-doc/Generators.cpp @@ -212,33 +212,35 @@ void Generator::addInfoToIndex(Index &Idx, const doc::Info *Info) { for (const auto &R : llvm::reverse(Info->Namespace)) { // Look for the current namespace in the children of the index I is // pointing. - auto It = llvm::find(I->Children, R.USR); + auto It = I->Children.find(llvm::toStringRef(R.USR)); if (It != I->Children.end()) { // If it is found, just change I to point the namespace reference found. - I = &*It; + I = &It->second; } else { // If it is not found a new reference is created - I->Children.emplace_back(R.USR, R.Name, R.RefType, R.Path); + auto [NewInfo, success] = I->Children.try_emplace( + llvm::toStringRef(R.USR), R.USR, R.Name, R.RefType, R.Path); // I is updated with the reference of the new namespace reference - I = &I->Children.back(); + if (success) + I = &NewInfo->second; } } // Look for Info in the vector where it is supposed to be; it could already // exist if it is a parent namespace of an Info already passed to this // function. - auto It = llvm::find(I->Children, Info->USR); + auto It = I->Children.find(llvm::toStringRef(Info->USR)); if (It == I->Children.end()) { // If it is not in the vector it is inserted - I->Children.emplace_back(Info->USR, Info->extractName(), Info->IT, - Info->Path); + I->Children.try_emplace(llvm::toStringRef(Info->USR), Info->USR, + Info->extractName(), Info->IT, Info->Path); } else { // If it not in the vector we only check if Path and Name are not empty // because if the Info was included by a namespace it may not have those // values. - if (It->Path.empty()) - It->Path = Info->Path; - if (It->Name.empty()) - It->Name = Info->extractName(); + if (It->second.Path.empty()) + It->second.Path = Info->Path; + if (It->second.Name.empty()) + It->second.Name = Info->extractName(); } } diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp index 9a8f51b808cab..00b862cc5f281 100644 --- a/clang-tools-extra/clang-doc/JSONGenerator.cpp +++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp @@ -805,19 +805,25 @@ static Error serializeIndex(const ClangDocContext &CDCtx, StringRef RootDir) { if (IndexCopy.Children.empty()) { // If the index is empty, default to displaying the global namespace. - IndexCopy.Children.emplace_back(GlobalNamespaceID, "", + IndexCopy.Children.try_emplace(toStringRef(GlobalNamespaceID), GlobalNamespaceID, "", InfoType::IT_namespace, "GlobalNamespace"); } else { IndexArrayRef.reserve(CDCtx.Idx.Children.size()); } - for (auto &Idx : IndexCopy.Children) { - if (Idx.Children.empty()) + llvm::SmallVector<const Index *> Children; + Children.reserve(IndexCopy.Children.size()); + for (const auto &[_, Idx] : IndexCopy.Children) + Children.push_back(&Idx); + llvm::sort(Children, [](const Index *A, const Index *B) { return *A < *B; }); + + for (const auto *Idx : Children) { + if (Idx->Children.empty()) continue; - std::string TypeStr = infoTypeToString(Idx.RefType); + std::string TypeStr = infoTypeToString(Idx->RefType); json::Value IdxVal = Object(); auto &IdxObj = *IdxVal.getAsObject(); - serializeReference(Idx, IdxObj); + serializeReference(*Idx, IdxObj); IndexArrayRef.push_back(IdxVal); } Obj["Index"] = IndexArray; diff --git a/clang-tools-extra/clang-doc/MDGenerator.cpp b/clang-tools-extra/clang-doc/MDGenerator.cpp index 851b4e084ef23..f01a3e4673bcf 100644 --- a/clang-tools-extra/clang-doc/MDGenerator.cpp +++ b/clang-tools-extra/clang-doc/MDGenerator.cpp @@ -314,7 +314,7 @@ static void genMarkdown(const ClangDocContext &CDCtx, const TypedefInfo &I, // TODO support typedefs in markdown. } -static void serializeReference(llvm::raw_fd_ostream &OS, Index &I, int Level) { +static void serializeReference(llvm::raw_fd_ostream &OS, const Index &I, int Level) { // Write out the heading level starting at ## OS << "##" << std::string(Level, '#') << " "; writeNameLink("", I, OS); @@ -338,8 +338,14 @@ static llvm::Error serializeIndex(ClangDocContext &CDCtx) { OS << " for " << CDCtx.ProjectName; OS << "\n\n"; - for (auto C : CDCtx.Idx.Children) - serializeReference(OS, C, 0); + std::vector<const Index *> Children; + Children.reserve(CDCtx.Idx.Children.size()); + for (const auto &[_, C] : CDCtx.Idx.Children) + Children.push_back(&C); + llvm::sort(Children, [](const Index *A, const Index *B) { return *A < *B; }); + + for (const auto *C : Children) + serializeReference(OS, *C, 0); return llvm::Error::success(); } @@ -356,10 +362,15 @@ static llvm::Error genIndex(ClangDocContext &CDCtx) { FileErr.message()); CDCtx.Idx.sort(); OS << "# " << CDCtx.ProjectName << " C/C++ Reference\n\n"; - for (auto C : CDCtx.Idx.Children) { - if (!C.Children.empty()) { + std::vector<const Index *> Children; + Children.reserve(CDCtx.Idx.Children.size()); + for (const auto &[_, C] : CDCtx.Idx.Children) + Children.push_back(&C); + llvm::sort(Children, [](const Index *A, const Index *B) { return *A < *B; }); + for (const auto *C : Children) { + if (!C->Children.empty()) { const char *Type; - switch (C.RefType) { + switch (C->RefType) { case InfoType::IT_namespace: Type = "Namespace"; break; @@ -387,10 +398,10 @@ static llvm::Error genIndex(ClangDocContext &CDCtx) { case InfoType::IT_default: Type = "Other"; } - OS << "* " << Type << ": [" << C.Name << "]("; - if (!C.Path.empty()) - OS << C.Path << "/"; - OS << C.Name << ")\n"; + OS << "* " << Type << ": [" << C->Name << "]("; + if (!C->Path.empty()) + OS << C->Path << "/"; + OS << C->Name << ")\n"; } } return llvm::Error::success(); diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp index a69129041516f..e1b1c96342666 100644 --- a/clang-tools-extra/clang-doc/Representation.cpp +++ b/clang-tools-extra/clang-doc/Representation.cpp @@ -473,8 +473,7 @@ bool Index::operator<(const Index &Other) const { } void Index::sort() { - llvm::sort(Children); - for (auto &C : Children) + for (auto &[_,C] : Children) C.sort(); } diff --git a/clang-tools-extra/clang-doc/Representation.h b/clang-tools-extra/clang-doc/Representation.h index 297e564ea7866..9da65105e48c0 100644 --- a/clang-tools-extra/clang-doc/Representation.h +++ b/clang-tools-extra/clang-doc/Representation.h @@ -18,6 +18,7 @@ #include "clang/Basic/Diagnostic.h" #include "clang/Basic/Specifiers.h" #include "clang/Tooling/Execution.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include <array> #include <optional> @@ -611,7 +612,7 @@ struct Index : public Reference { bool operator<(const Index &Other) const; std::optional<SmallString<16>> JumpToSection; - std::vector<Index> Children; + llvm::StringMap<Index> Children; void sort(); }; diff --git a/clang-tools-extra/clang-doc/YAMLGenerator.cpp b/clang-tools-extra/clang-doc/YAMLGenerator.cpp index ce4ef58e582e5..d81fcafddbc1f 100644 --- a/clang-tools-extra/clang-doc/YAMLGenerator.cpp +++ b/clang-tools-extra/clang-doc/YAMLGenerator.cpp @@ -109,15 +109,15 @@ template <unsigned U> struct ScalarTraits<SmallString<U>> { static QuotingType mustQuote(StringRef) { return QuotingType::Single; } }; -template <> struct ScalarTraits<std::array<unsigned char, 20>> { +template <> struct ScalarTraits<SymbolID> { - static void output(const std::array<unsigned char, 20> &S, void *, + static void output(const SymbolID &S, void *, llvm::raw_ostream &OS) { OS << toHex(toStringRef(S)); } static StringRef input(StringRef Scalar, void *, - std::array<unsigned char, 20> &Value) { + SymbolID &Value) { if (Scalar.size() != 40) return "Error: Incorrect scalar size for USR."; Value = stringToSymbol(Scalar); diff --git a/clang-tools-extra/unittests/clang-doc/ClangDocTest.cpp b/clang-tools-extra/unittests/clang-doc/ClangDocTest.cpp index 73891abd53f99..169dd598f65ef 100644 --- a/clang-tools-extra/unittests/clang-doc/ClangDocTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/ClangDocTest.cpp @@ -247,8 +247,8 @@ void CheckBaseRecordInfo(BaseRecordInfo *Expected, BaseRecordInfo *Actual) { void CheckIndex(Index &Expected, Index &Actual) { CheckReference(Expected, Actual); ASSERT_EQ(Expected.Children.size(), Actual.Children.size()); - for (size_t Idx = 0; Idx < Actual.Children.size(); ++Idx) - CheckIndex(Expected.Children[Idx], Actual.Children[Idx]); + for (auto &[_, C] : Expected.Children) + CheckIndex(C, Actual.Children.find(llvm::toStringRef(C.USR))->second); } } // namespace doc diff --git a/clang-tools-extra/unittests/clang-doc/GeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/GeneratorTest.cpp index e7f68b405284f..b6c4793aef3d9 100644 --- a/clang-tools-extra/unittests/clang-doc/GeneratorTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/GeneratorTest.cpp @@ -46,45 +46,99 @@ TEST(GeneratorTest, emitIndex) { Index ExpectedIdx; Index IndexA; IndexA.Name = "A"; - ExpectedIdx.Children.emplace_back(std::move(IndexA)); + IndexA.USR = serialize::hashUSR("1"); + ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexA.USR), + std::move(IndexA)); Index IndexB; IndexB.Name = "B"; + IndexB.USR = serialize::hashUSR("2"); Index IndexC; IndexC.Name = "C"; - IndexB.Children.emplace_back(std::move(IndexC)); - ExpectedIdx.Children.emplace_back(std::move(IndexB)); + IndexC.USR = serialize::hashUSR("3"); + IndexB.Children.try_emplace(llvm::toStringRef(IndexC.USR), std::move(IndexC)); + ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexB.USR), + std::move(IndexB)); Index IndexD; IndexD.Name = "D"; + IndexD.USR = serialize::hashUSR("4"); Index IndexE; IndexE.Name = "E"; + IndexE.USR = serialize::hashUSR("5"); Index IndexF; IndexF.Name = "F"; - IndexE.Children.emplace_back(std::move(IndexF)); - IndexD.Children.emplace_back(std::move(IndexE)); - ExpectedIdx.Children.emplace_back(std::move(IndexD)); + IndexF.USR = serialize::hashUSR("6"); + IndexE.Children.try_emplace(llvm::toStringRef(IndexF.USR), std::move(IndexF)); + IndexD.Children.try_emplace(llvm::toStringRef(IndexE.USR), std::move(IndexE)); + ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexD.USR), + std::move(IndexD)); Index IndexG; IndexG.Name = "GlobalNamespace"; IndexG.RefType = InfoType::IT_namespace; - ExpectedIdx.Children.emplace_back(std::move(IndexG)); + ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexG.USR), + std::move(IndexG)); CheckIndex(ExpectedIdx, Idx); } TEST(GeneratorTest, sortIndex) { Index Idx; - Idx.Children.emplace_back("b"); - Idx.Children.emplace_back("aA"); - Idx.Children.emplace_back("aa"); - Idx.Children.emplace_back("A"); - Idx.Children.emplace_back("a"); + Index IndexA; + IndexA.Name = "a"; + IndexA.USR = serialize::hashUSR("1"); + Idx.Children.try_emplace(llvm::toStringRef(IndexA.USR), std::move(IndexA)); + + Index IndexB; + IndexB.Name = "A"; + IndexB.USR = serialize::hashUSR("2"); + Idx.Children.try_emplace(llvm::toStringRef(IndexB.USR), std::move(IndexB)); + + Index IndexC; + IndexC.Name = "aa"; + IndexC.USR = serialize::hashUSR("3"); + Idx.Children.try_emplace(llvm::toStringRef(IndexC.USR), std::move(IndexC)); + + Index IndexD; + IndexD.Name = "aA"; + IndexD.USR = serialize::hashUSR("4"); + Idx.Children.try_emplace(llvm::toStringRef(IndexD.USR), std::move(IndexD)); + + Index IndexE; + IndexE.Name = "b"; + IndexE.USR = serialize::hashUSR("5"); + Idx.Children.try_emplace(llvm::toStringRef(IndexE.USR), std::move(IndexE)); + Idx.sort(); Index ExpectedIdx; - ExpectedIdx.Children.emplace_back("a"); - ExpectedIdx.Children.emplace_back("A"); - ExpectedIdx.Children.emplace_back("aa"); - ExpectedIdx.Children.emplace_back("aA"); - ExpectedIdx.Children.emplace_back("b"); + Index IndexAExp; + IndexAExp.Name = "a"; + IndexAExp.USR = serialize::hashUSR("1"); + ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexAExp.USR), + std::move(IndexAExp)); + + Index IndexBExp; + IndexBExp.Name = "A"; + IndexBExp.USR = serialize::hashUSR("2"); + ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexBExp.USR), + std::move(IndexBExp)); + + Index IndexCExp; + IndexCExp.Name = "aa"; + IndexCExp.USR = serialize::hashUSR("3"); + ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexCExp.USR), + std::move(IndexCExp)); + + Index IndexDExp; + IndexDExp.Name = "aA"; + IndexDExp.USR = serialize::hashUSR("4"); + ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexDExp.USR), + std::move(IndexDExp)); + + Index IndexEExp; + IndexEExp.Name = "b"; + IndexEExp.USR = serialize::hashUSR("5"); + ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexEExp.USR), + std::move(IndexEExp)); CheckIndex(ExpectedIdx, Idx); } >From 5936c9136091b1be169a567c1e774e12ffcb8fa6 Mon Sep 17 00:00:00 2001 From: Paul Kirth <[email protected]> Date: Mon, 23 Feb 2026 15:55:06 -0800 Subject: [PATCH 2/2] Format --- clang-tools-extra/clang-doc/JSONGenerator.cpp | 5 +++-- clang-tools-extra/clang-doc/MDGenerator.cpp | 3 ++- clang-tools-extra/clang-doc/Representation.cpp | 2 +- clang-tools-extra/clang-doc/YAMLGenerator.cpp | 6 ++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp index 00b862cc5f281..773dc3a205e1c 100644 --- a/clang-tools-extra/clang-doc/JSONGenerator.cpp +++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp @@ -805,8 +805,9 @@ static Error serializeIndex(const ClangDocContext &CDCtx, StringRef RootDir) { if (IndexCopy.Children.empty()) { // If the index is empty, default to displaying the global namespace. - IndexCopy.Children.try_emplace(toStringRef(GlobalNamespaceID), GlobalNamespaceID, "", - InfoType::IT_namespace, "GlobalNamespace"); + IndexCopy.Children.try_emplace(toStringRef(GlobalNamespaceID), + GlobalNamespaceID, "", + InfoType::IT_namespace, "GlobalNamespace"); } else { IndexArrayRef.reserve(CDCtx.Idx.Children.size()); } diff --git a/clang-tools-extra/clang-doc/MDGenerator.cpp b/clang-tools-extra/clang-doc/MDGenerator.cpp index f01a3e4673bcf..f94690632895c 100644 --- a/clang-tools-extra/clang-doc/MDGenerator.cpp +++ b/clang-tools-extra/clang-doc/MDGenerator.cpp @@ -314,7 +314,8 @@ static void genMarkdown(const ClangDocContext &CDCtx, const TypedefInfo &I, // TODO support typedefs in markdown. } -static void serializeReference(llvm::raw_fd_ostream &OS, const Index &I, int Level) { +static void serializeReference(llvm::raw_fd_ostream &OS, const Index &I, + int Level) { // Write out the heading level starting at ## OS << "##" << std::string(Level, '#') << " "; writeNameLink("", I, OS); diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp index e1b1c96342666..e3ccc43b9072b 100644 --- a/clang-tools-extra/clang-doc/Representation.cpp +++ b/clang-tools-extra/clang-doc/Representation.cpp @@ -473,7 +473,7 @@ bool Index::operator<(const Index &Other) const { } void Index::sort() { - for (auto &[_,C] : Children) + for (auto &[_, C] : Children) C.sort(); } diff --git a/clang-tools-extra/clang-doc/YAMLGenerator.cpp b/clang-tools-extra/clang-doc/YAMLGenerator.cpp index d81fcafddbc1f..6034131f5db35 100644 --- a/clang-tools-extra/clang-doc/YAMLGenerator.cpp +++ b/clang-tools-extra/clang-doc/YAMLGenerator.cpp @@ -111,13 +111,11 @@ template <unsigned U> struct ScalarTraits<SmallString<U>> { template <> struct ScalarTraits<SymbolID> { - static void output(const SymbolID &S, void *, - llvm::raw_ostream &OS) { + static void output(const SymbolID &S, void *, llvm::raw_ostream &OS) { OS << toHex(toStringRef(S)); } - static StringRef input(StringRef Scalar, void *, - SymbolID &Value) { + static StringRef input(StringRef Scalar, void *, SymbolID &Value) { if (Scalar.size() != 40) return "Error: Incorrect scalar size for USR."; Value = stringToSymbol(Scalar); _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
