https://github.com/evelez7 created https://github.com/llvm/llvm-project/pull/155887
None

>From c9e91c7681af8801235ab741b6e68966ecc3690c Mon Sep 17 00:00:00 2001
From: Erick Velez <erickvel...@gmail.com>
Date: Mon, 11 Aug 2025 10:21:35 -0700
Subject: [PATCH] [clang-doc] markdown parser

---
 clang-tools-extra/clang-doc/CMakeLists.txt    |   1 +
 clang-tools-extra/clang-doc/MDParser.cpp      | 300 ++++++++++++++++++
 clang-tools-extra/clang-doc/MDParser.h        | 155 +++++++++
 clang-tools-extra/clang-doc/Representation.h  |   1 +
 .../unittests/clang-doc/CMakeLists.txt        |   1 +
 .../unittests/clang-doc/MDParserTest.cpp      |  70 ++++
 6 files changed, 528 insertions(+)
 create mode 100644 clang-tools-extra/clang-doc/MDParser.cpp
 create mode 100644 clang-tools-extra/clang-doc/MDParser.h
 create mode 100644 clang-tools-extra/unittests/clang-doc/MDParserTest.cpp

diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt
index 5989e5fe60cf3..f86272b3381a5 100644
--- a/clang-tools-extra/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/CMakeLists.txt
@@ -18,6 +18,7 @@ add_clang_library(clangDoc STATIC
   YAMLGenerator.cpp
   HTMLMustacheGenerator.cpp
   JSONGenerator.cpp
+  MDParser.cpp
 
   DEPENDS
   omp_gen
diff --git a/clang-tools-extra/clang-doc/MDParser.cpp b/clang-tools-extra/clang-doc/MDParser.cpp
new file mode 100644
index 0000000000000..80e0386002dcd
--- /dev/null
+++ b/clang-tools-extra/clang-doc/MDParser.cpp
@@ -0,0 +1,300 @@
+//===-- MDParser.cpp - Markdown parser for clang-doc ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MDParser.h"
+#include "clang/Basic/CharInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+#include <new>
+
+namespace clang {
+namespace doc {
+
+static bool isEmphasisDelimiter(char Token) {
+  // TODO: support '_'.
+  return Token == '*';
+}
+
+/// Implements the CommonMark "rule of three": if either run can both open and
+/// close, the pair may only match when the sum of the original run lengths is
+/// not a multiple of three, unless both lengths are.
+static bool violatesRuleOfThree(const LineNode &Opener,
+                                const LineNode &Closer) {
+  const DelimiterContext &Open = *Opener.DelimiterContext;
+  const DelimiterContext &Close = *Closer.DelimiterContext;
+  if (!(Open.CanOpen && Open.CanClose) && !(Close.CanOpen && Close.CanClose))
+    return false;
+  // Content spans the whole original run; Length is only what is still unused.
+  const size_t OpenLen = Opener.Content.size();
+  const size_t CloseLen = Closer.Content.size();
+  return (OpenLen + CloseLen) % 3 == 0 &&
+         !(OpenLen % 3 == 0 && CloseLen % 3 == 0);
+}
+
+/// Writes the HTML for \p Current and everything below it to \p OS.
+static void renderNode(const Node &Current, llvm::raw_ostream &OS) {
+  auto RenderChildren = [&] {
+    for (const Node *Child : Current.Children)
+      renderNode(*Child, OS);
+  };
+  auto RenderElement = [&](llvm::StringRef Tag) {
+    OS << '<' << Tag << '>';
+    RenderChildren();
+    OS << "</" << Tag << '>';
+  };
+
+  switch (Current.Type) {
+  case MDType::Document:
+    RenderChildren();
+    break;
+  case MDType::Paragraph:
+    RenderElement("p");
+    break;
+  case MDType::Emphasis:
+    RenderElement("em");
+    break;
+  case MDType::Strong:
+    RenderElement("strong");
+    break;
+  case MDType::Text:
+    // Comment text is arbitrary; it must not be able to inject markup.
+    llvm::printHTMLEscaped(Current.Content, OS);
+    break;
+  case MDType::Softbreak:
+    OS << '\n';
+    break;
+  }
+}
+
+Node *MarkdownParser::createNode(MDType Type, Node *Parent) {
+  Node *Created = new (NodeArena.Allocate()) Node();
+  Created->Type = Type;
+  Created->Parent = Parent;
+  if (Parent)
+    Parent->Children.push_back(Created);
+  return Created;
+}
+
+std::pair<std::optional<DelimiterContext>, size_t>
+MarkdownParser::processDelimiters(llvm::StringRef Line, size_t Origin) {
+  assert(Origin < Line.size() && "Origin must index a delimiter in Line");
+  const char Delim = Line[Origin];
+  size_t End = Origin;
+  while (End < Line.size() && Line[End] == Delim)
+    ++End;
+
+  // The beginning and the end of the line count as whitespace.
+  const char Preceding = Origin == 0 ? ' ' : Line[Origin - 1];
+  const char Following = End == Line.size() ? ' ' : Line[End];
+
+  const bool LeftFlanking =
+      !isWhitespace(Following) &&
+      (!isPunctuation(Following) || isWhitespace(Preceding) ||
+       isPunctuation(Preceding));
+  const bool RightFlanking =
+      !isWhitespace(Preceding) &&
+      (!isPunctuation(Preceding) || isWhitespace(Following) ||
+       isPunctuation(Following));
+  if (!LeftFlanking && !RightFlanking)
+    return {std::nullopt, End};
+
+  // Fields are assigned by name: the struct declares RightFlanking first, so
+  // positional initialization is easy to get backwards.
+  DelimiterContext Context;
+  Context.LeftFlanking = LeftFlanking;
+  Context.RightFlanking = RightFlanking;
+  // For '*' a run opens iff it is left-flanking and closes iff right-flanking.
+  Context.CanOpen = LeftFlanking;
+  Context.CanClose = RightFlanking;
+  Context.DelimChar = Delim;
+  Context.Length = End - Origin;
+  return {Context, End};
+}
+
+Node *MarkdownParser::createTextNode(const std::list<LineNode *> &Text) {
+  Node *TextNode = createNode(MDType::Text);
+  for (const LineNode *Piece : Text) {
+    llvm::StringRef Literal = Piece->Content;
+    // A partially consumed delimiter run only contributes what is left of it.
+    if (Piece->DelimiterContext)
+      Literal = Literal.take_front(Piece->DelimiterContext->Length);
+    TextNode->Content.append(Literal.data(), Literal.size());
+  }
+  return TextNode;
+}
+
+std::list<Node *>
+MarkdownParser::buildNodes(std::list<LineNode *>::const_iterator First,
+                           std::list<LineNode *>::const_iterator Last) {
+  std::list<Node *> Result;
+  std::list<LineNode *> Literals;
+  auto FlushLiterals = [&] {
+    if (Literals.empty())
+      return;
+    Result.push_back(createTextNode(Literals));
+    Literals.clear();
+  };
+
+  for (; First != Last; ++First) {
+    LineNode *Entry = *First;
+    if (!Entry->Resolved) {
+      Literals.push_back(Entry);
+      continue;
+    }
+    FlushLiterals();
+    Result.push_back(Entry->Resolved);
+  }
+  FlushLiterals();
+  return Result;
+}
+
+Node *MarkdownParser::reverseIterateLine(std::list<LineNode *> &Stack,
+                                         std::list<LineNode *>::iterator &It) {
+  DelimiterContext &Closer = *(*It)->DelimiterContext;
+
+  // Find the nearest preceding run that can still open emphasis.
+  auto OpenerIt = It;
+  bool Found = false;
+  while (!Found && OpenerIt != Stack.begin()) {
+    --OpenerIt;
+    const auto &Candidate = (*OpenerIt)->DelimiterContext;
+    Found = Candidate && Candidate->CanOpen && Candidate->Length > 0 &&
+            Candidate->DelimChar == Closer.DelimChar &&
+            !violatesRuleOfThree(**OpenerIt, **It);
+  }
+  if (!Found)
+    return nullptr;
+
+  DelimiterContext &Opener = *(*OpenerIt)->DelimiterContext;
+  // Two delimiters from each side make strong emphasis, one makes emphasis.
+  const bool IsStrong = Opener.Length >= 2 && Closer.Length >= 2;
+  const size_t Used = IsStrong ? 2 : 1;
+
+  Node *Emphasis = createNode(IsStrong ? MDType::Strong : MDType::Emphasis);
+  const auto InnerBegin = std::next(OpenerIt);
+  for (Node *Child : buildNodes(InnerBegin, It)) {
+    Child->Parent = Emphasis;
+    Emphasis->Children.push_back(Child);
+  }
+
+  // Everything between the runs now lives inside Emphasis; leave one resolved
+  // entry in its place so that outer emphasis can wrap it later.
+  Stack.erase(InnerBegin, It);
+  auto *Placeholder = new (LineArena) LineNode();
+  Placeholder->Resolved = Emphasis;
+  Stack.insert(It, Placeholder);
+
+  Opener.Length -= Used;
+  Closer.Length -= Used;
+  if (Opener.Length == 0)
+    Stack.erase(OpenerIt);
+  // A closer with delimiters left is examined again by the caller.
+  if (Closer.Length == 0)
+    It = Stack.erase(It);
+  return Emphasis;
+}
+
+std::list<Node *>
+MarkdownParser::processEmphasis(std::list<LineNode *> &Stack) {
+  auto It = Stack.begin();
+  while (It != Stack.end()) {
+    const auto &Context = (*It)->DelimiterContext;
+    const bool IsCloser = Context && Context->CanClose && Context->Length > 0;
+    // On success reverseIterateLine has already repositioned It.
+    if (IsCloser && reverseIterateLine(Stack, It))
+      continue;
+    ++It;
+  }
+  // Whatever is still unresolved (plain text, unmatched runs) is literal text.
+  return buildNodes(Stack.begin(), Stack.end());
+}
+
+void MarkdownParser::parseLine(llvm::StringRef Line, Node *Current) {
+  // LineNodes never outlive the line they were created for.
+  LineArena.Reset();
+  std::list<LineNode *> Stack;
+  size_t TextStart = 0;
+  auto FlushText = [&](size_t TextEnd) {
+    if (TextEnd == TextStart)
+      return;
+    auto *Text = new (LineArena) LineNode();
+    Text->Content = Line.slice(TextStart, TextEnd);
+    Stack.push_back(Text);
+  };
+
+  size_t Idx = 0;
+  while (Idx < Line.size()) {
+    if (!isEmphasisDelimiter(Line[Idx])) {
+      ++Idx;
+      continue;
+    }
+    auto [Context, RunEnd] = processDelimiters(Line, Idx);
+    if (Context) {
+      FlushText(Idx);
+      auto *Run = new (LineArena) LineNode();
+      Run->Content = Line.slice(Idx, RunEnd);
+      Run->DelimiterContext = Context;
+      Stack.push_back(Run);
+      TextStart = RunEnd;
+    }
+    // A run that is neither left- nor right-flanking stays part of the text.
+    // Either way the whole run is skipped so it is never split up.
+    Idx = RunEnd;
+  }
+  FlushText(Line.size());
+
+  for (Node *Inline : processEmphasis(Stack)) {
+    Inline->Parent = Current;
+    Current->Children.push_back(Inline);
+  }
+}
+
+Node *MarkdownParser::parse(std::vector<SmallString<64>> &Lines) {
+  Node *Root = createNode(MDType::Document);
+  // TODO: CommonMark also wraps the text before the first blank line in a
+  // paragraph; it is kept as direct children of the root for now.
+  Node *Block = Root;
+  bool StartParagraph = false;
+  bool BlockHasText = false;
+  for (const SmallString<64> &Line : Lines) {
+    // Blank lines separate paragraphs. The paragraph is created lazily so that
+    // repeated or trailing blank lines do not produce empty paragraphs.
+    if (Line.str().trim().empty()) {
+      StartParagraph = true;
+      continue;
+    }
+    if (StartParagraph) {
+      // Paragraphs are siblings, so they always hang off the root.
+      Block = createNode(MDType::Paragraph, Root);
+      StartParagraph = false;
+      BlockHasText = false;
+    }
+    if (BlockHasText)
+      createNode(MDType::Softbreak, Block);
+    parseLine(Line, Block);
+    BlockHasText = true;
+  }
+  return Root;
+}
+
+std::string MarkdownParser::traverse(Node *Current) {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  renderNode(*Current, OS);
+  return Result;
+}
+
+std::string MarkdownParser::render(std::vector<SmallString<64>> &Lines) {
+  // Nodes of an earlier document are unreachable once its HTML was returned.
+  NodeArena.DestroyAll();
+  return traverse(parse(Lines));
+}
+} // namespace doc
+} // namespace clang
diff --git a/clang-tools-extra/clang-doc/MDParser.h b/clang-tools-extra/clang-doc/MDParser.h
new file mode 100644
index 0000000000000..32599eaae4cf3
--- /dev/null
+++ b/clang-tools-extra/clang-doc/MDParser.h
@@ -0,0 +1,155 @@
+//===-- MDParser.h - Markdown parser for clang-doc --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// A small Markdown parser that renders the emphasis subset of CommonMark
+/// found in documentation comments as HTML.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MD_PARSER_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MD_PARSER_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include <cstddef>
+#include <list>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace clang {
+namespace doc {
+using llvm::SmallString;
+
+enum class MDState { Emphasis, Strong, None };
+
+enum class MDType {
+  Paragraph,
+  Emphasis,
+  Strong,
+  Text,
+  Softbreak,
+  /// The root of a parsed document; renders as the sequence of its children.
+  Document,
+};
+
+enum class MDTokenType { LeftDelimiterRun, RightDelimiterRun, Text };
+
+/// A node of the parsed Markdown tree.
+struct Node {
+  llvm::SmallVector<Node *> Children;
+  MDType Type = MDType::Text;
+  Node *Parent = nullptr;
+  /// The literal text; only meaningful for MDType::Text nodes.
+  std::string Content;
+};
+
+/// Describes a delimiter run found in a line.
+struct DelimiterContext {
+  bool RightFlanking = false;
+  bool LeftFlanking = false;
+  bool CanOpen = false;
+  bool CanClose = false;
+  char DelimChar = '\0';
+  // Since Content is a StringRef, we separately track the length so that we can
+  // decrement when necessary without modifying the string.
+  size_t Length = 0;
+};
+
+/// A LineNode might be a valid delimiter run, text, or a delimiter run that
+/// will later be merged with a text if there is no matching run e.g. ***foo.
+/// Once emphasis is resolved, an entry may instead stand for a finished node.
+/// @brief A preprocessing structure for tracking text in a line.
+struct LineNode {
+  llvm::StringRef Content;
+  // Engaged if the entry is a delimiter run. The type is spelled qualified
+  // because the member deliberately shares its name.
+  std::optional<doc::DelimiterContext> DelimiterContext;
+  // Set once the entry has been resolved into an emphasis node; Content and
+  // DelimiterContext are unused then.
+  Node *Resolved = nullptr;
+};
+
+class MarkdownParser {
+  /// Owns every Node created by this parser and runs their destructors, which
+  /// a plain BumpPtrAllocator would skip although Node owns heap memory.
+  llvm::SpecificBumpPtrAllocator<Node> NodeArena;
+  /// Scratch storage for the LineNodes of the line currently being parsed.
+  llvm::BumpPtrAllocator LineArena;
+
+  /// Allocates a node of type \p Type and, if \p Parent is given, appends it
+  /// to the children of \p Parent.
+  Node *createNode(MDType Type, Node *Parent = nullptr);
+
+  /// If a delimiter is found, determine if it is a delimiter run, what type of
+  /// run it is, and whether it can be an opener or closer.
+  ///
+  /// The CommonMark specification defines delimiter runs as:
+  /// A delimiter run is either a sequence of one or more * or _ characters that
+  /// is not preceded or followed by a non-backslash-escaped * or _ character
+  ///
+  /// A left-flanking delimiter run is a delimiter run that is (1) not followed
+  /// by Unicode whitespace, and either (2a) not followed by a Unicode
+  /// punctuation character, or (2b) followed by a Unicode punctuation character
+  /// and preceded by Unicode whitespace or a Unicode punctuation character.
+  ///
+  /// A right-flanking delimiter run is a delimiter run that is (1) not preceded
+  /// by Unicode whitespace, and either (2a) not preceded by a Unicode
+  /// punctuation character, or (2b) preceded by a Unicode punctuation character
+  /// and followed by Unicode whitespace or a Unicode punctuation character.
+  ///
+  /// @param Line the line that contains the run.
+  /// @param Origin the index of * or _ that might start a delimiter run.
+  /// @return A pair of the run's context, which is std::nullopt if the run is
+  /// neither left- nor right-flanking, and the index one past the run's end.
+  std::pair<std::optional<DelimiterContext>, size_t>
+  processDelimiters(llvm::StringRef Line, size_t Origin = 0);
+
+  /// Splits \p Line into text and delimiter runs, resolves emphasis and appends
+  /// the resulting nodes to \p Current.
+  void parseLine(llvm::StringRef Line, Node *Current);
+
+  /// Resolves every closer in \p Stack against the nearest matching opener.
+  /// @return the nodes of the line in document order.
+  std::list<Node *> processEmphasis(std::list<LineNode *> &Stack);
+
+  /// Searches backwards from the closer at \p It for an opener. On a match, the
+  /// entries between both runs are replaced by one entry holding the new node,
+  /// exhausted runs are erased, and \p It is left on the next entry to examine.
+  /// @return the new emphasis node, or nullptr if there is no opener.
+  Node *reverseIterateLine(std::list<LineNode *> &Stack,
+                           std::list<LineNode *>::iterator &It);
+
+  /// @return a text node holding the concatenated literal text of \p Text.
+  Node *createTextNode(const std::list<LineNode *> &Text);
+
+  /// Converts [First, Last) to nodes. Resolved entries are passed through and
+  /// adjacent unresolved entries are merged into a single text node.
+  std::list<Node *> buildNodes(std::list<LineNode *>::const_iterator First,
+                               std::list<LineNode *>::const_iterator Last);
+
+  /// @return the HTML for \p Current and all of its descendants.
+  std::string traverse(Node *Current);
+
+  /// @param Lines An entire Document that resides in a comment.
+  /// @return the root of a Markdown document.
+  Node *parse(std::vector<SmallString<64>> &Lines);
+
+public:
+  MarkdownParser() = default;
+
+  /// Renders \p Lines as HTML. Text is HTML-escaped.
+  std::string render(std::vector<SmallString<64>> &Lines);
+};
+} // namespace doc
+} // namespace clang
+#endif
diff --git a/clang-tools-extra/clang-doc/Representation.h b/clang-tools-extra/clang-doc/Representation.h
index 2a75f89696b7d..71acb6956d6f7 100644
--- a/clang-tools-extra/clang-doc/Representation.h
+++ b/clang-tools-extra/clang-doc/Representation.h
@@ -99,6 +99,7 @@ struct CommentInfo {
   bool SelfClosing = false; // Indicates if tag is self-closing (for HTML).
   bool Explicit = false; // Indicates if the direction of a param is explicit
                          // (for (T)ParamCommand).
+  bool Markdown = false; // Comment contains Markdown tokens.
   llvm::SmallVector<SmallString<16>, 4>
       AttrKeys; // List of attribute keys (for HTML).
   llvm::SmallVector<SmallString<16>, 4>
diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
index 18166acf9bbca..b62fd25a08da7 100644
--- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
@@ -32,6 +32,7 @@ add_extra_unittest(ClangDocTests
   SerializeTest.cpp
   YAMLGeneratorTest.cpp
   JSONGeneratorTest.cpp
+  MDParserTest.cpp
   )
 
 clang_target_link_libraries(ClangDocTests
diff --git a/clang-tools-extra/unittests/clang-doc/MDParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MDParserTest.cpp
new file mode 100644
index 0000000000000..0ecbce0d7da53
--- /dev/null
+++ b/clang-tools-extra/unittests/clang-doc/MDParserTest.cpp
@@ -0,0 +1,70 @@
+//===-- clang-doc/MDParserTest.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MDParser.h"
+#include "ClangDocTest.h"
+#include <initializer_list>
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace doc {
+
+static std::string renderLines(std::initializer_list<llvm::StringRef> Text) {
+  std::vector<SmallString<64>> Lines(Text.begin(), Text.end());
+  MarkdownParser Parser;
+  return Parser.render(Lines);
+}
+
+TEST(MDParserTest, Strong) {
+  EXPECT_EQ("<strong>Strong</strong>", renderLines({"**Strong**"}));
+}
+
+TEST(MDParserTest, DoubleStrong) {
+  EXPECT_EQ("<strong><strong>Strong</strong></strong>",
+            renderLines({"****Strong****"}));
+}
+
+TEST(MDParserTest, Emphasis) {
+  EXPECT_EQ("<em>Emphasis</em>", renderLines({"*Emphasis*"}));
+}
+
+TEST(MDParserTest, Text) { EXPECT_EQ("Text", renderLines({"Text"})); }
+
+TEST(MDParserTest, TextAroundEmphasis) {
+  EXPECT_EQ("a <em>b</em> c", renderLines({"a *b* c"}));
+}
+
+TEST(MDParserTest, NestedEmphasis) {
+  EXPECT_EQ("<em>a <strong>b</strong> c</em>", renderLines({"*a **b** c*"}));
+}
+
+TEST(MDParserTest, UnmatchedDelimiters) {
+  EXPECT_EQ("*foo", renderLines({"*foo"}));
+  EXPECT_EQ("2 * 3", renderLines({"2 * 3"}));
+  EXPECT_EQ("*<em>foo</em>", renderLines({"**foo*"}));
+}
+
+TEST(MDParserTest, RuleOfThree) {
+  EXPECT_EQ("<em>foo**bar</em>", renderLines({"*foo**bar*"}));
+}
+
+TEST(MDParserTest, EscapesHTML) {
+  EXPECT_EQ("a &lt; b &amp; c", renderLines({"a < b & c"}));
+}
+
+TEST(MDParserTest, Paragraphs) {
+  EXPECT_EQ("one<p>two</p><p>three</p>",
+            renderLines({"one", "", "two", "", "three"}));
+}
+
+TEST(MDParserTest, Softbreak) {
+  EXPECT_EQ("one\ntwo", renderLines({"one", "two"}));
+}
+} // namespace doc
+} // namespace clang
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits