llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: nerix (Nerixyz) <details> <summary>Changes</summary> HTML starting tags that span multiple lines were previously not allowed (or rather, only the starting line was lexed as HTML). Doxygen allows those tags. This PR allows the starting tags to span multiple lines. A starting tag still cannot span multiple C comments, though (doing so is most likely a user error). Spanning multiple BCPL comments is fine, since each of those comments is a single line (shown below). Example: ```c /// <a /// href="foo" /// >Aaa</a>b int Test; ``` Fixes #<!-- -->28321. --- Full diff: https://github.com/llvm/llvm-project/pull/120843.diff 4 Files Affected: - (modified) clang/lib/AST/CommentLexer.cpp (+63-6) - (modified) clang/test/AST/ast-dump-comment.cpp (+13) - (modified) clang/unittests/AST/CommentLexer.cpp (+123) - (modified) clang/unittests/AST/CommentParser.cpp (+12-11) ``````````diff diff --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp index ec9a5b480aa295..804be89a8d4ddc 100644 --- a/clang/lib/AST/CommentLexer.cpp +++ b/clang/lib/AST/CommentLexer.cpp @@ -196,6 +196,15 @@ const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { return BufferEnd; } +const char *skipHorizontalWhitespace(const char *BufferPtr, + const char *BufferEnd) { + for (; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHorizontalWhitespace(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; } @@ -637,17 +646,41 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) { formTokenWithChars(T, TagNameEnd, tok::html_start_tag); T.setHTMLTagStartName(Name); - BufferPtr = skipWhitespace(BufferPtr, CommentEnd); + BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd); + if (BufferPtr == CommentEnd) { // in BCPL comments + State = LS_HTMLStartTag; + return; + } const char C = *BufferPtr; if (BufferPtr != CommentEnd && - (C == '>' || C == 
'/' || isHTMLIdentifierStartingCharacter(C))) + (C == '>' || C == '/' || isVerticalWhitespace(C) || + isHTMLIdentifierStartingCharacter(C))) State = LS_HTMLStartTag; } void Lexer::lexHTMLStartTag(Token &T) { assert(State == LS_HTMLStartTag); + // Skip leading whitespace and comment decorations + while (isVerticalWhitespace(*BufferPtr)) { + BufferPtr = skipNewline(BufferPtr, CommentEnd); + + if (CommentState == LCS_InsideCComment) + skipLineStartingDecorations(); + + BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd); + if (BufferPtr == CommentEnd) { + // HTML starting tags must be defined in a single comment block. + // It's likely a user-error where they forgot to terminate the comment. + State = LS_Normal; + // Since at least one newline was skipped and one token needs to be lexed, + // return a newline. + formTokenWithChars(T, BufferPtr, tok::newline); + return; + } + } + const char *TokenPtr = BufferPtr; char C = *TokenPtr; if (isHTMLIdentifierCharacter(C)) { @@ -693,14 +726,13 @@ void Lexer::lexHTMLStartTag(Token &T) { // Now look ahead and return to normal state if we don't see any HTML tokens // ahead. 
- BufferPtr = skipWhitespace(BufferPtr, CommentEnd); + BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd); if (BufferPtr == CommentEnd) { - State = LS_Normal; return; } C = *BufferPtr; - if (!isHTMLIdentifierStartingCharacter(C) && + if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) && C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') { State = LS_Normal; return; @@ -774,8 +806,17 @@ void Lexer::lex(Token &T) { BufferPtr++; CommentState = LCS_InsideBCPLComment; - if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) + switch (State) { + case LS_VerbatimBlockFirstLine: + case LS_VerbatimBlockBody: + break; + case LS_HTMLStartTag: + BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd); + break; + default: State = LS_Normal; + break; + } CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); goto again; } @@ -807,6 +848,14 @@ void Lexer::lex(Token &T) { while(EndWhitespace != BufferEnd && *EndWhitespace != '/') EndWhitespace++; + // When lexing the start of an HTML tag (i.e. going through the attributes) + // there won't be any newlines generated. + if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) { + CommentState = LCS_BeforeComment; + BufferPtr = EndWhitespace; + goto again; + } + // Turn any whitespace between comments (and there is only whitespace // between them -- guaranteed by comment extraction) into a newline. We // have two newlines between C comments in total (first one was synthesized @@ -829,6 +878,14 @@ void Lexer::lex(Token &T) { BufferPtr += 2; assert(BufferPtr <= BufferEnd); + // When lexing the start of an HTML tag (i.e. going through the + // attributes) there won't be any newlines generated - whitespace still + // needs to be skipped. + if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) { + CommentState = LCS_BetweenComments; + goto again; + } + // Synthenize newline just after the C comment, regardless if there is // actually a newline. 
formTokenWithChars(T, BufferPtr, tok::newline); diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp index 9798295b420f9a..40c3edb62821bb 100644 --- a/clang/test/AST/ast-dump-comment.cpp +++ b/clang/test/AST/ast-dump-comment.cpp @@ -91,6 +91,19 @@ int Test_HTMLTagComment; // CHECK-NEXT: TextComment{{.*}} Text=" " // CHECK-NEXT: HTMLStartTagComment{{.*}} Name="br" SelfClosing +/// <a +/// href="foo" +/// >Aaa</a>b +int Test_HTMLTagMultilineBCPL; +// CHECK: VarDecl{{.*}}Test_HTMLTagMultilineBCPL +// CHECK-NEXT: FullComment +// CHECK-NEXT: ParagraphComment +// CHECK-NEXT: TextComment{{.*}} Text=" " +// CHECK-NEXT: HTMLStartTagComment{{.*}} Name="a" Attrs: "href="foo" +// CHECK-NEXT: TextComment{{.*}} Text="Aaa" +// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="a" +// CHECK-NEXT: TextComment{{.*}} Text="b" + /// \verbatim /// Aaa /// \endverbatim diff --git a/clang/unittests/AST/CommentLexer.cpp b/clang/unittests/AST/CommentLexer.cpp index 1e7bad89898f4c..2231a5d78af451 100644 --- a/clang/unittests/AST/CommentLexer.cpp +++ b/clang/unittests/AST/CommentLexer.cpp @@ -1453,6 +1453,129 @@ TEST_F(CommentLexerTest, HTML19) { ASSERT_EQ(tok::newline, Toks[2].getKind()); } +TEST_F(CommentLexerTest, HTML20) { + const char *Source = "// <a\n" + "// \n" + "// href=\"foo\"\n" + "// \n" + "// bar>text</a>"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + ASSERT_EQ(11U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); + ASSERT_EQ(StringRef("a"), Toks[1].getHTMLTagStartName()); + + ASSERT_EQ(tok::html_ident, Toks[2].getKind()); + ASSERT_EQ(StringRef("href"), Toks[2].getHTMLIdent()); + + ASSERT_EQ(tok::html_equals, Toks[3].getKind()); + + ASSERT_EQ(tok::html_quoted_string, Toks[4].getKind()); + ASSERT_EQ(StringRef("foo"), Toks[4].getHTMLQuotedString()); + + ASSERT_EQ(tok::html_ident, Toks[5].getKind()); + 
ASSERT_EQ(StringRef("bar"), Toks[5].getHTMLIdent()); + + ASSERT_EQ(tok::html_greater, Toks[6].getKind()); + + ASSERT_EQ(tok::text, Toks[7].getKind()); + ASSERT_EQ(StringRef("text"), Toks[7].getText()); + + ASSERT_EQ(tok::html_end_tag, Toks[8].getKind()); + ASSERT_EQ(StringRef("a"), Toks[8].getHTMLTagEndName()); + + ASSERT_EQ(tok::html_greater, Toks[9].getKind()); + + ASSERT_EQ(tok::newline, Toks[10].getKind()); +} + +TEST_F(CommentLexerTest, HTML21) { + const char *Source = "/**\n" + " * <a\n" + " * \n" + " * href=\"foo\"\n" + " * \n" + " * bar>text</a>\n" + " */"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + ASSERT_EQ(15U, Toks.size()); + + ASSERT_EQ(tok::newline, Toks[0].getKind()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef(" "), Toks[1].getText()); + + ASSERT_EQ(tok::html_start_tag, Toks[2].getKind()); + ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName()); + + ASSERT_EQ(tok::html_ident, Toks[3].getKind()); + ASSERT_EQ(StringRef("href"), Toks[3].getHTMLIdent()); + + ASSERT_EQ(tok::html_equals, Toks[4].getKind()); + + ASSERT_EQ(tok::html_quoted_string, Toks[5].getKind()); + ASSERT_EQ(StringRef("foo"), Toks[5].getHTMLQuotedString()); + + ASSERT_EQ(tok::html_ident, Toks[6].getKind()); + ASSERT_EQ(StringRef("bar"), Toks[6].getHTMLIdent()); + + ASSERT_EQ(tok::html_greater, Toks[7].getKind()); + + ASSERT_EQ(tok::text, Toks[8].getKind()); + ASSERT_EQ(StringRef("text"), Toks[8].getText()); + + ASSERT_EQ(tok::html_end_tag, Toks[9].getKind()); + ASSERT_EQ(StringRef("a"), Toks[9].getHTMLTagEndName()); + + ASSERT_EQ(tok::html_greater, Toks[10].getKind()); + + ASSERT_EQ(tok::newline, Toks[11].getKind()); + + ASSERT_EQ(tok::text, Toks[12].getKind()); + ASSERT_EQ(StringRef(" "), Toks[12].getText()); + + ASSERT_EQ(tok::newline, Toks[13].getKind()); + + ASSERT_EQ(tok::newline, Toks[14].getKind()); +} + +TEST_F(CommentLexerTest, HTML22) { + const char *Source = "/**\n" + " * <a\n" + " */"; + + std::vector<Token> Toks; + + 
lexString(Source, Toks); + + ASSERT_EQ(6U, Toks.size()); + + ASSERT_EQ(tok::newline, Toks[0].getKind()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef(" "), Toks[1].getText()); + + ASSERT_EQ(tok::html_start_tag, Toks[2].getKind()); + ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); + + ASSERT_EQ(tok::newline, Toks[4].getKind()); + + ASSERT_EQ(tok::newline, Toks[5].getKind()); +} + TEST_F(CommentLexerTest, NotAKnownHTMLTag1) { const char *Source = "// <tag>"; diff --git a/clang/unittests/AST/CommentParser.cpp b/clang/unittests/AST/CommentParser.cpp index e0df182d430c36..aa08b6718e506f 100644 --- a/clang/unittests/AST/CommentParser.cpp +++ b/clang/unittests/AST/CommentParser.cpp @@ -1065,9 +1065,10 @@ TEST_F(CommentParserTest, InlineCommand5) { TEST_F(CommentParserTest, HTML1) { const char *Sources[] = { - "// <a", - "// <a>", - "// <a >" + "// <a", + "// <a>", + "// <a >", + "// <a\n// >", }; for (size_t i = 0, e = std::size(Sources); i != e; i++) { @@ -1088,8 +1089,9 @@ TEST_F(CommentParserTest, HTML1) { TEST_F(CommentParserTest, HTML2) { const char *Sources[] = { - "// <br/>", - "// <br />" + "// <br/>", + "// <br />", + "// <br \n// />", }; for (size_t i = 0, e = std::size(Sources); i != e; i++) { @@ -1110,10 +1112,8 @@ TEST_F(CommentParserTest, HTML2) { TEST_F(CommentParserTest, HTML3) { const char *Sources[] = { - "// <a href", - "// <a href ", - "// <a href>", - "// <a href >", + "// <a href", "// <a href ", "// <a href>", + "// <a href >", "// <a \n// href >", }; for (size_t i = 0, e = std::size(Sources); i != e; i++) { @@ -1134,8 +1134,9 @@ TEST_F(CommentParserTest, HTML3) { TEST_F(CommentParserTest, HTML4) { const char *Sources[] = { - "// <a href=\"bbb\"", - "// <a href=\"bbb\">", + "// <a href=\"bbb\"", + "// <a href=\"bbb\">", + "// <a \n// href=\"bbb\">", }; for (size_t i = 0, e = std::size(Sources); i != e; i++) { `````````` </details> 
https://github.com/llvm/llvm-project/pull/120843 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits