[clang] [Clang][Comments] Allow HTML tags across multiple lines (PR #120843)

via cfe-commits Sat, 21 Dec 2024 08:55:56 -0800

llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang

Author: nerix (Nerixyz)

<details>
<summary>Changes</summary>

HTML starting tags that span multiple lines were previously not allowed (or 
rather, only the starting line was lexed as HTML). Doxygen allows those tags.

This PR allows start tags to span multiple lines. They can't span multiple
C comments (`/* ... */`), though, since that is most likely a user error.
Spanning multiple BCPL (`//`) comments is fine, because each of those is a
single line (shown below).

Example:

```c
/// <a
///     href="foo"
/// >Aaa</a>b
int Test;
```

Fixes #28321.

---
Full diff: https://github.com/llvm/llvm-project/pull/120843.diff


4 Files Affected:

- (modified) clang/lib/AST/CommentLexer.cpp (+63-6) 
- (modified) clang/test/AST/ast-dump-comment.cpp (+13) 
- (modified) clang/unittests/AST/CommentLexer.cpp (+123) 
- (modified) clang/unittests/AST/CommentParser.cpp (+12-11) 


``````````diff
diff --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp
index ec9a5b480aa295..804be89a8d4ddc 100644
--- a/clang/lib/AST/CommentLexer.cpp
+++ b/clang/lib/AST/CommentLexer.cpp
@@ -196,6 +196,15 @@ const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
   return BufferEnd;
 }
 
+const char *skipHorizontalWhitespace(const char *BufferPtr,
+                                     const char *BufferEnd) {
+  for (; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHorizontalWhitespace(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
 }
@@ -637,17 +646,41 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) {
   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
   T.setHTMLTagStartName(Name);
 
-  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
+  BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
+  if (BufferPtr == CommentEnd) { // in BCPL comments
+    State = LS_HTMLStartTag;
+    return;
+  }
 
   const char C = *BufferPtr;
   if (BufferPtr != CommentEnd &&
-      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
+      (C == '>' || C == '/' || isVerticalWhitespace(C) ||
+       isHTMLIdentifierStartingCharacter(C)))
     State = LS_HTMLStartTag;
 }
 
 void Lexer::lexHTMLStartTag(Token &T) {
   assert(State == LS_HTMLStartTag);
 
+  // Skip leading whitespace and comment decorations
+  while (isVerticalWhitespace(*BufferPtr)) {
+    BufferPtr = skipNewline(BufferPtr, CommentEnd);
+
+    if (CommentState == LCS_InsideCComment)
+      skipLineStartingDecorations();
+
+    BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
+    if (BufferPtr == CommentEnd) {
+      // HTML starting tags must be defined in a single comment block.
+      // It's likely a user-error where they forgot to terminate the comment.
+      State = LS_Normal;
+      // Since at least one newline was skipped and one token needs to be lexed,
+      // return a newline.
+      formTokenWithChars(T, BufferPtr, tok::newline);
+      return;
+    }
+  }
+
   const char *TokenPtr = BufferPtr;
   char C = *TokenPtr;
   if (isHTMLIdentifierCharacter(C)) {
@@ -693,14 +726,13 @@ void Lexer::lexHTMLStartTag(Token &T) {
 
   // Now look ahead and return to normal state if we don't see any HTML tokens
   // ahead.
-  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
+  BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
   if (BufferPtr == CommentEnd) {
-    State = LS_Normal;
     return;
   }
 
   C = *BufferPtr;
-  if (!isHTMLIdentifierStartingCharacter(C) &&
+  if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
       C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
     State = LS_Normal;
     return;
@@ -774,8 +806,17 @@ void Lexer::lex(Token &T) {
         BufferPtr++;
 
       CommentState = LCS_InsideBCPLComment;
-      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
+      switch (State) {
+      case LS_VerbatimBlockFirstLine:
+      case LS_VerbatimBlockBody:
+        break;
+      case LS_HTMLStartTag:
+        BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
+        break;
+      default:
         State = LS_Normal;
+        break;
+      }
       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
       goto again;
     }
@@ -807,6 +848,14 @@ void Lexer::lex(Token &T) {
     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
       EndWhitespace++;
 
+    // When lexing the start of an HTML tag (i.e. going through the attributes)
+    // there won't be any newlines generated.
+    if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
+      CommentState = LCS_BeforeComment;
+      BufferPtr = EndWhitespace;
+      goto again;
+    }
+
     // Turn any whitespace between comments (and there is only whitespace
     // between them -- guaranteed by comment extraction) into a newline.  We
     // have two newlines between C comments in total (first one was synthesized
@@ -829,6 +878,14 @@ void Lexer::lex(Token &T) {
         BufferPtr += 2;
         assert(BufferPtr <= BufferEnd);
 
+        // When lexing the start of an HTML tag (i.e. going through the
+        // attributes) there won't be any newlines generated - whitespace still
+        // needs to be skipped.
+        if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
+          CommentState = LCS_BetweenComments;
+          goto again;
+        }
+
         // Synthenize newline just after the C comment, regardless if there is
         // actually a newline.
         formTokenWithChars(T, BufferPtr, tok::newline);
diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp
index 9798295b420f9a..40c3edb62821bb 100644
--- a/clang/test/AST/ast-dump-comment.cpp
+++ b/clang/test/AST/ast-dump-comment.cpp
@@ -91,6 +91,19 @@ int Test_HTMLTagComment;
 // CHECK-NEXT:       TextComment{{.*}} Text=" "
 // CHECK-NEXT:       HTMLStartTagComment{{.*}} Name="br" SelfClosing
 
+/// <a
+///     href="foo"
+/// >Aaa</a>b
+int Test_HTMLTagMultilineBCPL;
+// CHECK:      VarDecl{{.*}}Test_HTMLTagMultilineBCPL
+// CHECK-NEXT:   FullComment
+// CHECK-NEXT:     ParagraphComment
+// CHECK-NEXT:       TextComment{{.*}} Text=" "
+// CHECK-NEXT:       HTMLStartTagComment{{.*}} Name="a" Attrs:  "href="foo"
+// CHECK-NEXT:       TextComment{{.*}} Text="Aaa"
+// CHECK-NEXT:       HTMLEndTagComment{{.*}} Name="a"
+// CHECK-NEXT:       TextComment{{.*}} Text="b"
+
 /// \verbatim
 /// Aaa
 /// \endverbatim
diff --git a/clang/unittests/AST/CommentLexer.cpp b/clang/unittests/AST/CommentLexer.cpp
index 1e7bad89898f4c..2231a5d78af451 100644
--- a/clang/unittests/AST/CommentLexer.cpp
+++ b/clang/unittests/AST/CommentLexer.cpp
@@ -1453,6 +1453,129 @@ TEST_F(CommentLexerTest, HTML19) {
   ASSERT_EQ(tok::newline,      Toks[2].getKind());
 }
 
+TEST_F(CommentLexerTest, HTML20) {
+  const char *Source = "// <a\n"
+                       "// \n"
+                       "// href=\"foo\"\n"
+                       "// \n"
+                       "// bar>text</a>";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(11U, Toks.size());
+
+  ASSERT_EQ(tok::text, Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+  ASSERT_EQ(tok::html_start_tag, Toks[1].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[1].getHTMLTagStartName());
+
+  ASSERT_EQ(tok::html_ident, Toks[2].getKind());
+  ASSERT_EQ(StringRef("href"), Toks[2].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_equals, Toks[3].getKind());
+
+  ASSERT_EQ(tok::html_quoted_string, Toks[4].getKind());
+  ASSERT_EQ(StringRef("foo"), Toks[4].getHTMLQuotedString());
+
+  ASSERT_EQ(tok::html_ident, Toks[5].getKind());
+  ASSERT_EQ(StringRef("bar"), Toks[5].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_greater, Toks[6].getKind());
+
+  ASSERT_EQ(tok::text, Toks[7].getKind());
+  ASSERT_EQ(StringRef("text"), Toks[7].getText());
+
+  ASSERT_EQ(tok::html_end_tag, Toks[8].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[8].getHTMLTagEndName());
+
+  ASSERT_EQ(tok::html_greater, Toks[9].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[10].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML21) {
+  const char *Source = "/**\n"
+                       " * <a\n"
+                       " * \n"
+                       " * href=\"foo\"\n"
+                       " * \n"
+                       " * bar>text</a>\n"
+                       " */";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(15U, Toks.size());
+
+  ASSERT_EQ(tok::newline, Toks[0].getKind());
+
+  ASSERT_EQ(tok::text, Toks[1].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[1].getText());
+
+  ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
+
+  ASSERT_EQ(tok::html_ident, Toks[3].getKind());
+  ASSERT_EQ(StringRef("href"), Toks[3].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_equals, Toks[4].getKind());
+
+  ASSERT_EQ(tok::html_quoted_string, Toks[5].getKind());
+  ASSERT_EQ(StringRef("foo"), Toks[5].getHTMLQuotedString());
+
+  ASSERT_EQ(tok::html_ident, Toks[6].getKind());
+  ASSERT_EQ(StringRef("bar"), Toks[6].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_greater, Toks[7].getKind());
+
+  ASSERT_EQ(tok::text, Toks[8].getKind());
+  ASSERT_EQ(StringRef("text"), Toks[8].getText());
+
+  ASSERT_EQ(tok::html_end_tag, Toks[9].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[9].getHTMLTagEndName());
+
+  ASSERT_EQ(tok::html_greater, Toks[10].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[11].getKind());
+
+  ASSERT_EQ(tok::text, Toks[12].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[12].getText());
+
+  ASSERT_EQ(tok::newline, Toks[13].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[14].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML22) {
+  const char *Source = "/**\n"
+                       " * <a\n"
+                       " */";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(6U, Toks.size());
+
+  ASSERT_EQ(tok::newline, Toks[0].getKind());
+
+  ASSERT_EQ(tok::text, Toks[1].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[1].getText());
+
+  ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
+
+  ASSERT_EQ(tok::newline, Toks[3].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[4].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[5].getKind());
+}
+
 TEST_F(CommentLexerTest, NotAKnownHTMLTag1) {
   const char *Source = "// <tag>";
 
diff --git a/clang/unittests/AST/CommentParser.cpp b/clang/unittests/AST/CommentParser.cpp
index e0df182d430c36..aa08b6718e506f 100644
--- a/clang/unittests/AST/CommentParser.cpp
+++ b/clang/unittests/AST/CommentParser.cpp
@@ -1065,9 +1065,10 @@ TEST_F(CommentParserTest, InlineCommand5) {
 
 TEST_F(CommentParserTest, HTML1) {
   const char *Sources[] = {
-    "// <a",
-    "// <a>",
-    "// <a >"
+      "// <a",
+      "// <a>",
+      "// <a >",
+      "// <a\n// >",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1088,8 +1089,9 @@ TEST_F(CommentParserTest, HTML1) {
 
 TEST_F(CommentParserTest, HTML2) {
   const char *Sources[] = {
-    "// <br/>",
-    "// <br />"
+      "// <br/>",
+      "// <br />",
+      "// <br \n// />",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1110,10 +1112,8 @@ TEST_F(CommentParserTest, HTML2) {
 
 TEST_F(CommentParserTest, HTML3) {
   const char *Sources[] = {
-    "// <a href",
-    "// <a href ",
-    "// <a href>",
-    "// <a href >",
+      "// <a href",   "// <a href ",       "// <a href>",
+      "// <a href >", "// <a \n// href >",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1134,8 +1134,9 @@ TEST_F(CommentParserTest, HTML3) {
 
 TEST_F(CommentParserTest, HTML4) {
   const char *Sources[] = {
-    "// <a href=\"bbb\"",
-    "// <a href=\"bbb\">",
+      "// <a href=\"bbb\"",
+      "// <a href=\"bbb\">",
+      "// <a \n// href=\"bbb\">",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {

``````````

</details>


https://github.com/llvm/llvm-project/pull/120843
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [Clang][Comments] Allow HTML tags across multiple lines (PR #120843)

Reply via email to