[PATCH] D46000: [AST] Added a helper to extract a user-friendly text of a comment.

Ilya Biryukov via Phabricator via cfe-commits Tue, 24 Apr 2018 04:53:25 -0700

ilya-biryukov created this revision.
ilya-biryukov added reviewers: sammccall, hokein, ioeric.


The helper is used in clangd for the documentation shown in code completion
and for storing the docs in the symbols. See https://reviews.llvm.org/D45999.

This patch reuses the code of the Doxygen comment lexer, disabling the
bits that do command and html tag parsing.
The new helper works on all comments, including non-doxygen comments,
and is faster. However, it does not understand or transform any
doxygen directives, e.g. it cannot extract brief text.


Repository:
  rC Clang

https://reviews.llvm.org/D46000

Files:
  include/clang/AST/CommentLexer.h
  include/clang/AST/RawCommentList.h
  lib/AST/CommentLexer.cpp
  lib/AST/RawCommentList.cpp

Index: lib/AST/RawCommentList.cpp
===================================================================
--- lib/AST/RawCommentList.cpp
+++ lib/AST/RawCommentList.cpp
@@ -335,3 +335,83 @@
              BeforeThanCompare<RawComment>(SourceMgr));
   std::swap(Comments, MergedComments);
 }
+
+std::string RawComment::getFormattedText(const ASTContext &Ctx) const {
+  auto &SourceMgr = Ctx.getSourceManager();
+  llvm::StringRef CommentText = getRawText(SourceMgr);
+  if (CommentText.empty())
+    return ""; // We couldn't retrieve the comment text.
+
+  llvm::BumpPtrAllocator Allocator;
+  comments::Lexer L(Allocator, Ctx.getDiagnostics(),
+                    Ctx.getCommentCommandTraits(), getSourceRange().getBegin(),
+                    CommentText.begin(), CommentText.end(),
+                    /*ParseCommands=*/false);
+
+  // Drops at most \p MaxSkip leading spaces/tabs from \p S and returns the
+  // remainder; a string consisting only of whitespace may be dropped entirely.
+  auto SkipWs = [](llvm::StringRef S, unsigned MaxSkip) -> llvm::StringRef {
+    unsigned SkipLen = std::min(
+        MaxSkip, (unsigned)std::min(S.size(), S.find_first_not_of(" \t")));
+    return S.drop_front(SkipLen);
+  };
+
+  std::string Result;
+  unsigned IndentColumn = 0;
+
+  // Processes one line of the comment and appends it to Result.
+  // Handles skipping the indent at the start of the line.
+  // Returns false when eof is reached and true otherwise.
+  auto LexLine = [&](bool IsFirstLine) -> bool {
+    comments::Token Tok;
+    // Lex the first token on the line. We handle it separately, because we
+    // need to fix up its indentation.
+    L.lex(Tok);
+    if (Tok.is(comments::tok::eof))
+      return false;
+    if (Tok.is(comments::tok::newline)) { // Empty line.
+      Result += "\n";
+      return true;
+    }
+    llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
+    bool LocInvalid = false;
+    unsigned TokColumn =
+        SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
+    if (LocInvalid)
+      TokColumn = 0;
+    // Compute the length of whitespace we're allowed to skip.
+    unsigned MaxSkip;
+    if (IsFirstLine) {
+      // For the first line we skip all leading whitespace.
+      MaxSkip = std::numeric_limits<unsigned>::max();
+    } else {
+      // For the rest, we skip up to the column of the first non-whitespace
+      // symbol on the first line.
+      MaxSkip = std::max((int)IndentColumn - (int)TokColumn, 0);
+    }
+    llvm::StringRef Trimmed = SkipWs(TokText, MaxSkip);
+    Result += Trimmed;
+    // Remember the amount of whitespace we skipped in the first line to remove
+    // indent up to that column in the following lines.
+    if (IsFirstLine)
+      IndentColumn = TokColumn + TokText.size() - Trimmed.size();
+    // Lex all tokens in the rest of the line.
+    for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
+      if (Tok.is(comments::tok::newline)) {
+        Result += "\n";
+        return true;
+      }
+      Result += L.getSpelling(Tok, SourceMgr);
+    }
+    // We've reached eof without seeing another newline: this was the last line.
+    return false;
+  };
+
+  // Process the first line separately to remember the indent for the
+  // following lines.
+  if (!LexLine(/*IsFirstLine=*/true))
+    return Result;
+  // Process the rest of the lines.
+  while (LexLine(/*IsFirstLine=*/false))
+    ;
+  return Result;
+}
Index: lib/AST/CommentLexer.cpp
===================================================================
--- lib/AST/CommentLexer.cpp
+++ lib/AST/CommentLexer.cpp
@@ -291,6 +291,14 @@
 }
 
 void Lexer::lexCommentText(Token &T) {
+  // Dispatch on ParseCommands: plain text/newline lexing vs. command-aware.
+  if (!ParseCommands)
+    return lexCommentTextWithoutCommands(T);
+  lexCommentTextWithCommands(T);
+}
+
+void Lexer::lexCommentTextWithCommands(Token &T) {
+  assert(ParseCommands);
   assert(CommentState == LCS_InsideBCPLComment ||
          CommentState == LCS_InsideCComment);
 
@@ -448,6 +456,39 @@
   }
 }
 
+/// Lexes one token in plain-text mode: either a newline, or a text token that
+/// runs up to the next line break (or CommentEnd). Command and HTML markers
+/// (backslash, '@', '&', '<') are treated as ordinary text.
+void Lexer::lexCommentTextWithoutCommands(Token &T) {
+  assert(!ParseCommands);
+  assert(CommentState == LCS_InsideBCPLComment ||
+         CommentState == LCS_InsideCComment);
+  assert(State == LS_Normal);
+
+  const char *TokenPtr = BufferPtr;
+  assert(TokenPtr < CommentEnd);
+  switch (*TokenPtr) {
+  case '\n':
+  case '\r':
+    TokenPtr = skipNewline(TokenPtr, CommentEnd);
+    formTokenWithChars(T, TokenPtr, tok::newline);
+
+    if (CommentState == LCS_InsideCComment)
+      skipLineStartingDecorations();
+    return;
+
+  default: {
+    // Only line breaks end a text token. Stopping at command/HTML markers as
+    // well would form an empty token whenever the text starts with one of
+    // them, and the lexer would make no progress past it.
+    size_t End =
+        StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r");
+    formTextToken(T, End != StringRef::npos ? TokenPtr + End : CommentEnd);
+    return;
+  }
+  }
+}
+
 void Lexer::setupAndLexVerbatimBlock(Token &T,
                                      const char *TextBegin,
                                      char Marker, const CommandInfo *Info) {
@@ -727,14 +768,13 @@
 }
 
 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-             const CommandTraits &Traits,
-             SourceLocation FileLoc,
-             const char *BufferStart, const char *BufferEnd):
-    Allocator(Allocator), Diags(Diags), Traits(Traits),
-    BufferStart(BufferStart), BufferEnd(BufferEnd),
-    FileLoc(FileLoc), BufferPtr(BufferStart),
-    CommentState(LCS_BeforeComment), State(LS_Normal) {
-}
+             const CommandTraits &Traits, SourceLocation FileLoc,
+             const char *BufferStart, const char *BufferEnd,
+             bool ParseCommands)
+    : Allocator(Allocator), Diags(Diags), Traits(Traits),
+      BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
+      BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
+      ParseCommands(ParseCommands) {}
 
 void Lexer::lex(Token &T) {
 again:
Index: include/clang/AST/RawCommentList.h
===================================================================
--- include/clang/AST/RawCommentList.h
+++ include/clang/AST/RawCommentList.h
@@ -111,6 +111,29 @@
     return extractBriefText(Context);
   }
 
+  /// Returns sanitized comment text, suitable for presentation in editor UIs.
+  /// E.g. will transform:
+  ///     // This is a long multiline comment.
+  ///     //   Parts of it  might be indented.
+  ///     /* The comments styles might be mixed. */
+  ///  into
+  ///     "This is a long multiline comment.\n"
+  ///     "  Parts of it  might be indented.\n"
+  ///     "The comments styles might be mixed."
+  /// Also removes leading indentation and sanitizes some common cases:
+  ///     /* This is a first line.
+  ///      *   This is a second line. It is indented.
+  ///      * This is a third line. */
+  /// and
+  ///     /* This is a first line.
+  ///          This is a second line. It is indented.
+  ///     This is a third line. */
+  /// will both turn into:
+  ///     "This is a first line.\n"
+  ///     "  This is a second line. It is indented.\n"
+  ///     "This is a third line."
+  std::string getFormattedText(const ASTContext &Context) const;
+
   /// Parse the comment, assuming it is attached to decl \c D.
   comments::FullComment *parse(const ASTContext &Context,
                                const Preprocessor *PP, const Decl *D) const;
Index: include/clang/AST/CommentLexer.h
===================================================================
--- include/clang/AST/CommentLexer.h
+++ include/clang/AST/CommentLexer.h
@@ -281,6 +281,11 @@
   /// command, including command marker.
   SmallString<16> VerbatimBlockEndCommandName;
 
+  /// If true, the commands, html tags, etc will be parsed and reported as
+  /// separate tokens inside the comment body. If false, the comment text will
+  /// be parsed into text and newline tokens.
+  bool ParseCommands;
+
   /// Given a character reference name (e.g., "lt"), return the character that
   /// it stands for (e.g., "<").
   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
@@ -315,12 +320,19 @@
   /// Eat string matching regexp \code \s*\* \endcode.
   void skipLineStartingDecorations();
 
-  /// Lex stuff inside comments.  CommentEnd should be set correctly.
+  /// Calls lexCommentText(With|Without)Commands, depending on value of
+  /// ParseCommands.
   void lexCommentText(Token &T);
 
-  void setupAndLexVerbatimBlock(Token &T,
-                                const char *TextBegin,
-                                char Marker, const CommandInfo *Info);
+  /// Lex stuff inside comments.  CommentEnd should be set correctly.
+  void lexCommentTextWithCommands(Token &T);
+
+  /// Lex only newlines and text inside comments. CommentEnd should be set
+  /// correctly.
+  void lexCommentTextWithoutCommands(Token &T);
+
+  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
+                                const CommandInfo *Info);
 
   void lexVerbatimBlockFirstLine(Token &T);
 
@@ -343,14 +355,13 @@
 
 public:
   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-        const CommandTraits &Traits,
-        SourceLocation FileLoc,
-        const char *BufferStart, const char *BufferEnd);
+        const CommandTraits &Traits, SourceLocation FileLoc,
+        const char *BufferStart, const char *BufferEnd,
+        bool ParseCommands = true);
 
   void lex(Token &T);
 
-  StringRef getSpelling(const Token &Tok,
-                        const SourceManager &SourceMgr,
+  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                         bool *Invalid = nullptr) const;
 };

_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D46000: [AST] Added a helper to extract a user-friendly text of a comment.

Reply via email to