sunho created this revision.
Herald added a project: All.
sunho retitled this revision from "asdfasdf" to "[clang][lex] Add TryExpandBuffer callback".
sunho edited the summary of this revision.
sunho updated this revision to Diff 494978.
sunho added a comment.
sunho updated this revision to Diff 495037.
sunho updated this revision to Diff 495039.
sunho retitled this revision from "[clang][lex] Add TryExpandBuffer callback" to "[clang][lex] Add TryGrowLexerBuffer/SourceFileGrower".
sunho edited the summary of this revision.
sunho edited the summary of this revision.
sunho edited the summary of this revision.
sunho edited the summary of this revision.
sunho published this revision for review.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.
Update

sunho added a comment.

Update

sunho added a comment.

Update

Add TryGrowLexerBuffer to Lexer and SourceFileGrower to Preprocessor, so the source buffer can be grown when the Lexer reaches the end of the buffer.

Since clang-repl receives source code incrementally, line by line, the source buffer needs to be able to grow. This change adds the interfaces to Lexer and Preprocessor that accommodate this need.

If a SourceFileGrower is registered with the Preprocessor, its TryGrowFile method is called when the Lexer reaches EOF. Inside this method, the user can grow the file and return true to request that the Lexer continue lexing from where it stopped.

When the Lexer reaches EOF, it calls the TryGrowLexerBuffer callback supplied in its constructor to try to obtain a grown buffer; if it gets a new buffer, it continues lexing from the previous position. The Preprocessor registers a TryGrowLexerBuffer callback in order to implement the SourceFileGrower handling.

Note that, in practice, none of these changes affect the ahead-of-time clang world, since they are all disabled when the Preprocessor has no SourceFileGrower instance.

NOTE: This is parts 2 and 3 of https://discourse.llvm.org/t/rfc-flexible-lexer-buffering-for-handling-incomplete-input-in-interactive-c-c/64180

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D143144

Files:
  clang/include/clang/Lex/Lexer.h
  clang/include/clang/Lex/Preprocessor.h
  clang/lib/Lex/Lexer.cpp
  clang/lib/Lex/PPLexerChange.cpp
  clang/lib/Lex/Preprocessor.cpp
  clang/unittests/Lex/LexerTest.cpp
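A minimal sketch of the intended usage, modeled on the BasicSourceFileGrower unit test included in this patch: a SourceFileGrower subclass appends the next input line to the main file's buffer and re-registers it with the SourceManager. The ReplFileGrower name and the readNextLine() input source are hypothetical; SourceFileGrower, TryGrowFile, and overrideFileContents are the interfaces the patch actually uses.

#include "clang/Basic/SourceManager.h"
#include "clang/Lex/Preprocessor.h"
#include "llvm/Support/MemoryBuffer.h"
#include <memory>
#include <optional>
#include <string>

class ReplFileGrower : public clang::SourceFileGrower {
public:
  ReplFileGrower(clang::SourceManager &SM, clang::FileID FID,
                 const clang::FileEntry *FE)
      : SM(SM), FID(FID), FE(FE) {}

  bool TryGrowFile(clang::FileID RequestedID) override {
    if (RequestedID != FID)
      return false;                        // Only grow the interactive main file.
    std::optional<std::string> Line = readNextLine();  // hypothetical input source
    if (!Line)
      return false;                        // Nothing left to append: real EOF.
    Source += *Line;
    Source += '\n';
    // Rebuild the buffer and point the file entry at it; the Lexer picks up
    // the new buffer through the TryGrowLexerBuffer callback and resumes.
    Buf = llvm::MemoryBuffer::getMemBuffer(Source);
    SM.overrideFileContents(FE, Buf->getMemBufferRef());
    return true;
  }

private:
  std::optional<std::string> readNextLine();  // hypothetical
  clang::SourceManager &SM;
  clang::FileID FID;
  const clang::FileEntry *FE;
  std::string Source;
  std::unique_ptr<llvm::MemoryBuffer> Buf;
};

Registering such a grower with PP.setSourceFileGrower(&Grower) is then enough for the Preprocessor to install the TryGrowLexerBuffer callback on the Lexer it creates for that file.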
Index: clang/unittests/Lex/LexerTest.cpp =================================================================== --- clang/unittests/Lex/LexerTest.cpp +++ clang/unittests/Lex/LexerTest.cpp @@ -26,9 +26,12 @@ #include "clang/Lex/PreprocessorOptions.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/MemoryBuffer.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include <deque> #include <memory> +#include <optional> #include <vector> namespace { @@ -49,12 +52,7 @@ Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts); } - std::unique_ptr<Preprocessor> CreatePP(StringRef Source, - TrivialModuleLoader &ModLoader) { - std::unique_ptr<llvm::MemoryBuffer> Buf = - llvm::MemoryBuffer::getMemBuffer(Source); - SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); - + std::unique_ptr<Preprocessor> CreatePP(TrivialModuleLoader &ModLoader) { HeaderSearch HeaderInfo(std::make_shared<HeaderSearchOptions>(), SourceMgr, Diags, LangOpts, Target.get()); std::unique_ptr<Preprocessor> PP = std::make_unique<Preprocessor>( @@ -63,6 +61,14 @@ /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false); PP->Initialize(*Target); + return PP; + } + + std::unique_ptr<Preprocessor> CreatePP(StringRef Source, TrivialModuleLoader &ModLoader) { + std::unique_ptr<llvm::MemoryBuffer> Buf = + llvm::MemoryBuffer::getMemBuffer(Source); + SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); + std::unique_ptr<Preprocessor> PP = CreatePP(ModLoader); PP->EnterMainSourceFile(); return PP; } @@ -660,4 +666,59 @@ } EXPECT_TRUE(ToksView.empty()); } + +TEST_F(LexerTest, BasicSourceFileGrower) { + std::deque<StringRef> SourceLines = { + "int main() {", + " return 0;", + "}" + }; + + TrivialModuleLoader ModLoader; + PP = CreatePP(ModLoader); + auto &SM = PP->getSourceManager(); + + struct FileGrower : public SourceFileGrower { + FileGrower(SourceManager& SM, std::deque<StringRef> SourceLines) : SM(SM), SourceLines(SourceLines) { + FE = SM.getFileManager().getVirtualFile("main.cpp", 1024, 0); + CurFileID = SM.createFileID(FE, SourceLocation(), SrcMgr::C_User); + SM.overrideFileContents(FE, llvm::MemoryBufferRef("", "")); + } + ~FileGrower() = default; + + bool TryGrowFile(FileID FileID) override { + if (FileID != CurFileID) + return false; + if (SourceLines.empty()) + return false; + CurStr += SourceLines.front(); + CurStr.push_back('\n'); + SourceLines.pop_front(); + CurBuf = llvm::MemoryBuffer::getMemBuffer(CurStr); + SM.overrideFileContents(FE, CurBuf->getMemBufferRef()); + return true; + } + SourceManager& SM; + std::deque<StringRef> SourceLines; + std::string CurStr; + std::unique_ptr<llvm::MemoryBuffer> CurBuf; + FileID CurFileID; + const FileEntry* FE; + } FG(SM, SourceLines); + + PP->setSourceFileGrower(&FG); + PP->EnterSourceFile(FG.CurFileID, nullptr, SourceLocation()); + + std::vector<std::string> Toks; + while (1) { + Token tok; + PP->Lex(tok); + if (tok.is(tok::eof)) + break; + Toks.push_back(getSourceText(tok, tok)); + } + + EXPECT_THAT(Toks, ElementsAre("int", "main", "(", ")", "{", + "return", "0", ";", "}")); +} } // anonymous namespace Index: clang/lib/Lex/Preprocessor.cpp =================================================================== --- clang/lib/Lex/Preprocessor.cpp +++ clang/lib/Lex/Preprocessor.cpp @@ -181,6 +181,15 @@ delete &HeaderInfo; } +std::optional<llvm::MemoryBufferRef> Preprocessor::TryGrowFile() { + assert(FileGrower && "FileGrower must be set when TryGrowFile is called"); + if (!FileGrower->TryGrowFile(CurLexer->getFileID())) + return 
std::nullopt; + const FileEntry* Entry = SourceMgr.getFileEntryForID((CurLexer->getFileID())); + assert(Entry && "TryGrowFile must be only called inside Lexer on File Entry"); + return SourceMgr.getMemoryBufferForFileOrNone(Entry); +} + void Preprocessor::Initialize(const TargetInfo &Target, const TargetInfo *AuxTarget) { assert((!this->Target || this->Target == &Target) && @@ -1470,6 +1479,8 @@ CodeCompletionHandler::~CodeCompletionHandler() = default; +SourceFileGrower::~SourceFileGrower() = default; + void Preprocessor::createPreprocessingRecord() { if (Record) return; Index: clang/lib/Lex/PPLexerChange.cpp =================================================================== --- clang/lib/Lex/PPLexerChange.cpp +++ clang/lib/Lex/PPLexerChange.cpp @@ -15,6 +15,7 @@ #include "clang/Basic/SourceManager.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/LexDiagnostic.h" +#include "clang/Lex/Lexer.h" #include "clang/Lex/MacroInfo.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" @@ -92,7 +93,12 @@ CodeCompletionFileLoc.getLocWithOffset(CodeCompletionOffset); } - Lexer *TheLexer = new Lexer(FID, *InputFile, *this, IsFirstIncludeOfFile); + TryGrowLexerBuffer TryGrowBuffer; + if (FileGrower) + TryGrowBuffer = [this]() { + return this->TryGrowFile(); + }; + Lexer *TheLexer = new Lexer(FID, *InputFile, *this, IsFirstIncludeOfFile, std::move(TryGrowBuffer)); if (getPreprocessorOpts().DependencyDirectivesForFile && FID != PredefinesFileID) { if (OptionalFileEntryRef File = SourceMgr.getFileEntryRefForID(FID)) { Index: clang/lib/Lex/Lexer.cpp =================================================================== --- clang/lib/Lex/Lexer.cpp +++ clang/lib/Lex/Lexer.cpp @@ -133,12 +133,14 @@ /// assumes that the associated file buffer and Preprocessor objects will /// outlive it, so it doesn't take ownership of either of them. Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, - Preprocessor &PP, bool IsFirstIncludeOfFile) + Preprocessor &PP, bool IsFirstIncludeOfFile, TryGrowLexerBuffer TryGrowBuffer) : PreprocessorLexer(&PP, FID), FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment), - IsFirstTimeLexingFile(IsFirstIncludeOfFile) { - InitLexer(InputFile.getBufferStart(), 0, InputFile.getBufferSize()); + IsFirstTimeLexingFile(IsFirstIncludeOfFile), + TryGrowBuffer(std::move(TryGrowBuffer)) { + InitLexer(InputFile.getBufferStart(), 0, + InputFile.getBufferSize()); resetExtendedTokenMode(); } @@ -444,6 +446,19 @@ return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); } + bool Lexer::tryGrowLexerBuffer() { + if (!TryGrowBuffer) + return false; + + auto NewBuffer = TryGrowBuffer(); + if (!NewBuffer) + return false; + + BufferStart = NewBuffer->getBufferStart(); + BufferSize = NewBuffer->getBufferSize(); + return true; +} + /// MeasureTokenLength - Relex the token at the specified location and return /// its length in bytes in the input file. If the token needs cleaning (e.g. /// includes a trigraph or an escaped newline) then this count includes bytes @@ -1363,6 +1378,9 @@ Size += EscapedNewLineSize; Offset += EscapedNewLineSize; + if (BufferStart[Offset] == 0 && Offset == BufferSize) + tryGrowLexerBuffer(); + // Use slow version to accumulate a correct size field. 
return getCharAndSizeSlow(Offset, Size, Tok); } @@ -1861,7 +1879,7 @@ const char *IdStart = BufferStart + BufferOffset; FormTokenWithChars(Result, CurOffset, tok::raw_identifier); - Result.setRawIdentifierData(IdStart); + SetTokLiteralData(Result, IdStart); // If we are in raw mode, return this identifier raw. There is no need to // look up identifier information or attempt to macro expand it. @@ -1982,7 +2000,7 @@ // Update the location of token as well as BufferPtr. const char *TokStart = BufferStart + BufferOffset; FormTokenWithChars(Result, CurOffset, tok::numeric_constant); - Result.setLiteralData(TokStart); + SetTokLiteralData(Result, TokStart); return true; } @@ -2138,7 +2156,7 @@ // Update the location of the token as well as the BufferPtr instance var. const char *TokStart = BufferStart + BufferOffset; FormTokenWithChars(Result, CurOffset, Kind); - Result.setLiteralData(TokStart); + SetTokLiteralData(Result, TokStart); return true; } @@ -2221,7 +2239,7 @@ // Update the location of token as well as BufferPtr. const char *TokStart = &BufferStart[BufferOffset]; FormTokenWithChars(Result, CurOffset, Kind); - Result.setLiteralData(TokStart); + SetTokLiteralData(Result, TokStart); return true; } @@ -2266,7 +2284,7 @@ // Update the location of token as well as BufferPtr. const char *TokStart = &BufferStart[BufferOffset]; FormTokenWithChars(Result, CurOffset, tok::header_name); - Result.setLiteralData(TokStart); + SetTokLiteralData(Result, TokStart); return true; } @@ -2330,8 +2348,8 @@ if (C == '\\') C = getAndAdvanceChar(CurOffset, Result); - if (C == '\n' || C == '\r' || // Newline. - (C == 0 && CurOffset - 1 == BufferSize)) { // End of file. + if (C == '\n' || C == '\r' || // Newline. + (C == 0 && CurOffset-1 == BufferSize && !tryGrowLexerBuffer())) { // End of file. if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) Diag(BufferOffset, diag::ext_unterminated_char_or_string) << 0; FormTokenWithChars(Result, CurOffset - 1, tok::unknown); @@ -2362,7 +2380,7 @@ // Update the location of token as well as BufferPtr. const char *TokStart = BufferStart + BufferOffset; FormTokenWithChars(Result, CurOffset, Kind); - Result.setLiteralData(TokStart); + SetTokLiteralData(Result, TokStart); return true; } @@ -2392,6 +2410,11 @@ while (isHorizontalWhitespace(Char)) Char = BufferStart[++CurOffset]; + if (Char == 0 && CurOffset == BufferSize+1 && tryGrowLexerBuffer()) { + --CurOffset; + continue; + } + // Otherwise if we have something other than whitespace, we're done. if (!isVerticalWhitespace(Char)) break; @@ -2478,10 +2501,14 @@ while (true) { C = BufferStart[CurOffset]; // Skip over characters in the fast loop. - while (isASCII(C) && C != 0 && // Potentially EOF. - C != '\n' && C != '\r') { // Newline or DOS-style newline. - C = BufferStart[++CurOffset]; - UnicodeDecodingAlreadyDiagnosed = false; + while (true) { + while (isASCII(C) && C != 0 && // Potentially EOF. + C != '\n' && C != '\r') { // Newline or DOS-style newline. 
+ C = BufferStart[++CurOffset]; + UnicodeDecodingAlreadyDiagnosed = false; + } + if (C != 0 || CurOffset != BufferSize + 1 || !tryGrowLexerBuffer()) break; + C = BufferStart[CurOffset]; } if (!isASCII(C)) { @@ -2568,7 +2595,15 @@ } } - if (C == '\r' || C == '\n' || CurOffset == BufferSize + 1) { + if (CurOffset == BufferSize + 1) { + if (!tryGrowLexerBuffer()) { + --CurOffset; + break; + } + continue; + } + + if (C == '\r' || C == '\n') { --CurOffset; break; } @@ -2750,7 +2785,7 @@ unsigned CharSize; unsigned char C = getCharAndSize(CurOffset, CharSize); CurOffset += CharSize; - if (C == 0 && CurOffset == BufferSize + 1) { + if (C == 0 && CurOffset == BufferSize+1 && !tryGrowLexerBuffer()) { if (!isLexingRawMode()) Diag(BufferOffset, diag::err_unterminated_block_comment); --CurOffset; @@ -2778,6 +2813,9 @@ bool UnicodeDecodingAlreadyDiagnosed = false; while (true) { + if (CurOffset + 24 >= BufferSize) { + tryGrowLexerBuffer(); + } // Skip over all non-interesting characters until we find end of buffer or a // (probably ending) '/' character. if (CurOffset + 24 < BufferSize && @@ -2820,14 +2858,14 @@ '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/' }; - while (CurPtr + 16 < BufferEnd) { + while (CurOffset + 16 < BufferSize) { if (LLVM_UNLIKELY( - vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF))) + vec_any_ge(*(const __vector unsigned char *)(BufferStart + CurOffset), LongUTF))) goto MultiByteUTF8; - if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) { + if (vec_any_eq(*(const __vector unsigned char *)(BufferStart + CurOffset), Slashes)) { break; } - CurPtr += 16; + CurOffset += 16; } #else @@ -2900,7 +2938,10 @@ if (!isLexingRawMode()) Diag(CurOffset - 1, diag::warn_nested_block_comment); } - } else if (C == 0 && CurOffset == BufferSize + 1) { + } else if (C == 0 && CurOffset == BufferSize+1 && tryGrowLexerBuffer()) { + --CurOffset; + continue; + } else if (C == 0 && CurOffset == BufferSize+1) { if (!isLexingRawMode()) Diag(BufferOffset, diag::err_unterminated_block_comment); // Note: the user probably forgot a */. We could continue immediately @@ -2978,8 +3019,8 @@ break; case 0: // Null. // Found end of file? - if (CurOffset - 1 != BufferSize) { - if (isCodeCompletionPoint(CurOffset - 1)) { + if (CurOffset-1 != BufferSize && !tryGrowLexerBuffer()) { + if (isCodeCompletionPoint(CurOffset-1)) { PP->CodeCompleteNaturalLanguage(); cutOffLexing(); return; @@ -3264,8 +3305,9 @@ "Not a placeholder!"); if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) return false; - const char *End = - findPlaceholderEnd(BufferStart + CurOffset + 1, BufferStart + BufferSize); + if (CurOffset + 1 == BufferSize) + tryGrowLexerBuffer(); + const char *End = findPlaceholderEnd(BufferStart + CurOffset + 1, BufferStart + BufferSize); if (!End) return false; const char *Start = BufferStart + CurOffset - 1; @@ -3273,7 +3315,7 @@ Diag(CurOffset - 1, diag::err_placeholder_in_source); Result.startToken(); FormTokenWithChars(Result, End - BufferStart, tok::raw_identifier); - Result.setRawIdentifierData(Start); + SetTokLiteralData(Result, Start); PP->LookUpIdentifierInfo(Result); Result.setFlag(Token::IsEditorPlaceholder); BufferOffset = End - BufferStart; @@ -3628,8 +3670,10 @@ // Small amounts of horizontal whitespace is very common between tokens. 
if (isHorizontalWhitespace(BufferStart[CurOffset])) { do { - ++CurOffset; - } while (isHorizontalWhitespace(BufferStart[CurOffset])); + do { + ++CurOffset; + } while (isHorizontalWhitespace(BufferStart[CurOffset])); + } while (BufferStart[CurOffset] == 0 && CurOffset == BufferSize && tryGrowLexerBuffer()); // If we are keeping whitespace and other tokens, just return what we just // skipped. The next lexer invocation will return the token after the @@ -3656,8 +3700,12 @@ switch (Char) { case 0: // Null. // Found end of file? - if (CurOffset - 1 == BufferSize) - return LexEndOfFile(Result, CurOffset - 1); + if (CurOffset-1 == BufferSize) { + if (tryGrowLexerBuffer()) { + goto LexNextToken; + } + return LexEndOfFile(Result, CurOffset-1); + } // Check if we are performing code completion. if (isCodeCompletionPoint(CurOffset - 1)) { @@ -4487,7 +4535,7 @@ return true; } if (Result.isLiteral()) { - Result.setLiteralData(TokPtr); + SetTokLiteralData(Result, TokPtr); return true; } if (Result.is(tok::colon) && @@ -4571,3 +4619,18 @@ convertDependencyDirectiveToken(DDTok, Result); return false; } + +void Lexer::SetTokLiteralData(Token& Tok, const char* Str) { + if (TryGrowBuffer) { + assert(PP); + SourceLocation Loc = Tok.getLocation(); + PP->CreateString(StringRef(Str, Tok.getLength()), Tok); + Tok.setLocation(Loc); + return; + } + + if (Tok.is(tok::raw_identifier)) + Tok.setRawIdentifierData(Str); + else + Tok.setLiteralData(Str); +} Index: clang/include/clang/Lex/Preprocessor.h =================================================================== --- clang/include/clang/Lex/Preprocessor.h +++ clang/include/clang/Lex/Preprocessor.h @@ -44,6 +44,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/MemoryBufferRef.h" #include "llvm/Support/Registry.h" #include <cassert> #include <cstddef> @@ -78,6 +79,7 @@ class PreprocessorLexer; class PreprocessorOptions; class ScratchBuffer; +class SourceFileGrower; class TargetInfo; namespace Builtin { @@ -282,6 +284,8 @@ /// Empty line handler. EmptylineHandler *Emptyline = nullptr; + /// File grower. + SourceFileGrower *FileGrower = nullptr; public: /// The kind of translation unit we are processing. const TranslationUnitKind TUKind; @@ -1783,6 +1787,11 @@ const_cast<LangOptions &>(getLangOpts()).IncrementalExtensions = value; } + void setSourceFileGrower(SourceFileGrower* Val) { + FileGrower = Val; + } + SourceFileGrower* getSourceFileGrower() const { return FileGrower; } + /// Specify the point at which code-completion will be performed. /// /// \param File the file in which code completion should occur. If @@ -2265,6 +2274,8 @@ void EnterSubmodule(Module *M, SourceLocation ImportLoc, bool ForPragma); Module *LeaveSubmodule(bool ForPragma); + /// Try growing file by using SourceFileGrower. + std::optional<llvm::MemoryBufferRef> TryGrowFile(); private: friend void TokenLexer::ExpandFunctionArguments(); @@ -2711,6 +2722,16 @@ virtual void HandleEmptyline(SourceRange Range) = 0; }; +/// Abstract base class that will receive the ID of source +/// file that reached eof by Lexer and grow that file if possible. +class SourceFileGrower { +public: + virtual ~SourceFileGrower(); + + // This method should return true if it has grown the specified file. 
+ virtual bool TryGrowFile(FileID FileID) = 0; +}; + /// Registry of pragma handlers added by plugins using PragmaHandlerRegistry = llvm::Registry<PragmaHandler>; Index: clang/include/clang/Lex/Lexer.h =================================================================== --- clang/include/clang/Lex/Lexer.h +++ clang/include/clang/Lex/Lexer.h @@ -16,6 +16,7 @@ #include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/TokenKinds.h" +#include "llvm/ADT/FunctionExtras.h" #include "clang/Lex/DependencyDirectivesScanner.h" #include "clang/Lex/PreprocessorLexer.h" #include "clang/Lex/Token.h" @@ -71,6 +72,11 @@ : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {} }; +/// TryGrowLexerBuffer - Callback that gets called by Lexer when it reaches eof +/// to try getting newly grown buffer and continue the lexing. If this callback +/// returns nullopt, the Lexer will stop lexing and process eof. +using TryGrowLexerBuffer = llvm::unique_function<std::optional<llvm::MemoryBufferRef>()>; + /// Lexer - This provides a simple interface that turns a text buffer into a /// stream of tokens. This provides no support for file reading or buffering, /// or buffering/seeking of tokens, only forward lexing is supported. It relies @@ -157,16 +163,20 @@ /// next token to use from the current dependency directive. unsigned NextDepDirectiveTokenIndex = 0; + // TryGrowBuffer - The TryGrowLexerBuffer callback to grow the buffer if possible. + TryGrowLexerBuffer TryGrowBuffer; + void InitLexer(const char *BufStart, unsigned BufferOffset, unsigned BufferSize); public: + /// Lexer constructor - Create a new lexer object for the specified buffer /// with the specified preprocessor managing the lexing process. This lexer /// assumes that the associated file buffer and Preprocessor objects will /// outlive it, so it doesn't take ownership of either of them. Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, - bool IsFirstIncludeOfFile = true); + bool IsFirstIncludeOfFile = true, TryGrowLexerBuffer TryGrowBuffer = nullptr); /// Lexer constructor - Create a new raw lexer object. This object is only /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the @@ -611,9 +621,9 @@ bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, unsigned CurOffset); /// FormTokenWithChars - When we lex a token, we have identified a span - /// starting at BufferPtr, going to TokEnd that forms the token. This method + /// starting at BufferOffset, going to TokEnd that forms the token. This method /// takes that range and assigns it to the token as its location and size. In - /// addition, since tokens cannot overlap, this also updates BufferPtr to be + /// addition, since tokens cannot overlap, this also updates BufferOffset to be /// TokEnd. void FormTokenWithChars(Token &Result, unsigned TokEnd, tok::TokenKind Kind) { unsigned TokLen = TokEnd - BufferOffset; @@ -623,6 +633,13 @@ BufferOffset = TokEnd; } + /// SetTokString - If the buffer can be grown, it's unsafe to set the original + /// string pointer from the buffer to literal data of a token. When the buffer + /// has a posibility of growing, this method will copy the string into scratch + /// buffer and set it to literal data of the token. Otherwise, it will just set + /// the passed string pointer to literal data of the token as it is. 
+ void SetTokLiteralData(Token& Tok, const char* Str); + /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a /// tok::l_paren token, 0 if it is something else and 2 if there are no more /// tokens in the buffer controlled by this lexer. @@ -804,8 +821,12 @@ /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier /// character was lexed, \c false otherwise. bool tryConsumeIdentifierUTF8Char(unsigned &CurOffset); -}; + /// Try to grow the buffer if possible by calling TryGrowLexerBuffer callback. + /// \return \c true if it has grown the buffer so that Lexer should continue on + /// lexing, \c false otherwise. + bool tryGrowLexerBuffer(); +}; } // namespace clang #endif // LLVM_CLANG_LEX_LEXER_H
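As a usage note, this is roughly how an embedding tool would drive the Preprocessor with a grower once this lands; it mirrors the new LexerTest above. PP, SM, MainFID, and MainFE are assumed to have been set up by the embedder, and ReplFileGrower is the hypothetical grower sketched after the summary.

ReplFileGrower Grower(SM, MainFID, MainFE);
PP.setSourceFileGrower(&Grower);
PP.EnterSourceFile(MainFID, nullptr, clang::SourceLocation());

clang::Token Tok;
do {
  // When the Lexer hits the current end of the buffer, it invokes the
  // TryGrowLexerBuffer callback installed by the Preprocessor, which forwards
  // to Grower.TryGrowFile(); lexing resumes if that returns true, and only
  // falls through to tok::eof once the grower gives up.
  PP.Lex(Tok);
  // ... hand Tok to the parser or incremental consumer ...
} while (Tok.isNot(clang::tok::eof));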