[PATCH] D39279: Stringizing raw string literals containing newline

Taewook Oh via Phabricator via cfe-commits Mon, 04 Dec 2017 10:35:55 -0800

twoh updated this revision to Diff 125368.
twoh added a comment.

Thanks @jkorous-apple for the comment. I think your suggestion is a more
precise description for the implementation, and adjusted the comments
accordingly.


I intentionally didn't add the details to the comments for the header,
as LLVM coding standard "Don’t duplicate the documentation comment in
the header file and in the implementation file. ... implementation files
can include additional comments (not necessarily in Doxygen markup) to
explain implementation details as needed." I'd like to leave it as of
now if you're not strongly against it.

Thank you again!


https://reviews.llvm.org/D39279

Files:
  include/clang/Lex/Lexer.h
  lib/Lex/Lexer.cpp
  test/Preprocessor/macro_raw_string.cpp
  unittests/Lex/LexerTest.cpp

Index: unittests/Lex/LexerTest.cpp
===================================================================
--- unittests/Lex/LexerTest.cpp
+++ unittests/Lex/LexerTest.cpp
@@ -37,7 +37,7 @@
       DiagID(new DiagnosticIDs()),
       Diags(DiagID, new DiagnosticOptions, new IgnoringDiagConsumer()),
       SourceMgr(Diags, FileMgr),
-      TargetOpts(new TargetOptions) 
+      TargetOpts(new TargetOptions)
   {
     TargetOpts->Triple = "x86_64-apple-darwin11.1.0";
     Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts);
@@ -478,4 +478,42 @@
   EXPECT_TRUE(LexedTokens.empty());
 }
 
+TEST_F(LexerTest, StringizingRasString) {
+  // For "std::string Lexer::Stringify(StringRef Str, bool Charify)".
+  std::string String1 = R"(foo
+    {"bar":[]}
+    baz)";
+  // For "void Lexer::Stringify(SmallVectorImpl<char> &Str)".
+  SmallString<128> String2;
+  String2 += String1.c_str();
+
+  // Corner cases.
+  std::string String3 = R"(\
+    \n
+    \\n
+    \\)";
+  SmallString<128> String4;
+  String4 += String3.c_str();
+  std::string String5 = R"(a\
+
+
+    \\b)";
+  SmallString<128> String6;
+  String6 += String5.c_str();
+
+  String1 = Lexer::Stringify(StringRef(String1));
+  Lexer::Stringify(String2);
+  String3 = Lexer::Stringify(StringRef(String3));
+  Lexer::Stringify(String4);
+  String5 = Lexer::Stringify(StringRef(String5));
+  Lexer::Stringify(String6);
+
+  EXPECT_EQ(String1, R"(foo\n    {\"bar\":[]}\n    baz)");
+  EXPECT_EQ(String2, R"(foo\n    {\"bar\":[]}\n    baz)");
+  EXPECT_EQ(String3, R"(\\\n    \\n\n    \\\\n\n    \\\\)");
+  EXPECT_EQ(String4, R"(\\\n    \\n\n    \\\\n\n    \\\\)");
+  EXPECT_EQ(String5, R"(a\\\n\n\n    \\\\b)");
+  EXPECT_EQ(String6, R"(a\\\n\n\n    \\\\b)");
+}
+
 } // anonymous namespace
Index: test/Preprocessor/macro_raw_string.cpp
===================================================================
--- /dev/null
+++ test/Preprocessor/macro_raw_string.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -E -std=c++11 %s -o %t
+// RUN: %clang_cc1 %t
+
+#define FOO(str) foo(#str)
+
+extern void foo(const char *str);
+
+void bar() {
+  FOO(R"(foo
+    bar)");
+}
Index: lib/Lex/Lexer.cpp
===================================================================
--- lib/Lex/Lexer.cpp
+++ lib/Lex/Lexer.cpp
@@ -209,29 +209,44 @@
   return L;
 }
 
-/// Stringify - Convert the specified string into a C string, with surrounding
-/// ""'s, and with escaped \ and " characters.
+/// StringifyImpl - Implementation of Stringify functions. Convert the
+/// specified string into a C string by i) escaping '\\' and " characters and
+/// ii) replacing newline character(s) with "\\n".
+template <typename T>
+void StringifyImpl(T& Str, char Quote) {
+  typename T::size_type i = 0, e = Str.size();
+  while (i < e) {
+    if (Str[i] == '\\' || Str[i] == Quote) {
+      Str.insert(Str.begin() + i, '\\');
+      i += 2;
+      ++e;
+    } else if (Str[i] == '\n' || Str[i] == '\r') {
+      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
+      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
+          Str[i] != Str[i + 1]) {
+        Str[i] = '\\';
+        Str[i+1] = 'n';
+      } else {
+        // Replace '\n' and '\r' to '\\' followed by 'n'.
+        Str[i] = '\\';
+        Str.insert(Str.begin() + i + 1, 'n');
+        ++e;
+      }
+      i += 2;
+    } else
+      ++i;
+  }
+}
+
 std::string Lexer::Stringify(StringRef Str, bool Charify) {
   std::string Result = Str;
   char Quote = Charify ? '\'' : '"';
-  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
-    if (Result[i] == '\\' || Result[i] == Quote) {
-      Result.insert(Result.begin()+i, '\\');
-      ++i; ++e;
-    }
-  }
+  StringifyImpl(Result, Quote);
   return Result;
 }
 
-/// Stringify - Convert the specified string into a C string by escaping '\'
-/// and " characters.  This does not add surrounding ""'s to the string.
 void Lexer::Stringify(SmallVectorImpl<char> &Str) {
-  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
-    if (Str[i] == '\\' || Str[i] == '"') {
-      Str.insert(Str.begin()+i, '\\');
-      ++i; ++e;
-    }
-  }
+  StringifyImpl(Str, '"');
 }
 
 //===----------------------------------------------------------------------===//
@@ -367,7 +382,7 @@
 /// to point to a constant buffer with the data already in it (avoiding a
 /// copy).  The caller is not allowed to modify the returned buffer pointer
 /// if an internal buffer is returned.
-unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 
+unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                             const SourceManager &SourceMgr,
                             const LangOptions &LangOpts, bool *Invalid) {
   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
@@ -592,17 +607,17 @@
       if (TheTok.getKind() == tok::eof) {
         break;
       }
-      
+
       // If we haven't hit the end of the preprocessor directive, skip this
       // token.
       if (!TheTok.isAtStartOfLine())
         continue;
-        
+
       // We've passed the end of the preprocessor directive, and will look
       // at this token again below.
       InPreprocessorDirective = false;
     }
-    
+
     // Keep track of the # of lines in the preamble.
     if (TheTok.isAtStartOfLine()) {
       unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
@@ -619,13 +634,13 @@
         ActiveCommentLoc = TheTok.getLocation();
       continue;
     }
-    
+
     if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
-      // This is the start of a preprocessor directive. 
+      // This is the start of a preprocessor directive.
       Token HashTok = TheTok;
       InPreprocessorDirective = true;
       ActiveCommentLoc = SourceLocation();
-      
+
       // Figure out which directive this is. Since we're lexing raw tokens,
       // we don't have an identifier table available. Instead, just look at
       // the raw identifier to recognize and categorize preprocessor directives.
@@ -665,7 +680,7 @@
           break;
         }
       }
-      
+
       // We only end up here if we didn't recognize the preprocessor
       // directive or it was one that can't occur in the preamble at this
       // point. Roll back the current token to the location of the '#'.
@@ -678,7 +693,7 @@
     // the preamble.
     break;
   } while (true);
-  
+
   SourceLocation End;
   if (ActiveCommentLoc.isValid())
     End = ActiveCommentLoc; // don't truncate a decl comment.
@@ -700,13 +715,13 @@
   // trigraphs.
   bool Invalid = false;
   const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
-  
+
   // If they request the first char of the token, we're trivially done.
   if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
     return TokStart;
-  
+
   unsigned PhysOffset = 0;
-  
+
   // The usual case is that tokens don't contain anything interesting.  Skip
   // over the uninteresting characters.  If a token only consists of simple
   // chars, this method is extremely fast.
@@ -717,23 +732,23 @@
     --CharNo;
     ++PhysOffset;
   }
-  
+
   // If we have a character that may be a trigraph or escaped newline, use a
   // lexer to parse it correctly.
   for (; CharNo; --CharNo) {
     unsigned Size;
     Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
     TokPtr += Size;
     PhysOffset += Size;
   }
-  
+
   // Final detail: if we end up on an escaped newline, we want to return the
   // location of the actual byte of the token.  For example foo\<newline>bar
   // advanced by 3 should return the location of b, not of \\.  One compounding
   // detail of this is that the escape may be made by a trigraph.
   if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
     PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
-  
+
   return TokStart.getLocWithOffset(PhysOffset);
 }
 
@@ -768,7 +783,7 @@
     Len = Len - Offset;
   else
     return Loc;
-  
+
   return Loc.getLocWithOffset(Len);
 }
 
@@ -965,7 +980,7 @@
 
     // For macro arguments we need to check that the argument did not come
     // from an inner macro, e.g: "MAC1( MAC2(foo) )"
-    
+
     // Loc points to the argument id of the macro definition, move to the
     // macro expansion.
     Loc = SM.getImmediateExpansionRange(Loc).first;
@@ -1795,15 +1810,15 @@
     // getAndAdvanceChar.
     if (C == '\\')
       C = getAndAdvanceChar(CurPtr, Result);
-    
+
     if (C == '\n' || C == '\r' ||             // Newline.
         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
         Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
       return true;
     }
-    
+
     if (C == 0) {
       if (isCodeCompletionPoint(CurPtr-1)) {
         PP->CodeCompleteNaturalLanguage();
@@ -2232,7 +2247,7 @@
   std::string Spelling = PP->getSpelling(Result, &Invalid);
   if (Invalid)
     return true;
-  
+
   assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
   Spelling[1] = '*';   // Change prefix to "/*".
   Spelling += "*/";    // add suffix.
@@ -2558,16 +2573,16 @@
       resetExtendedTokenMode();
     return true;  // Have a token.
   }
- 
+
   // If we are in raw mode, return this event as an EOF token.  Let the caller
   // that put us in raw mode handle the event.
   if (isLexingRawMode()) {
     Result.startToken();
     BufferPtr = BufferEnd;
     FormTokenWithChars(Result, BufferEnd, tok::eof);
     return true;
   }
-  
+
   if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
     PP->setRecordedPreambleConditionalStack(ConditionalStack);
     ConditionalStack.clear();
@@ -2679,7 +2694,7 @@
   if (CurPtr != BufferStart &&
       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
     return false;
-  
+
   // Check to see if we have <<<<<<< or >>>>.
   if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
       !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
@@ -2689,7 +2704,7 @@
   // it.
   if (CurrentConflictMarkerState || isLexingRawMode())
     return false;
-  
+
   ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
 
   // Check to see if there is an ending marker somewhere in the buffer at the
@@ -2699,7 +2714,7 @@
     // Diagnose this, and ignore to the end of line.
     Diag(CurPtr, diag::err_conflict_marker);
     CurrentConflictMarkerState = Kind;
-    
+
     // Skip ahead to the end of line.  We know this exists because the
     // end-of-conflict marker starts with \r or \n.
     while (*CurPtr != '\r' && *CurPtr != '\n') {
@@ -2709,7 +2724,7 @@
     BufferPtr = CurPtr;
     return true;
   }
-  
+
   // No end of conflict marker found.
   return false;
 }
@@ -2723,35 +2738,35 @@
   if (CurPtr != BufferStart &&
       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
     return false;
-  
+
   // If we have a situation where we don't care about conflict markers, ignore
   // it.
   if (!CurrentConflictMarkerState || isLexingRawMode())
     return false;
-  
+
   // Check to see if we have the marker (4 characters in a row).
   for (unsigned i = 1; i != 4; ++i)
     if (CurPtr[i] != CurPtr[0])
       return false;
-  
+
   // If we do have it, search for the end of the conflict marker.  This could
   // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
   // be the end of conflict marker.
   if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                         CurrentConflictMarkerState)) {
     CurPtr = End;
-    
+
     // Skip ahead to the end of line.
     while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
       ++CurPtr;
-    
+
     BufferPtr = CurPtr;
-    
+
     // No longer in the conflict marker.
     CurrentConflictMarkerState = CMK_None;
     return true;
   }
-  
+
   return false;
 }
 
@@ -3060,7 +3075,7 @@
     // We know the lexer hasn't changed, so just try again with this lexer.
     // (We manually eliminate the tail call to avoid recursion.)
     goto LexNextToken;
-      
+
   case 26:  // DOS & CP/M EOF: "^Z".
     // If we're in Microsoft extensions mode, treat this as end of file.
     if (LangOpts.MicrosoftExt) {
@@ -3072,7 +3087,7 @@
     // If Microsoft extensions are disabled, this is just random garbage.
     Kind = tok::unknown;
     break;
-      
+
   case '\r':
     if (CurPtr[0] == '\n')
       Char = getAndAdvanceChar(CurPtr, Result);
@@ -3135,7 +3150,7 @@
     // We only saw whitespace, so just try again with this lexer.
     // (We manually eliminate the tail call to avoid recursion.)
     goto LexNextToken;
-      
+
   // C99 6.4.4.1: Integer Constants.
   // C99 6.4.4.2: Floating Constants.
   case '0': case '1': case '2': case '3': case '4':
@@ -3634,7 +3649,7 @@
       // If this is '====' and we're in a conflict marker, ignore it.
       if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
         goto LexNextToken;
-      
+
       Kind = tok::equalequal;
       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
     } else {
@@ -3721,7 +3736,7 @@
       }
       return LexUnicode(Result, CodePoint, CurPtr);
     }
-    
+
     if (isLexingRawMode() || ParsingPreprocessorDirective ||
         PP->isPreprocessedOutput()) {
       ++CurPtr;
Index: include/clang/Lex/Lexer.h
===================================================================
--- include/clang/Lex/Lexer.h
+++ include/clang/Lex/Lexer.h
@@ -70,7 +70,7 @@
   SourceLocation FileLoc;        // Location for start of file.
   LangOptions LangOpts;          // LangOpts enabled by this language (cache).
   bool Is_PragmaLexer;           // True if lexer for _Pragma handling.
-  
+
   //===--------------------------------------------------------------------===//
   // Context-specific lexing flags set by the preprocessor.
   //
@@ -241,17 +241,16 @@
 
   /// \brief Return the current location in the buffer.
   const char *getBufferLocation() const { return BufferPtr; }
-  
-  /// Stringify - Convert the specified string into a C string by escaping '\'
-  /// and " characters.  This does not add surrounding ""'s to the string.
-  /// If Charify is true, this escapes the ' character instead of ".
+
+  /// Stringify - Convert the specified string into a C string. This does not
+  /// add surrounding ""'s to the string. If Charify is true, this escapes the
+  /// ' character instead of ".
   static std::string Stringify(StringRef Str, bool Charify = false);
 
-  /// Stringify - Convert the specified string into a C string by escaping '\'
-  /// and " characters.  This does not add surrounding ""'s to the string.
+  /// Stringify - Convert the specified string into a C string. This does not
+  /// add surrounding ""'s to the string.
   static void Stringify(SmallVectorImpl<char> &Str);
 
-  
   /// getSpelling - This method is used to get the spelling of a token into a
   /// preallocated buffer, instead of as an std::string.  The caller is required
   /// to allocate enough space for the token, which is guaranteed to be at least
@@ -262,19 +261,19 @@
   /// to point to a constant buffer with the data already in it (avoiding a
   /// copy).  The caller is not allowed to modify the returned buffer pointer
   /// if an internal buffer is returned.
-  static unsigned getSpelling(const Token &Tok, const char *&Buffer, 
+  static unsigned getSpelling(const Token &Tok, const char *&Buffer,
                               const SourceManager &SourceMgr,
                               const LangOptions &LangOpts,
                               bool *Invalid = nullptr);
-  
+
   /// getSpelling() - Return the 'spelling' of the Tok token.  The spelling of a
   /// token is the characters used to represent the token in the source file
   /// after trigraph expansion and escaped-newline folding.  In particular, this
   /// wants to get the true, uncanonicalized, spelling of things like digraphs
   /// UCNs, etc.
   static std::string getSpelling(const Token &Tok,
                                  const SourceManager &SourceMgr,
-                                 const LangOptions &LangOpts, 
+                                 const LangOptions &LangOpts,
                                  bool *Invalid = nullptr);
 
   /// getSpelling - This method is used to get the spelling of the
@@ -290,7 +289,7 @@
                                const SourceManager &SourceMgr,
                                const LangOptions &LangOpts,
                                bool *invalid = nullptr);
-  
+
   /// MeasureTokenLength - Relex the token at the specified location and return
   /// its length in bytes in the input file.  If the token needs cleaning (e.g.
   /// includes a trigraph or an escaped newline) then this count includes bytes
@@ -312,15 +311,15 @@
   static SourceLocation GetBeginningOfToken(SourceLocation Loc,
                                             const SourceManager &SM,
                                             const LangOptions &LangOpts);
-  
+
   /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
   /// location at the start of a token, return a new location that specifies a
   /// character within the token.  This handles trigraphs and escaped newlines.
   static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
                                                 unsigned Character,
                                                 const SourceManager &SM,
                                                 const LangOptions &LangOpts);
-  
+
   /// \brief Computes the source location just past the end of the
   /// token at this source location.
   ///
@@ -667,7 +666,7 @@
   bool SkipBlockComment      (Token &Result, const char *CurPtr,
                               bool &TokAtPhysicalStartOfLine);
   bool SaveLineComment       (Token &Result, const char *CurPtr);
-  
+
   bool IsStartOfConflictMarker(const char *CurPtr);
   bool HandleEndOfConflictMarker(const char *CurPtr);

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D39279: Stringizing raw string literals containing newline

Reply via email to