twoh updated this revision to Diff 123658. twoh added a comment. Addressing @vsapsai's comments. Thank you for the suggestion! The added test case actually found an off-by-one error in the original patch. I improved the comments as well.
https://reviews.llvm.org/D39279 Files: include/clang/Lex/Lexer.h lib/Lex/Lexer.cpp test/Preprocessor/macro_raw_string.cpp unittests/Lex/LexerTest.cpp
Index: unittests/Lex/LexerTest.cpp =================================================================== --- unittests/Lex/LexerTest.cpp +++ unittests/Lex/LexerTest.cpp @@ -37,7 +37,7 @@ DiagID(new DiagnosticIDs()), Diags(DiagID, new DiagnosticOptions, new IgnoringDiagConsumer()), SourceMgr(Diags, FileMgr), - TargetOpts(new TargetOptions) + TargetOpts(new TargetOptions) { TargetOpts->Triple = "x86_64-apple-darwin11.1.0"; Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts); @@ -478,4 +478,42 @@ EXPECT_TRUE(LexedTokens.empty()); } +TEST_F(LexerTest, StringizingRasString) { + // For "std::string Lexer::Stringify(StringRef Str, bool Charify)". + std::string String1 = R"(foo + {"bar":[]} + baz)"; + // For "void Lexer::Stringify(SmallVectorImpl<char> &Str)". + SmallString<128> String2; + String2 += String1.c_str(); + + // Corner cases. + std::string String3 = R"(\ + \n + \\n + \\)"; + SmallString<128> String4; + String4 += String3.c_str(); + std::string String5 = R"(a\ + + + \\b)"; + SmallString<128> String6; + String6 += String5.c_str(); + + String1 = Lexer::Stringify(StringRef(String1)); + Lexer::Stringify(String2); + String3 = Lexer::Stringify(StringRef(String3)); + Lexer::Stringify(String4); + String5 = Lexer::Stringify(StringRef(String5)); + Lexer::Stringify(String6); + + EXPECT_EQ(String1, R"(foo\n {\"bar\":[]}\n baz)"); + EXPECT_EQ(String2, R"(foo\n {\"bar\":[]}\n baz)"); + EXPECT_EQ(String3, R"(\\\n \\n\n \\\\n\n \\\\)"); + EXPECT_EQ(String4, R"(\\\n \\n\n \\\\n\n \\\\)"); + EXPECT_EQ(String5, R"(a\\\n\n\n \\\\b)"); + EXPECT_EQ(String6, R"(a\\\n\n\n \\\\b)"); +} + } // anonymous namespace Index: test/Preprocessor/macro_raw_string.cpp =================================================================== --- /dev/null +++ test/Preprocessor/macro_raw_string.cpp @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -E -std=c++11 %s -o %t +// RUN: %clang_cc1 %t + +#define FOO(str) foo(#str) + +extern void foo(const char *str); + +void bar() { + FOO(R"(foo + bar)"); +} Index: 
lib/Lex/Lexer.cpp =================================================================== --- lib/Lex/Lexer.cpp +++ lib/Lex/Lexer.cpp @@ -209,29 +209,43 @@ return L; } -/// Stringify - Convert the specified string into a C string, with surrounding -/// ""'s, and with escaped \ and " characters. +/// StringifyImpl - Implementation of Stringify functions. Convert the +/// specified string into a C string by i) escaping '\' and " characters and +/// ii) replacing newline character(s) with "\n". +template <typename T> +void StringifyImpl(T& Str, char Quote) { + unsigned i = 0, e = Str.size(); + while (i < e) { + if (Str[i] == '\\' || Str[i] == Quote) { + Str.insert(Str.begin() + i, '\\'); + i += 2; + ++e; + } else if (Str[i] == '\n' || Str[i] == '\r') { + // Replace '\n', '\r', '\r\n', and '\n\r' to '\\' followed by 'n'. + unsigned Size = 1; + if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && + Str[i] != Str[i + 1]) + Size += 1; + + Str.erase(Str.begin() + i, Str.begin() + i + Size); + Str.insert(Str.begin() + i, '\\'); + Str.insert(Str.begin() + i + 1, 'n'); + i += 2; + e += (2 - Size); + } else + ++i; + } +} + std::string Lexer::Stringify(StringRef Str, bool Charify) { std::string Result = Str; char Quote = Charify ? '\'' : '"'; - for (unsigned i = 0, e = Result.size(); i != e; ++i) { - if (Result[i] == '\\' || Result[i] == Quote) { - Result.insert(Result.begin()+i, '\\'); - ++i; ++e; - } - } + StringifyImpl(Result, Quote); return Result; } -/// Stringify - Convert the specified string into a C string by escaping '\' -/// and " characters. This does not add surrounding ""'s to the string. 
void Lexer::Stringify(SmallVectorImpl<char> &Str) { - for (unsigned i = 0, e = Str.size(); i != e; ++i) { - if (Str[i] == '\\' || Str[i] == '"') { - Str.insert(Str.begin()+i, '\\'); - ++i; ++e; - } - } + StringifyImpl(Str, '"'); } //===----------------------------------------------------------------------===// @@ -367,7 +381,7 @@ /// to point to a constant buffer with the data already in it (avoiding a /// copy). The caller is not allowed to modify the returned buffer pointer /// if an internal buffer is returned. -unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, +unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid) { assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); @@ -592,17 +606,17 @@ if (TheTok.getKind() == tok::eof) { break; } - + // If we haven't hit the end of the preprocessor directive, skip this // token. if (!TheTok.isAtStartOfLine()) continue; - + // We've passed the end of the preprocessor directive, and will look // at this token again below. InPreprocessorDirective = false; } - + // Keep track of the # of lines in the preamble. if (TheTok.isAtStartOfLine()) { unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; @@ -619,13 +633,13 @@ ActiveCommentLoc = TheTok.getLocation(); continue; } - + if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { - // This is the start of a preprocessor directive. + // This is the start of a preprocessor directive. Token HashTok = TheTok; InPreprocessorDirective = true; ActiveCommentLoc = SourceLocation(); - + // Figure out which directive this is. Since we're lexing raw tokens, // we don't have an identifier table available. Instead, just look at // the raw identifier to recognize and categorize preprocessor directives. 
@@ -665,7 +679,7 @@ break; } } - + // We only end up here if we didn't recognize the preprocessor // directive or it was one that can't occur in the preamble at this // point. Roll back the current token to the location of the '#'. @@ -678,7 +692,7 @@ // the preamble. break; } while (true); - + SourceLocation End; if (ActiveCommentLoc.isValid()) End = ActiveCommentLoc; // don't truncate a decl comment. @@ -700,13 +714,13 @@ // trigraphs. bool Invalid = false; const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); - + // If they request the first char of the token, we're trivially done. if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) return TokStart; - + unsigned PhysOffset = 0; - + // The usual case is that tokens don't contain anything interesting. Skip // over the uninteresting characters. If a token only consists of simple // chars, this method is extremely fast. @@ -717,23 +731,23 @@ --CharNo; ++PhysOffset; } - + // If we have a character that may be a trigraph or escaped newline, use a // lexer to parse it correctly. for (; CharNo; --CharNo) { unsigned Size; Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); TokPtr += Size; PhysOffset += Size; } - + // Final detail: if we end up on an escaped newline, we want to return the // location of the actual byte of the token. For example foo\<newline>bar // advanced by 3 should return the location of b, not of \\. One compounding // detail of this is that the escape may be made by a trigraph. if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; - + return TokStart.getLocWithOffset(PhysOffset); } @@ -768,7 +782,7 @@ Len = Len - Offset; else return Loc; - + return Loc.getLocWithOffset(Len); } @@ -965,7 +979,7 @@ // For macro arguments we need to check that the argument did not come // from an inner macro, e.g: "MAC1( MAC2(foo) )" - + // Loc points to the argument id of the macro definition, move to the // macro expansion. 
Loc = SM.getImmediateExpansionRange(Loc).first; @@ -1795,15 +1809,15 @@ // getAndAdvanceChar. if (C == '\\') C = getAndAdvanceChar(CurPtr, Result); - + if (C == '\n' || C == '\r' || // Newline. (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1; FormTokenWithChars(Result, CurPtr-1, tok::unknown); return true; } - + if (C == 0) { if (isCodeCompletionPoint(CurPtr-1)) { PP->CodeCompleteNaturalLanguage(); @@ -2232,7 +2246,7 @@ std::string Spelling = PP->getSpelling(Result, &Invalid); if (Invalid) return true; - + assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); Spelling[1] = '*'; // Change prefix to "/*". Spelling += "*/"; // add suffix. @@ -2558,16 +2572,16 @@ resetExtendedTokenMode(); return true; // Have a token. } - + // If we are in raw mode, return this event as an EOF token. Let the caller // that put us in raw mode handle the event. if (isLexingRawMode()) { Result.startToken(); BufferPtr = BufferEnd; FormTokenWithChars(Result, BufferEnd, tok::eof); return true; } - + if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { PP->setRecordedPreambleConditionalStack(ConditionalStack); ConditionalStack.clear(); @@ -2679,7 +2693,7 @@ if (CurPtr != BufferStart && CurPtr[-1] != '\n' && CurPtr[-1] != '\r') return false; - + // Check to see if we have <<<<<<< or >>>>. if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") && !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")) @@ -2689,7 +2703,7 @@ // it. if (CurrentConflictMarkerState || isLexingRawMode()) return false; - + ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; // Check to see if there is an ending marker somewhere in the buffer at the @@ -2699,7 +2713,7 @@ // Diagnose this, and ignore to the end of line. Diag(CurPtr, diag::err_conflict_marker); CurrentConflictMarkerState = Kind; - + // Skip ahead to the end of line. 
We know this exists because the // end-of-conflict marker starts with \r or \n. while (*CurPtr != '\r' && *CurPtr != '\n') { @@ -2709,7 +2723,7 @@ BufferPtr = CurPtr; return true; } - + // No end of conflict marker found. return false; } @@ -2723,35 +2737,35 @@ if (CurPtr != BufferStart && CurPtr[-1] != '\n' && CurPtr[-1] != '\r') return false; - + // If we have a situation where we don't care about conflict markers, ignore // it. if (!CurrentConflictMarkerState || isLexingRawMode()) return false; - + // Check to see if we have the marker (4 characters in a row). for (unsigned i = 1; i != 4; ++i) if (CurPtr[i] != CurPtr[0]) return false; - + // If we do have it, search for the end of the conflict marker. This could // fail if it got skipped with a '#if 0' or something. Note that CurPtr might // be the end of conflict marker. if (const char *End = FindConflictEnd(CurPtr, BufferEnd, CurrentConflictMarkerState)) { CurPtr = End; - + // Skip ahead to the end of line. while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') ++CurPtr; - + BufferPtr = CurPtr; - + // No longer in the conflict marker. CurrentConflictMarkerState = CMK_None; return true; } - + return false; } @@ -3060,7 +3074,7 @@ // We know the lexer hasn't changed, so just try again with this lexer. // (We manually eliminate the tail call to avoid recursion.) goto LexNextToken; - + case 26: // DOS & CP/M EOF: "^Z". // If we're in Microsoft extensions mode, treat this as end of file. if (LangOpts.MicrosoftExt) { @@ -3072,7 +3086,7 @@ // If Microsoft extensions are disabled, this is just random garbage. Kind = tok::unknown; break; - + case '\r': if (CurPtr[0] == '\n') Char = getAndAdvanceChar(CurPtr, Result); @@ -3135,7 +3149,7 @@ // We only saw whitespace, so just try again with this lexer. // (We manually eliminate the tail call to avoid recursion.) goto LexNextToken; - + // C99 6.4.4.1: Integer Constants. // C99 6.4.4.2: Floating Constants. 
case '0': case '1': case '2': case '3': case '4': @@ -3634,7 +3648,7 @@ // If this is '====' and we're in a conflict marker, ignore it. if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) goto LexNextToken; - + Kind = tok::equalequal; CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); } else { @@ -3721,7 +3735,7 @@ } return LexUnicode(Result, CodePoint, CurPtr); } - + if (isLexingRawMode() || ParsingPreprocessorDirective || PP->isPreprocessedOutput()) { ++CurPtr; Index: include/clang/Lex/Lexer.h =================================================================== --- include/clang/Lex/Lexer.h +++ include/clang/Lex/Lexer.h @@ -70,7 +70,7 @@ SourceLocation FileLoc; // Location for start of file. LangOptions LangOpts; // LangOpts enabled by this language (cache). bool Is_PragmaLexer; // True if lexer for _Pragma handling. - + //===--------------------------------------------------------------------===// // Context-specific lexing flags set by the preprocessor. // @@ -241,17 +241,16 @@ /// \brief Return the current location in the buffer. const char *getBufferLocation() const { return BufferPtr; } - - /// Stringify - Convert the specified string into a C string by escaping '\' - /// and " characters. This does not add surrounding ""'s to the string. - /// If Charify is true, this escapes the ' character instead of ". + + /// Stringify - Convert the specified string into a C string. This does not + /// add surrounding ""'s to the string. If Charify is true, this escapes the + /// ' character instead of ". static std::string Stringify(StringRef Str, bool Charify = false); - /// Stringify - Convert the specified string into a C string by escaping '\' - /// and " characters. This does not add surrounding ""'s to the string. + /// Stringify - Convert the specified string into a C string. This does not + /// add surrounding ""'s to the string. 
static void Stringify(SmallVectorImpl<char> &Str); - /// getSpelling - This method is used to get the spelling of a token into a /// preallocated buffer, instead of as an std::string. The caller is required /// to allocate enough space for the token, which is guaranteed to be at least @@ -262,19 +261,19 @@ /// to point to a constant buffer with the data already in it (avoiding a /// copy). The caller is not allowed to modify the returned buffer pointer /// if an internal buffer is returned. - static unsigned getSpelling(const Token &Tok, const char *&Buffer, + static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid = nullptr); - + /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a /// token is the characters used to represent the token in the source file /// after trigraph expansion and escaped-newline folding. In particular, this /// wants to get the true, uncanonicalized, spelling of things like digraphs /// UCNs, etc. static std::string getSpelling(const Token &Tok, const SourceManager &SourceMgr, - const LangOptions &LangOpts, + const LangOptions &LangOpts, bool *Invalid = nullptr); /// getSpelling - This method is used to get the spelling of the @@ -290,7 +289,7 @@ const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *invalid = nullptr); - + /// MeasureTokenLength - Relex the token at the specified location and return /// its length in bytes in the input file. If the token needs cleaning (e.g. /// includes a trigraph or an escaped newline) then this count includes bytes @@ -312,15 +311,15 @@ static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts); - + /// AdvanceToTokenCharacter - If the current SourceLocation specifies a /// location at the start of a token, return a new location that specifies a /// character within the token. This handles trigraphs and escaped newlines. 
static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart, unsigned Character, const SourceManager &SM, const LangOptions &LangOpts); - + /// \brief Computes the source location just past the end of the /// token at this source location. /// @@ -667,7 +666,7 @@ bool SkipBlockComment (Token &Result, const char *CurPtr, bool &TokAtPhysicalStartOfLine); bool SaveLineComment (Token &Result, const char *CurPtr); - + bool IsStartOfConflictMarker(const char *CurPtr); bool HandleEndOfConflictMarker(const char *CurPtr);
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits