Author: Corentin Jabot Date: 2022-07-09T17:18:35+02:00 New Revision: 50416e5454d802a3ef71bb799e5bfd38e8ec9089
URL: https://github.com/llvm/llvm-project/commit/50416e5454d802a3ef71bb799e5bfd38e8ec9089 DIFF: https://github.com/llvm/llvm-project/commit/50416e5454d802a3ef71bb799e5bfd38e8ec9089.diff LOG: Revert "[Clang] Add a warning on invalid UTF-8 in comments." It is probable that this change crashes on the powerpc bots. This reverts commit 355532a1499aa9b13a89fb5b5caaba2344d57cd7. Added: Modified: clang/docs/ReleaseNotes.rst clang/include/clang/Basic/DiagnosticLexKinds.td clang/lib/Lex/Lexer.cpp clang/test/SemaCXX/static-assert.cpp llvm/include/llvm/Support/ConvertUTF.h llvm/lib/Support/ConvertUTF.cpp Removed: clang/test/Lexer/comment-invalid-utf8.c clang/test/Lexer/comment-utf8.c ################################################################################ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index da14489f6f2cd..5dae6205efa05 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -279,11 +279,9 @@ Improvements to Clang's diagnostics unevaluated operands of a ``typeid`` expression, as they are now modeled correctly in the CFG. This fixes `Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_. -- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will +- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will suggest a fix if the decl being assigned is a parameter that shadows a data member of the contained class. -- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in - comments. Non-comprehensive list of changes in this release ------------------------------------------------- @@ -594,7 +592,7 @@ AST Matchers - Added ``forEachTemplateArgument`` matcher which creates a match every time a ``templateArgument`` matches the matcher supplied to it. - + - Added ``objcStringLiteral`` matcher which matches ObjectiveC String literal expressions. 
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 38ee022e5f04c..ac86076140c58 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -113,8 +113,6 @@ def warn_four_char_character_literal : Warning< // Unicode and UCNs def err_invalid_utf8 : Error< "source file is not valid UTF-8">; -def warn_invalid_utf8_in_comment : Extension< - "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>; def err_character_not_allowed : Error< "unexpected character <U+%0>">; def err_character_not_allowed_identifier : Error< diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index 799f3017daa82..6820057642bea 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -2392,37 +2392,13 @@ bool Lexer::SkipLineComment(Token &Result, const char *CurPtr, // // This loop terminates with CurPtr pointing at the newline (or end of buffer) // character that ends the line comment. - - // C++23 [lex.phases] p1 - // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a - // diagnostic only once per entire ill-formed subsequence to avoid - // emiting to many diagnostics (see http://unicode.org/review/pr-121.html). - bool UnicodeDecodingAlreadyDiagnosed = false; - char C; while (true) { C = *CurPtr; // Skip over characters in the fast loop. - while (isASCII(C) && C != 0 && // Potentially EOF. - C != '\n' && C != '\r') { // Newline or DOS-style newline. + while (C != 0 && // Potentially EOF. + C != '\n' && C != '\r') // Newline or DOS-style newline. 
C = *++CurPtr; - UnicodeDecodingAlreadyDiagnosed = false; - } - - if (!isASCII(C)) { - unsigned Length = llvm::getUTF8SequenceSize( - (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd); - if (Length == 0) { - if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode()) - Diag(CurPtr, diag::warn_invalid_utf8_in_comment); - UnicodeDecodingAlreadyDiagnosed = true; - ++CurPtr; - } else { - UnicodeDecodingAlreadyDiagnosed = false; - CurPtr += Length; - } - continue; - } const char *NextLine = CurPtr; if (C != 0) { @@ -2689,12 +2665,6 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, if (C == '/') C = *CurPtr++; - // C++23 [lex.phases] p1 - // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a - // diagnostic only once per entire ill-formed subsequence to avoid - // emiting to many diagnostics (see http://unicode.org/review/pr-121.html). - bool UnicodeDecodingAlreadyDiagnosed = false; - while (true) { // Skip over all non-interesting characters until we find end of buffer or a // (probably ending) '/' character. @@ -2703,22 +2673,14 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, // doesn't check for '\0'. !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { // While not aligned to a 16-byte boundary. 
- while (C != '/' && (intptr_t)CurPtr % 16 != 0) { - if (!isASCII(C)) - goto MultiByteUTF8; + while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) C = *CurPtr++; - } + if (C == '/') goto FoundSlash; #ifdef __SSE2__ __m128i Slashes = _mm_set1_epi8('/'); - while (CurPtr + 16 < BufferEnd) { - int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr); - if (LLVM_UNLIKELY(Mask != 0)) { - CurPtr += llvm::countTrailingZeros<unsigned>(Mask); - goto MultiByteUTF8; - } - // look for slashes + while (CurPtr+16 <= BufferEnd) { int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, Slashes)); if (cmp != 0) { @@ -2731,39 +2693,21 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, CurPtr += 16; } #elif __ALTIVEC__ - __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80}; __vector unsigned char Slashes = { '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/' }; - while (CurPtr + 16 < BufferEnd) { - if (LLVM_UNLIKELY( - vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF))) - goto MultiByteUTF8; - if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) { - C = *CurPtr++; - break; - } + while (CurPtr + 16 <= BufferEnd && + !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) CurPtr += 16; - } - #else - while (CurPtr + 16 < BufferEnd) { - bool HasNonASCII = false; - for (unsigned I = 0; I < 16; ++I) - HasNonASCII |= !isASCII(CurPtr[I]); - - if (LLVM_UNLIKELY(HasNonASCII)) - goto MultiByteUTF8; - - bool HasSlash = false; - for (unsigned I = 0; I < 16; ++I) - HasSlash |= CurPtr[I] == '/'; - if (HasSlash) - break; - CurPtr += 16; + // Scan for '/' quickly. Many block comments are very large. 
+ while (CurPtr[0] != '/' && + CurPtr[1] != '/' && + CurPtr[2] != '/' && + CurPtr[3] != '/' && + CurPtr+4 < BufferEnd) { + CurPtr += 4; } #endif @@ -2771,31 +2715,9 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, C = *CurPtr++; } - // Loop to scan the remainder, warning on invalid UTF-8 - // if the corresponding warning is enabled, emitting a diagnostic only once - // per sequence that cannot be decoded. - while (C != '/' && C != '\0') { - if (isASCII(C)) { - UnicodeDecodingAlreadyDiagnosed = false; - C = *CurPtr++; - continue; - } - MultiByteUTF8: - // CurPtr is 1 code unit past C, so to decode - // the codepoint, we need to read from the previous position. - unsigned Length = llvm::getUTF8SequenceSize( - (const llvm::UTF8 *)CurPtr-1, (const llvm::UTF8 *)BufferEnd); - if (Length == 0) { - if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode()) - Diag(CurPtr-1, diag::warn_invalid_utf8_in_comment); - UnicodeDecodingAlreadyDiagnosed = true; - } - else { - UnicodeDecodingAlreadyDiagnosed = false; - CurPtr += Length - 1; - } + // Loop to scan the remainder. + while (C != '/' && C != '\0') C = *CurPtr++; - } if (C == '/') { FoundSlash: diff --git a/clang/test/Lexer/comment-invalid-utf8.c b/clang/test/Lexer/comment-invalid-utf8.c deleted file mode 100644 index b8bf551dd8564..0000000000000 --- a/clang/test/Lexer/comment-invalid-utf8.c +++ /dev/null @@ -1,27 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected -// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn -// nowarn-no-diagnostics - -// This file is purposefully encoded as windows-1252 -// be careful when modifying. 
- -// -// expected-warning@-1 {{invalid UTF-8 in comment}} - -// -// expected-warning@-1 6{{invalid UTF-8 in comment}} - -/**/ -// expected-warning@-1 {{invalid UTF-8 in comment}} - -/* */ -// expected-warning@-1 6{{invalid UTF-8 in comment}} - -/* - -*/ -// expected-warning@-2 {{invalid UTF-8 in comment}} - -// abcd -// abcd -// expected-warning@-1 {{invalid UTF-8 in comment}} diff --git a/clang/test/Lexer/comment-utf8.c b/clang/test/Lexer/comment-utf8.c deleted file mode 100644 index 87f2d1375d4c7..0000000000000 --- a/clang/test/Lexer/comment-utf8.c +++ /dev/null @@ -1,20 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify -// expected-no-diagnostics - - -//§ § § ð ä½ å¥½ © - -/*§ § § ð ä½ å¥½ ©*/ - -/* -§ § § ð ä½ å¥½ ©©© -*/ - -/* § § § ð ä½ å¥½ © */ -/* - a longer comment to exerce the vectorized code path - ---------------------------------------------------- - αααααααααααααααααααααα // here is some unicode - ---------------------------------------------------- - ---------------------------------------------------- -*/ diff --git a/clang/test/SemaCXX/static-assert.cpp b/clang/test/SemaCXX/static-assert.cpp index 2ac0dfdea9eae..5801320f305da 100644 --- a/clang/test/SemaCXX/static-assert.cpp +++ b/clang/test/SemaCXX/static-assert.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu -Wno-invalid-utf8 +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu int f(); // expected-note {{declared here}} diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h index 1e05cfe1f4241..662f3aca5b543 100644 --- a/llvm/include/llvm/Support/ConvertUTF.h +++ b/llvm/include/llvm/Support/ConvertUTF.h @@ -181,8 +181,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd); -unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd); - 
unsigned getNumBytesForUTF8(UTF8 firstByte); /*************************************************************************/ diff --git a/llvm/lib/Support/ConvertUTF.cpp b/llvm/lib/Support/ConvertUTF.cpp index cc411fae746d0..e24a918c5c898 100644 --- a/llvm/lib/Support/ConvertUTF.cpp +++ b/llvm/lib/Support/ConvertUTF.cpp @@ -417,16 +417,6 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { return isLegalUTF8(source, length); } -/* - * Exported function to return the size of the first utf-8 code unit sequence, - * Or 0 if the sequence is not valid; - */ -unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) { - int length = trailingBytesForUTF8[*source] + 1; - return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length - : 0; -} - /* --------------------------------------------------------------------- */ static unsigned _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits