This revision was landed with ongoing or failed builds. This revision was automatically updated to reflect the committed changes. Closed by commit rG355532a1499a: [Clang] Add a warning on invalid UTF-8 in comments. (authored by cor3ntin).
Changed prior to commit: https://reviews.llvm.org/D128059?vs=443225&id=443423#toc Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D128059/new/ https://reviews.llvm.org/D128059 Files: clang/docs/ReleaseNotes.rst clang/include/clang/Basic/DiagnosticLexKinds.td clang/lib/Lex/Lexer.cpp clang/test/Lexer/comment-invalid-utf8.c clang/test/Lexer/comment-utf8.c clang/test/SemaCXX/static-assert.cpp llvm/include/llvm/Support/ConvertUTF.h llvm/lib/Support/ConvertUTF.cpp
Index: llvm/lib/Support/ConvertUTF.cpp =================================================================== --- llvm/lib/Support/ConvertUTF.cpp +++ llvm/lib/Support/ConvertUTF.cpp @@ -417,6 +417,16 @@ return isLegalUTF8(source, length); } +/* + * Exported function to return the size of the first utf-8 code unit sequence, + * Or 0 if the sequence is not valid; + */ +unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source] + 1; + return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length + : 0; +} + /* --------------------------------------------------------------------- */ static unsigned Index: llvm/include/llvm/Support/ConvertUTF.h =================================================================== --- llvm/include/llvm/Support/ConvertUTF.h +++ llvm/include/llvm/Support/ConvertUTF.h @@ -181,6 +181,8 @@ Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd); +unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd); + unsigned getNumBytesForUTF8(UTF8 firstByte); /*************************************************************************/ Index: clang/test/SemaCXX/static-assert.cpp =================================================================== --- clang/test/SemaCXX/static-assert.cpp +++ clang/test/SemaCXX/static-assert.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -pedantic -triple=x86_64-linux-gnu -Wno-invalid-utf8 int f(); // expected-note {{declared here}} Index: clang/test/Lexer/comment-utf8.c =================================================================== --- /dev/null +++ clang/test/Lexer/comment-utf8.c @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify +// expected-no-diagnostics + + +//§ § § ð ä½ å¥½ © + +/*§ § § ð ä½ å¥½ ©*/ + +/* +§ § § ð ä½ å¥½ ©©© +*/ + +/* § § § ð ä½ å¥½ © */ +/* + a longer 
comment to exerce the vectorized code path + ---------------------------------------------------- + αααααααααααααααααααααα // here is some unicode + ---------------------------------------------------- + ---------------------------------------------------- +*/ Index: clang/test/Lexer/comment-invalid-utf8.c =================================================================== --- /dev/null +++ clang/test/Lexer/comment-invalid-utf8.c @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 -fsyntax-only %s -Winvalid-utf8 -verify=expected +// RUN: %clang_cc1 -fsyntax-only %s -verify=nowarn +// nowarn-no-diagnostics + +// This file is purposefully encoded as windows-1252 +// be careful when modifying. + +// +// expected-warning@-1 {{invalid UTF-8 in comment}} + +//  Â Â     +// expected-warning@-1 6{{invalid UTF-8 in comment}} + +/*Â*/ +// expected-warning@-1 {{invalid UTF-8 in comment}} + +/* Â Â    Â*/ +// expected-warning@-1 6{{invalid UTF-8 in comment}} + +/* + +*/ +// expected-warning@-2 {{invalid UTF-8 in comment}} + +// abcd +// Âabcd +// expected-warning@-1 {{invalid UTF-8 in comment}} Index: clang/lib/Lex/Lexer.cpp =================================================================== --- clang/lib/Lex/Lexer.cpp +++ clang/lib/Lex/Lexer.cpp @@ -2392,13 +2392,37 @@ // // This loop terminates with CurPtr pointing at the newline (or end of buffer) // character that ends the line comment. + + // C++23 [lex.phases] p1 + // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a + // diagnostic only once per entire ill-formed subsequence to avoid + // emiting to many diagnostics (see http://unicode.org/review/pr-121.html). + bool UnicodeDecodingAlreadyDiagnosed = false; + char C; while (true) { C = *CurPtr; // Skip over characters in the fast loop. - while (C != 0 && // Potentially EOF. - C != '\n' && C != '\r') // Newline or DOS-style newline. + while (isASCII(C) && C != 0 && // Potentially EOF. + C != '\n' && C != '\r') { // Newline or DOS-style newline. 
C = *++CurPtr; + UnicodeDecodingAlreadyDiagnosed = false; + } + + if (!isASCII(C)) { + unsigned Length = llvm::getUTF8SequenceSize( + (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd); + if (Length == 0) { + if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode()) + Diag(CurPtr, diag::warn_invalid_utf8_in_comment); + UnicodeDecodingAlreadyDiagnosed = true; + ++CurPtr; + } else { + UnicodeDecodingAlreadyDiagnosed = false; + CurPtr += Length; + } + continue; + } const char *NextLine = CurPtr; if (C != 0) { @@ -2665,6 +2689,12 @@ if (C == '/') C = *CurPtr++; + // C++23 [lex.phases] p1 + // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a + // diagnostic only once per entire ill-formed subsequence to avoid + // emitting too many diagnostics (see http://unicode.org/review/pr-121.html). + bool UnicodeDecodingAlreadyDiagnosed = false; + while (true) { // Skip over all non-interesting characters until we find end of buffer or a // (probably ending) '/' character. @@ -2673,14 +2703,22 @@ // doesn't check for '\0'. !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { // While not aligned to a 16-byte boundary. 
- while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) + while (C != '/' && (intptr_t)CurPtr % 16 != 0) { + if (!isASCII(C)) + goto MultiByteUTF8; C = *CurPtr++; - + } if (C == '/') goto FoundSlash; #ifdef __SSE2__ __m128i Slashes = _mm_set1_epi8('/'); - while (CurPtr+16 <= BufferEnd) { + while (CurPtr + 16 < BufferEnd) { + int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr); + if (LLVM_UNLIKELY(Mask != 0)) { + CurPtr += llvm::countTrailingZeros<unsigned>(Mask); + goto MultiByteUTF8; + } + // look for slashes int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, Slashes)); if (cmp != 0) { @@ -2693,21 +2731,39 @@ CurPtr += 16; } #elif __ALTIVEC__ + __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}; __vector unsigned char Slashes = { '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/' }; - while (CurPtr + 16 <= BufferEnd && - !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) + while (CurPtr + 16 < BufferEnd) { + if (LLVM_UNLIKELY( + vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF))) + goto MultiByteUTF8; + if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) { + C = *CurPtr++; + break; + } CurPtr += 16; + } + #else - // Scan for '/' quickly. Many block comments are very large. - while (CurPtr[0] != '/' && - CurPtr[1] != '/' && - CurPtr[2] != '/' && - CurPtr[3] != '/' && - CurPtr+4 < BufferEnd) { - CurPtr += 4; + while (CurPtr + 16 < BufferEnd) { + bool HasNonASCII = false; + for (unsigned I = 0; I < 16; ++I) + HasNonASCII |= !isASCII(CurPtr[I]); + + if (LLVM_UNLIKELY(HasNonASCII)) + goto MultiByteUTF8; + + bool HasSlash = false; + for (unsigned I = 0; I < 16; ++I) + HasSlash |= CurPtr[I] == '/'; + if (HasSlash) + break; + CurPtr += 16; } #endif @@ -2715,9 +2771,31 @@ C = *CurPtr++; } - // Loop to scan the remainder. 
- while (C != '/' && C != '\0') + // Loop to scan the remainder, warning on invalid UTF-8 + // if the corresponding warning is enabled, emitting a diagnostic only once + // per sequence that cannot be decoded. + while (C != '/' && C != '\0') { + if (isASCII(C)) { + UnicodeDecodingAlreadyDiagnosed = false; + C = *CurPtr++; + continue; + } + MultiByteUTF8: + // CurPtr is 1 code unit past C, so to decode + // the codepoint, we need to read from the previous position. + unsigned Length = llvm::getUTF8SequenceSize( + (const llvm::UTF8 *)CurPtr-1, (const llvm::UTF8 *)BufferEnd); + if (Length == 0) { + if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode()) + Diag(CurPtr-1, diag::warn_invalid_utf8_in_comment); + UnicodeDecodingAlreadyDiagnosed = true; + } + else { + UnicodeDecodingAlreadyDiagnosed = false; + CurPtr += Length - 1; + } C = *CurPtr++; + } if (C == '/') { FoundSlash: Index: clang/include/clang/Basic/DiagnosticLexKinds.td =================================================================== --- clang/include/clang/Basic/DiagnosticLexKinds.td +++ clang/include/clang/Basic/DiagnosticLexKinds.td @@ -113,6 +113,8 @@ // Unicode and UCNs def err_invalid_utf8 : Error< "source file is not valid UTF-8">; +def warn_invalid_utf8_in_comment : Extension< + "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>; def err_character_not_allowed : Error< "unexpected character <U+%0>">; def err_character_not_allowed_identifier : Error< Index: clang/docs/ReleaseNotes.rst =================================================================== --- clang/docs/ReleaseNotes.rst +++ clang/docs/ReleaseNotes.rst @@ -279,9 +279,11 @@ unevaluated operands of a ``typeid`` expression, as they are now modeled correctly in the CFG. This fixes `Issue 21668 <https://github.com/llvm/llvm-project/issues/21668>`_. 
-- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will +- ``-Wself-assign``, ``-Wself-assign-overloaded`` and ``-Wself-move`` will suggest a fix if the decl being assigned is a parameter that shadows a data member of the contained class. +- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in + comments. Non-comprehensive list of changes in this release ------------------------------------------------- @@ -592,7 +594,7 @@ - Added ``forEachTemplateArgument`` matcher which creates a match every time a ``templateArgument`` matches the matcher supplied to it. - + - Added ``objcStringLiteral`` matcher which matches ObjectiveC String literal expressions.
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits