cor3ntin created this revision. Herald added a subscriber: hiraditya. Herald added a project: All. cor3ntin requested review of this revision. Herald added projects: clang, LLVM. Herald added subscribers: llvm-commits, cfe-commits.
Introduce an off-by-default `-Winvalid-utf8` warning that detects invalid UTF-8 code unit sequences in comments. Invalid UTF-8 in other places is already diagnosed, as it cannot appear in identifiers and other grammar constructs. The warning is off by default as it's likely to be somewhat disruptive otherwise. This warning allows clang to conform to the yet-to-be-approved WG21 "P2295R5 Support for UTF-8 as a portable source file encoding" paper. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D128059 Files: clang/docs/ReleaseNotes.rst clang/include/clang/Basic/DiagnosticLexKinds.td clang/lib/Lex/Lexer.cpp clang/test/Lexer/comment-invalid-utf8.c llvm/include/llvm/Support/ConvertUTF.h llvm/lib/Support/ConvertUTF.cpp
Index: llvm/lib/Support/ConvertUTF.cpp =================================================================== --- llvm/lib/Support/ConvertUTF.cpp +++ llvm/lib/Support/ConvertUTF.cpp @@ -417,6 +417,18 @@ return isLegalUTF8(source, length); } +/* + * Exported function to return the size of the first utf-8 code unit sequence, + * Or 0 if the sequence is not valid; + */ +unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source] + 1; + if (length > sourceEnd - source) { + return 0; + } + return isLegalUTF8(source, length) ? length : 0; +} + /* --------------------------------------------------------------------- */ static unsigned Index: llvm/include/llvm/Support/ConvertUTF.h =================================================================== --- llvm/include/llvm/Support/ConvertUTF.h +++ llvm/include/llvm/Support/ConvertUTF.h @@ -181,6 +181,8 @@ Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd); +unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd); + unsigned getNumBytesForUTF8(UTF8 firstByte); /*************************************************************************/ Index: clang/lib/Lex/Lexer.cpp =================================================================== --- clang/lib/Lex/Lexer.cpp +++ clang/lib/Lex/Lexer.cpp @@ -2391,13 +2391,40 @@ // // This loop terminates with CurPtr pointing at the newline (or end of buffer) // character that ends the line comment. + + bool WarnOnInvalidUtf8 = + !isLexingRawMode() && + !PP->getDiagnostics().isIgnored(diag::warn_invalid_utf8_in_comment, + getSourceLocation(CurPtr)); + bool UnicodeDecodeFailed = false; + char C; while (true) { C = *CurPtr; // Skip over characters in the fast loop. - while (C != 0 && // Potentially EOF. - C != '\n' && C != '\r') // Newline or DOS-style newline. + // Warn on invalid UTF-8 if the corresponding warning is enabled, emitting a + // diagnostic only once per sequence that cannot be decoded. 
+ while ((!WarnOnInvalidUtf8 || isASCII(C)) && C != 0 && // Potentially EOF. + C != '\n' && C != '\r') { // Newline or DOS-style newline. C = *++CurPtr; + UnicodeDecodeFailed = false; + } + + if (WarnOnInvalidUtf8 && !isASCII(C)) { + unsigned Length = llvm::getUTF8SequenceSize( + (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd); + if (Length == 0) { + if (!UnicodeDecodeFailed) { + Diag(CurPtr, diag::warn_invalid_utf8_in_comment); + } + UnicodeDecodeFailed = true; + ++CurPtr; + } else { + UnicodeDecodeFailed = false; + CurPtr += Length; + } + continue; + } const char *NextLine = CurPtr; if (C != 0) { @@ -2664,10 +2691,18 @@ if (C == '/') C = *CurPtr++; + bool WarnOnInvalidUtf8 = + !isLexingRawMode() && + !PP->getDiagnostics().isIgnored(diag::warn_invalid_utf8_in_comment, + getSourceLocation(CurPtr)); + bool UnicodeDecodeFailed = false; + while (true) { // Skip over all non-interesting characters until we find end of buffer or a // (probably ending) '/' character. - if (CurPtr + 24 < BufferEnd && + // When diagnosing invalid UTF-8 sequences we always skip the fast + // vectorized path. + if (!WarnOnInvalidUtf8 && CurPtr + 24 < BufferEnd && // If there is a code-completion point avoid the fast scan because it // doesn't check for '\0'. !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { @@ -2714,9 +2749,28 @@ C = *CurPtr++; } - // Loop to scan the remainder. - while (C != '/' && C != '\0') - C = *CurPtr++; + // Loop to scan the remainder, warning on invalid UTF-8 + // if the corresponding warning is enabled, emitting a diagnostic only once + // per sequence that cannot be decoded. 
+ while (C != '/' && C != '\0') { + if (!WarnOnInvalidUtf8 || isASCII(C)) { + UnicodeDecodeFailed = false; + C = *CurPtr++; + continue; + } + unsigned Length = llvm::getUTF8SequenceSize( + (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd); + if (Length == 0) { + if (!UnicodeDecodeFailed) { + Diag(CurPtr, diag::warn_invalid_utf8_in_comment); + } + UnicodeDecodeFailed = true; + C = *CurPtr++; + continue; + } + UnicodeDecodeFailed = false; + C = *(CurPtr += Length - 1); + } if (C == '/') { FoundSlash: Index: clang/include/clang/Basic/DiagnosticLexKinds.td =================================================================== --- clang/include/clang/Basic/DiagnosticLexKinds.td +++ clang/include/clang/Basic/DiagnosticLexKinds.td @@ -113,6 +113,8 @@ // Unicode and UCNs def err_invalid_utf8 : Error< "source file is not valid UTF-8">; +def warn_invalid_utf8_in_comment : Warning< + "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>, DefaultIgnore; def err_character_not_allowed : Error< "unexpected character <U+%0>">; def err_character_not_allowed_identifier : Error< Index: clang/docs/ReleaseNotes.rst =================================================================== --- clang/docs/ReleaseNotes.rst +++ clang/docs/ReleaseNotes.rst @@ -267,6 +267,8 @@ - When using class templates without arguments, clang now tells developers that template arguments are missing in certain contexts. This fixes `Issue 55962 <https://github.com/llvm/llvm-project/issues/55962>`_. +- Added ``-Winvalid-utf8`` which diagnose invalid UTF-8 code unit sequences in + comments. Non-comprehensive list of changes in this release -------------------------------------------------
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits