cor3ntin created this revision.
Herald added a subscriber: hiraditya.
Herald added a project: All.
cor3ntin requested review of this revision.
Herald added projects: clang, LLVM.
Herald added subscribers: llvm-commits, cfe-commits.

Introduce an off-by default `-Winvalid-utf8` warning
that detects invalid UTF-8 code units sequences in comments.

Invalid UTF-8 in other places is already diagnosed,
as that cannot appear in identifiers and other grammar constructs.

The warning is off by default as its likely to be somewhat disruptive
otherwise.

This warning allows clang to conform to the yet-to be approved WG21
"P2295R5 Support for UTF-8 as a portable source file encoding"
paper.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D128059

Files:
  clang/docs/ReleaseNotes.rst
  clang/include/clang/Basic/DiagnosticLexKinds.td
  clang/lib/Lex/Lexer.cpp
  clang/test/Lexer/comment-invalid-utf8.c
  llvm/include/llvm/Support/ConvertUTF.h
  llvm/lib/Support/ConvertUTF.cpp

Index: llvm/lib/Support/ConvertUTF.cpp
===================================================================
--- llvm/lib/Support/ConvertUTF.cpp
+++ llvm/lib/Support/ConvertUTF.cpp
@@ -417,6 +417,18 @@
     return isLegalUTF8(source, length);
 }
 
+/*
+ * Exported function to return the size of the first utf-8 code unit sequence,
+ * Or 0 if the sequence is not valid;
+ */
+unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
+  int length = trailingBytesForUTF8[*source] + 1;
+  if (length > sourceEnd - source) {
+    return 0;
+  }
+  return isLegalUTF8(source, length) ? length : 0;
+}
+
 /* --------------------------------------------------------------------- */
 
 static unsigned
Index: llvm/include/llvm/Support/ConvertUTF.h
===================================================================
--- llvm/include/llvm/Support/ConvertUTF.h
+++ llvm/include/llvm/Support/ConvertUTF.h
@@ -181,6 +181,8 @@
 
 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
 
+unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd);
+
 unsigned getNumBytesForUTF8(UTF8 firstByte);
 
 /*************************************************************************/
Index: clang/lib/Lex/Lexer.cpp
===================================================================
--- clang/lib/Lex/Lexer.cpp
+++ clang/lib/Lex/Lexer.cpp
@@ -2391,13 +2391,40 @@
   //
   // This loop terminates with CurPtr pointing at the newline (or end of buffer)
   // character that ends the line comment.
+
+  bool WarnOnInvalidUtf8 =
+      !isLexingRawMode() &&
+      !PP->getDiagnostics().isIgnored(diag::warn_invalid_utf8_in_comment,
+                                      getSourceLocation(CurPtr));
+  bool UnicodeDecodeFailed = false;
+
   char C;
   while (true) {
     C = *CurPtr;
     // Skip over characters in the fast loop.
-    while (C != 0 &&                // Potentially EOF.
-           C != '\n' && C != '\r')  // Newline or DOS-style newline.
+    // Warn on invalid UTF-8 if the corresponding warning is enabled, emitting a
+    // diagnostic only once per sequence that cannot be decoded.
+    while ((!WarnOnInvalidUtf8 || isASCII(C)) && C != 0 && // Potentially EOF.
+           C != '\n' && C != '\r') { // Newline or DOS-style newline.
       C = *++CurPtr;
+      UnicodeDecodeFailed = false;
+    }
+
+    if (WarnOnInvalidUtf8 && !isASCII(C)) {
+      unsigned Length = llvm::getUTF8SequenceSize(
+          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
+      if (Length == 0) {
+        if (!UnicodeDecodeFailed) {
+          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
+        }
+        UnicodeDecodeFailed = true;
+        ++CurPtr;
+      } else {
+        UnicodeDecodeFailed = false;
+        CurPtr += Length;
+      }
+      continue;
+    }
 
     const char *NextLine = CurPtr;
     if (C != 0) {
@@ -2664,10 +2691,18 @@
   if (C == '/')
     C = *CurPtr++;
 
+  bool WarnOnInvalidUtf8 =
+      !isLexingRawMode() &&
+      !PP->getDiagnostics().isIgnored(diag::warn_invalid_utf8_in_comment,
+                                      getSourceLocation(CurPtr));
+  bool UnicodeDecodeFailed = false;
+
   while (true) {
     // Skip over all non-interesting characters until we find end of buffer or a
     // (probably ending) '/' character.
-    if (CurPtr + 24 < BufferEnd &&
+    // When diagnosing invalid UTF-8 sequences we always skip the fast
+    // vectorized path.
+    if (!WarnOnInvalidUtf8 && CurPtr + 24 < BufferEnd &&
         // If there is a code-completion point avoid the fast scan because it
         // doesn't check for '\0'.
         !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
@@ -2714,9 +2749,28 @@
       C = *CurPtr++;
     }
 
-    // Loop to scan the remainder.
-    while (C != '/' && C != '\0')
-      C = *CurPtr++;
+    // Loop to scan the remainder, warning on invalid UTF-8
+    // if the corresponding warning is enabled, emitting a diagnostic only once
+    // per sequence that cannot be decoded.
+    while (C != '/' && C != '\0') {
+      if (!WarnOnInvalidUtf8 || isASCII(C)) {
+        UnicodeDecodeFailed = false;
+        C = *CurPtr++;
+        continue;
+      }
+      unsigned Length = llvm::getUTF8SequenceSize(
+          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
+      if (Length == 0) {
+        if (!UnicodeDecodeFailed) {
+          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
+        }
+        UnicodeDecodeFailed = true;
+        C = *CurPtr++;
+        continue;
+      }
+      UnicodeDecodeFailed = false;
+      C = *(CurPtr += Length - 1);
+    }
 
     if (C == '/') {
   FoundSlash:
Index: clang/include/clang/Basic/DiagnosticLexKinds.td
===================================================================
--- clang/include/clang/Basic/DiagnosticLexKinds.td
+++ clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -113,6 +113,8 @@
 // Unicode and UCNs
 def err_invalid_utf8 : Error<
   "source file is not valid UTF-8">;
+def warn_invalid_utf8_in_comment : Warning<
+  "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>, DefaultIgnore;
 def err_character_not_allowed : Error<
   "unexpected character <U+%0>">;
 def err_character_not_allowed_identifier : Error<
Index: clang/docs/ReleaseNotes.rst
===================================================================
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -267,6 +267,8 @@
 - When using class templates without arguments, clang now tells developers
   that template arguments are missing in certain contexts.
   This fixes `Issue 55962 <https://github.com/llvm/llvm-project/issues/55962>`_.
+- Added ``-Winvalid-utf8`` which diagnose invalid UTF-8 code unit sequences in
+  comments.
 
 Non-comprehensive list of changes in this release
 -------------------------------------------------
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to