cor3ntin created this revision.
Herald added a project: All.
cor3ntin requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

This covers

- P2558R2 (C++, https://wg21.link/P2558)
- N2701 (C, https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2701.htm)
- N3124 (C, https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3124.pdf)

This patch

- Disallows representing $ as a UCN in all language modes, which did not work
properly (see GH62133), and which was made ill-formed in C++ and C by P2558
and N3124 respectively
- Allows a UCN for any character in C2X, in string and character literals

Fixes #62133


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D153621

Files:
  clang/include/clang/Basic/DiagnosticLexKinds.td
  clang/lib/Lex/Lexer.cpp
  clang/lib/Lex/LiteralSupport.cpp
  clang/test/Lexer/char-literal.cpp
  clang/test/Lexer/utf8-char-literal.cpp
  clang/test/Preprocessor/ucn-allowed-chars.c
  clang/test/Preprocessor/ucn-pp-identifier.c

Index: clang/test/Preprocessor/ucn-pp-identifier.c
===================================================================
--- clang/test/Preprocessor/ucn-pp-identifier.c
+++ clang/test/Preprocessor/ucn-pp-identifier.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify=expected,ext -Wundef -DTRIGRAPHS=1
+// RUN: %clang_cc1 %s -fsyntax-only -std=c2x -pedantic -verify=expected,ext -Wundef -DTRIGRAPHS=1
 // RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -fno-trigraphs
 // RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++23 -pedantic -ftrigraphs -DTRIGRAPHS=1 -verify=expected,cxx23 -Wundef -Wpre-c++23-compat
 // RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -ftrigraphs -DTRIGRAPHS=1
@@ -40,7 +41,8 @@
                    // ext-warning {{extension}} cxx23-warning {{before C++23}}
 #define \N{WASTEBASKET} // expected-error {{macro name must be an identifier}} \
                         // ext-warning {{extension}} cxx23-warning {{before C++23}}
-#define a\u0024
+#define a\u0024a  // expected-error {{character '$' cannot be specified by a universal character name}} \
+                  // expected-warning {{requires whitespace after the macro name}}
 
 #if \u0110 // expected-warning {{is not defined, evaluates to 0}}
 #endif
@@ -112,7 +114,7 @@
 #define capital_u_\U00FC
 // expected-warning@-1 {{incomplete universal character name}} expected-note@-1 {{did you mean to use '\u'?}} expected-warning@-1 {{whitespace}}
 // CHECK: note: did you mean to use '\u'?
-// CHECK-NEXT: {{^  112 | #define capital_u_\U00FC}}
+// CHECK-NEXT: {{^  114 | #define capital_u_\U00FC}}
 // CHECK-NEXT: {{^      |                    \^}}
 // CHECK-NEXT: {{^      |                    u}}
 
@@ -155,5 +157,5 @@
 int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>;
 // expected-warning@-1 {{trigraph ignored}}\
 // expected-warning@-1 {{incomplete}}\
-// expected-error@-1 {{expected ';' after top level declarator}}
+// expected-error@-1 {{expected}}
 #endif
Index: clang/test/Preprocessor/ucn-allowed-chars.c
===================================================================
--- clang/test/Preprocessor/ucn-allowed-chars.c
+++ clang/test/Preprocessor/ucn-allowed-chars.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -fsyntax-only -std=c99 -verify
+// RUN: %clang_cc1 %s -fsyntax-only -std=c2x -Wc99-compat -verify
 // RUN: %clang_cc1 %s -fsyntax-only -std=c11 -Wc99-compat -verify
 // RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++03 -Wc++11-compat -verify
 // RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++11 -Wc++98-compat -verify
@@ -13,7 +14,6 @@
 
 
 
-
 // Identifier initial characters
 extern char \u0E50; // C++03, C11, C++11
 extern char \u0300; // disallowed in C99/C++03
@@ -38,8 +38,8 @@
 
 
 #if __cplusplus
-// expected-error@9 {{character <U+0384> not allowed in an identifier}}
-// expected-error@11 {{character <U+FFFF> not allowed in an identifier}}
+// expected-error@10 {{character <U+0384> not allowed in an identifier}}
+// expected-error@12 {{character <U+FFFF> not allowed in an identifier}}
 // expected-error@18 {{expected unqualified-id}}
 # if __cplusplus >= 201103L
 // C++11
@@ -53,23 +53,49 @@
 
 # endif
 #else
-# if __STDC_VERSION__ >= 201112L
+# if __STDC_VERSION__ >= 201800L
+// C2X
+// expected-warning@8 {{using this character in an identifier is incompatible with C99}}
+// expected-error@10 {{character <U+0384> not allowed in an identifier}}
+// expected-error@12 {{character <U+FFFF> not allowed in an identifier}}
+// expected-error@18 {{expected identifier}}
+// expected-error@19 {{expected identifier}}
+// expected-error@33 {{invalid universal character}}
+# elif __STDC_VERSION__ >= 201112L
 // C11
-// expected-warning@7 {{using this character in an identifier is incompatible with C99}}
-// expected-warning@9 {{using this character in an identifier is incompatible with C99}}
-// expected-error@11 {{character <U+FFFF> not allowed in an identifier}}
+// expected-warning@8 {{using this character in an identifier is incompatible with C99}}
+// expected-warning@10 {{using this character in an identifier is incompatible with C99}}
+// expected-error@12 {{character <U+FFFF> not allowed in an identifier}}
 // expected-warning@18 {{starting an identifier with this character is incompatible with C99}}
 // expected-error@19 {{expected identifier}}
 // expected-error@33 {{invalid universal character}}
 
 # else
 // C99
-// expected-error@7 {{not allowed in an identifier}}
-// expected-error@9 {{not allowed in an identifier}}
-// expected-error@11 {{not allowed in an identifier}}
+// expected-error@8 {{not allowed in an identifier}}
+// expected-error@10 {{not allowed in an identifier}}
+// expected-error@12 {{not allowed in an identifier}}
 // expected-error@18 {{expected identifier}}
 // expected-error@19 {{expected identifier}}
 // expected-error@33 {{invalid universal character}}
 
 # endif
 #endif
+
+#define AAA\u0024 // expected-error{{character '$' cannot be specified by a universal character name}} \
+                  // expected-warning{{whitespace}}
+#define AAB\u0040 // expected-error{{character '@' cannot be specified by a universal character name}} \
+                  // expected-warning{{whitespace}}
+#define AAC\u0060 // expected-error{{character '`' cannot be specified by a universal character name}} \
+                  // expected-warning{{whitespace}}
+
+#define ABA \u0024 // expected-error{{character '$' cannot be specified by a universal character name}}
+#define ABB \u0040 // expected-error{{character '@' cannot be specified by a universal character name}}
+#define ABC \u0060 // expected-error{{character '`' cannot be specified by a universal character name}}
+
+int GH62133_a\u0024; // expected-error {{character '$' cannot be specified by a universal character name}} \
+                     // expected-error {{}}
+int GH62133_b\u0040; // expected-error {{character '@' cannot be specified by a universal character name}} \
+                     // expected-error {{}}
+int GH62133_c\u0060; // expected-error {{character '`' cannot be specified by a universal character name}} \
+                     // expected-error {{}}
Index: clang/test/Lexer/utf8-char-literal.cpp
===================================================================
--- clang/test/Lexer/utf8-char-literal.cpp
+++ clang/test/Lexer/utf8-char-literal.cpp
@@ -19,7 +19,7 @@
 #elif __STDC_VERSION__ >= 202000L
 char a = u8'ñ';      // expected-error {{character too large for enclosing character literal type}}
 char b = u8'\x80';   // ok
-char c = u8'\u0080'; // expected-error {{universal character name refers to a control character}}
+char c = u8'\u0000'; // ok
 char d = u8'\u1234'; // expected-error {{character too large for enclosing character literal type}}
 char e = u8'ሴ';      // expected-error {{character too large for enclosing character literal type}}
 char f = u8'ab';     // expected-error {{Unicode character literals may not contain multiple characters}}
Index: clang/test/Lexer/char-literal.cpp
===================================================================
--- clang/test/Lexer/char-literal.cpp
+++ clang/test/Lexer/char-literal.cpp
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++17 -Wfour-char-constants -fsyntax-only -verify=cxx,expected %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++20 -Wfour-char-constants -fsyntax-only -verify=cxx,expected %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c11 -x c -Wfour-char-constants -fsyntax-only -verify=c,expected %s
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c2x -x c -Wfour-char-constants -fsyntax-only -verify=c,expected %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c2x -x c -Wfour-char-constants -fsyntax-only -verify=c2x,expected %s
 
 #ifndef __cplusplus
 typedef __WCHAR_TYPE__ wchar_t;
@@ -48,6 +48,7 @@
 _Static_assert(u8'\U0000007F' == 0x7F, ""); // c-error {{universal character name refers to a control character}}
 _Static_assert(u8'\U00000080', ""); // c-error {{universal character name refers to a control character}}
                                     // cxx-error@-1 {{character too large for enclosing character literal type}}
+                                    // c2x-error@-2 {{character too large for enclosing character literal type}}
 _Static_assert((unsigned char)u8'\xFF' == (unsigned char)0xFF, "");
 #endif
 
@@ -119,3 +120,10 @@
 _Static_assert(U"\U0000DFFF"[0], ""); // expected-error {{invalid universal character}}
 _Static_assert(U"\U0010FFFF"[0] == 0x0010FFFF, "");
 _Static_assert(U"\U00110000"[0], ""); // expected-error {{invalid universal character}}
+
+_Static_assert('\u0024' == '$', "");
+_Static_assert('\u0040' == '@', "");
+_Static_assert('\u0060' == '`', "");
+
+_Static_assert('\u0061' == 'a', ""); // c-error {{character 'a' cannot be specified by a universal character name}}
+_Static_assert('\u0000' == '\0', ""); // c-error {{universal character name refers to a control character}}
Index: clang/lib/Lex/LiteralSupport.cpp
===================================================================
--- clang/lib/Lex/LiteralSupport.cpp
+++ clang/lib/Lex/LiteralSupport.cpp
@@ -617,19 +617,24 @@
   // C++11 allows UCNs that refer to control characters and basic source
   // characters inside character and string literals
   if (UcnVal < 0xa0 &&
-      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
-    bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
+      // $, @, ` are allowed in all language modes
+      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {
+    bool IsError = (!(Features.CPlusPlus11 || Features.C2x) || !in_char_string_literal);
     if (Diags) {
       char BasicSCSChar = UcnVal;
       if (UcnVal >= 0x20 && UcnVal < 0x7f)
         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
              IsError ? diag::err_ucn_escape_basic_scs :
-                       diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
+                       Features.CPlusPlus11 ?
+                       diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
+                     : diag::warn_c2x_compat_literal_ucn_escape_basic_scs )
             << StringRef(&BasicSCSChar, 1);
       else
         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
              IsError ? diag::err_ucn_control_character :
-                       diag::warn_cxx98_compat_literal_ucn_control_character);
+                       Features.CPlusPlus11 ?
+                         diag::warn_cxx98_compat_literal_ucn_control_character
+                       : diag::warn_c2x_compat_literal_ucn_control_character);
     }
     if (IsError)
       return false;
Index: clang/lib/Lex/Lexer.cpp
===================================================================
--- clang/lib/Lex/Lexer.cpp
+++ clang/lib/Lex/Lexer.cpp
@@ -3496,9 +3496,6 @@
   //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
   //   basic source character set, the program is ill-formed.
   if (CodePoint < 0xA0) {
-    if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
-      return CodePoint;
-
     // We don't use isLexingRawMode() here because we need to warn about bad
     // UCNs even when skipping preprocessing tokens in a #if block.
     if (Result && PP) {
Index: clang/include/clang/Basic/DiagnosticLexKinds.td
===================================================================
--- clang/include/clang/Basic/DiagnosticLexKinds.td
+++ clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -197,6 +197,14 @@
 def warn_cxx98_compat_literal_ucn_control_character : Warning<
   "universal character name referring to a control character "
   "is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
+def warn_c2x_compat_literal_ucn_escape_basic_scs : Warning<
+  "specifying character '%0' with a universal character name is "
+  "incompatible with C standards before C2x">,
+  InGroup<CPre2xCompat>, DefaultIgnore;
+def warn_c2x_compat_literal_ucn_control_character : Warning<
+  "universal character name referring to a control character "
+  "is incompatible with C standards before C2x">,
+  InGroup<CPre2xCompat>, DefaultIgnore;
 def warn_ucn_not_valid_in_c89 : Warning<
   "universal character names are only valid in C99 or C++; "
   "treating as '\\' followed by identifier">, InGroup<Unicode>;
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to