https://github.com/MitalAshok updated https://github.com/llvm/llvm-project/pull/97208
>From ef0072d1fc9b14f7ee657fa95f44a686b78b525a Mon Sep 17 00:00:00 2001 From: Mital Ashok <mi...@mitalashok.co.uk> Date: Sun, 30 Jun 2024 12:07:54 +0100 Subject: [PATCH 1/5] [Clang] [C23] Implement N2653: u8 strings are char8_t[] --- clang/docs/ReleaseNotes.rst | 6 ++++ .../clang/Basic/DiagnosticSemaKinds.td | 5 +++- clang/lib/Frontend/InitPreprocessor.cpp | 6 ++-- clang/lib/Headers/stdatomic.h | 5 ++++ clang/lib/Sema/SemaExpr.cpp | 23 ++++++++++----- clang/test/C/C2x/n2653.c | 29 +++++++++++++++++++ clang/www/c_status.html | 2 +- 7 files changed, 65 insertions(+), 11 deletions(-) create mode 100644 clang/test/C/C2x/n2653.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c720e47dbe35b..e51be81d8b11a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -337,6 +337,12 @@ C23 Feature Support - Properly promote bit-fields of bit-precise integer types to the field's type rather than to ``int``. #GH87641 +- Compiler support for `N2653 char8_t: A type for UTF-8 characters and strings + <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm>`_: ``u8`` string + literals are now of type ``char8_t[N]`` in C23, and Clang exposes + ``__CLANG_ATOMIC_CHAR8_T_LOCK_FREE``/``__GCC_ATOMIC_CHAR8_T_LOCK_FREE`` to + implement the corresponding macro in ``<stdatomic.h>``. 
+ Non-comprehensive list of changes in this release ------------------------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5dc36c594bcb7..6a00b92df1c36 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -7252,7 +7252,10 @@ def err_array_init_utf8_string_into_char : Error< def warn_cxx20_compat_utf8_string : Warning< "type of UTF-8 string literal will change from array of const char to " "array of const char8_t in C++20">, InGroup<CXX20Compat>, DefaultIgnore; -def note_cxx20_compat_utf8_string_remove_u8 : Note< +def warn_c23_compat_utf8_string : Warning< + "type of UTF-8 string literal will change from array of char to " + "array of char8_t in C23">, InGroup<C23Compat>, DefaultIgnore; +def note_cxx20_c23_compat_utf8_string_remove_u8 : Note< "remove 'u8' prefix to avoid a change of behavior; " "Clang encodes unprefixed narrow string literals as UTF-8">; def err_array_init_different_type : Error< diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 55ec460064830..6270c37342bcf 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1342,8 +1342,10 @@ static void InitializePredefinedMacros(const TargetInfo &TI, getLockFreeValue(TI.get##Type##Width(), TI)); DEFINE_LOCK_FREE_MACRO(BOOL, Bool); DEFINE_LOCK_FREE_MACRO(CHAR, Char); - if (LangOpts.Char8) - DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char); // Treat char8_t like char. 
+ // char8_t has the same representation / width as unsigned + // char in C++ and is a typedef for unsigned char in C23 + if (LangOpts.Char8 || LangOpts.C23) + DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char); DEFINE_LOCK_FREE_MACRO(CHAR16_T, Char16); DEFINE_LOCK_FREE_MACRO(CHAR32_T, Char32); DEFINE_LOCK_FREE_MACRO(WCHAR_T, WChar); diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h index 9c103d98af8c5..c33cd8083525c 100644 --- a/clang/lib/Headers/stdatomic.h +++ b/clang/lib/Headers/stdatomic.h @@ -35,6 +35,10 @@ extern "C" { #define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE #define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \ + defined(__cplusplus) +#define ATOMIC_CHAR8_T_LOCK_FREE __CLANG_ATOMIC_CHAR8_T_LOCK_FREE +#endif #define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE #define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE #define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE @@ -104,6 +108,7 @@ typedef _Atomic(long) atomic_long; typedef _Atomic(unsigned long) atomic_ulong; typedef _Atomic(long long) atomic_llong; typedef _Atomic(unsigned long long) atomic_ullong; +typedef _Atomic(unsigned char) atomic_char8_t; typedef _Atomic(uint_least16_t) atomic_char16_t; typedef _Atomic(uint_least32_t) atomic_char32_t; typedef _Atomic(wchar_t) atomic_wchar_t; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index db44cfe1288b6..a1b060f7f1510 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -2082,6 +2082,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) { } else if (Literal.isUTF8()) { if (getLangOpts().Char8) CharTy = Context.Char8Ty; + else if (getLangOpts().C23) + CharTy = Context.UnsignedCharTy; Kind = StringLiteralKind::UTF8; } else if (Literal.isUTF16()) { CharTy = Context.Char16Ty; @@ -2093,17 +2095,24 @@ Sema::ActOnStringLiteral(ArrayRef<Token> 
StringToks, Scope *UDLScope) { CharTy = Context.UnsignedCharTy; } - // Warn on initializing an array of char from a u8 string literal; this - // becomes ill-formed in C++2a. - if (getLangOpts().CPlusPlus && !getLangOpts().CPlusPlus20 && - !getLangOpts().Char8 && Kind == StringLiteralKind::UTF8) { - Diag(StringTokLocs.front(), diag::warn_cxx20_compat_utf8_string); + // Warn on u8 string literals before C++20 and C23, whose type + // was an array of char before but becomes an array of char8_t. + // In C++20, initializing an array of char from a u8 string literal + // becomes ill-formed. In C23, it might have an unexpected value if + // char was signed. + if (Kind == StringLiteralKind::UTF8 && + (getLangOpts().CPlusPlus + ? !getLangOpts().CPlusPlus20 && !getLangOpts().Char8 + : !getLangOpts().C23)) { + Diag(StringTokLocs.front(), getLangOpts().CPlusPlus + ? diag::warn_cxx20_compat_utf8_string + : diag::warn_c23_compat_utf8_string); // Create removals for all 'u8' prefixes in the string literal(s). This - // ensures C++2a compatibility (but may change the program behavior when + // ensures C++20/C23 compatibility (but may change the program behavior when // built by non-Clang compilers for which the execution character set is // not always UTF-8). - auto RemovalDiag = PDiag(diag::note_cxx20_compat_utf8_string_remove_u8); + auto RemovalDiag = PDiag(diag::note_cxx20_c23_compat_utf8_string_remove_u8); SourceLocation RemovalDiagLoc; for (const Token &Tok : StringToks) { if (Tok.getKind() == tok::utf8_string_literal) { diff --git a/clang/test/C/C2x/n2653.c b/clang/test/C/C2x/n2653.c new file mode 100644 index 0000000000000..1abd61947de7e --- /dev/null +++ b/clang/test/C/C2x/n2653.c @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -verify=c23 -std=c23 %s +// RUN: %clang_cc1 -verify=c17 -std=c17 %s + +// c23-no-diagnostics + +#include <stdatomic.h> + +#define __enable_constant_folding(x) (__builtin_constant_p(x) ? 
(x) : (x)) + +#ifndef ATOMIC_CHAR8_T_LOCK_FREE +#error missing +#endif +// c17-error@-2 {{missing}} + +_Static_assert(_Generic(u8"", unsigned char*: 1, char*: 0), ""); +// c17-error@-1 {{static assertion failed}} + +// -fsigned-char is the default +#define M(X) __enable_constant_folding((X) >= 0x80) + +_Static_assert(M(u8"\U000000E9"[0]), ""); +// c17-error@-1 {{static assertion failed}} +#if __STDC_VERSION__ >= 202311L +_Static_assert(M(u8'\xC3'), ""); +#endif + +const char cu8[] = u8"text"; +const signed char scu8[] = u8"text"; +const unsigned char ucu8[] = u8"text"; diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 84cd8e836006c..81bb51a58e5cb 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -1061,7 +1061,7 @@ <h2 id="c2x">C23 implementation status</h2> <tr> <td>char8_t: A type for UTF-8 characters and strings</td> <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm">N2653</a></td> - <td class="none" align="center">No</td> + <td class="unreleased" align="center">Clang 19</td> </tr> <tr> <td>Clarification for max exponent macros-update</td> >From d2594adb3ced3b5ecbb64a2c999715e06139f90b Mon Sep 17 00:00:00 2001 From: Mital Ashok <mi...@mitalashok.co.uk> Date: Mon, 1 Jul 2024 18:19:30 +0100 Subject: [PATCH 2/5] Char array initialized from u8 string was fixed in C++20 as a DR https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2513r4.html --- clang/lib/Sema/SemaExpr.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index a1b060f7f1510..8692ca9e1e628 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -2097,9 +2097,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) { // Warn on u8 string literals before C++20 and C23, whose type // was an array of char before but becomes an array of char8_t. 
- // In C++20, initializing an array of char from a u8 string literal - // becomes ill-formed. In C23, it might have an unexpected value if - // char was signed. + // In C++20, it cannot be used where a pointer to char is expected. + // In C23, it might have an unexpected value if char was signed. if (Kind == StringLiteralKind::UTF8 && (getLangOpts().CPlusPlus ? !getLangOpts().CPlusPlus20 && !getLangOpts().Char8 >From 6816f7f63d3def751cc63fef5e4fa2978d735521 Mon Sep 17 00:00:00 2001 From: Mital Ashok <mi...@mitalashok.co.uk> Date: Mon, 1 Jul 2024 18:20:11 +0100 Subject: [PATCH 3/5] Define ATOMIC_CHAR8_T_LOCK_FREE only when available This should be equivalent to: \#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \ (defined(__cpp_char8_t) && __cpp_char8_t >= 201811L) --- clang/lib/Headers/stdatomic.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h index c33cd8083525c..ea07a58ec17a4 100644 --- a/clang/lib/Headers/stdatomic.h +++ b/clang/lib/Headers/stdatomic.h @@ -35,8 +35,7 @@ extern "C" { #define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE #define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \ - defined(__cplusplus) +#ifdef __CLANG_ATOMIC_CHAR8_T_LOCK_FREE #define ATOMIC_CHAR8_T_LOCK_FREE __CLANG_ATOMIC_CHAR8_T_LOCK_FREE #endif #define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE >From 2480d5a864731ccf5c8949aed12c8377377c258b Mon Sep 17 00:00:00 2001 From: Mital Ashok <mi...@mitalashok.co.uk> Date: Wed, 3 Jul 2024 14:16:27 +0100 Subject: [PATCH 4/5] [Headers] Gate atomic_char8_t behind C23/C++20[-fchar8_t] --- clang/lib/Headers/stdatomic.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h index ea07a58ec17a4..79a0652c401b1 100644 --- a/clang/lib/Headers/stdatomic.h +++ b/clang/lib/Headers/stdatomic.h @@ -107,7 
+107,9 @@ typedef _Atomic(long) atomic_long; typedef _Atomic(unsigned long) atomic_ulong; typedef _Atomic(long long) atomic_llong; typedef _Atomic(unsigned long long) atomic_ullong; +#ifdef __CLANG_ATOMIC_CHAR8_T_LOCK_FREE typedef _Atomic(unsigned char) atomic_char8_t; +#endif typedef _Atomic(uint_least16_t) atomic_char16_t; typedef _Atomic(uint_least32_t) atomic_char32_t; typedef _Atomic(wchar_t) atomic_wchar_t; >From 8c6b2d09e24f0437083d4614626e40a8552cdb9f Mon Sep 17 00:00:00 2001 From: Mital Ashok <mi...@mitalashok.co.uk> Date: Wed, 3 Jul 2024 14:16:49 +0100 Subject: [PATCH 5/5] Add __CHAR8_TYPE__ predefined macro to be compatible with GCC --- clang/lib/Frontend/InitPreprocessor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 81ec67ebaf7b1..92b1542265c40 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1165,6 +1165,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI, DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder); DefineType("__WINT_TYPE__", TI.getWIntType(), Builder); DefineTypeSizeAndWidth("__SIG_ATOMIC", TI.getSigAtomicType(), TI, Builder); + if (LangOpts.Char8 || LangOpts.C23) + DefineType("__CHAR8_TYPE__", TI.UnsignedChar, Builder); DefineType("__CHAR16_TYPE__", TI.getChar16Type(), Builder); DefineType("__CHAR32_TYPE__", TI.getChar32Type(), Builder); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits