2023-11-03 Jakub Jelinek <ja...@redhat.com>
PR c++/110341
libcpp/
* charset.cc: Implement C++26 P1854R4 - Making non-encodable string
literals ill-formed.
(one_count_chars, convert_count_chars, count_source_chars): New
functions.
(narrow_str_to_charconst): Change last arg type from cpp_ttype to
const cpp_token *. For C++ if pedantic and i > 1 in CPP_CHAR
count the source characters of the token with count_source_chars
and if the literal has more bytes than source characters,
pedwarn on it. Make the diagnostics more detailed.
(wide_str_to_charconst): Change last arg type from cpp_ttype to
const cpp_token *. Make the diagnostics more detailed.
(cpp_interpret_charconst): Adjust narrow_str_to_charconst and
wide_str_to_charconst callers.
gcc/testsuite/
* g++.dg/cpp26/literals1.C: New test.
* g++.dg/cpp26/literals2.C: New test.
* g++.dg/cpp23/wchar-multi1.C: Adjust expected diagnostic wordings.
* g++.dg/cpp23/wchar-multi2.C: Likewise.
* gcc.dg/c2x-utf8char-3.c: Likewise.
* gcc.dg/cpp/charconst-4.c: Likewise.
* gcc.dg/cpp/charconst.c: Likewise.
* gcc.dg/cpp/if-2.c: Likewise.
* gcc.dg/utf16-4.c: Likewise.
* gcc.dg/utf32-4.c: Likewise.
* g++.dg/cpp1z/utf8-neg.C: Likewise.
* g++.dg/cpp2a/ucn2.C: Likewise.
* g++.dg/ext/utf16-4.C: Likewise.
* g++.dg/ext/utf32-4.C: Likewise.
--- libcpp/charset.cc.jj 2023-11-02 07:49:20.975811244 +0100
+++ libcpp/charset.cc 2023-11-03 11:57:56.738701066 +0100
@@ -446,6 +446,73 @@ one_utf16_to_utf8 (iconv_t bigend, const
return 0;
}
+
+/* Conversion step that only counts characters: consume one source
+ character from *INBUFP and emit exactly one placeholder uchar to
+ *OUTBUFP; the stored value is irrelevant, only the count matters. */
+
+static inline int
+one_count_chars (iconv_t, const uchar **inbufp, size_t *inbytesleftp,
+ uchar **outbufp, size_t *outbytesleftp)
+{
+ cppchar_t s = 0; /* Receives the decoded character; never read. */
+ int rval;
+
+ /* Exactly one output uchar is written, so check for room up front. */
+ if (*outbytesleftp < 1)
+ return E2BIG;
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+ rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+ if (rval)
+ return rval;
+#else
+ if (*inbytesleftp < 1)
+ return EINVAL;
+ static const uchar utf_ebcdic_map[256] = {
+ /* See table 4 in http://unicode.org/reports/tr16/tr16-7.2.html */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+ 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1,
+ 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+ 9, 9, 9, 9, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4,
+ 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 5, 5, 5,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 6, 6, 7, 7, 0
+ };
+ rval = utf_ebcdic_map[**inbufp];
+ if (rval == 9) /* Trail byte where a lead byte is required. */
+ return EILSEQ;
+ if (rval == 0) /* Class 0 bytes are consumed as one byte. */
+ rval = 1;
+ if (rval >= 2) /* Lead byte of an RVAL-byte sequence. */
+ {
+ if (*inbytesleftp < rval)
+ return EINVAL;
+ for (int i = 1; i < rval; ++i)
+ if (utf_ebcdic_map[(*inbufp)[i]] != 9)
+ return EILSEQ;
+ }
+ *inbytesleftp -= rval;
+ *inbufp += rval;
+#endif
+
+ **outbufp = ' '; /* Placeholder, one uchar per character. */
+
+ *outbufp += 1;
+ *outbytesleftp -= 1;
+ return 0;
+}
+
+
/* Helper routine for the next few functions. The 'const' on
one_conversion means that we promise not to modify what function is
pointed to, which lets the inliner see through it. */
@@ -529,6 +596,15 @@ convert_utf32_utf8 (iconv_t cd, const uc
return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
}
+/* Pseudo-conversion that runs one_count_chars over FROM, so afterwards
+ only to->len (the number of characters seen) is significant. */
+static bool
+convert_count_chars (iconv_t cd, const uchar *from,
+ size_t flen, struct _cpp_strbuf *to)
+{
+ return conversion_loop (one_count_chars, cd, from, flen, to);
+}
+
/* Identity conversion, used when we have no alternative. */
static bool
convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
@@ -2574,21 +2650,49 @@ cpp_interpret_string_notranslate (cpp_re
}
+/* Return number of source characters in STR, or 0 on failure. */
+static unsigned
+count_source_chars (cpp_reader *pfile, cpp_string str, cpp_ttype type)
+{
+ cpp_string str2 = { 0, 0 };
+ bool (*saved_diagnostic_handler) (cpp_reader *, enum cpp_diagnostic_level,
+ enum cpp_warning_reason, rich_location *,
+ const char *, va_list *)
+ ATTRIBUTE_FPTR_PRINTF(5,0);
+ saved_diagnostic_handler = pfile->cb.diagnostic;
+ pfile->cb.diagnostic = noop_diagnostic_cb; /* Restored below. */
+ convert_f save_func = pfile->narrow_cset_desc.func;
+ pfile->narrow_cset_desc.func = convert_count_chars; /* Count only. */
+ bool ret = cpp_interpret_string (pfile, &str, 1, &str2, type);
+ pfile->narrow_cset_desc.func = save_func;
+ pfile->cb.diagnostic = saved_diagnostic_handler;
+ if (ret)
+ {
+ if (str2.text != str.text)
+ free ((void *)str2.text); /* Only the length is needed. */
+ return str2.len;
+ }
+ else
+ return 0;
+}
+
/* Subroutine of cpp_interpret_charconst which performs the conversion
to a number, for narrow strings. STR is the string structure returned
by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
- cpp_interpret_charconst. TYPE is the token type. */
+ cpp_interpret_charconst. TOKEN is the token. */
static cppchar_t
narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
unsigned int *pchars_seen, int *unsignedp,
- enum cpp_ttype type)
+ const cpp_token *token)
{
+ enum cpp_ttype type = token->type;
size_t width = CPP_OPTION (pfile, char_precision);
size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
size_t mask = width_to_mask (width);
size_t i;
cppchar_t result, c;
bool unsigned_p;
+ bool diagnosed = false;
/* The value of a multi-character character constant, or a
single-character character constant whose representation in the
@@ -2612,11 +2716,55 @@ narrow_str_to_charconst (cpp_reader *pfi
if (type == CPP_UTF8CHAR)
max_chars = 1;
- if (i > max_chars)
+ else if (i > 1 && CPP_OPTION (pfile, cplusplus) && CPP_PEDANTIC (pfile))
{
+ /* C++ as a DR since
+ P1854R4 - Making non-encodable string literals ill-formed
+ makes multi-character narrow character literals if any of the
+ characters in the literal isn't encodable in char/unsigned char
+ ill-formed. We need to count the number of c-chars and compare
+ that to str.len. */
+ unsigned src_chars = count_source_chars (pfile, token->val.str, type);
+
+ if (src_chars)
+ {
+ if (str.len > src_chars)
+ {
+ if (src_chars <= 2)
+ diagnosed
+ = cpp_error (pfile, CPP_DL_PEDWARN,
+ "character not encodable in a single execution "
+ "character code unit");
+ else
+ diagnosed
+ = cpp_error (pfile, CPP_DL_PEDWARN,
+ "at least one character in a multi-character "
+ "literal not encodable in a single execution "
+ "character code unit");
+ if (diagnosed && i > max_chars)
+ i = max_chars;
+ }
+ }
+ }
+ if (diagnosed)
+ /* Already diagnosed above. */;
+ else if (i > max_chars)
+ {
+ unsigned src_chars
+ = count_source_chars (pfile, token->val.str,
+ type == CPP_UTF8CHAR ? CPP_CHAR : type);
+
+ if (type != CPP_UTF8CHAR)
+ cpp_error (pfile, CPP_DL_WARNING,
+ "multi-character literal with %ld characters exceeds "
+ "'int' size of %ld bytes", (long) i, (long) max_chars);
+ else if (src_chars > 2)
+ cpp_error (pfile, CPP_DL_ERROR,
+ "multi-character literal cannot have an encoding prefix");
+ else
+ cpp_error (pfile, CPP_DL_ERROR,
+ "character not encodable in a single code unit");
i = max_chars;
- cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING,
- "character constant too long for its type");
}
else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character
constant");
@@ -2651,12 +2799,13 @@ narrow_str_to_charconst (cpp_reader *pfi
/* Subroutine of cpp_interpret_charconst which performs the conversion
to a number, for wide strings. STR is the string structure returned
by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
- cpp_interpret_charconst. TYPE is the token type. */
+ cpp_interpret_charconst. TOKEN is the token. */
static cppchar_t
wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
unsigned int *pchars_seen, int *unsignedp,
- enum cpp_ttype type)
+ const cpp_token *token)
{
+ enum cpp_ttype type = token->type;
bool bigend = CPP_OPTION (pfile, bytes_big_endian);
size_t width = converter_for_type (pfile, type).width;
size_t cwidth = CPP_OPTION (pfile, char_precision);
@@ -2692,14 +2841,25 @@ wide_str_to_charconst (cpp_reader *pfile
character exactly fills a wchar_t, so a multi-character wide
character constant is guaranteed to overflow. */
if (str.len > nbwc * 2)
- cpp_error (pfile, (CPP_OPTION (pfile, cplusplus)
- && (type == CPP_CHAR16
- || type == CPP_CHAR32
- /* In C++23 this is error even for L'ab'. */
- || (type == CPP_WCHAR
- && CPP_OPTION (pfile, size_t_literals))))
- ? CPP_DL_ERROR : CPP_DL_WARNING,
- "character constant too long for its type");
+ {
+ cpp_diagnostic_level level = CPP_DL_WARNING;
+ unsigned src_chars
+ = count_source_chars (pfile, token->val.str, CPP_CHAR);
+
+ if (CPP_OPTION (pfile, cplusplus)
+ && (type == CPP_CHAR16
+ || type == CPP_CHAR32
+ /* In C++23 this is error even for L'ab'. */
+ || (type == CPP_WCHAR
+ && CPP_OPTION (pfile, size_t_literals))))
+ level = CPP_DL_ERROR;
+ if (src_chars > 2)
+ cpp_error (pfile, level,
+ "multi-character literal cannot have an encoding prefix");
+ else
+ cpp_error (pfile, level,
+ "character not encodable in a single code unit");
+ }
/* Truncate the constant to its natural width, and simultaneously
sign- or zero-extend to the full width of cppchar_t. */
@@ -2754,10 +2914,10 @@ cpp_interpret_charconst (cpp_reader *pfi
if (wide)
result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
- token->type);
+ token);
else
result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
- token->type);
+ token);
if (str.text != token->val.str.text)
free ((void *)str.text);
--- gcc/testsuite/g++.dg/cpp23/wchar-multi1.C.jj 2022-08-26 16:06:10.578493272 +0200
+++ gcc/testsuite/g++.dg/cpp23/wchar-multi1.C 2023-11-03 12:03:38.982933997 +0100
@@ -4,18 +4,19 @@
char a = 'a';
int b = 'ab'; // { dg-warning "multi-character character
constant" }
-int c = '\u05D9'; // { dg-warning "multi-character character
constant" }
+int c = '\u05D9'; // { dg-error "character not encodable in a single
execution character code unit" }
#if __SIZEOF_INT__ > 2
-int d = '\U0001F525'; // { dg-warning "multi-character character constant"
"" { target int32 } }
+int d = '\U0001F525'; // { dg-error "character not encodable in a single execution
character code unit" "" { target int32 } }
#endif
-int e = 'abcd'; // { dg-warning "multi-character character
constant" }
+int e = 'abcd'; // { dg-warning "multi-character character
constant" "" { target int32plus } }
+ // { dg-warning "multi-character literal with \[0-9]+
characters exceeds 'int' size of \[0-9]+ bytes" "" { target { ! int32plus } } .-1 }
wchar_t f = L'f';
-wchar_t g = L'gh'; // { dg-error "character constant too long for its type"
"" { target c++23 } }
- // { dg-warning "character constant too long for its
type" "" { target c++20_down } .-1 }
-wchar_t h = L'ijkl'; // { dg-error "character constant too long for its type"
"" { target c++23 } }
- // { dg-warning "character constant too long for its
type" "" { target c++20_down } .-1 }
-wchar_t i = L'\U0001F525'; // { dg-error "character constant too long for its type"
"" { target { c++23 && { ! 4byte_wchar_t } } } }
- // { dg-warning "character constant too long for its type"
"" { target { c++20_down && { ! 4byte_wchar_t } } } .-1 }
+wchar_t g = L'gh'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++23 } }
+ // { dg-warning "multi-character literal cannot have an
encoding prefix" "" { target c++20_down } .-1 }
+wchar_t h = L'ijkl'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++23 } }
+ // { dg-warning "multi-character literal cannot have an
encoding prefix" "" { target c++20_down } .-1 }
+wchar_t i = L'\U0001F525'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target { c++23 && { ! 4byte_wchar_t } } } }
+ // { dg-warning "multi-character literal cannot have an encoding
prefix" "" { target { c++20_down && { ! 4byte_wchar_t } } } .-1 }
#ifdef __cpp_char8_t
typedef char8_t u8;
#else
@@ -23,20 +24,20 @@ typedef char u8;
#endif
#if __cpp_unicode_characters >= 201411
u8 j = u8'j';
-u8 k = u8'kl'; // { dg-error "character constant too long for its type"
"" { target c++17 } }
-u8 l = u8'\U0001F525'; // { dg-error "character constant too long for its type"
"" { target c++17 } }
+u8 k = u8'kl'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++17 } }
+u8 l = u8'\U0001F525'; // { dg-error "character not encodable in a single code
unit" "" { target c++17 } }
#endif
#if __cpp_unicode_characters >= 200704
char16_t m = u'm';
-char16_t n = u'no'; // { dg-error "character constant too long for its type"
"" { target c++11 } }
+char16_t n = u'no'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++11 } }
char16_t o = u'\u05D9';
-char16_t p = u'\U0001F525'; // { dg-error "character constant too long for its type"
"" { target c++11 } }
+char16_t p = u'\U0001F525'; // { dg-error "character not encodable in a single code
unit" "" { target c++11 } }
char32_t q = U'm';
-char32_t r = U'no'; // { dg-error "character constant too long for its type"
"" { target c++11 } }
+char32_t r = U'no'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++11 } }
char32_t s = U'\u05D9';
char32_t t = U'\U0001F525';
#endif
-wchar_t u = L'\u0065\u0301'; // { dg-error "character constant too long for its
type" "" { target c++23 } }
- // { dg-warning "character constant too long for its
type" "" { target c++20_down } .-1 }
-wchar_t v = L'é'; // { dg-error "character constant too long for its type"
"" { target c++23 } }
- // { dg-warning "character constant too long for its
type" "" { target c++20_down } .-1 }
+wchar_t u = L'\u0065\u0301'; // { dg-error "multi-character literal cannot have an
encoding prefix" "" { target c++23 } }
+ // { dg-warning "multi-character literal cannot have an
encoding prefix" "" { target c++20_down } .-1 }
+wchar_t v = L'é'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++23 } }
+ // { dg-warning "multi-character literal cannot have an
encoding prefix" "" { target c++20_down } .-1 }
--- gcc/testsuite/g++.dg/cpp23/wchar-multi2.C.jj 2022-08-26 16:06:10.578493272 +0200
+++ gcc/testsuite/g++.dg/cpp23/wchar-multi2.C 2023-11-03 12:06:44.661347711 +0100
@@ -11,12 +11,12 @@ int d = '\U0001F525'; // { dg-warning "
#endif
int e = 'abcd'; // { dg-warning "multi-character character
constant" }
wchar_t f = L'f';
-wchar_t g = L'gh'; // { dg-error "character constant too long for its type"
"" { target c++23 } }
- // { dg-warning "character constant too long for its
type" "" { target c++20_down } .-1 }
-wchar_t h = L'ijkl'; // { dg-error "character constant too long for its type"
"" { target c++23 } }
- // { dg-warning "character constant too long for its
type" "" { target c++20_down } .-1 }
-wchar_t i = L'\U0001F525'; // { dg-error "character constant too long for its type"
"" { target { c++23 } } }
- // { dg-warning "character constant too long for its
type" "" { target { c++20_down } } .-1 }
+wchar_t g = L'gh'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++23 } }
+ // { dg-warning "multi-character literal cannot have an
encoding prefix" "" { target c++20_down } .-1 }
+wchar_t h = L'ijkl'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++23 } }
+ // { dg-warning "multi-character literal cannot have an
encoding prefix" "" { target c++20_down } .-1 }
+wchar_t i = L'\U0001F525'; // { dg-error "character not encodable in a single code
unit" "" { target { c++23 } } }
+ // { dg-warning "character not encodable in a single code
unit" "" { target { c++20_down } } .-1 }
#ifdef __cpp_char8_t
typedef char8_t u8;
#else
@@ -24,20 +24,20 @@ typedef char u8;
#endif
#if __cpp_unicode_characters >= 201411
u8 j = u8'j';
-u8 k = u8'kl'; // { dg-error "character constant too long for its type"
"" { target c++17 } }
-u8 l = u8'\U0001F525'; // { dg-error "character constant too long for its type"
"" { target c++17 } }
+u8 k = u8'kl'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++17 } }
+u8 l = u8'\U0001F525'; // { dg-error "character not encodable in a single code
unit" "" { target c++17 } }
#endif
#if __cpp_unicode_characters >= 200704
char16_t m = u'm';
-char16_t n = u'no'; // { dg-error "character constant too long for its type"
"" { target c++11 } }
+char16_t n = u'no'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++11 } }
char16_t o = u'\u05D9';
-char16_t p = u'\U0001F525'; // { dg-error "character constant too long for its type"
"" { target c++11 } }
+char16_t p = u'\U0001F525'; // { dg-error "character not encodable in a single code
unit" "" { target c++11 } }
char32_t q = U'm';
-char32_t r = U'no'; // { dg-error "character constant too long for its type"
"" { target c++11 } }
+char32_t r = U'no'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++11 } }
char32_t s = U'\u05D9';
char32_t t = U'\U0001F525';
#endif
-wchar_t u = L'\u0065\u0301'; // { dg-error "character constant too long for its
type" "" { target c++23 } }
- // { dg-warning "character constant too long for its
type" "" { target c++20_down } .-1 }
-wchar_t v = L'é'; // { dg-error "character constant too long for its type"
"" { target c++23 } }
- // { dg-warning "character constant too long for its
type" "" { target c++20_down } .-1 }
+wchar_t u = L'\u0065\u0301'; // { dg-error "multi-character literal cannot have an
encoding prefix" "" { target c++23 } }
+ // { dg-warning "multi-character literal cannot have an
encoding prefix" "" { target c++20_down } .-1 }
+wchar_t v = L'é'; // { dg-error "multi-character literal cannot have an encoding
prefix" "" { target c++23 } }
+ // { dg-warning "multi-character literal cannot have an
encoding prefix" "" { target c++20_down } .-1 }
--- gcc/testsuite/g++.dg/cpp26/literals1.C.jj 2023-11-03 09:56:28.103335149 +0100
+++ gcc/testsuite/g++.dg/cpp26/literals1.C 2023-11-03 12:43:05.043977373 +0100
@@ -0,0 +1,66 @@
+// C++26 P1854R4 - Making non-encodable string literals ill-formed
+// { dg-do compile { target c++11 } }
+// { dg-require-effective-target int32 }
+// { dg-options "-pedantic-errors -finput-charset=UTF-8 -fexec-charset=UTF-8" }
+
+int a = 'abcd'; // { dg-warning
"multi-character character constant" }
+int b = '\x61\x62\x63\x64'; // { dg-warning
"multi-character character constant" }
+int c = 'á'; // { dg-error "character not
encodable in a single execution character code unit" }
+int d = '😁'; // { dg-error "character not
encodable in a single execution character code unit" }
+int e = '\N{FACE WITH TEARS OF JOY}'; // { dg-error "character not
encodable in a single execution character code unit" }
+ // { dg-error "named universal
character escapes are only valid in" "" { target c++20_down } .-1 }
+int f = '\U0001F602'; // { dg-error "character not
encodable in a single execution character code unit" }
+wchar_t g = L'abcd'; // { dg-error "multi-character literal
cannot have an encoding prefix" "" { target c++23 } }
+ // { dg-warning "multi-character
literal cannot have an encoding prefix" "" { target c++20_down } .-1 }
+wchar_t h = L'\x61\x62\x63\x64'; // { dg-error "multi-character literal
cannot have an encoding prefix" "" { target c++23 } }
+ // { dg-warning "multi-character
literal cannot have an encoding prefix" "" { target c++20_down } .-1 }
+wchar_t i = L'á';
+char16_t j = u'abcd'; // { dg-error
"multi-character literal cannot have an encoding prefix" }
+char16_t k = u'\x61\x62\x63\x64'; // { dg-error
"multi-character literal cannot have an encoding prefix" }
+char16_t l = u'á';
+char16_t m = u'😁'; // { dg-error "character not
encodable in a single code unit" }
+char16_t n = u'\N{FACE WITH TEARS OF JOY}'; // { dg-error "character not
encodable in a single code unit" { target c++23 } }
+ // { dg-error "named universal
character escapes are only valid in" "" { target c++20_down } .-1 }
+char16_t o = u'\U0001F602'; // { dg-error "character not
encodable in a single code unit" }
+char32_t p = U'abcd'; // { dg-error
"multi-character literal cannot have an encoding prefix" }
+char32_t q = U'\x61\x62\x63\x64'; // { dg-error
"multi-character literal cannot have an encoding prefix" }
+char32_t r = U'á';
+char32_t s = U'😁';
+char32_t t = U'\N{FACE WITH TEARS OF JOY}'; // { dg-error "named universal
character escapes are only valid in" "" { target c++20_down } }
+char32_t u = U'\U0001F602';
+#if __cpp_unicode_characters >= 201411L
+auto v = u8'abcd'; // { dg-error "multi-character literal
cannot have an encoding prefix" "" { target c++17 } }
+auto w = u8'\x61\x62\x63\x64'; // { dg-error "multi-character literal
cannot have an encoding prefix" "" { target c++17 } }
+auto x = u8'á'; // { dg-error "character not
encodable in a single code unit" "" { target c++17 } }
+auto y = u8'😁'; // { dg-error "character not encodable
in a single code unit" "" { target c++17 } }
+auto z = u8'\N{FACE WITH TEARS OF JOY}'; // { dg-error "character not encodable
in a single code unit" "" { target c++17 } }
+ // { dg-error "named universal character
escapes are only valid in" "" { target { c++17 && c++20_down } } .-1 }
+auto aa = u8'\U0001F602'; // { dg-error "character not encodable
in a single code unit" "" { target c++17 } }
+#endif
+const char *ab = "😁";
+const char *ac = "\N{FACE WITH TEARS OF JOY}"; // { dg-error "named universal
character escapes are only valid in" "" { target c++20_down } }
+const char *ad = "\U0001F602";
+const char16_t *ae = u"😁";
+const char16_t *af = u"\N{FACE WITH TEARS OF JOY}"; // { dg-error "named universal character
escapes are only valid in" "" { target c++20_down } }
+const char16_t *ag = u"\U0001F602";
+const char32_t *ah = U"😁";
+const char32_t *ai = U"\N{FACE WITH TEARS OF JOY}"; // { dg-error "named universal character
escapes are only valid in" "" { target c++20_down } }
+const char32_t *aj = U"\U0001F602";
+auto ak = u8"😁";
+auto al = u8"\N{FACE WITH TEARS OF JOY}"; // { dg-error "named universal character
escapes are only valid in" "" { target c++20_down } }
+auto am = u8"\U0001F602";
+int an = '\x123456789'; // { dg-error "hex
escape sequence out of range" }
+wchar_t ao = L'\x123456789abcdef0'; // { dg-error "hex escape
sequence out of range" }
+char16_t ap = u'\x12345678'; // { dg-error "hex escape
sequence out of range" }
+char32_t aq = U'\x123456789abcdef0'; // { dg-error "hex escape
sequence out of range" }
+#if __cpp_unicode_characters >= 201411L
+auto ar = u8'\x123456789abcdef0'; // { dg-error "hex escape sequence out
of range" "" { target c++17 } }
+#endif
+char as = '\xff';
+#if __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 32
+wchar_t at = L'\xffffffff';
+#elif __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 16
+wchar_t at = L'\xffff';
+#endif
+int au = '\x1234'; // { dg-error "hex escape
sequence out of range" }
+int av = 'abcdefghijklmnop'; // { dg-warning
"multi-character literal with \[0-9]+ characters exceeds 'int' size of \[0-9]+
bytes" }
--- gcc/testsuite/g++.dg/cpp26/literals2.C.jj 2023-11-03 09:56:28.108335079 +0100
+++ gcc/testsuite/g++.dg/cpp26/literals2.C 2023-11-03 12:44:02.932171027 +0100
@@ -0,0 +1,68 @@
+// C++26 P1854R4 - Making non-encodable string literals ill-formed
+// { dg-do compile { target c++11 } }
+// { dg-require-effective-target int32 }
+// { dg-options "-pedantic-errors -finput-charset=UTF-8
-fexec-charset=ISO-8859-1" }
+/* { dg-require-iconv "ISO-8859-1" } */
+
+int a = 'abcd'; // { dg-warning
"multi-character character constant" }
+int b = '\x61\x62\x63\x64'; // { dg-warning
"multi-character character constant" }
+int c = 'á';
+int d = '😁'; // { dg-error "converting to
execution character set" }
+int e = '\N{FACE WITH TEARS OF JOY}'; // { dg-error "converting
UCN to execution character set" }
+ // { dg-error "named universal
character escapes are only valid in" "" { target c++20_down } .-1 }
+int f = '\U0001F602'; // { dg-error "converting
UCN to execution character set" }
+wchar_t g = L'abcd'; // { dg-error "multi-character literal
cannot have an encoding prefix" "" { target c++23 } }
+ // { dg-warning "multi-character
literal cannot have an encoding prefix" "" { target c++20_down } .-1 }
+wchar_t h = L'\x61\x62\x63\x64'; // { dg-error "multi-character literal
cannot have an encoding prefix" "" { target c++23 } }
+ // { dg-warning "multi-character
literal cannot have an encoding prefix" "" { target c++20_down } .-1 }
+wchar_t i = L'á';
+char16_t j = u'abcd'; // { dg-error
"multi-character literal cannot have an encoding prefix" }
+char16_t k = u'\x61\x62\x63\x64'; // { dg-error
"multi-character literal cannot have an encoding prefix" }
+char16_t l = u'á';
+char16_t m = u'😁'; // { dg-error "character not
encodable in a single code unit" }
+char16_t n = u'\N{FACE WITH TEARS OF JOY}'; // { dg-error "character not
encodable in a single code unit" { target c++23 } }
+ // { dg-error "named universal
character escapes are only valid in" "" { target c++20_down } .-1 }
+char16_t o = u'\U0001F602'; // { dg-error "character not
encodable in a single code unit" }
+char32_t p = U'abcd'; // { dg-error
"multi-character literal cannot have an encoding prefix" }
+char32_t q = U'\x61\x62\x63\x64'; // { dg-error
"multi-character literal cannot have an encoding prefix" }
+char32_t r = U'á';
+char32_t s = U'😁';
+char32_t t = U'\N{FACE WITH TEARS OF JOY}'; // { dg-error "named universal
character escapes are only valid in" "" { target c++20_down } }
+char32_t u = U'\U0001F602';
+#if __cpp_unicode_characters >= 201411L
+auto v = u8'abcd'; // { dg-error "multi-character literal
cannot have an encoding prefix" "" { target c++17 } }
+auto w = u8'\x61\x62\x63\x64'; // { dg-error "multi-character literal
cannot have an encoding prefix" "" { target c++17 } }
+auto x = u8'á'; // { dg-error "character not
encodable in a single code unit" "" { target c++17 } }
+auto y = u8'😁'; // { dg-error "character not encodable
in a single code unit" "" { target c++17 } }
+auto z = u8'\N{FACE WITH TEARS OF JOY}'; // { dg-error "character not encodable
in a single code unit" "" { target c++17 } }
+ // { dg-error "named universal character
escapes are only valid in" "" { target { c++17 && c++20_down } } .-1 }
+auto aa = u8'\U0001F602'; // { dg-error "character not encodable
in a single code unit" "" { target c++17 } }
+#endif
+const char *ab = "😁"; // { dg-error
"converting to execution character set" }
+const char *ac = "\N{FACE WITH TEARS OF JOY}"; // { dg-error
"converting UCN to execution character set" }
+ // { dg-error "named universal
character escapes are only valid in" "" { target c++20_down } .-1 }
+const char *ad = "\U0001F602"; // { dg-error
"converting UCN to execution character set" }
+const char16_t *ae = u"😁";
+const char16_t *af = u"\N{FACE WITH TEARS OF JOY}"; // { dg-error "named universal character
escapes are only valid in" "" { target c++20_down } }
+const char16_t *ag = u"\U0001F602";
+const char32_t *ah = U"😁";
+const char32_t *ai = U"\N{FACE WITH TEARS OF JOY}"; // { dg-error "named universal character
escapes are only valid in" "" { target c++20_down } }
+const char32_t *aj = U"\U0001F602";
+auto ak = u8"😁";
+auto al = u8"\N{FACE WITH TEARS OF JOY}"; // { dg-error "named universal character
escapes are only valid in" "" { target c++20_down } }
+auto am = u8"\U0001F602";
+int an = '\x123456789'; // { dg-error "hex
escape sequence out of range" }
+wchar_t ao = L'\x123456789abcdef0'; // { dg-error "hex escape
sequence out of range" }
+char16_t ap = u'\x12345678'; // { dg-error "hex escape
sequence out of range" }
+char32_t aq = U'\x123456789abcdef0'; // { dg-error "hex escape
sequence out of range" }
+#if __cpp_unicode_characters >= 201411L
+auto ar = u8'\x123456789abcdef0'; // { dg-error "hex escape sequence out
of range" "" { target c++17 } }
+#endif
+char as = '\xff';
+#if __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 32
+wchar_t at = L'\xffffffff';
+#elif __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 16
+wchar_t at = L'\xffff';
+#endif
+int au = '\x1234'; // { dg-error "hex escape
sequence out of range" }
+int av = 'abcdefghijklmnop'; // { dg-warning
"multi-character literal with \[0-9]+ characters exceeds 'int' size of \[0-9]+
bytes" }
--- gcc/testsuite/gcc.dg/c2x-utf8char-3.c.jj 2020-01-14 20:02:47.234603078 +0100
+++ gcc/testsuite/gcc.dg/c2x-utf8char-3.c 2023-11-03 16:43:32.965924076 +0100
@@ -3,6 +3,6 @@
/* { dg-options "-std=c2x -pedantic-errors" } */
unsigned char a = u8''; /* { dg-error "empty character constant" } */
-unsigned char b = u8'ab'; /* { dg-error "character constant too long for its
type" } */
-unsigned char c = u8'\u00ff'; /* { dg-error "character constant too long for its
type" } */
+unsigned char b = u8'ab'; /* { dg-error "multi-character literal cannot have an
encoding prefix" } */
+unsigned char c = u8'\u00ff'; /* { dg-error "character not encodable in a single
code unit" } */
unsigned char d = u8'\x100'; /* { dg-error "hex escape sequence out of range"
} */
--- gcc/testsuite/gcc.dg/cpp/charconst-4.c.jj 2020-01-14 20:02:47.252602809 +0100
+++ gcc/testsuite/gcc.dg/cpp/charconst-4.c 2023-11-03 16:40:43.060203232 +0100
@@ -38,7 +38,7 @@ extern void abort (void);
# error Charconst incorrectly sign-extended
#endif
-#if LONG_CHARCONST != SHORT_CHARCONST /* { dg-warning "too long" } */
+#if LONG_CHARCONST != SHORT_CHARCONST /* { dg-warning "multi-character literal with
\[0-9]+ characters exceeds 'int' size of \[0-9]+ bytes" } */
# error Overly long charconst truncates wrongly for preprocessor
#endif
@@ -46,7 +46,7 @@ int main ()
{
if (POS_CHARCONST < 0)
abort ();
- if (LONG_CHARCONST != SHORT_CHARCONST) /* { dg-warning "too long" } */
+ if (LONG_CHARCONST != SHORT_CHARCONST) /* { dg-warning "multi-character literal
with \[0-9]+ characters exceeds 'int' size of \[0-9]+ bytes" } */
abort ();
return 0;
}
--- gcc/testsuite/gcc.dg/cpp/charconst.c.jj 2020-01-14 20:02:47.252602809 +0100
+++ gcc/testsuite/gcc.dg/cpp/charconst.c 2023-11-03 16:42:02.911132097 +0100
@@ -11,9 +11,9 @@
#endif
#if L'' /* { dg-error "empty" "empty wide charconst" }
*/
#endif
-#if 'very long' /* { dg-warning "too long" "long charconst" } */
+#if 'very long' /* { dg-warning "multi-character literal with \[0-9]+
characters exceeds 'int' size of \[0-9]+ bytes" "long charconst" } */
#endif
-#if L'very long' /* { dg-warning "too long" "long wide charconst" } */
+#if L'very long' /* { dg-warning "multi-character literal cannot have an encoding
prefix" "long wide charconst" } */
#endif
/* Don't do this test for L'ab'; it depends upon sizeof (wchar_t). */
#if 'ab' /* { dg-warning "multi-char" "multi-character" } */
@@ -27,10 +27,10 @@ void foo ()
c = ''; /* { dg-error "empty" "empty charconst" } */
w = L''; /* { dg-error "empty" "empty wide charconst" } */
- c = 'very long'; /* { dg-warning "too long" "long charconst" } */
- w = L'very long'; /* { dg-warning "too long" "long wide charconst" } */
+ c = 'very long'; /* { dg-warning "multi-character literal with \[0-9]+ characters
exceeds 'int' size of \[0-9]+ bytes" "long charconst" } */
+ w = L'very long'; /* { dg-warning "multi-character literal cannot have an encoding
prefix" "long wide charconst" } */
c = 'ab'; /* { dg-warning "multi-char" "multi-char" } */
/* Wide charconsts cannot contain more than one wide character. */
- w = L'ab'; /* { dg-warning "too long" "multi-char wide" } */
+ w = L'ab'; /* { dg-warning "multi-character literal cannot have an encoding
prefix" "multi-char wide" } */
}
--- gcc/testsuite/gcc.dg/cpp/if-2.c.jj 2020-01-14 20:02:47.255602764 +0100
+++ gcc/testsuite/gcc.dg/cpp/if-2.c 2023-11-03 16:39:22.105289183 +0100
@@ -21,7 +21,7 @@
#if 'abcd' /* { dg-warning "(multi-character character constant)|(character constant (is )?too long)" "multi-character charconst" } */
#endif
-#if 'abcdefghi' /* { dg-warning "character constant (is )?too long" "charconst too long" } */
+#if 'abcdefghi' /* { dg-warning "multi-character literal with \[0-9]+ characters exceeds 'int' size of \[0-9]+ bytes" } */
#endif
#if '' /* { dg-error "empty character constant" "empty charconst" } */
--- gcc/testsuite/gcc.dg/utf16-4.c.jj 2020-01-14 20:02:47.499599109 +0100
+++ gcc/testsuite/gcc.dg/utf16-4.c 2023-11-03 16:55:43.102129866 +0100
@@ -6,8 +6,8 @@
typedef __CHAR16_TYPE__ char16_t;
char16_t c0 = u''; /* { dg-error "empty character" } */
-char16_t c1 = u'ab'; /* { dg-warning "constant too long" } */
-char16_t c2 = u'\U00064321'; /* { dg-warning "constant too long" } */
+char16_t c1 = u'ab'; /* { dg-warning "multi-character literal cannot have an encoding prefix" } */
+char16_t c2 = u'\U00064321'; /* { dg-warning "character not encodable in a single code unit" } */
char16_t c3 = 'a';
char16_t c4 = U'a';
@@ -16,6 +16,6 @@ char16_t c6 = U'\U00064321'; /* { dg-war
char16_t c7 = L'a';
char16_t c8 = L'\u2029';
char16_t c9 = L'\U00064321'; /* { dg-warning "conversion" "" { target { 4byte_wchar_t } } } */
- /* { dg-warning "constant too long" "" { target { ! 4byte_wchar_t } } .-1 } */
+ /* { dg-warning "character not encodable in a single code unit" "" { target { ! 4byte_wchar_t } } .-1 } */
int main () {}
--- gcc/testsuite/gcc.dg/utf32-4.c.jj 2020-01-14 20:02:47.500599094 +0100
+++ gcc/testsuite/gcc.dg/utf32-4.c 2023-11-03 16:45:33.625305532 +0100
@@ -6,15 +6,15 @@
typedef __CHAR32_TYPE__ char32_t;
char32_t c0 = U''; /* { dg-error "empty character" } */
-char32_t c1 = U'ab'; /* { dg-warning "constant too long" } */
+char32_t c1 = U'ab'; /* { dg-warning "multi-character literal cannot have an encoding prefix" } */
char32_t c2 = U'\U00064321';
char32_t c3 = 'a';
char32_t c4 = u'a';
char32_t c5 = u'\u2029';
-char32_t c6 = u'\U00064321'; /* { dg-warning "constant too long" } */
+char32_t c6 = u'\U00064321'; /* { dg-warning "character not encodable in a single code unit" } */
char32_t c7 = L'a';
char32_t c8 = L'\u2029';
-char32_t c9 = L'\U00064321'; /* { dg-warning "constant too long" "" { target { ! 4byte_wchar_t } } } */
+char32_t c9 = L'\U00064321'; /* { dg-warning "character not encodable in a single code unit" "" { target { ! 4byte_wchar_t } } } */
int main () {}
--- gcc/testsuite/g++.dg/cpp1z/utf8-neg.C.jj 2020-01-14 20:02:46.792609699 +0100
+++ gcc/testsuite/g++.dg/cpp1z/utf8-neg.C 2023-11-03 16:48:25.384001519 +0100
@@ -1,6 +1,6 @@
/* { dg-do compile { target c++17 } } */
const static char c0 = u8''; // { dg-error "empty character" }
-const static char c1 = u8'ab'; // { dg-error "character constant too long for its type" }
-const static char c2 = u8'\u0124'; // { dg-error "character constant too long for its type" }
-const static char c3 = u8'\U00064321'; // { dg-error "character constant too long for its type" }
+const static char c1 = u8'ab'; // { dg-error "multi-character literal cannot have an encoding prefix" }
+const static char c2 = u8'\u0124'; // { dg-error "character not encodable in a single code unit" }
+const static char c3 = u8'\U00064321'; // { dg-error "character not encodable in a single code unit" }
--- gcc/testsuite/g++.dg/cpp2a/ucn2.C.jj 2020-05-13 21:38:28.363420230 +0200
+++ gcc/testsuite/g++.dg/cpp2a/ucn2.C 2023-11-03 16:50:23.872412086 +0100
@@ -12,18 +12,18 @@ const char32_t *f = U"\uD802"; // { dg-
const char32_t *g = U"\U0000DFF0"; // { dg-error "is not a valid universal character" }
const char32_t *h = U"\U00110001"; // { dg-error "is outside the UCS codespace" "" { target c++20 } }
#if __cpp_unicode_characters >= 201411
-const char8_t i = u8'\u00C0'; // { dg-error "character constant too long for its type" "" { target c++17 } }
+const char8_t i = u8'\u00C0'; // { dg-error "character not encodable in a single code unit" "" { target c++17 } }
#endif
-const char16_t j = u'\U0001F914'; // { dg-error "character constant too long for its type" }
+const char16_t j = u'\U0001F914'; // { dg-error "character not encodable in a single code unit" }
const char32_t k = U'\U0001F914';
#if __cpp_unicode_characters >= 201411
-const char8_t l = u8'ab'; // { dg-error "character constant too long for its type" "" { target c++17 } }
+const char8_t l = u8'ab'; // { dg-error "multi-character literal cannot have an encoding prefix" "" { target c++17 } }
#endif
-const char16_t m = u'ab'; // { dg-error "character constant too long for its type" }
-const char32_t n = U'ab'; // { dg-error "character constant too long for its type" }
+const char16_t m = u'ab'; // { dg-error "multi-character literal cannot have an encoding prefix" }
+const char32_t n = U'ab'; // { dg-error "multi-character literal cannot have an encoding prefix" }
#if __cpp_unicode_characters >= 201411
const char8_t o = u8'\U00110002'; // { dg-error "is outside the UCS codespace" "" { target c++20 } }
- // { dg-error "character constant too long for its type" "" { target c++17 } .-1 }
+ // { dg-error "character not encodable in a single code unit" "" { target c++17 } .-1 }
#endif
const char16_t p = u'\U00110003'; // { dg-error "is outside the UCS codespace" "" { target c++20 } }
// { dg-error "converting UCN to execution character set" "" { target *-*-* } .-1 }
--- gcc/testsuite/g++.dg/ext/utf16-4.C.jj 2020-01-14 20:02:46.841608965 +0100
+++ gcc/testsuite/g++.dg/ext/utf16-4.C 2023-11-03 16:58:51.134607555 +0100
@@ -4,8 +4,8 @@
const static char16_t c0 = u''; /* { dg-error "empty character" } */
-const static char16_t c1 = u'ab'; /* { dg-error "constant too long" } */
-const static char16_t c2 = u'\U00064321'; /* { dg-error "constant too long" } */
+const static char16_t c1 = u'ab'; /* { dg-error "multi-character literal cannot have an encoding prefix" } */
+const static char16_t c2 = u'\U00064321'; /* { dg-error "character not encodable in a single code unit" } */
const static char16_t c3 = 'a';
const static char16_t c4 = U'a';
@@ -14,5 +14,6 @@ const static char16_t c6 = U'\U00064321'
const static char16_t c7 = L'a';
const static char16_t c8 = L'\u2029';
const static char16_t c9 = L'\U00064321'; /* { dg-warning "conversion from .wchar_t. to .char16_t. changes value from .410401. to .17185." "" { target { 4byte_wchar_t } } } */
- /* { dg-warning "constant too long" "" { target { ! 4byte_wchar_t } } .-1 } */
+ /* { dg-warning "character not encodable in a single code unit" "" { target { { ! 4byte_wchar_t } && c++20_down } } .-1 } */
+ /* { dg-error "character not encodable in a single code unit" "" { target { { ! 4byte_wchar_t } && c++23 } } .-2 } */
int main () {}
--- gcc/testsuite/g++.dg/ext/utf32-4.C.jj 2020-01-14 20:02:46.841608965 +0100
+++ gcc/testsuite/g++.dg/ext/utf32-4.C 2023-11-03 16:59:41.155924498 +0100
@@ -3,15 +3,16 @@
/* { dg-do compile { target c++11 } } */
const static char32_t c0 = U''; /* { dg-error "empty character" } */
-const static char32_t c1 = U'ab'; /* { dg-error "constant too long" } */
+const static char32_t c1 = U'ab'; /* { dg-error "multi-character literal cannot have an encoding prefix" } */
const static char32_t c2 = U'\U00064321';
const static char32_t c3 = 'a';
const static char32_t c4 = u'a';
const static char32_t c5 = u'\u2029';
-const static char32_t c6 = u'\U00064321'; /* { dg-error "constant too long" } */
+const static char32_t c6 = u'\U00064321'; /* { dg-error "character not encodable in a single code unit" } */
const static char32_t c7 = L'a';
const static char32_t c8 = L'\u2029';
-const static char32_t c9 = L'\U00064321'; /* { dg-warning "constant too long" "" { target { ! 4byte_wchar_t } } } */
+const static char32_t c9 = L'\U00064321'; /* { dg-warning "character not encodable in a single code unit" "" { target { { ! 4byte_wchar_t } && c++20_down } } } */
+ /* { dg-error "character not encodable in a single code unit" "" { target { { ! 4byte_wchar_t } && c++23 } } .-1 } */
int main () {}
Jakub