https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86419
--- Comment #7 from Dimitrij Mijoski <dmjpp at hotmail dot com> ---
I think I found a related bug in the UTF-8 to UCS-2 codecvt,
codecvt_utf8<char16_t>. It can be tested with the following example:
#include <cassert>
#include <codecvt>
#include <cwchar>
#include <iostream>
// Reproducer for the UTF-8 -> UCS-2 direction of
// std::codecvt_utf8<char16_t>::in().
//
// Each case resets the conversion state and both "next" pointers, converts a
// prefix of the input into an output buffer of a given capacity, and then
// asserts the expected result code together with how far the input and
// output pointers advanced. Asserts tagged "BUG" are the ones reported as
// failing; the note states the result that was observed instead.
auto test_u8_ucs2_in()
{
    // 2 code points, one is 3 bytes and the other is 4 bytes in UTF-8.
    // In UTF-16 the first is a single unit, the second is a surrogate pair.
    // In UCS-2 only the first code point is allowed.
    const char* in = u8"\uAAAA\U0010AAAA";
    char16_t out[2] = { 'y', 'y' };

    // codecvt_utf8 has a public destructor, so the facet can live on the
    // stack; no heap allocation is needed.
    std::codecvt_utf8<char16_t> cvt;
    std::mbstate_t state{};
    const char* in_ptr = nullptr;
    char16_t* out_ptr = nullptr;

    // Runs one conversion over in[0, in_len) into out[0, out_cap).
    // The state and both "next" pointers are cleared first so that every
    // assert below checks values written by this particular call.
    auto convert = [&](int in_len, int out_cap) {
        state = {};
        in_ptr = nullptr;
        out_ptr = nullptr;
        return cvt.in(state, in, in + in_len, in_ptr,
                      out, out + out_cap, out_ptr);
    };

    // Truncated first code point (2 of 3 bytes), no room for output.
    std::codecvt_base::result res = convert(2, 0);
    assert(res == std::codecvt_base::partial); // BUG, returns OK, should be Partial
    assert(out_ptr == out);
    assert(in_ptr == in);

    // Truncated first code point, room for one unit.
    res = convert(2, 1);
    assert(res == std::codecvt_base::partial); // BUG, returns ERROR, should be Partial
    assert(out_ptr == out);
    assert(in_ptr == in);

    // Complete first code point, no room for output.
    res = convert(3, 0);
    assert(res == std::codecvt_base::partial); // BUG, returns OK, should be Partial
    assert(out_ptr == out);
    assert(in_ptr == in);

    // Complete first code point, room for one unit: the only plain success.
    res = convert(3, 1);
    assert(res == std::codecvt_base::ok);
    assert(out_ptr == out + 1);
    assert(in_ptr == in + 3);
    std::cout << "UCS2 sequence: " << std::hex << out[0] << ' ' << out[1] << '\n';

    // First code point plus a truncated second one (3 of 4 bytes), room for
    // one unit.
    res = convert(6, 1);
    assert(res == std::codecvt_base::partial); // BUG, returns OK, should be Partial
    assert(out_ptr == out + 1);
    assert(in_ptr == in + 3);

    // Same input, room for two units.
    res = convert(6, 2);
    assert(res == std::codecvt_base::partial); // BUG, returns ERROR, should be Partial
    assert(out_ptr == out + 1);
    assert(in_ptr == in + 3);

    // Whole input, room for one unit.
    res = convert(7, 1);
    assert(res == std::codecvt_base::partial); // BUG, returns OK, should be Partial
    assert(out_ptr == out + 1);
    assert(in_ptr == in + 3);

    // Whole input, room for two units: the second code point does not fit in
    // UCS-2, so this is a genuine error after the first code point.
    res = convert(7, 2);
    assert(res == std::codecvt_base::error);
    assert(out_ptr == out + 1);
    assert(in_ptr == in + 3);
}
The bug lies in the same function utf16_in() I mentioned in comment #5, in
lines 544-547
https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=libstdc%2B%2B-v3/src/c%2B%2B11/codecvt.cc;h=0311b15177d0439757e0347f7934b5a09b78f8e3;hb=HEAD#l544
Those lines:
544 if (s == surrogates::allowed)
545 return codecvt_base::partial;
546 else
547 return codecvt_base::error; // No surrogates in UCS2
Should simply be one line:
544 return codecvt_base::partial;