The JSON parser treats each half of a surrogate pair as an unpaired surrogate. Fix it to recognize surrogate pairs.
Signed-off-by: Markus Armbruster <arm...@redhat.com> --- qobject/json-parser.c | 16 +++++++++++++++- tests/check-qjson.c | 3 +-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/qobject/json-parser.c b/qobject/json-parser.c index bb54886809..703065fa2b 100644 --- a/qobject/json-parser.c +++ b/qobject/json-parser.c @@ -115,7 +115,7 @@ static QString *parse_string(JSONParserContext *ctxt, JSONToken *token) const char *ptr = token->str; QString *str; char quote; - int cp, i; + int cp, i, leading_surrogate; char *end; ssize_t len; char utf8_buf[5]; @@ -156,6 +156,8 @@ static QString *parse_string(JSONParserContext *ctxt, JSONToken *token) qstring_append_chr(str, '\t'); break; case 'u': + leading_surrogate = 0; + hex: cp = 0; for (i = 0; i < 4; i++) { ptr++; @@ -168,6 +170,18 @@ static QString *parse_string(JSONParserContext *ctxt, JSONToken *token) cp |= hex2decimal(*ptr); } + if (cp >= 0xD800 && cp <= 0xDBFF && !leading_surrogate + && ptr[1] == '\\' && ptr[2] == 'u') { + ptr += 2; + leading_surrogate = cp; + goto hex; + } + if (cp >= 0xDC00 && cp <= 0xDFFF && leading_surrogate) { + cp &= 0x3FF; + cp |= (leading_surrogate & 0x3FF) << 10; + cp += 0x010000; + } + if (mod_utf8_encode(utf8_buf, sizeof(utf8_buf), cp) < 0) { parse_error(ctxt, token, "\\u%.4s is not a valid Unicode character", diff --git a/tests/check-qjson.c b/tests/check-qjson.c index 422697459f..3d3a3f105f 100644 --- a/tests/check-qjson.c +++ b/tests/check-qjson.c @@ -61,8 +61,7 @@ static void escaped_string(void) { "double byte utf-8 \\u00A2", "double byte utf-8 \xc2\xa2" }, { "triple byte utf-8 \\u20AC", "triple byte utf-8 \xe2\x82\xac" }, { "quadruple byte utf-8 \\uD834\\uDD1E", /* U+1D11E */ - /* bug: want \xF0\x9D\x84\x9E */ - NULL }, + "quadruple byte utf-8 \xF0\x9D\x84\x9E" }, { "\\z", NULL }, { "\\ux", NULL }, { "\\u1x", NULL }, -- 2.17.1