comments below On 03/14/13 18:49, Markus Armbruster wrote:
> diff --git a/qobject/qjson.c b/qobject/qjson.c > index 83a6b4f..19085a1 100644 > --- a/qobject/qjson.c > +++ b/qobject/qjson.c > @@ -136,68 +136,56 @@ static void to_json(const QObject *obj, QString *str, > int pretty, int indent) > case QTYPE_QSTRING: { > QString *val = qobject_to_qstring(obj); > const char *ptr; > + int cp; > + char buf[16]; > + char *end; > > ptr = qstring_get_str(val); > qstring_append(str, "\""); > - while (*ptr) { > - if ((ptr[0] & 0xE0) == 0xE0 && > - (ptr[1] & 0x80) && (ptr[2] & 0x80)) { > - uint16_t wchar; > - char escape[7]; > - > - wchar = (ptr[0] & 0x0F) << 12; > - wchar |= (ptr[1] & 0x3F) << 6; > - wchar |= (ptr[2] & 0x3F); > - ptr += 2; > - > - snprintf(escape, sizeof(escape), "\\u%04X", wchar); > - qstring_append(str, escape); > - } else if ((ptr[0] & 0xE0) == 0xC0 && (ptr[1] & 0x80)) { > - uint16_t wchar; > - char escape[7]; > - > - wchar = (ptr[0] & 0x1F) << 6; > - wchar |= (ptr[1] & 0x3F); > - ptr++; > - > - snprintf(escape, sizeof(escape), "\\u%04X", wchar); > - qstring_append(str, escape); > - } else switch (ptr[0]) { > - case '\"': > - qstring_append(str, "\\\""); > - break; > - case '\\': > - qstring_append(str, "\\\\"); > - break; > - case '\b': > - qstring_append(str, "\\b"); > - break; > - case '\f': > - qstring_append(str, "\\f"); > - break; > - case '\n': > - qstring_append(str, "\\n"); > - break; > - case '\r': > - qstring_append(str, "\\r"); > - break; > - case '\t': > - qstring_append(str, "\\t"); > - break; > - default: { > - if (ptr[0] <= 0x1F) { > - char escape[7]; > - snprintf(escape, sizeof(escape), "\\u%04X", ptr[0]); > - qstring_append(str, escape); > - } else { > - char buf[2] = { ptr[0], 0 }; > - qstring_append(str, buf); > - } > - break; > + > + for (; *ptr; ptr = end) { > + cp = mod_utf8_codepoint(ptr, 6, &end); This provides more background: you never call mod_utf8_codepoint() with '\0' at offset 0. So handling that in mod_utf8_codepoint() may not be that important. If a '\0' is found at offset >= 1, it will correctly trigger the /* continuation byte missing */ branch in mod_utf8_codepoint(). The retval is -1, and *end is left pointing to the NUL byte. (This is consistent with mod_utf8_codepoint()'s docs.) The -1 (incomplete sequence) produces the replacement character below, and the next time around *ptr is '\0', so we finish the loop. Seems OK. ( An alternative interface for mod_utf8_codepoint() might be something like: size_t alternative(const char *ptr, int *cp, size_t n); Resembling read() somewhat: - the return value would be the number of bytes consumed (it can't be negative (= fatal error), because we guarantee progress). 0 is EOF and only possible when "n" is 0. - "ptr" is the source, - "cp" is the output code point, -1 if invalid, - "n" is the bytes available in the source / requested to process at most. Encountering a \0 in the byte stream would be an error (*cp = -1), but would not terminate parsing per se. Then the loop would look like: processed = 0; while (processed < full) { int cp; rd = alternative(ptr + processed, &cp, full - processed); g_assert(rd > 0); /* look at cp */ processed += rd; } But of course I'm not suggesting to rewrite the function! ) > + switch (cp) { > + case '\"': > + qstring_append(str, "\\\""); > + break; > + case '\\': > + qstring_append(str, "\\\\"); > + break; > + case '\b': > + qstring_append(str, "\\b"); > + break; > + case '\f': > + qstring_append(str, "\\f"); > + break; > + case '\n': > + qstring_append(str, "\\n"); > + break; > + case '\r': > + qstring_append(str, "\\r"); > + break; > + case '\t': > + qstring_append(str, "\\t"); > + break; The C standard also names \a (alert) and \v (vertical tab); I'm not sure about their JSON notation. (The (cp < 0x20) condition catches them below of course.) > + default: > + if (cp < 0) { > + cp = 0xFFFD; /* replacement character */ > } > + if (cp > 0xFFFF) { > + /* beyond BMP; need a surrogate pair */ > + snprintf(buf, sizeof(buf), "\\u%04X\\u%04X", > + 0xD800 + ((cp - 0x10000) >> 10), > + 0xDC00 + ((cp - 0x10000) & 0x3FF)); Seems like we write 13 bytes into buf, OK. Also cp is never greater than 0x10FFFF, hence the difference is at most 0xFFFFF. The RHS surrogate half can go up to 0xDFFF, the LHS up to 0xD800+0x3FF == 0xDBFF. Good. > + } else if (cp < 0x20 || cp >= 0x7F) { > + snprintf(buf, sizeof(buf), "\\u%04X", cp); > + } else { > + buf[0] = cp; > + buf[1] = 0; > } > - ptr++; > - } > + qstring_append(str, buf); > + } > + }; > + > qstring_append(str, "\""); > break; > } Seems OK. > diff --git a/tests/check-qjson.c b/tests/check-qjson.c > index efec1b2..595ddc0 100644 > --- a/tests/check-qjson.c > +++ b/tests/check-qjson.c I'll trust you on that one :) Reviewed-by: Laszlo Ersek <ler...@redhat.com>