details:   https://hg.nginx.org/njs/rev/855edd76bdb6
branches:
changeset: 1472:855edd76bdb6
user:      Alexander Borisov <alexander.bori...@nginx.com>
date:      Wed Jul 15 19:19:19 2020 +0300
description:
Introduced UTF-8 decoder according to WHATWG encoding spec.
diffstat: src/njs_json.c | 8 +- src/njs_parser.c | 73 ++++++--- src/njs_string.c | 293 +++++++++++++++++++++---------------- src/njs_unicode.h | 4 + src/njs_utf8.c | 333 ++++++++++++++++++++---------------------- src/njs_utf8.h | 40 ++-- src/test/njs_unit_test.c | 44 ++++- src/test/unicode_unit_test.c | 53 ++++-- 8 files changed, 466 insertions(+), 382 deletions(-) diffs (truncated from 1394 to 1000 lines): diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_json.c --- a/src/njs_json.c Wed Jul 15 19:19:18 2020 +0300 +++ b/src/njs_json.c Wed Jul 15 19:19:19 2020 +0300 @@ -728,7 +728,7 @@ njs_json_parse_string(njs_json_parse_ctx if (njs_surrogate_any(utf)) { if (utf > 0xdbff || p[0] != '\\' || p[1] != 'u') { - s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT); + s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT); continue; } @@ -741,12 +741,12 @@ njs_json_parse_string(njs_json_parse_ctx utf = njs_string_surrogate_pair(utf, utf_low); } else if (njs_surrogate_leading(utf_low)) { - utf = NJS_UTF8_REPLACEMENT; - s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT); + utf = NJS_UNICODE_REPLACEMENT; + s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT); } else { utf = utf_low; - s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT); + s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT); } } diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_parser.c --- a/src/njs_parser.c Wed Jul 15 19:19:18 2020 +0300 +++ b/src/njs_parser.c Wed Jul 15 19:19:19 2020 +0300 @@ -7896,11 +7896,12 @@ njs_int_t njs_parser_string_create(njs_vm_t *vm, njs_lexer_token_t *token, njs_value_t *value) { - u_char *dst; - ssize_t size, length; - uint32_t cp; - njs_str_t *src; - const u_char *p, *end; + u_char *dst; + ssize_t size, length; + uint32_t cp; + njs_str_t *src; + const u_char *p, *end; + njs_unicode_decode_t ctx; src = &token->text; @@ -7914,10 +7915,17 @@ njs_parser_string_create(njs_vm_t *vm, n p = src->start; end = src->start + src->length; + njs_utf8_decode_init(&ctx); + while (p < end) { - cp = njs_utf8_safe_decode(&p, end); - - dst = 
njs_utf8_encode(dst, cp); + cp = njs_utf8_decode(&ctx, &p, end); + + if (cp <= NJS_UNICODE_MAX_CODEPOINT) { + dst = njs_utf8_encode(dst, cp); + + } else { + dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT); + } } if (length > NJS_STRING_MAP_STRIDE && size != length) { @@ -7932,12 +7940,13 @@ static njs_token_type_t njs_parser_escape_string_create(njs_parser_t *parser, njs_lexer_token_t *token, njs_value_t *value) { - u_char c, *start, *dst; - size_t size, length, hex_length; - uint64_t cp, cp_pair; - njs_int_t ret; - njs_str_t *string; - const u_char *src, *end, *hex_end; + u_char c, *start, *dst; + size_t size, length, hex_length; + uint64_t cp, cp_pair; + njs_int_t ret; + njs_str_t *string; + const u_char *src, *end, *hex_end; + njs_unicode_decode_t ctx; ret = njs_parser_escape_string_calc_length(parser, token, &size, &length); if (njs_slow_path(ret != NJS_OK)) { @@ -8053,7 +8062,13 @@ njs_parser_escape_string_create(njs_pars src--; - cp = njs_utf8_safe_decode2(&src, end); + njs_utf8_decode_init(&ctx); + + cp = njs_utf8_decode(&ctx, &src, end); + if (cp > NJS_UNICODE_MAX_CODEPOINT) { + cp = NJS_UNICODE_REPLACEMENT; + } + dst = njs_utf8_encode(dst, cp); continue; @@ -8076,12 +8091,12 @@ njs_parser_escape_string_create(njs_pars cp = njs_string_surrogate_pair(cp_pair, cp); } else if (njs_slow_path(njs_surrogate_leading(cp))) { - cp = NJS_UTF8_REPLACEMENT; + cp = NJS_UNICODE_REPLACEMENT; dst = njs_utf8_encode(dst, (uint32_t) cp); } else { - dst = njs_utf8_encode(dst, NJS_UTF8_REPLACEMENT); + dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT); } cp_pair = 0; @@ -8092,7 +8107,7 @@ njs_parser_escape_string_create(njs_pars continue; } - cp = NJS_UTF8_REPLACEMENT; + cp = NJS_UNICODE_REPLACEMENT; } dst = njs_utf8_encode(dst, (uint32_t) cp); @@ -8116,10 +8131,11 @@ static njs_int_t njs_parser_escape_string_calc_length(njs_parser_t *parser, njs_lexer_token_t *token, size_t *out_size, size_t *out_length) { - size_t size, length, hex_length; - uint64_t cp, cp_pair; - 
njs_str_t *string; - const u_char *ptr, *src, *end, *hex_end; + size_t size, length, hex_length; + uint64_t cp, cp_pair; + njs_str_t *string; + const u_char *ptr, *src, *end, *hex_end; + njs_unicode_decode_t ctx; size = 0; length = 0; @@ -8173,7 +8189,12 @@ njs_parser_escape_string_calc_length(njs } if (*src >= 0x80) { - cp = njs_utf8_safe_decode2(&src, end); + njs_utf8_decode_init(&ctx); + + cp = njs_utf8_decode(&ctx, &src, end); + if (cp > NJS_UNICODE_MAX_CODEPOINT) { + cp = NJS_UNICODE_REPLACEMENT; + } size += njs_utf8_size(cp); length++; @@ -8220,13 +8241,13 @@ njs_parser_escape_string_calc_length(njs cp = njs_string_surrogate_pair(cp_pair, cp); } else if (njs_slow_path(njs_surrogate_leading(cp))) { - cp = NJS_UTF8_REPLACEMENT; + cp = NJS_UNICODE_REPLACEMENT; size += njs_utf8_size(cp); length++; } else { - size += njs_utf8_size(NJS_UTF8_REPLACEMENT); + size += njs_utf8_size(NJS_UNICODE_REPLACEMENT); length++; } @@ -8238,7 +8259,7 @@ njs_parser_escape_string_calc_length(njs continue; } - cp = NJS_UTF8_REPLACEMENT; + cp = NJS_UNICODE_REPLACEMENT; } size += njs_utf8_size(cp); diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_string.c --- a/src/njs_string.c Wed Jul 15 19:19:18 2020 +0300 +++ b/src/njs_string.c Wed Jul 15 19:19:19 2020 +0300 @@ -20,10 +20,8 @@ static njs_int_t njs_string_slice_prop(n njs_slice_prop_t *slice, njs_value_t *args, njs_uint_t nargs); static njs_int_t njs_string_slice_args(njs_vm_t *vm, njs_slice_prop_t *slice, njs_value_t *args, njs_uint_t nargs); -static njs_int_t njs_string_from_char_code(njs_vm_t *vm, - njs_value_t *args, njs_uint_t nargs, njs_index_t unused); -static njs_int_t njs_string_from_code_point(njs_vm_t *vm, njs_value_t *args, - njs_uint_t nargs, njs_index_t unused); +static njs_int_t njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args, + njs_uint_t nargs, njs_index_t is_point); static njs_int_t njs_string_bytes_from(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused); static njs_int_t 
njs_string_bytes_from_array_like(njs_vm_t *vm, @@ -545,7 +543,7 @@ static const njs_object_prop_t njs_stri { .type = NJS_PROPERTY, .name = njs_string("fromCharCode"), - .value = njs_native_function(njs_string_from_char_code, 1), + .value = njs_native_function2(njs_string_from_char_code, 1, 0), .writable = 1, .configurable = 1, }, @@ -553,7 +551,7 @@ static const njs_object_prop_t njs_stri { .type = NJS_PROPERTY, .name = njs_string("fromCodePoint"), - .value = njs_native_function(njs_string_from_code_point, 1), + .value = njs_native_function2(njs_string_from_char_code, 1, 1), .writable = 1, .configurable = 1, }, @@ -1029,13 +1027,14 @@ static njs_int_t njs_string_prototype_to_bytes(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused) { - u_char *p; - size_t length; - uint32_t byte; - njs_int_t ret; - const u_char *s, *end; - njs_slice_prop_t slice; - njs_string_prop_t string; + u_char *p; + size_t length; + uint32_t byte; + njs_int_t ret; + const u_char *s, *end; + njs_slice_prop_t slice; + njs_string_prop_t string; + njs_unicode_decode_t ctx; ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0)); if (njs_slow_path(ret != NJS_OK)) { @@ -1064,8 +1063,10 @@ njs_string_prototype_to_bytes(njs_vm_t * length = slice.length; + njs_utf8_decode_init(&ctx); + while (length != 0 && s < end) { - byte = njs_utf8_decode(&s, end); + byte = njs_utf8_decode(&ctx, &s, end); if (njs_slow_path(byte > 0xFF)) { njs_release(vm, &vm->retval); @@ -1463,13 +1464,14 @@ static njs_int_t njs_string_prototype_char_code_at(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused) { - double num; - size_t length; - int64_t index; - uint32_t code; - njs_int_t ret; - const u_char *start, *end; - njs_string_prop_t string; + double num; + size_t length; + int64_t index; + uint32_t code; + njs_int_t ret; + const u_char *start, *end; + njs_string_prop_t string; + njs_unicode_decode_t ctx; ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0)); if 
(njs_slow_path(ret != NJS_OK)) { @@ -1493,10 +1495,12 @@ njs_string_prototype_char_code_at(njs_vm code = string.start[index]; } else { + njs_utf8_decode_init(&ctx); + /* UTF-8 string. */ end = string.start + string.size; start = njs_string_offset(string.start, end, index); - code = njs_utf8_decode(&start, end); + code = njs_utf8_decode(&ctx, &start, end); } num = code; @@ -1829,14 +1833,27 @@ njs_decode_base64_core(njs_vm_t *vm, njs static njs_int_t -njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args, - njs_uint_t nargs, njs_index_t unused) +njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, + njs_index_t is_point) { - u_char *p; - size_t size; - uint16_t code; - njs_int_t ret; - njs_uint_t i; + double num; + u_char *p, *start, *end; + ssize_t len; + int32_t code; + uint32_t cp; + uint64_t length, size; + njs_int_t ret; + njs_uint_t i; + njs_unicode_decode_t ctx; + u_char buf[4]; + + size = 0; + length = 0; + + cp = 0x00; + end = buf + sizeof(buf); + + njs_utf16_decode_init(&ctx); for (i = 1; i < nargs; i++) { if (!njs_is_numeric(&args[i])) { @@ -1845,73 +1862,76 @@ njs_string_from_char_code(njs_vm_t *vm, return ret; } } + + if (is_point) { + num = njs_number(&args[i]); + if (isnan(num)) { + goto range_error; + } + + code = num; + + if (code != num || code < 0 || code > 0x10FFFF) { + goto range_error; + } + + } else { + code = njs_number_to_uint16(njs_number(&args[i])); + } + + start = buf; + len = njs_utf16_encode(code, &start, end); + + start = buf; + cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + len); + + if (cp > NJS_UNICODE_MAX_CODEPOINT) { + if (cp == NJS_UNICODE_CONTINUE) { + continue; + } + + cp = NJS_UNICODE_REPLACEMENT; + } + + size += njs_utf8_size(cp); + length++; } - size = 0; - - for (i = 1; i < nargs; i++) { - code = njs_number_to_uint16(njs_number(&args[i])); - size += njs_utf8_size_uint16(code); + if (cp == NJS_UNICODE_CONTINUE) { + size += njs_utf8_size(NJS_UNICODE_REPLACEMENT); + length++; } - p = 
njs_string_alloc(vm, &vm->retval, size, nargs - 1); + p = njs_string_alloc(vm, &vm->retval, size, length); if (njs_slow_path(p == NULL)) { return NJS_ERROR; } - for (i = 1; i < nargs; i++) { - code = njs_number_to_uint16(njs_number(&args[i])); - p = njs_utf8_encode(p, code); - } - - return NJS_OK; -} - - -static njs_int_t -njs_string_from_code_point(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, - njs_index_t unused) -{ - u_char *p; - double num; - size_t size; - int32_t code; - njs_int_t ret; - njs_uint_t i; + njs_utf16_decode_init(&ctx); for (i = 1; i < nargs; i++) { - if (!njs_is_numeric(&args[i])) { - ret = njs_value_to_numeric(vm, &args[i], &args[i]); - if (ret != NJS_OK) { - return ret; - } - } - } - - size = 0; - - for (i = 1; i < nargs; i++) { - num = njs_number(&args[i]); - if (isnan(num)) { - goto range_error; + if (is_point) { + code = njs_number(&args[i]); + + } else { + code = njs_number_to_uint16(njs_number(&args[i])); } - code = num; - - if (code != num || code < 0 || code >= 0x110000) { - goto range_error; + start = buf; + len = njs_utf16_encode(code, &start, end); + + start = buf; + cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + len); + + if (cp > NJS_UNICODE_MAX_CODEPOINT) { + if (cp == NJS_UNICODE_CONTINUE && i + 1 != nargs) { + continue; + } + + cp = NJS_UNICODE_REPLACEMENT; } - size += njs_utf8_size(code); - } - - p = njs_string_alloc(vm, &vm->retval, size, nargs - 1); - if (njs_slow_path(p == NULL)) { - return NJS_ERROR; - } - - for (i = 1; i < nargs; i++) { - p = njs_utf8_encode(p, njs_number(&args[i])); + p = njs_utf8_encode(p, cp); } return NJS_OK; @@ -2591,11 +2611,12 @@ static njs_int_t njs_string_prototype_trim(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t mode) { - uint32_t u, trim, length; - njs_int_t ret; - njs_value_t *value; - const u_char *p, *prev, *start, *end; - njs_string_prop_t string; + uint32_t u, trim, length; + njs_int_t ret; + njs_value_t *value; + const u_char *p, *prev, *start, *end; 
+ njs_string_prop_t string; + njs_unicode_decode_t ctx; value = njs_argument(args, 0); ret = njs_string_object_validate(vm, value); @@ -2651,13 +2672,15 @@ njs_string_prototype_trim(njs_vm_t *vm, /* UTF-8 string. */ if (mode & NJS_TRIM_START) { + njs_utf8_decode_init(&ctx); + for ( ;; ) { if (start == end) { goto empty; } p = start; - u = njs_utf8_decode(&start, end); + u = njs_utf8_decode(&ctx, &start, end); if (njs_utf8_is_whitespace(u)) { trim++; @@ -2672,6 +2695,8 @@ njs_string_prototype_trim(njs_vm_t *vm, if (mode & NJS_TRIM_END) { prev = end; + njs_utf8_decode_init(&ctx); + for ( ;; ) { if (start == prev) { goto empty; @@ -2679,7 +2704,7 @@ njs_string_prototype_trim(njs_vm_t *vm, prev = njs_utf8_prev(prev); p = prev; - u = njs_utf8_decode(&p, end); + u = njs_utf8_decode(&ctx, &p, end); if (njs_utf8_is_whitespace(u)) { trim++; @@ -3640,11 +3665,12 @@ njs_string_prototype_replace(njs_vm_t *v double njs_string_to_number(const njs_value_t *value, njs_bool_t parse_float) { - double num; - size_t size; - uint32_t u; - njs_bool_t minus; - const u_char *p, *start, *end; + double num; + size_t size; + uint32_t u; + njs_bool_t minus; + const u_char *p, *start, *end; + njs_unicode_decode_t ctx; const size_t infinity = njs_length("Infinity"); @@ -3660,9 +3686,11 @@ njs_string_to_number(const njs_value_t * end = p + size; + njs_utf8_decode_init(&ctx); + while (p < end) { start = p; - u = njs_utf8_decode(&p, end); + u = njs_utf8_decode(&ctx, &p, end); if (!njs_utf8_is_whitespace(u)) { p = start; @@ -4179,15 +4207,16 @@ njs_int_t njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t component) { - u_char byte, *dst; - uint64_t size; - uint32_t cp, cp_low; - njs_int_t ret; - njs_value_t *value; - const u_char *src, *end; - const uint32_t *escape; - njs_string_prop_t string; - u_char encode[4]; + u_char byte, *dst; + uint64_t size; + uint32_t cp, cp_low; + njs_int_t ret; + njs_value_t *value; + const u_char *src, *end; + const uint32_t *escape; 
+ njs_string_prop_t string; + njs_unicode_decode_t ctx; + u_char encode[4]; static const uint32_t escape_uri[] = { 0xffffffff, /* 1111 1111 1111 1111 1111 1111 1111 1111 */ @@ -4257,8 +4286,10 @@ njs_string_encode_uri(njs_vm_t *vm, njs_ } else { /* UTF-8 string. */ + njs_utf8_decode_init(&ctx); + while (src < end) { - cp = njs_utf8_decode(&src, end); + cp = njs_utf8_decode(&ctx, &src, end); if (cp < 0x80 && !njs_need_escape(escape, cp)) { size++; @@ -4271,7 +4302,7 @@ njs_string_encode_uri(njs_vm_t *vm, njs_ } if (njs_surrogate_leading(cp)) { - cp_low = njs_utf8_decode(&src, end); + cp_low = njs_utf8_decode(&ctx, &src, end); if (njs_slow_path(!njs_surrogate_trailing(cp_low))) { goto uri_error; @@ -4310,11 +4341,13 @@ njs_string_encode_uri(njs_vm_t *vm, njs_ /* UTF-8 string. */ + njs_utf8_decode_init(&ctx); + while (src < end) { - cp = njs_utf8_decode(&src, end); + cp = njs_utf8_decode(&ctx, &src, end); if (njs_slow_path(njs_surrogate_leading(cp))) { - cp_low = njs_utf8_decode(&src, end); + cp_low = njs_utf8_decode(&ctx, &src, end); cp = njs_string_surrogate_pair(cp, cp_low); } @@ -4337,11 +4370,14 @@ njs_inline uint32_t njs_string_decode_uri_cp(const int8_t *hex, const u_char **start, const u_char *end, njs_bool_t expect_percent) { - int8_t d0, d1; - uint32_t cp; - const u_char *p; - - cp = njs_utf8_decode(start, end); + int8_t d0, d1; + uint32_t cp; + const u_char *p; + njs_unicode_decode_t ctx; + + njs_utf8_decode_init(&ctx); + + cp = njs_utf8_decode(&ctx, start, end); if (njs_fast_path(cp != '%')) { return expect_percent ? 
0xFFFFFFFF: cp; } @@ -4378,18 +4414,19 @@ njs_int_t njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t component) { - u_char *dst; - int64_t size, length; - uint32_t cp; - njs_int_t ret; - njs_chb_t chain; - njs_uint_t i, n; - njs_bool_t percent; - njs_value_t *value; - const u_char *src, *p, *end; - const uint32_t *reserve; - njs_string_prop_t string; - u_char encode[4]; + u_char *dst; + int64_t size, length; + uint32_t cp; + njs_int_t ret; + njs_chb_t chain; + njs_uint_t i, n; + njs_bool_t percent; + njs_value_t *value; + const u_char *src, *p, *end; + const uint32_t *reserve; + njs_string_prop_t string; + njs_unicode_decode_t ctx; + u_char encode[4]; static const uint32_t reserve_uri[] = { 0x00000000, /* 0000 0000 0000 0000 0000 0000 0000 0000 */ @@ -4472,6 +4509,8 @@ njs_string_decode_uri(njs_vm_t *vm, njs_ njs_chb_init(&chain, vm->mem_pool); + njs_utf8_decode_init(&ctx); + while (src < end) { percent = (src[0] == '%'); cp = njs_string_decode_uri_cp(hex, &src, end, 0); @@ -4529,8 +4568,8 @@ njs_string_decode_uri(njs_vm_t *vm, njs_ } p = encode; - cp = njs_utf8_decode(&p, p + n); - if (njs_slow_path(cp == 0xFFFFFFFF)) { + cp = njs_utf8_decode(&ctx, &p, p + n); + if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) { goto uri_error; } diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_unicode.h --- a/src/njs_unicode.h Wed Jul 15 19:19:18 2020 +0300 +++ b/src/njs_unicode.h Wed Jul 15 19:19:19 2020 +0300 @@ -9,6 +9,7 @@ enum { + NJS_UNICODE_REPLACEMENT = 0xFFFD, NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF, NJS_UNICODE_ERROR = 0x1FFFFF, NJS_UNICODE_CONTINUE = 0x2FFFFF @@ -16,6 +17,9 @@ enum { typedef struct { uint32_t codepoint; + + unsigned need; + u_char lower; u_char upper; } njs_unicode_decode_t; diff -r 63106bd2e9bf -r 855edd76bdb6 src/njs_utf8.c --- a/src/njs_utf8.c Wed Jul 15 19:19:18 2020 +0300 +++ b/src/njs_utf8.c Wed Jul 15 19:19:19 2020 +0300 @@ -56,211 +56,166 @@ njs_utf8_encode(u_char *p, uint32_t u) } -/* - * njs_utf8_decode() 
decodes UTF-8 sequences and returns a valid - * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong - * UTF-8 sequence. - */ +njs_inline njs_int_t +njs_utf8_boundary(njs_unicode_decode_t *ctx, const u_char **data, + unsigned *need, u_char lower, u_char upper) +{ + u_char ch; -uint32_t -njs_utf8_decode(const u_char **start, const u_char *end) -{ - uint32_t u; + ch = **data; - u = (uint32_t) **start; - - if (u < 0x80) { - (*start)++; - return u; + if (ch < lower || ch > upper) { + return NJS_ERROR; } - return njs_utf8_decode2(start, end); + (*data)++; + (*need)--; + ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F); + + return NJS_OK; } -/* - * njs_utf8_decode2() decodes two and more bytes UTF-8 sequences only - * and returns a valid character 0x80 - 0x10FFFF, OR 0xFFFFFFFF for - * invalid or overlong UTF-8 sequence. - */ - -uint32_t -njs_utf8_decode2(const u_char **start, const u_char *end) +njs_inline void +njs_utf8_boundary_set(njs_unicode_decode_t *ctx, const u_char ch, + u_char first, u_char second, u_char lower, u_char upper) { - u_char c; - size_t n; - uint32_t u, overlong; - const u_char *p; - - p = *start; - u = (uint32_t) *p; - - if (u >= 0xE0) { - - if (u >= 0xF0) { - - if (njs_slow_path(u > 0xF4)) { - /* - * The maximum valid Unicode character is 0x10FFFF - * which is encoded as 0xF4 0x8F 0xBF 0xBF. - */ - return 0xFFFFFFFF; - } - - u &= 0x07; - overlong = 0x00FFFF; - n = 3; - - } else { - u &= 0x0F; - overlong = 0x07FF; - n = 2; - } + if (ch == first) { + ctx->lower = lower; + ctx->upper = 0xBF; - } else if (u >= 0xC2) { - - /* 0x80 is encoded as 0xC2 0x80. */ - - u &= 0x1F; - overlong = 0x007F; - n = 1; - - } else { - /* u <= 0xC2 */ - return 0xFFFFFFFF; + } else if (ch == second) { + ctx->lower = 0x80; + ctx->upper = upper; } - - p++; - - if (njs_fast_path(p + n <= end)) { - - do { - c = *p++; - /* - * The byte must in the 0x80 - 0xBF range. - * Values below 0x80 become >= 0x80. 
- */ - c = c - 0x80; - - if (njs_slow_path(c > 0x3F)) { - return 0xFFFFFFFF; - } - - u = (u << 6) | c; - n--; - - } while (n != 0); - - if (overlong < u && u < 0x110000) { - *start = p; - return u; - } - } - - return 0xFFFFFFFF; } uint32_t -njs_utf8_safe_decode(const u_char **start, const u_char *end) -{ - uint32_t u; - - u = (uint32_t) **start; - - if (u < 0x80) { - (*start)++; - return u; - } - - return njs_utf8_safe_decode2(start, end); -} - - -uint32_t -njs_utf8_safe_decode2(const u_char **start, const u_char *end) +njs_utf8_decode(njs_unicode_decode_t *ctx, const u_char **start, + const u_char *end) { u_char c; - size_t n; - uint32_t u, overlong; + unsigned need; + njs_int_t ret; const u_char *p; - p = *start; - u = (uint32_t) *p; - - if (u >= 0xE0) { - - if (u >= 0xF0) { + if (ctx->need != 0) { + need = ctx->need; + ctx->need = 0; - if (njs_slow_path(u > 0xF4)) { - /* - * The maximum valid Unicode character is 0x10FFFF - * which is encoded as 0xF4 0x8F 0xBF 0xBF. - */ - goto fail_one; + if (ctx->lower != 0x00) { + ret = njs_utf8_boundary(ctx, start, &need, ctx->lower, ctx->upper); + if (njs_slow_path(ret != NJS_OK)) { + goto failed; } - u &= 0x07; - overlong = 0x00FFFF; - n = 3; + ctx->lower = 0x00; + } + + goto decode; + } + + c = *(*start)++; + + if (c < 0x80) { + return c; - } else { - u &= 0x0F; - overlong = 0x07FF; - n = 2; + } else if (c <= 0xDF) { + if (c < 0xC2) { + return NJS_UNICODE_ERROR; + } + + need = 1; + ctx->codepoint = c & 0x1F; + + } else if (c < 0xF0) { + need = 2; + ctx->codepoint = c & 0x0F; + + if (*start == end) { + njs_utf8_boundary_set(ctx, c, 0xE0, 0xED, 0xA0, 0x9F); + goto next; } - } else if (u >= 0xC2) { + ret = NJS_OK; + + if (c == 0xE0) { + ret = njs_utf8_boundary(ctx, start, &need, 0xA0, 0xBF); - /* 0x80 is encoded as 0xC2 0x80. 
*/ + } else if (c == 0xED) { + ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x9F); + } + + if (njs_slow_path(ret != NJS_OK)) { + goto failed; + } + + } else if (c < 0xF5) { + need = 3; + ctx->codepoint = c & 0x07; - u &= 0x1F; - overlong = 0x007F; - n = 1; + if (*start == end) { + njs_utf8_boundary_set(ctx, c, 0xF0, 0xF4, 0x90, 0x8F); + goto next; + } + + ret = NJS_OK; + + if (c == 0xF0) { + ret = njs_utf8_boundary(ctx, start, &need, 0x90, 0xBF); + + } else if (c == 0xF4) { + ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x8F); + } + + if (njs_slow_path(ret != NJS_OK)) { + goto failed; + } } else { - /* u <= 0xC2 */ - goto fail_one; + return NJS_UNICODE_ERROR; } - p++; +decode: + + for (p = *start; p < end; p++) { + c = *p; - while (p < end && n != 0) { - c = *p++; - /* - * The byte must in the 0x80 - 0xBF range. - * Values below 0x80 become >= 0x80. - */ - c = c - 0x80; + if (c < 0x80 || c > 0xBF) { + *start = p; - if (njs_slow_path(c > 0x3F)) { - *start = --p; - return NJS_UTF8_REPLACEMENT; + goto failed; } - u = (u << 6) | c; - n--; + ctx->codepoint = (ctx->codepoint << 6) | (c & 0x3F); + + if (--need == 0) { + *start = p + 1; + + return ctx->codepoint; + } } *start = p; - if (n == 0 && overlong < u && u < 0x110000) { - return u; - } +next: - return NJS_UTF8_REPLACEMENT; + ctx->need = need; _______________________________________________ nginx-devel mailing list nginx-devel@nginx.org http://mailman.nginx.org/mailman/listinfo/nginx-devel