I missed something, that I will fix later, but there are three options of what to do:
grapheme_len() assumes cp_decode() returns 0 at the end of the string, whereas this change will return 1 (it is counter-intuitive that an UTF-8 decode would say that the NUL character is 0 bytes longs as it is indeed a character, and one which you may want to support (I did when I rewrote the decoder for another project)). grapheme_len.3 is sparse on details, but in the example it checks for the NUL byte before calling grapheme_len() rather than checking if grapheme_len() returns 0 (started at the NUL byte). So option 1 is to make cp_decode return 0 on NUL. I really don't like this option as it eliminates the support for the NUL character, and you may want to return NUL characters without any special handling. And it doesn't simply anything for the user even if he wanted to and at NUL. Option 2 as option 1 but also change the man page check if grapheme_len() returns 0. This is a little butter as it would document this feature. Option 3 is change grapheme_len() to support the NUL character. I think this is the best option as it would add support NUL character without the user doing anything special, and it barely complicates things. The change needed in grapheme_len() would be to compare `cp0` instead of `ret` against 0, after running `len += ret`, in the first call to `cp_decode`. This handling of NUL in grapheme_len() would still be needed to ensure that it does not read outside the string. Regards, Mattias Andrée On Wed, 27 May 2020 15:22:35 +0200 Mattias Andrée <[email protected]> wrote: > No special cases for NUL and ASCII. > > Simplified check of overencoded codepoints: > lowest non-overencoded codepoints are stored in the lookup table > > Do not accept surrogate code points. > From RFC 3629: > The definition of UTF-8 prohibits encoding character > numbers between U+D800 and U+DFFF, [...] > > Do not accept plane 17 and above: > From RFC 3629: > Restricted the range of characters to 0000-10FFFF > (the UTF-16 accessible range). > --- > src/codepoint.c | 73 +++++++++++++++---------------------------------- > 1 file changed, 22 insertions(+), 51 deletions(-) > > diff --git a/src/codepoint.c b/src/codepoint.c > index 0b63184..6cb0d4f 100644 > --- a/src/codepoint.c > +++ b/src/codepoint.c > @@ -8,73 +8,44 @@ > size_t > cp_decode(const uint8_t *str, Codepoint *p) > { > - size_t off, j, k, l; > + size_t rank, i, len; > > struct { > uint8_t lower; > uint8_t upper; > uint8_t mask; > - uint8_t bits; > + Codepoint lowest; > } lookup[] = { > - { 0x00, 0x7F, 0xFF, 7 }, /* 00000000 - 01111111, 01111111 */ > - { 0xC0, 0xDF, 0x1F, 11 }, /* 11000000 - 11011111, 00011111 */ > - { 0xE0, 0xEF, 0x0F, 16 }, /* 11100000 - 11101111, 00001111 */ > - { 0xF0, 0xF7, 0x07, 21 }, /* 11110000 - 11110111, 00000111 */ > - { 0xF8, 0xFB, 0x03, 26 }, /* 11111000 - 11111011, 00000011 */ > - { 0xFC, 0xFD, 0x01, 31 }, /* 11111100 - 11111101, 00000001 */ > + { 0x00, 0x7F, 0xFF, UINT32_C(0x000000) }, > + { 0xC0, 0xDF, 0x1F, UINT32_C(0x000080) }, > + { 0xE0, 0xEF, 0x0F, UINT32_C(0x000800) }, > + { 0xF0, 0xF7, 0x07, UINT32_C(0x010000) } > }; > > - /* empty string */ > - if (str[0] == '\0') { > - *p = 0; > - return 0; > - } > - > - /* find out in which ranges str[0] is */ > - for (off = 0; off < LEN(lookup); off++) { > - if (BETWEEN(str[0], lookup[off].lower, lookup[off].upper)) { > - *p = str[0] & lookup[off].mask; > + for (rank = 0; rank < LEN(lookup); rank++) > + if (BETWEEN(str[0], lookup[rank].lower, lookup[rank].upper)) > break; > - } > - } > - if (off == 0) { > - /* ASCII */ > - return 1; > - } else if (off == LEN(lookup)) { > - /* not in ranges */ > + if (rank == LEN(lookup)) { > + /* Out of range */ > *p = CP_INVALID; > return 1; > } > > - /* off denotes the number of upcoming expected code units */ > - for (j = 0; j < off; j++) { > - if (str[j] == '\0') { > - *p = CP_INVALID; > - return j; > - } > - if ((str[1 + j] & 0xC0) != 0x80) { > + *p = (Codepoint)(str[0] & lookup[rank].mask); > + len = rank + 1; > + for (i = 1; i < len; i++) { > + if ((str[i] & 0xC0) != 0x80) { > + /* Not continuation of character */ > *p = CP_INVALID; > - return 1 + j; > + return 1; > } > - *p <<= 6; > - *p |= str[1 + j] & 0x3F; /* 00111111 */ > + *p = (*p << 6) | (str[i] & 0x3F); > } > > - if (*p == 0) { > - if (off != 0) { > - /* overencoded NUL */ > - *p = CP_INVALID; > - } > - } else { > - /* determine effective bytes */ > - for (k = 0; ((*p << k) & (1 << 31)) == 0; k++) > - ; > - for (l = 0; l < off; l++) { > - if ((32 - k) <= lookup[l].bits) { > - *p = CP_INVALID; > - } > - } > + if (*p < lookup[rank].lowest || BETWEEN(*p, 0xD800, 0xDFFF) || *p > > UINT32_C(0x10FFFF)) { > + /* Overencoded, surrogate, or out of range */ > + *p = CP_INVALID; > + return 1; > } > - > - return 1 + j; > + return len; > }
