No special cases for NUL and ASCII.

Simplify the check for overencoded code points: the lowest code point
that is not overencoded for each sequence length is stored in the
lookup table.

Do not accept surrogate code points.
From RFC 3629:
        The definition of UTF-8 prohibits encoding character
        numbers between U+D800 and U+DFFF, [...]

Do not accept plane 17 and above:
From RFC 3629:
        Restricted the range of characters to 0000-10FFFF
        (the UTF-16 accessible range).
---
 src/codepoint.c | 73 +++++++++++++++----------------------------------
 1 file changed, 22 insertions(+), 51 deletions(-)

diff --git a/src/codepoint.c b/src/codepoint.c
index 0b63184..6cb0d4f 100644
--- a/src/codepoint.c
+++ b/src/codepoint.c
@@ -8,73 +8,44 @@
 size_t
 cp_decode(const uint8_t *str, Codepoint *p)
 {
-       size_t off, j, k, l;
+       size_t rank, i, len;
 
        struct {
                uint8_t lower;
                uint8_t upper;
                uint8_t mask;
-               uint8_t bits;
+               Codepoint lowest;
        } lookup[] = {
-               { 0x00, 0x7F, 0xFF,  7 }, /* 00000000 - 01111111, 01111111 */
-               { 0xC0, 0xDF, 0x1F, 11 }, /* 11000000 - 11011111, 00011111 */
-               { 0xE0, 0xEF, 0x0F, 16 }, /* 11100000 - 11101111, 00001111 */
-               { 0xF0, 0xF7, 0x07, 21 }, /* 11110000 - 11110111, 00000111 */
-               { 0xF8, 0xFB, 0x03, 26 }, /* 11111000 - 11111011, 00000011 */
-               { 0xFC, 0xFD, 0x01, 31 }, /* 11111100 - 11111101, 00000001 */
+               { 0x00, 0x7F, 0xFF, UINT32_C(0x000000) },
+               { 0xC0, 0xDF, 0x1F, UINT32_C(0x000080) },
+               { 0xE0, 0xEF, 0x0F, UINT32_C(0x000800) },
+               { 0xF0, 0xF7, 0x07, UINT32_C(0x010000) }
        };
 
-       /* empty string */
-       if (str[0] == '\0') {
-               *p = 0;
-               return 0;
-       }
-
-       /* find out in which ranges str[0] is */
-       for (off = 0; off < LEN(lookup); off++) {
-               if (BETWEEN(str[0], lookup[off].lower, lookup[off].upper)) {
-                       *p = str[0] & lookup[off].mask;
+       for (rank = 0; rank < LEN(lookup); rank++)
+               if (BETWEEN(str[0], lookup[rank].lower, lookup[rank].upper))
                        break;
-               }
-       }
-       if (off == 0) {
-               /* ASCII */
-               return 1;
-       } else if (off == LEN(lookup)) {
-               /* not in ranges */
+       if (rank == LEN(lookup)) {
+               /* Out of range */
                *p = CP_INVALID;
                return 1;
        }
 
-       /* off denotes the number of upcoming expected code units */
-       for (j = 0; j < off; j++) {
-               if (str[j] == '\0') {
-                       *p = CP_INVALID;
-                       return j;
-               }
-               if ((str[1 + j] & 0xC0) != 0x80) {
+       *p = (Codepoint)(str[0] & lookup[rank].mask);
+       len = rank + 1;
+       for (i = 1; i < len; i++) {
+               if ((str[i] & 0xC0) != 0x80) {
+                       /* Not continuation of character */
                        *p = CP_INVALID;
-                       return 1 + j;
+                       return 1;
                }
-               *p <<= 6;
-               *p |= str[1 + j] & 0x3F; /* 00111111 */
+               *p = (*p << 6) | (str[i] & 0x3F);
        }
 
-       if (*p == 0) {
-               if (off != 0) {
-                       /* overencoded NUL */
-                       *p = CP_INVALID;
-               }
-       } else {
-               /* determine effective bytes */
-               for (k = 0; ((*p << k) & (1 << 31)) == 0; k++)
-                       ;
-               for (l = 0; l < off; l++) {
-                       if ((32 - k) <= lookup[l].bits) {
-                               *p = CP_INVALID;
-                       }
-               }
+       if (*p < lookup[rank].lowest || BETWEEN(*p, 0xD800, 0xDFFF) || *p > UINT32_C(0x10FFFF)) {
+               /* Overencoded, surrogate, or out of range */
+               *p = CP_INVALID;
+               return 1;
        }
-
-       return 1 + j;
+       return len;
 }
-- 
2.26.2


Reply via email to