[hackers] [libgrapheme] Rewrite grapheme_next_character_break() and add size-parameter || Laslo Hunhold

git Sat, 18 Dec 2021 15:56:17 -0800

commit f8e8649a4fd88e61f9473400f44b9b1c5fce9e7c
Author:     Laslo Hunhold <[email protected]>
AuthorDate: Sun Dec 19 00:52:23 2021 +0100
Commit:     Laslo Hunhold <[email protected]>
CommitDate: Sun Dec 19 00:52:23 2021 +0100


    Rewrite grapheme_next_character_break() and add size-parameter
    
    You will not always have a NUL-terminated string to look at; sometimes
    you only have a length-bounded "raw" array in memory. Comparable to how
    we already do it in grapheme_decode_utf8() to handle NUL-terminated
    strings, we add a len-parameter to grapheme_next_character_break()
    that can be set to SIZE_MAX to indicate that the string doesn't have
    a known bound but is instead NUL-terminated. Otherwise, if len is
    not SIZE_MAX, we have a proper bound.
    
    It was planned anyway, but this was a good point to rewrite the function
    to make it more readable and simplify it. There was especially no reason
    to call grapheme_decode_utf8() more than once.
    
    This will bring 99% feature-parity with what most people do with
    ICU without all the unnecessary cruft, boiler-plate and incantations
    you need with ICU.
    
    Signed-off-by: Laslo Hunhold <[email protected]>

diff --git a/grapheme.h b/grapheme.h
index ea8a02d..c2def7c 100644
--- a/grapheme.h
+++ b/grapheme.h
@@ -19,7 +19,7 @@ typedef struct grapheme_internal_segmentation_state {
 
 #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
 
-size_t grapheme_next_character_break(const char *);
+size_t grapheme_next_character_break(const char *, size_t);
 
 bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *);
 
diff --git a/man/grapheme_next_character_break.3 b/man/grapheme_next_character_break.3
index 1e96383..962b2ce 100644
--- a/man/grapheme_next_character_break.3
+++ b/man/grapheme_next_character_break.3
@@ -7,19 +7,30 @@
 .Sh SYNOPSIS
 .In grapheme.h
 .Ft size_t
-.Fn grapheme_next_character_break "const char *str"
+.Fn grapheme_next_character_break "const char *str" "size_t len"
 .Sh DESCRIPTION
 The
 .Fn grapheme_next_character_break
 function computes the offset (in bytes) to the next grapheme
 cluster break (see
 .Xr libgrapheme 7 )
-in the UTF-8-encoded NUL-terminated string
-.Va str .
+in the UTF-8-encoded string
+.Va str
+of length
+.Va len .
 If a grapheme cluster begins at
 .Va str
 this offset is equal to the length of said grapheme cluster.
 .Pp
+If
+.Va len
+is set to
+.Dv SIZE_MAX
+(stdint.h is already included by grapheme.h) the string
+.Va str
+is interpreted to be NUL-terminated and processing stops when a
+NUL-byte is encountered.
+.Pp
 For non-UTF-8 input data
 .Xr grapheme_is_character_break 3
 can be used instead.
@@ -48,15 +59,24 @@ main(void)
                  "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
                  "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
                  "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
-       size_t len;
+       size_t ret, len, off;
 
        printf("Input: \\"%s\\"\\n", s);
 
        /* print each grapheme cluster with byte-length */
-       for (; *s != '\\0';) {
-               len = grapheme_next_character_break(s);
-               printf("%2zu bytes | %.*s\\n", len, (int)len, s, len);
-               s += len;
+       printf("Grapheme clusters in NUL-delimited input:\\n");
+       for (off = 0; s[off] != '\\0'; off += ret) {
+               ret = grapheme_next_character_break(s + off, SIZE_MAX);
+               printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
+       }
+       printf("\\n");
+
+       /* do the same, but this time string is length-delimited */
+       len = 17;
+       printf("Grapheme clusters in input delimited to %zu bytes:\\n", len);
+       for (off = 0; off < len; off += ret) {
+               ret = grapheme_next_character_break(s + off, len - off);
+               printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
        }
 
        return 0;
diff --git a/src/character.c b/src/character.c
index 8f1143f..2215543 100644
--- a/src/character.c
+++ b/src/character.c
@@ -179,50 +179,41 @@ hasbreak:
 }
 
 size_t
-grapheme_next_character_break(const char *str)
+grapheme_next_character_break(const char *str, size_t len)
 {
-       uint_least32_t cp0, cp1;
-       size_t ret, len = 0;
        GRAPHEME_STATE state = { 0 };
+       uint_least32_t cp0 = 0, cp1 = 0;
+       size_t off, ret;
 
-       if (str == NULL) {
+       if (str == NULL || len == 0) {
                return 0;
        }
 
-       /*
-        * grapheme_decode_utf8, when it encounters an unexpected byte,
-        * does not count it to the error and instead assumes that the
-        * unexpected byte is the beginning of a new sequence.
-        * This way, when the string ends with a null byte, we never
-        * miss it, even if the previous UTF-8 sequence terminates
-        * unexpectedly, as it would either act as an unexpected byte,
-        * saved for later, or as a null byte itself, that we can catch.
-        * We pass SIZE_MAX to the length, as we will never read beyond
-        * the null byte for the reasons given above.
-        */
-
-       /* get first codepoint */
-       len += grapheme_decode_utf8(str, SIZE_MAX, &cp0);
-       if (cp0 == GRAPHEME_INVALID_CODEPOINT) {
-               return len;
-       }
+       for (off = 0; (len == SIZE_MAX) || off < len; off += ret) {
+               cp0 = cp1;
+               ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ?
+                                          SIZE_MAX : len - off, &cp1);
 
-       while (cp0 != 0) {
-               /* get next codepoint */
-               ret = grapheme_decode_utf8(str + len, SIZE_MAX, &cp1);
+               if (len != SIZE_MAX && ret > (len - off)) {
+                       /* string ended abruptly, simply accept cropping */
+                       ret = len - off;
+               }
 
-               if (cp1 == GRAPHEME_INVALID_CODEPOINT ||
-                   grapheme_is_character_break(cp0, cp1, &state)) {
-                       /* we read an invalid cp or have a breakpoint */
+               if (len == SIZE_MAX && cp1 == 0) {
+                       /* we hit a NUL-byte and are done */
                        break;
-               } else {
-                       /* we don't have a breakpoint, continue */
-                       len += ret;
                }
 
-               /* prepare next round */
-               cp0 = cp1;
+               if (off == 0) {
+                       /*
+                        * we skip the first round, as we need both
+                        * cp0 and cp1 to be initialized
+                        */
+                       continue;
+               } else if (grapheme_is_character_break(cp0, cp1, &state)) {
+                       break;
+               }
        }
 
-       return len;
+       return off;
 }

[hackers] [libgrapheme] Rewrite grapheme_next_character_break() and add size-parameter || Laslo Hunhold

Reply via email to