Changeset: f0e19e88af26 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f0e19e88af26
Modified Files:
	gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Count utf-8 chars correctly

diffs (62 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -14,9 +14,31 @@
  * contains.
  */
 static size_t
-GDKstrimp_strlen(const char *s)
+GDKstrimp_strlen(const uint8_t *s)
 {
-	return strlen(s);
+	size_t ret = 0;
+	size_t i;
+	int m,n;
+	uint8_t c;
+
+	i = 0;
+	while((c = *(s + i)) != 0) {
+		if (c < 0x80)
+			i++;
+		else {
+			for (n = 0, m=0x40; c & m; n++, m >>= 1)
+				;
+			/* n is now the number of 10xxxxxx bytes that should
+			   follow. */
+			if (n == 0 || n >= 4)
+				/* TODO: handle invalid utf-8 */
+				{}
+			i += n+1;
+		}
+		ret++;
+	}
+
+	return ret;
 }
 
 /* Given a BAT return the number of digrams in it. The observation is
@@ -33,7 +55,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
 	// lng t0;
 	BUN i;
 	BATiter bi;
-	char *s;
+	uint8_t *s;
 
 	// GDKtracer_set_component_level("ALGO", "DEBUG");
 	// struct canditer ci;
@@ -44,12 +66,13 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
 	bi = bat_iterator(b);
 	*n = 0;
 	for (i = 0; i < b->batCount; i++) {
-		s = (char *)BUNtail(bi, i);
+		s = (uint8_t *)BUNtail(bi, i);
 		*n += GDKstrimp_strlen(s) - 1;
-		// TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, (char *)BUNtail(bi, i));
+		// TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s);
 	}
 
 	// TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0);
+	// GDKtracer_flush_buffer();
 
 	return GDK_SUCCEED;
 }
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list