Changeset: 450f65bcb3e5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/450f65bcb3e5
Modified Files:
	clients/odbc/driver/CMakeLists.txt
	clients/odbc/driver/ODBCUtil.c
	common/stream/CMakeLists.txt
	common/stream/stdio_stream.c
	common/stream/winio.c
	common/utils/CMakeLists.txt
	common/utils/mutils.c
Branch: default
Log Message:
Use utf-8 decode function in more places. diffs (truncated from 430 to 300 lines): diff --git a/clients/odbc/driver/CMakeLists.txt b/clients/odbc/driver/CMakeLists.txt --- a/clients/odbc/driver/CMakeLists.txt +++ b/clients/odbc/driver/CMakeLists.txt @@ -133,6 +133,7 @@ target_link_libraries(MonetODBC monetdb_config_header mutils mapi + mutf8 ${ODBCINST_LIBRARIES}) install(TARGETS diff --git a/clients/odbc/driver/ODBCUtil.c b/clients/odbc/driver/ODBCUtil.c --- a/clients/odbc/driver/ODBCUtil.c +++ b/clients/odbc/driver/ODBCUtil.c @@ -35,6 +35,7 @@ #include "ODBCUtil.h" #include "ODBCDbc.h" #include <float.h> +#include "mutf8.h" #ifdef WIN32 @@ -199,7 +200,6 @@ ODBCutf82wchar(const SQLCHAR *src, { SQLLEN i = 0; SQLINTEGER j = 0; - uint32_t c; if (buf == NULL) buflen = 0; @@ -220,51 +220,25 @@ ODBCutf82wchar(const SQLCHAR *src, else if (length < 0) return "Invalid length parameter"; + uint32_t state = 0, codepoint = 0; while (j < length && i + 1 < buflen && src[j]) { - if ((src[j+0] & 0x80) == 0) { - buf[i++] = src[j+0]; - j += 1; - } else if (j + 1 < length - && (src[j+0] & 0xE0) == 0xC0 - && (src[j+1] & 0xC0) == 0x80 - && (src[j+0] & 0x1E) != 0) { - buf[i++] = (src[j+0] & 0x1F) << 6 - | (src[j+1] & 0x3F); - j += 2; - } else if (j + 2 < length - && (src[j+0] & 0xF0) == 0xE0 - && (src[j+1] & 0xC0) == 0x80 - && (src[j+2] & 0xC0) == 0x80 - && ((src[j+0] & 0x0F) != 0 - || (src[j+1] & 0x20) != 0)) { - buf[i++] = (src[j+0] & 0x0F) << 12 - | (src[j+1] & 0x3F) << 6 - | (src[j+2] & 0x3F); - j += 3; - } else if (j + 3 < length - && (src[j+0] & 0xF8) == 0xF0 - && (src[j+1] & 0xC0) == 0x80 - && (src[j+2] & 0xC0) == 0x80 - && (src[j+3] & 0xC0) == 0x80 - && ((src[j+0] & 0x07) != 0 - || (src[j+1] & 0x30) != 0)) { - c = (src[j+0] & 0x07) << 18 - | (src[j+1] & 0x3F) << 12 - | (src[j+2] & 0x3F) << 6 - | (src[j+3] & 0x3F); - if (c > 0x10FFFF || (c & 0x1FF800) == 0x00D800) - return "Illegal code point"; + switch (decode(&state, &codepoint, (uint8_t) src[j++])) { + case UTF8_ACCEPT: #if 
SIZEOF_SQLWCHAR == 2 - if (i + 2 >= buflen) - break; - buf[i++] = 0xD7C0 + (c >> 10); - buf[i++] = 0xDC00 + (c & 0x03FF); + if (codepoint <= 0xFFFF) { + buf[i++] = (SQLWCHAR) codepoint; + } else { + buf[i++] = (SQLWCHAR) (0xD7C0 + (codepoint >> 10)); + buf[i++] = (SQLWCHAR) (0xDC00 + (codepoint & 0x3FF)); + } #else - buf[i++] = c; + buf[i++] = (SQLWCHAR) codepoint; #endif - j += 4; - } else { + break; + case UTF8_REJECT: return "Illegal code point"; + default: + break; } } if (buflen > 0) @@ -272,40 +246,22 @@ ODBCutf82wchar(const SQLCHAR *src, if (consumed) *consumed = (size_t) j; while (j < length && src[j]) { - i++; - if ((src[j+0] & 0x80) == 0) { - j += 1; - } else if (j + 1 < length - && (src[j+0] & 0xE0) == 0xC0 - && (src[j+1] & 0xC0) == 0x80 - && (src[j+0] & 0x1E) != 0) { - j += 2; - } else if (j + 2 < length - && (src[j+0] & 0xF0) == 0xE0 - && (src[j+1] & 0xC0) == 0x80 - && (src[j+2] & 0xC0) == 0x80 - && ((src[j+0] & 0x0F) != 0 - || (src[j+1] & 0x20) != 0)) { - j += 3; - } else if (j + 3 < length - && (src[j+0] & 0xF8) == 0xF0 - && (src[j+1] & 0xC0) == 0x80 - && (src[j+2] & 0xC0) == 0x80 - && (src[j+3] & 0xC0) == 0x80 - && ((src[j+0] & 0x07) != 0 - || (src[j+1] & 0x30) != 0)) { - c = (src[j+0] & 0x07) << 18 - | (src[j+1] & 0x3F) << 12 - | (src[j+2] & 0x3F) << 6 - | (src[j+3] & 0x3F); - if (c > 0x10FFFF || (c & 0x1FF800) == 0x00D800) - return "Illegal code point"; + switch (decode(&state, &codepoint, (uint8_t) src[j++])) { + case UTF8_ACCEPT: #if SIZEOF_SQLWCHAR == 2 + if (codepoint <= 0xFFFF) { + i++; + } else { + i += 2; + } +#else i++; #endif - j += 4; - } else { + break; + case UTF8_REJECT: return "Illegal code point"; + default: + break; } } if (buflenout) diff --git a/common/stream/CMakeLists.txt b/common/stream/CMakeLists.txt --- a/common/stream/CMakeLists.txt +++ b/common/stream/CMakeLists.txt @@ -68,6 +68,7 @@ target_link_libraries(stream $<$<BOOL:${LZ4_FOUND}>:LZ4::LZ4> $<$<BOOL:${OPENSSL_FOUND}>:OpenSSL::SSL> matomic + mutf8 monetdb_config_header 
$<$<PLATFORM_ID:Windows>:ws2_32> Threads::Threads) diff --git a/common/stream/stdio_stream.c b/common/stream/stdio_stream.c --- a/common/stream/stdio_stream.c +++ b/common/stream/stdio_stream.c @@ -15,6 +15,7 @@ #include "monetdb_config.h" #include "stream.h" #include "stream_internal.h" +#include "mutf8.h" /* ------------------------------------------------------------------ */ @@ -177,43 +178,22 @@ utf8towchar(const char *src) { wchar_t *dest; size_t i = 0; - size_t j = 0; - uint32_t c; + uint32_t state = 0, codepoint = 0; /* count how many wchar_t's we need, while also checking for * correctness of the input */ - while (src[j]) { - i++; - if ((src[j+0] & 0x80) == 0) { - j += 1; - } else if ((src[j+0] & 0xE0) == 0xC0 - && (src[j+1] & 0xC0) == 0x80 - && (src[j+0] & 0x1E) != 0) { - j += 2; - } else if ((src[j+0] & 0xF0) == 0xE0 - && (src[j+1] & 0xC0) == 0x80 - && (src[j+2] & 0xC0) == 0x80 - && ((src[j+0] & 0x0F) != 0 - || (src[j+1] & 0x20) != 0)) { - j += 3; - } else if ((src[j+0] & 0xF8) == 0xF0 - && (src[j+1] & 0xC0) == 0x80 - && (src[j+2] & 0xC0) == 0x80 - && (src[j+3] & 0xC0) == 0x80) { - c = (src[j+0] & 0x07) << 18 - | (src[j+1] & 0x3F) << 12 - | (src[j+2] & 0x3F) << 6 - | (src[j+3] & 0x3F); - if (c < 0x10000 - || c > 0x10FFFF - || (c & 0x1FF800) == 0x00D800) - return NULL; + for (size_t j = 0; src[j]; j++) { + switch (decode(&state, &codepoint, (uint8_t) src[j])) { + case UTF8_ACCEPT: + i++; #if SIZEOF_WCHAR_T == 2 - i++; + i += (codepoint > 0xFFFF); #endif - j += 4; - } else { + break; + case UTF8_REJECT: return NULL; + default: + break; } } dest = malloc((i + 1) * sizeof(wchar_t)); @@ -221,32 +201,27 @@ utf8towchar(const char *src) return NULL; /* go through the source string again, this time we can skip * the correctness tests */ - i = j = 0; - while (src[j]) { - if ((src[j+0] & 0x80) == 0) { - dest[i++] = src[j+0]; - j += 1; - } else if ((src[j+0] & 0xE0) == 0xC0) { - dest[i++] = (src[j+0] & 0x1F) << 6 - | (src[j+1] & 0x3F); - j += 2; - } else if 
((src[j+0] & 0xF0) == 0xE0) { - dest[i++] = (src[j+0] & 0x0F) << 12 - | (src[j+1] & 0x3F) << 6 - | (src[j+2] & 0x3F); - j += 3; - } else if ((src[j+0] & 0xF8) == 0xF0) { - c = (src[j+0] & 0x07) << 18 - | (src[j+1] & 0x3F) << 12 - | (src[j+2] & 0x3F) << 6 - | (src[j+3] & 0x3F); + i = 0; + for (size_t j = 0; src[j]; j++) { + switch (decode(&state, &codepoint, (uint8_t) src[j])) { + case UTF8_ACCEPT: #if SIZEOF_WCHAR_T == 2 - dest[i++] = 0xD7C0 + (c >> 10); - dest[i++] = 0xDC00 + (c & 0x03FF); + if (codepoint <= 0xFFFF) { + dest[i++] = (wchar_t) codepoint; + } else { + dest[i++] = (wchar_t) (0xD7C0 + (codepoint >> 10)); + dest[i++] = (wchar_t) (0xDC00 + (codepoint & 0x3FF)); + } #else - dest[i++] = c; + dest[i++] = (wchar_t) codepoint; #endif - j += 4; + break; + case UTF8_REJECT: + /* cannot happen because of first loop */ + free(dest); + return NULL; + default: + break; } } dest[i] = 0; diff --git a/common/stream/winio.c b/common/stream/winio.c --- a/common/stream/winio.c +++ b/common/stream/winio.c @@ -214,7 +214,7 @@ console_write(stream *restrict s, const mnstr_set_error(s, MNSTR_WRITE_ERROR, "encoding error %d", __LINE__); return -1; } else if (c->ch > 0xFFFF) { - c->wbuf[c->len++] = 0xD800 | ((c->ch >> 10) - (1 << 6)); + c->wbuf[c->len++] = 0xD7C0 + (c->ch >> 10); c->wbuf[c->len++] = 0xDC00 | (c->ch & 0x03FF); } else { c->wbuf[c->len++] = c->ch; @@ -268,7 +268,7 @@ console_write(stream *restrict s, const mnstr_set_error(s, MNSTR_WRITE_ERROR, "encoding error %d", __LINE__); return -1; } else if (ch > 0xFFFF) { - c->wbuf[c->len++] = 0xD800 | ((ch >> 10) - (1 << 6)); + c->wbuf[c->len++] = 0xD7C0 + (ch >> 10); c->wbuf[c->len++] = 0xDC00 | (ch & 0x03FF); } else { c->wbuf[c->len++] = ch; diff --git a/common/utils/CMakeLists.txt b/common/utils/CMakeLists.txt --- a/common/utils/CMakeLists.txt +++ b/common/utils/CMakeLists.txt _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to 
checkin-list-le...@monetdb.org