This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 582be130dd [Feature] (ODBC) support read/write emoji of utf16 via odbc
table (#11863)
582be130dd is described below
commit 582be130dd08cfe1d5ab390d7237e67aa2a0837f
Author: HappenLee <[email protected]>
AuthorDate: Thu Aug 18 09:09:02 2022 +0800
[Feature] (ODBC) support read/write emoji of utf16 via odbc table (#11863)
Co-authored-by: lihaopeng <[email protected]>
---
be/src/exec/odbc_connector.cpp | 26 +++++++---------------
.../docs/ecosystem/external-table/odbc-of-doris.md | 5 ++++-
.../docs/ecosystem/external-table/odbc-of-doris.md | 4 ++++
.../main/java/org/apache/doris/catalog/Env.java | 1 +
4 files changed, 17 insertions(+), 19 deletions(-)
diff --git a/be/src/exec/odbc_connector.cpp b/be/src/exec/odbc_connector.cpp
index 5ca74080df..169b626726 100644
--- a/be/src/exec/odbc_connector.cpp
+++ b/be/src/exec/odbc_connector.cpp
@@ -48,14 +48,9 @@ static constexpr uint32_t BIG_COLUMN_SIZE_BUFFER = 65535;
// Default max buffer size use in insert to: 50MB, normally a batch is smaller
than the size
static constexpr uint32_t INSERT_BUFFER_SIZE = 1024l * 1024 * 50;
-static doris::Status utf8_to_wstring(const std::string& str, std::u16string&
out) {
- std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> utf8_ucs2_cvt;
- try {
- out = utf8_ucs2_cvt.from_bytes(str);
- } catch (std::range_error& e) {
- return doris::Status::InternalError("UNICODE out of supported range");
- }
- return doris::Status::OK();
+static std::u16string utf8_to_u16string(const char* first, const char* last) {
+ std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>
utf8_utf16_cvt;
+ return utf8_utf16_cvt.from_bytes(first, last);
}
namespace doris {
@@ -133,8 +128,7 @@ Status ODBCConnector::query() {
"alloc statement");
// Translate utf8 string to utf16 to use unicode encoding
- std::u16string wquery;
- RETURN_IF_ERROR(utf8_to_wstring(_sql_str, wquery));
+ auto wquery = utf8_to_u16string(_sql_str.c_str(), _sql_str.c_str() +
_sql_str.length());
ODBC_DISPOSE(_stmt, SQL_HANDLE_STMT,
SQLExecDirectW(_stmt, (SQLWCHAR*)(wquery.c_str()), SQL_NTS),
"exec direct");
@@ -313,10 +307,8 @@ Status ODBCConnector::append(const std::string&
table_name, RowBatch* batch,
}
}
// Translate utf8 string to utf16 to use unicode encodeing
- RETURN_IF_ERROR(utf8_to_wstring(
- std::string(_insert_stmt_buffer.data(),
- _insert_stmt_buffer.data() +
_insert_stmt_buffer.size()),
- insert_stmt));
+ insert_stmt = utf8_to_u16string(_insert_stmt_buffer.data(),
+ _insert_stmt_buffer.data() +
_insert_stmt_buffer.size());
}
{
@@ -499,10 +491,8 @@ Status ODBCConnector::append(const std::string&
table_name, vectorized::Block* b
}
}
// Translate utf8 string to utf16 to use unicode encodeing
- RETURN_IF_ERROR(utf8_to_wstring(
- std::string(_insert_stmt_buffer.data(),
- _insert_stmt_buffer.data() +
_insert_stmt_buffer.size()),
- insert_stmt));
+ insert_stmt = utf8_to_u16string(_insert_stmt_buffer.data(),
+ _insert_stmt_buffer.data() +
_insert_stmt_buffer.size());
}
{
diff --git a/docs/en/docs/ecosystem/external-table/odbc-of-doris.md
b/docs/en/docs/ecosystem/external-table/odbc-of-doris.md
index 2d850f8073..5c3c35595b 100644
--- a/docs/en/docs/ecosystem/external-table/odbc-of-doris.md
+++ b/docs/en/docs/ecosystem/external-table/odbc-of-doris.md
@@ -381,5 +381,8 @@ This is the compatibility problem between MySQL database
ODBC driver and existin
Connection to the database fails. The `Err:` part represents the error of
different database connection failures. This is usually a configuration
problem. You should check whether the IP address, port or account password are
mismatched.
-
+ 11. Garbled text appears when reading and writing emoji in a MySQL ODBC
table
+
+ The default encoding used by Doris when connecting to ODBC tables is utf8.
Because the default utf8 encoding in MySQL is utf8mb3, it cannot represent
emoji, which require 4-byte encoding. To fix this, set `charset`=`utf8mb4`
when you create MySQL ODBC tables; emoji can then be read and written
normally 😀.
+
diff --git a/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md
b/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md
index 8d1df916d6..05cb3d81a1 100644
--- a/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md
+++ b/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md
@@ -371,3 +371,7 @@ sudo alien -i
oracle-instantclient19.13-sqlplus-19.13.0.0.0-2.x86_64.rpm
10. 报错`driver connect Err: xxx`
通常是连接数据库失败,Err部分代表了不同的数据库连接失败的报错。这种情况通常是配置存在问题。可以检查是否错配了ip地址,端口或账号密码。
+
+11. 读写mysql外表的emoji表情出现乱码
+
+
Doris进行odbc外表连接时,默认采用的编码为utf8,由于mysql之中默认的utf8编码为utf8mb3,无法表示需要4字节编码的emoji表情。这里需要在建立mysql外表时设置`charset`=`utf8mb4`,便可以正常读写emoji表情😀。
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
index 53ddeab48d..75208057c5 100755
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
@@ -2954,6 +2954,7 @@ public class Env {
sb.append("\"password\" = \"").append(hidePassword ? "" :
odbcTable.getPasswd()).append("\",\n");
sb.append("\"driver\" =
\"").append(odbcTable.getOdbcDriver()).append("\",\n");
sb.append("\"odbc_type\" =
\"").append(odbcTable.getOdbcTableTypeName()).append("\",\n");
+ sb.append("\"charest\" =
\"").append(odbcTable.getCharset()).append("\",\n");
} else {
sb.append("\"odbc_catalog_resource\" =
\"").append(odbcTable.getOdbcCatalogResourceName())
.append("\",\n");
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]