This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-c108335-hive-sql
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-c108335-hive-sql by
this push:
new 29e3e387951 [fix](csv) escape quote with double quote for csv format
table #50101
29e3e387951 is described below
commit 29e3e38795119c68022129fa750dd37d84dd36ae
Author: Socrates <[email protected]>
AuthorDate: Wed Apr 16 10:38:30 2025 +0800
[fix](csv) escape quote with double quote for csv format table #50101
---
be/src/vec/data_types/serde/data_type_serde.h | 10 +++++
.../vec/data_types/serde/data_type_string_serde.h | 34 ++++++++++++++++
be/src/vec/exec/format/csv/csv_reader.cpp | 7 ++--
.../file_reader/new_plain_text_line_reader.cpp | 19 +++++++--
.../file_reader/new_plain_text_line_reader.h | 2 +
.../csv/csv_json_table/csv_json_table.csv | 3 ++
.../hive/test_hive_serde_prop.out | Bin 1538 -> 1534 bytes
.../external_table_p0/hive/test_open_csv_serde.out | Bin 0 -> 1561 bytes
.../test_csv_with_enclose_and_escapeS3_load.out | Bin 743 -> 743 bytes
.../hive/test_open_csv_serde.groovy | 45 +++++++++++++++++++++
10 files changed, 113 insertions(+), 7 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_serde.h
b/be/src/vec/data_types/serde/data_type_serde.h
index 7705db33699..74f170b5808 100644
--- a/be/src/vec/data_types/serde/data_type_serde.h
+++ b/be/src/vec/data_types/serde/data_type_serde.h
@@ -151,6 +151,8 @@ public:
*/
bool converted_from_string = false;
+ char quote_char = '"';
+
char escape_char = 0;
/**
* flags for each byte to indicate if escape is needed.
@@ -273,6 +275,14 @@ public:
virtual Status deserialize_one_cell_from_json(IColumn& column, Slice&
slice,
const FormatOptions&
options) const = 0;
+
+ // CSV and JSON deserialization may behave differently for some types, so CSV gets
+ // its own entry point; the default implementation falls back to JSON deserialization.
+ virtual Status deserialize_one_cell_from_csv(IColumn& column, Slice& slice,
+ const FormatOptions& options)
const {
+ return deserialize_one_cell_from_json(column, slice, options);
+ }
+
// deserialize text vector is to avoid virtual function call in complex
type nested loop
virtual Status deserialize_column_from_json_vector(IColumn& column,
std::vector<Slice>& slices,
uint64_t*
num_deserialized,
diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h
b/be/src/vec/data_types/serde/data_type_string_serde.h
index bd1dac19725..6a30254390e 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.h
+++ b/be/src/vec/data_types/serde/data_type_string_serde.h
@@ -64,6 +64,31 @@ inline void escape_string(const char* src, size_t& len, char
escape_char) {
len = dest_ptr - start;
}
+// Unescape a CSV field in place: drop escape_char and collapse a doubled quote_char into one.
+inline void escape_string_for_csv(const char* src, size_t& len, char
escape_char, char quote_char) {
+ const char* start = src;
+ char* dest_ptr = const_cast<char*>(src);
+ const char* end = src + len;
+ bool escape_next_char = false;
+
+ while (src < end) {
+ if ((src < end - 1 && *src == quote_char && *(src + 1) == quote_char)
||
+ *src == escape_char) {
+ escape_next_char = !escape_next_char;
+ } else {
+ escape_next_char = false;
+ }
+
+ if (escape_next_char) {
+ ++src;
+ } else {
+ *dest_ptr++ = *src++;
+ }
+ }
+
+ len = dest_ptr - start;
+}
+
template <typename ColumnType>
class DataTypeStringSerDeBase : public DataTypeSerDe {
using ColumnStrType = ColumnType;
@@ -189,6 +214,15 @@ public:
return Status::OK();
}
+ Status deserialize_one_cell_from_csv(IColumn& column, Slice& slice,
+ const FormatOptions& options) const
override {
+ if (options.escape_char != 0) {
+ escape_string_for_csv(slice.data, slice.size, options.escape_char,
options.quote_char);
+ }
+ assert_cast<ColumnType&>(column).insert_data(slice.data, slice.size);
+ return Status::OK();
+ }
+
Status deserialize_one_cell_from_hive_text(
IColumn& column, Slice& slice, const FormatOptions& options,
int hive_text_complex_type_delimiter_level = 1) const override {
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 0bc09cb9ea8..cfb44585fc6 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -338,6 +338,7 @@ Status CsvReader::init_reader(bool is_load) {
(_state != nullptr &&
_state->trim_tailing_spaces_for_external_table_query());
_options.escape_char = _escape;
+ _options.quote_char = _enclose;
if (_params.file_attributes.text_params.collection_delimiter.empty()) {
switch (_text_serde_type) {
case TTextSerdeType::JSON_TEXT_SERDE:
@@ -625,8 +626,8 @@ Status CsvReader::deserialize_nullable_string(IColumn&
column, Slice& slice) {
}
}
static DataTypeStringSerDe stringSerDe;
- auto st =
stringSerDe.deserialize_one_cell_from_json(null_column.get_nested_column(),
slice,
- _options);
+ auto st =
stringSerDe.deserialize_one_cell_from_csv(null_column.get_nested_column(),
slice,
+ _options);
if (!st.ok()) {
// fill null if fail
null_column.insert_data(nullptr, 0); // 0 is meaningless here
@@ -678,7 +679,7 @@ Status CsvReader::_fill_dest_columns(const Slice& line,
Block* block,
switch (_text_serde_type) {
case TTextSerdeType::JSON_TEXT_SERDE:
RETURN_IF_ERROR(
- _serdes[i]->deserialize_one_cell_from_json(*col_ptr,
slice, _options));
+ _serdes[i]->deserialize_one_cell_from_csv(*col_ptr,
slice, _options));
break;
case TTextSerdeType::HIVE_TEXT_SERDE:
RETURN_IF_ERROR(
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
index 83bca5439ae..e3f16a37a82 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
@@ -149,11 +149,22 @@ void
EncloseCsvLineReaderContext::_on_pre_match_enclose(const uint8_t* start, si
_should_escape = !_should_escape;
} else if (_should_escape) [[unlikely]] {
_should_escape = false;
- } else if (start[_idx] == _enclose) [[unlikely]] {
- _state.forward_to(ReaderState::MATCH_ENCLOSE);
- ++_idx;
- return;
+ } else if (_quote_escape) {
+ if (start[_idx] == _enclose) {
+ // double quote, escaped by quote
+ _quote_escape = false;
+ } else {
+ // match enclose
+ _quote_escape = false;
+ _state.forward_to(ReaderState::MATCH_ENCLOSE);
+ return;
+ }
+ } else if (start[_idx] == _enclose) {
+ _quote_escape = true;
+ } else {
+ _quote_escape = false;
}
+
++_idx;
} while (_idx != len);
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
index e1c30607e1b..476f42781c2 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
@@ -254,6 +254,8 @@ private:
size_t _idx = 0;
bool _should_escape = false;
+ // CSV escapes a quote by doubling it: set when an enclose char is seen, so the next char decides if it was an escaped quote
+ bool _quote_escape = false;
const std::string _column_sep;
std::vector<size_t> _column_sep_positions;
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table/csv_json_table.csv
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table/csv_json_table.csv
new file mode 100644
index 00000000000..0e3860a3792
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table/csv_json_table.csv
@@ -0,0 +1,3 @@
+1196252867,115863833,dc3179cc9dea43ec5d2a5854edc9ee4f,4b54b5aed52f184f535c2c1b30431bd8,dbb7757423134e63d65e8a0de7dac6e7,在籍,1,"{""code"":
""100"", ""message"": ""查询成功"", ""dsorderid"": """", ""orderid"": """",
""data"": {""status"": ""1""}}"
+1194793017,115810730,b006ec54704ae2720cfce7eaafd68c67,fe53ae2121e5ad47bcd7ad1333128c88,fa289021e86ba1550332ad484556c9b8,不在籍,20,"{""code"":
""100"", ""message"": ""查询成功"", ""dsorderid"": """", ""orderid"": """",
""data"": {""status"": ""20""}}"
+1190957155,115658063,3ed5c1ebbb0035c37690aa6720e4ef7d,14ba6410e6d0c698287bcd508dd9f73a,bfe3440f302292b30329f1e48a139595,不在籍,20,"{""code"":
""100"", ""message"": ""查询成功"", ""dsorderid"": """", ""orderid"": """",
""data"": {""status"": ""20""}}"
\ No newline at end of file
diff --git
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
index fd8e7bb4313..c2415c058f1 100644
Binary files
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out and
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out differ
diff --git
a/regression-test/data/external_table_p0/hive/test_open_csv_serde.out
b/regression-test/data/external_table_p0/hive/test_open_csv_serde.out
new file mode 100644
index 00000000000..918c22ab996
Binary files /dev/null and
b/regression-test/data/external_table_p0/hive/test_open_csv_serde.out differ
diff --git
a/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
b/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
index 0c1450e35fe..8d4444ac418 100644
Binary files
a/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
and
b/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
differ
diff --git
a/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy
b/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy
new file mode 100644
index 00000000000..c6207391726
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_open_csv_serde","p0,external,tvf,hive,external_docker,external_docker_hive")
{
+ String enabled = context.config.otherConfigs.get("enableHiveTest")
+ if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+ logger.info("diable Hive test.")
+ return;
+ }
+
+ for (String hivePrefix : ["hive2","hive3"]) {
+
+ String hms_port = context.config.otherConfigs.get(hivePrefix +
"HmsPort")
+ String catalog_name = "${hivePrefix}_test_open_csv_serde"
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+ def hdfsUserName = "doris"
+ String hdfs_port = context.config.otherConfigs.get(hivePrefix +
"HdfsPort")
+ def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+
+ sql """drop catalog if exists ${catalog_name}"""
+ sql """create catalog if not exists ${catalog_name} properties (
+ "type"="hms",
+ 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+ );"""
+ sql """use `${catalog_name}`.`multi_catalog`"""
+
+ qt_csv_escape_quote_in_enclose """select * from csv_json_table order
by trace_id;"""
+ // TODO: add more cases after refactoring csv_reader and text_reader
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]