This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-c108335-hive-sql
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-c108335-hive-sql by 
this push:
     new 29e3e387951 [fix](csv) escape quote with double quote for csv format 
table #50101
29e3e387951 is described below

commit 29e3e38795119c68022129fa750dd37d84dd36ae
Author: Socrates <[email protected]>
AuthorDate: Wed Apr 16 10:38:30 2025 +0800

    [fix](csv) escape quote with double quote for csv format table #50101
---
 be/src/vec/data_types/serde/data_type_serde.h      |  10 +++++
 .../vec/data_types/serde/data_type_string_serde.h  |  34 ++++++++++++++++
 be/src/vec/exec/format/csv/csv_reader.cpp          |   7 ++--
 .../file_reader/new_plain_text_line_reader.cpp     |  19 +++++++--
 .../file_reader/new_plain_text_line_reader.h       |   2 +
 .../csv/csv_json_table/csv_json_table.csv          |   3 ++
 .../hive/test_hive_serde_prop.out                  | Bin 1538 -> 1534 bytes
 .../external_table_p0/hive/test_open_csv_serde.out | Bin 0 -> 1561 bytes
 .../test_csv_with_enclose_and_escapeS3_load.out    | Bin 743 -> 743 bytes
 .../hive/test_open_csv_serde.groovy                |  45 +++++++++++++++++++++
 10 files changed, 113 insertions(+), 7 deletions(-)

diff --git a/be/src/vec/data_types/serde/data_type_serde.h 
b/be/src/vec/data_types/serde/data_type_serde.h
index 7705db33699..74f170b5808 100644
--- a/be/src/vec/data_types/serde/data_type_serde.h
+++ b/be/src/vec/data_types/serde/data_type_serde.h
@@ -151,6 +151,8 @@ public:
          */
         bool converted_from_string = false;
 
+        char quote_char = '"';
+
         char escape_char = 0;
         /**
          * flags for each byte to indicate if escape is needed.
@@ -273,6 +275,14 @@ public:
 
     virtual Status deserialize_one_cell_from_json(IColumn& column, Slice& 
slice,
                                                   const FormatOptions& 
options) const = 0;
+
+    // CSV cell deserialization hook: serdes whose CSV text form differs from JSON override it,
+    // while this default simply forwards to deserialize_one_cell_from_json.
+    virtual Status deserialize_one_cell_from_csv(IColumn& column, Slice& slice,
+                                                 const FormatOptions& options) 
const {
+        return deserialize_one_cell_from_json(column, slice, options);
+    }
+
     // deserialize text vector is to avoid virtual function call in complex 
type nested loop
     virtual Status deserialize_column_from_json_vector(IColumn& column, 
std::vector<Slice>& slices,
                                                        uint64_t* 
num_deserialized,
diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h 
b/be/src/vec/data_types/serde/data_type_string_serde.h
index bd1dac19725..6a30254390e 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.h
+++ b/be/src/vec/data_types/serde/data_type_string_serde.h
@@ -64,6 +64,31 @@ inline void escape_string(const char* src, size_t& len, char 
escape_char) {
     len = dest_ptr - start;
 }
 
+// In-place CSV unescape: drops the escaping escape_char, collapses a doubled quote_char into one, and updates len.
+inline void escape_string_for_csv(const char* src, size_t& len, char 
escape_char, char quote_char) {
+    const char* start = src;
+    char* dest_ptr = const_cast<char*>(src); // output is written back into the input buffer
+    const char* end = src + len;
+    bool escape_next_char = false; // true => the current char is an escape marker to be skipped
+
+    while (src < end) {
+        if ((src < end - 1 && *src == quote_char && *(src + 1) == quote_char) 
||
+            *src == escape_char) {
+            escape_next_char = !escape_next_char; // toggling keeps the second char of an escape pair
+        } else {
+            escape_next_char = false;
+        }
+
+        if (escape_next_char) {
+            ++src; // skip the escape marker
+        } else {
+            *dest_ptr++ = *src++; // keep this char, compacting the buffer leftwards
+        }
+    }
+
+    len = dest_ptr - start; // report the shortened length to the caller
+}
+
 template <typename ColumnType>
 class DataTypeStringSerDeBase : public DataTypeSerDe {
     using ColumnStrType = ColumnType;
@@ -189,6 +214,15 @@ public:
         return Status::OK();
     }
 
+    Status deserialize_one_cell_from_csv(IColumn& column, Slice& slice,
+                                         const FormatOptions& options) const 
override { // string cells: unescape in place, then append the bytes to the column
+        if (options.escape_char != 0) { // when escape_char is 0 nothing is unescaped, doubled quotes included
+            escape_string_for_csv(slice.data, slice.size, options.escape_char, 
options.quote_char);
+        }
+        assert_cast<ColumnType&>(column).insert_data(slice.data, slice.size);
+        return Status::OK();
+    }
+
     Status deserialize_one_cell_from_hive_text(
             IColumn& column, Slice& slice, const FormatOptions& options,
             int hive_text_complex_type_delimiter_level = 1) const override {
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp 
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 0bc09cb9ea8..cfb44585fc6 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -338,6 +338,7 @@ Status CsvReader::init_reader(bool is_load) {
             (_state != nullptr && 
_state->trim_tailing_spaces_for_external_table_query());
 
     _options.escape_char = _escape;
+    _options.quote_char = _enclose;
     if (_params.file_attributes.text_params.collection_delimiter.empty()) {
         switch (_text_serde_type) {
         case TTextSerdeType::JSON_TEXT_SERDE:
@@ -625,8 +626,8 @@ Status CsvReader::deserialize_nullable_string(IColumn& 
column, Slice& slice) {
         }
     }
     static DataTypeStringSerDe stringSerDe;
-    auto st = 
stringSerDe.deserialize_one_cell_from_json(null_column.get_nested_column(), 
slice,
-                                                         _options);
+    auto st = 
stringSerDe.deserialize_one_cell_from_csv(null_column.get_nested_column(), 
slice,
+                                                        _options);
     if (!st.ok()) {
         // fill null if fail
         null_column.insert_data(nullptr, 0); // 0 is meaningless here
@@ -678,7 +679,7 @@ Status CsvReader::_fill_dest_columns(const Slice& line, 
Block* block,
             switch (_text_serde_type) {
             case TTextSerdeType::JSON_TEXT_SERDE:
                 RETURN_IF_ERROR(
-                        _serdes[i]->deserialize_one_cell_from_json(*col_ptr, 
slice, _options));
+                        _serdes[i]->deserialize_one_cell_from_csv(*col_ptr, 
slice, _options));
                 break;
             case TTextSerdeType::HIVE_TEXT_SERDE:
                 RETURN_IF_ERROR(
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp 
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
index 83bca5439ae..e3f16a37a82 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
@@ -149,11 +149,22 @@ void 
EncloseCsvLineReaderContext::_on_pre_match_enclose(const uint8_t* start, si
                 _should_escape = !_should_escape;
             } else if (_should_escape) [[unlikely]] {
                 _should_escape = false;
-            } else if (start[_idx] == _enclose) [[unlikely]] {
-                _state.forward_to(ReaderState::MATCH_ENCLOSE);
-                ++_idx;
-                return;
+            } else if (_quote_escape) {
+                if (start[_idx] == _enclose) {
+                    // double quote, escaped by quote
+                    _quote_escape = false;
+                } else {
+                    // match enclose
+                    _quote_escape = false;
+                    _state.forward_to(ReaderState::MATCH_ENCLOSE);
+                    return;
+                }
+            } else if (start[_idx] == _enclose) {
+                _quote_escape = true;
+            } else {
+                _quote_escape = false;
             }
+
             ++_idx;
         } while (_idx != len);
 
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h 
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
index e1c30607e1b..476f42781c2 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
@@ -254,6 +254,8 @@ private:
 
     size_t _idx = 0;
     bool _should_escape = false;
+    // set once an enclose (quote) byte is seen; the next byte tells a doubled (escaped) quote from the closing enclose
+    bool _quote_escape = false;
 
     const std::string _column_sep;
     std::vector<size_t> _column_sep_positions;
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table/csv_json_table.csv
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table/csv_json_table.csv
new file mode 100644
index 00000000000..0e3860a3792
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table/csv_json_table.csv
@@ -0,0 +1,3 @@
+1196252867,115863833,dc3179cc9dea43ec5d2a5854edc9ee4f,4b54b5aed52f184f535c2c1b30431bd8,dbb7757423134e63d65e8a0de7dac6e7,在籍,1,"{""code"":
 ""100"", ""message"": ""查询成功"", ""dsorderid"": """", ""orderid"": """", 
""data"": {""status"": ""1""}}"
+1194793017,115810730,b006ec54704ae2720cfce7eaafd68c67,fe53ae2121e5ad47bcd7ad1333128c88,fa289021e86ba1550332ad484556c9b8,不在籍,20,"{""code"":
 ""100"", ""message"": ""查询成功"", ""dsorderid"": """", ""orderid"": """", 
""data"": {""status"": ""20""}}"
+1190957155,115658063,3ed5c1ebbb0035c37690aa6720e4ef7d,14ba6410e6d0c698287bcd508dd9f73a,bfe3440f302292b30329f1e48a139595,不在籍,20,"{""code"":
 ""100"", ""message"": ""查询成功"", ""dsorderid"": """", ""orderid"": """", 
""data"": {""status"": ""20""}}"
\ No newline at end of file
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out 
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
index fd8e7bb4313..c2415c058f1 100644
Binary files 
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out and 
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out differ
diff --git 
a/regression-test/data/external_table_p0/hive/test_open_csv_serde.out 
b/regression-test/data/external_table_p0/hive/test_open_csv_serde.out
new file mode 100644
index 00000000000..918c22ab996
Binary files /dev/null and 
b/regression-test/data/external_table_p0/hive/test_open_csv_serde.out differ
diff --git 
a/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
 
b/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
index 0c1450e35fe..8d4444ac418 100644
Binary files 
a/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
 and 
b/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
 differ
diff --git 
a/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy 
b/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy
new file mode 100644
index 00000000000..c6207391726
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_open_csv_serde","p0,external,tvf,hive,external_docker,external_docker_hive")
 {
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) { // skip the suite unless enableHiveTest is set to true (case-insensitive)
+        logger.info("diable Hive test.")
+        return;
+    }
+
+    for (String hivePrefix : ["hive2","hive3"]) { // run the same query once per Hive prefix
+    
+        String hms_port = context.config.otherConfigs.get(hivePrefix + 
"HmsPort")
+        String catalog_name = "${hivePrefix}_test_open_csv_serde"
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+        def hdfsUserName = "doris"
+        String hdfs_port = context.config.otherConfigs.get(hivePrefix + 
"HdfsPort")
+        def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+
+        sql """drop catalog if exists ${catalog_name}"""
+        sql """create catalog if not exists ${catalog_name} properties (
+            "type"="hms",
+            'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+        );"""
+        sql """use `${catalog_name}`.`multi_catalog`"""
+
+        qt_csv_escape_quote_in_enclose """select * from csv_json_table order 
by trace_id;"""
+        // TODO: add more cases once csv_reader and text_reader have been refactored
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to