This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b085187c45f [fix](es-catalog) Fix query error when ES keyword field 
contains array data (#61236)
b085187c45f is described below

commit b085187c45f8411a9910bc26ced8ae530262e5e3
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Sun Mar 15 20:45:07 2026 -0700

    [fix](es-catalog) Fix query error when ES keyword field contains array data 
(#61236)
    
    ### What problem does this PR solve?
    
    When an Elasticsearch field is mapped as keyword/text but the actual
    data stored is an array (which is valid in ES),
    Doris was throwing: 'Expected value of type: STRING; but found type:
    Array'
    
    This fix serializes the array to a JSON string representation instead of
    throwing an error,
    which is consistent with how other non-string types (Numbers, Objects)
    are already handled.
    
    Fixed two code paths in es_scroll_parser.cpp:
    1. fill_columns() - main _source parsing path
    2. handle_value() - array element processing path
    
    Also added regression test and test data for this scenario.
---
 be/src/exec/es/es_scroll_parser.cpp                |  20 +++-
 .../scripts/data/test_keyword_array.json           |   3 +
 .../elasticsearch/scripts/es_init.sh               |  18 +++-
 .../scripts/index/es7_test_keyword_array.json      |  28 +++++
 .../es/test_es_keyword_array_type.out              |  33 ++++++
 .../es/test_es_keyword_array_type.groovy           | 115 +++++++++++++++++++++
 6 files changed, 212 insertions(+), 5 deletions(-)

diff --git a/be/src/exec/es/es_scroll_parser.cpp 
b/be/src/exec/es/es_scroll_parser.cpp
index 8e3ee8fa3ee..e2821a40ff1 100644
--- a/be/src/exec/es/es_scroll_parser.cpp
+++ b/be/src/exec/es/es_scroll_parser.cpp
@@ -438,8 +438,12 @@ Status handle_value(const rapidjson::Value& col, 
PrimitiveType sub_type, bool pu
         return Status::OK();
     }
     if constexpr (T == TYPE_STRING || T == TYPE_CHAR || T == TYPE_VARCHAR) {
-        RETURN_ERROR_IF_COL_IS_ARRAY(col, sub_type, true);
-        if (!col.IsString()) {
+        // When ES mapping is keyword/text but actual data is an array,
+        // serialize the array to JSON string instead of throwing an error.
+        // This is valid in ES since any field can hold array values.
+        if (col.IsArray()) {
+            val = json_value_to_string(col);
+        } else if (!col.IsString()) {
             val = json_value_to_string(col);
         } else {
             val = col.GetString();
@@ -707,14 +711,22 @@ Status ScrollParser::fill_columns(const TupleDescriptor* 
tuple_desc,
             if (pure_doc_value) {
                 if (col.Empty()) {
                     break;
+                } else if (col.Size() > 1) {
+                    // doc_values with multiple elements means actual array 
data
+                    // in ES keyword/text field, serialize as JSON array string
+                    val = json_value_to_string(col);
                 } else if (!col[0].IsString()) {
                     val = json_value_to_string(col[0]);
                 } else {
                     val = col[0].GetString();
                 }
             } else {
-                RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
-                if (!col.IsString()) {
+                // When ES mapping is keyword/text but actual data is an array,
+                // serialize the array to JSON string instead of throwing an 
error.
+                // This is valid in ES since any field can hold array values.
+                if (col.IsArray()) {
+                    val = json_value_to_string(col);
+                } else if (!col.IsString()) {
                     val = json_value_to_string(col);
                 } else {
                     val = col.GetString();
diff --git 
a/docker/thirdparties/docker-compose/elasticsearch/scripts/data/test_keyword_array.json
 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/test_keyword_array.json
new file mode 100644
index 00000000000..72979d2a459
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/test_keyword_array.json
@@ -0,0 +1,3 @@
+{"id": 1, "name": "Alice", "tags": ["tag1", "tag2", "tag3"], "description": 
["first line", "second line"], "emails": ["[email protected]", 
"[email protected]"], "single_value": "hello"}
+{"id": 2, "name": "Bob", "tags": "single_tag", "description": "simple text", 
"emails": ["[email protected]"], "single_value": "world"}
+{"id": 3, "name": "Charlie", "tags": ["tag_a", "tag_b"], "description": ["only 
line"], "emails": "[email protected]", "single_value": ["val1", "val2"]}
diff --git 
a/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh
index d45e4b5fb4a..51f81ced938 100755
--- a/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh
+++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh
@@ -23,7 +23,7 @@ generate_bulk_request() {
     local data_file=$4
     local output_file=$5
 
-    // clear output file
+    # clear output file
     echo "" > "$output_file"
 
     local id=1
@@ -174,6 +174,14 @@ curl -X POST "http://${ES_7_HOST}:9200/_bulk"; 
--data-binary "@$bulk_request_file
 # put _meta for composite_type_array
 curl "http://${ES_7_HOST}:9200/composite_type_array/_mapping"; -H 
"Content-Type:application/json" -X PUT -d 
"@/mnt/scripts/index/array_meta_composite_type_array.json"
 
+# create index test_keyword_array for keyword array type testing
+curl "http://${ES_7_HOST}:9200/test_keyword_array"; -H 
"Content-Type:application/json" -X PUT -d 
'@/mnt/scripts/index/es7_test_keyword_array.json'
+# put data for test_keyword_array
+keyword_array_data_file="/mnt/scripts/data/test_keyword_array.json"
+keyword_array_bulk_file="/mnt/scripts/data/bulk_request_keyword_array_es7.json"
+generate_bulk_request "test_keyword_array" "_doc" "item_" 
"$keyword_array_data_file" "$keyword_array_bulk_file"
+curl -X POST "http://${ES_7_HOST}:9200/_bulk"; --data-binary 
"@$keyword_array_bulk_file" -H "Content-Type: application/json"
+
 # es8
 # create index test1
 curl "http://${ES_8_HOST}:9200/test1"; -H "Content-Type:application/json" -X 
PUT -d "@/mnt/scripts/index/es7_test1.json"
@@ -217,4 +225,12 @@ curl -X POST "http://${ES_8_HOST}:9200/_bulk"; 
--data-binary "@$bulk_request_file
 # put _meta for composite_type_array
 curl "http://${ES_8_HOST}:9200/composite_type_array/_mapping"; -H 
"Content-Type:application/json" -X PUT -d 
"@/mnt/scripts/index/array_meta_composite_type_array.json"
 
+# create index test_keyword_array for keyword array type testing
+curl "http://${ES_8_HOST}:9200/test_keyword_array"; -H 
"Content-Type:application/json" -X PUT -d 
'@/mnt/scripts/index/es7_test_keyword_array.json'
+# put data for test_keyword_array
+keyword_array_data_file="/mnt/scripts/data/test_keyword_array.json"
+keyword_array_bulk_file="/mnt/scripts/data/bulk_request_keyword_array_es8.json"
+generate_bulk_request "test_keyword_array" "" "item_" 
"$keyword_array_data_file" "$keyword_array_bulk_file"
+curl -X POST "http://${ES_8_HOST}:9200/_bulk"; --data-binary 
"@$keyword_array_bulk_file" -H "Content-Type: application/json"
+
 touch /tmp/SUCCESS
diff --git 
a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test_keyword_array.json
 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test_keyword_array.json
new file mode 100644
index 00000000000..3bd8e786408
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test_keyword_array.json
@@ -0,0 +1,28 @@
+{
+    "settings": {
+        "number_of_shards": 1,
+        "number_of_replicas": 0
+    },
+    "mappings": {
+        "properties": {
+            "id": {
+                "type": "integer"
+            },
+            "name": {
+                "type": "keyword"
+            },
+            "tags": {
+                "type": "keyword"
+            },
+            "description": {
+                "type": "text"
+            },
+            "emails": {
+                "type": "keyword"
+            },
+            "single_value": {
+                "type": "keyword"
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git 
a/regression-test/data/external_table_p0/es/test_es_keyword_array_type.out 
b/regression-test/data/external_table_p0/es/test_es_keyword_array_type.out
new file mode 100644
index 00000000000..13e75a94f95
--- /dev/null
+++ b/regression-test/data/external_table_p0/es/test_es_keyword_array_type.out
@@ -0,0 +1,33 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !es7_keyword_array_01 --
+1      Alice   ["tag1","tag2","tag3"]  ["[email protected]","[email protected]"]  
hello
+2      Bob     single_tag      ["[email protected]"]    world
+3      Charlie ["tag_a","tag_b"]       [email protected]     ["val1","val2"]
+
+-- !es7_keyword_array_02 --
+1      Alice   ["first line","second line"]
+2      Bob     simple text
+3      Charlie ["only line"]
+
+-- !es7_keyword_array_03 --
+1      ["tag1","tag2","tag3"]
+
+-- !es7_keyword_array_04 --
+3      ["val1","val2"]
+
+-- !es8_keyword_array_01 --
+1      Alice   ["tag1","tag2","tag3"]  ["[email protected]","[email protected]"]  
hello
+2      Bob     single_tag      ["[email protected]"]    world
+3      Charlie ["tag_a","tag_b"]       [email protected]     ["val1","val2"]
+
+-- !es8_keyword_array_02 --
+1      Alice   ["first line","second line"]
+2      Bob     simple text
+3      Charlie ["only line"]
+
+-- !es8_keyword_array_03 --
+1      ["tag1","tag2","tag3"]
+
+-- !es8_keyword_array_04 --
+3      ["val1","val2"]
+
diff --git 
a/regression-test/suites/external_table_p0/es/test_es_keyword_array_type.groovy 
b/regression-test/suites/external_table_p0/es/test_es_keyword_array_type.groovy
new file mode 100644
index 00000000000..2c153c940f9
--- /dev/null
+++ 
b/regression-test/suites/external_table_p0/es/test_es_keyword_array_type.groovy
@@ -0,0 +1,115 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Test for https://github.com/apache/doris/issues/XXXXX
+// When ES mapping defines a field as keyword/text, but the actual data stored
+// is an array, Doris should not throw an error. Instead, it should serialize
+// the array to a JSON string representation.
+
+suite("test_es_keyword_array_type", 
"p0,external,es,external_docker,external_docker_es") {
+    String enabled = context.config.otherConfigs.get("enableEsTest")
+    if (enabled != null && enabled.equalsIgnoreCase("true")) {
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+        String es_7_port = context.config.otherConfigs.get("es_7_port")
+        String es_8_port = context.config.otherConfigs.get("es_8_port")
+
+        sql """drop catalog if exists test_es_keyword_array_es7;"""
+        sql """drop catalog if exists test_es_keyword_array_es8;"""
+
+        // Create ES7 catalog
+        sql """create catalog if not exists test_es_keyword_array_es7 
properties(
+            "type"="es",
+            "hosts"="http://${externalEnvIp}:$es_7_port";,
+            "nodes_discovery"="false",
+            "enable_keyword_sniff"="true",
+            "enable_docvalue_scan"="false"
+        );
+        """
+
+        // Create ES8 catalog
+        sql """create catalog if not exists test_es_keyword_array_es8 
properties(
+            "type"="es",
+            "hosts"="http://${externalEnvIp}:$es_8_port";,
+            "nodes_discovery"="false",
+            "enable_keyword_sniff"="true",
+            "enable_docvalue_scan"="false"
+        );
+        """
+
+        // Test ES7
+        sql """switch test_es_keyword_array_es7"""
+        // Wait for metadata sync
+        def maxRetries = 30
+        def retryCount = 0
+        def success = false
+        while (!success && retryCount < maxRetries) {
+            try {
+                sql """select * from test_keyword_array"""
+                success = true
+            } catch (Exception e) {
+                if (e.getMessage().contains("EsTable metadata has not been 
synced, Try it later")) {
+                    logger.info("Waiting for ES metadata sync... Attempt 
${retryCount + 1}")
+                    retryCount++
+                    sleep(1000)
+                } else {
+                    throw e
+                }
+            }
+        }
+        if (!success) {
+            throw new RuntimeException("Failed to sync ES metadata after 
${maxRetries} attempts")
+        }
+
+        // Test: Query keyword fields that contain array data should not throw 
error
+        // Previously this would fail with: "Expected value of type: STRING; 
but found type: Array"
+        order_qt_es7_keyword_array_01 """select id, name, tags, emails, 
single_value from test_keyword_array order by id"""
+        order_qt_es7_keyword_array_02 """select id, name, description from 
test_keyword_array order by id"""
+        order_qt_es7_keyword_array_03 """select id, tags from 
test_keyword_array where id = 1"""
+        order_qt_es7_keyword_array_04 """select id, single_value from 
test_keyword_array where id = 3"""
+
+        // Test ES8
+        sql """switch test_es_keyword_array_es8"""
+        retryCount = 0
+        success = false
+        while (!success && retryCount < maxRetries) {
+            try {
+                sql """select * from test_keyword_array"""
+                success = true
+            } catch (Exception e) {
+                if (e.getMessage().contains("EsTable metadata has not been 
synced, Try it later")) {
+                    logger.info("Waiting for ES metadata sync... Attempt 
${retryCount + 1}")
+                    retryCount++
+                    sleep(1000)
+                } else {
+                    throw e
+                }
+            }
+        }
+        if (!success) {
+            throw new RuntimeException("Failed to sync ES metadata after 
${maxRetries} attempts")
+        }
+
+        order_qt_es8_keyword_array_01 """select id, name, tags, emails, 
single_value from test_keyword_array order by id"""
+        order_qt_es8_keyword_array_02 """select id, name, description from 
test_keyword_array order by id"""
+        order_qt_es8_keyword_array_03 """select id, tags from 
test_keyword_array where id = 1"""
+        order_qt_es8_keyword_array_04 """select id, single_value from 
test_keyword_array where id = 3"""
+
+        // Cleanup
+        sql """drop catalog if exists test_es_keyword_array_es7;"""
+        sql """drop catalog if exists test_es_keyword_array_es8;"""
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to