This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new b085187c45f [fix](es-catalog) Fix query error when ES keyword field
contains array data (#61236)
b085187c45f is described below
commit b085187c45f8411a9910bc26ced8ae530262e5e3
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Sun Mar 15 20:45:07 2026 -0700
[fix](es-catalog) Fix query error when ES keyword field contains array data
(#61236)
### What problem does this PR solve?
When an Elasticsearch field is mapped as keyword/text but the actual
data stored is an array (which is valid in ES),
Doris was throwing: 'Expected value of type: STRING; but found type:
Array'
This fix serializes the array to a JSON string representation instead of
throwing an error,
which is consistent with how other non-string types (Numbers, Objects)
are already handled.
Fixed two code paths in es_scroll_parser.cpp:
1. fill_columns() - main _source parsing path
2. handle_value() - array element processing path
Also added regression test and test data for this scenario.
---
be/src/exec/es/es_scroll_parser.cpp | 20 +++-
.../scripts/data/test_keyword_array.json | 3 +
.../elasticsearch/scripts/es_init.sh | 18 +++-
.../scripts/index/es7_test_keyword_array.json | 28 +++++
.../es/test_es_keyword_array_type.out | 33 ++++++
.../es/test_es_keyword_array_type.groovy | 115 +++++++++++++++++++++
6 files changed, 212 insertions(+), 5 deletions(-)
diff --git a/be/src/exec/es/es_scroll_parser.cpp
b/be/src/exec/es/es_scroll_parser.cpp
index 8e3ee8fa3ee..e2821a40ff1 100644
--- a/be/src/exec/es/es_scroll_parser.cpp
+++ b/be/src/exec/es/es_scroll_parser.cpp
@@ -438,8 +438,12 @@ Status handle_value(const rapidjson::Value& col,
PrimitiveType sub_type, bool pu
return Status::OK();
}
if constexpr (T == TYPE_STRING || T == TYPE_CHAR || T == TYPE_VARCHAR) {
- RETURN_ERROR_IF_COL_IS_ARRAY(col, sub_type, true);
- if (!col.IsString()) {
+ // When ES mapping is keyword/text but actual data is an array,
+ // serialize the array to JSON string instead of throwing an error.
+ // This is valid in ES since any field can hold array values.
+ if (col.IsArray()) {
+ val = json_value_to_string(col);
+ } else if (!col.IsString()) {
val = json_value_to_string(col);
} else {
val = col.GetString();
@@ -707,14 +711,22 @@ Status ScrollParser::fill_columns(const TupleDescriptor*
tuple_desc,
if (pure_doc_value) {
if (col.Empty()) {
break;
+ } else if (col.Size() > 1) {
+ // doc_values with multiple elements means actual array
data
+ // in ES keyword/text field, serialize as JSON array string
+ val = json_value_to_string(col);
} else if (!col[0].IsString()) {
val = json_value_to_string(col[0]);
} else {
val = col[0].GetString();
}
} else {
- RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
- if (!col.IsString()) {
+ // When ES mapping is keyword/text but actual data is an array,
+ // serialize the array to JSON string instead of throwing an
error.
+ // This is valid in ES since any field can hold array values.
+ if (col.IsArray()) {
+ val = json_value_to_string(col);
+ } else if (!col.IsString()) {
val = json_value_to_string(col);
} else {
val = col.GetString();
diff --git
a/docker/thirdparties/docker-compose/elasticsearch/scripts/data/test_keyword_array.json
b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/test_keyword_array.json
new file mode 100644
index 00000000000..72979d2a459
--- /dev/null
+++
b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/test_keyword_array.json
@@ -0,0 +1,3 @@
+{"id": 1, "name": "Alice", "tags": ["tag1", "tag2", "tag3"], "description":
["first line", "second line"], "emails": ["[email protected]",
"[email protected]"], "single_value": "hello"}
+{"id": 2, "name": "Bob", "tags": "single_tag", "description": "simple text",
"emails": ["[email protected]"], "single_value": "world"}
+{"id": 3, "name": "Charlie", "tags": ["tag_a", "tag_b"], "description": ["only
line"], "emails": "[email protected]", "single_value": ["val1", "val2"]}
diff --git
a/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh
b/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh
index d45e4b5fb4a..51f81ced938 100755
--- a/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh
+++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh
@@ -23,7 +23,7 @@ generate_bulk_request() {
local data_file=$4
local output_file=$5
- // clear output file
+ # clear output file
echo "" > "$output_file"
local id=1
@@ -174,6 +174,14 @@ curl -X POST "http://${ES_7_HOST}:9200/_bulk"
--data-binary "@$bulk_request_file
# put _meta for composite_type_array
curl "http://${ES_7_HOST}:9200/composite_type_array/_mapping" -H
"Content-Type:application/json" -X PUT -d
"@/mnt/scripts/index/array_meta_composite_type_array.json"
+# create index test_keyword_array for keyword array type testing
+curl "http://${ES_7_HOST}:9200/test_keyword_array" -H
"Content-Type:application/json" -X PUT -d
'@/mnt/scripts/index/es7_test_keyword_array.json'
+# put data for test_keyword_array
+keyword_array_data_file="/mnt/scripts/data/test_keyword_array.json"
+keyword_array_bulk_file="/mnt/scripts/data/bulk_request_keyword_array_es7.json"
+generate_bulk_request "test_keyword_array" "_doc" "item_"
"$keyword_array_data_file" "$keyword_array_bulk_file"
+curl -X POST "http://${ES_7_HOST}:9200/_bulk" --data-binary
"@$keyword_array_bulk_file" -H "Content-Type: application/json"
+
# es8
# create index test1
curl "http://${ES_8_HOST}:9200/test1" -H "Content-Type:application/json" -X
PUT -d "@/mnt/scripts/index/es7_test1.json"
@@ -217,4 +225,12 @@ curl -X POST "http://${ES_8_HOST}:9200/_bulk"
--data-binary "@$bulk_request_file
# put _meta for composite_type_array
curl "http://${ES_8_HOST}:9200/composite_type_array/_mapping" -H
"Content-Type:application/json" -X PUT -d
"@/mnt/scripts/index/array_meta_composite_type_array.json"
+# create index test_keyword_array for keyword array type testing
+curl "http://${ES_8_HOST}:9200/test_keyword_array" -H
"Content-Type:application/json" -X PUT -d
'@/mnt/scripts/index/es7_test_keyword_array.json'
+# put data for test_keyword_array
+keyword_array_data_file="/mnt/scripts/data/test_keyword_array.json"
+keyword_array_bulk_file="/mnt/scripts/data/bulk_request_keyword_array_es8.json"
+generate_bulk_request "test_keyword_array" "" "item_"
"$keyword_array_data_file" "$keyword_array_bulk_file"
+curl -X POST "http://${ES_8_HOST}:9200/_bulk" --data-binary
"@$keyword_array_bulk_file" -H "Content-Type: application/json"
+
touch /tmp/SUCCESS
diff --git
a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test_keyword_array.json
b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test_keyword_array.json
new file mode 100644
index 00000000000..3bd8e786408
--- /dev/null
+++
b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test_keyword_array.json
@@ -0,0 +1,28 @@
+{
+ "settings": {
+ "number_of_shards": 1,
+ "number_of_replicas": 0
+ },
+ "mappings": {
+ "properties": {
+ "id": {
+ "type": "integer"
+ },
+ "name": {
+ "type": "keyword"
+ },
+ "tags": {
+ "type": "keyword"
+ },
+ "description": {
+ "type": "text"
+ },
+ "emails": {
+ "type": "keyword"
+ },
+ "single_value": {
+ "type": "keyword"
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git
a/regression-test/data/external_table_p0/es/test_es_keyword_array_type.out
b/regression-test/data/external_table_p0/es/test_es_keyword_array_type.out
new file mode 100644
index 00000000000..13e75a94f95
--- /dev/null
+++ b/regression-test/data/external_table_p0/es/test_es_keyword_array_type.out
@@ -0,0 +1,33 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !es7_keyword_array_01 --
+1 Alice ["tag1","tag2","tag3"] ["[email protected]","[email protected]"]
hello
+2 Bob single_tag ["[email protected]"] world
+3 Charlie ["tag_a","tag_b"] [email protected] ["val1","val2"]
+
+-- !es7_keyword_array_02 --
+1 Alice ["first line","second line"]
+2 Bob simple text
+3 Charlie ["only line"]
+
+-- !es7_keyword_array_03 --
+1 ["tag1","tag2","tag3"]
+
+-- !es7_keyword_array_04 --
+3 ["val1","val2"]
+
+-- !es8_keyword_array_01 --
+1 Alice ["tag1","tag2","tag3"] ["[email protected]","[email protected]"]
hello
+2 Bob single_tag ["[email protected]"] world
+3 Charlie ["tag_a","tag_b"] [email protected] ["val1","val2"]
+
+-- !es8_keyword_array_02 --
+1 Alice ["first line","second line"]
+2 Bob simple text
+3 Charlie ["only line"]
+
+-- !es8_keyword_array_03 --
+1 ["tag1","tag2","tag3"]
+
+-- !es8_keyword_array_04 --
+3 ["val1","val2"]
+
diff --git
a/regression-test/suites/external_table_p0/es/test_es_keyword_array_type.groovy
b/regression-test/suites/external_table_p0/es/test_es_keyword_array_type.groovy
new file mode 100644
index 00000000000..2c153c940f9
--- /dev/null
+++
b/regression-test/suites/external_table_p0/es/test_es_keyword_array_type.groovy
@@ -0,0 +1,115 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Test for https://github.com/apache/doris/issues/XXXXX
+// When ES mapping defines a field as keyword/text, but the actual data stored
+// is an array, Doris should not throw an error. Instead, it should serialize
+// the array to a JSON string representation.
+
+suite("test_es_keyword_array_type",
"p0,external,es,external_docker,external_docker_es") {
+ String enabled = context.config.otherConfigs.get("enableEsTest")
+ if (enabled != null && enabled.equalsIgnoreCase("true")) {
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+ String es_7_port = context.config.otherConfigs.get("es_7_port")
+ String es_8_port = context.config.otherConfigs.get("es_8_port")
+
+ sql """drop catalog if exists test_es_keyword_array_es7;"""
+ sql """drop catalog if exists test_es_keyword_array_es8;"""
+
+ // Create ES7 catalog
+ sql """create catalog if not exists test_es_keyword_array_es7
properties(
+ "type"="es",
+ "hosts"="http://${externalEnvIp}:$es_7_port",
+ "nodes_discovery"="false",
+ "enable_keyword_sniff"="true",
+ "enable_docvalue_scan"="false"
+ );
+ """
+
+ // Create ES8 catalog
+ sql """create catalog if not exists test_es_keyword_array_es8
properties(
+ "type"="es",
+ "hosts"="http://${externalEnvIp}:$es_8_port",
+ "nodes_discovery"="false",
+ "enable_keyword_sniff"="true",
+ "enable_docvalue_scan"="false"
+ );
+ """
+
+ // Test ES7
+ sql """switch test_es_keyword_array_es7"""
+ // Wait for metadata sync
+ def maxRetries = 30
+ def retryCount = 0
+ def success = false
+ while (!success && retryCount < maxRetries) {
+ try {
+ sql """select * from test_keyword_array"""
+ success = true
+ } catch (Exception e) {
+ if (e.getMessage().contains("EsTable metadata has not been
synced, Try it later")) {
+ logger.info("Waiting for ES metadata sync... Attempt
${retryCount + 1}")
+ retryCount++
+ sleep(1000)
+ } else {
+ throw e
+ }
+ }
+ }
+ if (!success) {
+ throw new RuntimeException("Failed to sync ES metadata after
${maxRetries} attempts")
+ }
+
+ // Test: Query keyword fields that contain array data should not throw
error
+ // Previously this would fail with: "Expected value of type: STRING;
but found type: Array"
+ order_qt_es7_keyword_array_01 """select id, name, tags, emails,
single_value from test_keyword_array order by id"""
+ order_qt_es7_keyword_array_02 """select id, name, description from
test_keyword_array order by id"""
+ order_qt_es7_keyword_array_03 """select id, tags from
test_keyword_array where id = 1"""
+ order_qt_es7_keyword_array_04 """select id, single_value from
test_keyword_array where id = 3"""
+
+ // Test ES8
+ sql """switch test_es_keyword_array_es8"""
+ retryCount = 0
+ success = false
+ while (!success && retryCount < maxRetries) {
+ try {
+ sql """select * from test_keyword_array"""
+ success = true
+ } catch (Exception e) {
+ if (e.getMessage().contains("EsTable metadata has not been
synced, Try it later")) {
+ logger.info("Waiting for ES metadata sync... Attempt
${retryCount + 1}")
+ retryCount++
+ sleep(1000)
+ } else {
+ throw e
+ }
+ }
+ }
+ if (!success) {
+ throw new RuntimeException("Failed to sync ES metadata after
${maxRetries} attempts")
+ }
+
+ order_qt_es8_keyword_array_01 """select id, name, tags, emails,
single_value from test_keyword_array order by id"""
+ order_qt_es8_keyword_array_02 """select id, name, description from
test_keyword_array order by id"""
+ order_qt_es8_keyword_array_03 """select id, tags from
test_keyword_array where id = 1"""
+ order_qt_es8_keyword_array_04 """select id, single_value from
test_keyword_array where id = 3"""
+
+ // Cleanup
+ sql """drop catalog if exists test_es_keyword_array_es7;"""
+ sql """drop catalog if exists test_es_keyword_array_es8;"""
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]