This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 64042777956 [fix](json) Add . after $ in JSON path to support correct 
token parsing (#52543) (#52544)
64042777956 is described below

commit 64042777956da6d81e64bcbdcbfd25100c2eee69
Author: Jerry Hu <[email protected]>
AuthorDate: Thu Jul 3 14:36:53 2025 +0800

    [fix](json) Add . after $ in JSON path to support correct token parsing 
(#52543) (#52544)
    
    Boost tokenizer requires explicit "." after "$" to correctly extract
    JSON path tokens. Without this, expressions like "$[0].key" cannot be
    properly split, causing issues in downstream logic. This commit ensures
    a "." is automatically added after "$" to maintain consistent token
    parsing behavior.
---
 be/src/vec/functions/function_json.cpp             | 114 +++++++++------------
 be/test/util/jsonb_parser_simd_test.cpp            |   2 +-
 be/test/vec/function/function_json_test.cpp        |  12 +--
 .../data/json_p0/test_json_load_and_function.out   | Bin 259751 -> 258245 bytes
 .../data/jsonb_p0/test_jsonb_load_and_function.out | Bin 188467 -> 187714 bytes
 .../data/nereids_function_p0/scalar_function/J.out | Bin 160933 -> 160276 bytes
 .../json_p0/test_json_load_and_function.out        | Bin 136408 -> 135655 bytes
 .../json_functions/test_json_extract.out           | Bin 286 -> 320 bytes
 .../json_functions/test_json_extract.groovy        |   6 ++
 9 files changed, 60 insertions(+), 74 deletions(-)

diff --git a/be/src/vec/functions/function_json.cpp 
b/be/src/vec/functions/function_json.cpp
index 346c6005b98..22a6871805a 100644
--- a/be/src/vec/functions/function_json.cpp
+++ b/be/src/vec/functions/function_json.cpp
@@ -145,45 +145,7 @@ rapidjson::Value* match_value(const std::vector<JsonPath>& 
parsed_paths, rapidjs
         const std::string& col = parsed_paths[i].key;
         int index = parsed_paths[i].idx;
         if (LIKELY(!col.empty())) {
-            if (root->IsArray()) {
-                array_obj = static_cast<rapidjson::Value*>(
-                        mem_allocator.Malloc(sizeof(rapidjson::Value)));
-                array_obj->SetArray();
-                bool is_null = true;
-
-                // if array ,loop the array,find out all Objects,then find the 
results from the objects
-                for (int j = 0; j < root->Size(); j++) {
-                    rapidjson::Value* json_elem = &((*root)[j]);
-
-                    if (json_elem->IsArray() || json_elem->IsNull()) {
-                        continue;
-                    } else {
-                        if (!json_elem->IsObject()) {
-                            continue;
-                        }
-                        if (!json_elem->HasMember(col.c_str())) {
-                            if (is_insert_null) { // not found item, then 
insert a null object.
-                                is_null = false;
-                                rapidjson::Value 
nullObject(rapidjson::kNullType);
-                                array_obj->PushBack(nullObject, mem_allocator);
-                            }
-                            continue;
-                        }
-                        rapidjson::Value* obj = &((*json_elem)[col.c_str()]);
-                        if (obj->IsArray()) {
-                            is_null = false;
-                            for (int k = 0; k < obj->Size(); k++) {
-                                array_obj->PushBack((*obj)[k], mem_allocator);
-                            }
-                        } else if (!obj->IsNull()) {
-                            is_null = false;
-                            array_obj->PushBack(*obj, mem_allocator);
-                        }
-                    }
-                }
-
-                root = is_null ? &(array_obj->SetNull()) : array_obj;
-            } else if (root->IsObject()) {
+            if (root->IsObject()) {
                 if (!root->HasMember(col.c_str())) {
                     return nullptr;
                 } else {
@@ -234,8 +196,17 @@ rapidjson::Value* get_json_object(std::string_view 
json_string, std::string_view
 
     //Cannot use '\' as the last character, return NULL
     if (path_string.back() == '\\') {
-        document->SetNull();
-        return document;
+        return nullptr;
+    }
+
+    std::string fixed_string;
+    if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] != 
'.') {
+        // Boost tokenizer requires explicit "." after "$" to correctly 
extract JSON path tokens.
+        // Without this, expressions like "$[0].key" cannot be properly split.
+        // This commit ensures a "." is automatically added after "$" to 
maintain consistent token parsing behavior.
+        fixed_string = "$.";
+        fixed_string += path_string.substr(1);
+        path_string = fixed_string;
     }
 
     try {
@@ -252,13 +223,13 @@ rapidjson::Value* get_json_object(std::string_view 
json_string, std::string_view
         }
     } catch (boost::escaped_list_error&) {
         // meet unknown escape sequence, example '$.name\k'
-        return document;
+        return nullptr;
     }
 
     parsed_paths = &tmp_parsed_paths;
 
     if (!(*parsed_paths)[0].is_valid) {
-        return document;
+        return nullptr;
     }
 
     if (UNLIKELY((*parsed_paths).size() == 1)) {
@@ -271,10 +242,7 @@ rapidjson::Value* get_json_object(std::string_view 
json_string, std::string_view
 
     document->Parse(json_string.data(), json_string.size());
     if (UNLIKELY(document->HasParseError())) {
-        // VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() 
<< ": "
-        //         << GetParseError_En(document->GetParseError());
-        document->SetNull();
-        return document;
+        return nullptr;
     }
 
     return match_value(*parsed_paths, document, document->GetAllocator());
@@ -849,9 +817,9 @@ struct FunctionJsonQuoteImpl {
 struct FunctionJsonExtractImpl {
     static constexpr auto name = "json_extract";
 
-    static rapidjson::Value parse_json(const ColumnString* json_col, const 
ColumnString* path_col,
-                                       rapidjson::Document::AllocatorType& 
allocator,
-                                       const int row) {
+    static std::pair<bool, rapidjson::Value> parse_json(
+            const ColumnString* json_col, const ColumnString* path_col,
+            rapidjson::Document::AllocatorType& allocator, const int row) {
         rapidjson::Value value;
         rapidjson::Document document;
 
@@ -860,11 +828,14 @@ struct FunctionJsonExtractImpl {
         const auto path = path_col->get_data_at(row);
         std::string_view path_string(path.data, path.size);
 
-        auto root = get_json_object<JSON_FUN_STRING>(json_string, path_string, 
&document);
+        auto* root = get_json_object<JSON_FUN_STRING>(json_string, 
path_string, &document);
+        bool found = false;
         if (root != nullptr) {
+            found = true;
             value.CopyFrom(*root, allocator);
         }
-        return value;
+
+        return {found, std::move(value)};
     }
 
     static void execute(const std::vector<const ColumnString*>& data_columns,
@@ -874,30 +845,41 @@ struct FunctionJsonExtractImpl {
         rapidjson::StringBuffer buf;
         rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
 
-        const auto json_col = data_columns[0];
+        const auto* json_col = data_columns[0];
         for (size_t row = 0; row < input_rows_count; row++) {
             rapidjson::Value value;
             if (data_columns.size() == 2) {
-                value = parse_json(json_col, data_columns[1], allocator, row);
+                auto result = parse_json(json_col, data_columns[1], allocator, 
row);
+                if (result.first) {
+                    value = std::move(result.second);
+                } else {
+                    null_map[row] = 1;
+                    result_column.insert_default();
+                    continue;
+                }
             } else {
+                bool found_any = false;
                 value.SetArray();
                 value.Reserve(data_columns.size() - 1, allocator);
                 for (size_t col = 1; col < data_columns.size(); ++col) {
-                    value.PushBack(parse_json(json_col, data_columns[col], 
allocator, row),
-                                   allocator);
+                    auto result = parse_json(json_col, data_columns[col], 
allocator, row);
+                    if (result.first) {
+                        found_any = true;
+                        value.PushBack(std::move(result.second), allocator);
+                    }
+                }
+                if (!found_any) {
+                    null_map[row] = 1;
+                    result_column.insert_default();
+                    continue;
                 }
             }
 
-            if (value.IsNull()) {
-                null_map[row] = 1;
-                result_column.insert_default();
-            } else {
-                // write value as string
-                buf.Clear();
-                writer.Reset(buf);
-                value.Accept(writer);
-                result_column.insert_data(buf.GetString(), buf.GetSize());
-            }
+            // write value as string
+            buf.Clear();
+            writer.Reset(buf);
+            value.Accept(writer);
+            result_column.insert_data(buf.GetString(), buf.GetSize());
         }
     }
 };
diff --git a/be/test/util/jsonb_parser_simd_test.cpp 
b/be/test/util/jsonb_parser_simd_test.cpp
index 4fff867d536..3e7e70cf666 100644
--- a/be/test/util/jsonb_parser_simd_test.cpp
+++ b/be/test/util/jsonb_parser_simd_test.cpp
@@ -232,7 +232,7 @@ TEST_F(JsonbParserTest, ParseJsonWithLongInt) {
 
 TEST_F(JsonbParserTest, ParseInvalidJsonFormat) {
     std::string_view invalid_json = R"({"key": "value")";
-    EXPECT_EQ(parse_json_and_check(invalid_json, invalid_json), 
JsonbErrType::E_INVALID_KEY_STRING);
+    EXPECT_NE(parse_json_and_check(invalid_json, invalid_json), 
JsonbErrType::E_NONE);
 }
 
 TEST_F(JsonbParserTest, ParseJsonWithInvalidKeyType) {
diff --git a/be/test/vec/function/function_json_test.cpp 
b/be/test/vec/function/function_json_test.cpp
index ceecadf64c1..9c79b712463 100644
--- a/be/test/vec/function/function_json_test.cpp
+++ b/be/test/vec/function/function_json_test.cpp
@@ -72,15 +72,13 @@ TEST(FunctionJsonTEST, GetJsonStringTest) {
     std::string func_name = "get_json_string";
     InputTypeSet input_types = {TypeIndex::String, TypeIndex::String};
     DataSet data_set = {
-            {{VARCHAR("{\"k1\":\"v1\", \"k2\":\"v2\"}"), VARCHAR("$.k1")}, 
VARCHAR("v1")},
-            {{VARCHAR("{\"k1\":\"v1\", \"my.key\":[\"e1\", \"e2\", \"e3\"]}"),
-              VARCHAR("$.\"my.key\"[1]")},
+            {{VARCHAR(R"({"k1":"v1", "k2":"v2"})"), VARCHAR("$.k1")}, 
VARCHAR("v1")},
+            {{VARCHAR(R"({"k1":"v1", "my.key":["e1", "e2", "e3"]})"), 
VARCHAR("$.\"my.key\"[1]")},
              VARCHAR("e2")},
-            {{VARCHAR("{\"k1.key\":{\"k2\":[\"v1\", \"v2\"]}}"), 
VARCHAR("$.\"k1.key\".k2[0]")},
+            {{VARCHAR(R"({"k1.key":{"k2":["v1", "v2"]}})"), 
VARCHAR("$.\"k1.key\".k2[0]")},
              VARCHAR("v1")},
-            {{VARCHAR("[{\"k1\":\"v1\"}, {\"k2\":\"v2\"}, {\"k1\":\"v3\"}, 
{\"k1\":\"v4\"}]"),
-              VARCHAR("$.k1")},
-             VARCHAR("[\"v1\",\"v3\",\"v4\"]")}};
+            {{VARCHAR(R"([{"k1":"v1"}, {"k2":"v2"}, {"k1":"v3"}, 
{"k1":"v4"}])"), VARCHAR("$.k1")},
+             Null()}};
 
     static_cast<void>(check_function<DataTypeString, true>(func_name, 
input_types, data_set));
 }
diff --git a/regression-test/data/json_p0/test_json_load_and_function.out 
b/regression-test/data/json_p0/test_json_load_and_function.out
index c82e38aa21e..9928a5943ab 100644
Binary files a/regression-test/data/json_p0/test_json_load_and_function.out and 
b/regression-test/data/json_p0/test_json_load_and_function.out differ
diff --git a/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out 
b/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out
index 8f871ec68e9..773061f7bda 100644
Binary files a/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out 
and b/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out differ
diff --git a/regression-test/data/nereids_function_p0/scalar_function/J.out 
b/regression-test/data/nereids_function_p0/scalar_function/J.out
index 64a76e24826..dc1c34eafe6 100644
Binary files a/regression-test/data/nereids_function_p0/scalar_function/J.out 
and b/regression-test/data/nereids_function_p0/scalar_function/J.out differ
diff --git 
a/regression-test/data/nereids_p0/json_p0/test_json_load_and_function.out 
b/regression-test/data/nereids_p0/json_p0/test_json_load_and_function.out
index 892c9e8fe8f..c8bbe10e57d 100644
Binary files 
a/regression-test/data/nereids_p0/json_p0/test_json_load_and_function.out and 
b/regression-test/data/nereids_p0/json_p0/test_json_load_and_function.out differ
diff --git 
a/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out
 
b/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out
index 2b64e92fd3e..c800dc75f75 100644
Binary files 
a/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out
 and 
b/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out
 differ
diff --git 
a/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
 
b/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
index 41e68111d79..b3e202be289 100644
--- 
a/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
@@ -28,4 +28,10 @@ suite("test_json_extract") {
         sql """ SELECT JSON_EXTRACT_STRING('{"id": 123, "name": "doris"}', 
'\$.'); """
         exception "Invalid Json Path for value: \$."
     }
+
+    qt_fix_array_path """
+        select 
+            JSON_EXTRACT('[{"key": [123]}]', '\$[0].key') v1
+            , JSON_EXTRACT('[{"key": [123]}]', '\$.[0].key') v2;
+    """
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to