This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 64042777956 [fix](json) Add "." after "$" in JSON path to support correct
token parsing (#52543) (#52544)
64042777956 is described below
commit 64042777956da6d81e64bcbdcbfd25100c2eee69
Author: Jerry Hu <[email protected]>
AuthorDate: Thu Jul 3 14:36:53 2025 +0800
[fix](json) Add "." after "$" in JSON path to support correct token parsing
(#52543) (#52544)
Boost tokenizer requires explicit "." after "$" to correctly extract
JSON path tokens. Without this, expressions like "$[0].key" cannot be
properly split, causing issues in downstream logic. This commit ensures
a "." is automatically added after "$" to maintain consistent token
parsing behavior.
---
be/src/vec/functions/function_json.cpp | 114 +++++++++------------
be/test/util/jsonb_parser_simd_test.cpp | 2 +-
be/test/vec/function/function_json_test.cpp | 12 +--
.../data/json_p0/test_json_load_and_function.out | Bin 259751 -> 258245 bytes
.../data/jsonb_p0/test_jsonb_load_and_function.out | Bin 188467 -> 187714 bytes
.../data/nereids_function_p0/scalar_function/J.out | Bin 160933 -> 160276 bytes
.../json_p0/test_json_load_and_function.out | Bin 136408 -> 135655 bytes
.../json_functions/test_json_extract.out | Bin 286 -> 320 bytes
.../json_functions/test_json_extract.groovy | 6 ++
9 files changed, 60 insertions(+), 74 deletions(-)
diff --git a/be/src/vec/functions/function_json.cpp
b/be/src/vec/functions/function_json.cpp
index 346c6005b98..22a6871805a 100644
--- a/be/src/vec/functions/function_json.cpp
+++ b/be/src/vec/functions/function_json.cpp
@@ -145,45 +145,7 @@ rapidjson::Value* match_value(const std::vector<JsonPath>&
parsed_paths, rapidjs
const std::string& col = parsed_paths[i].key;
int index = parsed_paths[i].idx;
if (LIKELY(!col.empty())) {
- if (root->IsArray()) {
- array_obj = static_cast<rapidjson::Value*>(
- mem_allocator.Malloc(sizeof(rapidjson::Value)));
- array_obj->SetArray();
- bool is_null = true;
-
- // if array ,loop the array,find out all Objects,then find the
results from the objects
- for (int j = 0; j < root->Size(); j++) {
- rapidjson::Value* json_elem = &((*root)[j]);
-
- if (json_elem->IsArray() || json_elem->IsNull()) {
- continue;
- } else {
- if (!json_elem->IsObject()) {
- continue;
- }
- if (!json_elem->HasMember(col.c_str())) {
- if (is_insert_null) { // not found item, then
insert a null object.
- is_null = false;
- rapidjson::Value
nullObject(rapidjson::kNullType);
- array_obj->PushBack(nullObject, mem_allocator);
- }
- continue;
- }
- rapidjson::Value* obj = &((*json_elem)[col.c_str()]);
- if (obj->IsArray()) {
- is_null = false;
- for (int k = 0; k < obj->Size(); k++) {
- array_obj->PushBack((*obj)[k], mem_allocator);
- }
- } else if (!obj->IsNull()) {
- is_null = false;
- array_obj->PushBack(*obj, mem_allocator);
- }
- }
- }
-
- root = is_null ? &(array_obj->SetNull()) : array_obj;
- } else if (root->IsObject()) {
+ if (root->IsObject()) {
if (!root->HasMember(col.c_str())) {
return nullptr;
} else {
@@ -234,8 +196,17 @@ rapidjson::Value* get_json_object(std::string_view
json_string, std::string_view
//Cannot use '\' as the last character, return NULL
if (path_string.back() == '\\') {
- document->SetNull();
- return document;
+ return nullptr;
+ }
+
+ std::string fixed_string;
+ if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] !=
'.') {
+ // Boost tokenizer requires explicit "." after "$" to correctly
extract JSON path tokens.
+ // Without this, expressions like "$[0].key" cannot be properly split.
+ // This commit ensures a "." is automatically added after "$" to
maintain consistent token parsing behavior.
+ fixed_string = "$.";
+ fixed_string += path_string.substr(1);
+ path_string = fixed_string;
}
try {
@@ -252,13 +223,13 @@ rapidjson::Value* get_json_object(std::string_view
json_string, std::string_view
}
} catch (boost::escaped_list_error&) {
// meet unknown escape sequence, example '$.name\k'
- return document;
+ return nullptr;
}
parsed_paths = &tmp_parsed_paths;
if (!(*parsed_paths)[0].is_valid) {
- return document;
+ return nullptr;
}
if (UNLIKELY((*parsed_paths).size() == 1)) {
@@ -271,10 +242,7 @@ rapidjson::Value* get_json_object(std::string_view
json_string, std::string_view
document->Parse(json_string.data(), json_string.size());
if (UNLIKELY(document->HasParseError())) {
- // VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset()
<< ": "
- // << GetParseError_En(document->GetParseError());
- document->SetNull();
- return document;
+ return nullptr;
}
return match_value(*parsed_paths, document, document->GetAllocator());
@@ -849,9 +817,9 @@ struct FunctionJsonQuoteImpl {
struct FunctionJsonExtractImpl {
static constexpr auto name = "json_extract";
- static rapidjson::Value parse_json(const ColumnString* json_col, const
ColumnString* path_col,
- rapidjson::Document::AllocatorType&
allocator,
- const int row) {
+ static std::pair<bool, rapidjson::Value> parse_json(
+ const ColumnString* json_col, const ColumnString* path_col,
+ rapidjson::Document::AllocatorType& allocator, const int row) {
rapidjson::Value value;
rapidjson::Document document;
@@ -860,11 +828,14 @@ struct FunctionJsonExtractImpl {
const auto path = path_col->get_data_at(row);
std::string_view path_string(path.data, path.size);
- auto root = get_json_object<JSON_FUN_STRING>(json_string, path_string,
&document);
+ auto* root = get_json_object<JSON_FUN_STRING>(json_string,
path_string, &document);
+ bool found = false;
if (root != nullptr) {
+ found = true;
value.CopyFrom(*root, allocator);
}
- return value;
+
+ return {found, std::move(value)};
}
static void execute(const std::vector<const ColumnString*>& data_columns,
@@ -874,30 +845,41 @@ struct FunctionJsonExtractImpl {
rapidjson::StringBuffer buf;
rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
- const auto json_col = data_columns[0];
+ const auto* json_col = data_columns[0];
for (size_t row = 0; row < input_rows_count; row++) {
rapidjson::Value value;
if (data_columns.size() == 2) {
- value = parse_json(json_col, data_columns[1], allocator, row);
+ auto result = parse_json(json_col, data_columns[1], allocator,
row);
+ if (result.first) {
+ value = std::move(result.second);
+ } else {
+ null_map[row] = 1;
+ result_column.insert_default();
+ continue;
+ }
} else {
+ bool found_any = false;
value.SetArray();
value.Reserve(data_columns.size() - 1, allocator);
for (size_t col = 1; col < data_columns.size(); ++col) {
- value.PushBack(parse_json(json_col, data_columns[col],
allocator, row),
- allocator);
+ auto result = parse_json(json_col, data_columns[col],
allocator, row);
+ if (result.first) {
+ found_any = true;
+ value.PushBack(std::move(result.second), allocator);
+ }
+ }
+ if (!found_any) {
+ null_map[row] = 1;
+ result_column.insert_default();
+ continue;
}
}
- if (value.IsNull()) {
- null_map[row] = 1;
- result_column.insert_default();
- } else {
- // write value as string
- buf.Clear();
- writer.Reset(buf);
- value.Accept(writer);
- result_column.insert_data(buf.GetString(), buf.GetSize());
- }
+ // write value as string
+ buf.Clear();
+ writer.Reset(buf);
+ value.Accept(writer);
+ result_column.insert_data(buf.GetString(), buf.GetSize());
}
}
};
diff --git a/be/test/util/jsonb_parser_simd_test.cpp
b/be/test/util/jsonb_parser_simd_test.cpp
index 4fff867d536..3e7e70cf666 100644
--- a/be/test/util/jsonb_parser_simd_test.cpp
+++ b/be/test/util/jsonb_parser_simd_test.cpp
@@ -232,7 +232,7 @@ TEST_F(JsonbParserTest, ParseJsonWithLongInt) {
TEST_F(JsonbParserTest, ParseInvalidJsonFormat) {
std::string_view invalid_json = R"({"key": "value")";
- EXPECT_EQ(parse_json_and_check(invalid_json, invalid_json),
JsonbErrType::E_INVALID_KEY_STRING);
+ EXPECT_NE(parse_json_and_check(invalid_json, invalid_json),
JsonbErrType::E_NONE);
}
TEST_F(JsonbParserTest, ParseJsonWithInvalidKeyType) {
diff --git a/be/test/vec/function/function_json_test.cpp
b/be/test/vec/function/function_json_test.cpp
index ceecadf64c1..9c79b712463 100644
--- a/be/test/vec/function/function_json_test.cpp
+++ b/be/test/vec/function/function_json_test.cpp
@@ -72,15 +72,13 @@ TEST(FunctionJsonTEST, GetJsonStringTest) {
std::string func_name = "get_json_string";
InputTypeSet input_types = {TypeIndex::String, TypeIndex::String};
DataSet data_set = {
- {{VARCHAR("{\"k1\":\"v1\", \"k2\":\"v2\"}"), VARCHAR("$.k1")},
VARCHAR("v1")},
- {{VARCHAR("{\"k1\":\"v1\", \"my.key\":[\"e1\", \"e2\", \"e3\"]}"),
- VARCHAR("$.\"my.key\"[1]")},
+ {{VARCHAR(R"({"k1":"v1", "k2":"v2"})"), VARCHAR("$.k1")},
VARCHAR("v1")},
+ {{VARCHAR(R"({"k1":"v1", "my.key":["e1", "e2", "e3"]})"),
VARCHAR("$.\"my.key\"[1]")},
VARCHAR("e2")},
- {{VARCHAR("{\"k1.key\":{\"k2\":[\"v1\", \"v2\"]}}"),
VARCHAR("$.\"k1.key\".k2[0]")},
+ {{VARCHAR(R"({"k1.key":{"k2":["v1", "v2"]}})"),
VARCHAR("$.\"k1.key\".k2[0]")},
VARCHAR("v1")},
- {{VARCHAR("[{\"k1\":\"v1\"}, {\"k2\":\"v2\"}, {\"k1\":\"v3\"},
{\"k1\":\"v4\"}]"),
- VARCHAR("$.k1")},
- VARCHAR("[\"v1\",\"v3\",\"v4\"]")}};
+ {{VARCHAR(R"([{"k1":"v1"}, {"k2":"v2"}, {"k1":"v3"},
{"k1":"v4"}])"), VARCHAR("$.k1")},
+ Null()}};
static_cast<void>(check_function<DataTypeString, true>(func_name,
input_types, data_set));
}
diff --git a/regression-test/data/json_p0/test_json_load_and_function.out
b/regression-test/data/json_p0/test_json_load_and_function.out
index c82e38aa21e..9928a5943ab 100644
Binary files a/regression-test/data/json_p0/test_json_load_and_function.out and
b/regression-test/data/json_p0/test_json_load_and_function.out differ
diff --git a/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out
b/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out
index 8f871ec68e9..773061f7bda 100644
Binary files a/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out
and b/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out differ
diff --git a/regression-test/data/nereids_function_p0/scalar_function/J.out
b/regression-test/data/nereids_function_p0/scalar_function/J.out
index 64a76e24826..dc1c34eafe6 100644
Binary files a/regression-test/data/nereids_function_p0/scalar_function/J.out
and b/regression-test/data/nereids_function_p0/scalar_function/J.out differ
diff --git
a/regression-test/data/nereids_p0/json_p0/test_json_load_and_function.out
b/regression-test/data/nereids_p0/json_p0/test_json_load_and_function.out
index 892c9e8fe8f..c8bbe10e57d 100644
Binary files
a/regression-test/data/nereids_p0/json_p0/test_json_load_and_function.out and
b/regression-test/data/nereids_p0/json_p0/test_json_load_and_function.out differ
diff --git
a/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out
b/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out
index 2b64e92fd3e..c800dc75f75 100644
Binary files
a/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out
and
b/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out
differ
diff --git
a/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
b/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
index 41e68111d79..b3e202be289 100644
---
a/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
+++
b/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
@@ -28,4 +28,10 @@ suite("test_json_extract") {
sql """ SELECT JSON_EXTRACT_STRING('{"id": 123, "name": "doris"}',
'\$.'); """
exception "Invalid Json Path for value: \$."
}
+
+ qt_fix_array_path """
+ select
+ JSON_EXTRACT('[{"key": [123]}]', '\$[0].key') v1
+ , JSON_EXTRACT('[{"key": [123]}]', '\$.[0].key') v2;
+ """
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]