This is an automated email from the ASF dual-hosted git repository. zykkk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new ddd7e9871d [improvement](Jsonb) optimization Jsonb path parse (#21495) ddd7e9871d is described below commit ddd7e9871ddcf177c47c7e21e2ec5d9232133739 Author: Liqf <109049295+lemonlit...@users.noreply.github.com> AuthorDate: Sun Jul 23 18:59:12 2023 +0800 [improvement](Jsonb) optimization Jsonb path parse (#21495) The previous logic read the JsonbValue while parsing the JSON path. For complex JSON paths, this caused a lot of repeated parsing work. The optimization is to separate parsing the JSON path from looking up the value: the path is parsed once into a JsonbPath (JsonbPath::seek), and the parsed path is then used to find the value (JsonbValue::findValue). --- be/src/util/jsonb_document.h | 273 ++++++++++++++++---------------- be/src/vec/functions/function_jsonb.cpp | 101 +++++++++--- 2 files changed, 213 insertions(+), 161 deletions(-) diff --git a/be/src/util/jsonb_document.h b/be/src/util/jsonb_document.h index d54e7291dd..c21917e066 100644 --- a/be/src/util/jsonb_document.h +++ b/be/src/util/jsonb_document.h @@ -309,14 +309,6 @@ public: bool get_has_escapes() const { return has_escapes; } - void set_is_invalid_json_path(bool has) { is_invalid_json_path = has; } - - bool get_is_invalid_json_path() const { return is_invalid_json_path; } - - void set_type(unsigned int code) { type = code; } - - bool get_type() const { return type; } - private: /// The current position in the stream. 
const char* m_position; @@ -332,9 +324,17 @@ private: ///Whether to contain escape characters bool has_escapes = false; +}; + +struct leg_info { + ///path leg ptr + char* leg_ptr; + + ///path leg len + unsigned int leg_len; - ///Is the json path valid - bool is_invalid_json_path = false; + ///array_index + int array_index; ///type: 0 is member 1 is array unsigned int type; @@ -343,10 +343,24 @@ private: class JsonbPath { public: // parse json path - static bool parsePath(Stream* stream); + static bool parsePath(Stream* stream, JsonbPath* path); + + static bool parse_array(Stream* stream, JsonbPath* path); + static bool parse_member(Stream* stream, JsonbPath* path); + + //return true if json path valid else return false + bool seek(const char* string, size_t length); + + void add_leg_to_leg_vector(std::unique_ptr<leg_info> leg) { + leg_vector.emplace_back(leg.release()); + } - static bool parse_array(Stream* stream); - static bool parse_member(Stream* stream); + size_t get_leg_vector_size() { return leg_vector.size(); } + + leg_info* get_leg_from_leg_vector(size_t i) { return leg_vector[i].get(); } + +private: + std::vector<std::unique_ptr<leg_info>> leg_vector; }; /* @@ -529,15 +543,8 @@ public: // get the raw byte array of the value const char* getValuePtr() const; - // find the JSONB value by a key path string (null terminated) - JsonbValue* findPath(const char* key_path, bool& is_invalid_json_path, - hDictFind handler = nullptr) { - return findPath(key_path, (unsigned int)strlen(key_path), is_invalid_json_path, handler); - } - - // find the JSONB value by a key path string (with length) - JsonbValue* findPath(const char* key_path, unsigned int len, bool& is_invalid_json_path, - hDictFind handler); + // find the JSONB value by JsonbPath + JsonbValue* findValue(JsonbPath& path, hDictFind handler); friend class JsonbDocument; protected: @@ -1207,154 +1214,100 @@ inline const char* JsonbValue::getValuePtr() const { } } -inline JsonbValue* JsonbValue::findPath(const 
char* key_path, unsigned int kp_len, - bool& is_invalid_json_path, hDictFind handler = nullptr) { - if (!key_path) return nullptr; - if (kp_len == 0) { - is_invalid_json_path = true; - return nullptr; - } +inline bool JsonbPath::seek(const char* key_path, size_t kp_len) { + //path invalid + if (!key_path || kp_len == 0) return false; Stream stream(key_path, kp_len); stream.skip_whitespace(); if (stream.exhausted() || stream.read() != SCOPE) { - is_invalid_json_path = true; - return nullptr; + //path invalid + return false; } - JsonbValue* pval = this; - - while (pval && !stream.exhausted()) { + while (!stream.exhausted()) { stream.skip_whitespace(); stream.clear_leg_ptr(); stream.clear_leg_len(); - if (!JsonbPath::parsePath(&stream)) { - is_invalid_json_path = stream.get_is_invalid_json_path(); - return nullptr; - } - - if (stream.get_leg_len() == 0) { - return nullptr; + if (!JsonbPath::parsePath(&stream, this)) { + //path invalid + return false; } + } + return true; +} - if (stream.get_type() == MEMBER_CODE) { +inline JsonbValue* JsonbValue::findValue(JsonbPath& path, hDictFind handler) { + JsonbValue* pval = this; + for (size_t i = 0; i < path.get_leg_vector_size(); ++i) { + switch (path.get_leg_from_leg_vector(i)->type) { + case MEMBER_CODE: { if (LIKELY(pval->type_ == JsonbType::T_Object)) { - if (stream.get_leg_len() == 1 && *stream.get_leg_ptr() == WILDCARD) { - return pval; - } else if (stream.get_has_escapes()) { - stream.remove_escapes(); + if (path.get_leg_from_leg_vector(i)->leg_len == 1 && + *path.get_leg_from_leg_vector(i)->leg_ptr == WILDCARD) { + continue; } pval = ((ObjectVal*)pval) - ->find(stream.get_leg_ptr(), stream.get_leg_len(), handler); + ->find(path.get_leg_from_leg_vector(i)->leg_ptr, + path.get_leg_from_leg_vector(i)->leg_len, handler); if (!pval) return nullptr; + continue; } else { return nullptr; } - } else if (stream.get_type() == ARRAY_CODE) { - int index = 0; - std::string_view idx_string(stream.get_leg_ptr(), 
stream.get_leg_len()); - - if (stream.get_leg_len() == 1 && *stream.get_leg_ptr() == WILDCARD) { + } + case ARRAY_CODE: { + if (path.get_leg_from_leg_vector(i)->leg_len == 1 && + *path.get_leg_from_leg_vector(i)->leg_ptr == WILDCARD) { if (LIKELY(pval->type_ == JsonbType::T_Array)) { - stream.skip(1); - stream.skip_whitespace(); continue; } else { return nullptr; } - } else if (std::equal(LAST, LAST + 4, stream.get_leg_ptr(), - [](char c1, char c2) { - return std::tolower(c1) == std::tolower(c2); - }) && - stream.get_leg_len() >= 4) { - auto pos = idx_string.find(MINUS); - - if (pos != std::string::npos) { - idx_string = idx_string.substr(pos + 1); - - auto result = std::from_chars(idx_string.data(), - idx_string.data() + idx_string.size(), index); - if (result.ec != std::errc()) { - is_invalid_json_path = true; - return nullptr; - } - - if (pval->type_ == JsonbType::T_Object) { - if (index == 0) { - continue; - } else { - return nullptr; - } - } else if (LIKELY(pval->type_ == JsonbType::T_Array)) { - size_t num = ((ArrayVal*)pval)->numElem(); - if (index > num) return nullptr; - index = num - 1 - index; - } else { - return nullptr; - } - } else if (stream.get_leg_len() == 4) { - if (pval->type_ == JsonbType::T_Object) { - continue; - } else if (LIKELY(pval->type_ == JsonbType::T_Array)) { - index = ((ArrayVal*)pval)->numElem() - 1; - } else { - return nullptr; - } - - } else { - is_invalid_json_path = true; - return nullptr; - } - } else { - auto result = std::from_chars(idx_string.data(), - idx_string.data() + idx_string.size(), index); - if (result.ec != std::errc()) { - is_invalid_json_path = true; - return nullptr; - } + } - if (pval->type_ == JsonbType::T_Object) { - if (index == 0) { - continue; - } else { - return nullptr; - } - } else if (LIKELY(pval->type_ == JsonbType::T_Array)) { - if (std::abs(index) >= ((ArrayVal*)pval)->numElem()) return nullptr; - } else { - return nullptr; - } + if (pval->type_ == JsonbType::T_Object && + 
path.get_leg_from_leg_vector(i)->array_index == 0) { + continue; } - if (index >= 0) { - pval = ((ArrayVal*)pval)->get(index); + if (pval->type_ != JsonbType::T_Array || + path.get_leg_from_leg_vector(i)->leg_ptr != nullptr || + path.get_leg_from_leg_vector(i)->leg_len != 0) + return nullptr; + + if (path.get_leg_from_leg_vector(i)->array_index >= 0) { + pval = ((ArrayVal*)pval)->get(path.get_leg_from_leg_vector(i)->array_index); } else { - pval = ((ArrayVal*)pval)->get(((ArrayVal*)pval)->numElem() + index); + pval = ((ArrayVal*)pval) + ->get(((ArrayVal*)pval)->numElem() + + path.get_leg_from_leg_vector(i)->array_index); } + + if (!pval) return nullptr; + continue; + } } } - return pval; } -inline bool JsonbPath::parsePath(Stream* stream) { +inline bool JsonbPath::parsePath(Stream* stream, JsonbPath* path) { if (stream->peek() == BEGIN_ARRAY) { - return parse_array(stream); + return parse_array(stream, path); } else if (stream->peek() == BEGIN_MEMBER) { - return parse_member(stream); + return parse_member(stream, path); } else { - stream->set_is_invalid_json_path(true); return false; //invalid json path } } -inline bool JsonbPath::parse_array(Stream* stream) { +inline bool JsonbPath::parse_array(Stream* stream, JsonbPath* path) { assert(stream->peek() == BEGIN_ARRAY); stream->skip(1); if (stream->exhausted()) { - stream->set_is_invalid_json_path(true); return false; } @@ -1363,10 +1316,12 @@ inline bool JsonbPath::parse_array(Stream* stream) { stream->add_leg_len(); stream->skip(1); if (stream->peek() == END_ARRAY) { - stream->set_type(ARRAY_CODE); + std::unique_ptr<leg_info> leg( + new leg_info(stream->get_leg_ptr(), stream->get_leg_len(), 0, ARRAY_CODE)); + path->add_leg_to_leg_vector(std::move(leg)); + stream->skip(1); return true; } else { - stream->set_is_invalid_json_path(true); return false; } } @@ -1377,22 +1332,58 @@ inline bool JsonbPath::parse_array(Stream* stream) { stream->add_leg_len(); } - if (!stream->exhausted() && stream->peek() == END_ARRAY) { + 
if (stream->exhausted() || stream->peek() != END_ARRAY) { + return false; + } else { stream->skip(1); - stream->set_type(ARRAY_CODE); + } + + //parse array index to int + + std::string_view idx_string(stream->get_leg_ptr(), stream->get_leg_len()); + int index = 0; + + if (stream->get_leg_len() >= 4 && + std::equal(LAST, LAST + 4, stream->get_leg_ptr(), + [](char c1, char c2) { return std::tolower(c1) == std::tolower(c2); })) { + auto pos = idx_string.find(MINUS); + + if (pos != std::string::npos) { + idx_string = idx_string.substr(pos + 1); + + auto result = std::from_chars(idx_string.data(), idx_string.data() + idx_string.size(), + index); + if (result.ec != std::errc()) { + return false; + } + + } else if (stream->get_leg_len() > 4) { + return false; + } + + std::unique_ptr<leg_info> leg(new leg_info(nullptr, 0, -index - 1, ARRAY_CODE)); + path->add_leg_to_leg_vector(std::move(leg)); + return true; - } else { - stream->set_is_invalid_json_path(true); + } + + auto result = std::from_chars(idx_string.data(), idx_string.data() + idx_string.size(), index); + + if (result.ec != std::errc()) { return false; } + + std::unique_ptr<leg_info> leg(new leg_info(nullptr, 0, index, ARRAY_CODE)); + path->add_leg_to_leg_vector(std::move(leg)); + + return true; } -inline bool JsonbPath::parse_member(Stream* stream) { +inline bool JsonbPath::parse_member(Stream* stream, JsonbPath* path) { // advance past the . 
assert(stream->peek() == BEGIN_MEMBER); stream->skip(1); if (stream->exhausted()) { - stream->set_is_invalid_json_path(true); return false; } @@ -1400,7 +1391,9 @@ inline bool JsonbPath::parse_member(Stream* stream) { stream->set_leg_ptr(const_cast<char*>(stream->position())); stream->add_leg_len(); stream->skip(1); - stream->set_type(MEMBER_CODE); + std::unique_ptr<leg_info> leg( + new leg_info(stream->get_leg_ptr(), stream->get_leg_len(), 0, MEMBER_CODE)); + path->add_leg_to_leg_vector(std::move(leg)); return true; } @@ -1435,12 +1428,18 @@ inline bool JsonbPath::parse_member(Stream* stream) { stream->add_leg_len(); } - if (left_quotation_marks != nullptr && right_quotation_marks == nullptr) { - stream->set_is_invalid_json_path(true); + if ((left_quotation_marks != nullptr && right_quotation_marks == nullptr) || + stream->get_leg_ptr() == nullptr || stream->get_leg_len() == 0) { return false; //invalid json path } - stream->set_type(MEMBER_CODE); + if (stream->get_has_escapes()) { + stream->remove_escapes(); + } + + std::unique_ptr<leg_info> leg( + new leg_info(stream->get_leg_ptr(), stream->get_leg_len(), 0, MEMBER_CODE)); + path->add_leg_to_leg_vector(std::move(leg)); return true; } diff --git a/be/src/vec/functions/function_jsonb.cpp b/be/src/vec/functions/function_jsonb.cpp index d9e498b4cf..9fe20e751f 100644 --- a/be/src/vec/functions/function_jsonb.cpp +++ b/be/src/vec/functions/function_jsonb.cpp @@ -436,10 +436,7 @@ private: ColumnString::Offsets& res_offsets, NullMap& null_map, const std::unique_ptr<JsonbWriter>& writer, std::unique_ptr<JsonbToJson>& formater, - const char* l_raw, int l_size, const char* r_raw, - int r_size, bool& is_invalid_json_path) { - String path(r_raw, r_size); - + const char* l_raw, int l_size, JsonbPath& path) { if (null_map[i]) { StringOP::push_null_string(i, res_data, res_offsets, null_map); return; @@ -453,9 +450,9 @@ private: } // value is NOT necessary to be deleted since JsonbValue will not allocate memory - JsonbValue* 
value = doc->getValue()->findPath(r_raw, r_size, is_invalid_json_path, nullptr); + JsonbValue* value = doc->getValue()->findValue(path, nullptr); - if (UNLIKELY(!value) || is_invalid_json_path) { + if (UNLIKELY(!value)) { StringOP::push_null_string(i, res_data, res_offsets, null_map); return; } @@ -535,8 +532,18 @@ public: size_t r_off = roffsets[index_check_const(i, path_const[0]) - 1]; size_t r_size = roffsets[index_check_const(i, path_const[0])] - r_off; const char* r_raw = reinterpret_cast<const char*>(&rdata[r_off]); + + JsonbPath path; + if (!path.seek(r_raw, r_size)) { + return Status::InvalidArgument( + "Json path error: {} for value: {}", + JsonbErrMsg::getErrMsg(JsonbErrType::E_INVALID_JSON_PATH), + std::string_view(reinterpret_cast<const char*>(rdata.data()), + rdata.size())); + } + inner_loop_impl(i, res_data, res_offsets, null_map, writer, formater, l_raw, l_size, - r_raw, r_size, is_invalid_json_path); + path); } else { // will make array string to user writer->reset(); writer->writeStartArray(); @@ -553,9 +560,16 @@ public: writer->writeNull(); continue; } - // value is NOT necessary to be deleted since JsonbValue will not allocate memory - JsonbValue* value = - doc->getValue()->findPath(r_raw, r_size, is_invalid_json_path, nullptr); + + JsonbPath path; + if (!path.seek(r_raw, r_size)) { + return Status::InvalidArgument( + "Json path error: {} for value: {}", + JsonbErrMsg::getErrMsg(JsonbErrType::E_INVALID_JSON_PATH), + std::string_view(reinterpret_cast<const char*>(rdata.data()), + rdata.size())); + } + // if not valid json path , should return error message to user if (is_invalid_json_path) { return Status::InvalidArgument( @@ -564,6 +578,10 @@ public: std::string_view(reinterpret_cast<const char*>(rdata.data()), rdata.size())); } + + // value is NOT necessary to be deleted since JsonbValue will not allocate memory + JsonbValue* value = doc->getValue()->findValue(path, nullptr); + if (UNLIKELY(!value)) { writer->writeNull(); } else { @@ -602,8 
+620,15 @@ public: int r_size = roffsets[i] - roffsets[i - 1]; const char* r_raw = reinterpret_cast<const char*>(&rdata[roffsets[i - 1]]); + JsonbPath path; + if (!path.seek(r_raw, r_size)) { + is_invalid_json_path = true; + StringOP::push_null_string(i, res_data, res_offsets, null_map); + return; + } + inner_loop_impl(i, res_data, res_offsets, null_map, writer, formater, l_raw, l_size, - r_raw, r_size, is_invalid_json_path); + path); } //for } //function static void vector_scalar(FunctionContext* context, const ColumnString::Chars& ldata, @@ -620,12 +645,18 @@ public: std::unique_ptr<JsonbToJson> formater; + JsonbPath path; + if (!path.seek(rdata.data, rdata.size)) { + is_invalid_json_path = true; + return; + } + for (size_t i = 0; i < input_rows_count; ++i) { int l_size = loffsets[i] - loffsets[i - 1]; const char* l_raw = reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]); inner_loop_impl(i, res_data, res_offsets, null_map, writer, formater, l_raw, l_size, - rdata.data, rdata.size, is_invalid_json_path); + path); } //for } //function static void scalar_vector(FunctionContext* context, const StringRef& ldata, @@ -647,8 +678,15 @@ public: int r_size = roffsets[i] - roffsets[i - 1]; const char* r_raw = reinterpret_cast<const char*>(&rdata[roffsets[i - 1]]); + JsonbPath path; + if (!path.seek(r_raw, r_size)) { + is_invalid_json_path = true; + StringOP::push_null_string(i, res_data, res_offsets, null_map); + return; + } + inner_loop_impl(i, res_data, res_offsets, null_map, writer, formater, ldata.data, - ldata.size, r_raw, r_size, is_invalid_json_path); + ldata.size, path); } //for } //function }; @@ -663,8 +701,7 @@ struct JsonbExtractImpl { private: static ALWAYS_INLINE void inner_loop_impl(size_t i, Container& res, NullMap& null_map, const char* l_raw_str, int l_str_size, - const char* r_raw_str, int r_str_size, - bool& is_invalid_json_path) { + JsonbPath& path) { if (null_map[i]) { res[i] = 0; return; @@ -679,10 +716,9 @@ private: } // value is NOT necessary 
to be deleted since JsonbValue will not allocate memory - JsonbValue* value = - doc->getValue()->findPath(r_raw_str, r_str_size, is_invalid_json_path, nullptr); + JsonbValue* value = doc->getValue()->findValue(path, nullptr); - if (UNLIKELY(!value) || is_invalid_json_path) { + if (UNLIKELY(!value)) { if constexpr (!only_check_exists) { null_map[i] = 1; } @@ -761,8 +797,14 @@ public: const char* r_raw_str = reinterpret_cast<const char*>(&rdata[roffsets[i - 1]]); int r_str_size = roffsets[i] - roffsets[i - 1]; - inner_loop_impl(i, res, null_map, l_raw_str, l_str_size, r_raw_str, r_str_size, - is_invalid_json_path); + JsonbPath path; + if (!path.seek(r_raw_str, r_str_size)) { + is_invalid_json_path = true; + res[i] = 0; + return; + } + + inner_loop_impl(i, res, null_map, l_raw_str, l_str_size, path); } //for } //function static void scalar_vector(FunctionContext* context, const StringRef& ldata, @@ -780,8 +822,14 @@ public: const char* r_raw_str = reinterpret_cast<const char*>(&rdata[roffsets[i - 1]]); int r_str_size = roffsets[i] - roffsets[i - 1]; - inner_loop_impl(i, res, null_map, ldata.data, ldata.size, r_raw_str, r_str_size, - is_invalid_json_path); + JsonbPath path; + if (!path.seek(r_raw_str, r_str_size)) { + is_invalid_json_path = true; + res[i] = 0; + return; + } + + inner_loop_impl(i, res, null_map, ldata.data, ldata.size, path); } //for } //function static void vector_scalar(FunctionContext* context, const ColumnString::Chars& ldata, @@ -790,6 +838,12 @@ public: size_t size = loffsets.size(); res.resize(size); + JsonbPath path; + if (!path.seek(rdata.data, rdata.size)) { + is_invalid_json_path = true; + return; + } + for (size_t i = 0; i < loffsets.size(); i++) { if constexpr (only_check_exists) { res[i] = 0; @@ -798,8 +852,7 @@ public: const char* l_raw_str = reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]); int l_str_size = loffsets[i] - loffsets[i - 1]; - inner_loop_impl(i, res, null_map, l_raw_str, l_str_size, rdata.data, rdata.size, - 
is_invalid_json_path); + inner_loop_impl(i, res, null_map, l_raw_str, l_str_size, path); } //for } //function }; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org