This is an automated email from the ASF dual-hosted git repository. lihaopeng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 2b328eafbb [function](string_function) add new string function 'extract_url_parameter' (#13323) 2b328eafbb is described below commit 2b328eafbb20f5ec206912b41165616f88727f63 Author: DongLiang-0 <46414265+donglian...@users.noreply.github.com> AuthorDate: Thu Oct 20 11:11:43 2022 +0800 [function](string_function) add new string function 'extract_url_parameter' (#13323) --- be/src/util/url_parser.cpp | 57 ++++++++++++++++++++++ be/src/util/url_parser.h | 5 ++ be/src/vec/functions/function_string.cpp | 1 + be/src/vec/functions/function_string.h | 50 +++++++++++++++++++ be/test/vec/function/function_string_test.cpp | 25 ++++++++++ .../string-functions/extract_url_parameter.md | 50 +++++++++++++++++++ .../string-functions/extract_url_parameter.md | 50 +++++++++++++++++++ gensrc/script/doris_builtins_functions.py | 1 + 8 files changed, 239 insertions(+) diff --git a/be/src/util/url_parser.cpp b/be/src/util/url_parser.cpp index 0ce0913f3d..00d2783bd6 100644 --- a/be/src/util/url_parser.cpp +++ b/be/src/util/url_parser.cpp @@ -17,6 +17,8 @@ #include "util/url_parser.h" +#include <string> + #include "runtime/string_value.hpp" namespace doris { @@ -344,4 +346,59 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) { } } +std::string UrlParser::extract_url(const StringValue& url, const StringValue& name) { + std::string result; + std::string str_name = name.to_string(); + // Remove leading and trailing spaces. + StringValue trimmed_url = url.trim(); + // find '?' + int32_t question_pos = _s_question_search.search(&trimmed_url); + if (question_pos < 0) { + // this url no parameters. + // Example: https://doris.apache.org/ + return result; + } + // find '#' + int32_t hash_pos = _s_hash_search.search(&trimmed_url); + std::string sub_url = ""; + if (hash_pos < 0) { + sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.len - question_pos - 1) + .to_string(); + } else { + sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1).to_string(); + } + + // find '&' and '=', and extract target parameter + // Example: k1=aa&k2=bb&k3=cc&test=dd + std::string::size_type and_pod; + std::string::size_type len = sub_url.length(); + std::string key_url; + while (true) { + if (len <= 0) { + break; + } + and_pod = sub_url.find_first_of('&'); + if (and_pod != std::string::npos) { + key_url = sub_url.substr(0, and_pod); + sub_url = sub_url.substr(and_pod + 1, len - and_pod); + } else { + key_url = sub_url; + sub_url = ""; + } + len = sub_url.length(); + + std::string::size_type eq_pod = key_url.find_first_of('='); + if (eq_pod == std::string::npos) { + // invalid url. like: k1&k2=bb + continue; + } + int32_t key_len = key_url.length(); + std::string key = key_url.substr(0, eq_pod); + if (str_name == key) { + result = key_url.substr(eq_pod + 1, key_len - eq_pod); + return result; + } + } + return result; +} } // namespace doris diff --git a/be/src/util/url_parser.h b/be/src/util/url_parser.h index 3363f65e6b..0d212b1acd 100644 --- a/be/src/util/url_parser.h +++ b/be/src/util/url_parser.h @@ -60,6 +60,11 @@ public: // If part did not match any of the url part constants, returns INVALID. static UrlPart get_url_part(const StringValue& part); + // Extract parameter value from url + // Example for url: + // http://doris.apache.org?k1=aa&k2=bb&k3=cc&test=dd#999 + static std::string extract_url(const StringValue& url, const StringValue& name); + private: // Constants representing parts of a URL. static const StringValue _s_url_authority; diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index b8bf150249..7033eee2f8 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -678,6 +678,7 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_function<FunctionFromBase64>(); factory.register_function<FunctionSplitPart>(); factory.register_function<FunctionStringMd5AndSM3<MD5Sum>>(); + factory.register_function<FunctionExtractURLParameter>(); factory.register_function<FunctionStringParseUrl>(); factory.register_function<FunctionMoneyFormat<MoneyFormatDoubleImpl>>(); factory.register_function<FunctionMoneyFormat<MoneyFormatInt64Impl>>(); diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 9588e7a2c0..403bc7b2f5 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -31,6 +31,8 @@ #include <fmt/ranges.h> #include <cstdint> +#include <memory_resource> +#include <string> #include <string_view> #include "exprs/math_functions.h" @@ -1248,6 +1250,54 @@ public: } }; +class FunctionExtractURLParameter : public IFunction { +public: + static constexpr auto name = "extract_url_parameter"; + static FunctionPtr create() { return std::make_shared<FunctionExtractURLParameter>(); } + String get_name() const override { return name; } + size_t get_number_of_arguments() const override { return 2; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return std::make_shared<DataTypeString>(); + } + + bool use_default_implementation_for_constants() const override { return true; } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) override { + auto col_url = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + auto col_parameter = + block.get_by_position(arguments[1]).column->convert_to_full_column_if_const(); + auto url_col = assert_cast<const ColumnString*>(col_url.get()); + auto parameter_col = assert_cast<const ColumnString*>(col_parameter.get()); + + ColumnString::MutablePtr col_res = ColumnString::create(); + + for (int i = 0; i < input_rows_count; ++i) { + auto source = url_col->get_data_at(i); + auto param = parameter_col->get_data_at(i); + StringValue url_str(const_cast<char*>(source.data), source.size); + StringValue parameter_str(const_cast<char*>(param.data), param.size); + + std::string result = extract_url(url_str, parameter_str); + + col_res->insert_data(result.data(), result.length()); + } + + block.replace_by_position(result, std::move(col_res)); + return Status::OK(); + } + +private: + std::string extract_url(StringValue url, StringValue parameter) { + if (url.len == 0 || parameter.len == 0) { + return ""; + } + return UrlParser::extract_url(url, parameter); + } +}; + class FunctionStringParseUrl : public IFunction { public: static constexpr auto name = "parse_url"; diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index 3ee81d1391..e996bb13f9 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ -953,6 +953,31 @@ TEST(function_string_test, function_sm4_decrypt_test) { } } +TEST(function_string_test, function_extract_url_parameter_test) { + std::string func_name = "extract_url_parameter"; + InputTypeSet input_types = {TypeIndex::String, TypeIndex::String}; + DataSet data_set = { + {{VARCHAR(""), VARCHAR("k1")}, {VARCHAR("")}}, + {{VARCHAR("http://doris.apache.org?k1=aa"), VARCHAR("")}, {VARCHAR("")}}, + {{VARCHAR("https://doris.apache.org/"), VARCHAR("k1")}, {VARCHAR("")}}, + {{VARCHAR("http://doris.apache.org?"), VARCHAR("k1")}, {VARCHAR("")}}, + {{VARCHAR("http://doris.apache.org?k1=aa"), VARCHAR("k1")}, {VARCHAR("aa")}}, + {{VARCHAR("http://doris.apache.org:8080?k1&k2=bb#99"), VARCHAR("k1")}, {VARCHAR("")}}, + {{VARCHAR("http://doris.apache.org?k1=aa#999"), VARCHAR("k1")}, {VARCHAR("aa")}}, + {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("k1")}, + {VARCHAR("aa")}}, + {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("k2")}, + {VARCHAR("bb")}}, + {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("999")}, + {VARCHAR("")}}, + {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("k3")}, + {VARCHAR("")}}, + {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("test")}, + {VARCHAR("dd")}}}; + + check_function<DataTypeString, true>(func_name, input_types, data_set); +} + TEST(function_string_test, function_parse_url_test) { std::string func_name = "parse_url"; diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md b/docs/en/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md new file mode 100644 index 0000000000..eb4bd8301b --- /dev/null +++ b/docs/en/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md @@ -0,0 +1,50 @@ +--- +{ +"title": "extract_url_parameter", +"language": "en" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE +file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on +an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## extract_url_parameter +### description +#### Syntax + +`VARCHAR extract_url_parameter(VARCHAR url, VARCHAR name)` + + +Returns the value of the "name" parameter in the URL, if present. Otherwise an empty string. +If there are many parameters with this name, the first occurrence is returned. +This function works assuming that the parameter name is encoded in the URL exactly as it was in the passed parameter. + +``` +mysql> SELECT extract_url_parameter ("http://doris.apache.org?k1=aa&k2=bb&test=cc#999", "k2"); ++--------------------------------------------------------------------------------+ +| extract_url_parameter('http://doris.apache.org?k1=aa&k2=bb&test=cc#999', 'k2') | ++--------------------------------------------------------------------------------+ +| bb | ++--------------------------------------------------------------------------------+ +``` + +### keywords + EXTRACT URL PARAMETER diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md new file mode 100644 index 0000000000..2a17ede2d2 --- /dev/null +++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md @@ -0,0 +1,50 @@ +--- +{ +"title": "extract_url_parameter", +"language": "zh-CN" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE +file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on +an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## extract_url_parameter +### description +#### Syntax + +`VARCHAR extract_url_parameter(VARCHAR url, VARCHAR name)` + + +返回 URL 中“name”参数的值(如果存在)。否则为空字符串。 +如果有许多具有此名称的参数,则返回第一个出现的参数。 +此函数的工作假设参数名称在 URL 中的编码方式与在传递参数中的编码方式完全相同。 + +``` +mysql> SELECT extract_url_parameter ("http://doris.apache.org?k1=aa&k2=bb&test=cc#999", "k2"); ++--------------------------------------------------------------------------------+ +| extract_url_parameter('http://doris.apache.org?k1=aa&k2=bb&test=cc#999', 'k2') | ++--------------------------------------------------------------------------------+ +| bb | ++--------------------------------------------------------------------------------+ +``` + +### keywords + EXTRACT URL PARAMETER diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index ed71c714fc..e832428207 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -2137,6 +2137,7 @@ visible_functions = [ [['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'], '_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE', '', '', 'vec', 'ALWAYS_NULLABLE'], + [['extract_url_parameter'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],'','', '', 'vec', ''], # Longtext function [['substr', 'substring'], 'STRING', ['STRING', 'INT'], --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org