This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new f7153b7eae0 branch-3.0: [feat](function) SUBSTRING_INDEX function
delimiter supports dynamic #50149 (#50302)
f7153b7eae0 is described below
commit f7153b7eae0b1ca5dae4f8536da79c64cfd19faa
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sun Apr 27 18:02:22 2025 +0800
branch-3.0: [feat](function) SUBSTRING_INDEX function delimiter supports
dynamic #50149 (#50302)
Cherry-picked from #50149
Co-authored-by: lw112 <[email protected]>
---
be/src/vec/functions/function_string.h | 85 ++++++++-----
.../functions/scalar/SubstringIndex.java | 11 --
.../data/function_p0/test_substring_index.out | Bin 0 -> 2304 bytes
.../function_p0/test_substring_index_columns.out | Bin 0 -> 1124 bytes
.../function_p0/test_substring_index_simple.out | Bin 0 -> 725 bytes
.../suites/function_p0/test_substring_index.groovy | 137 +++++++++++++++++++++
.../test_substring_index_columns.groovy | 114 +++++++++++++++++
.../function_p0/test_substring_index_simple.groovy | 89 +++++++++++++
8 files changed, 396 insertions(+), 40 deletions(-)
diff --git a/be/src/vec/functions/function_string.h
b/be/src/vec/functions/function_string.h
index d9d561ac59e..0b5b38eb49e 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -1975,24 +1975,42 @@ public:
const auto* str_col = assert_cast<const
ColumnString*>(content_column.get());
- [[maybe_unused]] const auto& [delimiter_col, delimiter_const] =
+ // Handle both constant and non-constant delimiter parameters
+ ColumnPtr delimiter_column_ptr;
+ bool delimiter_const = false;
+ std::tie(delimiter_column_ptr, delimiter_const) =
unpack_if_const(block.get_by_position(arguments[1]).column);
- auto delimiter = delimiter_col->get_data_at(0);
- int32_t delimiter_size = delimiter.size;
+ const auto* delimiter_col = assert_cast<const
ColumnString*>(delimiter_column_ptr.get());
- [[maybe_unused]] const auto& [part_num_col, part_const] =
+ ColumnPtr part_num_column_ptr;
+ bool part_num_const = false;
+ std::tie(part_num_column_ptr, part_num_const) =
unpack_if_const(block.get_by_position(arguments[2]).column);
- auto part_number = *((int*)part_num_col->get_data_at(0).data);
+ const ColumnVector<Int32>* part_num_col =
+ assert_cast<const
ColumnVector<Int32>*>(part_num_column_ptr.get());
- if (part_number == 0 || delimiter_size == 0) {
- for (size_t i = 0; i < input_rows_count; ++i) {
+ // For constant multi-character delimiters, create StringRef and
StringSearch only once
+ std::optional<StringRef> const_delimiter_ref;
+ std::optional<StringSearch> const_search;
+ if (delimiter_const && delimiter_col->get_data_at(0).size > 1) {
+ const_delimiter_ref.emplace(delimiter_col->get_data_at(0));
+ const_search.emplace(&const_delimiter_ref.value());
+ }
+
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ auto str = str_col->get_data_at(i);
+ auto delimiter = delimiter_col->get_data_at(delimiter_const ? 0 :
i);
+ int32_t delimiter_size = delimiter.size;
+
+ auto part_number = part_num_col->get_element(part_num_const ? 0 :
i);
+
+ if (part_number == 0 || delimiter_size == 0) {
StringOP::push_empty_string(i, res_chars, res_offsets);
+ continue;
}
- } else if (part_number > 0) {
- if (delimiter_size == 1) {
- // If delimiter is a char, use memchr to split
- for (size_t i = 0; i < input_rows_count; ++i) {
- auto str = str_col->get_data_at(i);
+
+ if (part_number > 0) {
+ if (delimiter_size == 1) {
int32_t offset = -1;
int32_t num = 0;
while (num < part_number) {
@@ -2018,18 +2036,23 @@ public:
StringOP::push_value_string(std::string_view(str.data,
str.size), i,
res_chars, res_offsets);
}
- }
- } else {
- StringRef delimiter_ref(delimiter);
- StringSearch search(&delimiter_ref);
- for (size_t i = 0; i < input_rows_count; ++i) {
- auto str = str_col->get_data_at(i);
+ } else {
+ // For multi-character delimiters
+ // Use pre-created StringRef and StringSearch for constant
delimiters
+ StringRef delimiter_ref = const_delimiter_ref ?
const_delimiter_ref.value()
+ :
StringRef(delimiter);
+ const StringSearch* search_ptr = const_search ?
&const_search.value() : nullptr;
+ StringSearch local_search(&delimiter_ref);
+ if (!search_ptr) {
+ search_ptr = &local_search;
+ }
+
int32_t offset = -delimiter_size;
int32_t num = 0;
while (num < part_number) {
size_t n = str.size - offset - delimiter_size;
// search first match delimter_ref index from src
string among str_offset to end
- const char* pos = search.search(str.data + offset +
delimiter_size, n);
+ const char* pos = search_ptr->search(str.data + offset
+ delimiter_size, n);
if (pos < str.data + str.size) {
offset = pos - str.data;
num++;
@@ -2050,21 +2073,25 @@ public:
res_chars, res_offsets);
}
}
- }
- } else {
- // if part_number is negative
- part_number = -part_number;
- for (size_t i = 0; i < input_rows_count; ++i) {
- auto str = str_col->get_data_at(i);
+ } else {
+ int neg_part_number = -part_number;
auto str_str = str.to_string();
int32_t offset = str.size;
int32_t pre_offset = offset;
int32_t num = 0;
auto substr = str_str;
- while (num <= part_number && offset >= 0) {
- offset = (int)substr.rfind(delimiter, offset);
+
+ // Use pre-created StringRef for constant delimiters
+ StringRef delimiter_str =
+ const_delimiter_ref
+ ? const_delimiter_ref.value()
+ : StringRef(reinterpret_cast<const
char*>(delimiter.data),
+ delimiter.size);
+
+ while (num <= neg_part_number && offset >= 0) {
+ offset = (int)substr.rfind(delimiter_str, offset);
if (offset != -1) {
- if (++num == part_number) {
+ if (++num == neg_part_number) {
break;
}
pre_offset = offset;
@@ -2076,7 +2103,7 @@ public:
}
num = (offset == -1 && num != 0) ? num + 1 : num;
- if (num == part_number) {
+ if (num == neg_part_number) {
if (offset == -1) {
StringOP::push_value_string(std::string_view(str.data,
str.size), i,
res_chars, res_offsets);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java
index bb9e2b749c4..7751578b2a4 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java
@@ -18,7 +18,6 @@
package org.apache.doris.nereids.trees.expressions.functions.scalar;
import org.apache.doris.catalog.FunctionSignature;
-import org.apache.doris.nereids.exceptions.AnalysisException;
import org.apache.doris.nereids.trees.expressions.Expression;
import
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
@@ -53,16 +52,6 @@ public class SubstringIndex extends ScalarFunction
super("substring_index", arg0, arg1, arg2);
}
- @Override
- public void checkLegalityBeforeTypeCoercion() {
- for (int i = 1; i < children.size(); ++i) {
- if (!getArgument(i).isConstant()) {
- throw new AnalysisException(getName()
- + " function except for the first argument, other
parameter must be a constant.");
- }
- }
- }
-
/**
* withChildren.
*/
diff --git a/regression-test/data/function_p0/test_substring_index.out
b/regression-test/data/function_p0/test_substring_index.out
new file mode 100644
index 00000000000..480f2c2b482
Binary files /dev/null and
b/regression-test/data/function_p0/test_substring_index.out differ
diff --git a/regression-test/data/function_p0/test_substring_index_columns.out
b/regression-test/data/function_p0/test_substring_index_columns.out
new file mode 100644
index 00000000000..ef023cd37a2
Binary files /dev/null and
b/regression-test/data/function_p0/test_substring_index_columns.out differ
diff --git a/regression-test/data/function_p0/test_substring_index_simple.out
b/regression-test/data/function_p0/test_substring_index_simple.out
new file mode 100644
index 00000000000..ca236ccc9bd
Binary files /dev/null and
b/regression-test/data/function_p0/test_substring_index_simple.out differ
diff --git a/regression-test/suites/function_p0/test_substring_index.groovy
b/regression-test/suites/function_p0/test_substring_index.groovy
new file mode 100644
index 00000000000..47df1208a00
--- /dev/null
+++ b/regression-test/suites/function_p0/test_substring_index.groovy
@@ -0,0 +1,137 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_substring_index") {
+ sql "set enable_nereids_planner=true"
+ sql "set enable_fallback_to_original_planner=false"
+
+ sql "DROP TABLE IF EXISTS test_substring_index"
+ sql """
+ CREATE TABLE test_substring_index (
+ id INT,
+ str VARCHAR(100),
+ delimiter VARCHAR(10),
+ count INT
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ )
+ """
+
+ sql """
+ INSERT INTO test_substring_index VALUES
+ (1, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'BBB', -1),
+ (2, 'zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06', 'ccc', -1),
+ (3, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'DDD', -1),
+ (4, 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06', 'DDD', -1),
+ (5, 'cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06', 'eee', -1),
+ (6, 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06', 'A_01', -1),
+ (7, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'BBB', 1),
+ (8, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', '|', 2),
+ (9, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', '|', -2),
+ (10, 'ABC', '|', 1),
+ (11, 'ABC|DEF', '|', 0),
+ (12, '', 'ABC', 1),
+ (13, 'ABC|DEF|GHI', '', 1)
+ """
+
+ sql """
+ INSERT INTO test_substring_index VALUES
+ (101, '北京市|上海市|广州市|深圳市|成都市', '|', 2),
+ (102, '北京市|上海市|广州市|深圳市|成都市', '|', -2),
+ (103, '北京市|上海市|广州市|深圳市|成都市', '上海', -1),
+ (104, '中国人民共和国', '人民', 1),
+ (105, '中国人民共和国', '人民', -1),
+ (106, '你好,世界!你好,朋友!', '你好', 1),
+ (107, '你好,世界!你好,朋友!', '你好', -1),
+ (108, '你好,世界!你好,朋友!', '世界', -1),
+ (109, '中文|测试|数据', '测试', 1),
+ (110, '中文|测试|数据', '测试', -1)
+ """
+
+ sql """
+ INSERT INTO test_substring_index VALUES
+ (201, 'hello😀world😀example', '😀', 1),
+ (202, 'hello😀world😀example', '😀', 2),
+ (203, 'hello😀world😀example', '😀', -1),
+ (204, '👋👋hello👋world👋', '👋', 2),
+ (205, '👋👋hello👋world👋', '👋', -2)
+ """
+
+ qt_sql """
+ SELECT
+ id,
+ str,
+ delimiter,
+ count,
+ substring_index(str, delimiter, count) as result
+ FROM test_substring_index
+ WHERE id BETWEEN 1 AND 13
+ ORDER BY id
+ """
+
+ qt_sql """
+ SELECT
+ id,
+ str,
+ delimiter,
+ count,
+ substring_index(str, delimiter, count) as result
+ FROM test_substring_index
+ WHERE id BETWEEN 101 AND 110
+ ORDER BY id
+ """
+
+ qt_sql """
+ SELECT
+ id,
+ str,
+ delimiter,
+ count,
+ substring_index(str, delimiter, count) as result
+ FROM test_substring_index
+ WHERE id BETWEEN 201 AND 205
+ ORDER BY id
+ """
+
+ qt_sql """
+ SELECT
+ a.id,
+ a.str,
+ a.delimiter,
+ b.count,
+ substring_index(a.str, a.delimiter, b.count) as result
+ FROM test_substring_index a
+ JOIN test_substring_index b ON a.id = b.id
+ WHERE a.id IN (1, 3, 7, 8, 101, 103, 201, 203)
+ ORDER BY a.id
+ """
+
+ qt_sql """
+ SELECT
+ substring_index('', '', 1) as empty_all,
+ substring_index('test', '', 1) as empty_delimiter,
+ substring_index('', 'test', 1) as empty_string,
+ substring_index('test', 'test', 0) as zero_count,
+ substring_index('test|test', '|', 999) as large_count,
+ substring_index('test|test', '|', -999) as large_negative_count
+ """
+
+ sql "DROP TABLE IF EXISTS test_substring_index"
+}
\ No newline at end of file
diff --git
a/regression-test/suites/function_p0/test_substring_index_columns.groovy
b/regression-test/suites/function_p0/test_substring_index_columns.groovy
new file mode 100644
index 00000000000..80fb3b5318d
--- /dev/null
+++ b/regression-test/suites/function_p0/test_substring_index_columns.groovy
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_substring_index_columns") {
+ sql "set enable_nereids_planner=true"
+ sql "set enable_fallback_to_original_planner=false"
+
+ sql "DROP TABLE IF EXISTS test_substring_index_compat"
+ sql """
+ CREATE TABLE test_substring_index_compat (
+ no INT,
+ sub_str VARCHAR(50),
+ str VARCHAR(100)
+ ) ENGINE=OLAP
+ DUPLICATE KEY(no)
+ DISTRIBUTED BY HASH(no) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ )
+ """
+
+ sql """
+ INSERT INTO test_substring_index_compat VALUES
+ (1, 'BBB', 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06'),
+ (2, 'ccc', 'zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06'),
+ (3, 'DDD', 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06'),
+ (4, 'DDD', 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06'),
+ (5, 'eee', 'cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06'),
+ (6, 'A_01', 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06')
+ """
+
+ qt_sql """
+ SELECT
+ no,
+ sub_str AS '分隔符字符串',
+ str AS '需要截取的字符串',
+ substring_index(str, sub_str, -1) AS '动态分隔符结果'
+ FROM test_substring_index_compat
+ ORDER BY no
+ """
+
+ sql """
+ INSERT INTO test_substring_index_compat VALUES
+ (7, '市', '北京市|上海市|广州市|深圳市'),
+ (8, '人民', '中华人民共和国'),
+ (9, '分隔符', '中文分隔符测试分隔符数据'),
+ (10, '你好', '你好,世界!你好,朋友!')
+ """
+
+ qt_sql """
+ SELECT
+ no,
+ sub_str AS '分隔符字符串',
+ str AS '需要截取的字符串',
+ substring_index(str, sub_str, 1) AS '正向截取',
+ substring_index(str, sub_str, -1) AS '反向截取'
+ FROM test_substring_index_compat
+ WHERE no > 6
+ ORDER BY no
+ """
+
+ sql "DROP TABLE IF EXISTS test_dynamic_params"
+ sql """
+ CREATE TABLE test_dynamic_params (
+ id INT,
+ source_str VARCHAR(100),
+ delimiter VARCHAR(20),
+ count_val INT
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ )
+ """
+
+ sql """
+ INSERT INTO test_dynamic_params VALUES
+ (1, 'field1,field2,field3,field4', ',', 2),
+ (2, 'field1,field2,field3,field4', ',', -1),
+ (3, 'AAA_01|BBB_02|CCC_03', '|', 2),
+ (4, 'AAA_01|BBB_02|CCC_03', '|', -2),
+ (5, '中文分隔符测试分隔符数据', '分隔符', 1),
+ (6, '中文分隔符测试分隔符数据', '分隔符', -1)
+ """
+
+ qt_sql """
+ SELECT
+ id,
+ source_str,
+ delimiter,
+ count_val,
+ substring_index(source_str, delimiter, count_val) AS result
+ FROM test_dynamic_params
+ ORDER BY id
+ """
+
+ sql "DROP TABLE IF EXISTS test_substring_index_compat"
+ sql "DROP TABLE IF EXISTS test_dynamic_params"
+}
\ No newline at end of file
diff --git
a/regression-test/suites/function_p0/test_substring_index_simple.groovy
b/regression-test/suites/function_p0/test_substring_index_simple.groovy
new file mode 100644
index 00000000000..82d97e23782
--- /dev/null
+++ b/regression-test/suites/function_p0/test_substring_index_simple.groovy
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_substring_index_simple") {
+ sql "set enable_nereids_planner=true"
+ sql "set enable_fallback_to_original_planner=false"
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 1) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', -1) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 2) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', -2) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', 'XYZ', 1) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', 'XYZ', -1) as
result"""
+
+ qt_sql """SELECT substring_index('', '|', 1) as result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '', 1) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 0) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 10) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', -10) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01||BBB_02||CCC_03', '||', 1) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01||BBB_02||CCC_03', '||', -1) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', 'BBB', -1) as
result"""
+
+ qt_sql """SELECT substring_index('|AAA_01|BBB_02|CCC_03', '|', 1) as
result"""
+
+ qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03|', '|', -1) as
result"""
+
+ qt_sql """SELECT substring_index('北京市|上海市|广州市', '|', 2) as result"""
+
+ qt_sql """SELECT substring_index('北京市分隔符上海市分隔符广州市', '分隔符', 1) as result"""
+
+ qt_sql """SELECT substring_index('北京市分隔符上海市分隔符广州市', '分隔符', -1) as result"""
+
+ qt_sql """SELECT substring_index('hello😀world😀example', '😀', 1) as
result"""
+
+ qt_sql """SELECT substring_index('hello😀world😀example', '😀', -1) as
result"""
+
+ qt_sql """
+ SELECT substring_index('AAA_01|BBB_02|CCC_03', (SELECT '|'), 2) as result
+ """
+
+ qt_sql """
+ SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', (SELECT 2)) as result
+ """
+
+ qt_sql """
+ SELECT substring_index('AAA_01|BBB_02|CCC_03', (SELECT '|'), (SELECT 2))
as result
+ """
+
+ qt_sql """
+ SELECT substring_index('AAA_01|BBB_02|CCC_03', concat('|'), 2) as result
+ """
+
+ qt_sql """
+ SELECT substring_index('中文_分隔符_测试_分隔符_数据', concat('分', '隔', '符'), 1) as
result
+ """
+
+ qt_sql """
+ SELECT
+ substring_index('AAA_01|BBB_02|CCC_03', 'BBB', -1) as result1,
+ substring_index('AAA_01|BBB_02|CCC_03', 'bbb', -1) as result2
+ """
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]