This is an automated email from the ASF dual-hosted git repository. morrysnow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 5e64736aa96 [fix](Nereids) string constant folding process regex delim by mistake (#48783) 5e64736aa96 is described below commit 5e64736aa963c163247b3ac777091d4a11cf899a Author: morrySnow <zhangwen...@selectdb.com> AuthorDate: Fri Mar 7 16:36:30 2025 +0800 [fix](Nereids) string constant folding process regex delim by mistake (#48783) ### What problem does this PR solve? Related PR: #40441 Problem Summary: Delimiters in split-related string functions that contain characters with special meaning in regular expressions should be escaped. The affected characters are: .$|()[{^?*+\ --- .../functions/executable/StringArithmetic.java | 57 +++++----- .../fold_constant_string_arithmatic.groovy | 123 ++++++++++++++++++++- 2 files changed, 149 insertions(+), 31 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java index bc056a03bcb..18ec333882c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java @@ -38,6 +38,9 @@ import org.apache.doris.nereids.trees.expressions.literal.StringLikeLiteral; import org.apache.doris.nereids.trees.expressions.literal.StringLiteral; import org.apache.doris.nereids.trees.expressions.literal.TinyIntLiteral; import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; +import org.apache.doris.nereids.types.ArrayType; + +import com.google.common.collect.ImmutableList; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; @@ -50,6 +53,7 @@ import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.List; +import java.util.regex.Pattern; 
/** * executable functions: @@ -658,14 +662,18 @@ public class StringArithmetic { } /** - * Executable arithmetic functions split_by_char + * Executable arithmetic functions split_by_string */ - @ExecFunction(name = "split_by_char") - public static Expression splitByChar(StringLikeLiteral first, StringLikeLiteral second) { - String[] result = first.getValue().split(second.getValue(), -1); + @ExecFunction(name = "split_by_string") + public static Expression splitByString(StringLikeLiteral first, StringLikeLiteral second) { + if (first.getValue().isEmpty()) { + return new ArrayLiteral(ImmutableList.of(), ArrayType.of(first.getDataType())); + } + int limit = second.getValue().isEmpty() ? 0 : -1; + String[] result = first.getValue().split(Pattern.quote(second.getValue()), limit); List<Literal> items = new ArrayList<>(); - for (int i = 1; i < result.length; i++) { - items.add((Literal) castStringLikeLiteral(first, result[i])); + for (String s : result) { + items.add((Literal) castStringLikeLiteral(first, s)); } return new ArrayLiteral(items); } @@ -675,35 +683,34 @@ public class StringArithmetic { */ @ExecFunction(name = "split_part") public static Expression splitPart(StringLikeLiteral first, StringLikeLiteral chr, IntegerLiteral number) { + if (number.getValue() == 0) { + return new NullLiteral(first.getDataType()); + } + if (chr.getValue().isEmpty()) { + return castStringLikeLiteral(first, ""); + } + if (first.getValue().isEmpty()) { + return new NullLiteral(first.getDataType()); + } if (first.getValue().equals(chr.getValue())) { if (Math.abs(number.getValue()) == 1 || Math.abs(number.getValue()) == 2) { return castStringLikeLiteral(first, ""); + } else { + return new NullLiteral(first.getDataType()); } } String separator = chr.getValue(); - String[] parts = null; + String[] parts; if (number.getValue() < 0) { StringBuilder sb = new StringBuilder(first.getValue()); - StringBuilder seperatorBuilder = new StringBuilder(separator); - separator = 
seperatorBuilder.reverse().toString(); - if (".$|()[{^?*+\\".contains(separator) || separator.startsWith("\\")) { - separator = "\\" + separator; - } - parts = sb.reverse().toString().split(separator, -1); + StringBuilder separatorBuilder = new StringBuilder(separator); + separator = separatorBuilder.reverse().toString(); + parts = sb.reverse().toString().split(Pattern.quote(separator), -1); } else { - if (".$|()[{^?*+\\".contains(separator) || separator.startsWith("\\")) { - separator = "\\" + separator; - } - parts = first.getValue().split(separator, -1); + parts = first.getValue().split(Pattern.quote(separator), -1); } - if (parts.length < Math.abs(number.getValue()) || number.getValue() == 0) { - if (parts.length == Math.abs(number.getValue())) { - if (number.getValue() < 0 && first.getValue().startsWith(chr.getValue()) - || number.getValue() > 0 && first.getValue().endsWith(chr.getValue())) { - return castStringLikeLiteral(first, ""); - } - } + if (parts.length < Math.abs(number.getValue())) { return new NullLiteral(first.getDataType()); } else if (number.getValue() < 0) { StringBuilder result = new StringBuilder(parts[Math.abs(number.getValue()) - 1]); @@ -721,7 +728,7 @@ public class StringArithmetic { if (chr.getValue().isEmpty()) { return chr; } - String[] parts = first.getValue().split(chr.getValue(), -1); + String[] parts = first.getValue().split(Pattern.quote(chr.getValue()), -1); if (Math.abs(number.getValue()) >= parts.length) { return first; } diff --git a/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy b/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy index e3ed0024b6f..0f2ddac1fa9 100644 --- a/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy +++ b/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy @@ -461,18 +461,80 @@ 
suite("fold_constant_string_arithmatic") { testFoldConst("select split_by_string(cast('abc' as string), cast('::' as string))") testFoldConst("select split_by_string('上海天津北京杭州', '北')") testFoldConst("select split_by_string('abccccc', 'c')") + testFoldConst("select split_by_string('abcde','')") + testFoldConst("select split_by_string('你a好b世c界','')") + testFoldConst("select split_by_string('12553','')") + testFoldConst("select split_by_string('','')") + testFoldConst("select split_by_string('',',')") + testFoldConst("select split_by_string('','a')") + testFoldConst("select split_by_string('','abc')") + testFoldConst("select split_by_string('abc','')") + testFoldConst("select split_by_string('a1b1c1d','1')") + testFoldConst("select split_by_string(',,,',',')") + testFoldConst("select split_by_string('a,b,c,abcde',',')") + testFoldConst("select split_by_string(',,a,b,c,',',')") + testFoldConst("select split_by_string('null',',')") + testFoldConst("select split_by_string('1,,2,3,,4,5,,abcde', ',,')") + testFoldConst("select split_by_string('abcde','')") + testFoldConst("select split_by_string('1,,2,3,,,,,,4,5, abcde', ',,')") + testFoldConst("select split_by_string(',,,,',',,')") + testFoldConst("select split_by_string('a,,b,,c',',,')") + testFoldConst("select split_by_string('a,,b,,c,,',',,')") + testFoldConst("select split_by_string(',,a,,b,,c,,',',,')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','..')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','\$\$')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','||')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','((')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','))')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','[[')") + testFoldConst("SELECT 
split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','{{')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','^^')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','??')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','**')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','++')") + testFoldConst("SELECT split_by_string('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','\\\\')") // split_part - testFoldConst("select split_part('a,b,c', ',', -1)") - testFoldConst("select split_part('abc##123###xyz', '##', 0)") + testFoldConst("select split_part('a,b,c', '', -2)") + testFoldConst("select split_part('a,b,c', '', -1)") + testFoldConst("select split_part('a,b,c', '', 0)") + testFoldConst("select split_part('a,b,c', '', 1)") + testFoldConst("select split_part('a,b,c', '', 2)") + testFoldConst("select split_part('', '', -2)") + testFoldConst("select split_part('', '', -1)") + testFoldConst("select split_part('', '', 0)") + testFoldConst("select split_part('', '', 1)") + testFoldConst("select split_part('', '', 2)") + testFoldConst("select split_part('', 'abc', -2)") + testFoldConst("select split_part('', 'abc', -1)") + testFoldConst("select split_part('', 'abc', 0)") + testFoldConst("select split_part('', 'abc', 1)") + testFoldConst("select split_part('', 'abc', 2)") + testFoldConst("select split_part('abc##123###xyz', '##', -10)") + testFoldConst("select split_part('abc##123###xyz', '##', -4)") + testFoldConst("select split_part('abc##123###xyz', '##', -3)") + testFoldConst("select split_part('abc##123###xyz', '##', -2)") testFoldConst("select split_part('abc##123###xyz', '##', -1)") + testFoldConst("select split_part('abc##123###xyz', '##', 0)") testFoldConst("select split_part('abc##123###xyz', '##', 1)") - testFoldConst("select split_part('abc##123###xyz', '##', -2)") + testFoldConst("select split_part('abc##123###xyz', 
'##', 2)") testFoldConst("select split_part('abc##123###xyz', '##', 3)") - testFoldConst("select split_part('abc##123###xyz', '##', -4)") - testFoldConst("select split_part('abc##123###xyz', '##', 5)") + testFoldConst("select split_part('abc##123###xyz', '##', 4)") + testFoldConst("select split_part('abc##123###xyz', '##', 10)") + testFoldConst("select split_part('a,b,c', ',', -100)") + testFoldConst("select split_part('a,b,c', ',', -5)") + testFoldConst("select split_part('a,b,c', ',', -4)") + testFoldConst("select split_part('a,b,c', ',', -3)") + testFoldConst("select split_part('a,b,c', ',', -2)") + testFoldConst("select split_part('a,b,c', ',', -1)") + testFoldConst("select split_part('a,b,c', ',', -0)") + testFoldConst("select split_part('a,b,c', ',', 0)") + testFoldConst("select split_part('a,b,c', ',', 1)") testFoldConst("select split_part('a,b,c', ',', 2)") + testFoldConst("select split_part('a,b,c', ',', 3)") + testFoldConst("select split_part('a,b,c', ',', 4)") testFoldConst("select split_part('a,b,c', ',', 5)") + testFoldConst("select split_part('a,b,c', ',', 100)") testFoldConst("select split_part(cast('a,b,c' as string), cast(',' as string), -1)") testFoldConst("select split_part(cast('a,b,c' as string), cast(',' as string), 2)") testFoldConst("select split_part(cast('a,b,c' as string), cast(',' as string), 5)") @@ -485,6 +547,7 @@ suite("fold_constant_string_arithmatic") { testFoldConst("select split_part('hello world', ' ', -2)") testFoldConst("select split_part('hello world', ' ', 2)") testFoldConst("select split_part('hello world', ' ', -3)") + testFoldConst("select split_part('hello world', ' ', -3)") testFoldConst("SELECT split_part('哈哈哈AAA','A', -5)") testFoldConst("SELECT split_part('哈哈哈AAA','A', -4)") testFoldConst("SELECT split_part('哈哈哈AAA','A', -3)") @@ -505,7 +568,31 @@ suite("fold_constant_string_arithmatic") { testFoldConst("SELECT split_part('哈哈哈AA+','A', 2)") testFoldConst("SELECT split_part('哈哈哈AA+','A', 3)") testFoldConst("SELECT 
split_part('哈哈哈AA+','A', 4)") - + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','..', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','\$\$', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','||', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','((', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','))', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','[[', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','{{', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','^^', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','??', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','**', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','++', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','\\\\', 1)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','..', 2)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','\$\$', 2)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','||', 2)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','((', 2)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','))', 2)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','[[', 2)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','{{', 2)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','^^', 2)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','??', 2)") + testFoldConst("SELECT 
split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','**', 2)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','++', 2)") + testFoldConst("SELECT split_part('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','\\\\', 2)") + // starts_with testFoldConst("select starts_with('hello world','hello')") testFoldConst("select starts_with('hello world',null)") @@ -650,6 +737,30 @@ suite("fold_constant_string_arithmatic") { testFoldConst("SELECT substring_index('哈哈哈AA+','A', 2)") testFoldConst("SELECT substring_index('哈哈哈AA+','A', 3)") testFoldConst("SELECT substring_index('哈哈哈AA+','A', 4)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','..', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','\$\$', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','||', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','((', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','))', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','[[', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','{{', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','^^', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','??', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','**', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','++', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','\\\\', 1)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','..', 2)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','\$\$', 2)") + testFoldConst("SELECT 
substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','||', 2)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','((', 2)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','))', 2)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','[[', 2)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','{{', 2)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','^^', 2)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','??', 2)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','**', 2)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','++', 2)") + testFoldConst("SELECT substring_index('a..b\$\$c||d((e))f[[g{{h^^i??j**k++l\\\\m','\\\\', 2)") // trim testFoldConst("select trim('11111', 11)") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org