linrrzqqq commented on code in PR #60892:
URL: https://github.com/apache/doris/pull/60892#discussion_r2934827599
##########
be/src/vec/functions/function_string.h:
##########
@@ -2163,44 +2158,143 @@ class FunctionSplitByString : public IFunction {
}
}
- void split_empty_delimiter(const StringRef& str_ref, ColumnString::Chars&
column_string_chars,
- ColumnString::Offsets& column_string_offsets,
- ColumnArray::Offset64& string_pos,
- ColumnArray::Offset64& dest_pos) const {
+ static void split_empty_delimiter(const StringRef& str_ref,
+ ColumnString::Chars& column_string_chars,
+ ColumnString::Offsets&
column_string_offsets,
+ ColumnArray::Offset64& string_pos,
+ ColumnArray::Offset64& dest_pos, Int32
limit_value) {
const size_t old_size = column_string_chars.size();
const size_t new_size = old_size + str_ref.size;
column_string_chars.resize(new_size);
memcpy(column_string_chars.data() + old_size, str_ref.data,
str_ref.size);
- if (simd::VStringFunctions::is_ascii(str_ref)) {
- const auto size = str_ref.size;
-
- const auto nested_old_size = column_string_offsets.size();
- const auto nested_new_size = nested_old_size + size;
- column_string_offsets.resize(nested_new_size);
- std::iota(column_string_offsets.data() + nested_old_size,
- column_string_offsets.data() + nested_new_size,
string_pos + 1);
-
- string_pos += size;
- dest_pos += size;
- // The above code is equivalent to the code in the following
comment.
- // for (size_t i = 0; i < str_ref.size; i++) {
- // string_pos++;
- // column_string_offsets.push_back(string_pos);
- // (*dest_nested_null_map).push_back(false);
- // dest_pos++;
- // }
+
+ if (limit_value > 0) {
+ // With limit: split character by character up to limit-1, then
remainder
+ Int32 split_count = 0;
+ size_t i = 0;
+ if (simd::VStringFunctions::is_ascii(str_ref)) {
+ for (; i < str_ref.size; i++) {
+ if (split_count == limit_value - 1) {
+ // remainder
+ string_pos += str_ref.size - i;
+ column_string_offsets.push_back(string_pos);
+ dest_pos++;
+ return;
+ }
+ string_pos++;
+ column_string_offsets.push_back(string_pos);
+ dest_pos++;
+ split_count++;
+ }
+ } else {
+ for (size_t utf8_char_len = 0; i < str_ref.size; i +=
utf8_char_len) {
+ utf8_char_len = UTF8_BYTE_LENGTH[(unsigned
char)str_ref.data[i]];
+ if (split_count == limit_value - 1) {
+ // remainder
+ string_pos += str_ref.size - i;
+ column_string_offsets.push_back(string_pos);
+ dest_pos++;
+ return;
+ }
+ string_pos += utf8_char_len;
+ column_string_offsets.push_back(string_pos);
+ dest_pos++;
+ split_count++;
+ }
+ }
} else {
- for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i +=
utf8_char_len) {
- utf8_char_len = UTF8_BYTE_LENGTH[(unsigned
char)str_ref.data[i]];
+ // No limit: original behavior
+ if (simd::VStringFunctions::is_ascii(str_ref)) {
+ const auto size = str_ref.size;
+
+ const auto nested_old_size = column_string_offsets.size();
+ const auto nested_new_size = nested_old_size + size;
+ column_string_offsets.resize(nested_new_size);
+ std::iota(column_string_offsets.data() + nested_old_size,
+ column_string_offsets.data() + nested_new_size,
string_pos + 1);
+
+ string_pos += size;
+ dest_pos += size;
+ } else {
+ for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i +=
utf8_char_len) {
+ utf8_char_len = UTF8_BYTE_LENGTH[(unsigned
char)str_ref.data[i]];
- string_pos += utf8_char_len;
- column_string_offsets.push_back(string_pos);
- dest_pos++;
+ string_pos += utf8_char_len;
+ column_string_offsets.push_back(string_pos);
+ dest_pos++;
+ }
}
}
}
};
+struct SplitByStringTwoArgImpl {
+ static DataTypes get_variadic_argument_types() {
+ return {std::make_shared<DataTypeString>(),
std::make_shared<DataTypeString>()};
+ }
+
+ static Status execute_impl(FunctionContext* /*context*/, Block& block,
+ const ColumnNumbers& arguments, uint32_t result,
+ size_t input_rows_count) {
+ DCHECK_EQ(arguments.size(), 2);
+ return SplitByStringExecutor::execute_core(block, arguments, result,
input_rows_count, -1);
+ }
+};
+
+struct SplitByStringThreeArgImpl {
+ static DataTypes get_variadic_argument_types() {
+ return {std::make_shared<DataTypeString>(),
std::make_shared<DataTypeString>(),
+ std::make_shared<DataTypeInt32>()};
+ }
+
+ static Status execute_impl(FunctionContext* /*context*/, Block& block,
+ const ColumnNumbers& arguments, uint32_t result,
+ size_t input_rows_count) {
+ DCHECK_EQ(arguments.size(), 3);
+ const auto& [limit_column, limit_is_const] =
+ unpack_if_const(block.get_by_position(arguments[2]).column);
+ DCHECK(limit_is_const) << "limit argument of split_by_string must be a
constant";
+ auto limit_value = assert_cast<const
ColumnInt32&>(*limit_column).get_element(0);
Review Comment:
The `DCHECK(limit_is_const)` here is incorrect. when
`use_default_implementation_for_constants()` returns true (the default) and all
arguments are constant, unwrap every ColumnConst into its underlying data
column before calling execute_impl.
In that case limit_is_const will be false, causing the DCHECK to failed, so
just remove it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]