This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 3a49156e30 [performance] (vectorization)optimize In Expr (#11826)
3a49156e30 is described below
commit 3a49156e30f3a025e5100d388961a082843387b7
Author: wangbo <[email protected]>
AuthorDate: Wed Aug 17 10:46:37 2022 +0800
[performance] (vectorization)optimize In Expr (#11826)
Co-authored-by: Wang Bo <[email protected]>
---
be/src/exprs/aggregate_functions.cpp | 2 +-
be/src/exprs/create_predicate_function.h | 2 +-
be/src/exprs/hybrid_set.h | 94 +++++++++++++++++-
be/src/vec/exec/join/vhash_join_node.cpp | 9 +-
be/src/vec/exec/join/vhash_join_node.h | 2 +
be/src/vec/functions/in.h | 85 +++++++++++++---
.../data/query/sql_functions/test_in_expr.out | 45 +++++++++
.../suites/query/sql_functions/test_in_expr.groovy | 110 +++++++++++++++++++++
8 files changed, 330 insertions(+), 19 deletions(-)
diff --git a/be/src/exprs/aggregate_functions.cpp
b/be/src/exprs/aggregate_functions.cpp
index 992bd42b74..2ae13ffaef 100644
--- a/be/src/exprs/aggregate_functions.cpp
+++ b/be/src/exprs/aggregate_functions.cpp
@@ -1397,7 +1397,7 @@ public:
static const int STRING_LENGTH_RECORD_LENGTH = 4;
private:
- StringValueSet _set;
+ StringSet _set;
// _type is serialized into buffer by one byte
FunctionContext::Type _type;
};
diff --git a/be/src/exprs/create_predicate_function.h
b/be/src/exprs/create_predicate_function.h
index 6795777cc6..5aa0e2347f 100644
--- a/be/src/exprs/create_predicate_function.h
+++ b/be/src/exprs/create_predicate_function.h
@@ -39,7 +39,7 @@ public:
template <PrimitiveType type>
static BasePtr get_function() {
using CppType = typename PrimitiveTypeTraits<type>::CppType;
- using Set = std::conditional_t<std::is_same_v<CppType, StringValue>,
StringValueSet,
+ using Set = std::conditional_t<std::is_same_v<CppType, StringValue>,
StringSet,
HybridSet<type, is_vec>>;
return new (std::nothrow) Set();
};
diff --git a/be/src/exprs/hybrid_set.h b/be/src/exprs/hybrid_set.h
index e43b2c29ea..20c9721675 100644
--- a/be/src/exprs/hybrid_set.h
+++ b/be/src/exprs/hybrid_set.h
@@ -145,11 +145,11 @@ private:
ObjectPool _pool;
};
-class StringValueSet : public HybridSetBase {
+class StringSet : public HybridSetBase {
public:
- StringValueSet() = default;
+ StringSet() = default;
- ~StringValueSet() override = default;
+ ~StringSet() override = default;
Status to_vexpr_list(doris::ObjectPool* pool,
std::vector<doris::vectorized::VExpr*>* vexpr_list,
int precision,
@@ -179,7 +179,7 @@ public:
}
void insert(HybridSetBase* set) override {
- StringValueSet* string_set = reinterpret_cast<StringValueSet*>(set);
+ StringSet* string_set = reinterpret_cast<StringSet*>(set);
_set.insert(string_set->_set.begin(), string_set->_set.end());
}
@@ -228,4 +228,90 @@ private:
ObjectPool _pool;
};
+// note: Two differences from StringSet
+// 1 StringValue has better comparison performance than std::string
+// 2 std::string keeps its own memory, but StringValue just keeps ptr and len,
so the caller must manage the memory of StringValue
+class StringValueSet : public HybridSetBase {
+public:
+ StringValueSet() = default;
+
+ ~StringValueSet() override = default;
+
+ Status to_vexpr_list(doris::ObjectPool* pool,
+ std::vector<doris::vectorized::VExpr*>* vexpr_list,
int precision,
+ int scale) override {
+ HybridSetBase::IteratorBase* it = begin();
+ DCHECK(it != nullptr);
+ while (it->has_next()) {
+ TExprNode node;
+ const void* v = it->get_value();
+ create_texpr_literal_node<TYPE_STRING>(v, &node);
+ vexpr_list->push_back(pool->add(new
doris::vectorized::VLiteral(node)));
+ it->next();
+ }
+ return Status::OK();
+ };
+
+ void insert(const void* data) override {
+ if (data == nullptr) return;
+
+ const auto* value = reinterpret_cast<const StringValue*>(data);
+ StringValue sv(value->ptr, value->len);
+ _set.insert(sv);
+ }
+ void insert(void* data, size_t size) override {
+ StringValue sv(reinterpret_cast<char*>(data), size);
+ _set.insert(sv);
+ }
+
+ void insert(HybridSetBase* set) override {
+ StringValueSet* string_set = reinterpret_cast<StringValueSet*>(set);
+ _set.insert(string_set->_set.begin(), string_set->_set.end());
+ }
+
+ int size() override { return _set.size(); }
+
+ bool find(void* data) override {
+ auto* value = reinterpret_cast<StringValue*>(data);
+ auto it = _set.find(*value);
+
+ return !(it == _set.end());
+ }
+
+ bool find(void* data, size_t size) override {
+ // std::string str_value(reinterpret_cast<char*>(data), size);
+ StringValue sv(reinterpret_cast<char*>(data), size);
+ auto it = _set.find(sv);
+ return !(it == _set.end());
+ }
+
+ class Iterator : public IteratorBase {
+ public:
+ Iterator(phmap::flat_hash_set<StringValue>::iterator begin,
+ phmap::flat_hash_set<StringValue>::iterator end)
+ : _begin(begin), _end(end) {}
+ ~Iterator() override = default;
+ virtual bool has_next() const override { return !(_begin == _end); }
+ virtual const void* get_value() override {
+ _value.ptr = const_cast<char*>(_begin->ptr);
+ _value.len = _begin->len;
+ return &_value;
+ }
+ virtual void next() override { ++_begin; }
+
+ private:
+ typename phmap::flat_hash_set<StringValue>::iterator _begin;
+ typename phmap::flat_hash_set<StringValue>::iterator _end;
+ StringValue _value;
+ };
+
+ IteratorBase* begin() override {
+ return _pool.add(new (std::nothrow) Iterator(_set.begin(),
_set.end()));
+ }
+
+private:
+ phmap::flat_hash_set<StringValue> _set;
+ ObjectPool _pool;
+};
+
} // namespace doris
diff --git a/be/src/vec/exec/join/vhash_join_node.cpp
b/be/src/vec/exec/join/vhash_join_node.cpp
index e7ae1c3a91..4fc36fd5ba 100644
--- a/be/src/vec/exec/join/vhash_join_node.cpp
+++ b/be/src/vec/exec/join/vhash_join_node.cpp
@@ -879,6 +879,8 @@ Status HashJoinNode::prepare(RuntimeState* state) {
_build_side_output_timer = ADD_TIMER(probe_phase_profile,
"ProbeWhenBuildSideOutputTime");
_probe_side_output_timer = ADD_TIMER(probe_phase_profile,
"ProbeWhenProbeSideOutputTime");
+ _join_filter_timer = ADD_TIMER(runtime_profile(), "JoinFilterTimer");
+
_push_down_timer = ADD_TIMER(runtime_profile(), "PushDownTime");
_push_compute_timer = ADD_TIMER(runtime_profile(), "PushDownComputeTime");
_build_buckets_counter = ADD_COUNTER(runtime_profile(), "BuildBuckets",
TUnit::UNIT);
@@ -1034,8 +1036,11 @@ Status HashJoinNode::get_next(RuntimeState* state,
Block* output_block, bool* eo
}
_add_tuple_is_null_column(&temp_block);
- RETURN_IF_ERROR(
- VExprContext::filter_block(_vconjunct_ctx_ptr, &temp_block,
temp_block.columns()));
+ {
+ SCOPED_TIMER(_join_filter_timer);
+ RETURN_IF_ERROR(
+ VExprContext::filter_block(_vconjunct_ctx_ptr, &temp_block,
temp_block.columns()));
+ }
RETURN_IF_ERROR(_build_output_block(&temp_block, output_block));
_reset_tuple_is_null_column();
reached_limit(output_block, eos);
diff --git a/be/src/vec/exec/join/vhash_join_node.h
b/be/src/vec/exec/join/vhash_join_node.h
index 36dec27fbb..48cb54e67a 100644
--- a/be/src/vec/exec/join/vhash_join_node.h
+++ b/be/src/vec/exec/join/vhash_join_node.h
@@ -204,6 +204,8 @@ private:
RuntimeProfile::Counter* _build_side_output_timer;
RuntimeProfile::Counter* _probe_side_output_timer;
+ RuntimeProfile::Counter* _join_filter_timer;
+
int64_t _hash_table_rows;
int64_t _mem_used;
diff --git a/be/src/vec/functions/in.h b/be/src/vec/functions/in.h
index d10661fa48..bdf21cda96 100644
--- a/be/src/vec/functions/in.h
+++ b/be/src/vec/functions/in.h
@@ -67,8 +67,15 @@ public:
}
auto* state = new InState();
context->set_function_state(scope, state);
- state->hybrid_set.reset(
-
vec_create_set(convert_type_to_primitive(context->get_arg_type(0)->type)));
+ if (context->get_arg_type(0)->type == FunctionContext::Type::TYPE_CHAR
||
+ context->get_arg_type(0)->type ==
FunctionContext::Type::TYPE_VARCHAR ||
+ context->get_arg_type(0)->type ==
FunctionContext::Type::TYPE_STRING) {
+ // the StringValue's memory is held by FunctionContext, so we can
use StringValueSet here directly
+ state->hybrid_set.reset(new StringValueSet());
+ } else {
+ state->hybrid_set.reset(
+
vec_create_set(convert_type_to_primitive(context->get_arg_type(0)->type)));
+ }
DCHECK(context->get_num_args() >= 1);
for (int i = 1; i < context->get_num_args(); ++i) {
@@ -109,18 +116,74 @@ public:
auto materialized_column =
left_arg.column->convert_to_full_column_if_const();
if (in_state->use_set) {
- for (size_t i = 0; i < input_rows_count; ++i) {
- const auto& ref_data = materialized_column->get_data_at(i);
- if (ref_data.data) {
- vec_res[i] = negative ^
-
in_state->hybrid_set->find((void*)ref_data.data, ref_data.size);
- if (in_state->null_in_set) {
+ if (materialized_column->is_nullable()) {
+ auto* null_col_ptr =
vectorized::check_and_get_column<vectorized::ColumnNullable>(
+ materialized_column);
+ auto& null_bitmap = reinterpret_cast<const
vectorized::ColumnUInt8&>(
+
null_col_ptr->get_null_map_column())
+ .get_data();
+ auto* nested_col_ptr =
null_col_ptr->get_nested_column_ptr().get();
+ auto search_hash_set = [&](auto* col_ptr) {
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ const auto& ref_data = col_ptr->get_data_at(i);
+ vec_res[i] =
+ !null_bitmap[i] &&
+
in_state->hybrid_set->find((void*)ref_data.data, ref_data.size);
+ if constexpr (negative) {
+ vec_res[i] = !vec_res[i];
+ }
+ }
+ };
+
+ if (nested_col_ptr->is_column_string()) {
+ const auto* column_string_ptr =
+ reinterpret_cast<const
vectorized::ColumnString*>(nested_col_ptr);
+ search_hash_set(column_string_ptr);
+ } else {
+ // todo support other column type
+ search_hash_set(nested_col_ptr);
+ }
+
+ if (!in_state->null_in_set) {
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ vec_null_map_to[i] = null_bitmap[i];
+ }
+ } else {
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ vec_null_map_to[i] = null_bitmap[i] || (negative ==
vec_res[i]);
+ }
+ }
+
+ } else { // non-nullable
+
+ auto search_hash_set = [&](auto* col_ptr) {
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ const auto& ref_data = col_ptr->get_data_at(i);
+ vec_res[i] =
+
in_state->hybrid_set->find((void*)ref_data.data, ref_data.size);
+ if constexpr (negative) {
+ vec_res[i] = !vec_res[i];
+ }
+ }
+ };
+
+ if (materialized_column->is_column_string()) {
+ const auto* column_string_ptr =
+ reinterpret_cast<const vectorized::ColumnString*>(
+ materialized_column.get());
+ search_hash_set(column_string_ptr);
+ } else {
+ search_hash_set(materialized_column.get());
+ }
+
+ if (in_state->null_in_set) {
+ for (size_t i = 0; i < input_rows_count; ++i) {
vec_null_map_to[i] = negative == vec_res[i];
- } else {
- vec_null_map_to[i] = false;
}
} else {
- vec_null_map_to[i] = true;
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ vec_null_map_to[i] = false;
+ }
}
}
} else {
diff --git a/regression-test/data/query/sql_functions/test_in_expr.out
b/regression-test/data/query/sql_functions/test_in_expr.out
new file mode 100644
index 0000000000..5006d062dc
--- /dev/null
+++ b/regression-test/data/query/sql_functions/test_in_expr.out
@@ -0,0 +1,45 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !select --
+4
+
+-- !select --
+4
+
+-- !select --
+c
+
+-- !select --
+4
+
+-- !select --
+4
+
+-- !select --
+c
+
+-- !select --
+\N
+1
+2
+3
+
+-- !select --
+
+-- !select --
+\N
+a
+b
+d
+
+-- !select --
+1
+2
+3
+
+-- !select --
+
+-- !select --
+a
+b
+d
+
diff --git a/regression-test/suites/query/sql_functions/test_in_expr.groovy
b/regression-test/suites/query/sql_functions/test_in_expr.groovy
new file mode 100644
index 0000000000..6efc094c3b
--- /dev/null
+++ b/regression-test/suites/query/sql_functions/test_in_expr.groovy
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_in_expr", "query") {
+ def nullTableName = "in_expr_test_null"
+ def notNullTableName = "in_expr_test_not_null"
+
+ sql """DROP TABLE IF EXISTS ${nullTableName}"""
+ sql """
+ CREATE TABLE ${nullTableName} (
+ `cid` int(11) NULL,
+ `number` int(11) NULL,
+ `addr` varchar(256) NULL
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`cid`)
+ COMMENT 'OLAP'
+ DISTRIBUTED BY HASH(`cid`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "in_memory" = "false",
+ "storage_format" = "V2"
+ )
+ """
+ sql """ insert into ${nullTableName}
values(100,1,'a'),(101,2,'b'),(102,3,'c'),(103,4,'d'),(104,null,'e'),(105,6,
null) """
+
+
+ sql """DROP TABLE IF EXISTS ${notNullTableName}"""
+ sql """
+ CREATE TABLE ${notNullTableName} (
+ `cid` int(11) not NULL,
+ `number` int(11) not NULL,
+ `addr` varchar(256) not NULL
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`cid`)
+ COMMENT 'OLAP'
+ DISTRIBUTED BY HASH(`cid`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "in_memory" = "false",
+ "storage_format" = "V2"
+ )
+ """
+
+ sql """ insert into ${notNullTableName}
values(100,1,'a'),(101,2,'b'),(102,3,'c'),(103,4,'d') """
+
+ sql """ set enable_vectorized_engine = true """
+
+ // 1 in expr
+ // 1.1 nullable
+ // 1.1.1 string + set_not_null
+ qt_select "select t1.number from ${nullTableName} t1 left join
${nullTableName} t2 on t1.cid=t2.cid where t2.addr in ('d')"
+
+ // 1.1.2 string + null_in_set
+ qt_select "select t1.number from ${nullTableName} t1 left join
${nullTableName} t2 on t1.cid=t2.cid where t2.addr in ('d', null)"
+
+ // 1.1.3 non-string
+ qt_select "select t1.addr from ${nullTableName} t1 left join
${nullTableName} t2 on t1.cid=t2.cid where t2.number in (3)"
+
+ // 1.2 not null
+ // 1.2.1 string + set_not_null
+ qt_select "select t1.number from ${notNullTableName} t1 left join
${notNullTableName} t2 on t1.cid=t2.cid where t2.addr in ('d')"
+
+ // 1.2.2 string + null_in_set
+ qt_select "select t1.number from ${notNullTableName} t1 left join
${notNullTableName} t2 on t1.cid=t2.cid where t2.addr in ('d', null)"
+
+ // 1.2.3 non-string
+ qt_select "select t1.addr from ${notNullTableName} t1 left join
${notNullTableName} t2 on t1.cid=t2.cid where t2.number in (3)"
+
+
+
+
+ // 2 not in expr
+ // 2.1 nullable
+ // 2.1.1 string + set_not_null
+ qt_select "select t1.number from ${nullTableName} t1 left join
${nullTableName} t2 on t1.cid=t2.cid where t2.addr not in ('d') order by
t1.number"
+
+ // 2.1.2 string + null_in_set
+ qt_select "select t1.number from ${nullTableName} t1 left join
${nullTableName} t2 on t1.cid=t2.cid where t2.addr not in ('d', null) "
+
+ // 2.1.3 non-string
+ qt_select "select t1.addr from ${nullTableName} t1 left join
${nullTableName} t2 on t1.cid=t2.cid where t2.number not in (3) order by
t1.addr "
+
+ // 2.2 not null
+ // 2.2.1 string + set_not_null
+ qt_select "select t1.number from ${notNullTableName} t1 left join
${notNullTableName} t2 on t1.cid=t2.cid where t2.addr not in ('d') order by
t1.number "
+
+ // 2.2.2 string + null_in_set
+ qt_select "select t1.number from ${notNullTableName} t1 left join
${notNullTableName} t2 on t1.cid=t2.cid where t2.addr not in ('d', null)"
+
+ // 2.2.3 non-string
+ qt_select "select t1.addr from ${notNullTableName} t1 left join
${notNullTableName} t2 on t1.cid=t2.cid where t2.number not in (3) order by
t1.addr "
+
+ sql """DROP TABLE IF EXISTS ${nullTableName}"""
+ sql """DROP TABLE IF EXISTS ${notNullTableName}"""
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]