This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 8eab20d3df [bugfix](low cardinality) cached code is wrong will result wrong query result when many null pages (#19221) 8eab20d3df is described below commit 8eab20d3dfb7b73e9572d843cb7001c2952f5364 Author: yiguolei <676222...@qq.com> AuthorDate: Sat Apr 29 21:28:41 2023 +0800 [bugfix](low cardinality) cached code is wrong will result wrong query result when many null pages (#19221) Sometimes the dict is not initialized when run comparison predicate here, for example, the full page is null, then the reader will skip read, so that the dictionary is not inited. The cached code is wrong during this case, because the following page maybe not null, and the dict should have items in the future. This will result the dict string column query return wrong result, if there are many null values in the column. I also add some regression test for dict column's equal query, larger than query, less than query. --------- Co-authored-by: yiguolei <yiguo...@gmail.com> --- be/src/common/config.h | 1 + be/src/olap/comparison_predicate.h | 20 +++- be/src/vec/columns/column_dictionary.h | 4 +- .../data/nereids_p0/test_dict_with_null.out | 112 +++++++++++++++++++++ .../data/query_p0/test_dict_with_null.out | 112 +++++++++++++++++++++ .../suites/nereids_p0/test_dict_with_null.groovy | 5 +- .../suites/query_p0/test_dict_with_null.groovy | 5 +- 7 files changed, 255 insertions(+), 4 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index 29ae1c3ba9..fa6a516bc4 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -284,6 +284,7 @@ CONF_Bool(disable_storage_page_cache, "false"); CONF_Bool(disable_storage_row_cache, "true"); CONF_Bool(enable_low_cardinality_optimize, "true"); +CONF_Bool(enable_low_cardinality_cache_code, "true"); // be policy // whether check compaction checksum diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index c74b06eaf3..a830319088 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -565,8 +565,26 @@ private: const vectorized::ColumnDictI32& column) const { /// if _cache_code_enabled is false, always find the code from dict. if (UNLIKELY(!_cache_code_enabled || _cached_code == _InvalidateCodeValue)) { - _cached_code = _is_range() ? column.find_code_by_bound(_value, _is_greater(), _is_eq()) + int32_t code = _is_range() ? column.find_code_by_bound(_value, _is_greater(), _is_eq()) : column.find_code(_value); + + // Protect the invalid code logic, to avoid data error. + if (code == _InvalidateCodeValue) { + LOG(FATAL) << "column dictionary should not return the code " << code + << ", because it is assumed as an invalid code in comparison predicate"; + } + // Sometimes the dict is not initialized when run comparison predicate here, for example, + // the full page is null, then the reader will skip read, so that the dictionary is not + // inited. The cached code is wrong during this case, because the following page maybe not + // null, and the dict should have items in the future. + // + // Cached code may have problems, so that add a config here, if not opened, then + // we will return the code and not cache it. + if (column.is_dict_empty() || !config::enable_low_cardinality_cache_code) { + return code; + } + // If the dict is not empty, then the dict is inited and we could cache the value. + _cached_code = code; } return _cached_code; } diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index 8a2c055d13..0d3b4ee8de 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -306,6 +306,8 @@ public: bool is_dict_sorted() const { return _dict_sorted; } + bool is_dict_empty() const { return _dict.empty(); } + bool is_dict_code_converted() const { return _dict_code_converted; } MutableColumnPtr convert_to_predicate_column_if_dictionary() override { @@ -507,7 +509,7 @@ public: size_t byte_size() { return _dict_data->size() * sizeof((*_dict_data)[0]); } - bool empty() { return _dict_data->empty(); } + bool empty() const { return _dict_data->empty(); } size_t avg_str_len() { return empty() ? 0 : _total_str_len / _dict_data->size(); } diff --git a/regression-test/data/nereids_p0/test_dict_with_null.out b/regression-test/data/nereids_p0/test_dict_with_null.out new file mode 100644 index 0000000000..7ce45abf43 --- /dev/null +++ b/regression-test/data/nereids_p0/test_dict_with_null.out @@ -0,0 +1,112 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql1 -- +101 abc + +-- !sql2 -- +101 abc + +-- !sql3 -- +101 abc + +-- !sql4 -- +100 \N +99 \N +98 \N +97 \N +96 \N +95 \N +94 \N +93 \N +92 \N +91 \N +90 \N +89 \N +88 \N +87 \N +86 \N +85 \N +84 \N +83 \N +82 \N +81 \N +80 \N +79 \N +78 \N +77 \N +76 \N +75 \N +74 \N +73 \N +72 \N +71 \N +70 \N +69 \N +68 \N +67 \N +66 \N +65 \N +64 \N +63 \N +62 \N +61 \N +60 \N +59 \N +58 \N +57 \N +56 \N +55 \N +54 \N +53 \N +52 \N +51 \N +50 \N +49 \N +48 \N +47 \N +46 \N +45 \N +44 \N +43 \N +42 \N +41 \N +40 \N +39 \N +38 \N +37 \N +36 \N +35 \N +34 \N +33 \N +32 \N +31 \N +30 \N +29 \N +28 \N +27 \N +26 \N +25 \N +24 \N +23 \N +22 \N +21 \N +20 \N +19 \N +18 \N +17 \N +16 \N +15 \N +14 \N +13 \N +12 \N +11 \N +10 \N +9 \N +8 \N +7 \N +6 \N +5 \N +4 \N +3 \N +2 \N +1 \N + diff --git a/regression-test/data/query_p0/test_dict_with_null.out b/regression-test/data/query_p0/test_dict_with_null.out new file mode 100644 index 0000000000..7ce45abf43 --- /dev/null +++ b/regression-test/data/query_p0/test_dict_with_null.out @@ -0,0 +1,112 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql1 -- +101 abc + +-- !sql2 -- +101 abc + +-- !sql3 -- +101 abc + +-- !sql4 -- +100 \N +99 \N +98 \N +97 \N +96 \N +95 \N +94 \N +93 \N +92 \N +91 \N +90 \N +89 \N +88 \N +87 \N +86 \N +85 \N +84 \N +83 \N +82 \N +81 \N +80 \N +79 \N +78 \N +77 \N +76 \N +75 \N +74 \N +73 \N +72 \N +71 \N +70 \N +69 \N +68 \N +67 \N +66 \N +65 \N +64 \N +63 \N +62 \N +61 \N +60 \N +59 \N +58 \N +57 \N +56 \N +55 \N +54 \N +53 \N +52 \N +51 \N +50 \N +49 \N +48 \N +47 \N +46 \N +45 \N +44 \N +43 \N +42 \N +41 \N +40 \N +39 \N +38 \N +37 \N +36 \N +35 \N +34 \N +33 \N +32 \N +31 \N +30 \N +29 \N +28 \N +27 \N +26 \N +25 \N +24 \N +23 \N +22 \N +21 \N +20 \N +19 \N +18 \N +17 \N +16 \N +15 \N +14 \N +13 \N +12 \N +11 \N +10 \N +9 \N +8 \N +7 \N +6 \N +5 \N +4 \N +3 \N +2 \N +1 \N + diff --git a/regression-test/suites/nereids_p0/test_dict_with_null.groovy b/regression-test/suites/nereids_p0/test_dict_with_null.groovy index becceafdde..a0858c2122 100644 --- a/regression-test/suites/nereids_p0/test_dict_with_null.groovy +++ b/regression-test/suites/nereids_p0/test_dict_with_null.groovy @@ -44,5 +44,8 @@ suite("dict_with_null", "query") { insert_sql += ", (101, 'abc')" sql insert_sql - sql "select * from test_dict_with_null where c_string > '0'" + qt_sql1 "select * from test_dict_with_null where c_string > '0'" + qt_sql2 "select * from test_dict_with_null where c_string < 'dfg'" + qt_sql3 "select * from test_dict_with_null where c_string = 'abc'" + qt_sql4 "select * from test_dict_with_null where c_string is null order by c_int desc" } \ No newline at end of file diff --git a/regression-test/suites/query_p0/test_dict_with_null.groovy b/regression-test/suites/query_p0/test_dict_with_null.groovy index a5c84444ae..b3738bb68a 100644 --- a/regression-test/suites/query_p0/test_dict_with_null.groovy +++ b/regression-test/suites/query_p0/test_dict_with_null.groovy @@ -42,5 +42,8 @@ suite("dict_with_null", "query") { insert_sql += ", (101, 'abc')" sql insert_sql - sql "select * from test_dict_with_null where c_string > '0'" + qt_sql1 "select * from test_dict_with_null where c_string > '0'" + qt_sql2 "select * from test_dict_with_null where c_string < 'dfg'" + qt_sql3 "select * from test_dict_with_null where c_string = 'abc'" + qt_sql4 "select * from test_dict_with_null where c_string is null order by c_int desc" } \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org