This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 8eab20d3df [bugfix](low cardinality) cached code is wrong will result 
wrong query result when many null pages (#19221)
8eab20d3df is described below

commit 8eab20d3dfb7b73e9572d843cb7001c2952f5364
Author: yiguolei <676222...@qq.com>
AuthorDate: Sat Apr 29 21:28:41 2023 +0800

    [bugfix](low cardinality) cached code is wrong will result wrong query 
result when many null pages (#19221)
    
    Sometimes the dict is not initialized when run comparison predicate here, 
for example, the full page is null, then the reader will skip read, so that the 
dictionary is not inited. The cached code is wrong during this case, because 
the following page maybe not null, and the dict should have items in the future.
    This will result the dict string column query return wrong result, if there 
are many null values in the column.
    I also add some regression test for dict column's equal query, larger than 
query, less than query.
    
    ---------
    
    Co-authored-by: yiguolei <yiguo...@gmail.com>
---
 be/src/common/config.h                             |   1 +
 be/src/olap/comparison_predicate.h                 |  20 +++-
 be/src/vec/columns/column_dictionary.h             |   4 +-
 .../data/nereids_p0/test_dict_with_null.out        | 112 +++++++++++++++++++++
 .../data/query_p0/test_dict_with_null.out          | 112 +++++++++++++++++++++
 .../suites/nereids_p0/test_dict_with_null.groovy   |   5 +-
 .../suites/query_p0/test_dict_with_null.groovy     |   5 +-
 7 files changed, 255 insertions(+), 4 deletions(-)

diff --git a/be/src/common/config.h b/be/src/common/config.h
index 29ae1c3ba9..fa6a516bc4 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -284,6 +284,7 @@ CONF_Bool(disable_storage_page_cache, "false");
 CONF_Bool(disable_storage_row_cache, "true");
 
 CONF_Bool(enable_low_cardinality_optimize, "true");
+CONF_Bool(enable_low_cardinality_cache_code, "true");
 
 // be policy
 // whether check compaction checksum
diff --git a/be/src/olap/comparison_predicate.h 
b/be/src/olap/comparison_predicate.h
index c74b06eaf3..a830319088 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -565,8 +565,26 @@ private:
             const vectorized::ColumnDictI32& column) const {
         /// if _cache_code_enabled is false, always find the code from dict.
         if (UNLIKELY(!_cache_code_enabled || _cached_code == 
_InvalidateCodeValue)) {
-            _cached_code = _is_range() ? column.find_code_by_bound(_value, 
_is_greater(), _is_eq())
+            int32_t code = _is_range() ? column.find_code_by_bound(_value, 
_is_greater(), _is_eq())
                                        : column.find_code(_value);
+
+            // Protect the invalid code logic, to avoid data error.
+            if (code == _InvalidateCodeValue) {
+                LOG(FATAL) << "column dictionary should not return the code " 
<< code
+                           << ", because it is assumed as an invalid code in 
comparison predicate";
+            }
+            // Sometimes the dict is not initialized when run comparison 
predicate here, for example,
+            // the full page is null, then the reader will skip read, so that 
the dictionary is not
+            // inited. The cached code is wrong during this case, because the 
following page maybe not
+            // null, and the dict should have items in the future.
+            //
+            // Cached code may have problems, so that add a config here, if 
not opened, then
+            // we will return the code and not cache it.
+            if (column.is_dict_empty() || 
!config::enable_low_cardinality_cache_code) {
+                return code;
+            }
+            // If the dict is not empty, then the dict is inited and we could 
cache the value.
+            _cached_code = code;
         }
         return _cached_code;
     }
diff --git a/be/src/vec/columns/column_dictionary.h 
b/be/src/vec/columns/column_dictionary.h
index 8a2c055d13..0d3b4ee8de 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -306,6 +306,8 @@ public:
 
     bool is_dict_sorted() const { return _dict_sorted; }
 
+    bool is_dict_empty() const { return _dict.empty(); }
+
     bool is_dict_code_converted() const { return _dict_code_converted; }
 
     MutableColumnPtr convert_to_predicate_column_if_dictionary() override {
@@ -507,7 +509,7 @@ public:
 
         size_t byte_size() { return _dict_data->size() * 
sizeof((*_dict_data)[0]); }
 
-        bool empty() { return _dict_data->empty(); }
+        bool empty() const { return _dict_data->empty(); }
 
         size_t avg_str_len() { return empty() ? 0 : _total_str_len / 
_dict_data->size(); }
 
diff --git a/regression-test/data/nereids_p0/test_dict_with_null.out 
b/regression-test/data/nereids_p0/test_dict_with_null.out
new file mode 100644
index 0000000000..7ce45abf43
--- /dev/null
+++ b/regression-test/data/nereids_p0/test_dict_with_null.out
@@ -0,0 +1,112 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql1 --
+101    abc
+
+-- !sql2 --
+101    abc
+
+-- !sql3 --
+101    abc
+
+-- !sql4 --
+100    \N
+99     \N
+98     \N
+97     \N
+96     \N
+95     \N
+94     \N
+93     \N
+92     \N
+91     \N
+90     \N
+89     \N
+88     \N
+87     \N
+86     \N
+85     \N
+84     \N
+83     \N
+82     \N
+81     \N
+80     \N
+79     \N
+78     \N
+77     \N
+76     \N
+75     \N
+74     \N
+73     \N
+72     \N
+71     \N
+70     \N
+69     \N
+68     \N
+67     \N
+66     \N
+65     \N
+64     \N
+63     \N
+62     \N
+61     \N
+60     \N
+59     \N
+58     \N
+57     \N
+56     \N
+55     \N
+54     \N
+53     \N
+52     \N
+51     \N
+50     \N
+49     \N
+48     \N
+47     \N
+46     \N
+45     \N
+44     \N
+43     \N
+42     \N
+41     \N
+40     \N
+39     \N
+38     \N
+37     \N
+36     \N
+35     \N
+34     \N
+33     \N
+32     \N
+31     \N
+30     \N
+29     \N
+28     \N
+27     \N
+26     \N
+25     \N
+24     \N
+23     \N
+22     \N
+21     \N
+20     \N
+19     \N
+18     \N
+17     \N
+16     \N
+15     \N
+14     \N
+13     \N
+12     \N
+11     \N
+10     \N
+9      \N
+8      \N
+7      \N
+6      \N
+5      \N
+4      \N
+3      \N
+2      \N
+1      \N
+
diff --git a/regression-test/data/query_p0/test_dict_with_null.out 
b/regression-test/data/query_p0/test_dict_with_null.out
new file mode 100644
index 0000000000..7ce45abf43
--- /dev/null
+++ b/regression-test/data/query_p0/test_dict_with_null.out
@@ -0,0 +1,112 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql1 --
+101    abc
+
+-- !sql2 --
+101    abc
+
+-- !sql3 --
+101    abc
+
+-- !sql4 --
+100    \N
+99     \N
+98     \N
+97     \N
+96     \N
+95     \N
+94     \N
+93     \N
+92     \N
+91     \N
+90     \N
+89     \N
+88     \N
+87     \N
+86     \N
+85     \N
+84     \N
+83     \N
+82     \N
+81     \N
+80     \N
+79     \N
+78     \N
+77     \N
+76     \N
+75     \N
+74     \N
+73     \N
+72     \N
+71     \N
+70     \N
+69     \N
+68     \N
+67     \N
+66     \N
+65     \N
+64     \N
+63     \N
+62     \N
+61     \N
+60     \N
+59     \N
+58     \N
+57     \N
+56     \N
+55     \N
+54     \N
+53     \N
+52     \N
+51     \N
+50     \N
+49     \N
+48     \N
+47     \N
+46     \N
+45     \N
+44     \N
+43     \N
+42     \N
+41     \N
+40     \N
+39     \N
+38     \N
+37     \N
+36     \N
+35     \N
+34     \N
+33     \N
+32     \N
+31     \N
+30     \N
+29     \N
+28     \N
+27     \N
+26     \N
+25     \N
+24     \N
+23     \N
+22     \N
+21     \N
+20     \N
+19     \N
+18     \N
+17     \N
+16     \N
+15     \N
+14     \N
+13     \N
+12     \N
+11     \N
+10     \N
+9      \N
+8      \N
+7      \N
+6      \N
+5      \N
+4      \N
+3      \N
+2      \N
+1      \N
+
diff --git a/regression-test/suites/nereids_p0/test_dict_with_null.groovy 
b/regression-test/suites/nereids_p0/test_dict_with_null.groovy
index becceafdde..a0858c2122 100644
--- a/regression-test/suites/nereids_p0/test_dict_with_null.groovy
+++ b/regression-test/suites/nereids_p0/test_dict_with_null.groovy
@@ -44,5 +44,8 @@ suite("dict_with_null", "query") {
     insert_sql += ", (101, 'abc')"
 
     sql insert_sql
-    sql "select * from test_dict_with_null where c_string > '0'"
+    qt_sql1 "select * from test_dict_with_null where c_string > '0'"
+    qt_sql2 "select * from test_dict_with_null where c_string < 'dfg'"
+    qt_sql3 "select * from test_dict_with_null where c_string = 'abc'"
+    qt_sql4 "select * from test_dict_with_null where c_string is null order by 
c_int desc"
 }
\ No newline at end of file
diff --git a/regression-test/suites/query_p0/test_dict_with_null.groovy 
b/regression-test/suites/query_p0/test_dict_with_null.groovy
index a5c84444ae..b3738bb68a 100644
--- a/regression-test/suites/query_p0/test_dict_with_null.groovy
+++ b/regression-test/suites/query_p0/test_dict_with_null.groovy
@@ -42,5 +42,8 @@ suite("dict_with_null", "query") {
     insert_sql += ", (101, 'abc')"
 
     sql insert_sql
-    sql "select * from test_dict_with_null where c_string > '0'"
+    qt_sql1 "select * from test_dict_with_null where c_string > '0'"
+    qt_sql2 "select * from test_dict_with_null where c_string < 'dfg'"
+    qt_sql3 "select * from test_dict_with_null where c_string = 'abc'"
+    qt_sql4 "select * from test_dict_with_null where c_string is null order by 
c_int desc"
 }
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to