This is an automated email from the ASF dual-hosted git repository.

kakachen pushed a commit to branch orc
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/orc by this push:
     new 86a5c44f800 [fix](orc) Fix has null statistics for forward 
compatibility (#259)
86a5c44f800 is described below

commit 86a5c44f800306c9414f431eb1260f7b117b02fc
Author: Socrates <suyit...@selectdb.com>
AuthorDate: Tue Dec 10 10:13:14 2024 +0800

    [fix](orc) Fix has null statistics for forward compatibility (#259)
---
 c++/src/Statistics.cc          | 20 ++++++++++----------
 c++/src/sargs/PredicateLeaf.cc | 30 ++++++++++++++++++------------
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index 7b648739618..97c2963ff50 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -181,13 +181,13 @@ namespace orc {
 
   ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& 
pb) {
     _stats.setNumberOfValues(pb.numberofvalues());
-    _stats.setHasNull(pb.hasnull());
+    _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
   }
 
   BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                          const StatContext& 
statContext) {
     _stats.setNumberOfValues(pb.numberofvalues());
-    _stats.setHasNull(pb.hasnull());
+    _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
     if (pb.has_binarystatistics() && statContext.correctStats) {
       _stats.setHasTotalLength(pb.binarystatistics().has_sum());
       
_stats.setTotalLength(static_cast<uint64_t>(pb.binarystatistics().sum()));
@@ -197,7 +197,7 @@ namespace orc {
   BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                            const StatContext& 
statContext) {
     _stats.setNumberOfValues(pb.numberofvalues());
-    _stats.setHasNull(pb.hasnull());
+    _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
     if (pb.has_bucketstatistics() && statContext.correctStats) {
       _hasCount = true;
       _trueCount = pb.bucketstatistics().count(0);
@@ -210,7 +210,7 @@ namespace orc {
   DateColumnStatisticsImpl::DateColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                      const StatContext& 
statContext) {
     _stats.setNumberOfValues(pb.numberofvalues());
-    _stats.setHasNull(pb.hasnull());
+    _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
     if (!pb.has_datestatistics() || !statContext.correctStats) {
       // hasMinimum_ is false by default;
       // hasMaximum_ is false by default;
@@ -227,7 +227,7 @@ namespace orc {
   DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                            const StatContext& 
statContext) {
     _stats.setNumberOfValues(pb.numberofvalues());
-    _stats.setHasNull(pb.hasnull());
+    _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
     if (pb.has_decimalstatistics() && statContext.correctStats) {
       const proto::DecimalStatistics& stats = pb.decimalstatistics();
       _stats.setHasMinimum(stats.has_minimum());
@@ -242,7 +242,7 @@ namespace orc {
 
   DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const 
proto::ColumnStatistics& pb) {
     _stats.setNumberOfValues(pb.numberofvalues());
-    _stats.setHasNull(pb.hasnull());
+    _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
     if (!pb.has_doublestatistics()) {
       _stats.setMinimum(0);
       _stats.setMaximum(0);
@@ -261,7 +261,7 @@ namespace orc {
 
   IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const 
proto::ColumnStatistics& pb) {
     _stats.setNumberOfValues(pb.numberofvalues());
-    _stats.setHasNull(pb.hasnull());
+    _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
     if (!pb.has_intstatistics()) {
       _stats.setMinimum(0);
       _stats.setMaximum(0);
@@ -281,7 +281,7 @@ namespace orc {
   StringColumnStatisticsImpl::StringColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                          const StatContext& 
statContext) {
     _stats.setNumberOfValues(pb.numberofvalues());
-    _stats.setHasNull(pb.hasnull());
+    _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
     if (!pb.has_stringstatistics() || !statContext.correctStats) {
       _stats.setTotalLength(0);
     } else {
@@ -299,7 +299,7 @@ namespace orc {
   TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                                const 
StatContext& statContext) {
     _stats.setNumberOfValues(pb.numberofvalues());
-    _stats.setHasNull(pb.hasnull());
+    _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
     if (!pb.has_timestampstatistics() || !statContext.correctStats) {
       _stats.setMinimum(0);
       _stats.setMaximum(0);
@@ -365,7 +365,7 @@ namespace orc {
   CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl(
       const proto::ColumnStatistics& pb) {
     _stats.setNumberOfValues(pb.numberofvalues());
-    _stats.setHasNull(pb.hasnull());
+    _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true);
     if (!pb.has_collectionstatistics()) {
       _stats.setMinimum(0);
       _stats.setMaximum(0);
diff --git a/c++/src/sargs/PredicateLeaf.cc b/c++/src/sargs/PredicateLeaf.cc
index 9e9f41c338d..9ab4cfb1e2c 100644
--- a/c++/src/sargs/PredicateLeaf.cc
+++ b/c++/src/sargs/PredicateLeaf.cc
@@ -390,10 +390,15 @@ namespace orc {
 
   DIAGNOSTIC_POP
 
+  static bool col_stats_hasnull(const proto::ColumnStatistics& stats) {
+    // for forward compatibility, if hasnull is not set, assume that the column 
has nulls
+    return stats.has_hasnull() ? stats.hasnull() : true;
+  }
+
   static TruthValue evaluateBoolPredicate(const PredicateLeaf::Operator op,
                                           const std::vector<Literal>& literals,
                                           const proto::ColumnStatistics& 
stats) {
-    bool hasNull = stats.hasnull();
+    bool hasNull = col_stats_hasnull(stats);
     if (!stats.has_bucketstatistics() || stats.bucketstatistics().count_size() 
== 0) {
       // does not have bool stats
       return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO;
@@ -513,7 +518,7 @@ namespace orc {
             colStats.intstatistics().has_maximum()) {
           const auto& stats = colStats.intstatistics();
           result = evaluatePredicateRange(mOperator, literal2Long(mLiterals), 
stats.minimum(),
-                                          stats.maximum(), colStats.hasnull());
+                                          stats.maximum(), 
col_stats_hasnull(colStats));
         }
         break;
       }
@@ -522,10 +527,10 @@ namespace orc {
             colStats.doublestatistics().has_maximum()) {
           const auto& stats = colStats.doublestatistics();
           if (!std::isfinite(stats.sum())) {
-            result = colStats.hasnull() ? TruthValue::YES_NO_NULL : 
TruthValue::YES_NO;
+            result = col_stats_hasnull(colStats) ? TruthValue::YES_NO_NULL : 
TruthValue::YES_NO;
           } else {
             result = evaluatePredicateRange(mOperator, 
literal2Double(mLiterals), stats.minimum(),
-                                            stats.maximum(), 
colStats.hasnull());
+                                            stats.maximum(), 
col_stats_hasnull(colStats));
           }
         }
         break;
@@ -536,7 +541,7 @@ namespace orc {
             colStats.stringstatistics().has_maximum()) {
           const auto& stats = colStats.stringstatistics();
           result = evaluatePredicateRange(mOperator, 
literal2String(mLiterals), stats.minimum(),
-                                          stats.maximum(), colStats.hasnull());
+                                          stats.maximum(), 
col_stats_hasnull(colStats));
         }
         break;
       }
@@ -545,7 +550,7 @@ namespace orc {
             colStats.datestatistics().has_maximum()) {
           const auto& stats = colStats.datestatistics();
           result = evaluatePredicateRange(mOperator, literal2Date(mLiterals), 
stats.minimum(),
-                                          stats.maximum(), colStats.hasnull());
+                                          stats.maximum(), 
col_stats_hasnull(colStats));
         }
         break;
       }
@@ -564,7 +569,7 @@ namespace orc {
               stats.maximumutc() / 1000,
               static_cast<int32_t>((stats.maximumutc() % 1000) * 1000000) + 
maxNano);
           result = evaluatePredicateRange(mOperator, 
literal2Timestamp(mLiterals), minTimestamp,
-                                          maxTimestamp, colStats.hasnull());
+                                          maxTimestamp, 
col_stats_hasnull(colStats));
         }
         break;
       }
@@ -574,7 +579,7 @@ namespace orc {
           const auto& stats = colStats.decimalstatistics();
           result = evaluatePredicateRange(mOperator, 
literal2Decimal(mLiterals),
                                           Decimal(stats.minimum()), 
Decimal(stats.maximum()),
-                                          colStats.hasnull());
+                                          col_stats_hasnull(colStats));
         }
         break;
       }
@@ -589,7 +594,7 @@ namespace orc {
     }
 
     // make sure null literal is respected for IN operator
-    if (mOperator == Operator::IN && colStats.hasnull()) {
+    if (mOperator == Operator::IN && col_stats_hasnull(colStats)) {
       for (const auto& literal : mLiterals) {
         if (literal.isNull()) {
           result = TruthValue::YES_NO_NULL;
@@ -698,12 +703,13 @@ namespace orc {
       }
     }
 
-    bool allNull = colStats.hasnull() && colStats.numberofvalues() == 0;
+    bool allNull = col_stats_hasnull(colStats) && colStats.numberofvalues() == 
0;
     if (mOperator == Operator::IS_NULL ||
         ((mOperator == Operator::EQUALS || mOperator == 
Operator::NULL_SAFE_EQUALS) &&
          mLiterals.at(0).isNull())) {
       // IS_NULL operator does not need to check min/max stats and bloom filter
-      return allNull ? TruthValue::YES : (colStats.hasnull() ? 
TruthValue::YES_NO : TruthValue::NO);
+      return allNull ? TruthValue::YES
+                     : (col_stats_hasnull(colStats) ? TruthValue::YES_NO : 
TruthValue::NO);
     } else if (allNull) {
       // if we don't have any value, everything must have been null
       return TruthValue::IS_NULL;
@@ -711,7 +717,7 @@ namespace orc {
 
     TruthValue result = evaluatePredicateMinMax(colStats);
     if (shouldEvaluateBloomFilter(mOperator, result, bloomFilter)) {
-      return evaluatePredicateBloomFiter(bloomFilter, colStats.hasnull());
+      return evaluatePredicateBloomFiter(bloomFilter, 
col_stats_hasnull(colStats));
     } else {
       return result;
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to