This is an automated email from the ASF dual-hosted git repository. kakachen pushed a commit to branch orc in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/orc by this push: new 86a5c44f800 [fix](orc) Fix has null statistics for forward compatibility (#259) 86a5c44f800 is described below commit 86a5c44f800306c9414f431eb1260f7b117b02fc Author: Socrates <suyit...@selectdb.com> AuthorDate: Tue Dec 10 10:13:14 2024 +0800 [fix](orc) Fix has null statistics for forward compatibility (#259) --- c++/src/Statistics.cc | 20 ++++++++++---------- c++/src/sargs/PredicateLeaf.cc | 30 ++++++++++++++++++------------ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index 7b648739618..97c2963ff50 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -181,13 +181,13 @@ namespace orc { ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) { _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true); } BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true); if (pb.has_binarystatistics() && statContext.correctStats) { _stats.setHasTotalLength(pb.binarystatistics().has_sum()); _stats.setTotalLength(static_cast<uint64_t>(pb.binarystatistics().sum())); @@ -197,7 +197,7 @@ namespace orc { BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + _stats.setHasNull(pb.has_hasnull() ? 
pb.hasnull() : true); if (pb.has_bucketstatistics() && statContext.correctStats) { _hasCount = true; _trueCount = pb.bucketstatistics().count(0); @@ -210,7 +210,7 @@ namespace orc { DateColumnStatisticsImpl::DateColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true); if (!pb.has_datestatistics() || !statContext.correctStats) { // hasMinimum_ is false by default; // hasMaximum_ is false by default; @@ -227,7 +227,7 @@ namespace orc { DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true); if (pb.has_decimalstatistics() && statContext.correctStats) { const proto::DecimalStatistics& stats = pb.decimalstatistics(); _stats.setHasMinimum(stats.has_minimum()); @@ -242,7 +242,7 @@ namespace orc { DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const proto::ColumnStatistics& pb) { _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true); if (!pb.has_doublestatistics()) { _stats.setMinimum(0); _stats.setMaximum(0); @@ -261,7 +261,7 @@ namespace orc { IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const proto::ColumnStatistics& pb) { _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + _stats.setHasNull(pb.has_hasnull() ? 
pb.hasnull() : true); if (!pb.has_intstatistics()) { _stats.setMinimum(0); _stats.setMaximum(0); @@ -281,7 +281,7 @@ namespace orc { StringColumnStatisticsImpl::StringColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true); if (!pb.has_stringstatistics() || !statContext.correctStats) { _stats.setTotalLength(0); } else { @@ -299,7 +299,7 @@ namespace orc { TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const proto::ColumnStatistics& pb, const StatContext& statContext) { _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true); if (!pb.has_timestampstatistics() || !statContext.correctStats) { _stats.setMinimum(0); _stats.setMaximum(0); @@ -365,7 +365,7 @@ namespace orc { CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl( const proto::ColumnStatistics& pb) { _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); + _stats.setHasNull(pb.has_hasnull() ? pb.hasnull() : true); if (!pb.has_collectionstatistics()) { _stats.setMinimum(0); _stats.setMaximum(0); diff --git a/c++/src/sargs/PredicateLeaf.cc b/c++/src/sargs/PredicateLeaf.cc index 9e9f41c338d..9ab4cfb1e2c 100644 --- a/c++/src/sargs/PredicateLeaf.cc +++ b/c++/src/sargs/PredicateLeaf.cc @@ -390,10 +390,15 @@ namespace orc { DIAGNOSTIC_POP + static bool col_stats_hasnull(const proto::ColumnStatistics& stats) { + // for forward compatibility, if hasnull is not set, assume that the column has nulls + return stats.has_hasnull() ? 
stats.hasnull() : true; + } + static TruthValue evaluateBoolPredicate(const PredicateLeaf::Operator op, const std::vector<Literal>& literals, const proto::ColumnStatistics& stats) { - bool hasNull = stats.hasnull(); + bool hasNull = col_stats_hasnull(stats); if (!stats.has_bucketstatistics() || stats.bucketstatistics().count_size() == 0) { // does not have bool stats return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; @@ -513,7 +518,7 @@ namespace orc { colStats.intstatistics().has_maximum()) { const auto& stats = colStats.intstatistics(); result = evaluatePredicateRange(mOperator, literal2Long(mLiterals), stats.minimum(), - stats.maximum(), colStats.hasnull()); + stats.maximum(), col_stats_hasnull(colStats)); } break; } @@ -522,10 +527,10 @@ namespace orc { colStats.doublestatistics().has_maximum()) { const auto& stats = colStats.doublestatistics(); if (!std::isfinite(stats.sum())) { - result = colStats.hasnull() ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; + result = col_stats_hasnull(colStats) ? 
TruthValue::YES_NO_NULL : TruthValue::YES_NO; } else { result = evaluatePredicateRange(mOperator, literal2Double(mLiterals), stats.minimum(), - stats.maximum(), colStats.hasnull()); + stats.maximum(), col_stats_hasnull(colStats)); } } break; @@ -536,7 +541,7 @@ namespace orc { colStats.stringstatistics().has_maximum()) { const auto& stats = colStats.stringstatistics(); result = evaluatePredicateRange(mOperator, literal2String(mLiterals), stats.minimum(), - stats.maximum(), colStats.hasnull()); + stats.maximum(), col_stats_hasnull(colStats)); } break; } @@ -545,7 +550,7 @@ namespace orc { colStats.datestatistics().has_maximum()) { const auto& stats = colStats.datestatistics(); result = evaluatePredicateRange(mOperator, literal2Date(mLiterals), stats.minimum(), - stats.maximum(), colStats.hasnull()); + stats.maximum(), col_stats_hasnull(colStats)); } break; } @@ -564,7 +569,7 @@ namespace orc { stats.maximumutc() / 1000, static_cast<int32_t>((stats.maximumutc() % 1000) * 1000000) + maxNano); result = evaluatePredicateRange(mOperator, literal2Timestamp(mLiterals), minTimestamp, - maxTimestamp, colStats.hasnull()); + maxTimestamp, col_stats_hasnull(colStats)); } break; } @@ -574,7 +579,7 @@ namespace orc { const auto& stats = colStats.decimalstatistics(); result = evaluatePredicateRange(mOperator, literal2Decimal(mLiterals), Decimal(stats.minimum()), Decimal(stats.maximum()), - colStats.hasnull()); + col_stats_hasnull(colStats)); } break; } @@ -589,7 +594,7 @@ namespace orc { } // make sure null literal is respected for IN operator - if (mOperator == Operator::IN && colStats.hasnull()) { + if (mOperator == Operator::IN && col_stats_hasnull(colStats)) { for (const auto& literal : mLiterals) { if (literal.isNull()) { result = TruthValue::YES_NO_NULL; @@ -698,12 +703,13 @@ namespace orc { } } - bool allNull = colStats.hasnull() && colStats.numberofvalues() == 0; + bool allNull = col_stats_hasnull(colStats) && colStats.numberofvalues() == 0; if (mOperator == 
Operator::IS_NULL || ((mOperator == Operator::EQUALS || mOperator == Operator::NULL_SAFE_EQUALS) && mLiterals.at(0).isNull())) { // IS_NULL operator does not need to check min/max stats and bloom filter - return allNull ? TruthValue::YES : (colStats.hasnull() ? TruthValue::YES_NO : TruthValue::NO); + return allNull ? TruthValue::YES + : (col_stats_hasnull(colStats) ? TruthValue::YES_NO : TruthValue::NO); } else if (allNull) { // if we don't have any value, everything must have been null return TruthValue::IS_NULL; @@ -711,7 +717,7 @@ namespace orc { TruthValue result = evaluatePredicateMinMax(colStats); if (shouldEvaluateBloomFilter(mOperator, result, bloomFilter)) { - return evaluatePredicateBloomFiter(bloomFilter, colStats.hasnull()); + return evaluatePredicateBloomFiter(bloomFilter, col_stats_hasnull(colStats)); } else { return result; } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org