This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new c22bb8d7973 [opt](nereids) improve filter estimation on string column #35610 #34542 (#35667) c22bb8d7973 is described below commit c22bb8d7973e0cf24f776cd0ca8addeb45bb3b88 Author: minghong <engle...@gmail.com> AuthorDate: Sun Jun 2 09:34:40 2024 +0800 [opt](nereids) improve filter estimation on string column #35610 #34542 (#35667) Some users are likely to define a date (or datetime) column as a Varchar type. When estimating the selectivity of a predicate like A>'2020-01-01', if Nereids treats A and '2020-01-01' as date type, the estimated selectivity is more accurate than when it treats them as string type. --- .../org/apache/doris/analysis/DateLiteral.java | 58 +++++++-- .../doris/nereids/stats/FilterEstimation.java | 145 ++++++++++++++++++--- .../expressions/literal/StringLikeLiteral.java | 11 +- .../org/apache/doris/nereids/types/TimeType.java | 3 +- .../org/apache/doris/nereids/types/TimeV2Type.java | 3 +- .../doris/nereids/types/coercion/DateLikeType.java | 2 +- .../doris/nereids/types/coercion/NumericType.java | 2 +- .../{TimeType.java => coercion/RangeScalable.java} | 37 ++---- .../doris/nereids/stats/FilterEstimationTest.java | 118 +++++++++++++++++ .../data/nereids_ssb_shape_sf100_p0/shape/q2.2.out | 27 ++-- 10 files changed, 332 insertions(+), 74 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java index 9f05c0e6574..1dc58b4c382 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java @@ -66,6 +66,7 @@ import java.util.Set; import java.util.TimeZone; import java.util.regex.Pattern; import java.util.stream.Collectors; +import javax.annotation.Nullable; public class DateLiteral extends LiteralExpr { private static final Logger LOG = LogManager.getLogger(DateLiteral.class); @@ -367,13 +368,21 @@ public class 
DateLiteral extends LiteralExpr { type = other.type; } + public DateLiteral(String s) throws AnalysisException { + super(); + init(s, null); + analysisDone(); + } + public static DateLiteral createMinValue(Type type) throws AnalysisException { return new DateLiteral(type, false); } - private void init(String s, Type type) throws AnalysisException { + private void init(String s, @Nullable Type type) throws AnalysisException { try { - Preconditions.checkArgument(type.isDateType()); + if (type != null) { + Preconditions.checkArgument(type.isDateType()); + } TemporalAccessor dateTime = null; boolean parsed = false; int offset = 0; @@ -445,25 +454,29 @@ public class DateLiteral extends LiteralExpr { builder.appendLiteral(" "); } String[] timePart = s.contains(" ") ? s.split(" ")[1].split(":") : new String[] {}; - if (timePart.length > 0 && (type.equals(Type.DATE) || type.equals(Type.DATEV2))) { + if (timePart.length > 0 && type != null && (type.equals(Type.DATE) || type.equals(Type.DATEV2))) { throw new AnalysisException("Invalid date value: " + s); } - if (timePart.length == 0 && (type.equals(Type.DATETIME) || type.equals(Type.DATETIMEV2))) { + if (timePart.length == 0 && type != null + && (type.equals(Type.DATETIME) || type.equals(Type.DATETIMEV2))) { throw new AnalysisException("Invalid datetime value: " + s); } for (int i = 0; i < timePart.length; i++) { switch (i) { case 0: - builder.appendPattern(String.join("", Collections.nCopies(timePart[i].length(), "H"))); + builder.appendPattern(String.join("", + Collections.nCopies(timePart[i].length(), "H"))); break; case 1: - builder.appendPattern(String.join("", Collections.nCopies(timePart[i].length(), "m"))); + builder.appendPattern(String.join("", + Collections.nCopies(timePart[i].length(), "m"))); break; case 2: builder.appendPattern(String.join("", Collections.nCopies(timePart[i].contains(".") ? 
timePart[i].split("\\.")[0].length() : timePart[i].length(), "s"))); if (timePart[i].contains(".")) { - builder.appendFraction(ChronoField.MICRO_OF_SECOND, 0, 6, true); + builder.appendFraction(ChronoField.MICRO_OF_SECOND, + 0, 6, true); } break; default: @@ -488,10 +501,29 @@ public class DateLiteral extends LiteralExpr { minute = getOrDefault(dateTime, ChronoField.MINUTE_OF_HOUR, 0); second = getOrDefault(dateTime, ChronoField.SECOND_OF_MINUTE, 0); microsecond = getOrDefault(dateTime, ChronoField.MICRO_OF_SECOND, 0); - if (microsecond != 0 && type.isDatetime()) { - int dotIndex = s.lastIndexOf("."); - int scale = s.length() - dotIndex - 1; - type = ScalarType.createDatetimeV2Type(scale); + if (type != null) { + if (microsecond != 0 && type.isDatetime()) { + int dotIndex = s.lastIndexOf("."); + int scale = s.length() - dotIndex - 1; + type = ScalarType.createDatetimeV2Type(scale); + } + } else { + if (hour == 0 && minute == 0 && second == 0 && microsecond == 0) { + type = ScalarType.getDefaultDateType(Type.DATE); + } else { + type = ScalarType.getDefaultDateType(Type.DATETIME); + if (type.isDatetimeV2() && microsecond != 0) { + int scale = 6; + for (int i = 0; i < 6; i++) { + if (microsecond % Math.pow(10.0, i + 1) > 0) { + break; + } else { + scale -= 1; + } + } + type = ScalarType.createDatetimeV2Type(scale); + } + } } this.type = type; @@ -720,6 +752,10 @@ public class DateLiteral extends LiteralExpr { return getLongValue(); } + public double getDoubleValueAsDateTime() { + return (year * 10000 + month * 100 + day) * 1000000L + hour * 10000 + minute * 100 + second; + } + @Override protected void toThrift(TExprNode msg) { if (type.isDatetimeV2()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index c56a7d9f5bc..33f6318808b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -17,7 +17,9 @@ package org.apache.doris.nereids.stats; +import org.apache.doris.analysis.DateLiteral; import org.apache.doris.analysis.LiteralExpr; +import org.apache.doris.analysis.StringLiteral; import org.apache.doris.nereids.stats.FilterEstimation.EstimationContext; import org.apache.doris.nereids.trees.TreeNode; import org.apache.doris.nereids.trees.expressions.And; @@ -39,7 +41,11 @@ import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.functions.Function; import org.apache.doris.nereids.trees.expressions.literal.Literal; +import org.apache.doris.nereids.trees.expressions.literal.StringLikeLiteral; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DataType; +import org.apache.doris.nereids.types.DateTimeType; +import org.apache.doris.nereids.types.coercion.RangeScalable; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.StatisticRange; @@ -49,7 +55,10 @@ import org.apache.doris.statistics.StatisticsBuilder; import com.google.common.base.Preconditions; import com.google.common.collect.Sets; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.function.Predicate; @@ -176,22 +185,22 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo } } - private Statistics updateLessThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft, + private Statistics updateLessThanLiteral(Expression leftExpr, DataType dataType, ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) { StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr, 
statsForRight.maxValue, statsForRight.maxExpr, - statsForLeft.ndv, leftExpr.getDataType()); - return estimateBinaryComparisonFilter(leftExpr, + statsForLeft.ndv, dataType); + return estimateBinaryComparisonFilter(leftExpr, dataType, statsForLeft, rightRange, context); } - private Statistics updateGreaterThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft, + private Statistics updateGreaterThanLiteral(Expression leftExpr, DataType dataType, ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) { StatisticRange rightRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr, statsForLeft.maxValue, statsForLeft.maxExpr, - statsForLeft.ndv, leftExpr.getDataType()); - return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context); + statsForLeft.ndv, dataType); + return estimateBinaryComparisonFilter(leftExpr, dataType, statsForLeft, rightRange, context); } private Statistics calculateWhenLiteralRight(ComparisonPredicate cp, @@ -203,14 +212,111 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo if (cp instanceof EqualPredicate) { return estimateEqualTo(cp, statsForLeft, statsForRight, context); } else { + // literal Map used to covert dateLiteral back to stringLiteral + Map<DateLiteral, StringLiteral> literalMap = new HashMap<>(); + DataType compareType = cp.left().getDataType(); + Optional<ColumnStatistic> statsForLeftMayConvertedOpt = + tryConvertStringColStatsToDateColStats(statsForLeft, literalMap); + Optional<ColumnStatistic> statsForRightMayConvertedOpt = (statsForLeftMayConvertedOpt.isPresent()) + ? 
tryConvertStringColStatsToDateColStats(statsForRight, literalMap) + : Optional.empty(); + + boolean converted = false; + ColumnStatistic statsForLeftMayConverted = statsForLeft; + ColumnStatistic statsForRightMayConverted = statsForRight; + if (statsForLeftMayConvertedOpt.isPresent() && statsForRightMayConvertedOpt.isPresent() + && statsForRightMayConvertedOpt.get().minExpr.getType() + == statsForLeftMayConvertedOpt.get().minExpr.getType()) { + // string type is converted to date type + converted = true; + compareType = DateTimeType.INSTANCE; + statsForLeftMayConverted = statsForLeftMayConvertedOpt.get(); + statsForRightMayConverted = statsForRightMayConvertedOpt.get(); + } + Statistics result = null; if (cp instanceof LessThan || cp instanceof LessThanEqual) { - return updateLessThanLiteral(cp.left(), statsForLeft, statsForRight, context); + result = updateLessThanLiteral(cp.left(), compareType, statsForLeftMayConverted, + statsForRightMayConverted, context); } else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { - return updateGreaterThanLiteral(cp.left(), statsForLeft, statsForRight, context); + result = updateGreaterThanLiteral(cp.left(), compareType, statsForLeftMayConverted, + statsForRightMayConverted, context); } else { throw new RuntimeException(String.format("Unexpected expression : %s", cp.toSql())); } + if (converted) { + // convert min/max of left.colStats back to string type + ColumnStatistic newLeftStats = result.findColumnStatistics(cp.left()); + result.addColumnStats(cp.left(), convertDateColStatsToStringColStats(newLeftStats, literalMap)); + } + return result; + } + } + + private ColumnStatistic convertDateColStatsToStringColStats(ColumnStatistic colStats, + Map<DateLiteral, StringLiteral> literalMap) { + if (colStats.minExpr == null && colStats.maxExpr == null) { + // when sel=0, minExpr and maxExpr are both null + return colStats; + } + Preconditions.checkArgument(colStats.minExpr instanceof DateLiteral + && colStats.maxExpr 
instanceof DateLiteral, + "cannot convert colStats back to stringType %s", colStats.toString()); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(colStats); + StringLiteral newMinLiteral = new StringLiteral(colStats.maxExpr.toString()); + return builder.setMaxExpr(newMinLiteral) + .setMaxExpr(literalMap.get(colStats.maxExpr)) + .setMaxValue(StringLikeLiteral.getDouble(colStats.maxExpr.toString())) + .setMinExpr(literalMap.get(colStats.minExpr)) + .setMinValue(StringLikeLiteral.getDouble(colStats.minExpr.getStringValue())) + .build(); + } + + private Optional<ColumnStatistic> tryConvertStringColStatsToDateColStats(ColumnStatistic colStats, + Map<DateLiteral, StringLiteral> literalMap) { + if (colStats.minExpr == null || colStats.maxExpr == null) { + return Optional.empty(); + } + if (!(colStats.minExpr instanceof StringLiteral) || !(colStats.maxExpr instanceof StringLiteral)) { + return Optional.empty(); + } + Optional<DateLiteral> newMinExpr = tryConvertStrLiteralToDateLiteral(colStats.minExpr); + if (!newMinExpr.isPresent()) { + return Optional.empty(); } + Optional<DateLiteral> newMaxExpr = tryConvertStrLiteralToDateLiteral(colStats.maxExpr); + if (!newMaxExpr.isPresent()) { + return Optional.empty(); + } + if (newMaxExpr.get().getType() != newMinExpr.get().getType()) { + return Optional.empty(); + } + literalMap.put(newMinExpr.get(), (StringLiteral) colStats.minExpr); + literalMap.put(newMaxExpr.get(), (StringLiteral) colStats.maxExpr); + + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(colStats); + return Optional.of(builder.setMinValue(newMinExpr.get().getDoubleValueAsDateTime()) + .setMinExpr(newMinExpr.get()) + .setMaxValue(newMaxExpr.get().getDoubleValueAsDateTime()) + .setMaxExpr(newMaxExpr.get()) + .build()); + } + + private Optional<DateLiteral> tryConvertStrLiteralToDateLiteral(LiteralExpr literal) { + if (literal == null) { + return Optional.empty(); + } + if (!(literal instanceof StringLiteral)) { + return Optional.empty(); + 
} + + DateLiteral dt = null; + try { + dt = new DateLiteral(literal.getStringValue()); + dt.checkValueValid(); + } catch (Exception e) { + // ignore + } + return dt == null ? Optional.empty() : Optional.of(dt); } private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic statsForLeft, @@ -451,11 +557,11 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo } } - private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnStatistic leftStats, + private Statistics estimateBinaryComparisonFilter(Expression leftExpr, DataType dataType, ColumnStatistic leftStats, StatisticRange rightRange, EstimationContext context) { StatisticRange leftRange = new StatisticRange(leftStats.minValue, leftStats.minExpr, leftStats.maxValue, leftStats.maxExpr, - leftStats.ndv, leftExpr.getDataType()); + leftStats.ndv, dataType); StatisticRange intersectRange = leftRange.cover(rightRange); ColumnStatisticBuilder leftColumnStatisticBuilder; @@ -479,6 +585,9 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo .setNdv(intersectRange.getDistinctValues()) .setNumNulls(0); double sel = leftRange.overlapPercentWith(rightRange); + if (!(dataType instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) { + sel = DEFAULT_INEQUALITY_COEFFICIENT; + } sel = getNotNullSelectivity(leftStats, sel); updatedStatistics = context.statistics.withSel(sel); leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount()); @@ -535,8 +644,9 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo } double leftOverlapPercent = leftRange.overlapPercentWith(rightRange); - // Left always greater than right - if (leftOverlapPercent == 0) { + + if (leftOverlapPercent == 0.0) { + // Left always greater than right return context.statistics.withRowCount(0.0); } StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue, leftStats.minExpr, @@ -565,9 +675,14 @@ public class 
FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo .setNdv(rightStats.ndv * (rightAlwaysGreaterRangeFraction + rightOverlappingRangeFraction)) .setNumNulls(0) .build(); - double sel = leftAlwaysLessThanRightPercent - + leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT - + leftOverlapPercent * rightAlwaysGreaterRangeFraction; + double sel = DEFAULT_INEQUALITY_COEFFICIENT; + if (leftExpr.getDataType() instanceof RangeScalable) { + sel = leftAlwaysLessThanRightPercent + + leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT + + leftOverlapPercent * rightAlwaysGreaterRangeFraction; + } else if (leftOverlapPercent == 1.0) { + sel = 1.0; + } context.addKeyIfSlot(leftExpr); context.addKeyIfSlot(rightExpr); return context.statistics.withSel(sel) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java index 5b437021f53..30cb9580195 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java @@ -39,11 +39,18 @@ public abstract class StringLikeLiteral extends Literal { @Override public double getDouble() { + return getDouble(value); + } + + /** + * get double value + */ + public static double getDouble(String str) { long v = 0; int pos = 0; - int len = Math.min(value.length(), 7); + int len = Math.min(str.length(), 7); while (pos < len) { - v += Byte.toUnsignedLong(value.getBytes()[pos]) << ((6 - pos) * 8); + v += Byte.toUnsignedLong(str.getBytes()[pos]) << ((6 - pos) * 8); pos++; } return (double) v; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java index 9fb438fd79c..1111038ed91 
100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java @@ -19,11 +19,12 @@ package org.apache.doris.nereids.types; import org.apache.doris.catalog.Type; import org.apache.doris.nereids.types.coercion.PrimitiveType; +import org.apache.doris.nereids.types.coercion.RangeScalable; /** * Datetime type in Nereids. */ -public class TimeType extends PrimitiveType { +public class TimeType extends PrimitiveType implements RangeScalable { public static final TimeType INSTANCE = new TimeType(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java index ec625d0cd17..b436fe31d39 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java @@ -20,11 +20,12 @@ package org.apache.doris.nereids.types; import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.Type; import org.apache.doris.nereids.types.coercion.PrimitiveType; +import org.apache.doris.nereids.types.coercion.RangeScalable; /** * Datetime type in Nereids. */ -public class TimeV2Type extends PrimitiveType { +public class TimeV2Type extends PrimitiveType implements RangeScalable { public static final TimeV2Type INSTANCE = new TimeV2Type(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java index 22ea99f00bc..1f8130215b0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java @@ -33,7 +33,7 @@ import java.time.LocalDateTime; /** * date like type. 
*/ -public abstract class DateLikeType extends PrimitiveType { +public abstract class DateLikeType extends PrimitiveType implements RangeScalable { protected LocalDate toLocalDate(double d) { // d = (year * 10000 + month * 100 + day) * 1000000L; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java index 18a41d3ffef..1d09a80a74f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java @@ -24,7 +24,7 @@ import org.apache.doris.nereids.types.DoubleType; /** * Abstract class for all numeric type in Nereids. */ -public class NumericType extends PrimitiveType { +public class NumericType extends PrimitiveType implements RangeScalable { public static final NumericType INSTANCE = new NumericType(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/RangeScalable.java similarity index 57% copy from fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java copy to fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/RangeScalable.java index 9fb438fd79c..8209e2f2b6e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/RangeScalable.java @@ -15,35 +15,16 @@ // specific language governing permissions and limitations // under the License. -package org.apache.doris.nereids.types; - -import org.apache.doris.catalog.Type; -import org.apache.doris.nereids.types.coercion.PrimitiveType; +package org.apache.doris.nereids.types.coercion; /** - * Datetime type in Nereids. + * numeric type/ date related type are range scalable + * RangeScalable Column can be estimated by filter like "A < 10" more accurate. 
+ * For example, for a given relation R, which contains 10 rows. R.A in (1, 100), + * the selectivity of filter "A<10" is "(10-1) / (100 -1)" + * But for string column A, the filter selectivity of "A<'abc'" can not be estimated by range, although we could + * have an order reserved mapping from string value to double. + * */ -public class TimeType extends PrimitiveType { - - public static final TimeType INSTANCE = new TimeType(); - - private static final int WIDTH = 8; - - private TimeType() { - } - - @Override - public Type toCatalogDataType() { - return Type.TIME; - } - - @Override - public boolean equals(Object o) { - return o instanceof TimeType; - } - - @Override - public int width() { - return WIDTH; - } +public interface RangeScalable { } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index eca15eea364..da8159ef6b9 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -18,6 +18,7 @@ package org.apache.doris.nereids.stats; import org.apache.doris.analysis.IntLiteral; +import org.apache.doris.analysis.StringLiteral; import org.apache.doris.nereids.trees.expressions.And; import org.apache.doris.nereids.trees.expressions.Cast; import org.apache.doris.nereids.trees.expressions.EqualTo; @@ -35,9 +36,11 @@ import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.literal.DateLiteral; import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral; import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral; +import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; import org.apache.doris.nereids.types.DateType; import org.apache.doris.nereids.types.DoubleType; import org.apache.doris.nereids.types.IntegerType; 
+import org.apache.doris.nereids.types.VarcharType; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.Statistics; @@ -1139,4 +1142,119 @@ class FilterEstimationTest { Statistics resultEq = estimator.estimate(eq, statsBuilder.build()); Assertions.assertEquals(7, resultNse.getRowCount() - resultEq.getRowCount()); } + + /** + * for string literal, min-max range is only used for coverage, not for percentage + */ + @Test + public void testStringRangeColToLiteral() { + SlotReference a = new SlotReference("a", new VarcharType(25)); + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder() + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("200")) + .setMaxValue(new VarcharLiteral("200").getDouble()) + .setMinExpr(new StringLiteral("100")) + .setMinValue(new VarcharLiteral("100").getDouble()) + .setCount(100); + StatisticsBuilder statsBuilder = new StatisticsBuilder(); + statsBuilder.setRowCount(100); + statsBuilder.putColumnStatistics(a, columnStatisticBuilder.build()); + Statistics baseStats = statsBuilder.build(); + VarcharLiteral i500 = new VarcharLiteral("500"); + Statistics filter500 = new FilterEstimation().estimate(new LessThan(a, i500), baseStats); + Assertions.assertEquals(100, filter500.getRowCount()); + + VarcharLiteral i10 = new VarcharLiteral("10"); + Statistics filter10 = new FilterEstimation().estimate(new LessThan(i10, a), baseStats); + Assertions.assertEquals(100, filter10.getRowCount()); + + VarcharLiteral i199 = new VarcharLiteral("199"); + Statistics filter199 = new FilterEstimation().estimate(new GreaterThan(a, i199), baseStats); + Assertions.assertEquals(50, filter199.getRowCount(), 0.01); + } + + @Test + public void testStringRangeColToDateLiteral() { + SlotReference a = new SlotReference("a", new VarcharType(25)); + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder() + 
.setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2022-01-01")) + .setMaxValue(new VarcharLiteral("2022-01-01").getDouble()) + .setMinExpr(new StringLiteral("2020-01-01")) + .setMinValue(new VarcharLiteral("2020-01-01").getDouble()) + .setCount(100); + StatisticsBuilder statsBuilder = new StatisticsBuilder(); + statsBuilder.setRowCount(100); + statsBuilder.putColumnStatistics(a, columnStatisticBuilder.build()); + Statistics baseStats = statsBuilder.build(); + VarcharLiteral year2030 = new VarcharLiteral("2030-01-01"); + Statistics filter2030 = new FilterEstimation().estimate(new LessThan(a, year2030), baseStats); + Assertions.assertEquals(100, filter2030.getRowCount()); + + VarcharLiteral year2000 = new VarcharLiteral("2000-01-01"); + Statistics filter2k = new FilterEstimation().estimate(new LessThan(year2000, a), baseStats); + Assertions.assertEquals(100, filter2k.getRowCount()); + + VarcharLiteral year2021 = new VarcharLiteral("2021-12-01"); + Statistics filter2021 = new FilterEstimation().estimate(new GreaterThan(a, year2021), baseStats); + Assertions.assertEquals(4.24, filter2021.getRowCount(), 0.01); + } + + @Test + public void testStringRangeColToCol() { + SlotReference a = new SlotReference("a", new VarcharType(25)); + ColumnStatisticBuilder columnStatisticBuilderA = new ColumnStatisticBuilder() + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2022-01-01")) + .setMaxValue(new VarcharLiteral("2022-01-01").getDouble()) + .setMinExpr(new StringLiteral("2020-01-01")) + .setMinValue(new VarcharLiteral("2020-01-01").getDouble()) + .setCount(100); + + SlotReference b = new SlotReference("b", new VarcharType(25)); + ColumnStatisticBuilder columnStatisticBuilderB = new ColumnStatisticBuilder() + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2012-01-01")) + .setMaxValue(new VarcharLiteral("2012-01-01").getDouble()) + .setMinExpr(new 
StringLiteral("2010-01-01")) + .setMinValue(new VarcharLiteral("2010-01-01").getDouble()) + .setCount(100); + + SlotReference c = new SlotReference("c", new VarcharType(25)); + ColumnStatisticBuilder columnStatisticBuilderC = new ColumnStatisticBuilder() + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2021-01-01")) + .setMaxValue(new VarcharLiteral("2021-01-01").getDouble()) + .setMinExpr(new StringLiteral("2010-01-01")) + .setMinValue(new VarcharLiteral("2010-01-01").getDouble()) + .setCount(100); + + StatisticsBuilder statsBuilder = new StatisticsBuilder(); + statsBuilder.setRowCount(100); + statsBuilder.putColumnStatistics(a, columnStatisticBuilderA.build()); + statsBuilder.putColumnStatistics(b, columnStatisticBuilderB.build()); + statsBuilder.putColumnStatistics(c, columnStatisticBuilderC.build()); + Statistics baseStats = statsBuilder.build(); + + // (2020-2022) > (2010,2012), sel=1 + Statistics agrtb = new FilterEstimation().estimate(new GreaterThan(a, b), baseStats); + Assertions.assertEquals(100, agrtb.getRowCount()); + // (2020-2022) < (2010,2012), sel=0 + Statistics alessb = new FilterEstimation().estimate(new LessThan(a, b), baseStats); + Assertions.assertEquals(0, alessb.getRowCount()); + + // (2020-2022) > (2010-2021), sel = DEFAULT (0.5) + Statistics agrtc = new FilterEstimation().estimate(new GreaterThan(a, c), baseStats); + Assertions.assertEquals(50, agrtc.getRowCount()); + } } diff --git a/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out b/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out index de0b1d9ce05..0a148822bf8 100644 --- a/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out +++ b/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out @@ -10,21 +10,20 @@ PhysicalResultSink --------------hashAgg[LOCAL] ----------------PhysicalProject ------------------hashJoin[INNER_JOIN](lineorder.lo_orderdate = dates.d_datekey) ---------------------PhysicalDistribute 
-----------------------PhysicalProject -------------------------hashJoin[INNER_JOIN](lineorder.lo_suppkey = supplier.s_suppkey) ---------------------------PhysicalDistribute -----------------------------hashJoin[INNER_JOIN](lineorder.lo_partkey = part.p_partkey) -------------------------------PhysicalProject ---------------------------------PhysicalOlapScan[lineorder] -------------------------------PhysicalDistribute ---------------------------------PhysicalProject -----------------------------------filter((part.p_brand >= 'MFGR#2221')(part.p_brand <= 'MFGR#2228')) -------------------------------------PhysicalOlapScan[part] ---------------------------PhysicalDistribute +--------------------PhysicalProject +----------------------hashJoin[INNER_JOIN](lineorder.lo_partkey = part.p_partkey) +------------------------PhysicalDistribute +--------------------------hashJoin[INNER_JOIN](lineorder.lo_suppkey = supplier.s_suppkey) ----------------------------PhysicalProject -------------------------------filter((supplier.s_region = 'ASIA')) ---------------------------------PhysicalOlapScan[supplier] +------------------------------PhysicalOlapScan[lineorder] +----------------------------PhysicalDistribute +------------------------------PhysicalProject +--------------------------------filter((supplier.s_region = 'ASIA')) +----------------------------------PhysicalOlapScan[supplier] +------------------------PhysicalDistribute +--------------------------PhysicalProject +----------------------------filter((part.p_brand >= 'MFGR#2221')(part.p_brand <= 'MFGR#2228')) +------------------------------PhysicalOlapScan[part] --------------------PhysicalDistribute ----------------------PhysicalProject ------------------------PhysicalOlapScan[dates] --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org