This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 8f264a7206aba5ec78ae4afba937c195b1847b7d
Author: minghong <[email protected]>
AuthorDate: Thu May 30 14:32:14 2024 +0800

    [opt](nereids) compare str literal as date literal to compute selectivity 
(#35610)
    
    This PR improves #34542 for the case where the real data type is a date-like type.
    
    Some users define date (or datetime) columns as Varchar type.
    When estimating the selectivity of a predicate like A>'2020-01-01',
    the estimate is more accurate if Nereids treats A and '2020-01-01'
    as date values rather than as strings.
---
 .../org/apache/doris/analysis/DateLiteral.java     |   4 +
 .../doris/nereids/stats/FilterEstimation.java      | 127 +++++++++++++++++++--
 .../expressions/literal/StringLikeLiteral.java     |  11 +-
 .../doris/nereids/stats/FilterEstimationTest.java  |  31 ++++-
 4 files changed, 159 insertions(+), 14 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java
index a8148237fb7..1ff103097ef 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java
@@ -786,6 +786,10 @@ public class DateLiteral extends LiteralExpr {
         return getLongValue();
     }
 
+    public double getDoubleValueAsDateTime() {
+        return (year * 10000 + month * 100 + day) * 1000000L + hour * 10000 + 
minute * 100 + second;
+    }
+
     @Override
     protected void toThrift(TExprNode msg) {
         if (type.isDatetimeV2()) {
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
index 2286daaa448..17b1eb39387 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
@@ -17,7 +17,9 @@
 
 package org.apache.doris.nereids.stats;
 
+import org.apache.doris.analysis.DateLiteral;
 import org.apache.doris.analysis.LiteralExpr;
+import org.apache.doris.analysis.StringLiteral;
 import org.apache.doris.nereids.stats.FilterEstimation.EstimationContext;
 import org.apache.doris.nereids.trees.TreeNode;
 import org.apache.doris.nereids.trees.expressions.And;
@@ -39,7 +41,10 @@ import org.apache.doris.nereids.trees.expressions.Slot;
 import org.apache.doris.nereids.trees.expressions.SlotReference;
 import org.apache.doris.nereids.trees.expressions.functions.Function;
 import org.apache.doris.nereids.trees.expressions.literal.Literal;
+import org.apache.doris.nereids.trees.expressions.literal.StringLikeLiteral;
 import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.DataType;
+import org.apache.doris.nereids.types.DateTimeType;
 import org.apache.doris.nereids.types.coercion.RangeScalable;
 import org.apache.doris.statistics.ColumnStatistic;
 import org.apache.doris.statistics.ColumnStatisticBuilder;
@@ -50,7 +55,10 @@ import org.apache.doris.statistics.StatisticsBuilder;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.Sets;
 
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 import java.util.function.Predicate;
 
@@ -183,22 +191,22 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
         }
     }
 
-    private Statistics updateLessThanLiteral(Expression leftExpr, 
ColumnStatistic statsForLeft,
+    private Statistics updateLessThanLiteral(Expression leftExpr, DataType 
dataType, ColumnStatistic statsForLeft,
             ColumnStatistic statsForRight, EstimationContext context) {
         StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, 
statsForLeft.minExpr,
                 statsForRight.maxValue, statsForRight.maxExpr,
-                statsForLeft.ndv, leftExpr.getDataType());
-        return estimateBinaryComparisonFilter(leftExpr,
+                statsForLeft.ndv, dataType);
+        return estimateBinaryComparisonFilter(leftExpr, dataType,
                 statsForLeft,
                 rightRange, context);
     }
 
-    private Statistics updateGreaterThanLiteral(Expression leftExpr, 
ColumnStatistic statsForLeft,
+    private Statistics updateGreaterThanLiteral(Expression leftExpr, DataType 
dataType, ColumnStatistic statsForLeft,
             ColumnStatistic statsForRight, EstimationContext context) {
         StatisticRange rightRange = new StatisticRange(statsForRight.minValue, 
statsForRight.minExpr,
                 statsForLeft.maxValue, statsForLeft.maxExpr,
-                statsForLeft.ndv, leftExpr.getDataType());
-        return estimateBinaryComparisonFilter(leftExpr, statsForLeft, 
rightRange, context);
+                statsForLeft.ndv, dataType);
+        return estimateBinaryComparisonFilter(leftExpr, dataType, 
statsForLeft, rightRange, context);
     }
 
     private Statistics calculateWhenLiteralRight(ComparisonPredicate cp,
@@ -210,14 +218,111 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
         if (cp instanceof EqualPredicate) {
             return estimateEqualTo(cp, statsForLeft, statsForRight, context);
         } else {
+            // literal Map used to covert dateLiteral back to stringLiteral
+            Map<DateLiteral, StringLiteral> literalMap = new HashMap<>();
+            DataType compareType = cp.left().getDataType();
+            Optional<ColumnStatistic> statsForLeftMayConvertedOpt =
+                    tryConvertStringColStatsToDateColStats(statsForLeft, 
literalMap);
+            Optional<ColumnStatistic> statsForRightMayConvertedOpt = 
(statsForLeftMayConvertedOpt.isPresent())
+                    ? tryConvertStringColStatsToDateColStats(statsForRight, 
literalMap)
+                    : Optional.empty();
+
+            boolean converted = false;
+            ColumnStatistic statsForLeftMayConverted = statsForLeft;
+            ColumnStatistic statsForRightMayConverted = statsForRight;
+            if (statsForLeftMayConvertedOpt.isPresent() && 
statsForRightMayConvertedOpt.isPresent()
+                    && statsForRightMayConvertedOpt.get().minExpr.getType()
+                    == statsForLeftMayConvertedOpt.get().minExpr.getType()) {
+                // string type is converted to date type
+                converted = true;
+                compareType = DateTimeType.INSTANCE;
+                statsForLeftMayConverted = statsForLeftMayConvertedOpt.get();
+                statsForRightMayConverted = statsForRightMayConvertedOpt.get();
+            }
+            Statistics result = null;
             if (cp instanceof LessThan || cp instanceof LessThanEqual) {
-                return updateLessThanLiteral(cp.left(), statsForLeft, 
statsForRight, context);
+                result = updateLessThanLiteral(cp.left(), compareType, 
statsForLeftMayConverted,
+                        statsForRightMayConverted, context);
             } else if (cp instanceof GreaterThan || cp instanceof 
GreaterThanEqual) {
-                return updateGreaterThanLiteral(cp.left(), statsForLeft, 
statsForRight, context);
+                result = updateGreaterThanLiteral(cp.left(), compareType, 
statsForLeftMayConverted,
+                        statsForRightMayConverted, context);
             } else {
                 throw new RuntimeException(String.format("Unexpected 
expression : %s", cp.toSql()));
             }
+            if (converted) {
+                // convert min/max of left.colStats back to string type
+                ColumnStatistic newLeftStats = 
result.findColumnStatistics(cp.left());
+                result.addColumnStats(cp.left(), 
convertDateColStatsToStringColStats(newLeftStats, literalMap));
+            }
+            return result;
+        }
+    }
+
+    private ColumnStatistic 
convertDateColStatsToStringColStats(ColumnStatistic colStats,
+            Map<DateLiteral, StringLiteral> literalMap) {
+        if (colStats.minExpr == null && colStats.maxExpr == null) {
+            // when sel=0, minExpr and maxExpr are both null
+            return colStats;
+        }
+        Preconditions.checkArgument(colStats.minExpr instanceof DateLiteral
+                        && colStats.maxExpr instanceof DateLiteral,
+                "cannot convert colStats back to stringType %s", 
colStats.toString());
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder(colStats);
+        StringLiteral newMinLiteral = new 
StringLiteral(colStats.maxExpr.toString());
+        return builder.setMaxExpr(newMinLiteral)
+                .setMaxExpr(literalMap.get(colStats.maxExpr))
+                
.setMaxValue(StringLikeLiteral.getDouble(colStats.maxExpr.toString()))
+                .setMinExpr(literalMap.get(colStats.minExpr))
+                
.setMinValue(StringLikeLiteral.getDouble(colStats.minExpr.getStringValue()))
+                .build();
+    }
+
+    private Optional<ColumnStatistic> 
tryConvertStringColStatsToDateColStats(ColumnStatistic colStats,
+            Map<DateLiteral, StringLiteral> literalMap) {
+        if (colStats.minExpr == null || colStats.maxExpr == null) {
+            return Optional.empty();
+        }
+        if (!(colStats.minExpr instanceof StringLiteral) || !(colStats.maxExpr 
instanceof StringLiteral)) {
+            return Optional.empty();
+        }
+        Optional<DateLiteral> newMinExpr = 
tryConvertStrLiteralToDateLiteral(colStats.minExpr);
+        if (newMinExpr.isEmpty()) {
+            return Optional.empty();
+        }
+        Optional<DateLiteral> newMaxExpr = 
tryConvertStrLiteralToDateLiteral(colStats.maxExpr);
+        if (newMaxExpr.isEmpty()) {
+            return Optional.empty();
+        }
+        if (newMaxExpr.get().getType() != newMinExpr.get().getType()) {
+            return Optional.empty();
+        }
+        literalMap.put(newMinExpr.get(), (StringLiteral) colStats.minExpr);
+        literalMap.put(newMaxExpr.get(), (StringLiteral) colStats.maxExpr);
+
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder(colStats);
+        return 
Optional.of(builder.setMinValue(newMinExpr.get().getDoubleValueAsDateTime())
+                .setMinExpr(newMinExpr.get())
+                .setMaxValue(newMaxExpr.get().getDoubleValueAsDateTime())
+                .setMaxExpr(newMaxExpr.get())
+                .build());
+    }
+
+    private Optional<DateLiteral> 
tryConvertStrLiteralToDateLiteral(LiteralExpr literal) {
+        if (literal == null) {
+            return Optional.empty();
+        }
+        if (!(literal instanceof StringLiteral)) {
+            return Optional.empty();
+        }
+
+        DateLiteral dt = null;
+        try {
+            dt = new DateLiteral(literal.getStringValue());
+            dt.checkValueValid();
+        } catch (Exception e) {
+            // ignore
         }
+        return dt == null ? Optional.empty() : Optional.of(dt);
     }
 
     private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic 
statsForLeft,
@@ -467,11 +572,11 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
         }
     }
 
-    private Statistics estimateBinaryComparisonFilter(Expression leftExpr, 
ColumnStatistic leftStats,
+    private Statistics estimateBinaryComparisonFilter(Expression leftExpr, 
DataType dataType, ColumnStatistic leftStats,
             StatisticRange rightRange, EstimationContext context) {
         StatisticRange leftRange =
                 new StatisticRange(leftStats.minValue, leftStats.minExpr, 
leftStats.maxValue, leftStats.maxExpr,
-                        leftStats.ndv, leftExpr.getDataType());
+                        leftStats.ndv, dataType);
         StatisticRange intersectRange = leftRange.cover(rightRange);
 
         ColumnStatisticBuilder leftColumnStatisticBuilder;
@@ -495,7 +600,7 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
                     .setNdv(intersectRange.getDistinctValues())
                     .setNumNulls(0);
             double sel = leftRange.overlapPercentWith(rightRange);
-            if (!(leftExpr.getDataType() instanceof RangeScalable) && (sel != 
0.0 && sel != 1.0)) {
+            if (!(dataType instanceof RangeScalable) && (sel != 0.0 && sel != 
1.0)) {
                 sel = DEFAULT_INEQUALITY_COEFFICIENT;
             }
             sel = getNotNullSelectivity(leftStats, sel);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java
index e0e28d9399c..815e5742d24 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java
@@ -39,11 +39,18 @@ public abstract class StringLikeLiteral extends Literal {
 
     @Override
     public double getDouble() {
+        return getDouble(value);
+    }
+
+    /**
+     * get double value
+     */
+    public static double getDouble(String str) {
         long v = 0;
         int pos = 0;
-        int len = Math.min(value.length(), 7);
+        int len = Math.min(str.length(), 7);
         while (pos < len) {
-            v += Byte.toUnsignedLong(value.getBytes()[pos]) << ((6 - pos) * 8);
+            v += Byte.toUnsignedLong(str.getBytes()[pos]) << ((6 - pos) * 8);
             pos++;
         }
         return (double) v;
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
index 687a4d7a54a..08aced49e14 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
@@ -1148,6 +1148,35 @@ class FilterEstimationTest {
      */
     @Test
     public void testStringRangeColToLiteral() {
+        SlotReference a = new SlotReference("a", new VarcharType(25));
+        ColumnStatisticBuilder columnStatisticBuilder = new 
ColumnStatisticBuilder()
+                .setNdv(100)
+                .setAvgSizeByte(25)
+                .setNumNulls(0)
+                .setMaxExpr(new StringLiteral("200"))
+                .setMaxValue(new VarcharLiteral("200").getDouble())
+                .setMinExpr(new StringLiteral("100"))
+                .setMinValue(new VarcharLiteral("100").getDouble())
+                .setCount(100);
+        StatisticsBuilder statsBuilder = new StatisticsBuilder();
+        statsBuilder.setRowCount(100);
+        statsBuilder.putColumnStatistics(a, columnStatisticBuilder.build());
+        Statistics baseStats = statsBuilder.build();
+        VarcharLiteral i500 = new VarcharLiteral("500");
+        Statistics filter500 = new FilterEstimation().estimate(new LessThan(a, 
i500), baseStats);
+        Assertions.assertEquals(100, filter500.getRowCount());
+
+        VarcharLiteral i10 = new VarcharLiteral("10");
+        Statistics filter10 = new FilterEstimation().estimate(new 
LessThan(i10, a), baseStats);
+        Assertions.assertEquals(100, filter10.getRowCount());
+
+        VarcharLiteral i199 = new VarcharLiteral("199");
+        Statistics filter199 = new FilterEstimation().estimate(new 
GreaterThan(a, i199), baseStats);
+        Assertions.assertEquals(50, filter199.getRowCount(), 0.01);
+    }
+
+    @Test
+    public void testStringRangeColToDateLiteral() {
         SlotReference a = new SlotReference("a", new VarcharType(25));
         ColumnStatisticBuilder columnStatisticBuilder = new 
ColumnStatisticBuilder()
                 .setNdv(100)
@@ -1172,7 +1201,7 @@ class FilterEstimationTest {
 
         VarcharLiteral year2021 = new VarcharLiteral("2021-12-01");
         Statistics filter2021 = new FilterEstimation().estimate(new 
GreaterThan(a, year2021), baseStats);
-        Assertions.assertEquals(50, filter2021.getRowCount());
+        Assertions.assertEquals(4.24, filter2021.getRowCount(), 0.01);
     }
 
     @Test


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to