This is an automated email from the ASF dual-hosted git repository.

englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 90b2ee90b22 [nereids] consider numNulls in filter estimation (#29184)
90b2ee90b22 is described below

commit 90b2ee90b22c0c4a1b42f78ab66972cf790ca449
Author: xzj7019 <131111794+xzj7...@users.noreply.github.com>
AuthorDate: Tue Jan 2 13:51:11 2024 +0800

    [nereids] consider numNulls in filter estimation (#29184)
    
    consider numNulls in filter estimation
---
 .../doris/nereids/stats/FilterEstimation.java      |  59 ++++--
 .../doris/nereids/stats/FilterEstimationTest.java  | 203 ++++++++++++++++++++-
 2 files changed, 243 insertions(+), 19 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
index b716b350f24..c086aaef5c8 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
@@ -117,6 +117,8 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
                     
colBuilder.setMinValue(union.getLow()).setMinExpr(union.getLowExpr())
                             
.setMaxValue(union.getHigh()).setMaxExpr(union.getHighExpr())
                             .setNdv(union.getDistinctValues());
+                    double maxNumNulls = Math.max(leftColStats.numNulls, 
rightColStats.numNulls);
+                    colBuilder.setNumNulls(Math.min(colBuilder.getCount(), 
maxNumNulls));
                     orStats.addColumnStats(slot, colBuilder.build());
                 }
             }
@@ -175,7 +177,7 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
     }
 
     private Statistics updateLessThanLiteral(Expression leftExpr, 
ColumnStatistic statsForLeft,
-            ColumnStatistic statsForRight, EstimationContext context, boolean 
contains) {
+            ColumnStatistic statsForRight, EstimationContext context) {
         StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, 
statsForLeft.minExpr,
                 statsForRight.maxValue, statsForRight.maxExpr,
                 statsForLeft.ndv, leftExpr.getDataType());
@@ -185,7 +187,7 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
     }
 
     private Statistics updateGreaterThanLiteral(Expression leftExpr, 
ColumnStatistic statsForLeft,
-            ColumnStatistic statsForRight, EstimationContext context, boolean 
contains) {
+            ColumnStatistic statsForRight, EstimationContext context) {
         StatisticRange rightRange = new StatisticRange(statsForRight.minValue, 
statsForRight.minExpr,
                 statsForLeft.maxValue, statsForLeft.maxExpr,
                 statsForLeft.ndv, leftExpr.getDataType());
@@ -202,12 +204,9 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
             return estimateEqualTo(cp, statsForLeft, statsForRight, context);
         } else {
             if (cp instanceof LessThan || cp instanceof LessThanEqual) {
-                return updateLessThanLiteral(cp.left(), statsForLeft, 
statsForRight,
-                        context, cp instanceof LessThanEqual);
+                return updateLessThanLiteral(cp.left(), statsForLeft, 
statsForRight, context);
             } else if (cp instanceof GreaterThan || cp instanceof 
GreaterThanEqual) {
-
-                return updateGreaterThanLiteral(cp.left(), statsForLeft, 
statsForRight, context,
-                        cp instanceof GreaterThanEqual);
+                return updateGreaterThanLiteral(cp.left(), statsForLeft, 
statsForRight, context);
             } else {
                 throw new RuntimeException(String.format("Unexpected 
expression : %s", cp.toSql()));
             }
@@ -225,6 +224,7 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
         } else {
             selectivity = StatsMathUtil.minNonNaN(1.0, 1.0 / ndv);
         }
+        selectivity = getNotNullSelectivity(statsForLeft, selectivity);
         Statistics equalStats = context.statistics.withSel(selectivity);
         Expression left = cp.left();
         equalStats.addColumnStats(left, statsForRight);
@@ -331,10 +331,12 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
                 selectivity = 1.0;
             }
         }
+        compareExprStatsBuilder.setNumNulls(0);
         Statistics estimated = new Statistics(context.statistics);
+        ColumnStatistic stats = compareExprStatsBuilder.build();
+        selectivity = getNotNullSelectivity(stats, selectivity);
         estimated = estimated.withSel(selectivity);
-        estimated.addColumnStats(compareExpr,
-                compareExprStatsBuilder.build());
+        estimated.addColumnStats(compareExpr, stats);
         context.addKeyIfSlot(compareExpr);
         return estimated;
     }
@@ -394,6 +396,11 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
                             .setMaxValue(originColStats.maxValue)
                             .setMaxExpr(originColStats.maxExpr);
                 }
+                if (not.child().getInputSlots().size() == 1 && !(child 
instanceof IsNull)) {
+                    // only consider the single column numNull, otherwise, 
ignore
+                    rowCount = Math.max(rowCount - originColStats.numNulls, 1);
+                    statisticsBuilder.setRowCount(rowCount);
+                }
                 statisticsBuilder.putColumnStatistics(slot, 
colBuilder.build());
             }
         }
@@ -460,15 +467,18 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
                     .setMaxValue(Double.POSITIVE_INFINITY)
                     .setMaxExpr(null)
                     .setNdv(0)
-                    .setCount(0);
+                    .setCount(0)
+                    .setNumNulls(0);
         } else {
             leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats)
                     .setMinValue(intersectRange.getLow())
                     .setMinExpr(intersectRange.getLowExpr())
                     .setMaxValue(intersectRange.getHigh())
                     .setMaxExpr(intersectRange.getHighExpr())
-                    .setNdv(intersectRange.getDistinctValues());
+                    .setNdv(intersectRange.getDistinctValues())
+                    .setNumNulls(0);
             double sel = leftRange.overlapPercentWith(rightRange);
+            sel = getNotNullSelectivity(leftStats, sel);
             updatedStatistics = context.statistics.withSel(sel);
             
leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount());
         }
@@ -488,6 +498,7 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
         intersectBuilder.setNdv(intersect.getDistinctValues());
         intersectBuilder.setMinValue(intersect.getLow());
         intersectBuilder.setMaxValue(intersect.getHigh());
+        intersectBuilder.setNumNulls(0);
         double sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, 
rightStats.ndv));
         Statistics updatedStatistics = context.statistics.withSel(sel);
         updatedStatistics.addColumnStats(leftExpr, intersectBuilder.build());
@@ -568,10 +579,34 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
                     "col stats not found. slot=%s in %s",
                     like.left().toSql(), like.toSql());
             ColumnStatisticBuilder colBuilder = new 
ColumnStatisticBuilder(origin);
-            colBuilder.setNdv(origin.ndv * 
DEFAULT_LIKE_COMPARISON_SELECTIVITY).setNumNulls(0);
+            double selectivity = 
StatsMathUtil.divide(DEFAULT_LIKE_COMPARISON_SELECTIVITY, origin.ndv);
+            double notNullSel = getNotNullSelectivity(origin, selectivity);
+            colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY)
+                    .setCount(notNullSel * 
context.statistics.getRowCount()).setNumNulls(0);
             statsBuilder.putColumnStatistics(like.left(), colBuilder.build());
             context.addKeyIfSlot(like.left());
         }
         return statsBuilder.build();
     }
+
+    private double getNotNullSelectivity(ColumnStatistic stats, double 
origSel) {
+        double rowCount = stats.count;
+        double numNulls = stats.numNulls;
+
+        // comment following check since current rowCount and ndv may be 
inconsistant
+        // e.g, rowCount has been reduced by one filter but another filter 
column's
+        // ndv and numNull remains originally, which will unexpectedly go into 
the following
+        // normalization.
+
+        //if (numNulls > rowCount - ndv) {
+        //    numNulls = rowCount - ndv > 0 ? rowCount - ndv : 0;
+        //}
+        double notNullSel = rowCount <= 1.0 ? 1.0 : 1 - 
getValidSelectivity(numNulls / rowCount);
+        double validSel = origSel * notNullSel;
+        return getValidSelectivity(validSel);
+    }
+
+    private static double getValidSelectivity(double nullSel) {
+        return nullSel < 0 ? 0 : (nullSel > 1 ? 1 : nullSel);
+    }
 }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
index f5491c63331..b476cc563be 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
@@ -64,10 +64,10 @@ class FilterEstimationTest {
         Or or = new Or(greaterThan1, lessThan);
         Map<Expression, ColumnStatistic> columnStat = new HashMap<>();
         ColumnStatistic aStats = new 
ColumnStatisticBuilder().setCount(500).setNdv(500).setAvgSizeByte(4)
-                .setNumNulls(500).setDataSize(0)
+                .setNumNulls(0).setDataSize(0)
                 .setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
         ColumnStatistic bStats = new 
ColumnStatisticBuilder().setCount(500).setNdv(500).setAvgSizeByte(4)
-                .setNumNulls(500).setDataSize(0)
+                .setNumNulls(0).setDataSize(0)
                 
.setMinValue(0).setMaxValue(1000).setMinExpr(null).setIsUnknown(true).build();
         columnStat.put(a, aStats);
         columnStat.put(b, bStats);
@@ -93,10 +93,10 @@ class FilterEstimationTest {
         And and = new And(greaterThan1, lessThan);
         Map<Expression, ColumnStatistic> columnStat = new HashMap<>();
         ColumnStatistic aStats = new 
ColumnStatisticBuilder().setCount(500).setNdv(500)
-                .setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
+                .setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
                 .setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
         ColumnStatistic bStats = new 
ColumnStatisticBuilder().setCount(500).setNdv(500)
-                .setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
+                .setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
                 
.setMinValue(0).setMaxValue(1000).setMinExpr(null).setIsUnknown(true).build();
         columnStat.put(a, aStats);
         columnStat.put(b, bStats);
@@ -185,13 +185,13 @@ class FilterEstimationTest {
         Or or = new Or(and, equalTo);
         Map<Expression, ColumnStatistic> slotToColumnStat = new HashMap<>();
         ColumnStatistic aStats = new 
ColumnStatisticBuilder().setCount(500).setNdv(500)
-                .setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
+                .setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
                 .setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
         ColumnStatistic bStats = new 
ColumnStatisticBuilder().setCount(500).setNdv(500)
-                .setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
+                .setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
                 .setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
         ColumnStatistic cStats = new 
ColumnStatisticBuilder().setCount(500).setNdv(500)
-                .setAvgSizeByte(4).setNumNulls(500).setDataSize(0)
+                .setAvgSizeByte(4).setNumNulls(0).setDataSize(0)
                 .setMinValue(0).setMaxValue(1000).setMinExpr(null).build();
         slotToColumnStat.put(a, aStats);
         slotToColumnStat.put(b, bStats);
@@ -910,4 +910,193 @@ class FilterEstimationTest {
         Statistics result = filterEstimation.estimate(not, stats);
         Assertions.assertEquals(result.getRowCount(), 90);
     }
+
+    /**
+     * a = 1
+     */
+    @Test
+    public void testNumNullsEqualTo() {
+        SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+                .setNdv(2)
+                .setAvgSizeByte(4)
+                .setNumNulls(8)
+                .setMaxValue(2)
+                .setMinValue(1)
+                .setCount(10);
+        IntegerLiteral int1 = new IntegerLiteral(1);
+        EqualTo equalTo = new EqualTo(a, int1);
+        Statistics stats = new Statistics(10, new HashMap<>());
+        stats.addColumnStats(a, builder.build());
+        FilterEstimation filterEstimation = new FilterEstimation();
+        Statistics result = filterEstimation.estimate(equalTo, stats);
+        Assertions.assertEquals(result.getRowCount(), 1.0, 0.01);
+    }
+
+    /**
+     * a > 1
+     */
+    @Test
+    public void testNumNullsComparable() {
+        SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+                .setNdv(2)
+                .setAvgSizeByte(4)
+                .setNumNulls(8)
+                .setMaxValue(2)
+                .setMinValue(1)
+                .setCount(10);
+        IntegerLiteral int1 = new IntegerLiteral(1);
+        GreaterThan greaterThan = new GreaterThan(a, int1);
+        Statistics stats = new Statistics(10, new HashMap<>());
+        stats.addColumnStats(a, builder.build());
+        FilterEstimation filterEstimation = new FilterEstimation();
+        Statistics result = filterEstimation.estimate(greaterThan, stats);
+        Assertions.assertEquals(result.getRowCount(), 2.0, 0.01);
+    }
+
+    /**
+     * a in (1, 2)
+     */
+    @Test
+    public void testNumNullsIn() {
+        SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+                .setNdv(2)
+                .setAvgSizeByte(4)
+                .setNumNulls(8)
+                .setMaxValue(2)
+                .setMinValue(1)
+                .setCount(10);
+        IntegerLiteral int1 = new IntegerLiteral(1);
+        IntegerLiteral int2 = new IntegerLiteral(2);
+        InPredicate in = new InPredicate(a, Lists.newArrayList(int1, int2));
+        Statistics stats = new Statistics(10, new HashMap<>());
+        stats.addColumnStats(a, builder.build());
+        FilterEstimation filterEstimation = new FilterEstimation();
+        Statistics result = filterEstimation.estimate(in, stats);
+        Assertions.assertEquals(result.getRowCount(), 10.0, 0.01);
+    }
+
+    /**
+     * not a = 1
+     */
+    @Test
+    public void testNumNullsNotEqualTo() {
+        SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+                .setNdv(2)
+                .setAvgSizeByte(4)
+                .setNumNulls(8)
+                .setMaxValue(2)
+                .setMinValue(1)
+                .setCount(10);
+        IntegerLiteral int1 = new IntegerLiteral(1);
+        EqualTo equalTo = new EqualTo(a, int1);
+        Not not = new Not(equalTo);
+        Statistics stats = new Statistics(10, new HashMap<>());
+        stats.addColumnStats(a, builder.build());
+        FilterEstimation filterEstimation = new FilterEstimation();
+        Statistics result = filterEstimation.estimate(not, stats);
+        Assertions.assertEquals(result.getRowCount(), 1.0, 0.01);
+    }
+
+    /**
+     * a not in (1, 2)
+     */
+    @Test
+    public void testNumNullsNotIn() {
+        SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+                .setNdv(2)
+                .setAvgSizeByte(4)
+                .setNumNulls(8)
+                .setMaxValue(2)
+                .setMinValue(1)
+                .setCount(10);
+        IntegerLiteral int1 = new IntegerLiteral(1);
+        IntegerLiteral int2 = new IntegerLiteral(2);
+        InPredicate in = new InPredicate(a, Lists.newArrayList(int1, int2));
+        Not not = new Not(in);
+        Statistics stats = new Statistics(10, new HashMap<>());
+        stats.addColumnStats(a, builder.build());
+        FilterEstimation filterEstimation = new FilterEstimation();
+        Statistics result = filterEstimation.estimate(not, stats);
+        Assertions.assertEquals(result.getRowCount(), 1.0, 0.01);
+    }
+
+    /**
+     * a >= 1 and a <= 2
+     */
+    @Test
+    public void testNumNullsAnd() {
+        SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+                .setNdv(2)
+                .setAvgSizeByte(4)
+                .setNumNulls(8)
+                .setMaxValue(2)
+                .setMinValue(1)
+                .setCount(10);
+        IntegerLiteral int1 = new IntegerLiteral(1);
+        IntegerLiteral int2 = new IntegerLiteral(2);
+        GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int1);
+        LessThanEqual lessThanEqual = new LessThanEqual(a, int2);
+        And and = new And(greaterThanEqual, lessThanEqual);
+        Statistics stats = new Statistics(10, new HashMap<>());
+        stats.addColumnStats(a, builder.build());
+        FilterEstimation filterEstimation = new FilterEstimation();
+        Statistics result = filterEstimation.estimate(and, stats);
+        Assertions.assertEquals(result.getRowCount(), 2.0, 0.01);
+    }
+
+    /**
+     * a >= 1 or a <= 2
+     */
+    @Test
+    public void testNumNullsOr() {
+        SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+                .setNdv(2)
+                .setAvgSizeByte(4)
+                .setNumNulls(8)
+                .setMaxValue(2)
+                .setMinValue(1)
+                .setCount(10);
+        IntegerLiteral int1 = new IntegerLiteral(1);
+        IntegerLiteral int2 = new IntegerLiteral(2);
+        GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int2);
+        LessThanEqual lessThanEqual = new LessThanEqual(a, int1);
+        Or or = new Or(greaterThanEqual, lessThanEqual);
+        Statistics stats = new Statistics(10, new HashMap<>());
+        stats.addColumnStats(a, builder.build());
+        FilterEstimation filterEstimation = new FilterEstimation();
+        Statistics result = filterEstimation.estimate(or, stats);
+        Assertions.assertEquals(result.getRowCount(), 2.0, 0.01);
+    }
+
+    /**
+     * a >= 1 or a is null
+     */
+    @Test
+    public void testNumNullsOrIsNull() {
+        SlotReference a = new SlotReference("a", IntegerType.INSTANCE);
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+                .setNdv(2)
+                .setAvgSizeByte(4)
+                .setNumNulls(8)
+                .setMaxValue(2)
+                .setMinValue(1)
+                .setCount(10);
+        IntegerLiteral int1 = new IntegerLiteral(1);
+        GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int1);
+        IsNull isNull = new IsNull(a);
+        Or or = new Or(greaterThanEqual, isNull);
+        Statistics stats = new Statistics(10, new HashMap<>());
+        stats.addColumnStats(a, builder.build());
+        FilterEstimation filterEstimation = new FilterEstimation();
+        Statistics result = filterEstimation.estimate(or, stats);
+        Assertions.assertEquals(result.getRowCount(), 10.0, 0.01);
+    }
+
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to