[GitHub] [doris] englefly commented on a diff in pull request #17790: [enhancement](nereids) update FilterEstimation and Agg in stats derive

via GitHub Wed, 15 Mar 2023 17:42:31 -0700


englefly commented on code in PR #17790:
URL: https://github.com/apache/doris/pull/17790#discussion_r1137947707



##########
fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java:
##########
@@ -436,12 +437,44 @@ private Statistics computeAggregate(Aggregate<? extends 
Plan> aggregate) {
         // TODO: since we have no column stats here. just use a fix ratio to 
compute the row count.
         List<Expression> groupByExpressions = 
aggregate.getGroupByExpressions();
         Statistics childStats = groupExpression.childStatistics(0);
-        Map<Expression, ColumnStatistic> childSlotToColumnStats = 
childStats.columnStatistics();
-        double resultSetCount = groupByExpressions.stream().flatMap(expr -> 
expr.getInputSlots().stream())
-                
.filter(childSlotToColumnStats::containsKey).map(childSlotToColumnStats::get).map(s
 -> s.ndv)
-                .reduce(1d, (a, b) -> a * b);
-        if (resultSetCount <= 0) {
-            resultSetCount = 1L;
+        double resultSetCount = 1;
+        if (!groupByExpressions.isEmpty()) {
+            Map<Expression, ColumnStatistic> childSlotToColumnStats = 
childStats.columnStatistics();
+            double inputRowCount = childStats.getRowCount();
+            if (inputRowCount == 0) {
+                //on empty relation, Agg output 1 tuple
+                resultSetCount = 1;
+            } else {
+                //TODO: add column correlation for agg estimation
+                // group by A1, A2
+                // currently, the estimated row count is between max(ndv_A1, 
ndv_A2) and ndv_A1 * ndv_A2
+                resultSetCount = groupByExpressions.stream().flatMap(expr -> 
expr.getInputSlots().stream())
+                        .map(Slot::getExprId)
+                        .filter(childSlotToColumnStats::containsKey)
+                        .map(childSlotToColumnStats::get)
+                        .map(s -> {
+                            double adjustedNdv = s.ndv;
+                            if (s.isUnKnown) {

Review Comment:
   ok



##########
fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java:
##########
@@ -436,12 +437,44 @@ private Statistics computeAggregate(Aggregate<? extends 
Plan> aggregate) {
         // TODO: since we have no column stats here. just use a fix ratio to 
compute the row count.
         List<Expression> groupByExpressions = 
aggregate.getGroupByExpressions();
         Statistics childStats = groupExpression.childStatistics(0);
-        Map<Expression, ColumnStatistic> childSlotToColumnStats = 
childStats.columnStatistics();
-        double resultSetCount = groupByExpressions.stream().flatMap(expr -> 
expr.getInputSlots().stream())
-                
.filter(childSlotToColumnStats::containsKey).map(childSlotToColumnStats::get).map(s
 -> s.ndv)
-                .reduce(1d, (a, b) -> a * b);
-        if (resultSetCount <= 0) {
-            resultSetCount = 1L;
+        double resultSetCount = 1;
+        if (!groupByExpressions.isEmpty()) {

Review Comment:
   refactored



##########
fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java:
##########
@@ -72,17 +85,18 @@ public double getRowCount() {
 
     public Statistics withRowCount(double rowCount) {
         Statistics statistics = new Statistics(rowCount, new 
HashMap<>(expressionToColumnStats), width, penalty);
-        statistics.fix(rowCount / StatsMathUtil.nonZeroDivisor(this.rowCount));
+        statistics.fix(rowCount, StatsMathUtil.nonZeroDivisor(this.rowCount));
         return statistics;
     }
 
-    public void fix(double sel) {
+    public void fix(double newRowCount, double originRowCount) {
+        double sel = newRowCount / originRowCount;
         for (Entry<Expression, ColumnStatistic> entry : 
expressionToColumnStats.entrySet()) {
             ColumnStatistic columnStatistic = entry.getValue();
             ColumnStatisticBuilder columnStatisticBuilder = new 
ColumnStatisticBuilder(columnStatistic);
-            
columnStatisticBuilder.setNdv(Math.min(Math.ceil(columnStatistic.ndv * sel), 
rowCount));
-            
columnStatisticBuilder.setNumNulls(Math.min(Math.ceil(columnStatistic.numNulls 
* sel), rowCount));
-            
columnStatisticBuilder.setCount(Math.min(Math.ceil(columnStatistic.count * 
sel), rowCount));
+            columnStatisticBuilder.setNdv(computeNdv(columnStatistic.ndv, 
newRowCount, originRowCount));

Review Comment:
   done



##########
fe/fe-core/src/main/java/org/apache/doris/nereids/memo/GroupExpression.java:
##########
@@ -319,10 +320,8 @@ public String toString() {
             builder.append("#").append(ownerGroup.getGroupId().asInt());
         }
 
-        DecimalFormat decimalFormat = new DecimalFormat();
-        decimalFormat.setGroupingSize(3);
-        builder.append(" cost=").append(decimalFormat.format((long) cost));
-        builder.append(" estRows=").append(estOutputRowCount);
+        builder.append(" cost=").append(NUM_FORMAT.format((long) cost));
+        builder.append(" 
estRows=").append(NUM_FORMAT.format(estOutputRowCount));

Review Comment:
   Thanks for the review.
   This is mainly used for debugging. I will create a new instance locally.



##########
fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java:
##########
@@ -436,12 +437,44 @@ private Statistics computeAggregate(Aggregate<? extends 
Plan> aggregate) {
         // TODO: since we have no column stats here. just use a fix ratio to 
compute the row count.
         List<Expression> groupByExpressions = 
aggregate.getGroupByExpressions();
         Statistics childStats = groupExpression.childStatistics(0);
-        Map<Expression, ColumnStatistic> childSlotToColumnStats = 
childStats.columnStatistics();
-        double resultSetCount = groupByExpressions.stream().flatMap(expr -> 
expr.getInputSlots().stream())
-                
.filter(childSlotToColumnStats::containsKey).map(childSlotToColumnStats::get).map(s
 -> s.ndv)
-                .reduce(1d, (a, b) -> a * b);
-        if (resultSetCount <= 0) {
-            resultSetCount = 1L;
+        double resultSetCount = 1;
+        if (!groupByExpressions.isEmpty()) {
+            Map<Expression, ColumnStatistic> childSlotToColumnStats = 
childStats.columnStatistics();
+            double inputRowCount = childStats.getRowCount();
+            if (inputRowCount == 0) {
+                //on empty relation, Agg output 1 tuple
+                resultSetCount = 1;
+            } else {
+                //TODO: add column correlation for agg estimation
+                // group by A1, A2
+                // currently, the estimated row count is between max(ndv_A1, 
ndv_A2) and ndv_A1 * ndv_A2
+                resultSetCount = groupByExpressions.stream().flatMap(expr -> 
expr.getInputSlots().stream())
+                        .map(Slot::getExprId)
+                        .filter(childSlotToColumnStats::containsKey)
+                        .map(childSlotToColumnStats::get)
+                        .map(s -> {
+                            double adjustedNdv = s.ndv;
+                            if (s.isUnKnown) {
+                                adjustedNdv = 0;
+                            }
+                            if (s.ndv > inputRowCount) {
+                                adjustedNdv = inputRowCount;
+                            }
+                            return adjustedNdv;
+                        })
+                        .reduce(1.0, (a, b) -> a * b);
+                if (resultSetCount == 0) {
+                    //any group_by_key's stats is unknown
+                    resultSetCount = inputRowCount * DEFAULT_AGGREGATE_RATIO;
+                }
+                if (resultSetCount > inputRowCount) {
+                    // avoid ndv error propagation
+                    resultSetCount = inputRowCount;
+                }
+                if (resultSetCount < 1) {
+                    resultSetCount = 1;
+                }
+            }
         }
         resultSetCount = Math.min(resultSetCount, childStats.getRowCount());

Review Comment:
   I think 0.5 is acceptable



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

[GitHub] [doris] englefly commented on a diff in pull request #17790: [enhancement](nereids) update FilterEstimation and Agg in stats derive

Reply via email to