github-actions[bot] commented on code in PR #63736: URL: https://github.com/apache/doris/pull/63736#discussion_r3325577962
########## fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PullUpProjectExprUnderTopN.java: ########## @@ -0,0 +1,486 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.rules.rewrite; + +import org.apache.doris.nereids.jobs.JobContext; +import org.apache.doris.nereids.properties.OrderKey; +import org.apache.doris.nereids.trees.expressions.Alias; +import org.apache.doris.nereids.trees.expressions.ExprId; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.NamedExpression; +import org.apache.doris.nereids.trees.expressions.Slot; +import org.apache.doris.nereids.trees.expressions.functions.NoneMovableFunction; +import org.apache.doris.nereids.trees.expressions.literal.Literal; +import org.apache.doris.nereids.trees.plans.Plan; +import org.apache.doris.nereids.trees.plans.logical.LogicalAggregate; +import org.apache.doris.nereids.trees.plans.logical.LogicalCTEProducer; +import org.apache.doris.nereids.trees.plans.logical.LogicalExcept; +import org.apache.doris.nereids.trees.plans.logical.LogicalIntersect; +import org.apache.doris.nereids.trees.plans.logical.LogicalProject; +import org.apache.doris.nereids.trees.plans.logical.LogicalRelation; +import org.apache.doris.nereids.trees.plans.logical.LogicalRepeat; +import org.apache.doris.nereids.trees.plans.logical.LogicalSetOperation; +import org.apache.doris.nereids.trees.plans.logical.LogicalTopN; +import org.apache.doris.nereids.trees.plans.logical.LogicalUnion; +import org.apache.doris.nereids.trees.plans.logical.LogicalWindow; +import org.apache.doris.nereids.trees.plans.visitor.CustomRewriter; +import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanRewriter; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.SessionVariable; + +import com.google.common.collect.ImmutableList; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.IdentityHashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Pull up non-trivial expressions from Projects below TopN to above TopN, + * exposing their input base columns as lazy materialization candidates. + * + * <p>Two-pass CustomRewriter: + * <ol> + * <li><b>Collector (top-down)</b>: walk the plan tree, find qualifying TopNs, + * walk into their descendants to find Projects with pull-able expressions. + * Any operator that references a slot blocks pulling up expressions that + * output that slot past it. Boundary nodes (Aggregate, Window, Repeat, + * Relation, CTEProducer) stop the walk. + * Set operators are treated as blockers for the current TopN but their + * children are still traversed so nested TopNs inside them are visited.</li> + * <li><b>Replacer (bottom-up)</b>: simplify found Projects and add upper + * Projects above TopN to restore pulled-up expressions.</li> + * </ol> + */ +public class PullUpProjectExprUnderTopN implements CustomRewriter { + + @Override + public Plan rewriteRoot(Plan plan, JobContext jobContext) { + ConnectContext ctx = jobContext.getCascadesContext() + .getStatementContext().getConnectContext(); + if (ctx != null && !ctx.getSessionVariable().enableTopnExprPullup) { + return plan; + } + + // Pass 1: Collect pull-up info + CollectorContext collectorCtx = new CollectorContext(); + plan.accept(new Collector(), collectorCtx); + + if (collectorCtx.topNToPullUpInfo.isEmpty()) { + return plan; + } + + // Pass 2: Replace/restructure Review Comment: `deduplicatePullUps()` is never called, and `getPullUpInfoForProject()` returns only the first `PullUpInfo` that references a project. In a nested TopN shape like the new `testDeduplicatePullUpToOutermostTopN` scenario, the outer TopN records only `y` while the inner TopN records `x` and `y`; when the bottom project is rewritten, it is simplified with the outer info only, so `x` remains computed below the inner TopN even though an upper project is also added for it. That means the inner-only expression is not actually pulled out of the TopN input, so the intended lazy-materialization benefit is lost and the plan computes the expression twice. Please either call the deduplication pass before replacement and/or make project simplification account for each TopN's own pulled expressions. ########## fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PullUpProjectExprUnderTopN.java: ########## @@ -0,0 +1,486 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.rules.rewrite; + +import org.apache.doris.nereids.jobs.JobContext; +import org.apache.doris.nereids.properties.OrderKey; +import org.apache.doris.nereids.trees.expressions.Alias; +import org.apache.doris.nereids.trees.expressions.ExprId; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.NamedExpression; +import org.apache.doris.nereids.trees.expressions.Slot; +import org.apache.doris.nereids.trees.expressions.functions.NoneMovableFunction; +import org.apache.doris.nereids.trees.expressions.literal.Literal; +import org.apache.doris.nereids.trees.plans.Plan; +import org.apache.doris.nereids.trees.plans.logical.LogicalAggregate; +import org.apache.doris.nereids.trees.plans.logical.LogicalCTEProducer; +import org.apache.doris.nereids.trees.plans.logical.LogicalExcept; +import org.apache.doris.nereids.trees.plans.logical.LogicalIntersect; +import org.apache.doris.nereids.trees.plans.logical.LogicalProject; +import org.apache.doris.nereids.trees.plans.logical.LogicalRelation; +import org.apache.doris.nereids.trees.plans.logical.LogicalRepeat; +import org.apache.doris.nereids.trees.plans.logical.LogicalSetOperation; +import org.apache.doris.nereids.trees.plans.logical.LogicalTopN; +import org.apache.doris.nereids.trees.plans.logical.LogicalUnion; +import org.apache.doris.nereids.trees.plans.logical.LogicalWindow; +import org.apache.doris.nereids.trees.plans.visitor.CustomRewriter; +import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanRewriter; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.SessionVariable; + +import com.google.common.collect.ImmutableList; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.IdentityHashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Pull up non-trivial expressions from Projects below TopN to above TopN, + * exposing their input base columns as lazy materialization candidates. + * + * <p>Two-pass CustomRewriter: + * <ol> + * <li><b>Collector (top-down)</b>: walk the plan tree, find qualifying TopNs, + * walk into their descendants to find Projects with pull-able expressions. + * Any operator that references a slot blocks pulling up expressions that + * output that slot past it. Boundary nodes (Aggregate, Window, Repeat, + * Relation, CTEProducer) stop the walk. + * Set operators are treated as blockers for the current TopN but their + * children are still traversed so nested TopNs inside them are visited.</li> + * <li><b>Replacer (bottom-up)</b>: simplify found Projects and add upper + * Projects above TopN to restore pulled-up expressions.</li> + * </ol> + */ +public class PullUpProjectExprUnderTopN implements CustomRewriter { + + @Override + public Plan rewriteRoot(Plan plan, JobContext jobContext) { + ConnectContext ctx = jobContext.getCascadesContext() + .getStatementContext().getConnectContext(); + if (ctx != null && !ctx.getSessionVariable().enableTopnExprPullup) { + return plan; + } + + // Pass 1: Collect pull-up info + CollectorContext collectorCtx = new CollectorContext(); + plan.accept(new Collector(), collectorCtx); + + if (collectorCtx.topNToPullUpInfo.isEmpty()) { + return plan; + } + + // Pass 2: Replace/restructure + return plan.accept(new Replacer(), collectorCtx); + } + + // ========================================================================= + // Data structures + // ========================================================================= + + /** Info collected per TopN about which expressions to pull up from which Projects. */ + static class PullUpInfo { + final LogicalTopN topN; + final List<Slot> originalTopNOutput; + final List<NamedExpression> allPulledUpExprs = new ArrayList<>(); + final Map<LogicalProject<? extends Plan>, List<NamedExpression>> projectToPulledUpExprs + = new LinkedHashMap<>(); + final Map<ExprId, List<Slot>> baseSlotsByExpr = new HashMap<>(); + + PullUpInfo(LogicalTopN topN) { + this.topN = topN; + this.originalTopNOutput = ImmutableList.copyOf(topN.getOutput()); + } + + void addPulledUpExpr(LogicalProject<? extends Plan> project, NamedExpression expr) { + allPulledUpExprs.add(expr); + projectToPulledUpExprs.computeIfAbsent(project, k -> new ArrayList<>()).add(expr); + baseSlotsByExpr.put(expr.getExprId(), ImmutableList.copyOf(expr.getInputSlots())); + } + } + + /** Context shared between collector and replacer passes. */ + static class CollectorContext { + final Map<LogicalTopN, PullUpInfo> topNToPullUpInfo = new LinkedHashMap<>(); + int cteProducerDepth = 0; + + boolean hasPullUpInfo(LogicalTopN topN) { + return topNToPullUpInfo.containsKey(topN); + } + + PullUpInfo getPullUpInfo(LogicalTopN topN) { + return topNToPullUpInfo.get(topN); + } + + PullUpInfo getPullUpInfoForProject(LogicalProject<? extends Plan> project) { + for (PullUpInfo info : topNToPullUpInfo.values()) { + if (info.projectToPulledUpExprs.containsKey(project)) { + return info; + } + } + return null; + } + } + + // ========================================================================= + // Pass 1: Collector (top-down) + // ========================================================================= + + private static boolean qualifiesForLazyMatThreshold(LogicalTopN topN) { + long limit = topN.getLimit(); + if (limit <= 0) { + return false; + } + long threshold = SessionVariable.getTopNLazyMaterializationThreshold(); + return threshold >= limit; + } + + static class Collector extends DefaultPlanRewriter<CollectorContext> { + + @Override + public Plan visitLogicalCTEProducer( + LogicalCTEProducer<? extends Plan> cteProducer, CollectorContext context) { + context.cteProducerDepth++; + try { + return visit(cteProducer, context); + } finally { + context.cteProducerDepth--; + } + } + + @Override + public Plan visitLogicalTopN(LogicalTopN topN, CollectorContext context) { + if (context.cteProducerDepth > 0 + || !qualifiesForLazyMatThreshold(topN)) { + return visit(topN, context); + } + PullUpInfo info = new PullUpInfo(topN); + // Seed blockedExprIds with this TopN's order key ExprIds so that + // expressions used by order keys are not pulled up past this TopN. + Set<ExprId> blockedExprIds = buildOrderKeyExprIds(topN); + collectFromNode((Plan) topN.child(0), info, blockedExprIds); + if (!info.allPulledUpExprs.isEmpty()) { + context.topNToPullUpInfo.put(topN, info); + } + return visit(topN, context); + } + } + + /** + * Recursively walk down from a TopN's child to find Projects with pull-able expressions. + * + * <p>{@code blockedExprIds} contains ExprIds of slots that are referenced by operators + * along the path from the TopN to the current node. An expression whose output ExprId + * is in this set cannot be pulled up past the operators that reference it. + */ + private static void collectFromNode(Plan node, PullUpInfo info, Set<ExprId> blockedExprIds) { + if (node instanceof LogicalProject) { + LogicalProject<? extends Plan> project = (LogicalProject<? extends Plan>) node; + for (NamedExpression ne : project.getProjects()) { + if (canPullUp(ne) && !blockedExprIds.contains(ne.getExprId())) { + info.addPulledUpExpr(project, ne); + } + } + // Continue into the project's child. Chained projects are all visited. + collectFromNode((Plan) project.child(0), info, blockedExprIds); + return; + } + + if (node instanceof LogicalTopN) { + LogicalTopN inner = (LogicalTopN) node; + // TopN preserves all input columns, so it doesn't block by itself. + // However, its order keys consume slots, so add them to blocked set. + // Do NOT reset blockedExprIds — intermediate operators between the + // outer and inner TopN must still block expressions. + Set<ExprId> newBlocked = new HashSet<>(blockedExprIds); + newBlocked.addAll(buildOrderKeyExprIds(inner)); + collectFromNode((Plan) inner.child(0), info, newBlocked); + return; + } + + // Stop at boundary nodes that transform the schema or are data sources. + if (node instanceof LogicalRelation || node instanceof LogicalCTEProducer + || isBlockingNode(node)) { + return; + } + + // For set operators, walk into each child. + // blockedExprIds must be mapped from the set operator's output ExprIds + // to each child's output ExprIds via regularChildrenOutputs. + // + // Union ALL is transparent: it only concatenates rows and does not Review Comment: This traversal makes the current TopN collect expressions from below a set operator. That is not semantics-preserving for UNION ALL unless every child computes exactly the same expression. For example, `select x, id from (select a + 1 as x, id from t1 union all select a + 2 as x, id from t2) u order by id limit 10`: `x` is not an order key, so both child `Project` expressions are collected and simplified to expose `a`. The single project added above the TopN cannot represent the branch-specific `a + 1` versus `a + 2` semantics (and may also reference only one child expression id), so rows from one branch can get the wrong value or the union inputs can stop matching their declared child outputs. Please treat set operations as a boundary for the current TopN; it is fine to let the normal visitor continue into their children so nested TopNs are handled independently. ########## fe/fe-core/src/main/java/org/apache/doris/nereids/processor/post/materialize/LazyMaterializeTopN.java: ########## @@ -154,47 +159,28 @@ private Plan computeTopN(PhysicalTopN topN, CascadesContext ctx) { tvfRelation.getFunction().getName() + ".global_row_id", false, Integer.MAX_VALUE); SlotReference rowIdSlot = SlotReference.fromColumn(threadStatementContext.getNextExprId(), tvfRelation.getFunction().getTable(), rowIdCol, ImmutableList.of()); - result = result.accept(new LazySlotPruning(), - new LazySlotPruning.Context((PhysicalTVFRelation) relation, - rowIdSlot, relationToLazySlotMap.get(relation))); + result = result.accept(createLazySlotPruning(), new LazySlotPruning.Context( + (PhysicalTVFRelation) relation, + rowIdSlot, relationToLazySlotMap.get(relation))); relationToRowId.put(tvfRelation, rowIdSlot); rowIdSet.add(rowIdSlot); } else { - // should not reach here. throw new RuntimeException("LazyMaterializeTopN not support this relation." + relation); } } - // materialize.child.output requires - // rowId only appears once. - // that is [a, rowId1, b rowId1] is not acceptable List<SlotReference> materializeInput = moveRowIdsToTail(result.getOutput(), rowIdSet); if (materializeInput == null) { - /* - topn - -->any - => - project - -->materialize - -->topn - -->any - */ - result = new PhysicalLazyMaterialize(result, result.getOutput(), + List<Slot> correctInput = new ArrayList<>(materializedSlots); + for (SlotReference rowId : relationToRowId.values()) { + correctInput.add(rowId); Review Comment: When `moveRowIdsToTail(result.getOutput(), rowIdSet)` returns `null`, the child output is already in a valid `[materialized slots..., row ids...]` layout. Rebuilding `correctInput` from `materializedSlots` plus `relationToRowId.values()` can change the row-id order for multi-relation lazy materialization because the row ids are stored in hash-based maps. `PhysicalLazyMaterialize` uses this list to derive the row-id/fetch relation order, so a plan with lazy columns from two joined relations can bind the fetch expression order differently from the actual child tuple layout. Please preserve `result.getOutput()` in this branch instead of reconstructing the input list. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
