Pajaraja commented on code in PR #49955: URL: https://github.com/apache/spark/pull/49955#discussion_r2000916949
########## sql/core/src/main/scala/org/apache/spark/sql/execution/UnionLoopExec.scala: ########## @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import scala.collection.mutable + +import org.apache.spark.SparkException +import org.apache.spark.rdd.{EmptyRDD, RDD} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.{LocalLimit, LogicalPlan, Union, UnionLoopRef} +import org.apache.spark.sql.classic.Dataset +import org.apache.spark.sql.execution.LogicalRDD.rewriteStatsAndConstraints +import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.internal.SQLConf + + +/** + * The physical node for recursion. Currently only UNION ALL case is supported. + * For the details about the execution, look at the comment above doExecute function. 
+ * + * A simple recursive query: + * {{{ + * WITH RECURSIVE t(n) AS ( + * SELECT 1 + * UNION ALL + * SELECT n+1 FROM t WHERE n < 5) + * SELECT * FROM t; + * }}} + * Corresponding logical plan for the recursive query above: + * {{{ + * WithCTE + * :- CTERelationDef 0, false + * : +- SubqueryAlias t + * : +- Project [1#0 AS n#3] + * : +- UnionLoop 0 + * : :- Project [1 AS 1#0] + * : : +- OneRowRelation + * : +- Project [(n#1 + 1) AS (n + 1)#2] + * : +- Filter (n#1 < 5) + * : +- SubqueryAlias t + * : +- Project [1#0 AS n#1] + * : +- UnionLoopRef 0, [1#0], false + * +- Project [n#3] + * +- SubqueryAlias t + * +- CTERelationRef 0, true, [n#3], false, false + * }}} + * + * @param loopId This is id of the CTERelationDef containing the recursive query. Its value is + * first passed down to UnionLoop when creating it, and then to UnionLoopExec in + * SparkStrategies. + * @param anchor The logical plan of the initial element of the loop. + * @param recursion The logical plan that describes the recursion with an [[UnionLoopRef]] node. + * CTERelationRef, which is marked as recursive, gets substituted with + * [[UnionLoopRef]] in ResolveWithCTE. + * Both anchor and recursion are marked with @transient annotation, so that they + * are not serialized. + * @param output The output attributes of this loop. + * @param limit If defined, the total number of rows output by this operator will be bounded by + * limit. + * Its value is pushed down to UnionLoop in Optimizer in case Limit node is present + * in the logical plan and then transferred to UnionLoopExec in SparkStrategies. + * Note here: limit can be applied in the main query calling the recursive CTE, and not + * inside the recursive term of recursive CTE. + */ +case class UnionLoopExec( + loopId: Long, + @transient anchor: LogicalPlan, + @transient recursion: LogicalPlan, + override val output: Seq[Attribute], + limit: Option[Int] = None) extends LeafExecNode { Review Comment: Why? 
Isn't it better not to have a limit if it doesn't exist, so as not to clutter the plan? Also, the semantic meaning of a limit of -1 isn't universal; it is just an internal convention. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org