cloud-fan commented on code in PR #51003: URL: https://github.com/apache/spark/pull/51003#discussion_r2110913638
########## sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala: ########## @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.graph + +import scala.util.Try + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} +import org.apache.spark.sql.classic.DataFrame +import org.apache.spark.sql.pipelines.AnalysisWarning +import org.apache.spark.sql.pipelines.util.InputReadOptions +import org.apache.spark.sql.types.StructType + +/** + * A [[Flow]] is a node of data transformation in a dataflow graph. It describes the movement + * of data into a particular dataset. + */ +trait Flow extends GraphElement with Logging { + + /** The [[FlowFunction]] containing the user's query. */ + def func: FlowFunction + + val identifier: TableIdentifier + + /** + * The dataset that this Flow represents a write to. Since the DataflowGraph doesn't have a first- + * class concept of views, writing to a destination that isn't a Table or a Sink represents a + * view. + */ + val destinationIdentifier: TableIdentifier + + /** + * Whether this is a ONCE flow. ONCE flows should run only once per full refresh. 
+ */ + def once: Boolean = false + + /** The current catalog in the execution context when the query is defined. */ + def currentCatalog: Option[String] + + /** The current database in the execution context when the query is defined. */ + def currentDatabase: Option[String] + + /** The comment associated with this flow */ + def comment: Option[String] + + def sqlConf: Map[String, String] +} + +/** A wrapper for a resolved internal input that includes the identifier used in SubqueryAlias */ +case class ResolvedInput(input: Input, aliasIdentifier: AliasIdentifier) + +/** A wrapper for the lambda function that defines a [[Flow]]. */ +trait FlowFunction extends Logging { + + /** + * This function defines the transformations performed by a flow, expressed as a [[DataFrame]]. + * + * @param allInputs the set of identifiers for all the [[Input]]s defined in the + * [[DataflowGraph]]. + * @param availableInputs the list of all [[Input]]s available to this flow + * @param configuration the spark configurations that apply to this flow. + * @param currentCatalog The current catalog in execution context when the query is defined. + * @param currentDatabase The current database in execution context when the query is defined. + * @return the inputs actually used, and the [[DataFrame]] expression for the flow + */ + def call( + allInputs: Set[TableIdentifier], + availableInputs: Seq[Input], + configuration: Map[String, String], + currentCatalog: Option[String], + currentDatabase: Option[String] + ): FlowFunctionResult +} + +/** + * Holds the [[DataFrame]] returned by a [[FlowFunction]] along with the inputs used to + * construct it. 
+ * @param usedBatchInputs the identifiers of the complete inputs read by the flow + * @param usedStreamingInputs the identifiers of the incremental inputs read by the flow + * @param usedExternalInputs the identifiers of the external inputs read by the flow + * @param dataFrame the [[DataFrame]] expression executed by the flow if the flow can be resolved + */ +case class FlowFunctionResult( + requestedInputs: Set[TableIdentifier], + usedBatchInputs: Set[ResolvedInput], + usedStreamingInputs: Set[ResolvedInput], + usedExternalInputs: Set[String], + dataFrame: Try[DataFrame], + sqlConf: Map[String, String], + analysisWarnings: Seq[AnalysisWarning] = Nil) { + + /** + * Returns the names of all of the [[Input]]s used when resolving this [[Flow]]. If the + * flow failed to resolve, we return the all the datasets that were requested when evaluating the + * flow. + */ + def inputs: Set[TableIdentifier] = { + (batchInputs ++ streamingInputs).map(_.input.identifier) + } + + /** Names of [[Input]]s read completely by this [[Flow]]. */ + def batchInputs: Set[ResolvedInput] = usedBatchInputs Review Comment: Since this is a case class, shall we just name the field `batchInputs` (instead of `usedBatchInputs`) and remove this accessor method? ########## sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala: ########## @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.graph + +import scala.util.Try + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} +import org.apache.spark.sql.classic.DataFrame +import org.apache.spark.sql.pipelines.AnalysisWarning +import org.apache.spark.sql.pipelines.util.InputReadOptions +import org.apache.spark.sql.types.StructType + +/** + * A [[Flow]] is a node of data transformation in a dataflow graph. It describes the movement + * of data into a particular dataset. + */ +trait Flow extends GraphElement with Logging { + + /** The [[FlowFunction]] containing the user's query. */ + def func: FlowFunction + + val identifier: TableIdentifier + + /** + * The dataset that this Flow represents a write to. Since the DataflowGraph doesn't have a first- + * class concept of views, writing to a destination that isn't a Table or a Sink represents a + * view. + */ + val destinationIdentifier: TableIdentifier + + /** + * Whether this is a ONCE flow. ONCE flows should run only once per full refresh. + */ + def once: Boolean = false + + /** The current catalog in the execution context when the query is defined. */ + def currentCatalog: Option[String] + + /** The current database in the execution context when the query is defined. 
*/ + def currentDatabase: Option[String] + + /** The comment associated with this flow */ + def comment: Option[String] + + def sqlConf: Map[String, String] +} + +/** A wrapper for a resolved internal input that includes the identifier used in SubqueryAlias */ +case class ResolvedInput(input: Input, aliasIdentifier: AliasIdentifier) + +/** A wrapper for the lambda function that defines a [[Flow]]. */ +trait FlowFunction extends Logging { + + /** + * This function defines the transformations performed by a flow, expressed as a [[DataFrame]]. + * + * @param allInputs the set of identifiers for all the [[Input]]s defined in the + * [[DataflowGraph]]. + * @param availableInputs the list of all [[Input]]s available to this flow + * @param configuration the spark configurations that apply to this flow. + * @param currentCatalog The current catalog in execution context when the query is defined. + * @param currentDatabase The current database in execution context when the query is defined. + * @return the inputs actually used, and the [[DataFrame]] expression for the flow + */ + def call( + allInputs: Set[TableIdentifier], + availableInputs: Seq[Input], + configuration: Map[String, String], + currentCatalog: Option[String], + currentDatabase: Option[String] + ): FlowFunctionResult +} + +/** + * Holds the [[DataFrame]] returned by a [[FlowFunction]] along with the inputs used to + * construct it. 
+ * @param usedBatchInputs the identifiers of the complete inputs read by the flow + * @param usedStreamingInputs the identifiers of the incremental inputs read by the flow + * @param usedExternalInputs the identifiers of the external inputs read by the flow + * @param dataFrame the [[DataFrame]] expression executed by the flow if the flow can be resolved + */ +case class FlowFunctionResult( + requestedInputs: Set[TableIdentifier], + usedBatchInputs: Set[ResolvedInput], + usedStreamingInputs: Set[ResolvedInput], + usedExternalInputs: Set[String], + dataFrame: Try[DataFrame], + sqlConf: Map[String, String], + analysisWarnings: Seq[AnalysisWarning] = Nil) { + + /** + * Returns the names of all of the [[Input]]s used when resolving this [[Flow]]. If the + * flow failed to resolve, we return the all the datasets that were requested when evaluating the + * flow. + */ + def inputs: Set[TableIdentifier] = { + (batchInputs ++ streamingInputs).map(_.input.identifier) + } + + /** Names of [[Input]]s read completely by this [[Flow]]. */ + def batchInputs: Set[ResolvedInput] = usedBatchInputs + + /** Names of [[Input]]s read incrementally by this [[Flow]]. */ + def streamingInputs: Set[ResolvedInput] = usedStreamingInputs Review Comment: Ditto: since this is a case class, shall we just name the field `streamingInputs` (instead of `usedStreamingInputs`) and remove this accessor method? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org