aakash-db commented on code in PR #51003:
URL: https://github.com/apache/spark/pull/51003#discussion_r2105504652
########## sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/GraphIdentifierManager.scala: ##########
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.pipelines.graph
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.analysis.{ResolvedIdentifier, UnresolvedIdentifier, UnresolvedRelation}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.classic.SparkSession
+import org.apache.spark.sql.execution.datasources.DataSource
+
+/**
+ * Responsible for properly qualifying the identifiers for datasets inside or referenced by the
+ * dataflow graph.
+ */
+object GraphIdentifierManager {
+
+  import IdentifierHelper._
+
+  def parseTableIdentifier(name: String, spark: SparkSession): TableIdentifier = {
+    toTableIdentifier(spark.sessionState.sqlParser.parseMultipartIdentifier(name))
+  }
+
+  /**
+   * Fully qualifies (if needed) the user-specified identifier used to reference datasets, and
+   * categorizes the dataset being referenced (i.e., a dataset from this pipeline or a dataset
+   * that is external to this pipeline).
+   *
+   * Returns whether the input should be read as a dataset, along with the qualified identifier.
+   *
+   * @param rawInputName the user-specified name used when referencing datasets.
+   */
+  def parseAndQualifyInputIdentifier(
+      context: FlowAnalysisContext,
+      rawInputName: String): DatasetIdentifier = {
+    resolveDatasetReadInsideQueryDefinition(context = context, rawInputName = rawInputName)
+  }
+
+  /**
+   * Resolves dataset reads that happen inside the dataset query definition (i.e., inside
+   * the @materialized_view() annotation in Python).
+   */
+  private def resolveDatasetReadInsideQueryDefinition(
+      context: FlowAnalysisContext,
+      rawInputName: String
+  ): DatasetIdentifier = {
+    // After the identifier is pre-processed, first check whether we're referencing a
+    // single-part-name dataset (e.g., a temp view). If so, don't fully qualify the
+    // identifier; read from it directly, because single-part-name datasets always mask
+    // fully qualified datasets with the same name. For example, if the graph defines both
+    // a view named "a" and a table named "catalog.schema.a", "SELECT * FROM a" always
+    // reads the view "a". To read the table "a", the user must use a fully or partially
+    // qualified name (e.g., "SELECT * FROM catalog.schema.a" or "SELECT * FROM schema.a").
+    val inputIdentifier = parseTableIdentifier(rawInputName, context.spark)
+    // TODO: once the pipeline spec catalog/schema is propagated, use that catalog via the
+    // DSv2 API

Review Comment:
   Yeah.
   We probably don't need this branch, actually - we can just prevent temp views from being used.
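   For context, the masking behavior the code comment describes is observable in a plain Spark session. A minimal sketch (not part of this PR; the table/view names are illustrative):

   ```scala
   import org.apache.spark.sql.SparkSession

   // Demonstrates that a single-part name resolves to the temp view, masking a
   // table whose qualified name ends in the same identifier.
   object MaskingDemo extends App {
     val spark = SparkSession.builder().master("local[*]").appName("masking-demo").getOrCreate()
     import spark.implicits._

     // A table "a" in the default schema, and a temp view also named "a".
     spark.sql("CREATE TABLE default.a USING parquet AS SELECT 1 AS id")
     Seq(2).toDF("id").createOrReplaceTempView("a")

     spark.sql("SELECT * FROM a").show()         // single-part name -> temp view -> 2
     spark.sql("SELECT * FROM default.a").show() // qualified name   -> table     -> 1

     spark.stop()
   }
   ```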