sryza commented on code in PR #50942:
URL: https://github.com/apache/spark/pull/50942#discussion_r2096524100
##########
sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto:
##########
@@ -0,0 +1,142 @@
+syntax = "proto3";
+
+package spark.connect;
+
+import "spark/connect/relations.proto";
+import "spark/connect/types.proto";
+
+option java_multiple_files = true;
+option java_package = "org.apache.spark.connect.proto";
+
+// Dispatch object for pipelines commands.
+message PipelineCommand {
+  oneof command_type {
+    CreateDataflowGraph create_dataflow_graph = 1;
+    DefineDataset define_dataset = 2;
+    DefineFlow define_flow = 3;
+    DropDataflowGraph drop_dataflow_graph = 4;
+    StartRun start_run = 5;
+    StopRun stop_run = 6;
+    DefineSqlGraphElements define_sql_graph_elements = 7;
+  }
+
+  message DefineSqlGraphElements {
+    optional string dataflow_graph_id = 1;
+    optional string sql_file_name = 2;
+    optional string sql_text = 3;
+  }
+
+  // Request to create a new pipeline.
+  message CreateDataflowGraph {
+    // The default catalog.
+    optional string default_catalog = 1;
+
+    // The default database.
+    optional string default_database = 2;
+
+    // Default SQL configurations for all flows in this graph.
+    map<string, string> sql_conf = 5;
+
+    message Response {
+      // The ID of the created graph.
+      string dataflow_graph_id = 1;
+    }
+  }
+
+  // Drops the graph and stops any running attached flows.
+  message DropDataflowGraph {
+    // The graph to drop.
+    string dataflow_graph_id = 1;
+  }
+
+  // Request to define a dataset: a table, a materialized view, or a temporary view.
+  message DefineDataset {
+    // The graph to attach this dataset to.
+    string dataflow_graph_id = 1;
+
+    // Name of the dataset.
+    string dataset_name = 2;
+
+    // Table or view.
+    DatasetType dataset_type = 3;
+
+    // Optional comment for the dataset.
+    optional string comment = 4;
+
+    // Optional table properties.
+    map<string, string> table_properties = 5;
+
+    // Optional partition columns for the dataset (if applicable).
+    repeated string partition_cols = 6;
+
+    // Schema for the dataset. If unset, this will be inferred.
+    optional spark.connect.DataType schema = 7;
+
+    // The output table format of the dataset.
+    optional string format = 8;
+  }
+
+  // Request to define a flow targeting a dataset.
+  message DefineFlow {
+    // The graph to attach this dataset to.
+    string dataflow_graph_id = 1;
+
+    // Name of the dataset.
+    string flow_name = 2;
+
+    string target_dataset_name = 3;
+
+    // An unresolved relation that defines the dataset's flow.
+    spark.connect.Relation plan = 4;
+
+    // Default SQL configurations set when running this flow.
+    map<string, string> sql_conf = 5;
+
+    // If true, this flow will only be run once per execution.
+    bool once = 6;
+  }
+
+  // Resolves all datasets and flows and start a pipeline update. Should be called after all
+  // graph elements are registered.
+  message StartRun {
+    // The graph to start.
+    string dataflow_graph_id = 1;
+  }
+  message StopRun {
+    // The ID of the graph to stop..

Review Comment:
   ```suggestion
    // The ID of the graph to stop.
   ```
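For readers following along: the full hunk quoted above shows the whole command surface, so here is a rough sketch of the client-side sequence it implies — create the graph, register elements, then start a run. This is an illustration only, not code from the PR: it assumes the standard protobuf-java builders generated under the `java_package`/`java_multiple_files` options in the file, and every literal value (the graph id, `"events"`, `"events_flow"`, the SQL conf entry) is hypothetical. How these commands are carried inside a Connect request is not defined in this file.

```java
import org.apache.spark.connect.proto.PipelineCommand;
import org.apache.spark.connect.proto.Relation;

public class PipelineCommandSketch {
  public static void main(String[] args) {
    // 1. Create the graph. The server replies with CreateDataflowGraph.Response,
    //    which carries the dataflow_graph_id used by every later command.
    PipelineCommand create = PipelineCommand.newBuilder()
        .setCreateDataflowGraph(
            PipelineCommand.CreateDataflowGraph.newBuilder()
                .setDefaultCatalog("spark_catalog")                 // hypothetical
                .setDefaultDatabase("default")                      // hypothetical
                .putSqlConf("spark.sql.shuffle.partitions", "10"))  // hypothetical
        .build();
    String graphId = "<dataflow_graph_id returned by the server>";

    // 2. Register graph elements against that id. dataset_type is left unset
    //    here, so it falls back to the proto3 zero value, DATASET_UNSPECIFIED.
    PipelineCommand defineDataset = PipelineCommand.newBuilder()
        .setDefineDataset(
            PipelineCommand.DefineDataset.newBuilder()
                .setDataflowGraphId(graphId)
                .setDatasetName("events"))                          // hypothetical
        .build();

    PipelineCommand defineFlow = PipelineCommand.newBuilder()
        .setDefineFlow(
            PipelineCommand.DefineFlow.newBuilder()
                .setDataflowGraphId(graphId)
                .setFlowName("events_flow")                         // hypothetical
                .setTargetDatasetName("events")
                .setPlan(Relation.getDefaultInstance()))            // stand-in plan
        .build();

    // 3. Only after all elements are registered: resolve the graph and start
    //    the update, per the StartRun doc comment.
    PipelineCommand start = PipelineCommand.newBuilder()
        .setStartRun(
            PipelineCommand.StartRun.newBuilder().setDataflowGraphId(graphId))
        .build();
  }
}
```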
##########
sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto:
##########
@@ -0,0 +1,142 @@
...
+    // Optional table properties.
+    map<string, string> table_properties = 5;
+
+    // Optional partition columns for the dataset (if applicable).

Review Comment:
   Nitpick: what does "if applicable" mean here?


##########
sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto:
##########
@@ -0,0 +1,142 @@
...
+  message DefineSqlGraphElements {

Review Comment:
   Could use at least a header comment
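e.g. something along these lines — the wording is my guess at the intended semantics, so adjust to what it actually does:

```suggestion
  // Request to define graph elements (datasets and flows) from the raw text of
  // a SQL file.
  message DefineSqlGraphElements {
```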
##########
sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto:
##########
@@ -0,0 +1,142 @@
+syntax = "proto3";

Review Comment:
   I think we need a license at the top of this file
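i.e. the standard ASF header that Spark source files carry; safest to copy it verbatim from a neighboring `.proto` in this module:

```
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
```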
##########
sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto:
##########
@@ -0,0 +1,142 @@
...
+    // An unresolved relation that defines the dataset's flow.
+    spark.connect.Relation plan = 4;
+
+    // Default SQL configurations set when running this flow.

Review Comment:
   Nitpick: is the word "Default" relevant here? There's nothing more specific, right?


##########
sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto:
##########
@@ -0,0 +1,142 @@
...
+  message DefineSqlGraphElements {
+    optional string dataflow_graph_id = 1;
+    optional string sql_file_name = 2;

Review Comment:
   Something that occurred to me recently is that there could be SQL files with the same name in different subdirs. Should this be `sql_file_path`?


##########
sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto:
##########
@@ -0,0 +1,142 @@
...
+  // Request to create a new pipeline.

Review Comment:
   ```suggestion
   // Request to create a new dataflow graph.
   ```


##########
sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto:
##########
@@ -0,0 +1,142 @@
...
+  // Request to define a flow targeting a dataset.
+  message DefineFlow {
+    // The graph to attach this dataset to.
+    string dataflow_graph_id = 1;
+
+    // Name of the dataset.

Review Comment:
   I think this comment is inaccurate
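The field being documented is `flow_name`, so presumably something like:

```suggestion
    // Name of the flow.
```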
##########
sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto:
##########
@@ -0,0 +1,142 @@
...
+  message StopRun {
+    // The ID of the graph to stop..
+    string dataflow_graph_id = 1;
+  }
+}
+
+// Dispatch object for pipelines command results.
+message PipelineCommandResult {
+  oneof result_type {
+    CreateDataflowGraphResult create_dataflow_graph_result = 1;
+  }
+  message CreateDataflowGraphResult {
+    // The ID of the created graph.
+    string dataflow_graph_id = 1;
+  }
+}
+
+enum DatasetType {
+  DATASET_UNSPECIFIED = 0;

Review Comment:
   Do we ever need to use DATASET_UNSPECIFIED?

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org