iffyio commented on code in PR #2170:
URL:
https://github.com/apache/datafusion-sqlparser-rs/pull/2170#discussion_r2724067066
##########
src/parser/mod.rs:
##########
@@ -18233,13 +18244,24 @@ impl<'a> Parser<'a> {
}
}
+ /// ClickHouse:
/// ```sql
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition |
PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
/// ```
///
[ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
+ ///
+ /// Databricks:
+ /// ```sql
+ /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+ /// ```
+ ///
[Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
pub fn parse_optimize_table(&mut self) -> Result<Statement, ParserError> {
- self.expect_keyword_is(Keyword::TABLE)?;
+ // Check for TABLE keyword (ClickHouse uses it, Databricks does not)
Review Comment:
```suggestion
```
thinking we can skip the comment so that it doesn't become stale/incomplete
if other dialects support the feature
##########
src/parser/mod.rs:
##########
@@ -7907,14 +7890,41 @@ impl<'a> Parser<'a> {
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle,
ParserError> {
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
- let columns =
self.parse_comma_separated(Parser::parse_column_def)?;
+ let columns =
self.parse_comma_separated(Parser::parse_column_def_for_partition)?;
self.expect_token(&Token::RParen)?;
Ok(HiveDistributionStyle::PARTITIONED { columns })
} else {
Ok(HiveDistributionStyle::NONE)
}
}
+ /// Parse column definition for PARTITIONED BY clause.
+ ///
+ /// Databricks allows partition columns without types when referencing
+ /// columns already defined in the table specification:
+ /// ```sql
+ /// CREATE TABLE t (col1 STRING, col2 INT) PARTITIONED BY (col1)
+ /// CREATE TABLE t (col1 STRING) PARTITIONED BY (col2 INT)
+ /// ```
+ ///
+ /// See
[Databricks](https://docs.databricks.com/en/sql/language-manual/sql-ref-partition.html)
+ fn parse_column_def_for_partition(&mut self) -> Result<ColumnDef,
ParserError> {
Review Comment:
```suggestion
fn parse_partitioned_by_column_def(&mut self) -> Result<ColumnDef,
ParserError> {
```
##########
src/parser/mod.rs:
##########
@@ -3361,25 +3334,35 @@ impl<'a> Parser<'a> {
/// Syntax:
///
/// ```sql
+ /// -- BigQuery style
/// [field_name] field_type
+ /// -- Databricks/Hive style (colon separator)
+ /// field_name: field_type
/// ```
///
/// [struct]:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type
/// [tuple]: https://clickhouse.com/docs/en/sql-reference/data-types/tuple
+ /// [databricks]:
https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
fn parse_struct_field_def(
&mut self,
) -> Result<(StructField, MatchedTrailingBracket), ParserError> {
// Look beyond the next item to infer whether both field name
// and type are specified.
- let is_anonymous_field = !matches!(
+ // Supports both:
+ // - `field_name field_type` (BigQuery style)
+ // - `field_name: field_type` (Databricks/Hive style)
+ let is_named_field = matches!(
(self.peek_nth_token(0).token, self.peek_nth_token(1).token),
- (Token::Word(_), Token::Word(_))
+ (Token::Word(_), Token::Word(_)) | (Token::Word(_), Token::Colon)
);
- let field_name = if is_anonymous_field {
- None
+ let field_name = if is_named_field {
+ let name = self.parse_identifier()?;
+ // Consume optional colon separator (Databricks/Hive style)
Review Comment:
```suggestion
```
##########
src/parser/mod.rs:
##########
@@ -7907,14 +7890,41 @@ impl<'a> Parser<'a> {
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle,
ParserError> {
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
- let columns =
self.parse_comma_separated(Parser::parse_column_def)?;
+ let columns =
self.parse_comma_separated(Parser::parse_column_def_for_partition)?;
self.expect_token(&Token::RParen)?;
Ok(HiveDistributionStyle::PARTITIONED { columns })
} else {
Ok(HiveDistributionStyle::NONE)
}
}
+ /// Parse column definition for PARTITIONED BY clause.
+ ///
+ /// Databricks allows partition columns without types when referencing
+ /// columns already defined in the table specification:
+ /// ```sql
+ /// CREATE TABLE t (col1 STRING, col2 INT) PARTITIONED BY (col1)
+ /// CREATE TABLE t (col1 STRING) PARTITIONED BY (col2 INT)
+ /// ```
+ ///
+ /// See
[Databricks](https://docs.databricks.com/en/sql/language-manual/sql-ref-partition.html)
+ fn parse_column_def_for_partition(&mut self) -> Result<ColumnDef,
ParserError> {
Review Comment:
Is the reason for this rewrite to support optional datatype? If so I think
it would make sense that we introduce a new type that reflects this instead of
reusing the `Unspecified` datatype which has a different meaning? e.g. that
this function returns
```rust
struct PartitionedByColumnDef { name, data_type: Option<DataType> }
```
wdyt?
##########
src/parser/mod.rs:
##########
@@ -7907,14 +7890,41 @@ impl<'a> Parser<'a> {
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle,
ParserError> {
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
- let columns =
self.parse_comma_separated(Parser::parse_column_def)?;
+ let columns =
self.parse_comma_separated(Parser::parse_column_def_for_partition)?;
self.expect_token(&Token::RParen)?;
Ok(HiveDistributionStyle::PARTITIONED { columns })
} else {
Ok(HiveDistributionStyle::NONE)
}
}
+ /// Parse column definition for PARTITIONED BY clause.
Review Comment:
```suggestion
/// Parse column definition for `PARTITIONED BY` clause.
```
##########
src/ast/mod.rs:
##########
@@ -4654,22 +4654,34 @@ pub enum Statement {
/// Legacy copy-style options.
options: Vec<CopyLegacyOption>,
},
+ /// ClickHouse:
/// ```sql
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition |
PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
/// ```
- ///
/// See ClickHouse
<https://clickhouse.com/docs/en/sql-reference/statements/optimize>
+ ///
+ /// Databricks:
+ /// ```sql
+ /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+ /// ```
+ /// See Databricks
<https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
OptimizeTable {
/// Table name to optimize.
name: ObjectName,
- /// Optional cluster identifier.
+ /// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE
TABLE`, Databricks uses `OPTIMIZE`).
+ has_table_keyword: bool,
+ /// Optional cluster identifier (ClickHouse).
Review Comment:
```suggestion
/// Optional cluster identifier.
///
[ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
```
##########
src/parser/mod.rs:
##########
@@ -7907,14 +7890,41 @@ impl<'a> Parser<'a> {
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle,
ParserError> {
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
- let columns =
self.parse_comma_separated(Parser::parse_column_def)?;
+ let columns =
self.parse_comma_separated(Parser::parse_column_def_for_partition)?;
self.expect_token(&Token::RParen)?;
Ok(HiveDistributionStyle::PARTITIONED { columns })
} else {
Ok(HiveDistributionStyle::NONE)
}
}
+ /// Parse column definition for PARTITIONED BY clause.
+ ///
+ /// Databricks allows partition columns without types when referencing
+ /// columns already defined in the table specification:
+ /// ```sql
+ /// CREATE TABLE t (col1 STRING, col2 INT) PARTITIONED BY (col1)
+ /// CREATE TABLE t (col1 STRING) PARTITIONED BY (col2 INT)
+ /// ```
+ ///
+ /// See
[Databricks](https://docs.databricks.com/en/sql/language-manual/sql-ref-partition.html)
+ fn parse_column_def_for_partition(&mut self) -> Result<ColumnDef,
ParserError> {
+ let name = self.parse_identifier()?;
+
+ // Check if the next token indicates there's no type specified
+ // (comma or closing paren means end of this column definition)
+ let data_type = match self.peek_token().token {
+ Token::Comma | Token::RParen => DataType::Unspecified,
+ _ => self.parse_data_type()?,
+ };
Review Comment:
I think we can instead do something like
```rust
let data_type = self.maybe_parse(|parser| parser.parse_data_type())?;
```
##########
src/ast/mod.rs:
##########
@@ -4654,22 +4654,34 @@ pub enum Statement {
/// Legacy copy-style options.
options: Vec<CopyLegacyOption>,
},
+ /// ClickHouse:
/// ```sql
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition |
PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
/// ```
- ///
/// See ClickHouse
<https://clickhouse.com/docs/en/sql-reference/statements/optimize>
+ ///
+ /// Databricks:
+ /// ```sql
+ /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+ /// ```
+ /// See Databricks
<https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
OptimizeTable {
/// Table name to optimize.
name: ObjectName,
- /// Optional cluster identifier.
+ /// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE
TABLE`, Databricks uses `OPTIMIZE`).
+ has_table_keyword: bool,
+ /// Optional cluster identifier (ClickHouse).
on_cluster: Option<Ident>,
- /// Optional partition spec.
+ /// Optional partition spec (ClickHouse).
partition: Option<Partition>,
- /// Whether `FINAL` was specified.
+ /// Whether `FINAL` was specified (ClickHouse).
include_final: bool,
- /// Optional deduplication settings.
+ /// Optional deduplication settings (ClickHouse).
deduplicate: Option<Deduplicate>,
+ /// Optional WHERE predicate (Databricks).
Review Comment:
```suggestion
/// Optional WHERE predicate.
///
[Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
```
##########
src/parser/mod.rs:
##########
@@ -3361,25 +3334,35 @@ impl<'a> Parser<'a> {
/// Syntax:
///
/// ```sql
+ /// -- BigQuery style
/// [field_name] field_type
+ /// -- Databricks/Hive style (colon separator)
+ /// field_name: field_type
/// ```
///
/// [struct]:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type
/// [tuple]: https://clickhouse.com/docs/en/sql-reference/data-types/tuple
+ /// [databricks]:
https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
fn parse_struct_field_def(
&mut self,
) -> Result<(StructField, MatchedTrailingBracket), ParserError> {
// Look beyond the next item to infer whether both field name
// and type are specified.
- let is_anonymous_field = !matches!(
+ // Supports both:
+ // - `field_name field_type` (BigQuery style)
+ // - `field_name: field_type` (Databricks/Hive style)
Review Comment:
```suggestion
```
thinking we can skip the comment since there are other dialects that are
affected, and the code is explanatory in this case
##########
src/ast/mod.rs:
##########
@@ -4654,22 +4654,34 @@ pub enum Statement {
/// Legacy copy-style options.
options: Vec<CopyLegacyOption>,
},
+ /// ClickHouse:
/// ```sql
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition |
PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
/// ```
- ///
/// See ClickHouse
<https://clickhouse.com/docs/en/sql-reference/statements/optimize>
+ ///
+ /// Databricks:
+ /// ```sql
+ /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+ /// ```
+ /// See Databricks
<https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
OptimizeTable {
/// Table name to optimize.
name: ObjectName,
- /// Optional cluster identifier.
+ /// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE
TABLE`, Databricks uses `OPTIMIZE`).
+ has_table_keyword: bool,
+ /// Optional cluster identifier (ClickHouse).
on_cluster: Option<Ident>,
- /// Optional partition spec.
+ /// Optional partition spec (ClickHouse).
partition: Option<Partition>,
- /// Whether `FINAL` was specified.
+ /// Whether `FINAL` was specified (ClickHouse).
Review Comment:
```suggestion
/// Whether `FINAL` was specified.
///
[ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
```
##########
src/ast/mod.rs:
##########
@@ -4654,22 +4654,34 @@ pub enum Statement {
/// Legacy copy-style options.
options: Vec<CopyLegacyOption>,
},
+ /// ClickHouse:
/// ```sql
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition |
PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
/// ```
- ///
/// See ClickHouse
<https://clickhouse.com/docs/en/sql-reference/statements/optimize>
+ ///
+ /// Databricks:
+ /// ```sql
+ /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+ /// ```
+ /// See Databricks
<https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
OptimizeTable {
/// Table name to optimize.
name: ObjectName,
- /// Optional cluster identifier.
+ /// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE
TABLE`, Databricks uses `OPTIMIZE`).
+ has_table_keyword: bool,
+ /// Optional cluster identifier (ClickHouse).
on_cluster: Option<Ident>,
- /// Optional partition spec.
+ /// Optional partition spec (ClickHouse).
partition: Option<Partition>,
- /// Whether `FINAL` was specified.
+ /// Whether `FINAL` was specified (ClickHouse).
include_final: bool,
- /// Optional deduplication settings.
+ /// Optional deduplication settings (ClickHouse).
deduplicate: Option<Deduplicate>,
+ /// Optional WHERE predicate (Databricks).
+ predicate: Option<Expr>,
+ /// Optional ZORDER BY columns (Databricks).
Review Comment:
```suggestion
/// Optional ZORDER BY columns.
///
[Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
```
##########
src/ast/mod.rs:
##########
@@ -4654,22 +4654,34 @@ pub enum Statement {
/// Legacy copy-style options.
options: Vec<CopyLegacyOption>,
},
+ /// ClickHouse:
/// ```sql
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition |
PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
/// ```
- ///
/// See ClickHouse
<https://clickhouse.com/docs/en/sql-reference/statements/optimize>
+ ///
+ /// Databricks:
+ /// ```sql
+ /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+ /// ```
+ /// See Databricks
<https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
OptimizeTable {
/// Table name to optimize.
name: ObjectName,
- /// Optional cluster identifier.
+ /// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE
TABLE`, Databricks uses `OPTIMIZE`).
+ has_table_keyword: bool,
+ /// Optional cluster identifier (ClickHouse).
on_cluster: Option<Ident>,
- /// Optional partition spec.
+ /// Optional partition spec (ClickHouse).
partition: Option<Partition>,
- /// Whether `FINAL` was specified.
+ /// Whether `FINAL` was specified (ClickHouse).
include_final: bool,
- /// Optional deduplication settings.
+ /// Optional deduplication settings (ClickHouse).
Review Comment:
```suggestion
/// Optional deduplication settings.
///
[ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]