iffyio commented on code in PR #1551:
URL: https://github.com/apache/datafusion-sqlparser-rs/pull/1551#discussion_r1883938066
##########
src/parser/mod.rs:
##########
@@ -1427,6 +1424,112 @@ impl<'a> Parser<'a> {
         }
     }
+    /// Try to parse an [Expr::CompoundFieldAccess] like `a.b.c` or `a.b[1].c`.
+    /// If all the fields are `Expr::Identifier`s, return an [Expr::CompoundIdentifier] instead.
+    /// If only the root exists, return the root.
+    /// If self supports [Dialect::supports_partiql], it will fall back when occurs [Token::LBracket] for JsonAccess parsing.
+    pub fn parse_compound_field_access(
+        &mut self,
+        root: Expr,
+        mut chain: Vec<AccessExpr>,
+    ) -> Result<Expr, ParserError> {
+        let mut ending_wildcard: Option<TokenWithSpan> = None;
+        let mut ending_lbracket = false;
+        while self.consume_token(&Token::Period) {
+            let next_token = self.next_token();
+            match next_token.token {
+                Token::Word(w) => {
+                    let expr = Expr::Identifier(w.to_ident(next_token.span));
+                    chain.push(AccessExpr::Dot(expr));
+                    if self.consume_token(&Token::LBracket) {
+                        if self.dialect.supports_partiql() {
+                            ending_lbracket = true;
+                            break;
+                        } else {
+                            self.parse_multi_dim_subscript(&mut chain)?
+                        }
+                    }
+                }
+                Token::Mul => {
+                    // Postgres explicitly allows funcnm(tablenm.*) and the
+                    // function array_agg traverses this control flow
+                    if dialect_of!(self is PostgreSqlDialect) {
+                        ending_wildcard = Some(next_token);
+                        break;
+                    } else {
+                        return self.expected("an identifier after '.'", next_token);
+                    }
+                }
+                Token::SingleQuotedString(s) => {
+                    let expr = Expr::Identifier(Ident::with_quote('\'', s));
+                    chain.push(AccessExpr::Dot(expr));
+                }
+                _ => {
+                    return self.expected("an identifier or a '*' after '.'", next_token);
+                }
+            }
+        }
+
+        // if dialect supports partiql, we need to go back one Token::LBracket for the JsonAccess parsing
+        if self.dialect.supports_partiql() && ending_lbracket {
+            self.prev_token();
+        }
+
+        if let Some(wildcard_token) = ending_wildcard {
+            let Some(id_parts) = Self::exprs_to_idents(&root, &chain) else {
+                return self.expected("an identifier or a '*' after '.'", self.peek_token());
+            };
+            Ok(Expr::QualifiedWildcard(
+                ObjectName(id_parts),
+                AttachedToken(wildcard_token),
+            ))
+        } else if self.consume_token(&Token::LParen) {
+            let Some(id_parts) = Self::exprs_to_idents(&root, &chain) else {
+                return self.expected("an identifier or a '*' after '.'", self.peek_token());
+            };
+            if dialect_of!(self is SnowflakeDialect | MsSqlDialect)
+                && self.consume_tokens(&[Token::Plus, Token::RParen])
+            {
+                Ok(Expr::OuterJoin(Box::new(
+                    match <[Ident; 1]>::try_from(id_parts) {
+                        Ok([ident]) => Expr::Identifier(ident),
+                        Err(parts) => Expr::CompoundIdentifier(parts),
+                    },
+                )))

Review Comment:
I think it would be nice if we could reuse this logic somehow in a function? Since it's not trivial and lists dialects, now that it's duplicated it could be easy to miss if changes are made to one copy but not the other.
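For illustration, one possible shape for such a shared helper (the name `try_parse_outer_join_expr` is made up here, and it assumes the caller has already consumed the opening `(`, as `parse_compound_field_access` does; the other call site would need to consume the `(` first):

```rust
impl<'a> Parser<'a> {
    /// Hypothetical helper, not part of the PR: after the caller has consumed `(`,
    /// try to parse the Snowflake/MsSql outer-join operator `(+)`.
    /// On a match, the identifier parts are wrapped in `Expr::OuterJoin`;
    /// otherwise they are handed back so the caller can fall through to
    /// `parse_function` (or similar) without cloning.
    fn try_parse_outer_join_expr(
        &mut self,
        id_parts: Vec<Ident>,
    ) -> Result<Expr, Vec<Ident>> {
        if dialect_of!(self is SnowflakeDialect | MsSqlDialect)
            && self.consume_tokens(&[Token::Plus, Token::RParen])
        {
            Ok(Expr::OuterJoin(Box::new(
                match <[Ident; 1]>::try_from(id_parts) {
                    Ok([ident]) => Expr::Identifier(ident),
                    Err(parts) => Expr::CompoundIdentifier(parts),
                },
            )))
        } else {
            // no `(+)` here: give the parts back untouched
            Err(id_parts)
        }
    }
}
```

Both call sites would then just match on the helper's result, so the dialect list only has to be maintained in one place.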
##########
src/parser/mod.rs:
##########
@@ -3095,15 +3201,29 @@ impl<'a> Parser<'a> {
         })
     }
+    /// Parse an multi-dimension array accessing like `[1:3][1][1]`

Review Comment:
```suggestion
    /// Parses a multi-dimension array accessing like `[1:3][1][1]`
```

##########
src/parser/mod.rs:
##########
@@ -3167,42 +3287,23 @@ impl<'a> Parser<'a> {
     pub fn parse_map_access(&mut self, expr: Expr) -> Result<Expr, ParserError> {
         let key = self.parse_expr()?;
+        let result = match key {

Review Comment:
Hmm, I was initially wondering about this match, since it looked incorrect to restrict the key expression type, e.g. this BigQuery [test case](https://github.com/apache/datafusion-sqlparser-rs/blob/e16b246/tests/sqlparser_bigquery.rs#L1969).
But then I realised that test case is passing because it now takes a different codepath via `parse_compound_field_access`. So I wonder if there are any scenarios that still rely on this method; if there aren't, it seems like we might be able to remove it entirely?

##########
src/parser/mod.rs:
##########
@@ -3095,15 +3201,29 @@ impl<'a> Parser<'a> {
         })
     }
+    /// Parse an multi-dimension array accessing like `[1:3][1][1]`
+    ///
+    /// Parser is right after the first `[`

Review Comment:
Given this is a pub function, it would be nice if the API were symmetric so that the function also consumed the leading `[`. I think within the parser itself that would mean calling `prev_token()` or switching to `peek()` before invoking this function, which would be a fair price to pay?
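As a rough sketch of what the call site inside `parse_compound_field_access` could look like under that change (this is not the PR's code; it assumes `parse_multi_dim_subscript` would consume the leading `[` itself):

```rust
// Sketch only: peek at the `[` instead of consuming it, since the callee now owns it.
if self.peek_token().token == Token::LBracket {
    if self.dialect.supports_partiql() {
        // leave the `[` untouched for the PartiQL/JsonAccess fallback, so the
        // later `prev_token()` compensation would no longer be needed
        break;
    } else {
        self.parse_multi_dim_subscript(&mut chain)?
    }
}
```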
##########
src/parser/mod.rs:
##########
@@ -1144,53 +1144,52 @@ impl<'a> Parser<'a> {
         w_span: Span,
     ) -> Result<Expr, ParserError> {
         match self.peek_token().token {
-            Token::LParen | Token::Period => {
-                let mut id_parts: Vec<Ident> = vec![w.to_ident(w_span)];
-                let mut ending_wildcard: Option<TokenWithSpan> = None;
-                while self.consume_token(&Token::Period) {
-                    let next_token = self.next_token();
-                    match next_token.token {
-                        Token::Word(w) => id_parts.push(w.to_ident(next_token.span)),
-                        Token::Mul => {
-                            // Postgres explicitly allows funcnm(tablenm.*) and the
-                            // function array_agg traverses this control flow
-                            if dialect_of!(self is PostgreSqlDialect) {
-                                ending_wildcard = Some(next_token);
-                                break;
-                            } else {
-                                return self.expected("an identifier after '.'", next_token);
-                            }
-                        }
-                        Token::SingleQuotedString(s) => id_parts.push(Ident::with_quote('\'', s)),
-                        _ => {
-                            return self.expected("an identifier or a '*' after '.'", next_token);
-                        }
+            Token::Period => self.parse_compound_expr(Expr::Identifier(w.to_ident(w_span)), vec![]),
+            Token::LParen => {
+                let id_parts = vec![w.to_ident(w_span)];
+                // parse_comma_outer_join is used to parse the following pattern:
+                if dialect_of!(self is SnowflakeDialect | MsSqlDialect)
+                    && self.consume_tokens(&[Token::LParen, Token::Plus, Token::RParen])
+                {
+                    Ok(Expr::OuterJoin(Box::new(
+                        match <[Ident; 1]>::try_from(id_parts) {
+                            Ok([ident]) => Expr::Identifier(ident),
+                            Err(parts) => Expr::CompoundIdentifier(parts),
+                        },
+                    )))
+                } else {
+                    let mut expr = self.parse_function(ObjectName(id_parts))?;
+                    // consume all period if it's a method chain
+                    if self.dialect.supports_methods() {

Review Comment:
Oh, to clarify: what I meant was that `try_parse_method` already does this
```rust
if !self.dialect.supports_methods() {
    return Ok(expr);
}
```
so the call can be simplified as follows (i.e. without the extra `if self.dialect.supports_methods()`):
```rust
expr = self.try_parse_method(expr)?;
```

##########
src/parser/mod.rs:
##########
@@ -1427,6 +1424,112 @@ impl<'a> Parser<'a> {
         }
     }
+    /// Try to parse an [Expr::CompoundFieldAccess] like `a.b.c` or `a.b[1].c`.
+    /// If all the fields are `Expr::Identifier`s, return an [Expr::CompoundIdentifier] instead.
+    /// If only the root exists, return the root.
+    /// If self supports [Dialect::supports_partiql], it will fall back when occurs [Token::LBracket] for JsonAccess parsing.
+    pub fn parse_compound_field_access(
+        &mut self,
+        root: Expr,
+        mut chain: Vec<AccessExpr>,
+    ) -> Result<Expr, ParserError> {
+        let mut ending_wildcard: Option<TokenWithSpan> = None;
+        let mut ending_lbracket = false;
+        while self.consume_token(&Token::Period) {
+            let next_token = self.next_token();
+            match next_token.token {
+                Token::Word(w) => {
+                    let expr = Expr::Identifier(w.to_ident(next_token.span));
+                    chain.push(AccessExpr::Dot(expr));
+                    if self.consume_token(&Token::LBracket) {
+                        if self.dialect.supports_partiql() {
+                            ending_lbracket = true;
+                            break;
+                        } else {
+                            self.parse_multi_dim_subscript(&mut chain)?
+                        }
+                    }
+                }
+                Token::Mul => {
+                    // Postgres explicitly allows funcnm(tablenm.*) and the
+                    // function array_agg traverses this control flow
+                    if dialect_of!(self is PostgreSqlDialect) {
+                        ending_wildcard = Some(next_token);
+                        break;
+                    } else {
+                        return self.expected("an identifier after '.'", next_token);
+                    }
+                }
+                Token::SingleQuotedString(s) => {
+                    let expr = Expr::Identifier(Ident::with_quote('\'', s));
+                    chain.push(AccessExpr::Dot(expr));
+                }
+                _ => {
+                    return self.expected("an identifier or a '*' after '.'", next_token);
+                }
+            }
+        }
+
+        // if dialect supports partiql, we need to go back one Token::LBracket for the JsonAccess parsing
+        if self.dialect.supports_partiql() && ending_lbracket {
+            self.prev_token();
+        }
+
+        if let Some(wildcard_token) = ending_wildcard {
+            let Some(id_parts) = Self::exprs_to_idents(&root, &chain) else {
+                return self.expected("an identifier or a '*' after '.'", self.peek_token());
+            };
+            Ok(Expr::QualifiedWildcard(
+                ObjectName(id_parts),
+                AttachedToken(wildcard_token),
+            ))
+        } else if self.consume_token(&Token::LParen) {
+            let Some(id_parts) = Self::exprs_to_idents(&root, &chain) else {
+                return self.expected("an identifier or a '*' after '.'", self.peek_token());
+            };
+            if dialect_of!(self is SnowflakeDialect | MsSqlDialect)
+                && self.consume_tokens(&[Token::Plus, Token::RParen])
+            {
+                Ok(Expr::OuterJoin(Box::new(
+                    match <[Ident; 1]>::try_from(id_parts) {
+                        Ok([ident]) => Expr::Identifier(ident),
+                        Err(parts) => Expr::CompoundIdentifier(parts),
+                    },
+                )))
+            } else {
+                self.prev_token();
+                self.parse_function(ObjectName(id_parts))
+            }
+        } else {
+            if let Some(id_parts) = Self::exprs_to_idents(&root, &chain) {
+                return Ok(Expr::CompoundIdentifier(id_parts));
+            }
+            if chain.is_empty() {
+                return Ok(root);
+            }
+            Ok(Expr::CompoundFieldAccess {
+                root: Box::new(root),
+                access_chain: chain.clone(),
+            })
+        }
+    }
+
+    fn exprs_to_idents(root: &Expr, fields: &[AccessExpr]) -> Option<Vec<Ident>> {

Review Comment:
Could we add a description of what the function does?
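Something along these lines might work; the wording below is only a guess at the intended behavior, inferred from how the callers use the returned `Option`:

```rust
/// Hypothetical doc comment, not from the PR:
///
/// Flattens `root` followed by the dot-accesses in `fields` into a list of
/// identifiers. Returns `None` if `root` or any element of `fields` is not a
/// plain `Expr::Identifier`.
fn exprs_to_idents(root: &Expr, fields: &[AccessExpr]) -> Option<Vec<Ident>> {
    // body unchanged from the PR
    // ...
}
```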
##########
src/parser/mod.rs:
##########
@@ -1427,6 +1426,112 @@ impl<'a> Parser<'a> {
         }
     }
+    /// Try to parse an [Expr::CompoundExpr] like `a.b.c` or `a.b[1].c`.
+    /// If all the fields are `Expr::Identifier`s, return an [Expr::CompoundIdentifier] instead.
+    /// If only the root exists, return the root.
+    /// If self supports [Dialect::supports_partiql], it will fall back when occurs [Token::LBracket] for JsonAccess parsing.
+    pub fn parse_compound_expr(
+        &mut self,
+        root: Expr,
+        mut chain: Vec<AccessField>,
+    ) -> Result<Expr, ParserError> {
+        let mut ending_wildcard: Option<TokenWithSpan> = None;
+        let mut ending_lbracket = false;
+        while self.consume_token(&Token::Period) {
+            let next_token = self.next_token();
+            match next_token.token {
+                Token::Word(w) => {
+                    let expr = Expr::Identifier(w.to_ident(next_token.span));
+                    chain.push(AccessField::Expr(expr));
+                    if self.consume_token(&Token::LBracket) {
+                        if self.dialect.supports_partiql() {
+                            ending_lbracket = true;
+                            break;
+                        } else {
+                            self.parse_multi_dim_subscript(&mut chain)?
+                        }
+                    }
+                }
+                Token::Mul => {
+                    // Postgres explicitly allows funcnm(tablenm.*) and the
+                    // function array_agg traverses this control flow
+                    if dialect_of!(self is PostgreSqlDialect) {
+                        ending_wildcard = Some(next_token);
+                        break;
+                    } else {
+                        return self.expected("an identifier after '.'", next_token);
+                    }
+                }
+                Token::SingleQuotedString(s) => {
+                    let expr = Expr::Identifier(Ident::with_quote('\'', s));
+                    chain.push(AccessField::Expr(expr));
+                }
+                _ => {
+                    return self.expected("an identifier or a '*' after '.'", next_token);
+                }
+            }
+        }
+
+        // if dialect supports partiql, we need to go back one Token::LBracket for the JsonAccess parsing
+        if self.dialect.supports_partiql() && ending_lbracket {
+            self.prev_token();
+        }
+
+        if let Some(wildcard_token) = ending_wildcard {

Review Comment:
Ah yeah I agree the nesting isn't desirable here, and it makes sense to consider that behavior separate from this PR!
##########
src/parser/mod.rs:
##########
@@ -1427,6 +1424,112 @@ impl<'a> Parser<'a> {
         }
     }
+    /// Try to parse an [Expr::CompoundFieldAccess] like `a.b.c` or `a.b[1].c`.
+    /// If all the fields are `Expr::Identifier`s, return an [Expr::CompoundIdentifier] instead.
+    /// If only the root exists, return the root.
+    /// If self supports [Dialect::supports_partiql], it will fall back when occurs [Token::LBracket] for JsonAccess parsing.
+    pub fn parse_compound_field_access(
+        &mut self,
+        root: Expr,
+        mut chain: Vec<AccessExpr>,
+    ) -> Result<Expr, ParserError> {
+        let mut ending_wildcard: Option<TokenWithSpan> = None;
+        let mut ending_lbracket = false;
+        while self.consume_token(&Token::Period) {
+            let next_token = self.next_token();
+            match next_token.token {
+                Token::Word(w) => {
+                    let expr = Expr::Identifier(w.to_ident(next_token.span));
+                    chain.push(AccessExpr::Dot(expr));
+                    if self.consume_token(&Token::LBracket) {
+                        if self.dialect.supports_partiql() {
+                            ending_lbracket = true;
+                            break;
+                        } else {
+                            self.parse_multi_dim_subscript(&mut chain)?
+                        }
+                    }
+                }
+                Token::Mul => {
+                    // Postgres explicitly allows funcnm(tablenm.*) and the
+                    // function array_agg traverses this control flow
+                    if dialect_of!(self is PostgreSqlDialect) {
+                        ending_wildcard = Some(next_token);
+                        break;
+                    } else {
+                        return self.expected("an identifier after '.'", next_token);
+                    }
+                }
+                Token::SingleQuotedString(s) => {
+                    let expr = Expr::Identifier(Ident::with_quote('\'', s));
+                    chain.push(AccessExpr::Dot(expr));
+                }
+                _ => {
+                    return self.expected("an identifier or a '*' after '.'", next_token);
+                }
+            }
+        }
+
+        // if dialect supports partiql, we need to go back one Token::LBracket for the JsonAccess parsing
+        if self.dialect.supports_partiql() && ending_lbracket {
+            self.prev_token();
+        }
+
+        if let Some(wildcard_token) = ending_wildcard {
+            let Some(id_parts) = Self::exprs_to_idents(&root, &chain) else {
+                return self.expected("an identifier or a '*' after '.'", self.peek_token());
+            };
+            Ok(Expr::QualifiedWildcard(
+                ObjectName(id_parts),
+                AttachedToken(wildcard_token),
+            ))
+        } else if self.consume_token(&Token::LParen) {
+            let Some(id_parts) = Self::exprs_to_idents(&root, &chain) else {
+                return self.expected("an identifier or a '*' after '.'", self.peek_token());
+            };
+            if dialect_of!(self is SnowflakeDialect | MsSqlDialect)
+                && self.consume_tokens(&[Token::Plus, Token::RParen])
+            {
+                Ok(Expr::OuterJoin(Box::new(
+                    match <[Ident; 1]>::try_from(id_parts) {
+                        Ok([ident]) => Expr::Identifier(ident),
+                        Err(parts) => Expr::CompoundIdentifier(parts),
+                    },
+                )))
+            } else {
+                self.prev_token();
+                self.parse_function(ObjectName(id_parts))
+            }
+        } else {
+            if let Some(id_parts) = Self::exprs_to_idents(&root, &chain) {
+                return Ok(Expr::CompoundIdentifier(id_parts));
+            }
+            if chain.is_empty() {
+                return Ok(root);
+            }
+            Ok(Expr::CompoundFieldAccess {
+                root: Box::new(root),
+                access_chain: chain.clone(),
+            })
+        }
+    }
+
+    fn exprs_to_idents(root: &Expr, fields: &[AccessExpr]) -> Option<Vec<Ident>> {

Review Comment:
Since the caller wouldn't need the copies of `root` and `fields`, it would be nice to avoid the cloning. Maybe something like this could work?
```rust
fn exprs_to_idents(root: Expr, fields: Vec<AccessExpr>) -> Result<Vec<Ident>, (Expr, Vec<AccessExpr>)>
```
That way, if no conversion was made, the caller gets back the same values they provided.
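Spelled out a bit more, that might look like the following (a sketch against the types in this PR, not a tested patch; the double pass over `fields` is a deliberate trade to keep the error path allocation-free):

```rust
/// Hypothetical by-value variant of `exprs_to_idents`, not part of the PR:
/// returns the flattened identifiers when `root` and every chain element are
/// plain identifiers, otherwise hands the inputs back unchanged so the caller
/// can keep using them without a `chain.clone()`.
fn exprs_to_idents(
    root: Expr,
    fields: Vec<AccessExpr>,
) -> Result<Vec<Ident>, (Expr, Vec<AccessExpr>)> {
    // First check by reference so the originals can be returned intact on failure.
    let convertible = matches!(root, Expr::Identifier(_))
        && fields
            .iter()
            .all(|f| matches!(f, AccessExpr::Dot(Expr::Identifier(_))));
    if !convertible {
        return Err((root, fields));
    }
    // Now consume the values; the checks above guarantee the patterns match.
    let mut idents = Vec::with_capacity(fields.len() + 1);
    if let Expr::Identifier(id) = root {
        idents.push(id);
    }
    for field in fields {
        if let AccessExpr::Dot(Expr::Identifier(id)) = field {
            idents.push(id);
        }
    }
    Ok(idents)
}
```

Callers that currently fall back to building `Expr::CompoundFieldAccess` could then match on the `Err` variant to get `root` and `chain` back instead of cloning them.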