comphead commented on code in PR #16332: URL: https://github.com/apache/datafusion/pull/16332#discussion_r2134794387
########## datafusion-cli/src/functions.rs: ########## @@ -460,3 +473,94 @@ impl TableFunctionImpl for ParquetMetadataFunc { Ok(Arc::new(parquet_metadata)) } } + +/// A table function that allows users to query files using glob patterns +/// for example: SELECT * FROM glob('path/to/*/file.parquet') +pub struct GlobFunc { + // we need the ctx here to get the schema from the listing table later + ctx: SessionContext, +} + +impl std::fmt::Debug for GlobFunc { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("GlobFunc") + .field("ctx", &"<SessionContext>") + .finish() + } +} + +impl GlobFunc { + /// Create a new GlobFunc + pub fn new(ctx: SessionContext) -> Self { + Self { ctx } + } +} + +fn as_utf8_literal<'a>(expr: &'a Expr, arg_name: &str) -> Result<&'a str> { + match expr { + Expr::Literal(ScalarValue::Utf8(Some(s)), _) => Ok(s), + Expr::Column(Column { name, .. }) => Ok(name), + _ => plan_err!("glob() requires a string literal for the '{arg_name}' argument"), + } +} + +impl TableFunctionImpl for GlobFunc { + fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> { + // Parse arguments + if exprs.is_empty() { + return plan_err!("glob() requires a glob pattern"); + } + let pattern = as_utf8_literal(&exprs[0], "pattern")?; + let format = if exprs.len() > 1 { + Some(as_utf8_literal(&exprs[1], "format")?) 
+ } else { + None + }; + + // Create ListingTableUrl - distinguish between URLs with schemes and local paths + let url = if pattern.contains("://") && pattern.contains(['*', '?', '[']) { + // URL with scheme and glob - split manually to avoid URL encoding of glob chars + let glob_pos = pattern.find(['*', '?', '[']).unwrap(); // we already checked it exists + let split_pos = pattern[..glob_pos].rfind('/').unwrap() + 1; // find last '/' before glob + let (base_path, glob_part) = pattern.split_at(split_pos); + + let base_url = Url::parse(&format!("{}/", base_path.trim_end_matches('/'))) + .map_err(|e| { + DataFusionError::Plan(format!("Invalid base URL: {}", e)) Review Comment: Use the `plan_datafusion_err!` macro here instead of constructing `DataFusionError::Plan` by hand. ```suggestion plan_datafusion_err!("Invalid base URL: {}", e) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org