alamb commented on code in PR #19405:
URL: https://github.com/apache/datafusion/pull/19405#discussion_r2635983824
##########
datafusion/datasource-parquet/src/opener.rs:
##########
@@ -988,6 +988,171 @@ mod test {
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;
+ /// Builder for creating [`ParquetOpener`] instances with sensible defaults for tests.
+ /// This helps reduce code duplication and makes it clear what differs between test cases.
+ struct ParquetOpenerBuilder {
+ store: Option<Arc<dyn ObjectStore>>,
+ file_schema: Option<SchemaRef>,
+ table_schema: Option<TableSchema>,
+ partition_index: usize,
+ projection_indices: Option<Vec<usize>>,
+ projection: Option<ProjectionExprs>,
+ batch_size: usize,
+ limit: Option<usize>,
+ predicate: Option<Arc<dyn PhysicalExpr>>,
+ metadata_size_hint: Option<usize>,
+ metrics: ExecutionPlanMetricsSet,
+ pushdown_filters: bool,
+ reorder_filters: bool,
+ force_filter_selections: bool,
+ enable_page_index: bool,
+ enable_bloom_filter: bool,
+ enable_row_group_stats_pruning: bool,
+ coerce_int96: Option<arrow::datatypes::TimeUnit>,
+ max_predicate_cache_size: Option<usize>,
+ reverse_row_groups: bool,
+ }
+
+ impl ParquetOpenerBuilder {
+ /// Create a new builder with sensible defaults for tests.
+ fn new() -> Self {
+ Self {
+ store: None,
+ file_schema: None,
+ table_schema: None,
+ partition_index: 0,
+ projection_indices: None,
+ projection: None,
+ batch_size: 1024,
+ limit: None,
+ predicate: None,
+ metadata_size_hint: None,
+ metrics: ExecutionPlanMetricsSet::new(),
+ pushdown_filters: false,
+ reorder_filters: false,
+ force_filter_selections: false,
+ enable_page_index: false,
+ enable_bloom_filter: false,
+ enable_row_group_stats_pruning: false,
+ coerce_int96: None,
+ max_predicate_cache_size: None,
+ reverse_row_groups: false,
+ }
+ }
+
+ /// Set the object store (required for building).
+ fn with_store(mut self, store: Arc<dyn ObjectStore>) -> Self {
+ self.store = Some(store);
+ self
+ }
+
+ /// Set the file schema and create a simple table schema (for files without partition columns).
+ fn with_schema(mut self, file_schema: SchemaRef) -> Self {
+ self.file_schema = Some(Arc::clone(&file_schema));
+ self.table_schema = Some(TableSchema::from_file_schema(file_schema));
+ self
+ }
+
+ /// Set a custom table schema (for files with partition columns).
Review Comment:
This looks similar to the Copilot suggestion
- https://github.com/apache/datafusion/pull/19405#discussion_r2634720785
##########
datafusion/datasource-parquet/src/opener.rs:
##########
@@ -1181,33 +1346,13 @@ mod test {
));
let make_opener = |predicate| {
- ParquetOpener {
- partition_index: 0,
- projection: ProjectionExprs::from_indices(&[0, 1], &schema),
- batch_size: 1024,
- limit: None,
- predicate: Some(predicate),
- table_schema: TableSchema::from_file_schema(Arc::clone(&schema)),
- metadata_size_hint: None,
- metrics: ExecutionPlanMetricsSet::new(),
- parquet_file_reader_factory: Arc::new(
- DefaultParquetFileReaderFactory::new(Arc::clone(&store)),
- ),
- pushdown_filters: false, // note that this is false!
- reorder_filters: false,
- force_filter_selections: false,
- enable_page_index: false,
- enable_bloom_filter: false,
- enable_row_group_stats_pruning: true,
- coerce_int96: None,
- #[cfg(feature = "parquet_encryption")]
- file_decryption_properties: None,
- expr_adapter_factory: Arc::new(DefaultPhysicalExprAdapterFactory),
- #[cfg(feature = "parquet_encryption")]
- encryption_factory: None,
- max_predicate_cache_size: None,
- reverse_row_groups: false,
- }
+ ParquetOpenerBuilder::new()
Review Comment:
this is so much nicer
##########
datafusion/datasource-parquet/src/opener.rs:
##########
@@ -988,6 +988,171 @@ mod test {
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;
+ /// Builder for creating [`ParquetOpener`] instances with sensible defaults for tests.
+ /// This helps reduce code duplication and makes it clear what differs between test cases.
+ struct ParquetOpenerBuilder {
+ store: Option<Arc<dyn ObjectStore>>,
+ file_schema: Option<SchemaRef>,
+ table_schema: Option<TableSchema>,
+ partition_index: usize,
+ projection_indices: Option<Vec<usize>>,
+ projection: Option<ProjectionExprs>,
+ batch_size: usize,
+ limit: Option<usize>,
+ predicate: Option<Arc<dyn PhysicalExpr>>,
+ metadata_size_hint: Option<usize>,
+ metrics: ExecutionPlanMetricsSet,
+ pushdown_filters: bool,
+ reorder_filters: bool,
+ force_filter_selections: bool,
+ enable_page_index: bool,
+ enable_bloom_filter: bool,
+ enable_row_group_stats_pruning: bool,
+ coerce_int96: Option<arrow::datatypes::TimeUnit>,
+ max_predicate_cache_size: Option<usize>,
+ reverse_row_groups: bool,
+ }
+
+ impl ParquetOpenerBuilder {
+ /// Create a new builder with sensible defaults for tests.
+ fn new() -> Self {
+ Self {
+ store: None,
+ file_schema: None,
+ table_schema: None,
+ partition_index: 0,
+ projection_indices: None,
+ projection: None,
+ batch_size: 1024,
+ limit: None,
+ predicate: None,
+ metadata_size_hint: None,
+ metrics: ExecutionPlanMetricsSet::new(),
+ pushdown_filters: false,
+ reorder_filters: false,
+ force_filter_selections: false,
+ enable_page_index: false,
+ enable_bloom_filter: false,
+ enable_row_group_stats_pruning: false,
+ coerce_int96: None,
+ max_predicate_cache_size: None,
+ reverse_row_groups: false,
+ }
+ }
+
+ /// Set the object store (required for building).
+ fn with_store(mut self, store: Arc<dyn ObjectStore>) -> Self {
+ self.store = Some(store);
+ self
+ }
+
+ /// Set the file schema and create a simple table schema (for files without partition columns).
+ fn with_schema(mut self, file_schema: SchemaRef) -> Self {
+ self.file_schema = Some(Arc::clone(&file_schema));
+ self.table_schema = Some(TableSchema::from_file_schema(file_schema));
+ self
+ }
+
+ /// Set a custom table schema (for files with partition columns).
Review Comment:
This sets both `table_schema` and `file_schema` to the same value, which is confusing. It would be better if `with_table_schema` explicitly modified only `self.table_schema` here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]