AdamGS commented on code in PR #14838: URL: https://github.com/apache/datafusion/pull/14838#discussion_r1968368078
########## datafusion/datasource/src/file_scan_config.rs: ########## @@ -15,19 +15,611 @@ // specific language governing permissions and limitations // under the License. -use std::{borrow::Cow, collections::HashMap, marker::PhantomData, sync::Arc}; +//! [`FileScanConfig`] to configure scanning of possibly partitioned +//! file sources. + +use std::{ + any::Any, borrow::Cow, collections::HashMap, fmt::Debug, fmt::Formatter, + fmt::Result as FmtResult, marker::PhantomData, sync::Arc, +}; use arrow::{ array::{ ArrayData, ArrayRef, BufferBuilder, DictionaryArray, RecordBatch, RecordBatchOptions, }, buffer::Buffer, - datatypes::{ArrowNativeType, DataType, SchemaRef, UInt16Type}, + datatypes::{ArrowNativeType, DataType, Field, Schema, SchemaRef, UInt16Type}, +}; +use datafusion_common::{ + exec_err, stats::Precision, ColumnStatistics, Constraints, Result, Statistics, }; -use datafusion_common::{exec_err, Result}; use datafusion_common::{DataFusionError, ScalarValue}; -use log::warn; +use datafusion_execution::{ + object_store::ObjectStoreUrl, SendableRecordBatchStream, TaskContext, +}; +use datafusion_physical_expr::{ + expressions::Column, EquivalenceProperties, LexOrdering, Partitioning, + PhysicalSortExpr, +}; +use datafusion_physical_plan::{ + display::{display_orderings, ProjectSchemaDisplay}, + metrics::ExecutionPlanMetricsSet, + projection::{all_alias_free_columns, new_projections_for_columns, ProjectionExec}, + DisplayAs, DisplayFormatType, ExecutionPlan, +}; +use log::{debug, warn}; + +use crate::{ + display::FileGroupsDisplay, + file::FileSource, + file_compression_type::FileCompressionType, + file_stream::FileStream, + source::{DataSource, DataSourceExec}, + statistics::MinMaxStatistics, + PartitionedFile, +}; + +/// The base configurations for a [`DataSourceExec`], the physical plan for +/// any given file format. +/// +/// Use [`Self::build`] to create a [`DataSourceExec`] from a `FileScanConfig`.
+/// +/// # Example +/// ```ignore +/// # use std::sync::Arc; +/// # use arrow::datatypes::{Field, Fields, DataType, Schema}; +/// # use datafusion_datasource::PartitionedFile; +/// # use datafusion_datasource::file_scan_config::FileScanConfig; +/// # use datafusion_execution::object_store::ObjectStoreUrl; +/// # use datafusion::datasource::physical_plan::ArrowSource; Review Comment: I had to mark this very good rustdoc test as `ignore` because `ArrowSource` is not available in this crate. My plan is for the `ignore` to be only a temporary measure between PRs, with the fix being either: 1. Have some `InMemoryFileSource` here that just gets `Vec<RecordBatch>` and knows how to return it through `FileSource` (or maybe even the `MockSource` I added for tests somewhere in this PR). 2. Potentially put `ArrowSource` in this crate as the "canonical" source, especially given how small it is at less than 200 lines of actual code. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org