suremarc commented on code in PR #9593:
URL: https://github.com/apache/datafusion/pull/9593#discussion_r1586860589
##########
datafusion/core/src/datasource/physical_plan/mod.rs:
##########
@@ -473,15 +468,281 @@ fn get_projected_output_ordering(
// since rest of the orderings are violated
break;
}
+
// do not push empty entries
// otherwise we may have `Some(vec![])` at the output ordering.
- if !new_ordering.is_empty() {
- all_orderings.push(new_ordering);
+ if new_ordering.is_empty() {
+ continue;
+ }
+
+ // Check if any file groups are not sorted
+ if base_config.file_groups.iter().any(|group| {
+ if group.len() <= 1 {
+ // File groups with <= 1 files are always sorted
+ return false;
+ }
+
+ let statistics = match MinMaxStatistics::new_from_files(
+ &new_ordering,
+ projected_schema,
+ base_config.projection.as_deref(),
+ group,
+ ) {
+ Ok(statistics) => statistics,
+ Err(e) => {
+ log::trace!("Error fetching statistics for file group:
{e}");
+ // we can't prove that it's ordered, so we have to reject
it
+ return true;
+ }
+ };
+
+ !statistics.is_sorted()
+ }) {
+ debug!(
+ "Skipping specified output ordering {:?}. \
+ Some file groups couldn't be determined to be sorted: {:?}",
+ base_config.output_ordering[0], base_config.file_groups
+ );
+ continue;
}
+
+ all_orderings.push(new_ordering);
}
all_orderings
}
+/// A normalized representation of file min/max statistics that allows for
efficient sorting & comparison.
+/// The min/max values are ordered by [`Self::sort_order`].
+/// Furthermore, any columns that are reversed in the sort order have their
min/max values swapped.
+pub(crate) struct MinMaxStatistics {
+ min_by_sort_order: arrow::row::Rows,
+ max_by_sort_order: arrow::row::Rows,
+ sort_order: Vec<PhysicalSortExpr>,
+}
+
+impl MinMaxStatistics {
+ #[allow(unused)]
+ fn sort_order(&self) -> &[PhysicalSortExpr] {
+ &self.sort_order
+ }
+
+ fn new_from_files<'a>(
+ projected_sort_order: &[PhysicalSortExpr], // Sort order with respect
to projected schema
+ projected_schema: &SchemaRef, // Projected schema
+ projection: Option<&[usize]>, // Indices of projection in full table
schema (None = all columns)
Review Comment:
Oh... I didn't even think about option 1. But I was assuming that the layout
of the file statistics should match the table schema and not the individual
file's schema. It seems that that's what DataFusion does currently.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]