suremarc commented on code in PR #9593:
URL: https://github.com/apache/datafusion/pull/9593#discussion_r1585938974
##########
datafusion/core/src/datasource/physical_plan/mod.rs:
##########
@@ -473,15 +468,281 @@ fn get_projected_output_ordering(
// since rest of the orderings are violated
break;
}
+
// do not push empty entries
// otherwise we may have `Some(vec![])` at the output ordering.
- if !new_ordering.is_empty() {
- all_orderings.push(new_ordering);
+ if new_ordering.is_empty() {
+ continue;
+ }
+
+ // Check if any file groups are not sorted
+ if base_config.file_groups.iter().any(|group| {
+ if group.len() <= 1 {
+ // File groups with <= 1 files are always sorted
+ return false;
+ }
+
+ let statistics = match MinMaxStatistics::new_from_files(
+ &new_ordering,
+ projected_schema,
+ base_config.projection.as_deref(),
+ group,
+ ) {
+ Ok(statistics) => statistics,
+ Err(e) => {
+ log::trace!("Error fetching statistics for file group:
{e}");
+ // we can't prove that it's ordered, so we have to reject
it
+ return true;
+ }
+ };
+
+ !statistics.is_sorted()
+ }) {
+ debug!(
+ "Skipping specified output ordering {:?}. \
+ Some file groups couldn't be determined to be sorted: {:?}",
+ base_config.output_ordering[0], base_config.file_groups
+ );
+ continue;
}
+
+ all_orderings.push(new_ordering);
}
all_orderings
}
+/// A normalized representation of file min/max statistics that allows for
efficient sorting & comparison.
+/// The min/max values are ordered by [`Self::sort_order`].
+/// Furthermore, any columns that are reversed in the sort order have their
min/max values swapped.
+pub(crate) struct MinMaxStatistics {
+ min_by_sort_order: arrow::row::Rows,
+ max_by_sort_order: arrow::row::Rows,
+ sort_order: Vec<PhysicalSortExpr>,
+}
+
+impl MinMaxStatistics {
+ #[allow(unused)]
+ fn sort_order(&self) -> &[PhysicalSortExpr] {
+ &self.sort_order
+ }
+
+ fn new_from_files<'a>(
+ projected_sort_order: &[PhysicalSortExpr], // Sort order with respect
to projected schema
+ projected_schema: &SchemaRef, // Projected schema
+ projection: Option<&[usize]>, // Indices of projection in full table
schema (None = all columns)
Review Comment:
I am not sure what you mean, but `projection` is what was used to produce
`projected_schema`. It tells us what position the columns of `projected_schema`
would be in the full schema. Does that make it more clear?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]