zhuqi-lucas commented on code in PR #21580:
URL: https://github.com/apache/datafusion/pull/21580#discussion_r3128524960
##########
datafusion/datasource-parquet/src/access_plan.rs:
##########
@@ -377,6 +381,124 @@ impl PreparedAccessPlan {
})
}
+ /// Return a reference to the row group indexes.
+ pub(crate) fn row_group_indexes(&self) -> &[usize] {
+ &self.row_group_indexes
+ }
+
+ /// Keep only the first `count` row groups, dropping the rest.
+ /// Used for TopK cumulative pruning after reorder + reverse.
+ pub(crate) fn truncate_row_groups(mut self, count: usize) -> Self {
+ self.row_group_indexes.truncate(count);
+ // Clear row_selection since it's tied to the original RG set
+ if self.row_selection.is_some() {
+ self.row_selection = None;
+ }
+ self
+ }
+
+ /// Reorder row groups by their min statistics for the given sort order.
+ ///
+ /// This helps TopK queries find optimal values first. For ASC sort,
+ /// row groups with the smallest min values come first. For DESC sort,
+ /// row groups with the largest min values come first.
+ ///
+ /// Gracefully skips reordering when:
+ /// - There is a row_selection (too complex to remap)
+ /// - 0 or 1 row groups (nothing to reorder)
+ /// - Sort expression is not a simple column reference
+ /// - Statistics are unavailable
+ pub(crate) fn reorder_by_statistics(
+ mut self,
+ sort_order: &LexOrdering,
+ file_metadata: &ParquetMetaData,
+ arrow_schema: &Schema,
+ ) -> Result<Self> {
+ // Skip if row_selection present (too complex to remap)
+ if self.row_selection.is_some() {
+ debug!("Skipping RG reorder: row_selection present");
+ return Ok(self);
+ }
+
+ // Nothing to reorder
+ if self.row_group_indexes.len() <= 1 {
+ return Ok(self);
+ }
+
+ // Get the first sort expression
+ // LexOrdering is guaranteed non-empty, so first() returns &PhysicalSortExpr
+ let first_sort_expr = sort_order.first();
Review Comment:
Addressed it in latest PR.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]