Re: [PR] better preserve statistics when applying limits [datafusion]

via GitHub Thu, 04 Sep 2025 23:44:57 -0700


xudong963 commented on code in PR #17381:
URL: https://github.com/apache/datafusion/pull/17381#discussion_r2324235835



##########
datafusion/common/src/stats.rs:
##########
@@ -391,62 +391,85 @@ impl Statistics {
     /// parameter to compute global statistics in a multi-partition setting.
     pub fn with_fetch(
         mut self,
-        schema: SchemaRef,
+        _schema: SchemaRef,

Review Comment:
   Are you keeping the unused param to avoid API change?



##########
datafusion/common/src/stats.rs:
##########
@@ -391,62 +391,85 @@ impl Statistics {
     /// parameter to compute global statistics in a multi-partition setting.
     pub fn with_fetch(
         mut self,
-        schema: SchemaRef,
+        _schema: SchemaRef,
         fetch: Option<usize>,
         skip: usize,
         n_partitions: usize,
     ) -> Result<Self> {
         let fetch_val = fetch.unwrap_or(usize::MAX);
 
-        self.num_rows = match self {
-            Statistics {
-                num_rows: Precision::Exact(nr),
-                ..
+        // Get the ratio of rows after / rows before on a per-partition basis
+        let num_rows_before = self.num_rows;
+
+        // Check for early return case before mutating self
+        if let Precision::Exact(nr) | Precision::Inexact(nr) = self.num_rows {
+            if nr <= fetch_val && skip == 0 {
+                // If the input does not reach the `fetch` globally, and `skip`
+                // is zero (meaning the input and output are identical), return
+                // input stats as is.
+                // TODO: Can input stats still be used, but adjusted, when 
`skip`
+                //       is non-zero?
+                return Ok(self);
             }
-            | Statistics {
-                num_rows: Precision::Inexact(nr),
-                ..
-            } => {
+        }
+
+        self.num_rows = match self.num_rows {
+            Precision::Exact(nr) | Precision::Inexact(nr) => {
                 // Here, the inexact case gives us an upper bound on the 
number of rows.
                 if nr <= skip {
                     // All input data will be skipped:
                     Precision::Exact(0)
-                } else if nr <= fetch_val && skip == 0 {
-                    // If the input does not reach the `fetch` globally, and 
`skip`
-                    // is zero (meaning the input and output are identical), 
return
-                    // input stats as is.
-                    // TODO: Can input stats still be used, but adjusted, when 
`skip`
-                    //       is non-zero?
-                    return Ok(self);
                 } else if nr - skip <= fetch_val {
                     // After `skip` input rows are skipped, the remaining rows 
are
                     // less than or equal to the `fetch` values, so `num_rows` 
must
                     // equal the remaining rows.
                     check_num_rows(
                         (nr - skip).checked_mul(n_partitions),
                         // We know that we have an estimate for the number of 
rows:

Review Comment:
   Do you think the purpose of the comment was to say that the `unwrap()` is 
safe?
   
   If so, I think we can remove the comment.



##########
datafusion/common/src/stats.rs:
##########
@@ -391,62 +391,85 @@ impl Statistics {
     /// parameter to compute global statistics in a multi-partition setting.
     pub fn with_fetch(
         mut self,
-        schema: SchemaRef,
+        _schema: SchemaRef,
         fetch: Option<usize>,
         skip: usize,
         n_partitions: usize,
     ) -> Result<Self> {
         let fetch_val = fetch.unwrap_or(usize::MAX);
 
-        self.num_rows = match self {
-            Statistics {
-                num_rows: Precision::Exact(nr),
-                ..
+        // Get the ratio of rows after / rows before on a per-partition basis
+        let num_rows_before = self.num_rows;
+
+        // Check for early return case before mutating self
+        if let Precision::Exact(nr) | Precision::Inexact(nr) = self.num_rows {
+            if nr <= fetch_val && skip == 0 {
+                // If the input does not reach the `fetch` globally, and `skip`
+                // is zero (meaning the input and output are identical), return
+                // input stats as is.
+                // TODO: Can input stats still be used, but adjusted, when 
`skip`
+                //       is non-zero?
+                return Ok(self);
             }
-            | Statistics {
-                num_rows: Precision::Inexact(nr),
-                ..
-            } => {
+        }
+
+        self.num_rows = match self.num_rows {
+            Precision::Exact(nr) | Precision::Inexact(nr) => {
                 // Here, the inexact case gives us an upper bound on the 
number of rows.
                 if nr <= skip {
                     // All input data will be skipped:
                     Precision::Exact(0)
-                } else if nr <= fetch_val && skip == 0 {
-                    // If the input does not reach the `fetch` globally, and 
`skip`
-                    // is zero (meaning the input and output are identical), 
return
-                    // input stats as is.
-                    // TODO: Can input stats still be used, but adjusted, when 
`skip`
-                    //       is non-zero?
-                    return Ok(self);
                 } else if nr - skip <= fetch_val {
                     // After `skip` input rows are skipped, the remaining rows 
are
                     // less than or equal to the `fetch` values, so `num_rows` 
must
                     // equal the remaining rows.
                     check_num_rows(
                         (nr - skip).checked_mul(n_partitions),
                         // We know that we have an estimate for the number of 
rows:
-                        self.num_rows.is_exact().unwrap(),
+                        matches!(num_rows_before, Precision::Exact(_)),
                     )
                 } else {
                     // At this point we know that we were given a `fetch` value
                     // as the `None` case would go into the branch above. Since
                     // the input has more rows than `fetch + skip`, the number
-                    // of rows will be the `fetch`, but we won't be able to
-                    // predict the other statistics.
+                    // of rows will be the `fetch`, other statistics will have 
to be downgraded to inexact.
                     check_num_rows(
                         fetch_val.checked_mul(n_partitions),
                         // We know that we have an estimate for the number of 
rows:
-                        self.num_rows.is_exact().unwrap(),
+                        matches!(num_rows_before, Precision::Exact(_)),
                     )
                 }
             }
-            Statistics {
-                num_rows: Precision::Absent,
-                ..
-            } => check_num_rows(fetch.and_then(|v| 
v.checked_mul(n_partitions)), false),
+            Precision::Absent => {
+                check_num_rows(fetch.and_then(|v| 
v.checked_mul(n_partitions)), false)
+            }
+        };
+        let ratio: f64 = match (num_rows_before, self.num_rows) {
+            (
+                Precision::Exact(nr_before) | Precision::Inexact(nr_before),
+                Precision::Exact(nr_after) | Precision::Inexact(nr_after),
+            ) => {
+                if nr_before == 0 {
+                    0.0
+                } else {
+                    nr_after as f64 / nr_before as f64
+                }
+            }
+            _ => 0.0,
+        };
+        self.column_statistics = self
+            .column_statistics
+            .into_iter()
+            .map(ColumnStatistics::to_inexact)
+            .collect();

Review Comment:
   The `limit` can make the column_statistics super different from the 
original, so here I'm wondering if it's better to give it an unknown. I'm 
worried we'll use inexact column_statistics to do some estimation, but in fact, 
the column_statistics is very deviant 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Re: [PR] better preserve statistics when applying limits [datafusion]

Reply via email to