xudong963 commented on code in PR #17381: URL: https://github.com/apache/datafusion/pull/17381#discussion_r2326464415
########## datafusion/common/src/stats.rs: ########## @@ -391,62 +391,85 @@ impl Statistics { /// parameter to compute global statistics in a multi-partition setting. pub fn with_fetch( mut self, - schema: SchemaRef, + _schema: SchemaRef, fetch: Option<usize>, skip: usize, n_partitions: usize, ) -> Result<Self> { let fetch_val = fetch.unwrap_or(usize::MAX); - self.num_rows = match self { - Statistics { - num_rows: Precision::Exact(nr), - .. + // Get the ratio of rows after / rows before on a per-partition basis + let num_rows_before = self.num_rows; + + // Check for early return case before mutating self + if let Precision::Exact(nr) | Precision::Inexact(nr) = self.num_rows { + if nr <= fetch_val && skip == 0 { + // If the input does not reach the `fetch` globally, and `skip` + // is zero (meaning the input and output are identical), return + // input stats as is. + // TODO: Can input stats still be used, but adjusted, when `skip` + // is non-zero? + return Ok(self); } - | Statistics { - num_rows: Precision::Inexact(nr), - .. - } => { + } + + self.num_rows = match self.num_rows { + Precision::Exact(nr) | Precision::Inexact(nr) => { // Here, the inexact case gives us an upper bound on the number of rows. if nr <= skip { // All input data will be skipped: Precision::Exact(0) - } else if nr <= fetch_val && skip == 0 { - // If the input does not reach the `fetch` globally, and `skip` - // is zero (meaning the input and output are identical), return - // input stats as is. - // TODO: Can input stats still be used, but adjusted, when `skip` - // is non-zero? - return Ok(self); } else if nr - skip <= fetch_val { // After `skip` input rows are skipped, the remaining rows are // less than or equal to the `fetch` values, so `num_rows` must // equal the remaining rows. check_num_rows( (nr - skip).checked_mul(n_partitions), // We know that we have an estimate for the number of rows: - self.num_rows.is_exact().unwrap(), + matches!(num_rows_before, Precision::Exact(_)), ) } else { // At this point we know that we were given a `fetch` value // as the `None` case would go into the branch above. Since // the input has more rows than `fetch + skip`, the number - // of rows will be the `fetch`, but we won't be able to - // predict the other statistics. + // of rows will be the `fetch`, other statistics will have to be downgraded to inexact. check_num_rows( fetch_val.checked_mul(n_partitions), // We know that we have an estimate for the number of rows: - self.num_rows.is_exact().unwrap(), + matches!(num_rows_before, Precision::Exact(_)), ) } } - Statistics { - num_rows: Precision::Absent, - .. - } => check_num_rows(fetch.and_then(|v| v.checked_mul(n_partitions)), false), + Precision::Absent => { + check_num_rows(fetch.and_then(|v| v.checked_mul(n_partitions)), false) + } + }; + let ratio: f64 = match (num_rows_before, self.num_rows) { + ( + Precision::Exact(nr_before) | Precision::Inexact(nr_before), + Precision::Exact(nr_after) | Precision::Inexact(nr_after), + ) => { + if nr_before == 0 { + 0.0 + } else { + nr_after as f64 / nr_before as f64 + } + } + _ => 0.0, + }; + self.column_statistics = self + .column_statistics + .into_iter() + .map(ColumnStatistics::to_inexact) + .collect(); Review Comment: > do you know of any cases where this would lead to a worse result? No, just out of a conservative mindset. But I don't have a strong bias for this, we can adjust anytime. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org