adriangb commented on code in PR #19113:
URL: https://github.com/apache/datafusion/pull/19113#discussion_r2600899000
##########
datafusion/common/src/stats.rs:
##########
@@ -502,15 +502,38 @@ impl Statistics {
self.column_statistics = self
.column_statistics
.into_iter()
- .map(ColumnStatistics::to_inexact)
+ .map(|cs| {
+ let mut cs = cs.to_inexact();
+ // Scale byte_size by the row ratio
+ cs.byte_size = match cs.byte_size {
+ Precision::Exact(n) | Precision::Inexact(n) => {
+ Precision::Inexact((n as f64 * ratio) as usize)
+ }
+ Precision::Absent => Precision::Absent,
+ };
+ cs
+ })
.collect();
- // Adjust the total_byte_size for the ratio of rows before and after, also marking it as inexact
- self.total_byte_size = match &self.total_byte_size {
- Precision::Exact(n) | Precision::Inexact(n) => {
- let adjusted = (*n as f64 * ratio) as usize;
- Precision::Inexact(adjusted)
+
+ // Compute total_byte_size as sum of column byte_size values if all are present,
+ // otherwise fall back to scaling the original total_byte_size
+ let sum_scan_bytes: Option<usize> = self
+ .column_statistics
+ .iter()
+ .map(|cs| cs.byte_size.get_value().copied())
+ .try_fold(0usize, |acc, val| val.map(|v| acc + v));
Review Comment:
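I think this needs to handle exactness: `get_value()` discards whether each column's `byte_size` is `Exact` or `Inexact`, so the summed value no longer carries that information.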
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]