nuno-faria commented on code in PR #22950:
URL: https://github.com/apache/datafusion/pull/22950#discussion_r3438157672
##########
datafusion/catalog-listing/src/table.rs:
##########
@@ -264,6 +264,18 @@ impl ListingTable {
self
}
+ fn statistics_cache(
Review Comment:
Could you please add a small comment explaining why we discard the cache here,
so that the reasoning is documented for future reference?
##########
datafusion/core/tests/parquet/file_statistics.rs:
##########
@@ -166,6 +169,121 @@ async fn load_table_stats_with_session_level_cache() {
assert_eq!(get_static_cache_size(&state1), 1);
}
+#[tokio::test]
+async fn anonymous_parquet_stats_cache_with_explicit_wider_schema() {
+ let temp_dir = tempdir().unwrap();
+ let parquet_path = temp_dir.path().join("data.parquet");
+ let parquet_path = parquet_path.to_string_lossy().to_string();
+
+ let ctx = SessionContext::new_with_config(
+ SessionConfig::new().with_collect_statistics(true),
+ );
+ let cache = ctx
+ .runtime_env()
+ .cache_manager
+ .get_file_statistic_cache()
+ .unwrap();
+
+ ctx.sql(&format!(
+ "COPY (
+ SELECT 1::BIGINT AS id, 1000::BIGINT AS population
+ ) TO '{parquet_path}' STORED AS PARQUET"
+ ))
+ .await
+ .unwrap()
+ .collect()
+ .await
+ .unwrap();
+
+ assert_eq!(cache.len(), 0);
+
+ ctx.read_parquet(&parquet_path, ParquetReadOptions::default())
+ .await
+ .unwrap()
+ .collect()
+ .await
+ .unwrap();
+
+ assert_eq!(cache.len(), 1);
+
+ let wider_schema = Schema::new(vec![
+ Field::new("id", DataType::Int64, true),
+ Field::new("population", DataType::Int64, true),
+ Field::new("extra", DataType::Int64, true),
+ ]);
+
+ let plan = ctx
+ .read_parquet(
+ &parquet_path,
+ ParquetReadOptions::default().schema(&wider_schema),
+ )
+ .await
+ .unwrap()
+ .select_columns(&["id", "extra"])
+ .unwrap()
+ .create_physical_plan()
+ .await
+ .unwrap();
+
+ let stats = plan.partition_statistics(None).unwrap();
+ assert_eq!(stats.column_statistics.len(), 2);
+ assert_eq!(stats.column_statistics[1].null_count, Precision::Exact(1));
+ assert_eq!(cache.len(), 1);
+}
+
+#[tokio::test]
+async fn anonymous_specified_schema_skips_session_statistics_cache() {
Review Comment:
Is this test needed? It seems very similar to
`anonymous_parquet_stats_cache_with_explicit_wider_schema`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]