adriangb commented on code in PR #15301:
URL: https://github.com/apache/datafusion/pull/15301#discussion_r2006767851
##########
datafusion/core/src/datasource/physical_plan/parquet.rs:
##########
@@ -1655,4 +1656,46 @@ mod tests {
assert_eq!(calls.len(), 2);
assert_eq!(calls, vec![Some(123), Some(456)]);
}
+
+ #[tokio::test]
+ async fn test_topk_predicate_pushdown() {
+ let ctx = SessionContext::new();
+ let opt = ListingOptions::new(Arc::new(ParquetFormat::default()))
+ // We need to force 1 partition because TopK predicate pushdown
happens on a per-partition basis
+ // If we had 1 file per partition (as an example) no pushdown
would happen
+ .with_target_partitions(1);
+
+ let tmp_dir = TempDir::new().unwrap();
+ let path = tmp_dir.path().to_str().unwrap().to_string();
+ // The point here is that we write many, many files.
+ // So when we scan after we processed the first one we should be able
to skip the rest
+ // because of the TopK predicate pushdown.
+ for file in 0..100 {
+ let name = format!("test{:02}.parquet", file);
+ write_file(&format!("{path}/{name}"));
+ }
+ ctx.register_listing_table("base_table", path, opt, None, None)
+ .await
+ .unwrap();
+
+ let query = "select name from base_table order by id desc limit 3";
+
+ let batches = ctx.sql(query).await.unwrap().collect().await.unwrap();
+ #[rustfmt::skip]
+ let expected = [
+ "+--------+",
+ "| name |",
+ "+--------+",
+ "| test02 |",
+ "| test02 |",
+ "| test02 |",
+ "+--------+",
+ ];
+ assert_batches_eq!(expected, &batches);
+
+ let sql = format!("explain analyze {query}");
+ let batches = ctx.sql(&sql).await.unwrap().collect().await.unwrap();
+ let explain_plan = format!("{}",
pretty_format_batches(&batches).unwrap());
+ assert_contains!(explain_plan, "row_groups_pruned_statistics=96");
Review Comment:
Yes! More tests! I just tried this in my full system and found a bug with hive
partition columns. Making a note to add a test and fix it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]