Re: [PR] Improve `AggregateFuzz` testing [datafusion]

via GitHub Tue, 15 Oct 2024 12:34:59 -0700


alamb commented on code in PR #12847:
URL: https://github.com/apache/datafusion/pull/12847#discussion_r1801799363



##########
datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs:
##########
@@ -305,3 +327,172 @@ impl AggregationFuzzTestTask {
         )
     }
 }
+
+/// Pretty prints the `RecordBatch`es, limited to the first 100 rows
+fn format_batches_with_limit(batches: &[RecordBatch]) -> impl std::fmt::Display {

Review Comment:
   Without limiting the size of the output, the output is overwhelming.



##########
datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs:
##########
@@ -45,299 +45,149 @@ use rand::{Rng, SeedableRng};
 use tokio::task::JoinSet;
 
 use crate::fuzz_cases::aggregation_fuzzer::{
-    AggregationFuzzerBuilder, ColumnDescr, DatasetGeneratorConfig,
+    AggregationFuzzerBuilder, ColumnDescr, DatasetGeneratorConfig, QueryBuilder,
 };
 
 // ========================================================================
 //  The new aggregation fuzz tests based on [`AggregationFuzzer`]
 // ========================================================================
+//
+// Notes on tests:
+//
+// Since the supported types differ for each aggregation function, the tests
+// below are structured so they enumerate each different aggregate function.
+//
+// The test framework handles varying combinations of arguments (data types),
+// sortedness, and grouping parameters
+//
+// TODO: Test on floating point values (where output needs to be compared with some

Review Comment:
   I plan to work on these tests next. First I will add coverage for StringView / BinaryView.



##########
datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs:
##########
@@ -45,299 +45,149 @@ use rand::{Rng, SeedableRng};
 use tokio::task::JoinSet;
 
 use crate::fuzz_cases::aggregation_fuzzer::{
-    AggregationFuzzerBuilder, ColumnDescr, DatasetGeneratorConfig,
+    AggregationFuzzerBuilder, ColumnDescr, DatasetGeneratorConfig, QueryBuilder,
 };
 
 // ========================================================================
 //  The new aggregation fuzz tests based on [`AggregationFuzzer`]
 // ========================================================================
+//
+// Notes on tests:
+//
+// Since the supported types differ for each aggregation function, the tests
+// below are structured so they enumerate each different aggregate function.
+//
+// The test framework handles varying combinations of arguments (data types),
+// sortedness, and grouping parameters
+//
+// TODO: Test on floating point values (where output needs to be compared with some
+// acceptable range due to floating point rounding)
+//
+// TODO: test other aggregate functions
+// - AVG (unstable given the wide range of inputs)
+//
+// TODO: specific test for ordering (ensure all group by columns are ordered)
 
-// TODO: write more test case to cover more `group by`s and `aggregation function`s
-// TODO: maybe we can use macro to simply the case creating
-
-/// Fuzz test for `basic prim aggr(sum/sum distinct/max/min/count/avg)` + `no group by`
-#[tokio::test(flavor = "multi_thread")]
-async fn test_basic_prim_aggr_no_group() {
-    let builder = AggregationFuzzerBuilder::default();
-
-    // Define data generator config
-    let columns = vec![ColumnDescr::new("a", DataType::Int32)];
-
-    let data_gen_config = DatasetGeneratorConfig {
-        columns,
-        rows_num_range: (512, 1024),
-        sort_keys_set: Vec::new(),
-    };
-
-    // Build fuzzer
-    let fuzzer = builder
-        .data_gen_config(data_gen_config)
-        .data_gen_rounds(16)
-        .add_sql("SELECT sum(a) FROM fuzz_table")
-        .add_sql("SELECT sum(distinct a) FROM fuzz_table")
-        .add_sql("SELECT max(a) FROM fuzz_table")
-        .add_sql("SELECT min(a) FROM fuzz_table")
-        .add_sql("SELECT count(a) FROM fuzz_table")
-        .add_sql("SELECT count(distinct a) FROM fuzz_table")
-        .add_sql("SELECT avg(a) FROM fuzz_table")
-        .table_name("fuzz_table")
-        .build();
-
-    fuzzer.run().await
-}
-
-/// Fuzz test for `basic prim aggr(sum/sum distinct/max/min/count/avg)` + `group by single int64`
-#[tokio::test(flavor = "multi_thread")]
-async fn test_basic_prim_aggr_group_by_single_int64() {
-    let builder = AggregationFuzzerBuilder::default();
-
-    // Define data generator config
-    let columns = vec![
-        ColumnDescr::new("a", DataType::Int32),
-        ColumnDescr::new("b", DataType::Int64),
-        ColumnDescr::new("c", DataType::Int64),
-    ];
-    let sort_keys_set = vec![
-        vec!["b".to_string()],
-        vec!["c".to_string(), "b".to_string()],
-    ];
-    let data_gen_config = DatasetGeneratorConfig {
-        columns,
-        rows_num_range: (512, 1024),
-        sort_keys_set,
-    };
-
-    // Build fuzzer
-    let fuzzer = builder
-        .data_gen_config(data_gen_config)
-        .data_gen_rounds(16)
-        .add_sql("SELECT b, sum(a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, sum(distinct a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, avg(a) FROM fuzz_table GROUP BY b")
-        .table_name("fuzz_table")
-        .build();
-
-    fuzzer.run().await;
-}
-
-/// Fuzz test for `basic prim aggr(sum/sum distinct/max/min/count/avg)` + `group by single string`
 #[tokio::test(flavor = "multi_thread")]
-async fn test_basic_prim_aggr_group_by_single_string() {
-    let builder = AggregationFuzzerBuilder::default();
-
-    // Define data generator config
-    let columns = vec![
-        ColumnDescr::new("a", DataType::Int32),
-        ColumnDescr::new("b", DataType::Utf8),
-        ColumnDescr::new("c", DataType::Int64),
-    ];
-    let sort_keys_set = vec![
-        vec!["b".to_string()],
-        vec!["c".to_string(), "b".to_string()],
-    ];
-    let data_gen_config = DatasetGeneratorConfig {
-        columns,
-        rows_num_range: (512, 1024),
-        sort_keys_set,
-    };
-
-    // Build fuzzer
-    let fuzzer = builder
-        .data_gen_config(data_gen_config)
-        .data_gen_rounds(16)
-        .add_sql("SELECT b, sum(a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, sum(distinct a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
-        .add_sql("SELECT b, avg(a) FROM fuzz_table GROUP BY b")
-        .table_name("fuzz_table")
-        .build();
-
-    fuzzer.run().await;
+async fn test_min() {
+    let data_gen_config = baseline_config();
+
+    // Queries like SELECT min(a) FROM fuzz_table GROUP BY b
+    let query_builder = QueryBuilder::new()

Review Comment:
   The major "innovation" of this PR is to automatically generate the queries as well, to increase coverage (e.g. different numbers of group columns, etc.).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Improve `AggregateFuzz` testing [datafusion]

Reply via email to