Re: [PR] Improve `AggregateFuzz` testing: generate random queries [datafusion]

via GitHub Thu, 17 Oct 2024 09:13:08 -0700


alamb commented on code in PR #12847:
URL: https://github.com/apache/datafusion/pull/12847#discussion_r1805054399



##########
datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs:
##########
@@ -305,3 +327,172 @@ impl AggregationFuzzTestTask {
         )
     }
 }
+
+/// Pretty prints the `RecordBatch`es, limited to the first 100 rows
+fn format_batches_with_limit(batches: &[RecordBatch]) -> impl 
std::fmt::Display {

Review Comment:
   I found that seeing the differences in the output made it easier for me to 
understand what was wrong
   
   Specifically, when I added a test for `AVG` and it failed intermittently for 
me, it was easy for me to say "integer overflow" as the values of the averages 
were `e19` or something like that



##########
datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs:
##########
@@ -305,3 +327,172 @@ impl AggregationFuzzTestTask {
         )
     }
 }
+
+/// Pretty prints the `RecordBatch`es, limited to the first 100 rows
+fn format_batches_with_limit(batches: &[RecordBatch]) -> impl 
std::fmt::Display {
+    const MAX_ROWS: usize = 100;
+    let mut row_count = 0;
+    let to_print = batches
+        .iter()
+        .filter_map(|b| {
+            if row_count >= MAX_ROWS {
+                None
+            } else if row_count + b.num_rows() > MAX_ROWS {
+                // output last rows before limit
+                let slice_len = MAX_ROWS - row_count;
+                let b = b.slice(0, slice_len);
+                row_count += slice_len;
+                Some(b)
+            } else {
+                row_count += b.num_rows();
+                Some(b.clone())
+            }
+        })
+        .collect::<Vec<_>>();
+
+    pretty_format_batches(&to_print).unwrap()
+}
+
+/// Random aggregate query builder
+///
+/// Creates queries like
+/// ```sql
+/// SELECT AGG(..) FROM table_name GROUP BY <group_by_columns>
+///```
+#[derive(Debug, Default)]
+pub struct QueryBuilder {
+    /// The name of the table to query
+    table_name: String,
+    /// Aggregate functions to be used in the query
+    /// (function_name, is_distinct)
+    aggregate_functions: Vec<(String, bool)>,
+    /// Columns to be used in group by
+    group_by_columns: Vec<String>,
+    /// Possible columns for arguments in the aggregate functions
+    ///
+    /// Assumes each
+    arguments: Vec<String>,
+}
+impl QueryBuilder {
+    pub fn new() -> Self {
+        std::default::Default::default()
+    }
+
+    /// return the table name if any
+    pub fn table_name(&self) -> &str {
+        &self.table_name
+    }
+
+    /// Set the table name for the query builder
+    pub fn with_table_name(mut self, table_name: impl Into<String>) -> Self {
+        self.table_name = table_name.into();
+        self
+    }
+
+    /// Add a new possible aggregate function to the query builder
+    pub fn with_aggregate_function(
+        mut self,
+        aggregate_function: impl Into<String>,
+    ) -> Self {
+        self.aggregate_functions
+            .push((aggregate_function.into(), false));
+        self
+    }
+
+    /// Add a new possible `DISTINCT` aggregate function to the query
+    ///
+    /// This is different than `with_aggregate_function` because only certain
+    /// aggregates support `DISTINCT`
+    pub fn with_distinct_aggregate_function(
+        mut self,
+        aggregate_function: impl Into<String>,
+    ) -> Self {
+        self.aggregate_functions
+            .push((aggregate_function.into(), true));
+        self
+    }
+
+    /// Add a column to be used in the group bys
+    pub fn with_group_by_columns<'a>(
+        mut self,
+        group_by: impl IntoIterator<Item = &'a str>,
+    ) -> Self {
+        let group_by = group_by.into_iter().map(String::from);
+        self.group_by_columns.extend(group_by);
+        self
+    }
+
+    /// Add a column to be used as an argument in the aggregate functions
+    pub fn with_aggregate_arguments<'a>(
+        mut self,
+        arguments: impl IntoIterator<Item = &'a str>,
+    ) -> Self {
+        let arguments = arguments.into_iter().map(String::from);
+        self.arguments.extend(arguments);
+        self
+    }
+
+    pub fn generate_query(&self) -> String {
+        let group_by = self.random_group_by();
+        let mut query = String::from("SELECT ");
+        query.push_str(&self.random_aggregate_functions().join(", "));
+        query.push_str(" FROM ");
+        query.push_str(&self.table_name);
+        if !group_by.is_empty() {
+            query.push_str(" GROUP BY ");
+            query.push_str(&group_by.join(", "));
+        }
+        query
+    }
+
+    /// Generate a random number of aggregate functions (potentially 
repeating).
+    ///
+    fn random_aggregate_functions(&self) -> Vec<String> {

Review Comment:
   Yes, that is right -- Improved the comments to clarify



##########
datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs:
##########
@@ -305,3 +327,172 @@ impl AggregationFuzzTestTask {
         )
     }
 }
+
+/// Pretty prints the `RecordBatch`es, limited to the first 100 rows
+fn format_batches_with_limit(batches: &[RecordBatch]) -> impl 
std::fmt::Display {
+    const MAX_ROWS: usize = 100;
+    let mut row_count = 0;
+    let to_print = batches
+        .iter()
+        .filter_map(|b| {
+            if row_count >= MAX_ROWS {
+                None
+            } else if row_count + b.num_rows() > MAX_ROWS {
+                // output last rows before limit
+                let slice_len = MAX_ROWS - row_count;
+                let b = b.slice(0, slice_len);
+                row_count += slice_len;
+                Some(b)
+            } else {
+                row_count += b.num_rows();
+                Some(b.clone())
+            }
+        })
+        .collect::<Vec<_>>();
+
+    pretty_format_batches(&to_print).unwrap()
+}
+
+/// Random aggregate query builder
+///
+/// Creates queries like
+/// ```sql
+/// SELECT AGG(..) FROM table_name GROUP BY <group_by_columns>
+///```
+#[derive(Debug, Default)]
+pub struct QueryBuilder {
+    /// The name of the table to query
+    table_name: String,
+    /// Aggregate functions to be used in the query
+    /// (function_name, is_distinct)
+    aggregate_functions: Vec<(String, bool)>,
+    /// Columns to be used in group by
+    group_by_columns: Vec<String>,
+    /// Possible columns for arguments in the aggregate functions
+    ///
+    /// Assumes each
+    arguments: Vec<String>,
+}
+impl QueryBuilder {
+    pub fn new() -> Self {
+        std::default::Default::default()
+    }
+
+    /// return the table name if any
+    pub fn table_name(&self) -> &str {
+        &self.table_name
+    }
+
+    /// Set the table name for the query builder
+    pub fn with_table_name(mut self, table_name: impl Into<String>) -> Self {
+        self.table_name = table_name.into();
+        self
+    }
+
+    /// Add a new possible aggregate function to the query builder
+    pub fn with_aggregate_function(
+        mut self,
+        aggregate_function: impl Into<String>,
+    ) -> Self {
+        self.aggregate_functions
+            .push((aggregate_function.into(), false));
+        self
+    }
+
+    /// Add a new possible `DISTINCT` aggregate function to the query
+    ///
+    /// This is different than `with_aggregate_function` because only certain
+    /// aggregates support `DISTINCT`
+    pub fn with_distinct_aggregate_function(
+        mut self,
+        aggregate_function: impl Into<String>,
+    ) -> Self {
+        self.aggregate_functions
+            .push((aggregate_function.into(), true));
+        self
+    }
+
+    /// Add a column to be used in the group bys
+    pub fn with_group_by_columns<'a>(
+        mut self,
+        group_by: impl IntoIterator<Item = &'a str>,
+    ) -> Self {
+        let group_by = group_by.into_iter().map(String::from);
+        self.group_by_columns.extend(group_by);
+        self
+    }
+
+    /// Add a column to be used as an argument in the aggregate functions
+    pub fn with_aggregate_arguments<'a>(
+        mut self,
+        arguments: impl IntoIterator<Item = &'a str>,
+    ) -> Self {
+        let arguments = arguments.into_iter().map(String::from);
+        self.arguments.extend(arguments);
+        self
+    }
+
+    pub fn generate_query(&self) -> String {
+        let group_by = self.random_group_by();
+        let mut query = String::from("SELECT ");
+        query.push_str(&self.random_aggregate_functions().join(", "));
+        query.push_str(" FROM ");
+        query.push_str(&self.table_name);
+        if !group_by.is_empty() {
+            query.push_str(" GROUP BY ");
+            query.push_str(&group_by.join(", "));
+        }
+        query
+    }
+
+    /// Generate a random number of aggregate functions (potentially 
repeating).
+    ///
+    fn random_aggregate_functions(&self) -> Vec<String> {
+        const MAX_NUM_FUNCTIONS: usize = 5;
+        let mut rng = thread_rng();
+        let num_aggregate_functions = rng.gen_range(1..MAX_NUM_FUNCTIONS);
+
+        let mut alias_gen = 1;
+
+        let mut aggregate_functions = vec![];
+        while aggregate_functions.len() < num_aggregate_functions {
+            let idx = rng.gen_range(0..self.aggregate_functions.len());
+            let (function_name, is_distinct) = &self.aggregate_functions[idx];
+            let argument = self.random_argument();
+            let alias = format!("col{}", alias_gen);
+            let distinct = if *is_distinct { "DISTINCT " } else { "" };
+            alias_gen += 1;
+            let function = format!("{function_name}({distinct}{argument}) as 
{alias}");
+            aggregate_functions.push(function);
+        }
+        aggregate_functions
+    }
+
+    /// Pick a random aggregate function argument
+    fn random_argument(&self) -> String {
+        let mut rng = thread_rng();
+        let idx = rng.gen_range(0..self.arguments.len());
+        self.arguments[idx].clone()
+    }
+
+    /// Pick a random number of fields to group by (non repeating)
+    ///
+    /// Limited to 3 group by columns to ensure coverage for large groups. With
+    /// larger numbers of columns, each group has many fewer values.
+    fn random_group_by(&self) -> Vec<String> {

Review Comment:
   Yes, this is a good point. I think there should be some way to generate 
lower cardinality columns. I will do that as a follow on PR



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Improve `AggregateFuzz` testing: generate random queries [datafusion]

Reply via email to