Re: [PR] Add nulls checks to generated pruning predicates [datafusion]

via GitHub Fri, 31 Jan 2025 21:45:48 -0800


appletreeisyellow commented on code in PR #14297:
URL: https://github.com/apache/datafusion/pull/14297#discussion_r1938185973



##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -819,16 +813,24 @@ impl RequiredColumns {
     /// statistics column, while keeping track that a reference to the 
statistics
     /// column is required
     ///
-    /// for example, an expression like `col("foo") > 5`, when called
+    /// For example, an expression like `col("foo") > 5`, when called
     /// with Max would result in an expression like `col("foo_max") >
-    /// 5` with the appropriate entry noted in self.columns
+    /// 5` with the appropriate entry noted in self.columns.
+    ///
+    /// Along with a reference to the min/max/null_count/row_count column
+    /// an expression for `{stats_col} IS NULL` is returned that should be 
`OR`ed
+    /// with any expressions applied to the column itself such that if any 
value in `{stats_col}`
+    /// is `null` we return `true` and not `null` because `null` is falsy, but 
we'd need to scan that
+    /// container, causing confusion and preventing optimizations because of 
the inverted logic!
+    ///
+    /// Thus for the example above we'd arrive at the final expression 
`foo_max is null or foo_max > 5`.

Review Comment:
   This is helpful!



##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -2894,7 +3003,7 @@ mod tests {
     #[test]
     fn row_group_predicate_cast() -> Result<()> {
         let schema = Schema::new(vec![Field::new("c1", DataType::Int32, 
false)]);
-        let expected_expr = "c1_null_count@2 != c1_row_count@3 AND 
CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64)";
+        let expected_expr = "(c1_null_count@2 IS NULL OR c1_row_count@3 IS 
NULL OR c1_null_count@2 != c1_row_count@3) AND (CAST(c1_min@0 AS Int64) IS NULL 
OR CAST(c1_min@0 AS Int64) <= 1) AND (CAST(c1_max@1 AS Int64) IS NULL OR 1 <= 
CAST(c1_max@1 AS Int64))";

Review Comment:
   I verified in datafusion-cli ✅
   
   ```sql
   > SELECT CAST(NULL AS int) IS NULL;
   +--------------+
   | NULL IS NULL |
   +--------------+
   | true         |
   +--------------+
   1 row(s) fetched.
   Elapsed 0.001 seconds.
   
   > SELECT CAST('2' AS int) IS NULL;
   +-------------------+
   | Utf8("2") IS NULL |
   +-------------------+
   | false             |
   +-------------------+
   1 row(s) fetched.
   Elapsed 0.001 seconds.
   ```



##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -2910,7 +3019,7 @@ mod tests {
         assert_eq!(predicate_expr.to_string(), expected_expr);
 
         let expected_expr =
-            "c1_null_count@1 != c1_row_count@2 AND TRY_CAST(c1_max@0 AS Int64) 
> 1";
+            "(c1_null_count@1 IS NULL OR c1_row_count@2 IS NULL OR 
c1_null_count@1 != c1_row_count@2) AND (TRY_CAST(c1_max@0 AS Int64) IS NULL OR 
TRY_CAST(c1_max@0 AS Int64) > 1)";

Review Comment:
   I verified in datafusion-cli ✅
   
   ```sql
   > SELECT TRY_CAST(NULL AS int) IS NULL;
   +--------------+
   | NULL IS NULL |
   +--------------+
   | true         |
   +--------------+
   1 row(s) fetched.
   Elapsed 0.001 seconds.
   
   > SELECT TRY_CAST(1234.56 AS int) IS NULL;
   +--------------------------+
   | Float64(1234.56) IS NULL |
   +--------------------------+
   | false                    |
   +--------------------------+
   1 row(s) fetched.
   Elapsed 0.001 seconds.
   ```



##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -722,8 +709,15 @@ impl BoolVecBuilder {
                 // False means all containers can not pass the predicate
                 self.inner = vec![false; self.inner.len()];
             }
+            ColumnarValue::Scalar(ScalarValue::Boolean(None)) => {
+                // this should catch bugs in tests while keeping
+                // the "old" behavior for PruningPredicate in production
+                // that if any nulls are encountered they are treated as truthy
+                #[cfg(debug_assertions)]
+                panic!("no predicate should return null!")
+            }

Review Comment:
   Only panic in debug builds when null values are detected 👍 You may want to 
update the docstring on line 702 to reflect that:
   ```
       /// # Panics in debug builds
       /// If `value` is not boolean
   ```



##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -2785,7 +2893,7 @@ mod tests {
             vec![lit(1), lit(2), lit(3)],
             false,
         ));
-        let expected_expr = "c1_null_count@2 != c1_row_count@3 AND c1_min@0 <= 
1 AND 1 <= c1_max@1 OR c1_null_count@2 != c1_row_count@3 AND c1_min@0 <= 2 AND 
2 <= c1_max@1 OR c1_null_count@2 != c1_row_count@3 AND c1_min@0 <= 3 AND 3 <= 
c1_max@1";
+        let expected_expr = "(c1_null_count@2 IS NULL OR c1_row_count@3 IS 
NULL OR c1_null_count@2 != c1_row_count@3) AND (c1_min@0 IS NULL OR c1_min@0 <= 
1) AND (c1_max@1 IS NULL OR 1 <= c1_max@1) OR (c1_null_count@2 IS NULL OR 
c1_row_count@3 IS NULL OR c1_null_count@2 != c1_row_count@3) AND (c1_min@0 IS 
NULL OR c1_min@0 <= 2) AND (c1_max@1 IS NULL OR 2 <= c1_max@1) OR 
(c1_null_count@2 IS NULL OR c1_row_count@3 IS NULL OR c1_null_count@2 != 
c1_row_count@3) AND (c1_min@0 IS NULL OR c1_min@0 <= 3) AND (c1_max@1 IS NULL 
OR 3 <= c1_max@1)";

Review Comment:
   Totally agree!
   
   I'm not sure if DataFusion uses `insta`, but running cargo insta review 
would definitely make the development experience much easier and quicker when 
updating tests. Serializing and pretty-printing the expressions would also 
significantly improve readability.
   
   That said, I know these improvements could become a large PR on their own.
   
   cc @alamb — maybe you can find someone in the community interested in 
tackling this! 



##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -1306,18 +1341,25 @@ fn build_is_null_column_expr(
 
         let null_count_field = &Field::new(field.name(), DataType::UInt64, 
true);
         if with_not {
-            if let Ok(row_count_expr) =
+            if let Ok((row_count_expr, row_count_is_null_expr)) =
                 required_columns.row_count_column_expr(col, expr, 
null_count_field)
             {
                 required_columns
                     .null_count_column_expr(col, expr, null_count_field)
-                    .map(|null_count_column_expr| {
+                    .map(|(null_count_column_expr, 
null_count_column_is_null_expr)| {
+                        // If either the null_count or the row_count are null 
we can't know anything, so return true
+                        let any_null_expr = 
Arc::new(phys_expr::BinaryExpr::new(
+                            row_count_is_null_expr,
+                            Operator::Or,
+                            null_count_column_is_null_expr,
+                        ));

Review Comment:
   Aggressively returning true if either of the stat is not known 👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Re: [PR] Add nulls checks to generated pruning predicates [datafusion]

Reply via email to