alamb commented on code in PR #13795:
URL: https://github.com/apache/datafusion/pull/13795#discussion_r1891957472


##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -1618,33 +1618,29 @@ fn build_statistics_expr(
 /// will become
 ///
 /// ```sql
-/// CASE
-///  WHEN x_null_count = x_row_count THEN false
-///  ELSE x_min <= 10 AND 10 <= x_max
-/// END
+/// NOT (x_null_count = x_row_count) AND (x_min <= 10 AND 10 <= x_max)
 /// ````
 ///
 /// If the column is known to be all nulls, then the expression
 /// `x_null_count = x_row_count` will be true, which will cause the
-/// case expression to return false. Therefore, prune out the container.
-fn wrap_case_expr(
+/// boolean expression to return false. Therefore, prune out the container.
+fn wrap_null_count_check_expr(
     statistics_expr: Arc<dyn PhysicalExpr>,
     expr_builder: &mut PruningExpressionBuilder,
 ) -> Result<Arc<dyn PhysicalExpr>> {
     // x_null_count = x_row_count
-    let when_null_count_eq_row_count = Arc::new(phys_expr::BinaryExpr::new(
+    let not_when_null_count_eq_row_count = Arc::new(phys_expr::BinaryExpr::new(
         expr_builder.null_count_column_expr()?,
-        Operator::Eq,
+        Operator::NotEq,
         expr_builder.row_count_column_expr()?,
     ));
-    let then = 
Arc::new(phys_expr::Literal::new(ScalarValue::Boolean(Some(false))));
-
-    // CASE WHEN x_null_count = x_row_count THEN false ELSE <statistics_expr> 
END
-    Ok(Arc::new(phys_expr::CaseExpr::try_new(
-        None,
-        vec![(when_null_count_eq_row_count, then)],
-        Some(statistics_expr),
-    )?))
+
+    // NOT (x_null_count = x_row_count) AND (<statistics_expr>)

Review Comment:
   I think this comment is now out of date
   
   ```suggestion
       // (x_null_count != x_row_count) AND (<statistics_expr>)
   ```



##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -287,7 +287,12 @@ pub trait PruningStatistics {
 ///   predicate can never possibly be true). The container can be pruned 
(skipped)
 ///   entirely.
 ///
-/// Note that in order to be correct, `PruningPredicate` must return false
+/// While `PruningPredicate` will never return a `NULL` value, the
+/// rewritten predicate (as returned by `build_predicate_expression` and used 
internally
+/// by `PruningPredicate`) may evaluate to `NULL` when some of the min/max 
values
+/// or null / row counts are not known.

Review Comment:
   Thank you. Yes I find reasoning about tri-state boolean logic (and what can 
be proven about what) confusing!



##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -381,16 +380,22 @@ pub trait PruningStatistics {
 /// When these statistics values are substituted in to the rewritten predicate 
and
 /// simplified, the result is `false`:
 ///
-/// * `CASE WHEN null = null THEN false ELSE 1 <= 5 AND 5 <= 100 END AND CASE 
WHEN null = null THEN false ELSE 4 <= 10 AND 10 <= 7 END`
-/// * `null = null` is `null` which is not true, so the `CASE` expression will 
use the `ELSE` clause
-/// * `1 <= 5 AND 5 <= 100 AND 4 <= 10 AND 10 <= 7`
-/// * `true AND true AND true AND false`
+/// * `null != null AND (1 <= 5 AND 5 <= 100) AND null != null AND (4 <= 10 
AND 10 <= 7)`

Review Comment:
   I double checked this and looks good to me



##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -381,16 +380,22 @@ pub trait PruningStatistics {
 /// When these statistics values are substituted in to the rewritten predicate 
and
 /// simplified, the result is `false`:
 ///
-/// * `CASE WHEN null = null THEN false ELSE 1 <= 5 AND 5 <= 100 END AND CASE 
WHEN null = null THEN false ELSE 4 <= 10 AND 10 <= 7 END`
-/// * `null = null` is `null` which is not true, so the `CASE` expression will 
use the `ELSE` clause
-/// * `1 <= 5 AND 5 <= 100 AND 4 <= 10 AND 10 <= 7`
-/// * `true AND true AND true AND false`
+/// * `null != null AND (1 <= 5 AND 5 <= 100) AND null != null AND (4 <= 10 
AND 10 <= 7)`
+/// * `null = null` is `null` which is not true, so the AND moves on to the 
next clause
+/// * `null and (1 <= 5 AND 5 <= 100) AND null AND (4 <= 10 AND 10 <= 7)`
+/// * evaluating the clauses further we get:
+/// * `null and true and null and false`
+/// * `null and false`
 /// * `false`
 ///
 /// Returning `false` means the container can be pruned, which matches the
 /// intuition that  `x = 5 AND y = 10` can’t be true for any row if all values 
of `y`
 /// are `7` or less.
 ///
+/// Note that if we had ended up with `null AND true AND null AND true` the 
result
+/// would have been `null`.
+/// We treat `null` as `true` because we can't prove that the predicate is 
false.

Review Comment:
   ```suggestion
   /// `null` is treated the same as`true`, because we can't prove that the 
predicate is `false.`
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to