alamb commented on code in PR #13651:
URL: https://github.com/apache/datafusion/pull/13651#discussion_r1896724308
##########
datafusion/expr/src/logical_plan/invariants.rs:
##########
@@ -15,14 +15,98 @@
// specific language governing permissions and limitations
// under the License.
-use crate::analyzer::check_plan;
-use crate::utils::collect_subquery_cols;
+use datafusion_common::{
+ plan_err,
+ tree_node::{TreeNode, TreeNodeRecursion},
+ DFSchemaRef, DataFusionError, Result,
+};
-use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
-use datafusion_common::{plan_err, Result};
-use datafusion_expr::expr_rewriter::strip_outer_reference;
-use datafusion_expr::utils::split_conjunction;
-use datafusion_expr::{Aggregate, Expr, Filter, Join, JoinType, LogicalPlan,
Window};
+use crate::{
+ expr::{Exists, InSubquery},
+ expr_rewriter::strip_outer_reference,
+ utils::{collect_subquery_cols, split_conjunction},
+ Aggregate, Expr, Filter, Join, JoinType, LogicalPlan, Window,
+};
+
+pub enum InvariantLevel {
+ /// Invariants that are always true in DataFusion `LogicalPlan`s
+ /// such as the number of expected children and no duplicated output fields
+ Always,
+ /// Invariants that must hold true for the plan to be "executable"
+ /// such as the type and number of function arguments are correct and
+ /// that wildcards have been expanded
+ ///
+ /// To ensure a LogicalPlan satisfies the `Executable` inariants, run the
+ /// `Analyzer`
+ Executable,
+}
+
+pub fn assert_required_invariants(plan: &LogicalPlan) -> Result<()> {
Review Comment:
would this be more accurately named `assert_always_invariants` (to mirror
`Invariant::Always`)?
##########
datafusion/optimizer/src/optimizer.rs:
##########
@@ -384,9 +396,26 @@ impl Optimizer {
// rule handles recursion itself
None => optimize_plan_node(new_plan, rule.as_ref(),
config),
}
- // verify the rule didn't change the schema
.and_then(|tnr| {
- assert_schema_is_the_same(rule.name(), &starting_schema,
&tnr.data)?;
+ // run checks optimizer invariant checks, per pass
+ assert_valid_optimization(&tnr.data, &starting_schema)
+ .map_err(|e| {
+ DataFusionError::Context(
+ format!("check_optimizer_specific_invariants
after optimizer pass: {}", rule.name()),
+ Box::new(e),
+ )
+ })?;
+
+ // run LP invariant checks only in debug
Review Comment:
```suggestion
// run LP invariant checks only in debug mode for
performance reasons
```
##########
datafusion/optimizer/src/optimizer.rs:
##########
@@ -384,9 +396,26 @@ impl Optimizer {
// rule handles recursion itself
None => optimize_plan_node(new_plan, rule.as_ref(),
config),
}
- // verify the rule didn't change the schema
.and_then(|tnr| {
- assert_schema_is_the_same(rule.name(), &starting_schema,
&tnr.data)?;
+ // run checks optimizer invariant checks, per pass
+ assert_valid_optimization(&tnr.data, &starting_schema)
+ .map_err(|e| {
+ DataFusionError::Context(
+ format!("check_optimizer_specific_invariants
after optimizer pass: {}", rule.name()),
+ Box::new(e),
+ )
+ })?;
+
+ // run LP invariant checks only in debug
+ #[cfg(debug_assertions)]
Review Comment:
I also double checked this is the right name:
https://doc.rust-lang.org/reference/conditional-compilation.html#debug_assertions
##########
datafusion/expr/src/utils.rs:
##########
@@ -1402,6 +1402,24 @@ pub fn format_state_name(name: &str, state_name: &str)
-> String {
format!("{name}[{state_name}]")
}
+/// Determine the set of [`Column`]s produced by the subquery.
Review Comment:
👍
##########
datafusion/optimizer/src/analyzer/mod.rs:
##########
@@ -140,6 +143,14 @@ impl Analyzer {
where
F: FnMut(&LogicalPlan, &dyn AnalyzerRule),
{
+ // verify the logical plan required invariants at the start, before
analyzer
+ plan.check_invariants(InvariantLevel::Always).map_err(|e| {
+ DataFusionError::Context(
+ "assert_lp_invariants_before_analyzers".to_string(),
Review Comment:
minor thing you can use `DataFusion::context` here to make it more concise
https://github.com/apache/datafusion/blob/c0ca4b4e449e07c3bcd6f3593fa31dd31ed5e0c5/datafusion/core/src/datasource/physical_plan/statistics.rs#L258-L257
##########
datafusion/optimizer/src/optimizer.rs:
##########
@@ -445,35 +462,38 @@ impl Optimizer {
}
i += 1;
}
+
+ // verify LP is valid, after the last optimizer pass.
+ new_plan
+ .check_invariants(InvariantLevel::Executable)
+ .map_err(|e| {
+ DataFusionError::Context(
+ "check_plan_after_optimizers".to_string(),
+ Box::new(e),
+ )
+ })?;
+
log_plan("Final optimized plan", &new_plan);
debug!("Optimizer took {} ms", start_time.elapsed().as_millis());
Ok(new_plan)
}
}
-/// Returns an error if `new_plan`'s schema is different than `prev_schema`
+/// These are invariants which should hold true before and after each
optimization.
///
-/// It ignores metadata and nullability.
-pub(crate) fn assert_schema_is_the_same(
+/// This differs from [`LogicalPlan::check_invariants`], which addresses if a
singular
+/// LogicalPlan is valid. Instead this address if the optimization (before and
after)
+/// is valid based upon permitted changes.
+fn assert_valid_optimization(
Review Comment:
Make sense
##########
datafusion/optimizer/src/analyzer/mod.rs:
##########
@@ -167,33 +178,16 @@ impl Analyzer {
log_plan(rule.name(), &new_plan);
observer(&new_plan, rule.as_ref());
}
- // for easier display in explain output
- check_plan(&new_plan).map_err(|e| {
- DataFusionError::Context("check_analyzed_plan".to_string(),
Box::new(e))
- })?;
+
+ // verify at the end, after the last LP analyzer pass, that the plan
is executable.
+ new_plan
+ .check_invariants(InvariantLevel::Executable)
+ .map_err(|e| {
+ DataFusionError::Context("check_analyzed_plan".to_string(),
Box::new(e))
Review Comment:
I also think a more human readable message here would help someone
understand what is going on if they get the error (imagine someone running a
query and they see "check_analyzed_plan: error ..."). It will liekly not mean
much
Maybe it could be like
"Invalid plan after Analyzer"
(Also this could use `DataFusionError::context` as well if you wanted)
##########
datafusion/optimizer/src/optimizer.rs:
##########
@@ -355,13 +356,24 @@ impl Optimizer {
where
F: FnMut(&LogicalPlan, &dyn OptimizerRule),
{
+ // verify LP is valid, before the first LP optimizer pass.
+ plan.check_invariants(InvariantLevel::Executable)
+ .map_err(|e| {
+ DataFusionError::Context(
+ "check_plan_is_executable before optimizers".to_string(),
Review Comment:
```suggestion
"Invalid plan before optimizers".to_string(),
```
##########
datafusion/optimizer/src/optimizer.rs:
##########
@@ -529,7 +567,9 @@ mod tests {
let err = opt.optimize(plan, &config, &observe).unwrap_err();
assert_eq!(
"Optimizer rule 'get table_scan rule' failed\n\
- caused by\nget table_scan rule\ncaused by\n\
+ caused by\n\
+ check_optimizer_specific_invariants after optimizer pass: get
table_scan rule\n\
Review Comment:
that is nicer
##########
datafusion/optimizer/src/analyzer/mod.rs:
##########
@@ -140,6 +143,14 @@ impl Analyzer {
where
F: FnMut(&LogicalPlan, &dyn AnalyzerRule),
{
+ // verify the logical plan required invariants at the start, before
analyzer
+ plan.check_invariants(InvariantLevel::Always).map_err(|e| {
+ DataFusionError::Context(
+ "assert_lp_invariants_before_analyzers".to_string(),
Review Comment:
I also think a more human readable message might be nice, for example
instead of
"assert_lp_invariants_before_analyzers"
Maybe something like
"Invalid input plan passed to Analyzer"
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]