MohamedAbdeen21 commented on code in PR #10333:
URL: https://github.com/apache/datafusion/pull/10333#discussion_r1591327910
##########
datafusion/optimizer/src/common_subexpr_eliminate.rs:
##########
@@ -592,109 +591,51 @@ impl ExprMask {
/// Go through an expression tree and generate identifiers for each
subexpression.
///
-/// An identifier contains information of the expression itself and its
sub-expression.
-/// This visitor implementation use a stack `visit_stack` to track traversal,
which
-/// lets us know when a sub-tree's visiting is finished. When `pre_visit` is
called
-/// (traversing to a new node), an `EnterMark` and an `ExprItem` will be
pushed into stack.
-/// And try to pop out a `EnterMark` on leaving a node (`f_up()`). All
`ExprItem`
-/// before the first `EnterMark` is considered to be sub-tree of the leaving
node.
-///
-/// This visitor also records identifier in `id_array`. Makes the following
traverse
-/// pass can get the identifier of a node without recalculate it. We assign
each node
-/// in the expr tree a series number, start from 1, maintained by
`series_number`.
-/// Series number represents the order we left (`f_up()`) a node. Has the
property
-/// that child node's series number always smaller than parent's. While
`id_array` is
-/// organized in the order we enter (`f_down()`) a node. `node_count` helps us
to
-/// get the index of `id_array` for each node.
+/// This visitor also assigns an alias symbol to each expression, to be used
if the
+/// expression is getting eliminated, and tracks the frequency and datatype of
each expression.
///
/// `Expr` without sub-expr (column, literal etc.) will not have identifier
/// because they should not be recognized as common sub-expr.
struct ExprIdentifierVisitor<'a> {
- // param
+ // expr set to be populated by the visitor
expr_set: &'a mut ExprSet,
/// input schema for the node that we're optimizing, so we can determine
the correct datatype
/// for each subexpression
input_schema: DFSchemaRef,
- // inner states
- visit_stack: Vec<VisitRecord>,
- /// increased in fn_down, start from 0.
- node_count: usize,
/// which expression should be skipped?
expr_mask: ExprMask,
}
-/// Record item that used when traversing a expression tree.
-enum VisitRecord {
- /// `usize` is the monotone increasing series number assigned in
pre_visit().
- /// Starts from 0. Is used to index the identifier array `id_array` in
post_visit().
- EnterMark(usize),
- /// the node's children were skipped => jump to f_up on same node
- JumpMark(usize),
- /// Accumulated identifier of sub expression.
- ExprItem(Identifier),
-}
-
-impl ExprIdentifierVisitor<'_> {
- /// Find the first `EnterMark` in the stack, and accumulates every
`ExprItem`
- /// before it.
- fn pop_enter_mark(&mut self) -> (usize, Identifier) {
- let mut desc = String::new();
-
- while let Some(item) = self.visit_stack.pop() {
- match item {
- VisitRecord::EnterMark(idx) | VisitRecord::JumpMark(idx) => {
- return (idx, desc);
- }
- VisitRecord::ExprItem(id) => {
- desc.push_str(&id);
- }
- }
- }
- unreachable!("Enter mark should paired with node number");
- }
-}
-
impl TreeNodeVisitor for ExprIdentifierVisitor<'_> {
type Node = Expr;
fn f_down(&mut self, expr: &Expr) -> Result<TreeNodeRecursion> {
// related to https://github.com/apache/datafusion/issues/8814
// If the expr contain volatile expression or is a short-circuit
expression, skip it.
if expr.short_circuits() || expr.is_volatile()? {
- self.visit_stack
- .push(VisitRecord::JumpMark(self.node_count));
return Ok(TreeNodeRecursion::Jump); // go to f_up
}
- self.visit_stack
- .push(VisitRecord::EnterMark(self.node_count));
- self.node_count += 1;
-
Ok(TreeNodeRecursion::Continue)
}
fn f_up(&mut self, expr: &Expr) -> Result<TreeNodeRecursion> {
- let (_idx, sub_expr_identifier) = self.pop_enter_mark();
-
// skip exprs should not be recognize.
if self.expr_mask.ignores(expr) {
- let curr_expr_identifier = ExprSet::expr_identifier(expr);
- self.visit_stack
- .push(VisitRecord::ExprItem(curr_expr_identifier));
return Ok(TreeNodeRecursion::Continue);
}
- let curr_expr_identifier = ExprSet::expr_identifier(expr);
- let alias_symbol =
format!("{curr_expr_identifier}{sub_expr_identifier}");
- self.visit_stack
- .push(VisitRecord::ExprItem(alias_symbol.clone()));
+ let curr_expr_identifier = ExprSet::expr_identifier(expr);
let data_type = expr.get_type(&self.input_schema)?;
+ let alias_symbol = format!("#{{{curr_expr_identifier}}}");
Review Comment:
Love that, this can also help us create the alias on-the-fly instead of
passing it inside the expr_set.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]