asolimando commented on code in PR #19957:
URL: https://github.com/apache/datafusion/pull/19957#discussion_r2910809049


##########
datafusion/physical-expr/src/expression_analyzer.rs:
##########
@@ -0,0 +1,1222 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Pluggable expression-level statistics analysis.
+//!
+//! This module provides an extensible mechanism for computing expression-level
+//! statistics metadata (selectivity, NDV, min/max bounds) following the chain
+//! of responsibility pattern.
+//!
+//! # Overview
+//!
+//! Different expressions have different statistical properties:
+//!
+//! - **Injective functions** (UPPER, LOWER, ABS on non-negative): preserve NDV
+//! - **Non-injective functions** (FLOOR, YEAR, SUBSTRING): reduce NDV
+//! - **Monotonic functions**: allow min/max bound propagation
+//! - **Constants**: NDV = 1, selectivity depends on value
+//!
+//! The default implementation uses classic Selinger-style estimation. Users can
+//! register custom [`ExpressionAnalyzer`] implementations to:
+//!
+//! 1. Provide statistics for custom UDFs
+//! 2. Override default estimation with domain-specific knowledge
+//! 3. Plug in advanced approaches (e.g., histogram-based estimation)
+//!
+//! # Example
+//!
+//! ```ignore
+//! use datafusion_physical_plan::expression_analyzer::*;
+//!
+//! // Create registry with default analyzer
+//! let mut registry = ExpressionAnalyzerRegistry::new();
+//!
+//! // Register custom analyzer (higher priority)
+//! registry.register(Arc::new(MyCustomAnalyzer));
+//!
+//! // Query expression statistics
+//! let selectivity = registry.get_selectivity(&predicate, &input_stats);
+//! ```
+
+use std::fmt::Debug;
+use std::sync::Arc;
+
+use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
+use datafusion_expr::Operator;
+
+use crate::expressions::{BinaryExpr, Column, Literal, NotExpr};
+use crate::{PhysicalExpr, ScalarFunctionExpr};
+
+// ============================================================================
+// AnalysisResult: Chain of responsibility result type
+// ============================================================================
+
+/// Result of expression analysis - either computed or delegate to next analyzer.
+#[derive(Debug, Clone)]
+pub enum AnalysisResult<T> {
+    /// Analysis was performed, here's the result
+    Computed(T),
+    /// This analyzer doesn't handle this expression; delegate to next
+    Delegate,
+}
+
+impl<T> AnalysisResult<T> {
+    /// Convert to Option, returning None for Delegate
+    pub fn into_option(self) -> Option<T> {
+        match self {
+            AnalysisResult::Computed(v) => Some(v),
+            AnalysisResult::Delegate => None,
+        }
+    }
+
+    /// Returns true if this is a Computed result
+    pub fn is_computed(&self) -> bool {
+        matches!(self, AnalysisResult::Computed(_))
+    }
+}
+
+// ============================================================================
+// ExpressionAnalyzer trait
+// ============================================================================
+
+/// Expression-level metadata analysis.
+///
+/// Implementations can handle specific expression types or provide domain
+/// knowledge for custom UDFs. The chain of analyzers is traversed until one
+/// returns [`AnalysisResult::Computed`].
+///
+/// # Implementing a Custom Analyzer
+///
+/// ```ignore
+/// #[derive(Debug)]
+/// struct MyUdfAnalyzer;
+///
+/// impl ExpressionAnalyzer for MyUdfAnalyzer {
+///     fn get_selectivity(
+///         &self,
+///         expr: &Arc<dyn PhysicalExpr>,
+///         input_stats: &Statistics,
+///     ) -> AnalysisResult<f64> {
+///         // Recognize my custom is_valid_email() UDF
+///         if is_my_email_validator(expr) {
+///             return AnalysisResult::Computed(0.8); // ~80% valid
+///         }
+///         AnalysisResult::Delegate
+///     }
+/// }
+/// ```
+pub trait ExpressionAnalyzer: Debug + Send + Sync {
+    /// Estimate selectivity when this expression is used as a predicate.
+    ///
+    /// Returns a value in [0.0, 1.0] representing the fraction of rows
+    /// that satisfy the predicate.
+    fn get_selectivity(
+        &self,
+        _expr: &Arc<dyn PhysicalExpr>,
+        _input_stats: &Statistics,
+    ) -> AnalysisResult<f64> {
+        AnalysisResult::Delegate
+    }
+
+    /// Estimate the number of distinct values in the expression's output.
+    ///
+    /// Properties:
+    /// - Injective functions preserve input NDV
+    /// - Non-injective functions reduce NDV (e.g., FLOOR, YEAR)
+    /// - Constants have NDV = 1
+    fn get_distinct_count(
+        &self,
+        _expr: &Arc<dyn PhysicalExpr>,
+        _input_stats: &Statistics,
+    ) -> AnalysisResult<usize> {
+        AnalysisResult::Delegate
+    }
+
+    /// Estimate min/max bounds of the expression's output.
+    ///
+    /// Monotonic functions can transform input bounds:
+    /// - Increasing: (f(min), f(max))
+    /// - Decreasing: (f(max), f(min))
+    /// - Non-monotonic: may need wider bounds or return Delegate
+    fn get_min_max(
+        &self,
+        _expr: &Arc<dyn PhysicalExpr>,
+        _input_stats: &Statistics,
+    ) -> AnalysisResult<(ScalarValue, ScalarValue)> {
+        AnalysisResult::Delegate
+    }
+
+    /// Estimate the fraction of null values in the expression's output.
+    ///
+    /// Returns a value in [0.0, 1.0].
+    fn get_null_fraction(
+        &self,
+        _expr: &Arc<dyn PhysicalExpr>,
+        _input_stats: &Statistics,
+    ) -> AnalysisResult<f64> {
+        AnalysisResult::Delegate
+    }
+}
+
+// ============================================================================
+// ExpressionAnalyzerRegistry
+// ============================================================================
+
+/// Registry that chains [`ExpressionAnalyzer`] implementations.
+///
+/// Analyzers are tried in order; the first to return [`AnalysisResult::Computed`]
+/// wins. Register domain-specific analyzers before the default for override.
+#[derive(Debug, Clone)]
+pub struct ExpressionAnalyzerRegistry {
+    analyzers: Vec<Arc<dyn ExpressionAnalyzer>>,
+}
+
+impl Default for ExpressionAnalyzerRegistry {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ExpressionAnalyzerRegistry {
+    /// Create a new registry with the default expression analyzer.
+    pub fn new() -> Self {
+        Self {
+            analyzers: vec![Arc::new(DefaultExpressionAnalyzer)],
+        }
+    }
+
+    /// Create a registry with all built-in analyzers (string, math, datetime, default).
+    pub fn with_builtin_analyzers() -> Self {

Review Comment:
   Agreed, will move it to `SessionContext`/`SessionState` in the follow-up PR.
   Thanks for your suggestion.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to