findepi commented on code in PR #12978:
URL: https://github.com/apache/datafusion/pull/12978#discussion_r1810506734


##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,79 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= 
split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Result<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Ok(s),
+            ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+            ScalarValue::Utf8View(Some(s)) => Ok(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => plan_err!("LIKE expression must be a string literal"),
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> 
{
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Ok(s);
+        }
+        plan_err!("LIKE expression must be a string literal")
+    }
+
+    // I *think* that ILIKE could be handled by making the min lowercase and 
max uppercase
+    // but that requires building the physical expressions that call lower() 
and upper()
+    let min_column_expr = expr_builder.min_column_expr()?;
+    let max_column_expr = expr_builder.max_column_expr()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // **IMPORTANT** we need to make sure that the min and max are in the 
range of the prefix
+    // If we truncate 'A%' to 'A', we need to make sure that 'A' is less than 
'AB' so that
+    // when we make this a range query we get 'AB' <= 'A\u{10ffff}' AND 'A' <= 
'AB'.
+    // Otherwise 'AB' <= 'A' AND 'A' <= 'AB' would be *wrong* because 'AB' 
LIKE 'A%' is should be true!
+    // Credit to https://stackoverflow.com/a/35881551 for inspiration on this 
approach.
+    // ANSI SQL specifies two wildcards: % and _. % matches zero or more 
characters, _ matches exactly one character.
+    let first_wildcard_index = s.find(['%', '_']);
+    let (min_lit, max_lit) = if let Some(wildcard_index) = 
first_wildcard_index {
+        let prefix = &s[..wildcard_index];
+        let prefix_min_lit = 
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            format!("{prefix}\u{10ffff}"),

Review Comment:
   When implementing similar thing for Trino 
(https://github.com/trinodb/trino/commit/6aea881752ef9a83521a4a20bccf510bf9a31d2c),
 i decided to stay within ASCII characters to avoid any potential issues due to 
misinterpretation of "difficult" code points.
   



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,79 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= 
split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Result<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Ok(s),
+            ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+            ScalarValue::Utf8View(Some(s)) => Ok(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => plan_err!("LIKE expression must be a string literal"),
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> 
{
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Ok(s);
+        }
+        plan_err!("LIKE expression must be a string literal")
+    }
+
+    // I *think* that ILIKE could be handled by making the min lowercase and 
max uppercase
+    // but that requires building the physical expressions that call lower() 
and upper()
+    let min_column_expr = expr_builder.min_column_expr()?;
+    let max_column_expr = expr_builder.max_column_expr()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // **IMPORTANT** we need to make sure that the min and max are in the 
range of the prefix
+    // If we truncate 'A%' to 'A', we need to make sure that 'A' is less than 
'AB' so that
+    // when we make this a range query we get 'AB' <= 'A\u{10ffff}' AND 'A' <= 
'AB'.
+    // Otherwise 'AB' <= 'A' AND 'A' <= 'AB' would be *wrong* because 'AB' 
LIKE 'A%' is should be true!

Review Comment:
   This important, but a bit difficult to follow.



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,79 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= 
split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Result<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Ok(s),
+            ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+            ScalarValue::Utf8View(Some(s)) => Ok(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => plan_err!("LIKE expression must be a string literal"),
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> 
{
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Ok(s);
+        }
+        plan_err!("LIKE expression must be a string literal")
+    }
+
+    // I *think* that ILIKE could be handled by making the min lowercase and 
max uppercase
+    // but that requires building the physical expressions that call lower() 
and upper()
+    let min_column_expr = expr_builder.min_column_expr()?;
+    let max_column_expr = expr_builder.max_column_expr()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // **IMPORTANT** we need to make sure that the min and max are in the 
range of the prefix
+    // If we truncate 'A%' to 'A', we need to make sure that 'A' is less than 
'AB' so that
+    // when we make this a range query we get 'AB' <= 'A\u{10ffff}' AND 'A' <= 
'AB'.
+    // Otherwise 'AB' <= 'A' AND 'A' <= 'AB' would be *wrong* because 'AB' 
LIKE 'A%' is should be true!
+    // Credit to https://stackoverflow.com/a/35881551 for inspiration on this 
approach.
+    // ANSI SQL specifies two wildcards: % and _. % matches zero or more 
characters, _ matches exactly one character.
+    let first_wildcard_index = s.find(['%', '_']);

Review Comment:
   we do not support escape characters, right?



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,79 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= 
split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Result<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Ok(s),
+            ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+            ScalarValue::Utf8View(Some(s)) => Ok(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => plan_err!("LIKE expression must be a string literal"),
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> 
{
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Ok(s);
+        }
+        plan_err!("LIKE expression must be a string literal")
+    }
+
+    // I *think* that ILIKE could be handled by making the min lowercase and 
max uppercase
+    // but that requires building the physical expressions that call lower() 
and upper()
+    let min_column_expr = expr_builder.min_column_expr()?;
+    let max_column_expr = expr_builder.max_column_expr()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // **IMPORTANT** we need to make sure that the min and max are in the 
range of the prefix
+    // If we truncate 'A%' to 'A', we need to make sure that 'A' is less than 
'AB' so that
+    // when we make this a range query we get 'AB' <= 'A\u{10ffff}' AND 'A' <= 
'AB'.
+    // Otherwise 'AB' <= 'A' AND 'A' <= 'AB' would be *wrong* because 'AB' 
LIKE 'A%' is should be true!
+    // Credit to https://stackoverflow.com/a/35881551 for inspiration on this 
approach.
+    // ANSI SQL specifies two wildcards: % and _. % matches zero or more 
characters, _ matches exactly one character.
+    let first_wildcard_index = s.find(['%', '_']);
+    let (min_lit, max_lit) = if let Some(wildcard_index) = 
first_wildcard_index {
+        let prefix = &s[..wildcard_index];
+        let prefix_min_lit = 
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            format!("{prefix}\u{10ffff}"),

Review Comment:
   The 'highest character' should be appended to the max range, not the min.
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to