findepi commented on code in PR #12978:
URL: https://github.com/apache/datafusion/pull/12978#discussion_r1819188271
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1648,22 +1649,46 @@ fn build_like_match(
let s = unpack_string(lit.value())?;
return Ok(s);
}
- plan_err!("LIKE expression must be a string literal")
+ plan_err!("Unexpected LIKE expression: {expr:?}")
}
- // I *think* that ILIKE could be handled by making the min lowercase and
max uppercase
- // but that requires building the physical expressions that call lower()
and upper()
+ /// Try and increment the the string's bytes from right to left, returning
when the result
+ /// is a valid UTF8 string. Returns `None` when it can't increment any
byte.
+ /// Vendored from
https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/parquet/src/column/writer/mod.rs#L1432-L1448
+ fn increment_utf8(data: &str) -> Result<String> {
+ let mut data = data.as_bytes().to_vec();
+ for idx in (0..data.len()).rev() {
+ let original = data[idx];
+ let (byte, overflow) = original.overflowing_add(1);
+ if !overflow {
+ data[idx] = byte;
+ if let Ok(res) = str::from_utf8(&data) {
+ return Ok(res.to_string())
+ }
+ data[idx] = original;
+ }
+ }
+
+ return plan_err!("Could not increment UTF8 string");
Review Comment:
The function should return `Option<String>`
it's totally valid not to be able to increment a string (for example
`\u{10ffff}` cannot be incremented), and we don't need to fail planing in such
case (or leverage `unhandled_hook`)
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1625,100 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Result<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Ok(s),
+ ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+ ScalarValue::Utf8View(Some(s)) => Ok(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => plan_err!("LIKE pattern literal must be a string"),
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Ok(s);
+ }
+ plan_err!("Unexpected LIKE expression: {expr:?}")
+ }
+
+ /// Try and increment the the string's bytes from right to left, returning
when the result
+ /// is a valid UTF8 string. Returns `None` when it can't increment any
byte.
+ /// Vendored from
https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/parquet/src/column/writer/mod.rs#L1432-L1448
+ fn increment_utf8(data: &str) -> Result<String> {
+ let mut data = data.as_bytes().to_vec();
+ for idx in (0..data.len()).rev() {
+ let original = data[idx];
+ let (byte, overflow) = original.overflowing_add(1);
+ if !overflow {
+ data[idx] = byte;
+ if let Ok(res) = str::from_utf8(&data) {
+ return Ok(res.to_string());
+ }
+ data[idx] = original;
+ }
+ }
+
+ return plan_err!("Could not increment UTF8 string");
+ }
+
+ // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+ // this may involve building the physical expressions that call lower()
and upper()
+ let min_column_expr = expr_builder.min_column_expr()?;
+ let max_column_expr = expr_builder.max_column_expr()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // ANSI SQL specifies two wildcards: % and _. % matches zero or more
characters, _ matches exactly one character.
+ let first_wildcard_index = s.find(['%', '_']);
+ if first_wildcard_index == Some(0) {
+ // there's no filtering we could possibly do, return an error and have
this be handled by the unhandled hook
+ return plan_err!(
+ "LIKE expression with wildcard at the beginning is not supported"
+ );
+ }
+ let (min_lit, max_lit) = if let Some(wildcard_index) =
first_wildcard_index {
+ let prefix = &s[..wildcard_index];
+ let prefix_min_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
Review Comment:
please rename these min_lit, max_lit to upper_bound and lower_bound
because their meaning is reversed.
```
let (upper_bound, lower_bound) =
```
Also, it would be good to extract like `literal pattern -> (lower_bound,
upper_bound)` function and add unit tests for it.
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1648,22 +1649,46 @@ fn build_like_match(
let s = unpack_string(lit.value())?;
return Ok(s);
}
- plan_err!("LIKE expression must be a string literal")
+ plan_err!("Unexpected LIKE expression: {expr:?}")
}
- // I *think* that ILIKE could be handled by making the min lowercase and
max uppercase
- // but that requires building the physical expressions that call lower()
and upper()
+ /// Try and increment the the string's bytes from right to left, returning
when the result
+ /// is a valid UTF8 string. Returns `None` when it can't increment any
byte.
+ /// Vendored from
https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/parquet/src/column/writer/mod.rs#L1432-L1448
+ fn increment_utf8(data: &str) -> Result<String> {
+ let mut data = data.as_bytes().to_vec();
+ for idx in (0..data.len()).rev() {
+ let original = data[idx];
+ let (byte, overflow) = original.overflowing_add(1);
Review Comment:
incrementing character 127 should be supported.
same with other utf-8 byte sequences where last byte is saturated.
let's operate on code points, not bytes.
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1648,22 +1649,46 @@ fn build_like_match(
let s = unpack_string(lit.value())?;
return Ok(s);
}
- plan_err!("LIKE expression must be a string literal")
+ plan_err!("Unexpected LIKE expression: {expr:?}")
}
- // I *think* that ILIKE could be handled by making the min lowercase and
max uppercase
- // but that requires building the physical expressions that call lower()
and upper()
+ /// Try and increment the the string's bytes from right to left, returning
when the result
+ /// is a valid UTF8 string. Returns `None` when it can't increment any
byte.
+ /// Vendored from
https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/parquet/src/column/writer/mod.rs#L1432-L1448
+ fn increment_utf8(data: &str) -> Result<String> {
Review Comment:
define the function outside of `build_like_match` and add unit tests for it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]