alamb commented on code in PR #12978:
URL: https://github.com/apache/datafusion/pull/12978#discussion_r1805374206
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1625,93 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> {
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ if let ScalarValue::Utf8(Some(s)) = lit.value() {
Review Comment:
I think you should probably also handle the cases `ScalarValue::LargeUtf8`,
`ScalarValue::Utff8View` as well.
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1625,93 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> {
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ if let ScalarValue::Utf8(Some(s)) = lit.value() {
+ return Ok(s);
+ }
+ }
+ plan_err!("LIKE expression must be a string literal")
+}
+
+fn extract_like_string_literal_prefix(
+ expr: &Arc<dyn PhysicalExpr>,
+) -> Result<Arc<phys_expr::Literal>> {
+ let s = extract_string_literal(expr)?;
+ let mut split_literal = s.split('%');
+ let prefix = split_literal.next().unwrap_or("");
+ // if the prefix is empty, return true
+ if prefix.is_empty() {
+ return plan_err!("Empty prefix in LIKE expression");
+ }
+ Ok(Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
Review Comment:
Do we also have to test if there are other occurences of `%` in the string 🤔
(like `foo%bar%`)?
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -3443,6 +3545,115 @@ mod tests {
);
}
+ /// Creates a setup for chunk pruning, modeling a utf8 column "s1"
+ /// with 5 different containers (e.g. RowGroups). They have [min,
+ /// max]:
+ /// s1 ["A", "Z"]
+ /// s1 ["A", "L"]
+ /// s1 ["N", "Z"]
+ /// s1 [NULL, NULL]
+ /// s1 ["A", NULL]
+ fn utf8_setup() -> (SchemaRef, TestStatistics) {
+ let schema = Arc::new(Schema::new(vec![Field::new("s1",
DataType::Utf8, true)]));
+
+ let statistics = TestStatistics::new().with(
+ "s1",
+ ContainerStats::new_utf8(
+ vec![Some("A"), Some("A"), Some("N"), Some("M"), None,
Some("A")], // min
+ vec![Some("Z"), Some("L"), Some("Z"), Some("M"), None, None],
// max
+ ),
+ );
+ (schema, statistics)
+ }
+
+ #[test]
+ fn prune_utf8_eq() {
+ let (schema, statistics) = utf8_setup();
+
+ // Expression "s1 = 'A'"
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ // s1 ["A", NULL] ==> unknown (must keep)
+ let expected_ret = &[true, true, false, false, true, true];
+
+ prune_with_expr(
+ // s1 LIKE 'A%'
Review Comment:
the comments say s1 LIKE `A%` but the code builds a different expressions
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -3443,6 +3545,115 @@ mod tests {
);
}
+ /// Creates a setup for chunk pruning, modeling a utf8 column "s1"
+ /// with 5 different containers (e.g. RowGroups). They have [min,
+ /// max]:
+ /// s1 ["A", "Z"]
+ /// s1 ["A", "L"]
+ /// s1 ["N", "Z"]
+ /// s1 [NULL, NULL]
+ /// s1 ["A", NULL]
+ fn utf8_setup() -> (SchemaRef, TestStatistics) {
+ let schema = Arc::new(Schema::new(vec![Field::new("s1",
DataType::Utf8, true)]));
+
+ let statistics = TestStatistics::new().with(
+ "s1",
+ ContainerStats::new_utf8(
+ vec![Some("A"), Some("A"), Some("N"), Some("M"), None,
Some("A")], // min
+ vec![Some("Z"), Some("L"), Some("Z"), Some("M"), None, None],
// max
+ ),
+ );
+ (schema, statistics)
+ }
+
+ #[test]
+ fn prune_utf8_eq() {
+ let (schema, statistics) = utf8_setup();
+
+ // Expression "s1 = 'A'"
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ // s1 ["A", NULL] ==> unknown (must keep)
+ let expected_ret = &[true, true, false, false, true, true];
+
+ prune_with_expr(
+ // s1 LIKE 'A%'
+ col("s1").eq(lit("A")),
+ &schema,
+ &statistics,
+ expected_ret,
+ );
+ }
+
+ #[test]
+ fn prune_utf8_like() {
+ let (schema, statistics) = utf8_setup();
+
+ // Expression "s1 LIKE 'A%'"
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ // s1 ["A", NULL] ==> unknown (must keep)
+ let expected_ret = &[true, true, false, false, true, true];
+
+ prune_with_expr(
+ // s1 LIKE 'A%'
+ col("s1").like(lit("A%")),
+ &schema,
+ &statistics,
+ expected_ret,
+ );
+ }
+
+ #[test]
+ fn prune_utf8_not_like() {
+ let (schema, statistics) = utf8_setup();
+
+ // Expression "s1 NOT LIKE 'M%'"
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ // s1 ["A", NULL] ==> unknown (must keep)
+ let expected_ret = &[true, true, true, false, true, true];
+
+ prune_with_expr(
+ // s1 NOT LIKE 'M%'
+ col("s1").not_like(lit("M%")),
+ &schema,
+ &statistics,
+ expected_ret,
+ );
+ }
+
+ #[test]
+ fn prune_utf8_like_empty_suffix() {
+ let (schema, statistics) = utf8_setup();
+
+ // Expression "s1 LIKE '%A'"
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
+ // s1 ["M", "M"] ==> some rows could pass (must keep)
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ // s1 ["A", NULL] ==> unknown (must keep)
+ let expected_ret = &[true, true, true, true, true, true];
+
+ prune_with_expr(
+ // s1 LIKE 'A%'
Review Comment:
```suggestion
// s1 LIKE '%A'
```
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1625,93 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> {
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ if let ScalarValue::Utf8(Some(s)) = lit.value() {
+ return Ok(s);
+ }
+ }
+ plan_err!("LIKE expression must be a string literal")
+}
+
+fn extract_like_string_literal_prefix(
+ expr: &Arc<dyn PhysicalExpr>,
+) -> Result<Arc<phys_expr::Literal>> {
+ let s = extract_string_literal(expr)?;
+ let mut split_literal = s.split('%');
+ let prefix = split_literal.next().unwrap_or("");
+ // if the prefix is empty, return true
+ if prefix.is_empty() {
+ return plan_err!("Empty prefix in LIKE expression");
+ }
+ Ok(Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ prefix.to_string(),
+ )))))
+}
+
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
Review Comment:
I don't understand the `LIKE literal split at %` part
`column LIKE literal` is the same as `column = literal` if there are no wild
cards, so you should be able to use the same rules as equality I think
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1625,93 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> {
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ if let ScalarValue::Utf8(Some(s)) = lit.value() {
+ return Ok(s);
+ }
+ }
+ plan_err!("LIKE expression must be a string literal")
+}
+
+fn extract_like_string_literal_prefix(
+ expr: &Arc<dyn PhysicalExpr>,
+) -> Result<Arc<phys_expr::Literal>> {
+ let s = extract_string_literal(expr)?;
+ let mut split_literal = s.split('%');
+ let prefix = split_literal.next().unwrap_or("");
+ // if the prefix is empty, return true
+ if prefix.is_empty() {
+ return plan_err!("Empty prefix in LIKE expression");
+ }
+ Ok(Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ prefix.to_string(),
+ )))))
+}
+
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ // I *think* that ILIKE could be handled by making the min lowercase and
max uppercase
Review Comment:
👍 I agree. Figuring out how to make those call would be the trick
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -3443,6 +3545,115 @@ mod tests {
);
}
+ /// Creates a setup for chunk pruning, modeling a utf8 column "s1"
+ /// with 5 different containers (e.g. RowGroups). They have [min,
+ /// max]:
+ /// s1 ["A", "Z"]
+ /// s1 ["A", "L"]
+ /// s1 ["N", "Z"]
+ /// s1 [NULL, NULL]
+ /// s1 ["A", NULL]
+ fn utf8_setup() -> (SchemaRef, TestStatistics) {
+ let schema = Arc::new(Schema::new(vec![Field::new("s1",
DataType::Utf8, true)]));
+
+ let statistics = TestStatistics::new().with(
+ "s1",
+ ContainerStats::new_utf8(
+ vec![Some("A"), Some("A"), Some("N"), Some("M"), None,
Some("A")], // min
+ vec![Some("Z"), Some("L"), Some("Z"), Some("M"), None, None],
// max
+ ),
+ );
+ (schema, statistics)
+ }
+
+ #[test]
+ fn prune_utf8_eq() {
+ let (schema, statistics) = utf8_setup();
+
+ // Expression "s1 = 'A'"
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ // s1 ["A", NULL] ==> unknown (must keep)
+ let expected_ret = &[true, true, false, false, true, true];
+
+ prune_with_expr(
+ // s1 LIKE 'A%'
+ col("s1").eq(lit("A")),
+ &schema,
+ &statistics,
+ expected_ret,
+ );
+ }
+
+ #[test]
+ fn prune_utf8_like() {
+ let (schema, statistics) = utf8_setup();
+
+ // Expression "s1 LIKE 'A%'"
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ // s1 ["A", NULL] ==> unknown (must keep)
+ let expected_ret = &[true, true, false, false, true, true];
+
+ prune_with_expr(
+ // s1 LIKE 'A%'
Review Comment:
Can you also please add tests for other combinations:
* `s1 LIKE 'A'`
* `s1 LIKE '%A%'`
I think it is important the matching is doing the right thing
I also think it is important to to cover cases for NOT LIKE as well
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]