alamb commented on code in PR #12978:
URL: https://github.com/apache/datafusion/pull/12978#discussion_r1898518395


##########
datafusion/core/tests/fuzz_cases/pruning.rs:
##########
@@ -0,0 +1,247 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_array::{Array, RecordBatch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use bytes::{BufMut, Bytes, BytesMut};
+use datafusion::{
+    datasource::{
+        listing::PartitionedFile,
+        physical_plan::{parquet::ParquetExecBuilder, FileScanConfig},
+    },
+    prelude::*,
+};
+use datafusion_common::DFSchema;
+use datafusion_execution::object_store::ObjectStoreUrl;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_plan::{collect, filter::FilterExec, ExecutionPlan};
+use itertools::Itertools;
+use object_store::{memory::InMemory, path::Path, ObjectStore, PutPayload};
+use parquet::{
+    arrow::ArrowWriter,
+    file::properties::{EnabledStatistics, WriterProperties},
+};
+use rand::seq::SliceRandom;
+use url::Url;
+
+#[tokio::test]
+async fn test_fuzz_utf8() {
+    // Fuzz testing for UTF8 predicate pruning
+    // The basic idea is that query results should always be the same with or without stats/pruning
+    // If we get this right we at least guarantee that there are no incorrect results
+    // There may still be suboptimal pruning or stats but that's something we can try to catch
+    // with more targeted tests.
+
+    // Since we know where the edge cases might be we don't do random black box fuzzing.
+    // Instead we fuzz on specific pre-defined axes:
+    //
+    // - Which characters are in each value. We want to make sure to include characters that when
+    //   incremented, truncated or otherwise manipulated might cause issues.
+    // - The values in each row group. This impacts which min/max stats are generated for each rg.
+    //   We'll generate combinations of the characters with lengths ranging from 1 to 3.
+    // - Truncation of statistics to 1, 2 or 3 characters as well as no truncation.
+
+    let mut rng = rand::thread_rng();
+
+    let characters = [
+        "z",
+        "0",
+        "~",
+        "ß",
+        "℣",
+        "%", // this one is useful for like/not like tests since it will 
result in randomly inserted wildcards
+        "_", // this one is useful for like/not like tests since it will 
result in randomly inserted wildcards
+        "\u{7F}",
+        "\u{7FF}",
+        "\u{FF}",
+        "\u{10FFFF}",
+        "\u{D7FF}",
+        "\u{FDCF}",
+        // null character
+        "\u{0}",
+    ];
+
+    let value_lengths = [1, 2, 3];
+
+    // generate all combinations of characters with lengths ranging from 1 to 3
+    let mut values = vec![];
+    for length in &value_lengths {
+        values.extend(
+            characters
+                .iter()
+                .cloned()
+                .combinations(*length)
+                // now get all permutations of each combination
+                .flat_map(|c| c.into_iter().permutations(*length))
+                // and join them into strings
+                .map(|c| c.join("")),
+        );
+    }
+
+    println!("Generated {} values", values.len());
+
+    // randomly pick 100 values
+    values.shuffle(&mut rng);
+    values.truncate(100);
+
+    let mut row_groups = vec![];
+    // generate all combinations of values for row groups (1 or 2 values per rg, more is unessecarry since we only get min/max stats out)

Review Comment:
   ```suggestion
       // generate all combinations of values for row groups (1 or 2 values per rg, more is unnecessary since we only get min/max stats out)
   ```
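   
   As an aside, the hunk is cut off just below this comment, so here is a minimal sketch (my own illustration with a hypothetical helper name, not the PR's actual code) of how these row-group value sets could be generated with itertools:
   
   ```rust
   use itertools::Itertools;
   
   /// Hypothetical helper: for min/max pruning stats only the set of values in a
   /// row group matters, so singletons plus unordered pairs cover the cases.
   fn row_group_value_sets(values: &[String]) -> Vec<Vec<String>> {
       let singles = values.iter().map(|v| vec![v.clone()]);
       let pairs = values.iter().cloned().combinations(2);
       singles.chain(pairs).collect()
   }
   ```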



##########
datafusion/core/tests/fuzz_cases/pruning.rs:
##########
@@ -0,0 +1,247 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_array::{Array, RecordBatch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use bytes::{BufMut, Bytes, BytesMut};
+use datafusion::{
+    datasource::{
+        listing::PartitionedFile,
+        physical_plan::{parquet::ParquetExecBuilder, FileScanConfig},
+    },
+    prelude::*,
+};
+use datafusion_common::DFSchema;
+use datafusion_execution::object_store::ObjectStoreUrl;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_plan::{collect, filter::FilterExec, ExecutionPlan};
+use itertools::Itertools;
+use object_store::{memory::InMemory, path::Path, ObjectStore, PutPayload};
+use parquet::{
+    arrow::ArrowWriter,
+    file::properties::{EnabledStatistics, WriterProperties},
+};
+use rand::seq::SliceRandom;
+use url::Url;
+
+#[tokio::test]
+async fn test_fuzz_utf8() {
+    // Fuzz testing for UTF8 predicate pruning
+    // The basic idea is that query results should always be the same with or without stats/pruning
+    // If we get this right we at least guarantee that there are no incorrect results
+    // There may still be suboptimal pruning or stats but that's something we can try to catch
+    // with more targeted tests.
+
+    // Since we know where the edge cases might be we don't do random black box fuzzing.
+    // Instead we fuzz on specific pre-defined axes:
+    //
+    // - Which characters are in each value. We want to make sure to include characters that when
+    //   incremented, truncated or otherwise manipulated might cause issues.
+    // - The values in each row group. This impacts which min/max stats are generated for each rg.
+    //   We'll generate combinations of the characters with lengths ranging from 1 to 3.
+    // - Truncation of statistics to 1, 2 or 3 characters as well as no truncation.
+
+    let mut rng = rand::thread_rng();
+
+    let characters = [
+        "z",
+        "0",
+        "~",
+        "ß",
+        "℣",
+        "%", // this one is useful for like/not like tests since it will 
result in randomly inserted wildcards

Review Comment:
   👍 



##########
datafusion/physical-optimizer/src/pruning.rs:
##########
@@ -1607,6 +1629,127 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+/// Convert `column LIKE literal` where P is a constant prefix of the literal
+/// to a range check on the column: `P <= column && column < P'`, where P' is the
+/// lowest string after all P* strings.
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Option<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Some(s),
+            ScalarValue::LargeUtf8(Some(s)) => Some(s),
+            ScalarValue::Utf8View(Some(s)) => Some(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => None,
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> {
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Some(s);
+        }
+        None
+    }
+
+    // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+    //  this may involve building the physical expressions that call lower() and upper()
+    let min_column_expr = expr_builder.min_column_expr().ok()?;
+    let max_column_expr = expr_builder.max_column_expr().ok()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
+    let first_wildcard_index = s.find(['%', '_']);
+    if first_wildcard_index == Some(0) {
+        // there's no filtering we could possibly do; return None and have this be handled by the unhandled hook
+        return None;
+    }
+    let (lower_bound, upper_bound) = if let Some(wildcard_index) = first_wildcard_index {
+        let prefix = &s[..wildcard_index];
+        let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            prefix.to_string(),
+        ))));
+        let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            increment_utf8(prefix)?,
+        ))));
+        (lower_bound_lit, upper_bound_lit)
+    } else {
+        // the like expression is a literal and can be converted into a comparison
+        let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+        (Arc::clone(&bound), bound)
+    };
+    let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+        lower_bound,
+        Operator::LtEq,
+        Arc::clone(&max_column_expr),
+    ));
+    let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+        Arc::clone(&min_column_expr),
+        Operator::LtEq,
+        upper_bound,
+    ));
+    let combined = Arc::new(phys_expr::BinaryExpr::new(
+        upper_bound_expr,
+        Operator::And,
+        lower_bound_expr,
+    ));
+    Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
+/// This makes it so that the returned string will always compare greater than the input string
+/// or any other string with the same prefix.
+/// This is necessary since the statistics may have been truncated: if we have a min statistic
+/// of "fo" that may have originally been "foz" or anything else with the prefix "fo".
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`
+/// In this example `increment_utf8("foo") == "fop"`
+fn increment_utf8(data: &str) -> Option<String> {

Review Comment:
   would it be ok to potentially replace this with the implementation from @etseidl in https://github.com/apache/arrow-rs/pull/6870?
   
   If so, I can file a ticket to do so as a follow-on
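   
   For comparison, here is a rough sketch of the char-wise increment approach (an illustration under my own assumptions; the PR's actual function body is truncated from the quote above, and the arrow-rs version differs in details):
   
   ```rust
   /// Sketch: walk backwards and bump the first character that has a valid
   /// successor code point, truncating everything after it. The result then
   /// compares greater than the input and any string sharing that prefix.
   fn increment_utf8_sketch(data: &str) -> Option<String> {
       let mut chars: Vec<char> = data.chars().collect();
       for i in (0..chars.len()).rev() {
           // char::from_u32 rejects surrogates and values above U+10FFFF, so
           // positions that can't be validly incremented are skipped
           // (conservative: a smarter version could hop over the surrogate
           // range instead of giving up on that position).
           if let Some(next) = char::from_u32(chars[i] as u32 + 1) {
               chars[i] = next;
               chars.truncate(i + 1);
               return Some(chars.into_iter().collect());
           }
       }
       // No character could be incremented (e.g. the string is all U+10FFFF).
       None
   }
   ```
   
   With this sketch, `increment_utf8_sketch("foo")` yields `Some("fop")`, matching the doc example above.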



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

