debajyoti-truefoundry opened a new issue, #16697: URL: https://github.com/apache/datafusion/issues/16697
### Describe the bug ### Query 1: ```sql SELECT col1, col2, count(*) FROM test_data WHERE (col1 = 'category_1' AND col2 = 'type_1') OR (col1 = 'category_2' AND col2 = 'type_2') GROUP BY col1, col2 ``` ``` DataSourceExec: file_groups={1 group: [[var/folders/6z/kt4t6jkd4ss1_fj16dv_05xc0000gn/T/.tmpl2ljbz/bloom_filter_data.parquet]]}, projection=[col1, col2], file_type=parquet, predicate=col1@0 = category_1 AND col2@1 = type_1 OR col1@0 = category_2 AND col2@1 = type_2, pruning_predicate=col1_null_count@2 != row_count@3 AND col1_min@0 <= category_1 AND category_1 <= col1_max@1 AND col2_null_count@6 != row_count@3 AND col2_min@4 <= type_1 AND type_1 <= col2_max@5 OR col1_null_count@2 != row_count@3 AND col1_min@0 <= category_2 AND category_2 <= col1_max@1 AND col2_null_count@6 != row_count@3 AND col2_min@4 <= type_2 AND type_2 <= col2_max@5, required_guarantees=[], metrics=[output_rows=10000, elapsed_compute=1ns, bytes_scanned=18452, file_open_errors=0, file_scan_errors=0, num_predicate_creation_errors=0, page_index_rows_matched=0, page_index_rows_pruned=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, row_groups_matched_bloom_filter=0, row_groups_matched_statistics=1, row_groups_pruned_bloom_filter=0, row_groups_pruned_statistics=0, bloom_filter_eval_time=217.585µs, metadata_load_time=330.042µs, page_index_eval_time=43ns, row_pushdown_eval_time=2ns, statistics_eval_time=480.376µs, time_elapsed_opening=1.069667ms, time_elapsed_processing=2.172875ms, time_elapsed_scanning_total=1.259999ms, time_elapsed_scanning_until_data=1.026791ms] ``` As we can see from the above query, `col1` can only have two values: `category_1` and `category_2`. But we have `required_guarantees=[]` and `row_groups_matched_bloom_filter=0`. The bloom filter is unused. 
### Query 2: ```sql SELECT col1, col2, count(*) FROM test_data WHERE ((col1 = 'category_1' AND col2 = 'type_1') OR (col1 = 'category_2' AND col2 = 'type_2')) AND col1 IN ('category_1', 'category_2') GROUP BY col1, col2 ``` ``` DataSourceExec: file_groups={1 group: [[var/folders/6z/kt4t6jkd4ss1_fj16dv_05xc0000gn/T/.tmpl2ljbz/bloom_filter_data.parquet]]}, projection=[col1, col2], file_type=parquet, predicate=(col1@0 = category_1 AND col2@1 = type_1 OR col1@0 = category_2 AND col2@1 = type_2) AND (col1@0 = category_1 OR col1@0 = category_2), pruning_predicate=(col1_null_count@2 != row_count@3 AND col1_min@0 <= category_1 AND category_1 <= col1_max@1 AND col2_null_count@6 != row_count@3 AND col2_min@4 <= type_1 AND type_1 <= col2_max@5 OR col1_null_count@2 != row_count@3 AND col1_min@0 <= category_2 AND category_2 <= col1_max@1 AND col2_null_count@6 != row_count@3 AND col2_min@4 <= type_2 AND type_2 <= col2_max@5) AND (col1_null_count@2 != row_count@3 AND col1_min@0 <= category_1 AND category_1 <= col1_max@1 OR col1_null_count@2 != row_count@3 AND col1_min@0 <= category_2 AND category_2 <= col1_max@1), required_guarantees=[col1 in (category_1, category_2)], metrics=[output_rows=10000, elapsed_compute=1ns, bytes_scanned=18695, file_open_errors=0, file_scan_errors=0, num_predicate_creation_errors=0, page_index_rows_matched=10000, page_index_rows_pruned=0, predicate_evaluation_errors=0, pushdown_rows_matched=0, pushdown_rows_pruned=0, row_groups_matched_bloom_filter=1, row_groups_matched_statistics=1, row_groups_pruned_bloom_filter=0, row_groups_pruned_statistics=0, bloom_filter_eval_time=199.459µs, metadata_load_time=452.793µs, page_index_eval_time=85.71µs, row_pushdown_eval_time=2ns, statistics_eval_time=97.667µs, time_elapsed_opening=896.792µs, time_elapsed_processing=1.827918ms, time_elapsed_scanning_total=1.140416ms, time_elapsed_scanning_until_data=941.917µs] ``` Here we have `required_guarantees=[col1 in (category_1, category_2)]` and 
`row_groups_matched_bloom_filter=1` due to explicit `col1 IN ('category_1', 'category_2')`. In my production use case, I achieve around a 40% improvement by adding the `col1 IN ('category_1', 'category_2')` pattern, as I need to process less data. ### To Reproduce ```rust use arrow::array::StringArray; use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; use datafusion::prelude::*; use parquet::arrow::arrow_writer::ArrowWriter; use parquet::file::properties::WriterProperties; use std::fs::File; use std::sync::Arc; use tempfile::TempDir; #[tokio::main] async fn main() -> Result<(), Box<dyn std::error::Error>> { println!("Creating parquet file with bloom filters..."); let temp_dir = TempDir::new()?; let parquet_path = temp_dir.path().join("bloom_filter_data.parquet"); // Create sample data with repetitive patterns to test bloom filter efficiency let mut col1_values = Vec::new(); let mut col2_values = Vec::new(); // Generate data with specific patterns for i in 0..10000 { col1_values.push(format!("category_{}", i % 100)); col2_values.push(format!("type_{}", i % 50)); } let col1 = StringArray::from(col1_values); let col2 = StringArray::from(col2_values); let schema = Arc::new(Schema::new(vec![ Field::new("col1", DataType::Utf8, false), Field::new("col2", DataType::Utf8, false), ])); let record_batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(col1), Arc::new(col2)])?; let file = File::create(&parquet_path)?; // Enable bloom filters let props = WriterProperties::builder() .set_bloom_filter_enabled(true) .set_bloom_filter_ndv(100) .build(); let mut writer = ArrowWriter::try_new(file, schema, Some(props))?; writer.write(&record_batch)?; writer.close()?; println!("Parquet file created at: {:?}", parquet_path); println!("Bloom filters enabled for columns"); let ctx = SessionContext::new_with_config(SessionConfig::from_env()?); ctx.register_parquet( "test_data", parquet_path.to_str().unwrap(), ParquetReadOptions::default(), ) 
.await?; println!("\nSample data:"); let df = ctx.sql("SELECT col1, col2 FROM test_data LIMIT 10").await?; df.show().await?; println!("\n=== First Query: OR conditions with bloom filter ==="); let query1 = " EXPLAIN ANALYZE SELECT col1, col2, count(*) FROM test_data WHERE (col1 = 'category_1' AND col2 = 'type_1') OR (col1 = 'category_2' AND col2 = 'type_2') GROUP BY col1, col2 "; let result1 = ctx.sql(query1).await?; result1.show().await?; println!("\n=== Second Query: OR conditions with IN clause ==="); let query2 = " EXPLAIN ANALYZE SELECT col1, col2, count(*) FROM test_data WHERE ((col1 = 'category_1' AND col2 = 'type_1') OR (col1 = 'category_2' AND col2 = 'type_2')) AND col1 IN ('category_1', 'category_2') GROUP BY col1, col2 "; let result2 = ctx.sql(query2).await?; result2.show().await?; Ok(()) } ``` I am using Datafusion 47. ``` ❯ env | grep DATAFUSION DATAFUSION_EXECUTION_PARQUET_REORDER_FILTERS=true DATAFUSION_EXECUTION_TARGET_PARTITIONS=8 DATAFUSION_EXECUTION_USE_ROW_NUMBER_ESTIMATES_TO_OPTIMIZE_PARTITIONING=true DATAFUSION_EXECUTION_PARQUET_PUSHDOWN_FILTERS=true DATAFUSION_EXECUTION_META_FETCH_CONCURRENCY=10 DATAFUSION_EXECUTION_PARQUET_BINARY_AS_STRING=true DATAFUSION_EXECUTION_PARQUET_SCHEMA_FORCE_VIEW_TYPES=true DATAFUSION_EXECUTION_BATCH_SIZE=20000 ``` ### Expected behavior ```sql WHERE (col1 = 'category_1' AND col2 = 'type_1') OR (col1 = 'category_2' AND col2 = 'type_2') ``` The above pattern should create `required_guarantees=[col1 in (category_1, category_2)]` or an equivalent expression and utilise the bloom filter. ### Additional context _No response_ -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org