(datasketches-rust) branch main updated: feat: implement bloom filter (#53)

placave Wed, 31 Dec 2025 06:37:20 -0800

This is an automated email from the ASF dual-hosted git repository.

placave pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git



The following commit(s) were added to refs/heads/main by this push:
     new 819f563  feat: implement bloom filter (#53)
819f563 is described below

commit 819f563c29cfd139a0a69be052591a5aa3368e4d
Author: Filippo Rossi <[email protected]>
AuthorDate: Wed Dec 31 15:37:07 2025 +0100

    feat: implement bloom filter (#53)
    
    * checkpoint
    
    * checkpoint
    
    * xxhash
    
    * match cpp hashing logic
    
    * serde
    
    * cleanup
    
    * cleanup
    
    * docs
    
    * comments cleanup
    
    * builder cleanup
    
    * trade 3 loc for 1 loc
    
    * get back 1 loc
---
 datasketches/src/bloom/builder.rs              | 221 ++++++++
 datasketches/src/bloom/mod.rs                  | 127 +++++
 datasketches/src/bloom/sketch.rs               | 730 +++++++++++++++++++++++++
 datasketches/src/lib.rs                        |   1 +
 datasketches/tests/bloom_serialization_test.rs | 231 ++++++++
 5 files changed, 1310 insertions(+)

diff --git a/datasketches/src/bloom/builder.rs 
b/datasketches/src/bloom/builder.rs
new file mode 100644
index 0000000..13e3b5a
--- /dev/null
+++ b/datasketches/src/bloom/builder.rs
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::BloomFilter;
+use crate::hash::DEFAULT_UPDATE_SEED;
+
+const MIN_NUM_BITS: u64 = 64;
+const MAX_NUM_BITS: u64 = (1u64 << 35) - 64; // ~32 GB - reasonable limit
+
+/// Builder for creating [`BloomFilter`] instances.
+///
+/// Provides two construction modes:
+/// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false 
positive rate
+///   (recommended)
+/// - [`with_size()`](Self::with_size): Specify exact bit count and hash 
functions (manual)
+#[derive(Debug, Clone)]
+pub struct BloomFilterBuilder {
+    num_bits: u64,
+    num_hashes: u16,
+    seed: u64,
+}
+
+impl BloomFilterBuilder {
+    /// Creates a builder with optimal parameters for a target accuracy.
+    ///
+    /// Automatically calculates the optimal number of bits and hash functions
+    /// to achieve the desired false positive probability for a given number 
of items.
+    ///
+    /// # Arguments
+    ///
+    /// - `max_items`: Maximum expected number of distinct items
+    /// - `fpp`: Target false positive probability (e.g., 0.01 for 1%)
+    ///
+    /// # Panics
+    ///
+    /// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// // Optimal for 10,000 items with 1% FPP
+    /// let filter = BloomFilterBuilder::with_accuracy(10_000, 0.01)
+    ///     .seed(42)
+    ///     .build();
+    /// ```
+    pub fn with_accuracy(max_items: u64, fpp: f64) -> Self {
+        assert!(max_items > 0, "max_items must be greater than 0");
+        assert!(
+            fpp > 0.0 && fpp < 1.0,
+            "fpp must be between 0.0 and 1.0 (exclusive)"
+        );
+
+        let num_bits = Self::suggest_num_bits(max_items, fpp);
+        let num_hashes = Self::suggest_num_hashes_from_accuracy(max_items, 
num_bits);
+
+        BloomFilterBuilder {
+            num_bits,
+            num_hashes,
+            seed: DEFAULT_UPDATE_SEED,
+        }
+    }
+
+    /// Creates a builder with manual size specification.
+    ///
+    /// Use this when you want precise control over the filter size,
+    /// or when working with pre-calculated parameters.
+    ///
+    /// # Arguments
+    ///
+    /// - `num_bits`: Total number of bits in the filter
+    /// - `num_hashes`: Number of hash functions to use
+    ///
+    /// # Panics
+    ///
+    /// Panics if:
+    /// - `num_bits` < MIN_NUM_BITS (64) or `num_bits` > MAX_NUM_BITS (~32 GB)
+    /// - `num_hashes` < 1 or `num_hashes` > 100
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let filter = BloomFilterBuilder::with_size(10_000, 7).build();
+    /// ```
+    pub fn with_size(num_bits: u64, num_hashes: u16) -> Self {
+        assert!(
+            num_bits >= MIN_NUM_BITS,
+            "num_bits must be at least {}",
+            MIN_NUM_BITS
+        );
+        assert!(
+            num_bits <= MAX_NUM_BITS,
+            "num_bits must not exceed {}",
+            MAX_NUM_BITS
+        );
+        assert!(num_hashes >= 1, "num_hashes must be at least 1");
+        assert!(num_hashes <= 100, "num_hashes must not exceed 100");
+
+        BloomFilterBuilder {
+            num_bits,
+            num_hashes,
+            seed: DEFAULT_UPDATE_SEED,
+        }
+    }
+
+    /// Sets a custom hash seed (default: 9001).
+    ///
+    /// **Important**: Filters with different seeds cannot be merged.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let filter = BloomFilterBuilder::with_accuracy(100, 0.01)
+    ///     .seed(12345)
+    ///     .build();
+    /// ```
+    pub fn seed(mut self, seed: u64) -> Self {
+        self.seed = seed;
+        self
+    }
+
+    /// Builds the Bloom filter.
+    ///
+    /// # Panics
+    ///
+    /// Panics if neither `with_accuracy()` nor `with_size()` was called.
+    pub fn build(self) -> BloomFilter {
+        let capacity_bits = self.num_bits;
+        let num_hashes = self.num_hashes;
+
+        let num_words = capacity_bits.div_ceil(64) as usize;
+        let bit_array = vec![0u64; num_words];
+
+        BloomFilter {
+            seed: self.seed,
+            num_hashes,
+            capacity_bits,
+            num_bits_set: 0,
+            bit_array,
+        }
+    }
+
+    /// Suggests optimal number of bits given max items and target FPP.
+    ///
+    /// Formula: `m = -n * ln(p) / (ln(2)^2)`
+    /// where n = max_items, p = fpp
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let bits = BloomFilterBuilder::suggest_num_bits(1000, 0.01);
+    /// assert!(bits > 9000 && bits < 10000); // ~9585 bits
+    /// ```
+    pub fn suggest_num_bits(max_items: u64, fpp: f64) -> u64 {
+        let n = max_items as f64;
+        let p = fpp;
+        let ln2_squared = std::f64::consts::LN_2 * std::f64::consts::LN_2;
+
+        let bits = (-n * p.ln() / ln2_squared).ceil() as u64;
+
+        // Round up to multiple of 64 for efficiency
+        let bits = bits.div_ceil(64) * 64;
+
+        bits.clamp(MIN_NUM_BITS, MAX_NUM_BITS)
+    }
+
+    /// Suggests optimal number of hash functions given max items and bit 
count.
+    ///
+    /// Formula: `k = (m/n) * ln(2)`
+    /// where m = num_bits, n = max_items
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let hashes = 
BloomFilterBuilder::suggest_num_hashes_from_accuracy(1000, 10000);
+    /// assert_eq!(hashes, 7); // Optimal k ≈ 6.93
+    /// ```
+    pub fn suggest_num_hashes_from_accuracy(max_items: u64, num_bits: u64) -> 
u16 {
+        let m = num_bits as f64;
+        let n = max_items as f64;
+
+        let k = (m / n * std::f64::consts::LN_2).round();
+
+        (k as u16).clamp(1, 100) // Reasonable bounds
+    }
+
+    /// Suggests optimal number of hash functions from target FPP.
+    ///
+    /// Formula: `k = -log2(p)`
+    /// where p = fpp
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let hashes = BloomFilterBuilder::suggest_num_hashes_from_fpp(0.01);
+    /// assert_eq!(hashes, 7); // -log2(0.01) ≈ 6.64
+    /// ```
+    pub fn suggest_num_hashes_from_fpp(fpp: f64) -> u16 {
+        let k = -fpp.log2();
+        (k.round() as u16).clamp(1, 100)
+    }
+}
diff --git a/datasketches/src/bloom/mod.rs b/datasketches/src/bloom/mod.rs
new file mode 100644
index 0000000..4a91287
--- /dev/null
+++ b/datasketches/src/bloom/mod.rs
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Bloom Filter implementation for probabilistic set membership testing.
+//!
+//! A Bloom filter is a space-efficient probabilistic data structure used to 
test whether
+//! an element is a member of a set. False positive matches are possible, but 
false negatives
+//! are not. In other words, a query returns either "possibly in set" or 
"definitely not in set".
+//!
+//! # Properties
+//!
+//! - **No false negatives**: If an item was inserted, `contains()` will 
always return `true`
+//! - **Possible false positives**: `contains()` may return `true` for items 
never inserted
+//! - **Fixed size**: Unlike typical sketches, Bloom filters do not resize 
automatically
+//! - **Linear space**: Size is proportional to the expected number of 
distinct items
+//!
+//! # Usage
+//!
+//! ```rust
+//! use datasketches::bloom::BloomFilter;
+//! use datasketches::bloom::BloomFilterBuilder;
+//!
+//! // Create a filter optimized for 1000 items with 1% false positive rate
+//! let mut filter = BloomFilterBuilder::with_accuracy(1000, 0.01).build();
+//!
+//! // Insert items
+//! filter.insert("apple");
+//! filter.insert("banana");
+//! filter.insert(42_u64);
+//!
+//! // Check membership
+//! assert!(filter.contains(&"apple")); // true - definitely inserted
+//! assert!(!filter.contains(&"grape")); // false - never inserted (probably)
+//!
+//! // Get statistics
+//! println!("Capacity: {} bits", filter.capacity());
+//! println!("Bits used: {}", filter.bits_used());
+//! println!("Est. FPP: {:.4}%", filter.estimated_fpp() * 100.0);
+//! ```
+//!
+//! # Creating Filters
+//!
+//! There are two ways to create a Bloom filter:
+//!
+//! ## By Accuracy (Recommended)
+//!
+//! Automatically calculates optimal size and hash functions:
+//!
+//! ```rust
+//! # use datasketches::bloom::BloomFilterBuilder;
+//! let filter = BloomFilterBuilder::with_accuracy(
+//!     10_000, // Expected max items
+//!     0.01,   // Target false positive probability (1%)
+//! )
+//! .seed(9001) // Optional: custom seed
+//! .build();
+//! ```
+//!
+//! ## By Size (Manual)
+//!
+//! Specify exact bit count and hash functions:
+//!
+//! ```rust
+//! # use datasketches::bloom::BloomFilterBuilder;
+//! let filter = BloomFilterBuilder::with_size(
+//!     95_851, // Number of bits
+//!     7,      // Number of hash functions
+//! )
+//! .build();
+//! ```
+//!
+//! # Set Operations
+//!
+//! Bloom filters support efficient set operations:
+//!
+//! ```rust
+//! # use datasketches::bloom::BloomFilterBuilder;
+//! let mut filter1 = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+//! let mut filter2 = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+//!
+//! filter1.insert("a");
+//! filter2.insert("b");
+//!
+//! // Union: recognizes items from either filter
+//! filter1.union(&filter2);
+//! assert!(filter1.contains(&"a"));
+//! assert!(filter1.contains(&"b"));
+//!
+//! // Intersect: recognizes only items in both filters
+//! // filter1.intersect(&filter2);
+//!
+//! // Invert: approximately inverts set membership
+//! // filter1.invert();
+//! ```
+//!
+//! # Implementation Details
+//!
+//! - Uses XXHash64 for hashing
+//! - Implements double hashing (Kirsch-Mitzenmacher method) for k hash 
functions
+//! - Bits packed efficiently in `u64` words
+//! - Compatible serialization format (family ID: 21)
+//!
+//! # References
+//!
+//! - Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with 
allowable errors"
+//! - Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance: 
Building a Better Bloom
+//!   Filter"
+
+mod builder;
+mod sketch;
+
+pub use self::builder::BloomFilterBuilder;
+pub use self::sketch::BloomFilter;
diff --git a/datasketches/src/bloom/sketch.rs b/datasketches/src/bloom/sketch.rs
new file mode 100644
index 0000000..5589477
--- /dev/null
+++ b/datasketches/src/bloom/sketch.rs
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::hash::Hash;
+use std::hash::Hasher;
+
+use crate::codec::SketchBytes;
+use crate::codec::SketchSlice;
+use crate::error::Error;
+use crate::hash::XxHash64;
+
+// Serialization constants
+const PREAMBLE_LONGS_EMPTY: u8 = 3;
+const PREAMBLE_LONGS_STANDARD: u8 = 4;
+const FAMILY_ID: u8 = 21; // Bloom filter family ID
+const SERIAL_VERSION: u8 = 1;
+const EMPTY_FLAG_MASK: u8 = 1 << 2;
+
+/// A Bloom filter for probabilistic set membership testing.
+///
+/// Provides fast membership queries with:
+/// - No false negatives (inserted items always return `true`)
+/// - Tunable false positive rate
+/// - Constant space usage
+///
+/// Use [`super::BloomFilterBuilder`] to construct instances.
+#[derive(Debug, Clone, PartialEq)]
+pub struct BloomFilter {
+    /// Hash seed for all hash functions
+    pub(super) seed: u64,
+    /// Number of hash functions to use (k)
+    pub(super) num_hashes: u16,
+    /// Total number of bits in the filter (m)
+    pub(super) capacity_bits: u64,
+    /// Count of bits set to 1 (for statistics)
+    pub(super) num_bits_set: u64,
+    /// Bit array packed into u64 words
+    /// Length = ceil(capacity_bits / 64)
+    pub(super) bit_array: Vec<u64>,
+}
+
+impl BloomFilter {
+    /// Tests whether an item is possibly in the set.
+    ///
+    /// Returns:
+    /// - `true`: Item was **possibly** inserted (or false positive)
+    /// - `false`: Item was **definitely not** inserted
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+    /// filter.insert("apple");
+    ///
+    /// assert!(filter.contains(&"apple")); // true - was inserted (probably)
+    /// assert!(!filter.contains(&"grape")); // false - never inserted
+    /// ```
+    pub fn contains<T: Hash>(&self, item: &T) -> bool {
+        if self.is_empty() {
+            return false;
+        }
+
+        let (h0, h1) = self.compute_hash(item);
+        self.check_bits(h0, h1)
+    }
+
+    /// Tests and inserts an item in a single operation.
+    ///
+    /// Returns whether the item was possibly already in the set before 
insertion.
+    /// This is more efficient than calling `contains()` then `insert()` 
separately.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+    ///
+    /// let was_present = filter.contains_and_insert(&"apple");
+    /// assert!(!was_present); // First insertion
+    ///
+    /// let was_present = filter.contains_and_insert(&"apple");
+    /// assert!(was_present); // Now it's in the set
+    /// ```
+    pub fn contains_and_insert<T: Hash>(&mut self, item: &T) -> bool {
+        let (h0, h1) = self.compute_hash(item);
+        let was_present = self.check_bits(h0, h1);
+        self.set_bits(h0, h1);
+        was_present
+    }
+
+    /// Inserts an item into the filter.
+    ///
+    /// After insertion, `contains(item)` will always return `true`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+    ///
+    /// filter.insert("apple");
+    /// filter.insert(42_u64);
+    /// filter.insert(&[1, 2, 3]);
+    ///
+    /// assert!(filter.contains(&"apple"));
+    /// ```
+    pub fn insert<T: Hash>(&mut self, item: T) {
+        let (h0, h1) = self.compute_hash(&item);
+        self.set_bits(h0, h1);
+    }
+
+    /// Resets the filter to its initial empty state.
+    ///
+    /// Clears all bits while preserving capacity and configuration.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+    /// filter.insert("apple");
+    /// assert!(!filter.is_empty());
+    ///
+    /// filter.reset();
+    /// assert!(filter.is_empty());
+    /// assert!(!filter.contains(&"apple"));
+    /// ```
+    pub fn reset(&mut self) {
+        self.bit_array.fill(0);
+        self.num_bits_set = 0
+    }
+
+    /// Merges another filter into this one via bitwise OR (union).
+    ///
+    /// After merging, this filter will recognize items from either filter
+    /// (plus any false positives from either).
+    ///
+    /// # Panics
+    ///
+    /// Panics if the filters are not compatible (different size, hashes, or 
seed).
+    /// Use [`is_compatible()`](Self::is_compatible) to check first.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let mut f1 = BloomFilterBuilder::with_accuracy(100, 0.01)
+    ///     .seed(123)
+    ///     .build();
+    /// let mut f2 = BloomFilterBuilder::with_accuracy(100, 0.01)
+    ///     .seed(123)
+    ///     .build();
+    ///
+    /// f1.insert("a");
+    /// f2.insert("b");
+    ///
+    /// f1.union(&f2);
+    /// assert!(f1.contains(&"a"));
+    /// assert!(f1.contains(&"b"));
+    /// ```
+    pub fn union(&mut self, other: &BloomFilter) {
+        assert!(
+            self.is_compatible(other),
+            "Cannot union incompatible Bloom filters"
+        );
+
+        // Count bits during union operation (single pass)
+        let mut num_bits_set = 0;
+        for (word, other_word) in 
self.bit_array.iter_mut().zip(&other.bit_array) {
+            *word |= *other_word;
+            num_bits_set += word.count_ones() as u64;
+        }
+        self.num_bits_set = num_bits_set;
+    }
+
+    /// Intersects this filter with another via bitwise AND.
+    ///
+    /// After intersection, this filter will recognize only items present in 
both
+    /// filters (plus false positives).
+    ///
+    /// # Panics
+    ///
+    /// Panics if the filters are not compatible (different size, hashes, or 
seed).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let mut f1 = BloomFilterBuilder::with_accuracy(100, 0.01)
+    ///     .seed(123)
+    ///     .build();
+    /// let mut f2 = BloomFilterBuilder::with_accuracy(100, 0.01)
+    ///     .seed(123)
+    ///     .build();
+    ///
+    /// f1.insert("a");
+    /// f1.insert("b");
+    /// f2.insert("b");
+    /// f2.insert("c");
+    ///
+    /// f1.intersect(&f2);
+    /// assert!(f1.contains(&"b")); // In both
+    /// // "a" and "c" likely return false now
+    /// ```
+    pub fn intersect(&mut self, other: &BloomFilter) {
+        assert!(
+            self.is_compatible(other),
+            "Cannot intersect incompatible Bloom filters"
+        );
+
+        // Count bits during intersect operation (single pass)
+        let mut num_bits_set = 0;
+        for (word, other_word) in 
self.bit_array.iter_mut().zip(&other.bit_array) {
+            *word &= *other_word;
+            num_bits_set += word.count_ones() as u64;
+        }
+        self.num_bits_set = num_bits_set;
+    }
+
+    /// Inverts all bits in the filter.
+    ///
+    /// This approximately inverts the notion of set membership, though the 
false
+    /// positive guarantees no longer hold in a well-defined way.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::BloomFilterBuilder;
+    /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+    /// filter.insert("apple");
+    ///
+    /// filter.invert();
+    /// // Now "apple" probably returns false, and most other items return true
+    /// ```
+    pub fn invert(&mut self) {
+        // Invert bits and count during operation (single pass)
+        let mut num_bits_set = 0;
+        for word in &mut self.bit_array {
+            *word = !*word;
+            num_bits_set += word.count_ones() as u64;
+        }
+
+        // Mask off excess bits in the last word
+        let excess_bits = self.capacity_bits % 64;
+        if excess_bits != 0 {
+            let last_idx = self.bit_array.len() - 1;
+            let old_count = self.bit_array[last_idx].count_ones() as u64;
+            let mask = (1u64 << excess_bits) - 1;
+            self.bit_array[last_idx] &= mask;
+            let new_count = self.bit_array[last_idx].count_ones() as u64;
+            // Adjust count for masked-off bits
+            num_bits_set = num_bits_set - old_count + new_count;
+        }
+
+        self.num_bits_set = num_bits_set;
+    }
+
+    /// Returns whether the filter is empty (no items inserted).
+    pub fn is_empty(&self) -> bool {
+        self.num_bits_set == 0
+    }
+
+    /// Returns the number of bits set to 1.
+    ///
+    /// Useful for monitoring filter saturation.
+    pub fn bits_used(&self) -> u64 {
+        self.num_bits_set
+    }
+
+    /// Returns the total number of bits in the filter (capacity).
+    pub fn capacity(&self) -> u64 {
+        self.capacity_bits
+    }
+
+    /// Returns the number of hash functions used.
+    pub fn num_hashes(&self) -> u16 {
+        self.num_hashes
+    }
+
+    /// Returns the hash seed.
+    pub fn seed(&self) -> u64 {
+        self.seed
+    }
+
+    /// Returns the current load factor (fraction of bits set).
+    ///
+    /// Values near 0.5 indicate the filter is approaching saturation.
+    /// Values above 0.5 indicate degraded false positive rates.
+    pub fn load_factor(&self) -> f64 {
+        self.num_bits_set as f64 / self.capacity_bits as f64
+    }
+
+    /// Estimates the current false positive probability.
+    ///
+    /// Uses the approximation: `load_factor^k`
+    /// where:
+    /// - load_factor = fraction of bits set (bits_used / capacity)
+    /// - k = num_hashes
+    ///
+    /// This assumes uniform bit distribution and is more accurate than
+    /// trying to estimate insertion count from the load factor.
+    pub fn estimated_fpp(&self) -> f64 {
+        let k = self.num_hashes as f64;
+        let load = self.load_factor();
+
+        // FPP ≈ load^k
+        // This is the standard approximation when load factor is known 
directly
+        load.powf(k)
+    }
+
+    /// Checks if two filters are compatible for merging.
+    ///
+    /// Filters are compatible if they have the same:
+    /// - Capacity (number of bits)
+    /// - Number of hash functions
+    /// - Seed
+    pub fn is_compatible(&self, other: &BloomFilter) -> bool {
+        self.capacity_bits == other.capacity_bits
+            && self.num_hashes == other.num_hashes
+            && self.seed == other.seed
+    }
+
+    /// Serializes the filter to a byte vector.
+    ///
+    /// The format is compatible with other Apache DataSketches 
implementations.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::{BloomFilter, BloomFilterBuilder};
+    /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+    /// filter.insert("test");
+    ///
+    /// let bytes = filter.serialize();
+    /// let restored = BloomFilter::deserialize(&bytes).unwrap();
+    /// assert!(restored.contains(&"test"));
+    /// ```
+    pub fn serialize(&self) -> Vec<u8> {
+        let is_empty = self.is_empty();
+        let preamble_longs = if is_empty {
+            PREAMBLE_LONGS_EMPTY
+        } else {
+            PREAMBLE_LONGS_STANDARD
+        };
+
+        let capacity = 8 * preamble_longs as usize
+            + if is_empty {
+                0
+            } else {
+                self.bit_array.len() * 8
+            };
+        let mut bytes = SketchBytes::with_capacity(capacity);
+
+        // Preamble
+        bytes.write_u8(preamble_longs); // Byte 0
+        bytes.write_u8(SERIAL_VERSION); // Byte 1
+        bytes.write_u8(FAMILY_ID); // Byte 2
+        bytes.write_u8(if is_empty { EMPTY_FLAG_MASK } else { 0 }); // Byte 3: 
flags
+        bytes.write_u16_le(self.num_hashes); // Bytes 4-5
+        bytes.write_u16_le(0); // Bytes 6-7: unused
+
+        bytes.write_u64_le(self.seed);
+
+        // Bit array capacity stored as number of 64-bit words (int32) + 
unused padding (uint32)
+        let num_longs = (self.capacity_bits / 64) as i32;
+        bytes.write_i32_le(num_longs);
+        bytes.write_u32_le(0); // unused
+
+        if !is_empty {
+            bytes.write_u64_le(self.num_bits_set);
+
+            // Bit array
+            for &word in &self.bit_array {
+                bytes.write_u64_le(word);
+            }
+        }
+
+        bytes.into_bytes()
+    }
+
+    /// Deserializes a filter from bytes.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The data is truncated or corrupted
+    /// - The family ID doesn't match (not a Bloom filter)
+    /// - The serial version is unsupported
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datasketches::bloom::{BloomFilter, BloomFilterBuilder};
+    /// let original = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+    /// let bytes = original.serialize();
+    ///
+    /// let restored = BloomFilter::deserialize(&bytes).unwrap();
+    /// assert_eq!(original, restored);
+    /// ```
+    pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
+        let mut cursor = SketchSlice::new(bytes);
+
+        // Read preamble
+        let preamble_longs = cursor
+            .read_u8()
+            .map_err(|_| Error::insufficient_data("preamble_longs"))?;
+        let serial_version = cursor
+            .read_u8()
+            .map_err(|_| Error::insufficient_data("serial_version"))?;
+        let family_id = cursor
+            .read_u8()
+            .map_err(|_| Error::insufficient_data("family_id"))?;
+
+        // Byte 3: flags byte (directly after family_id)
+        let flags = cursor
+            .read_u8()
+            .map_err(|_| Error::insufficient_data("flags"))?;
+
+        // Validate
+        if family_id != FAMILY_ID {
+            return Err(Error::invalid_family(FAMILY_ID, family_id, 
"BloomFilter"));
+        }
+        if serial_version != SERIAL_VERSION {
+            return Err(Error::unsupported_serial_version(
+                SERIAL_VERSION,
+                serial_version,
+            ));
+        }
+        if preamble_longs != PREAMBLE_LONGS_EMPTY && preamble_longs != 
PREAMBLE_LONGS_STANDARD {
+            return Err(Error::invalid_preamble_longs(
+                PREAMBLE_LONGS_STANDARD,
+                preamble_longs,
+            ));
+        }
+
+        let is_empty = (flags & EMPTY_FLAG_MASK) != 0;
+
+        // Bytes 4-5: num_hashes (u16)
+        let num_hashes = cursor
+            .read_u16_le()
+            .map_err(|_| Error::insufficient_data("num_hashes"))?;
+        // Bytes 6-7: unused (u16)
+        let _unused = cursor
+            .read_u16_le()
+            .map_err(|_| Error::insufficient_data("unused_header"))?;
+        let seed = cursor
+            .read_u64_le()
+            .map_err(|_| Error::insufficient_data("seed"))?;
+
+        // Bit array capacity stored as number of 64-bit words (int32) + 
unused padding (uint32)
+        let num_longs = cursor
+            .read_i32_le()
+            .map_err(|_| Error::insufficient_data("num_longs"))? as u64;
+        let _unused = cursor
+            .read_u32_le()
+            .map_err(|_| Error::insufficient_data("unused"))?;
+
+        let capacity_bits = num_longs * 64; // Convert longs to bits
+        let num_words = num_longs as usize;
+        let mut bit_array = vec![0u64; num_words];
+        let num_bits_set;
+
+        if is_empty {
+            num_bits_set = 0;
+        } else {
+            let raw_num_bits_set = cursor
+                .read_u64_le()
+                .map_err(|_| Error::insufficient_data("num_bits_set"))?;
+
+            for word in &mut bit_array {
+                *word = cursor
+                    .read_u64_le()
+                    .map_err(|_| Error::insufficient_data("bit_array"))?;
+            }
+
+            // Handle "dirty" state: 0xFFFFFFFFFFFFFFFF indicates bits need 
recounting
+            const DIRTY_BITS_VALUE: u64 = 0xFFFFFFFFFFFFFFFF;
+            if raw_num_bits_set == DIRTY_BITS_VALUE {
+                num_bits_set = bit_array.iter().map(|w| w.count_ones() as 
u64).sum();
+            } else {
+                num_bits_set = raw_num_bits_set;
+            }
+        }
+
+        Ok(BloomFilter {
+            seed,
+            num_hashes,
+            capacity_bits,
+            num_bits_set,
+            bit_array,
+        })
+    }
+
+    /// Computes the two base hash values using XXHash64.
+    ///
+    /// Uses a two-hash approach:
+    /// - h0 = XXHash64(item, seed)
+    /// - h1 = XXHash64(item, h0)
+    fn compute_hash<T: Hash>(&self, item: &T) -> (u64, u64) {
+        // First hash with the configured seed
+        let mut hasher = XxHash64::with_seed(self.seed);
+        item.hash(&mut hasher);
+        let h0 = hasher.finish();
+
+        // Second hash using h0 as the seed
+        let mut hasher = XxHash64::with_seed(h0);
+        item.hash(&mut hasher);
+        let h1 = hasher.finish();
+
+        (h0, h1)
+    }
+
+    /// Checks if all k bits are set for the given hash values.
+    fn check_bits(&self, h0: u64, h1: u64) -> bool {
+        for i in 1..=self.num_hashes {
+            let bit_index = self.compute_bit_index(h0, h1, i);
+            if !self.get_bit(bit_index) {
+                return false;
+            }
+        }
+        true
+    }
+
+    /// Sets all k bits for the given hash values.
+    fn set_bits(&mut self, h0: u64, h1: u64) {
+        for i in 1..=self.num_hashes {
+            let bit_index = self.compute_bit_index(h0, h1, i);
+            self.set_bit(bit_index);
+        }
+    }
+
+    /// Computes a bit index using double hashing (Kirsch-Mitzenmacher).
+    ///
+    /// Formula:
+    /// ```text
+    /// hash_index = ((h0 + i * h1) >> 1) % capacity_bits
+    /// ```
+    ///
+    /// The right shift by 1 improves bit distribution. The index `i` is 
1-based.
+    fn compute_bit_index(&self, h0: u64, h1: u64, i: u16) -> u64 {
+        let hash = h0.wrapping_add(u64::from(i).wrapping_mul(h1));
+        (hash >> 1) % self.capacity_bits
+    }
+
+    /// Gets the value of a single bit.
+    fn get_bit(&self, bit_index: u64) -> bool {
+        let word_index = (bit_index >> 6) as usize; // Equivalent to bit_index 
/ 64
+        let bit_offset = bit_index & 63; // Equivalent to bit_index % 64
+        let mask = 1u64 << bit_offset;
+        (self.bit_array[word_index] & mask) != 0
+    }
+
+    /// Sets a single bit and updates the count if it wasn't already set.
+    fn set_bit(&mut self, bit_index: u64) {
+        let word_index = (bit_index >> 6) as usize; // Equivalent to bit_index 
/ 64
+        let bit_offset = bit_index & 63; // Equivalent to bit_index % 64
+        let mask = 1u64 << bit_offset;
+
+        if (self.bit_array[word_index] & mask) == 0 {
+            self.bit_array[word_index] |= mask;
+            self.num_bits_set += 1;
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::BloomFilter;
+    use crate::bloom::BloomFilterBuilder;
+
+    #[test]
+    fn test_builder_with_accuracy() {
+        let filter = BloomFilterBuilder::with_accuracy(1000, 0.01).build();
+        assert!(filter.capacity() >= 9000);
+        assert_eq!(filter.num_hashes(), 7);
+        assert!(filter.is_empty());
+    }
+
+    #[test]
+    fn test_builder_with_size() {
+        let filter = BloomFilterBuilder::with_size(1024, 5).build();
+        assert_eq!(filter.capacity(), 1024);
+        assert_eq!(filter.num_hashes(), 5);
+    }
+
+    #[test]
+    fn test_insert_and_contains() {
+        let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+
+        assert!(!filter.contains(&"apple"));
+        filter.insert("apple");
+        assert!(filter.contains(&"apple"));
+        assert!(!filter.is_empty());
+    }
+
+    #[test]
+    fn test_contains_and_insert() {
+        let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+
+        let was_present = filter.contains_and_insert(&42_u64);
+        assert!(!was_present);
+
+        let was_present = filter.contains_and_insert(&42_u64);
+        assert!(was_present);
+    }
+
+    #[test]
+    fn test_reset() {
+        let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+        filter.insert("test");
+        assert!(!filter.is_empty());
+
+        filter.reset();
+        assert!(filter.is_empty());
+        assert!(!filter.contains(&"test"));
+    }
+
+    #[test]
+    fn test_union() {
+        let mut f1 = BloomFilterBuilder::with_accuracy(100, 0.01)
+            .seed(123)
+            .build();
+        let mut f2 = BloomFilterBuilder::with_accuracy(100, 0.01)
+            .seed(123)
+            .build();
+
+        f1.insert("a");
+        f2.insert("b");
+
+        f1.union(&f2);
+        assert!(f1.contains(&"a"));
+        assert!(f1.contains(&"b"));
+    }
+
+    #[test]
+    fn test_intersect() {
+        let mut f1 = BloomFilterBuilder::with_accuracy(100, 0.01)
+            .seed(123)
+            .build();
+        let mut f2 = BloomFilterBuilder::with_accuracy(100, 0.01)
+            .seed(123)
+            .build();
+
+        f1.insert("a");
+        f1.insert("b");
+        f2.insert("b");
+        f2.insert("c");
+
+        f1.intersect(&f2);
+        assert!(f1.contains(&"b"));
+    }
+
+    #[test]
+    fn test_serialize_deserialize_empty() {
+        let filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+        let bytes = filter.serialize();
+        let restored = BloomFilter::deserialize(&bytes).unwrap();
+
+        assert_eq!(filter, restored);
+    }
+
+    #[test]
+    fn test_serialize_deserialize_with_data() {
+        let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+        filter.insert("test");
+        filter.insert(42_u64);
+
+        let bytes = filter.serialize();
+        let restored = BloomFilter::deserialize(&bytes).unwrap();
+
+        assert_eq!(filter, restored);
+        assert!(restored.contains(&"test"));
+        assert!(restored.contains(&42_u64));
+    }
+
+    #[test]
+    fn test_statistics() {
+        let mut filter = BloomFilterBuilder::with_size(1000, 5).build();
+        assert_eq!(filter.bits_used(), 0);
+        assert_eq!(filter.load_factor(), 0.0);
+
+        filter.insert("test");
+        assert!(filter.bits_used() > 0);
+        assert!(filter.load_factor() > 0.0);
+        assert!(filter.estimated_fpp() > 0.0);
+    }
+
+    #[test]
+    fn test_is_compatible() {
+        let f1 = BloomFilterBuilder::with_accuracy(100, 0.01)
+            .seed(123)
+            .build();
+        let f2 = BloomFilterBuilder::with_accuracy(100, 0.01)
+            .seed(123)
+            .build();
+        let f3 = BloomFilterBuilder::with_accuracy(100, 0.01)
+            .seed(456)
+            .build();
+
+        assert!(f1.is_compatible(&f2));
+        assert!(!f1.is_compatible(&f3));
+    }
+
+    #[test]
+    #[should_panic(expected = "max_items must be greater than 0")]
+    fn test_invalid_max_items() {
+        BloomFilterBuilder::with_accuracy(0, 0.01);
+    }
+
+    #[test]
+    #[should_panic(expected = "fpp must be between")]
+    fn test_invalid_fpp() {
+        BloomFilterBuilder::with_accuracy(100, 1.5);
+    }
+}
diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs
index acc051d..473ab9b 100644
--- a/datasketches/src/lib.rs
+++ b/datasketches/src/lib.rs
@@ -30,6 +30,7 @@
 #[cfg(target_endian = "big")]
 compile_error!("datasketches does not support big-endian targets");
 
+pub mod bloom;
 pub mod countmin;
 pub mod error;
 pub mod frequencies;
diff --git a/datasketches/tests/bloom_serialization_test.rs 
b/datasketches/tests/bloom_serialization_test.rs
new file mode 100644
index 0000000..5370f89
--- /dev/null
+++ b/datasketches/tests/bloom_serialization_test.rs
@@ -0,0 +1,231 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Bloom Filter Serialization Compatibility Tests
+//!
+//! These tests verify binary compatibility with Apache DataSketches 
implementations:
+//! - Java (datasketches-java)
+//! - C++ (datasketches-cpp)
+//!
+//! Test data is generated by the reference implementations and stored in:
+//! `tests/serialization_test_data/`
+
+mod common;
+
+use std::fs;
+use std::path::PathBuf;
+
+use common::serialization_test_data;
+use datasketches::bloom::BloomFilter;
+
+fn test_bloom_filter_file(path: PathBuf, expected_num_items: u64, 
expected_num_hashes: u16) {
+    let bytes = fs::read(&path).unwrap();
+    let filter1 = BloomFilter::deserialize(&bytes).unwrap();
+
+    // Verify basic properties
+    assert_eq!(
+        filter1.num_hashes(),
+        expected_num_hashes,
+        "Wrong num_hashes in {}",
+        path.display()
+    );
+
+    // Check empty state
+    if expected_num_items == 0 {
+        assert!(filter1.is_empty(), "Filter should be empty for n=0");
+        assert_eq!(
+            filter1.bits_used(),
+            0,
+            "Empty filter should have 0 bits set"
+        );
+    } else {
+        assert!(
+            !filter1.is_empty(),
+            "Filter should not be empty for n={}",
+            expected_num_items
+        );
+        assert!(
+            filter1.bits_used() > 0,
+            "Non-empty filter should have bits set"
+        );
+    }
+
+    // Verify the items that were inserted (integers 0 to n/10-1)
+    // C++ code: for (uint64_t i = 0; i < n / 10; ++i) bf.update(i);
+    let num_inserted = expected_num_items / 10;
+
+    if num_inserted > 0 {
+        // Check a sample of inserted items
+        // For large n, we only check a sample to keep tests fast
+        let sample_size = std::cmp::min(num_inserted, 100);
+        let mut false_negatives = 0;
+
+        for i in 0..sample_size {
+            if !filter1.contains(&i) {
+                false_negatives += 1;
+            }
+        }
+
+        assert_eq!(
+            false_negatives,
+            0,
+            "Found {} false negatives out of {} items in {}",
+            false_negatives,
+            sample_size,
+            path.display()
+        );
+    }
+
+    // Serialize and deserialize again to test round-trip
+    let serialized_bytes = filter1.serialize();
+    let filter2 = 
BloomFilter::deserialize(&serialized_bytes).unwrap_or_else(|err| {
+        panic!(
+            "Deserialization failed after round-trip for {}: {}",
+            path.display(),
+            err
+        )
+    });
+
+    // Check that both filters are functionally equivalent
+    assert_eq!(
+        filter1.num_hashes(),
+        filter2.num_hashes(),
+        "num_hashes mismatch after round-trip for {}",
+        path.display()
+    );
+    assert_eq!(
+        filter1.capacity(),
+        filter2.capacity(),
+        "capacity mismatch after round-trip for {}",
+        path.display()
+    );
+    assert_eq!(
+        filter1.bits_used(),
+        filter2.bits_used(),
+        "bits_used mismatch after round-trip for {}",
+        path.display()
+    );
+
+    // Verify same items are present after round-trip
+    if num_inserted > 0 {
+        let sample_size = std::cmp::min(num_inserted, 100);
+        for i in 0..sample_size {
+            assert_eq!(
+                filter1.contains(&i),
+                filter2.contains(&i),
+                "Item {} presence differs after round-trip",
+                i
+            );
+        }
+    }
+}
+
+#[test]
+fn test_java_bloom_n0_h3() {
+    let path = serialization_test_data("java_generated_files", 
"bf_n0_h3_java.sk");
+    test_bloom_filter_file(path, 0, 3);
+}
+
+#[test]
+fn test_java_bloom_n0_h5() {
+    let path = serialization_test_data("java_generated_files", 
"bf_n0_h5_java.sk");
+    test_bloom_filter_file(path, 0, 5);
+}
+
+#[test]
+fn test_java_bloom_n10000_h3() {
+    let path = serialization_test_data("java_generated_files", 
"bf_n10000_h3_java.sk");
+    test_bloom_filter_file(path, 10000, 3);
+}
+
+#[test]
+fn test_java_bloom_n10000_h5() {
+    let path = serialization_test_data("java_generated_files", 
"bf_n10000_h5_java.sk");
+    test_bloom_filter_file(path, 10000, 5);
+}
+
+#[test]
+fn test_java_bloom_n2000000_h3() {
+    let path = serialization_test_data("java_generated_files", 
"bf_n2000000_h3_java.sk");
+    test_bloom_filter_file(path, 2000000, 3);
+}
+
+#[test]
+fn test_java_bloom_n2000000_h5() {
+    let path = serialization_test_data("java_generated_files", 
"bf_n2000000_h5_java.sk");
+    test_bloom_filter_file(path, 2000000, 5);
+}
+
+#[test]
+fn test_java_bloom_n30000000_h3() {
+    let path = serialization_test_data("java_generated_files", 
"bf_n30000000_h3_java.sk");
+    test_bloom_filter_file(path, 30000000, 3);
+}
+
+#[test]
+fn test_java_bloom_n30000000_h5() {
+    let path = serialization_test_data("java_generated_files", 
"bf_n30000000_h5_java.sk");
+    test_bloom_filter_file(path, 30000000, 5);
+}
+
+#[test]
+fn test_cpp_bloom_n0_h3() {
+    let path = serialization_test_data("cpp_generated_files", 
"bf_n0_h3_cpp.sk");
+    test_bloom_filter_file(path, 0, 3);
+}
+
+#[test]
+fn test_cpp_bloom_n0_h5() {
+    let path = serialization_test_data("cpp_generated_files", 
"bf_n0_h5_cpp.sk");
+    test_bloom_filter_file(path, 0, 5);
+}
+
+#[test]
+fn test_cpp_bloom_n10000_h3() {
+    let path = serialization_test_data("cpp_generated_files", 
"bf_n10000_h3_cpp.sk");
+    test_bloom_filter_file(path, 10000, 3);
+}
+
+#[test]
+fn test_cpp_bloom_n10000_h5() {
+    let path = serialization_test_data("cpp_generated_files", 
"bf_n10000_h5_cpp.sk");
+    test_bloom_filter_file(path, 10000, 5);
+}
+
+#[test]
+fn test_cpp_bloom_n2000000_h3() {
+    let path = serialization_test_data("cpp_generated_files", 
"bf_n2000000_h3_cpp.sk");
+    test_bloom_filter_file(path, 2000000, 3);
+}
+
+#[test]
+fn test_cpp_bloom_n2000000_h5() {
+    let path = serialization_test_data("cpp_generated_files", 
"bf_n2000000_h5_cpp.sk");
+    test_bloom_filter_file(path, 2000000, 5);
+}
+
+#[test]
+fn test_cpp_bloom_n30000000_h3() {
+    let path = serialization_test_data("cpp_generated_files", 
"bf_n30000000_h3_cpp.sk");
+    test_bloom_filter_file(path, 30000000, 3);
+}
+
+#[test]
+fn test_cpp_bloom_n30000000_h5() {
+    let path = serialization_test_data("cpp_generated_files", 
"bf_n30000000_h5_cpp.sk");
+    test_bloom_filter_file(path, 30000000, 5);
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datasketches-rust) branch main updated: feat: implement bloom filter (#53)

Reply via email to