This is an automated email from the ASF dual-hosted git repository.
placave pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 819f563 feat: implement bloom filter (#53)
819f563 is described below
commit 819f563c29cfd139a0a69be052591a5aa3368e4d
Author: Filippo Rossi <[email protected]>
AuthorDate: Wed Dec 31 15:37:07 2025 +0100
feat: implement bloom filter (#53)
* checkpoint
* checkpoint
* xxhash
* match cpp hashing logic
* serde
* cleanup
* cleanup
* docs
* comments cleanup
* builder cleanup
* trade 3 loc for 1 loc
* get back 1 loc
---
datasketches/src/bloom/builder.rs | 221 ++++++++
datasketches/src/bloom/mod.rs | 127 +++++
datasketches/src/bloom/sketch.rs | 730 +++++++++++++++++++++++++
datasketches/src/lib.rs | 1 +
datasketches/tests/bloom_serialization_test.rs | 231 ++++++++
5 files changed, 1310 insertions(+)
diff --git a/datasketches/src/bloom/builder.rs
b/datasketches/src/bloom/builder.rs
new file mode 100644
index 0000000..13e3b5a
--- /dev/null
+++ b/datasketches/src/bloom/builder.rs
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::BloomFilter;
+use crate::hash::DEFAULT_UPDATE_SEED;
+
+const MIN_NUM_BITS: u64 = 64;
+const MAX_NUM_BITS: u64 = (1u64 << 35) - 64; // ~32 GB - reasonable limit
+
+/// Builder for creating [`BloomFilter`] instances.
+///
+/// Provides two construction modes:
+/// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false
positive rate
+/// (recommended)
+/// - [`with_size()`](Self::with_size): Specify exact bit count and hash
functions (manual)
+#[derive(Debug, Clone)]
+pub struct BloomFilterBuilder {
+ num_bits: u64,
+ num_hashes: u16,
+ seed: u64,
+}
+
+impl BloomFilterBuilder {
+ /// Creates a builder with optimal parameters for a target accuracy.
+ ///
+ /// Automatically calculates the optimal number of bits and hash functions
+ /// to achieve the desired false positive probability for a given number
of items.
+ ///
+ /// # Arguments
+ ///
+ /// - `max_items`: Maximum expected number of distinct items
+ /// - `fpp`: Target false positive probability (e.g., 0.01 for 1%)
+ ///
+ /// # Panics
+ ///
+ /// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0).
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// // Optimal for 10,000 items with 1% FPP
+ /// let filter = BloomFilterBuilder::with_accuracy(10_000, 0.01)
+ /// .seed(42)
+ /// .build();
+ /// ```
+ pub fn with_accuracy(max_items: u64, fpp: f64) -> Self {
+ assert!(max_items > 0, "max_items must be greater than 0");
+ assert!(
+ fpp > 0.0 && fpp < 1.0,
+ "fpp must be between 0.0 and 1.0 (exclusive)"
+ );
+
+ let num_bits = Self::suggest_num_bits(max_items, fpp);
+ let num_hashes = Self::suggest_num_hashes_from_accuracy(max_items,
num_bits);
+
+ BloomFilterBuilder {
+ num_bits,
+ num_hashes,
+ seed: DEFAULT_UPDATE_SEED,
+ }
+ }
+
+ /// Creates a builder with manual size specification.
+ ///
+ /// Use this when you want precise control over the filter size,
+ /// or when working with pre-calculated parameters.
+ ///
+ /// # Arguments
+ ///
+ /// - `num_bits`: Total number of bits in the filter
+ /// - `num_hashes`: Number of hash functions to use
+ ///
+ /// # Panics
+ ///
+ /// Panics if:
+ /// - `num_bits` < MIN_NUM_BITS (64) or `num_bits` > MAX_NUM_BITS (~32 GB)
+ /// - `num_hashes` < 1 or `num_hashes` > 100
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let filter = BloomFilterBuilder::with_size(10_000, 7).build();
+ /// ```
+ pub fn with_size(num_bits: u64, num_hashes: u16) -> Self {
+ assert!(
+ num_bits >= MIN_NUM_BITS,
+ "num_bits must be at least {}",
+ MIN_NUM_BITS
+ );
+ assert!(
+ num_bits <= MAX_NUM_BITS,
+ "num_bits must not exceed {}",
+ MAX_NUM_BITS
+ );
+ assert!(num_hashes >= 1, "num_hashes must be at least 1");
+ assert!(num_hashes <= 100, "num_hashes must not exceed 100");
+
+ BloomFilterBuilder {
+ num_bits,
+ num_hashes,
+ seed: DEFAULT_UPDATE_SEED,
+ }
+ }
+
+ /// Sets a custom hash seed (default: 9001).
+ ///
+ /// **Important**: Filters with different seeds cannot be merged.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let filter = BloomFilterBuilder::with_accuracy(100, 0.01)
+ /// .seed(12345)
+ /// .build();
+ /// ```
+ pub fn seed(mut self, seed: u64) -> Self {
+ self.seed = seed;
+ self
+ }
+
+ /// Builds the Bloom filter.
+ ///
+ /// # Panics
+ ///
+ /// Panics if neither `with_accuracy()` nor `with_size()` was called.
+ pub fn build(self) -> BloomFilter {
+ let capacity_bits = self.num_bits;
+ let num_hashes = self.num_hashes;
+
+ let num_words = capacity_bits.div_ceil(64) as usize;
+ let bit_array = vec![0u64; num_words];
+
+ BloomFilter {
+ seed: self.seed,
+ num_hashes,
+ capacity_bits,
+ num_bits_set: 0,
+ bit_array,
+ }
+ }
+
+ /// Suggests optimal number of bits given max items and target FPP.
+ ///
+ /// Formula: `m = -n * ln(p) / (ln(2)^2)`
+ /// where n = max_items, p = fpp
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let bits = BloomFilterBuilder::suggest_num_bits(1000, 0.01);
+ /// assert!(bits > 9000 && bits < 10000); // ~9585 bits
+ /// ```
+ pub fn suggest_num_bits(max_items: u64, fpp: f64) -> u64 {
+ let n = max_items as f64;
+ let p = fpp;
+ let ln2_squared = std::f64::consts::LN_2 * std::f64::consts::LN_2;
+
+ let bits = (-n * p.ln() / ln2_squared).ceil() as u64;
+
+ // Round up to multiple of 64 for efficiency
+ let bits = bits.div_ceil(64) * 64;
+
+ bits.clamp(MIN_NUM_BITS, MAX_NUM_BITS)
+ }
+
+ /// Suggests optimal number of hash functions given max items and bit
count.
+ ///
+ /// Formula: `k = (m/n) * ln(2)`
+ /// where m = num_bits, n = max_items
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let hashes =
BloomFilterBuilder::suggest_num_hashes_from_accuracy(1000, 10000);
+ /// assert_eq!(hashes, 7); // Optimal k ≈ 6.93
+ /// ```
+ pub fn suggest_num_hashes_from_accuracy(max_items: u64, num_bits: u64) ->
u16 {
+ let m = num_bits as f64;
+ let n = max_items as f64;
+
+ let k = (m / n * std::f64::consts::LN_2).round();
+
+ (k as u16).clamp(1, 100) // Reasonable bounds
+ }
+
+ /// Suggests optimal number of hash functions from target FPP.
+ ///
+ /// Formula: `k = -log2(p)`
+ /// where p = fpp
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let hashes = BloomFilterBuilder::suggest_num_hashes_from_fpp(0.01);
+ /// assert_eq!(hashes, 7); // -log2(0.01) ≈ 6.64
+ /// ```
+ pub fn suggest_num_hashes_from_fpp(fpp: f64) -> u16 {
+ let k = -fpp.log2();
+ (k.round() as u16).clamp(1, 100)
+ }
+}
diff --git a/datasketches/src/bloom/mod.rs b/datasketches/src/bloom/mod.rs
new file mode 100644
index 0000000..4a91287
--- /dev/null
+++ b/datasketches/src/bloom/mod.rs
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Bloom Filter implementation for probabilistic set membership testing.
+//!
+//! A Bloom filter is a space-efficient probabilistic data structure used to
test whether
+//! an element is a member of a set. False positive matches are possible, but
false negatives
+//! are not. In other words, a query returns either "possibly in set" or
"definitely not in set".
+//!
+//! # Properties
+//!
+//! - **No false negatives**: If an item was inserted, `contains()` will
always return `true`
+//! - **Possible false positives**: `contains()` may return `true` for items
never inserted
+//! - **Fixed size**: Unlike typical sketches, Bloom filters do not resize
automatically
+//! - **Linear space**: Size is proportional to the expected number of
distinct items
+//!
+//! # Usage
+//!
+//! ```rust
+//! use datasketches::bloom::BloomFilter;
+//! use datasketches::bloom::BloomFilterBuilder;
+//!
+//! // Create a filter optimized for 1000 items with 1% false positive rate
+//! let mut filter = BloomFilterBuilder::with_accuracy(1000, 0.01).build();
+//!
+//! // Insert items
+//! filter.insert("apple");
+//! filter.insert("banana");
+//! filter.insert(42_u64);
+//!
+//! // Check membership
+//! assert!(filter.contains(&"apple")); // true - definitely inserted
+//! assert!(!filter.contains(&"grape")); // false - never inserted (probably)
+//!
+//! // Get statistics
+//! println!("Capacity: {} bits", filter.capacity());
+//! println!("Bits used: {}", filter.bits_used());
+//! println!("Est. FPP: {:.4}%", filter.estimated_fpp() * 100.0);
+//! ```
+//!
+//! # Creating Filters
+//!
+//! There are two ways to create a Bloom filter:
+//!
+//! ## By Accuracy (Recommended)
+//!
+//! Automatically calculates optimal size and hash functions:
+//!
+//! ```rust
+//! # use datasketches::bloom::BloomFilterBuilder;
+//! let filter = BloomFilterBuilder::with_accuracy(
+//! 10_000, // Expected max items
+//! 0.01, // Target false positive probability (1%)
+//! )
+//! .seed(9001) // Optional: custom seed
+//! .build();
+//! ```
+//!
+//! ## By Size (Manual)
+//!
+//! Specify exact bit count and hash functions:
+//!
+//! ```rust
+//! # use datasketches::bloom::BloomFilterBuilder;
+//! let filter = BloomFilterBuilder::with_size(
+//! 95_851, // Number of bits
+//! 7, // Number of hash functions
+//! )
+//! .build();
+//! ```
+//!
+//! # Set Operations
+//!
+//! Bloom filters support efficient set operations:
+//!
+//! ```rust
+//! # use datasketches::bloom::BloomFilterBuilder;
+//! let mut filter1 = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+//! let mut filter2 = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+//!
+//! filter1.insert("a");
+//! filter2.insert("b");
+//!
+//! // Union: recognizes items from either filter
+//! filter1.union(&filter2);
+//! assert!(filter1.contains(&"a"));
+//! assert!(filter1.contains(&"b"));
+//!
+//! // Intersect: recognizes only items in both filters
+//! // filter1.intersect(&filter2);
+//!
+//! // Invert: approximately inverts set membership
+//! // filter1.invert();
+//! ```
+//!
+//! # Implementation Details
+//!
+//! - Uses XXHash64 for hashing
+//! - Implements double hashing (Kirsch-Mitzenmacher method) for k hash
functions
+//! - Bits packed efficiently in `u64` words
+//! - Compatible serialization format (family ID: 21)
+//!
+//! # References
+//!
+//! - Bloom, Burton H. (1970). "Space/time trade-offs in hash coding with
allowable errors"
+//! - Kirsch and Mitzenmacher (2008). "Less Hashing, Same Performance:
Building a Better Bloom
+//! Filter"
+
+mod builder;
+mod sketch;
+
+pub use self::builder::BloomFilterBuilder;
+pub use self::sketch::BloomFilter;
diff --git a/datasketches/src/bloom/sketch.rs b/datasketches/src/bloom/sketch.rs
new file mode 100644
index 0000000..5589477
--- /dev/null
+++ b/datasketches/src/bloom/sketch.rs
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::hash::Hash;
+use std::hash::Hasher;
+
+use crate::codec::SketchBytes;
+use crate::codec::SketchSlice;
+use crate::error::Error;
+use crate::hash::XxHash64;
+
+// Serialization constants
+const PREAMBLE_LONGS_EMPTY: u8 = 3;
+const PREAMBLE_LONGS_STANDARD: u8 = 4;
+const FAMILY_ID: u8 = 21; // Bloom filter family ID
+const SERIAL_VERSION: u8 = 1;
+const EMPTY_FLAG_MASK: u8 = 1 << 2;
+
+/// A Bloom filter for probabilistic set membership testing.
+///
+/// Provides fast membership queries with:
+/// - No false negatives (inserted items always return `true`)
+/// - Tunable false positive rate
+/// - Constant space usage
+///
+/// Use [`super::BloomFilterBuilder`] to construct instances.
+#[derive(Debug, Clone, PartialEq)]
+pub struct BloomFilter {
+ /// Hash seed for all hash functions
+ pub(super) seed: u64,
+ /// Number of hash functions to use (k)
+ pub(super) num_hashes: u16,
+ /// Total number of bits in the filter (m)
+ pub(super) capacity_bits: u64,
+ /// Count of bits set to 1 (for statistics)
+ pub(super) num_bits_set: u64,
+ /// Bit array packed into u64 words
+ /// Length = ceil(capacity_bits / 64)
+ pub(super) bit_array: Vec<u64>,
+}
+
+impl BloomFilter {
+ /// Tests whether an item is possibly in the set.
+ ///
+ /// Returns:
+ /// - `true`: Item was **possibly** inserted (or false positive)
+ /// - `false`: Item was **definitely not** inserted
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+ /// filter.insert("apple");
+ ///
+ /// assert!(filter.contains(&"apple")); // true - was inserted (probably)
+ /// assert!(!filter.contains(&"grape")); // false - never inserted
+ /// ```
+ pub fn contains<T: Hash>(&self, item: &T) -> bool {
+ if self.is_empty() {
+ return false;
+ }
+
+ let (h0, h1) = self.compute_hash(item);
+ self.check_bits(h0, h1)
+ }
+
+ /// Tests and inserts an item in a single operation.
+ ///
+ /// Returns whether the item was possibly already in the set before
insertion.
+ /// This is more efficient than calling `contains()` then `insert()`
separately.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+ ///
+ /// let was_present = filter.contains_and_insert(&"apple");
+ /// assert!(!was_present); // First insertion
+ ///
+ /// let was_present = filter.contains_and_insert(&"apple");
+ /// assert!(was_present); // Now it's in the set
+ /// ```
+ pub fn contains_and_insert<T: Hash>(&mut self, item: &T) -> bool {
+ let (h0, h1) = self.compute_hash(item);
+ let was_present = self.check_bits(h0, h1);
+ self.set_bits(h0, h1);
+ was_present
+ }
+
+ /// Inserts an item into the filter.
+ ///
+ /// After insertion, `contains(item)` will always return `true`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+ ///
+ /// filter.insert("apple");
+ /// filter.insert(42_u64);
+ /// filter.insert(&[1, 2, 3]);
+ ///
+ /// assert!(filter.contains(&"apple"));
+ /// ```
+ pub fn insert<T: Hash>(&mut self, item: T) {
+ let (h0, h1) = self.compute_hash(&item);
+ self.set_bits(h0, h1);
+ }
+
+ /// Resets the filter to its initial empty state.
+ ///
+ /// Clears all bits while preserving capacity and configuration.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+ /// filter.insert("apple");
+ /// assert!(!filter.is_empty());
+ ///
+ /// filter.reset();
+ /// assert!(filter.is_empty());
+ /// assert!(!filter.contains(&"apple"));
+ /// ```
+ pub fn reset(&mut self) {
+ self.bit_array.fill(0);
+ self.num_bits_set = 0
+ }
+
+ /// Merges another filter into this one via bitwise OR (union).
+ ///
+ /// After merging, this filter will recognize items from either filter
+ /// (plus any false positives from either).
+ ///
+ /// # Panics
+ ///
+ /// Panics if the filters are not compatible (different size, hashes, or
seed).
+ /// Use [`is_compatible()`](Self::is_compatible) to check first.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let mut f1 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ /// .seed(123)
+ /// .build();
+ /// let mut f2 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ /// .seed(123)
+ /// .build();
+ ///
+ /// f1.insert("a");
+ /// f2.insert("b");
+ ///
+ /// f1.union(&f2);
+ /// assert!(f1.contains(&"a"));
+ /// assert!(f1.contains(&"b"));
+ /// ```
+ pub fn union(&mut self, other: &BloomFilter) {
+ assert!(
+ self.is_compatible(other),
+ "Cannot union incompatible Bloom filters"
+ );
+
+ // Count bits during union operation (single pass)
+ let mut num_bits_set = 0;
+ for (word, other_word) in
self.bit_array.iter_mut().zip(&other.bit_array) {
+ *word |= *other_word;
+ num_bits_set += word.count_ones() as u64;
+ }
+ self.num_bits_set = num_bits_set;
+ }
+
+ /// Intersects this filter with another via bitwise AND.
+ ///
+ /// After intersection, this filter will recognize only items present in
both
+ /// filters (plus false positives).
+ ///
+ /// # Panics
+ ///
+ /// Panics if the filters are not compatible (different size, hashes, or
seed).
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let mut f1 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ /// .seed(123)
+ /// .build();
+ /// let mut f2 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ /// .seed(123)
+ /// .build();
+ ///
+ /// f1.insert("a");
+ /// f1.insert("b");
+ /// f2.insert("b");
+ /// f2.insert("c");
+ ///
+ /// f1.intersect(&f2);
+ /// assert!(f1.contains(&"b")); // In both
+ /// // "a" and "c" likely return false now
+ /// ```
+ pub fn intersect(&mut self, other: &BloomFilter) {
+ assert!(
+ self.is_compatible(other),
+ "Cannot intersect incompatible Bloom filters"
+ );
+
+ // Count bits during intersect operation (single pass)
+ let mut num_bits_set = 0;
+ for (word, other_word) in
self.bit_array.iter_mut().zip(&other.bit_array) {
+ *word &= *other_word;
+ num_bits_set += word.count_ones() as u64;
+ }
+ self.num_bits_set = num_bits_set;
+ }
+
+ /// Inverts all bits in the filter.
+ ///
+ /// This approximately inverts the notion of set membership, though the
false
+ /// positive guarantees no longer hold in a well-defined way.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::BloomFilterBuilder;
+ /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+ /// filter.insert("apple");
+ ///
+ /// filter.invert();
+ /// // Now "apple" probably returns false, and most other items return true
+ /// ```
+ pub fn invert(&mut self) {
+ // Invert bits and count during operation (single pass)
+ let mut num_bits_set = 0;
+ for word in &mut self.bit_array {
+ *word = !*word;
+ num_bits_set += word.count_ones() as u64;
+ }
+
+ // Mask off excess bits in the last word
+ let excess_bits = self.capacity_bits % 64;
+ if excess_bits != 0 {
+ let last_idx = self.bit_array.len() - 1;
+ let old_count = self.bit_array[last_idx].count_ones() as u64;
+ let mask = (1u64 << excess_bits) - 1;
+ self.bit_array[last_idx] &= mask;
+ let new_count = self.bit_array[last_idx].count_ones() as u64;
+ // Adjust count for masked-off bits
+ num_bits_set = num_bits_set - old_count + new_count;
+ }
+
+ self.num_bits_set = num_bits_set;
+ }
+
+ /// Returns whether the filter is empty (no items inserted).
+ pub fn is_empty(&self) -> bool {
+ self.num_bits_set == 0
+ }
+
+ /// Returns the number of bits set to 1.
+ ///
+ /// Useful for monitoring filter saturation.
+ pub fn bits_used(&self) -> u64 {
+ self.num_bits_set
+ }
+
+ /// Returns the total number of bits in the filter (capacity).
+ pub fn capacity(&self) -> u64 {
+ self.capacity_bits
+ }
+
+ /// Returns the number of hash functions used.
+ pub fn num_hashes(&self) -> u16 {
+ self.num_hashes
+ }
+
+ /// Returns the hash seed.
+ pub fn seed(&self) -> u64 {
+ self.seed
+ }
+
+ /// Returns the current load factor (fraction of bits set).
+ ///
+ /// Values near 0.5 indicate the filter is approaching saturation.
+ /// Values above 0.5 indicate degraded false positive rates.
+ pub fn load_factor(&self) -> f64 {
+ self.num_bits_set as f64 / self.capacity_bits as f64
+ }
+
+ /// Estimates the current false positive probability.
+ ///
+ /// Uses the approximation: `load_factor^k`
+ /// where:
+ /// - load_factor = fraction of bits set (bits_used / capacity)
+ /// - k = num_hashes
+ ///
+ /// This assumes uniform bit distribution and is more accurate than
+ /// trying to estimate insertion count from the load factor.
+ pub fn estimated_fpp(&self) -> f64 {
+ let k = self.num_hashes as f64;
+ let load = self.load_factor();
+
+ // FPP ≈ load^k
+ // This is the standard approximation when load factor is known
directly
+ load.powf(k)
+ }
+
+ /// Checks if two filters are compatible for merging.
+ ///
+ /// Filters are compatible if they have the same:
+ /// - Capacity (number of bits)
+ /// - Number of hash functions
+ /// - Seed
+ pub fn is_compatible(&self, other: &BloomFilter) -> bool {
+ self.capacity_bits == other.capacity_bits
+ && self.num_hashes == other.num_hashes
+ && self.seed == other.seed
+ }
+
+ /// Serializes the filter to a byte vector.
+ ///
+ /// The format is compatible with other Apache DataSketches
implementations.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::{BloomFilter, BloomFilterBuilder};
+ /// let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+ /// filter.insert("test");
+ ///
+ /// let bytes = filter.serialize();
+ /// let restored = BloomFilter::deserialize(&bytes).unwrap();
+ /// assert!(restored.contains(&"test"));
+ /// ```
+ pub fn serialize(&self) -> Vec<u8> {
+ let is_empty = self.is_empty();
+ let preamble_longs = if is_empty {
+ PREAMBLE_LONGS_EMPTY
+ } else {
+ PREAMBLE_LONGS_STANDARD
+ };
+
+ let capacity = 8 * preamble_longs as usize
+ + if is_empty {
+ 0
+ } else {
+ self.bit_array.len() * 8
+ };
+ let mut bytes = SketchBytes::with_capacity(capacity);
+
+ // Preamble
+ bytes.write_u8(preamble_longs); // Byte 0
+ bytes.write_u8(SERIAL_VERSION); // Byte 1
+ bytes.write_u8(FAMILY_ID); // Byte 2
+ bytes.write_u8(if is_empty { EMPTY_FLAG_MASK } else { 0 }); // Byte 3:
flags
+ bytes.write_u16_le(self.num_hashes); // Bytes 4-5
+ bytes.write_u16_le(0); // Bytes 6-7: unused
+
+ bytes.write_u64_le(self.seed);
+
+ // Bit array capacity stored as number of 64-bit words (int32) +
unused padding (uint32)
+ let num_longs = (self.capacity_bits / 64) as i32;
+ bytes.write_i32_le(num_longs);
+ bytes.write_u32_le(0); // unused
+
+ if !is_empty {
+ bytes.write_u64_le(self.num_bits_set);
+
+ // Bit array
+ for &word in &self.bit_array {
+ bytes.write_u64_le(word);
+ }
+ }
+
+ bytes.into_bytes()
+ }
+
+ /// Deserializes a filter from bytes.
+ ///
+ /// # Errors
+ ///
+ /// Returns an error if:
+ /// - The data is truncated or corrupted
+ /// - The family ID doesn't match (not a Bloom filter)
+ /// - The serial version is unsupported
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use datasketches::bloom::{BloomFilter, BloomFilterBuilder};
+ /// let original = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+ /// let bytes = original.serialize();
+ ///
+ /// let restored = BloomFilter::deserialize(&bytes).unwrap();
+ /// assert_eq!(original, restored);
+ /// ```
+ pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
+ let mut cursor = SketchSlice::new(bytes);
+
+ // Read preamble
+ let preamble_longs = cursor
+ .read_u8()
+ .map_err(|_| Error::insufficient_data("preamble_longs"))?;
+ let serial_version = cursor
+ .read_u8()
+ .map_err(|_| Error::insufficient_data("serial_version"))?;
+ let family_id = cursor
+ .read_u8()
+ .map_err(|_| Error::insufficient_data("family_id"))?;
+
+ // Byte 3: flags byte (directly after family_id)
+ let flags = cursor
+ .read_u8()
+ .map_err(|_| Error::insufficient_data("flags"))?;
+
+ // Validate
+ if family_id != FAMILY_ID {
+ return Err(Error::invalid_family(FAMILY_ID, family_id,
"BloomFilter"));
+ }
+ if serial_version != SERIAL_VERSION {
+ return Err(Error::unsupported_serial_version(
+ SERIAL_VERSION,
+ serial_version,
+ ));
+ }
+ if preamble_longs != PREAMBLE_LONGS_EMPTY && preamble_longs !=
PREAMBLE_LONGS_STANDARD {
+ return Err(Error::invalid_preamble_longs(
+ PREAMBLE_LONGS_STANDARD,
+ preamble_longs,
+ ));
+ }
+
+ let is_empty = (flags & EMPTY_FLAG_MASK) != 0;
+
+ // Bytes 4-5: num_hashes (u16)
+ let num_hashes = cursor
+ .read_u16_le()
+ .map_err(|_| Error::insufficient_data("num_hashes"))?;
+ // Bytes 6-7: unused (u16)
+ let _unused = cursor
+ .read_u16_le()
+ .map_err(|_| Error::insufficient_data("unused_header"))?;
+ let seed = cursor
+ .read_u64_le()
+ .map_err(|_| Error::insufficient_data("seed"))?;
+
+ // Bit array capacity stored as number of 64-bit words (int32) +
unused padding (uint32)
+ let num_longs = cursor
+ .read_i32_le()
+ .map_err(|_| Error::insufficient_data("num_longs"))? as u64;
+ let _unused = cursor
+ .read_u32_le()
+ .map_err(|_| Error::insufficient_data("unused"))?;
+
+ let capacity_bits = num_longs * 64; // Convert longs to bits
+ let num_words = num_longs as usize;
+ let mut bit_array = vec![0u64; num_words];
+ let num_bits_set;
+
+ if is_empty {
+ num_bits_set = 0;
+ } else {
+ let raw_num_bits_set = cursor
+ .read_u64_le()
+ .map_err(|_| Error::insufficient_data("num_bits_set"))?;
+
+ for word in &mut bit_array {
+ *word = cursor
+ .read_u64_le()
+ .map_err(|_| Error::insufficient_data("bit_array"))?;
+ }
+
+ // Handle "dirty" state: 0xFFFFFFFFFFFFFFFF indicates bits need
recounting
+ const DIRTY_BITS_VALUE: u64 = 0xFFFFFFFFFFFFFFFF;
+ if raw_num_bits_set == DIRTY_BITS_VALUE {
+ num_bits_set = bit_array.iter().map(|w| w.count_ones() as
u64).sum();
+ } else {
+ num_bits_set = raw_num_bits_set;
+ }
+ }
+
+ Ok(BloomFilter {
+ seed,
+ num_hashes,
+ capacity_bits,
+ num_bits_set,
+ bit_array,
+ })
+ }
+
+ /// Computes the two base hash values using XXHash64.
+ ///
+ /// Uses a two-hash approach:
+ /// - h0 = XXHash64(item, seed)
+ /// - h1 = XXHash64(item, h0)
+ fn compute_hash<T: Hash>(&self, item: &T) -> (u64, u64) {
+ // First hash with the configured seed
+ let mut hasher = XxHash64::with_seed(self.seed);
+ item.hash(&mut hasher);
+ let h0 = hasher.finish();
+
+ // Second hash using h0 as the seed
+ let mut hasher = XxHash64::with_seed(h0);
+ item.hash(&mut hasher);
+ let h1 = hasher.finish();
+
+ (h0, h1)
+ }
+
+ /// Checks if all k bits are set for the given hash values.
+ fn check_bits(&self, h0: u64, h1: u64) -> bool {
+ for i in 1..=self.num_hashes {
+ let bit_index = self.compute_bit_index(h0, h1, i);
+ if !self.get_bit(bit_index) {
+ return false;
+ }
+ }
+ true
+ }
+
+ /// Sets all k bits for the given hash values.
+ fn set_bits(&mut self, h0: u64, h1: u64) {
+ for i in 1..=self.num_hashes {
+ let bit_index = self.compute_bit_index(h0, h1, i);
+ self.set_bit(bit_index);
+ }
+ }
+
+ /// Computes a bit index using double hashing (Kirsch-Mitzenmacher).
+ ///
+ /// Formula:
+ /// ```text
+ /// hash_index = ((h0 + i * h1) >> 1) % capacity_bits
+ /// ```
+ ///
+ /// The right shift by 1 improves bit distribution. The index `i` is
1-based.
+ fn compute_bit_index(&self, h0: u64, h1: u64, i: u16) -> u64 {
+ let hash = h0.wrapping_add(u64::from(i).wrapping_mul(h1));
+ (hash >> 1) % self.capacity_bits
+ }
+
+ /// Gets the value of a single bit.
+ fn get_bit(&self, bit_index: u64) -> bool {
+ let word_index = (bit_index >> 6) as usize; // Equivalent to bit_index
/ 64
+ let bit_offset = bit_index & 63; // Equivalent to bit_index % 64
+ let mask = 1u64 << bit_offset;
+ (self.bit_array[word_index] & mask) != 0
+ }
+
+ /// Sets a single bit and updates the count if it wasn't already set.
+ fn set_bit(&mut self, bit_index: u64) {
+ let word_index = (bit_index >> 6) as usize; // Equivalent to bit_index
/ 64
+ let bit_offset = bit_index & 63; // Equivalent to bit_index % 64
+ let mask = 1u64 << bit_offset;
+
+ if (self.bit_array[word_index] & mask) == 0 {
+ self.bit_array[word_index] |= mask;
+ self.num_bits_set += 1;
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::BloomFilter;
+ use crate::bloom::BloomFilterBuilder;
+
+ #[test]
+ fn test_builder_with_accuracy() {
+ let filter = BloomFilterBuilder::with_accuracy(1000, 0.01).build();
+ assert!(filter.capacity() >= 9000);
+ assert_eq!(filter.num_hashes(), 7);
+ assert!(filter.is_empty());
+ }
+
+ #[test]
+ fn test_builder_with_size() {
+ let filter = BloomFilterBuilder::with_size(1024, 5).build();
+ assert_eq!(filter.capacity(), 1024);
+ assert_eq!(filter.num_hashes(), 5);
+ }
+
+ #[test]
+ fn test_insert_and_contains() {
+ let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+
+ assert!(!filter.contains(&"apple"));
+ filter.insert("apple");
+ assert!(filter.contains(&"apple"));
+ assert!(!filter.is_empty());
+ }
+
+ #[test]
+ fn test_contains_and_insert() {
+ let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+
+ let was_present = filter.contains_and_insert(&42_u64);
+ assert!(!was_present);
+
+ let was_present = filter.contains_and_insert(&42_u64);
+ assert!(was_present);
+ }
+
+ #[test]
+ fn test_reset() {
+ let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+ filter.insert("test");
+ assert!(!filter.is_empty());
+
+ filter.reset();
+ assert!(filter.is_empty());
+ assert!(!filter.contains(&"test"));
+ }
+
+ #[test]
+ fn test_union() {
+ let mut f1 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ .seed(123)
+ .build();
+ let mut f2 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ .seed(123)
+ .build();
+
+ f1.insert("a");
+ f2.insert("b");
+
+ f1.union(&f2);
+ assert!(f1.contains(&"a"));
+ assert!(f1.contains(&"b"));
+ }
+
+ #[test]
+ fn test_intersect() {
+ let mut f1 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ .seed(123)
+ .build();
+ let mut f2 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ .seed(123)
+ .build();
+
+ f1.insert("a");
+ f1.insert("b");
+ f2.insert("b");
+ f2.insert("c");
+
+ f1.intersect(&f2);
+ assert!(f1.contains(&"b"));
+ }
+
+ #[test]
+ fn test_serialize_deserialize_empty() {
+ let filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+ let bytes = filter.serialize();
+ let restored = BloomFilter::deserialize(&bytes).unwrap();
+
+ assert_eq!(filter, restored);
+ }
+
+ #[test]
+ fn test_serialize_deserialize_with_data() {
+ let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();
+ filter.insert("test");
+ filter.insert(42_u64);
+
+ let bytes = filter.serialize();
+ let restored = BloomFilter::deserialize(&bytes).unwrap();
+
+ assert_eq!(filter, restored);
+ assert!(restored.contains(&"test"));
+ assert!(restored.contains(&42_u64));
+ }
+
+ #[test]
+ fn test_statistics() {
+ let mut filter = BloomFilterBuilder::with_size(1000, 5).build();
+ assert_eq!(filter.bits_used(), 0);
+ assert_eq!(filter.load_factor(), 0.0);
+
+ filter.insert("test");
+ assert!(filter.bits_used() > 0);
+ assert!(filter.load_factor() > 0.0);
+ assert!(filter.estimated_fpp() > 0.0);
+ }
+
+ #[test]
+ fn test_is_compatible() {
+ let f1 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ .seed(123)
+ .build();
+ let f2 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ .seed(123)
+ .build();
+ let f3 = BloomFilterBuilder::with_accuracy(100, 0.01)
+ .seed(456)
+ .build();
+
+ assert!(f1.is_compatible(&f2));
+ assert!(!f1.is_compatible(&f3));
+ }
+
+ #[test]
+ #[should_panic(expected = "max_items must be greater than 0")]
+ fn test_invalid_max_items() {
+ BloomFilterBuilder::with_accuracy(0, 0.01);
+ }
+
+ #[test]
+ #[should_panic(expected = "fpp must be between")]
+ fn test_invalid_fpp() {
+ BloomFilterBuilder::with_accuracy(100, 1.5);
+ }
+}
diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs
index acc051d..473ab9b 100644
--- a/datasketches/src/lib.rs
+++ b/datasketches/src/lib.rs
@@ -30,6 +30,7 @@
#[cfg(target_endian = "big")]
compile_error!("datasketches does not support big-endian targets");
+pub mod bloom;
pub mod countmin;
pub mod error;
pub mod frequencies;
diff --git a/datasketches/tests/bloom_serialization_test.rs
b/datasketches/tests/bloom_serialization_test.rs
new file mode 100644
index 0000000..5370f89
--- /dev/null
+++ b/datasketches/tests/bloom_serialization_test.rs
@@ -0,0 +1,231 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Bloom Filter Serialization Compatibility Tests
+//!
+//! These tests verify binary compatibility with Apache DataSketches
implementations:
+//! - Java (datasketches-java)
+//! - C++ (datasketches-cpp)
+//!
+//! Test data is generated by the reference implementations and stored in:
+//! `tests/serialization_test_data/`
+
+mod common;
+
+use std::fs;
+use std::path::PathBuf;
+
+use common::serialization_test_data;
+use datasketches::bloom::BloomFilter;
+
+fn test_bloom_filter_file(path: PathBuf, expected_num_items: u64,
expected_num_hashes: u16) {
+ let bytes = fs::read(&path).unwrap();
+ let filter1 = BloomFilter::deserialize(&bytes).unwrap();
+
+ // Verify basic properties
+ assert_eq!(
+ filter1.num_hashes(),
+ expected_num_hashes,
+ "Wrong num_hashes in {}",
+ path.display()
+ );
+
+ // Check empty state
+ if expected_num_items == 0 {
+ assert!(filter1.is_empty(), "Filter should be empty for n=0");
+ assert_eq!(
+ filter1.bits_used(),
+ 0,
+ "Empty filter should have 0 bits set"
+ );
+ } else {
+ assert!(
+ !filter1.is_empty(),
+ "Filter should not be empty for n={}",
+ expected_num_items
+ );
+ assert!(
+ filter1.bits_used() > 0,
+ "Non-empty filter should have bits set"
+ );
+ }
+
+ // Verify the items that were inserted (integers 0 to n/10-1)
+ // C++ code: for (uint64_t i = 0; i < n / 10; ++i) bf.update(i);
+ let num_inserted = expected_num_items / 10;
+
+ if num_inserted > 0 {
+ // Check a sample of inserted items
+ // For large n, we only check a sample to keep tests fast
+ let sample_size = std::cmp::min(num_inserted, 100);
+ let mut false_negatives = 0;
+
+ for i in 0..sample_size {
+ if !filter1.contains(&i) {
+ false_negatives += 1;
+ }
+ }
+
+ assert_eq!(
+ false_negatives,
+ 0,
+ "Found {} false negatives out of {} items in {}",
+ false_negatives,
+ sample_size,
+ path.display()
+ );
+ }
+
+ // Serialize and deserialize again to test round-trip
+ let serialized_bytes = filter1.serialize();
+ let filter2 =
BloomFilter::deserialize(&serialized_bytes).unwrap_or_else(|err| {
+ panic!(
+ "Deserialization failed after round-trip for {}: {}",
+ path.display(),
+ err
+ )
+ });
+
+ // Check that both filters are functionally equivalent
+ assert_eq!(
+ filter1.num_hashes(),
+ filter2.num_hashes(),
+ "num_hashes mismatch after round-trip for {}",
+ path.display()
+ );
+ assert_eq!(
+ filter1.capacity(),
+ filter2.capacity(),
+ "capacity mismatch after round-trip for {}",
+ path.display()
+ );
+ assert_eq!(
+ filter1.bits_used(),
+ filter2.bits_used(),
+ "bits_used mismatch after round-trip for {}",
+ path.display()
+ );
+
+ // Verify same items are present after round-trip
+ if num_inserted > 0 {
+ let sample_size = std::cmp::min(num_inserted, 100);
+ for i in 0..sample_size {
+ assert_eq!(
+ filter1.contains(&i),
+ filter2.contains(&i),
+ "Item {} presence differs after round-trip",
+ i
+ );
+ }
+ }
+}
+
+#[test]
+fn test_java_bloom_n0_h3() {
+ let path = serialization_test_data("java_generated_files",
"bf_n0_h3_java.sk");
+ test_bloom_filter_file(path, 0, 3);
+}
+
+#[test]
+fn test_java_bloom_n0_h5() {
+ let path = serialization_test_data("java_generated_files",
"bf_n0_h5_java.sk");
+ test_bloom_filter_file(path, 0, 5);
+}
+
+#[test]
+fn test_java_bloom_n10000_h3() {
+ let path = serialization_test_data("java_generated_files",
"bf_n10000_h3_java.sk");
+ test_bloom_filter_file(path, 10000, 3);
+}
+
+#[test]
+fn test_java_bloom_n10000_h5() {
+ let path = serialization_test_data("java_generated_files",
"bf_n10000_h5_java.sk");
+ test_bloom_filter_file(path, 10000, 5);
+}
+
+#[test]
+fn test_java_bloom_n2000000_h3() {
+ let path = serialization_test_data("java_generated_files",
"bf_n2000000_h3_java.sk");
+ test_bloom_filter_file(path, 2000000, 3);
+}
+
+#[test]
+fn test_java_bloom_n2000000_h5() {
+ let path = serialization_test_data("java_generated_files",
"bf_n2000000_h5_java.sk");
+ test_bloom_filter_file(path, 2000000, 5);
+}
+
+#[test]
+fn test_java_bloom_n30000000_h3() {
+ let path = serialization_test_data("java_generated_files",
"bf_n30000000_h3_java.sk");
+ test_bloom_filter_file(path, 30000000, 3);
+}
+
+#[test]
+fn test_java_bloom_n30000000_h5() {
+ let path = serialization_test_data("java_generated_files",
"bf_n30000000_h5_java.sk");
+ test_bloom_filter_file(path, 30000000, 5);
+}
+
+#[test]
+fn test_cpp_bloom_n0_h3() {
+ let path = serialization_test_data("cpp_generated_files",
"bf_n0_h3_cpp.sk");
+ test_bloom_filter_file(path, 0, 3);
+}
+
+#[test]
+fn test_cpp_bloom_n0_h5() {
+ let path = serialization_test_data("cpp_generated_files",
"bf_n0_h5_cpp.sk");
+ test_bloom_filter_file(path, 0, 5);
+}
+
+#[test]
+fn test_cpp_bloom_n10000_h3() {
+ let path = serialization_test_data("cpp_generated_files",
"bf_n10000_h3_cpp.sk");
+ test_bloom_filter_file(path, 10000, 3);
+}
+
+#[test]
+fn test_cpp_bloom_n10000_h5() {
+ let path = serialization_test_data("cpp_generated_files",
"bf_n10000_h5_cpp.sk");
+ test_bloom_filter_file(path, 10000, 5);
+}
+
+#[test]
+fn test_cpp_bloom_n2000000_h3() {
+ let path = serialization_test_data("cpp_generated_files",
"bf_n2000000_h3_cpp.sk");
+ test_bloom_filter_file(path, 2000000, 3);
+}
+
+#[test]
+fn test_cpp_bloom_n2000000_h5() {
+ let path = serialization_test_data("cpp_generated_files",
"bf_n2000000_h5_cpp.sk");
+ test_bloom_filter_file(path, 2000000, 5);
+}
+
+#[test]
+fn test_cpp_bloom_n30000000_h3() {
+ let path = serialization_test_data("cpp_generated_files",
"bf_n30000000_h3_cpp.sk");
+ test_bloom_filter_file(path, 30000000, 3);
+}
+
+#[test]
+fn test_cpp_bloom_n30000000_h5() {
+ let path = serialization_test_data("cpp_generated_files",
"bf_n30000000_h5_cpp.sk");
+ test_bloom_filter_file(path, 30000000, 5);
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]