This is an automated email from the ASF dual-hosted git repository. placave pushed a commit to branch bloomfilter in repository https://gitbox.apache.org/repos/asf/datasketches-go.git
commit 1c684a925366cceb61fb248a22c0234a25a5317d Author: Pierre Lacave <[email protected]> AuthorDate: Fri Dec 12 00:24:56 2025 +0100 Add a bloomfilter implementation to Go --- README.md | 2 +- filters/bit_array.go | 103 +++ filters/bit_array_test.go | 186 +++++ filters/bloom_filter.go | 600 +++++++++++++++ filters/bloom_filter_builder.go | 272 +++++++ filters/bloom_filter_serialization_test.go | 534 ++++++++++++++ filters/bloom_filter_test.go | 804 +++++++++++++++++++++ filters/preamble_utils.go | 148 ++++ go.mod | 2 + go.sum | 2 + internal/family.go | 7 +- .../bf_byte_array_n10000_h3_go.sk | Bin 0 -> 1288 bytes .../bf_double_array_n10000_h3_go.sk | Bin 0 -> 1288 bytes .../go_generated_files/bf_double_n10000_h3_go.sk | Bin 0 -> 1288 bytes .../bf_long_array_n10000_h3_go.sk | Bin 0 -> 1288 bytes .../go_generated_files/bf_n0_h3_go.sk | Bin 0 -> 24 bytes .../go_generated_files/bf_n0_h5_go.sk | Bin 0 -> 24 bytes .../go_generated_files/bf_n10000_h3_go.sk | Bin 0 -> 1288 bytes .../go_generated_files/bf_n10000_h5_go.sk | Bin 0 -> 1288 bytes .../go_generated_files/bf_n2000000_h3_go.sk | Bin 0 -> 250032 bytes .../go_generated_files/bf_n2000000_h5_go.sk | Bin 0 -> 250032 bytes .../go_generated_files/bf_string_n10000_h3_go.sk | Bin 0 -> 1288 bytes 22 files changed, 2658 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4f0f94d..4ca1a27 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ If you are interested in making contributions to this site please see our [Commu | | ReserviorItemsSketch<T> | ❌ | | | VarOptItemsSketch<T> | ❌ | | Membership | | | -| | BloomFilterSketch | ❌ | +| | BloomFilter | 🚧 | ## Specialty Sketches diff --git a/filters/bit_array.go b/filters/bit_array.go new file mode 100644 index 0000000..9bfcc52 --- /dev/null +++ b/filters/bit_array.go @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package filters + +import "math/bits" + +// getBit returns the value of the bit at the specified index. +func getBit(array []uint64, index uint64) bool { + longIdx := index >> 6 // divide by 64 + bitIdx := index & 0x3F // mod 64 + return (array[longIdx] & (1 << bitIdx)) != 0 +} + +// setBit sets the bit at the specified index to 1. +func setBit(array []uint64, index uint64) { + longIdx := index >> 6 + bitIdx := index & 0x3F + array[longIdx] |= (1 << bitIdx) +} + +// clearBit sets the bit at the specified index to 0. +func clearBit(array []uint64, index uint64) { + longIdx := index >> 6 + bitIdx := index & 0x3F + array[longIdx] &^= (1 << bitIdx) +} + +// assignBit sets the bit at the specified index to the given value. +func assignBit(array []uint64, index uint64, value bool) { + if value { + setBit(array, index) + } else { + clearBit(array, index) + } +} + +// getAndSetBit atomically gets the current bit value and sets it to 1. +// Returns true if the bit was already set, false if it was newly set. +func getAndSetBit(array []uint64, index uint64) bool { + longIdx := index >> 6 + bitIdx := index & 0x3F + mask := uint64(1) << bitIdx + wasSet := (array[longIdx] & mask) != 0 + array[longIdx] |= mask + return wasSet +} + +// countBitsSet counts the number of bits set to 1 in the array. +func countBitsSet(array []uint64) uint64 { + count := uint64(0) + for _, val := range array { + count += uint64(bits.OnesCount64(val)) + } + return count +} + +// unionWith performs a bitwise OR operation between target and source arrays. +// The result is stored in target. Returns the number of bits set in the result. +func unionWith(target, source []uint64) uint64 { + count := uint64(0) + for i := range target { + target[i] |= source[i] + count += uint64(bits.OnesCount64(target[i])) + } + return count +} + +// intersect performs a bitwise AND operation between target and source arrays. +// The result is stored in target. Returns the number of bits set in the result. +func intersect(target, source []uint64) uint64 { + count := uint64(0) + for i := range target { + target[i] &= source[i] + count += uint64(bits.OnesCount64(target[i])) + } + return count +} + +// invert performs a bitwise NOT operation on the array. +// Returns the number of bits set in the result. +func invert(array []uint64) uint64 { + count := uint64(0) + for i := range array { + array[i] = ^array[i] + count += uint64(bits.OnesCount64(array[i])) + } + return count +} diff --git a/filters/bit_array_test.go b/filters/bit_array_test.go new file mode 100644 index 0000000..7f1172d --- /dev/null +++ b/filters/bit_array_test.go @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package filters + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestBitArrayBasicOperations(t *testing.T) { + // Create array with 128 bits (2 longs) + array := make([]uint64, 2) + + // Test initial state - all bits should be 0 + assert.Equal(t, uint64(0), countBitsSet(array)) + assert.False(t, getBit(array, 0)) + assert.False(t, getBit(array, 63)) + assert.False(t, getBit(array, 64)) + assert.False(t, getBit(array, 127)) + + // Test setBit + setBit(array, 5) + assert.True(t, getBit(array, 5)) + assert.Equal(t, uint64(1), countBitsSet(array)) + + setBit(array, 65) + assert.True(t, getBit(array, 65)) + assert.Equal(t, uint64(2), countBitsSet(array)) + + // Test clearBit + clearBit(array, 5) + assert.False(t, getBit(array, 5)) + assert.Equal(t, uint64(1), countBitsSet(array)) + + // Test assignBit + assignBit(array, 10, true) + assert.True(t, getBit(array, 10)) + assert.Equal(t, uint64(2), countBitsSet(array)) + + assignBit(array, 10, false) + assert.False(t, getBit(array, 10)) + assert.Equal(t, uint64(1), countBitsSet(array)) + + // Test getAndSetBit + wasSet := getAndSetBit(array, 20) + assert.False(t, wasSet) // Was not set + assert.True(t, getBit(array, 20)) + assert.Equal(t, uint64(2), countBitsSet(array)) + + wasSet = getAndSetBit(array, 20) + assert.True(t, wasSet) // Was already set + assert.True(t, getBit(array, 20)) + assert.Equal(t, uint64(2), countBitsSet(array)) +} + +func TestBitArrayInversion(t *testing.T) { + // Create array with 128 bits + array := make([]uint64, 2) + + // Set some bits + setBit(array, 0) + setBit(array, 10) + setBit(array, 63) + setBit(array, 100) + assert.Equal(t, uint64(4), countBitsSet(array)) + + // Invert + count := invert(array) + assert.Equal(t, uint64(128-4), count) + assert.Equal(t, uint64(128-4), countBitsSet(array)) + + // Previously set bits should now be clear + assert.False(t, getBit(array, 0)) + assert.False(t, getBit(array, 10)) + assert.False(t, getBit(array, 63)) + assert.False(t, getBit(array, 100)) + + // Previously clear bits should now be set + assert.True(t, getBit(array, 1)) + assert.True(t, getBit(array, 50)) + assert.True(t, getBit(array, 64)) + assert.True(t, getBit(array, 127)) +} + +func TestBitArrayUnion(t *testing.T) { + // Create two arrays with 192 bits (3 longs) + array1 := make([]uint64, 3) + array2 := make([]uint64, 3) + array3 := make([]uint64, 3) + + // Array1: bits 0-9 + for i := uint64(0); i < 10; i++ { + setBit(array1, i) + } + + // Array2: bits 5-14 + for i := uint64(5); i < 15; i++ { + setBit(array2, i) + } + + // Array3: even bits 0-18 + for i := uint64(0); i < 19; i += 2 { + setBit(array3, i) + } + + // Union of array2 and array3 + count := unionWith(array2, array3) + // Array2 had bits 5-14 (10 bits) + // Array3 had even bits 0-18 (10 bits: 0,2,4,6,8,10,12,14,16,18) + // Union should have: 5,6,7,8,9,10,11,12,13,14 + 0,2,4,16,18 = 15 bits + assert.Equal(t, uint64(15), count) + assert.Equal(t, uint64(15), countBitsSet(array2)) +} + +func TestBitArrayIntersection(t *testing.T) { + // Create two arrays + array1 := make([]uint64, 3) + array2 := make([]uint64, 3) + + // Array1: bits 0-9 + for i := uint64(0); i < 10; i++ { + setBit(array1, i) + } + + // Array2: bits 5-14 + for i := uint64(5); i < 15; i++ { + setBit(array2, i) + } + + // Intersection + count := intersect(array1, array2) + // Overlap is bits 5-9 = 5 bits + assert.Equal(t, uint64(5), count) + assert.Equal(t, uint64(5), countBitsSet(array1)) + + // Verify the overlap bits + for i := uint64(5); i < 10; i++ { + assert.True(t, getBit(array1, i)) + } + // Verify non-overlap bits are cleared + for i := uint64(0); i < 5; i++ { + assert.False(t, getBit(array1, i)) + } +} + +func TestBitArrayBoundaries(t *testing.T) { + // Test bit operations at long boundaries + array := make([]uint64, 3) // 192 bits + + // Test at boundary of first and second long (bit 63-64) + setBit(array, 63) + setBit(array, 64) + assert.True(t, getBit(array, 63)) + assert.True(t, getBit(array, 64)) + + clearBit(array, 63) + assert.False(t, getBit(array, 63)) + assert.True(t, getBit(array, 64)) + + // Test at boundary of second and third long (bit 127-128) + setBit(array, 127) + setBit(array, 128) + assert.True(t, getBit(array, 127)) + assert.True(t, getBit(array, 128)) + + // Test last bit + setBit(array, 191) + assert.True(t, getBit(array, 191)) + assert.Equal(t, uint64(4), countBitsSet(array)) +} diff --git a/filters/bloom_filter.go b/filters/bloom_filter.go new file mode 100644 index 0000000..f562938 --- /dev/null +++ b/filters/bloom_filter.go @@ -0,0 +1,600 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package filters provides probabilistic membership data structures for efficient +// set membership testing with controlled false positive rates. +// +// The Bloom filter is a space-efficient probabilistic data structure that is used +// to test whether an element is a member of a set. False positive matches are +// possible, but false negatives are not. This implementation uses XXHash64 with +// Kirsch-Mitzenmacher double hashing optimization. +package filters + +import ( + "encoding/binary" + "fmt" + "math" + "math/bits" + + "github.com/cespare/xxhash/v2" +) + +// BloomFilter is a probabilistic data structure for set membership testing. +// It provides constant-time updates and queries with a configurable false +// positive rate. No false negatives are possible. +type BloomFilter interface { + // Update methods add items to the filter + UpdateUInt64(datum uint64) error + UpdateInt64(datum int64) error + UpdateString(datum string) error + UpdateSlice(datum []byte) error + UpdateFloat64(datum float64) error + UpdateInt64Array(data []int64) error + UpdateFloat64Array(data []float64) error + + // Query methods test membership + QueryUInt64(datum uint64) bool + QueryInt64(datum int64) bool + QueryString(datum string) bool + QuerySlice(datum []byte) bool + QueryFloat64(datum float64) bool + QueryInt64Array(data []int64) bool + QueryFloat64Array(data []float64) bool + + // QueryAndUpdate atomically queries and updates (test-and-set) + QueryAndUpdateUInt64(datum uint64) (bool, error) + QueryAndUpdateInt64(datum int64) (bool, error) + QueryAndUpdateString(datum string) (bool, error) + QueryAndUpdateSlice(datum []byte) (bool, error) + QueryAndUpdateFloat64(datum float64) (bool, error) + QueryAndUpdateInt64Array(data []int64) (bool, error) + QueryAndUpdateFloat64Array(data []float64) (bool, error) + + // Set operations + Union(other BloomFilter) error + Intersect(other BloomFilter) error + Invert() error + IsCompatible(other BloomFilter) bool + + // State queries + IsEmpty() bool + GetBitsUsed() uint64 + GetCapacity() uint64 + GetNumHashes() uint16 + GetSeed() uint64 + + // Serialization + ToCompactSlice() ([]byte, error) + Reset() error +} + +// bloomFilterImpl is the concrete implementation of BloomFilter. +type bloomFilterImpl struct { + seed uint64 + numHashes uint16 + isDirty bool + capacityBits uint64 + numBitsSet uint64 + bitArray []uint64 +} + +// IsEmpty returns true if no bits are set in the filter. +func (bf *bloomFilterImpl) IsEmpty() bool { + return bf.GetBitsUsed() == 0 +} + +// GetBitsUsed returns the number of bits currently set to 1. +// If the count is dirty, it will be recomputed. +func (bf *bloomFilterImpl) GetBitsUsed() uint64 { + if bf.isDirty { + bf.numBitsSet = countBitsSet(bf.bitArray) + bf.isDirty = false + } + return bf.numBitsSet +} + +// GetCapacity returns the total number of bits in the filter. +func (bf *bloomFilterImpl) GetCapacity() uint64 { + return bf.capacityBits +} + +// GetNumHashes returns the number of hash functions used. +func (bf *bloomFilterImpl) GetNumHashes() uint16 { + return bf.numHashes +} + +// GetSeed returns the hash seed used by the filter. +func (bf *bloomFilterImpl) GetSeed() uint64 { + return bf.seed +} + +// Reset clears all bits in the filter. +func (bf *bloomFilterImpl) Reset() error { + for i := range bf.bitArray { + bf.bitArray[i] = 0 + } + bf.numBitsSet = 0 + bf.isDirty = false + return nil +} + +// IsCompatible checks if two filters can be combined (union/intersection). +// Filters are compatible if they have the same seed, hash count, and capacity. +func (bf *bloomFilterImpl) IsCompatible(other BloomFilter) bool { + return bf.seed == other.GetSeed() && + bf.numHashes == other.GetNumHashes() && + bf.capacityBits == other.GetCapacity() +} + +// computeHashes computes two hash values using XXHash64 and Kirsch-Mitzenmacher approach. +func (bf *bloomFilterImpl) computeHashes(data []byte) (h0, h1 uint64) { + // Compute h0 with the filter's seed + h := xxhash.NewWithSeed(bf.seed) + h.Write(data) + h0 = h.Sum64() + + // Compute h1 using h0 as seed + h.Reset() + h = xxhash.NewWithSeed(h0) + h.Write(data) + h1 = h.Sum64() + return +} + +// hashLongOptimized implements the Java-compatible optimized hash for single long values. +// This matches the implementation in org.apache.datasketches.hash.XxHash64.hash(long, long). +func hashLongOptimized(value uint64, seed uint64) uint64 { + const ( + P1 = 0x9E3779B185EBCA87 + P2 = 0xC2B2AE3D27D4EB4F + P3 = 0x165667B19E3779F9 + P4 = 0x85EBCA77C2B2AE63 + P5 = 0x27D4EB2F165667C5 + ) + + hash := seed + P5 + hash += 8 // length in bytes + + k1 := value + k1 *= P2 + k1 = bits.RotateLeft64(k1, 31) + k1 *= P1 + hash ^= k1 + hash = (bits.RotateLeft64(hash, 27) * P1) + P4 + + // Finalize + hash ^= hash >> 33 + hash *= P2 + hash ^= hash >> 29 + hash *= P3 + hash ^= hash >> 32 + + return hash +} + +// computeHashesForLong computes hashes using the optimized single-long algorithm +// to match Java/C++ behavior for integer values. +func (bf *bloomFilterImpl) computeHashesForLong(value uint64) (h0, h1 uint64) { + h0 = hashLongOptimized(value, bf.seed) + h1 = hashLongOptimized(value, h0) + return +} + +// getHashIndex computes the i-th hash index using double hashing. +// Formula: g_i(x) = ((h0 + i * h1) >> 1) mod capacity +func (bf *bloomFilterImpl) getHashIndex(h0, h1 uint64, i uint16) uint64 { + return ((h0 + uint64(i)*h1) >> 1) % bf.capacityBits +} + +// internalUpdate updates the filter with pre-computed hash values. +func (bf *bloomFilterImpl) internalUpdate(h0, h1 uint64) { + for i := uint16(1); i <= bf.numHashes; i++ { + idx := bf.getHashIndex(h0, h1, i) + if !getBit(bf.bitArray, idx) { + setBit(bf.bitArray, idx) + bf.numBitsSet++ + } + } + bf.isDirty = true +} + +// internalQuery queries the filter with pre-computed hash values. +func (bf *bloomFilterImpl) internalQuery(h0, h1 uint64) bool { + if bf.IsEmpty() { + return false + } + for i := uint16(1); i <= bf.numHashes; i++ { + idx := bf.getHashIndex(h0, h1, i) + if !getBit(bf.bitArray, idx) { + return false + } + } + return true +} + +// internalQueryAndUpdate atomically queries and updates with pre-computed hash values. +// Returns true if the item was already present (all k bits were set). +func (bf *bloomFilterImpl) internalQueryAndUpdate(h0, h1 uint64) bool { + valueExists := true + newBitsSet := uint64(0) + for i := uint16(1); i <= bf.numHashes; i++ { + idx := bf.getHashIndex(h0, h1, i) + wasSet := getAndSetBit(bf.bitArray, idx) + valueExists = valueExists && wasSet + if !wasSet { + newBitsSet++ + } + } + bf.numBitsSet += newBitsSet + bf.isDirty = true + return valueExists +} + +// UpdateUInt64 adds a uint64 value to the filter. +func (bf *bloomFilterImpl) UpdateUInt64(datum uint64) error { + h0, h1 := bf.computeHashesForLong(datum) + bf.internalUpdate(h0, h1) + return nil +} + +// UpdateInt64 adds an int64 value to the filter. +func (bf *bloomFilterImpl) UpdateInt64(datum int64) error { + h0, h1 := bf.computeHashesForLong(uint64(datum)) + bf.internalUpdate(h0, h1) + return nil +} + +// UpdateString adds a string to the filter. +// Empty strings are ignored (no update performed). +func (bf *bloomFilterImpl) UpdateString(datum string) error { + if datum == "" { + return nil // Empty string - do nothing, matching Java behavior + } + return bf.UpdateSlice([]byte(datum)) +} + +// UpdateSlice adds a byte slice to the filter. +func (bf *bloomFilterImpl) UpdateSlice(datum []byte) error { + h0, h1 := bf.computeHashes(datum) + bf.internalUpdate(h0, h1) + return nil +} + +// UpdateFloat64 adds a float64 value to the filter. +// NaN values are canonicalized to match Java's Double.doubleToLongBits(). +func (bf *bloomFilterImpl) UpdateFloat64(datum float64) error { + var bits uint64 + if datum == 0.0 { + // Canonicalize -0.0 to 0.0 + bits = 0 + } else if math.IsNaN(datum) { + // Use Java's canonical NaN: 0x7ff8000000000000 + bits = 0x7ff8000000000000 + } else if math.IsInf(datum, 1) { + // Positive infinity + bits = 0x7ff0000000000000 + } else if math.IsInf(datum, -1) { + // Negative infinity + bits = 0xfff0000000000000 + } else { + bits = math.Float64bits(datum) + } + + // Java hashes as a single-element long array (8 bytes), not using optimized single-long hash + buf := make([]byte, 8) + binary.LittleEndian.PutUint64(buf, bits) + h0, h1 := bf.computeHashes(buf) + bf.internalUpdate(h0, h1) + return nil +} + +// QueryUInt64 tests if a uint64 value might be in the filter. +func (bf *bloomFilterImpl) QueryUInt64(datum uint64) bool { + h0, h1 := bf.computeHashesForLong(datum) + return bf.internalQuery(h0, h1) +} + +// QueryInt64 tests if an int64 value might be in the filter. +func (bf *bloomFilterImpl) QueryInt64(datum int64) bool { + h0, h1 := bf.computeHashesForLong(uint64(datum)) + return bf.internalQuery(h0, h1) +} + +// QueryString tests if a string might be in the filter. +// Empty strings always return false. +func (bf *bloomFilterImpl) QueryString(datum string) bool { + if datum == "" { + return false // Empty string - do nothing, matching Java behavior + } + return bf.QuerySlice([]byte(datum)) +} + +// QuerySlice tests if a byte slice might be in the filter. +func (bf *bloomFilterImpl) QuerySlice(datum []byte) bool { + h0, h1 := bf.computeHashes(datum) + return bf.internalQuery(h0, h1) +} + +// QueryFloat64 tests if a float64 value might be in the filter. +func (bf *bloomFilterImpl) QueryFloat64(datum float64) bool { + var bits uint64 + if datum == 0.0 { + // Canonicalize -0.0 to 0.0 + bits = 0 + } else if math.IsNaN(datum) { + // Use Java's canonical NaN: 0x7ff8000000000000 + bits = 0x7ff8000000000000 + } else if math.IsInf(datum, 1) { + // Positive infinity + bits = 0x7ff0000000000000 + } else if math.IsInf(datum, -1) { + // Negative infinity + bits = 0xfff0000000000000 + } else { + bits = math.Float64bits(datum) + } + + // Java hashes as a single-element long array (8 bytes), not using optimized single-long hash + buf := make([]byte, 8) + binary.LittleEndian.PutUint64(buf, bits) + h0, h1 := bf.computeHashes(buf) + return bf.internalQuery(h0, h1) +} + +// QueryAndUpdateUInt64 atomically queries and updates for a uint64 value. +// Returns true if the value was already present before the update. +func (bf *bloomFilterImpl) QueryAndUpdateUInt64(datum uint64) (bool, error) { + h0, h1 := bf.computeHashesForLong(datum) + return bf.internalQueryAndUpdate(h0, h1), nil +} + +// QueryAndUpdateInt64 atomically queries and updates for an int64 value. +func (bf *bloomFilterImpl) QueryAndUpdateInt64(datum int64) (bool, error) { + h0, h1 := bf.computeHashesForLong(uint64(datum)) + return bf.internalQueryAndUpdate(h0, h1), nil +} + +// QueryAndUpdateString atomically queries and updates for a string. +// Empty strings are ignored and return false. +func (bf *bloomFilterImpl) QueryAndUpdateString(datum string) (bool, error) { + if datum == "" { + return false, nil // Empty string - do nothing, matching Java behavior + } + return bf.QueryAndUpdateSlice([]byte(datum)) +} + +// QueryAndUpdateSlice atomically queries and updates for a byte slice. +func (bf *bloomFilterImpl) QueryAndUpdateSlice(datum []byte) (bool, error) { + h0, h1 := bf.computeHashes(datum) + return bf.internalQueryAndUpdate(h0, h1), nil +} + +// QueryAndUpdateFloat64 atomically queries and updates for a float64 value. +func (bf *bloomFilterImpl) QueryAndUpdateFloat64(datum float64) (bool, error) { + var bits uint64 + if datum == 0.0 { + // Canonicalize -0.0 to 0.0 + bits = 0 + } else if math.IsNaN(datum) { + // Use Java's canonical NaN: 0x7ff8000000000000 + bits = 0x7ff8000000000000 + } else if math.IsInf(datum, 1) { + // Positive infinity + bits = 0x7ff0000000000000 + } else if math.IsInf(datum, -1) { + // Negative infinity + bits = 0xfff0000000000000 + } else { + bits = math.Float64bits(datum) + } + + // Java hashes as a single-element long array (8 bytes), not using optimized single-long hash + buf := make([]byte, 8) + binary.LittleEndian.PutUint64(buf, bits) + h0, h1 := bf.computeHashes(buf) + return bf.internalQueryAndUpdate(h0, h1), nil +} + +// UpdateInt64Array adds an array of int64 values to the filter. +// The entire array is hashed as a single unit (not element-by-element). +// Nil or empty arrays are ignored. +func (bf *bloomFilterImpl) UpdateInt64Array(data []int64) error { + if len(data) == 0 { + return nil + } + + // Convert array to bytes (little-endian) + bytes := make([]byte, len(data)*8) + for i, val := range data { + binary.LittleEndian.PutUint64(bytes[i*8:], uint64(val)) + } + + h0, h1 := bf.computeHashes(bytes) + bf.internalUpdate(h0, h1) + return nil +} + +// UpdateFloat64Array adds an array of float64 values to the filter. +// The entire array is hashed as a single unit (not element-by-element). +// Nil or empty arrays are ignored. +func (bf *bloomFilterImpl) UpdateFloat64Array(data []float64) error { + if len(data) == 0 { + return nil + } + + // Convert array to bytes (little-endian) + bytes := make([]byte, len(data)*8) + for i, val := range data { + bits := math.Float64bits(val) + binary.LittleEndian.PutUint64(bytes[i*8:], bits) + } + + h0, h1 := bf.computeHashes(bytes) + bf.internalUpdate(h0, h1) + return nil +} + +// QueryInt64Array tests if an array of int64 values might be in the filter. +// The entire array is hashed as a single unit. +func (bf *bloomFilterImpl) QueryInt64Array(data []int64) bool { + if len(data) == 0 { + return false + } + + // Convert array to bytes (little-endian) + bytes := make([]byte, len(data)*8) + for i, val := range data { + binary.LittleEndian.PutUint64(bytes[i*8:], uint64(val)) + } + + h0, h1 := bf.computeHashes(bytes) + return bf.internalQuery(h0, h1) +} + +// QueryFloat64Array tests if an array of float64 values might be in the filter. +// The entire array is hashed as a single unit. +func (bf *bloomFilterImpl) QueryFloat64Array(data []float64) bool { + if len(data) == 0 { + return false + } + + // Convert array to bytes (little-endian) + bytes := make([]byte, len(data)*8) + for i, val := range data { + bits := math.Float64bits(val) + binary.LittleEndian.PutUint64(bytes[i*8:], bits) + } + + h0, h1 := bf.computeHashes(bytes) + return bf.internalQuery(h0, h1) +} + +// QueryAndUpdateInt64Array atomically queries and updates for an int64 array. +// Returns true if the array was already present before the update. +func (bf *bloomFilterImpl) QueryAndUpdateInt64Array(data []int64) (bool, error) { + if len(data) == 0 { + return false, nil + } + + // Convert array to bytes (little-endian) + bytes := make([]byte, len(data)*8) + for i, val := range data { + binary.LittleEndian.PutUint64(bytes[i*8:], uint64(val)) + } + + h0, h1 := bf.computeHashes(bytes) + return bf.internalQueryAndUpdate(h0, h1), nil +} + +// QueryAndUpdateFloat64Array atomically queries and updates for a float64 array. +// Returns true if the array was already present before the update. +func (bf *bloomFilterImpl) QueryAndUpdateFloat64Array(data []float64) (bool, error) { + if len(data) == 0 { + return false, nil + } + + // Convert array to bytes (little-endian) + bytes := make([]byte, len(data)*8) + for i, val := range data { + bits := math.Float64bits(val) + binary.LittleEndian.PutUint64(bytes[i*8:], bits) + } + + h0, h1 := bf.computeHashes(bytes) + return bf.internalQueryAndUpdate(h0, h1), nil +} + +// Union performs a bitwise OR operation with another filter. +// After union, this filter will contain items from both filters. +func (bf *bloomFilterImpl) Union(other BloomFilter) error { + if !bf.IsCompatible(other) { + return fmt.Errorf("cannot union incompatible bloom filters") + } + otherImpl, ok := other.(*bloomFilterImpl) + if !ok { + return fmt.Errorf("cannot union with non-standard bloom filter implementation") + } + bf.numBitsSet = unionWith(bf.bitArray, otherImpl.bitArray) + bf.isDirty = false + return nil +} + +// Intersect performs a bitwise AND operation with another filter. +// After intersection, this filter will only contain items present in both filters. +func (bf *bloomFilterImpl) Intersect(other BloomFilter) error { + if !bf.IsCompatible(other) { + return fmt.Errorf("cannot intersect incompatible bloom filters") + } + otherImpl, ok := other.(*bloomFilterImpl) + if !ok { + return fmt.Errorf("cannot intersect with non-standard bloom filter implementation") + } + bf.numBitsSet = intersect(bf.bitArray, otherImpl.bitArray) + bf.isDirty = false + return nil +} + +// Invert flips all bits in the filter. +// This inverts the notion of membership. +func (bf *bloomFilterImpl) Invert() error { + bf.numBitsSet = invert(bf.bitArray) + bf.isDirty = false + return nil +} + +// ToCompactSlice serializes the filter to a byte slice. +func (bf *bloomFilterImpl) ToCompactSlice() ([]byte, error) { + isEmpty := bf.IsEmpty() + var size int + if isEmpty { + size = preambleEmptyBytes + } else { + size = preambleBytes + len(bf.bitArray)*8 + } + + bytes := make([]byte, size) + + // Write preamble + if isEmpty { + insertPreambleLongs(bytes, preambleLongsEmpty) + } else { + insertPreambleLongs(bytes, preambleLongsStandard) + } + insertSerVer(bytes) + insertFamilyID(bytes) + + flags := uint8(0) + if isEmpty { + flags = setEmptyFlag(flags) + } + insertFlags(bytes, flags) + insertNumHashes(bytes, bf.numHashes) + insertSeed(bytes, bf.seed) + insertBitArrayLength(bytes, uint32(len(bf.bitArray))) + + if !isEmpty { + bitsUsed := bf.GetBitsUsed() + insertNumBitsSet(bytes, bitsUsed) + + // Write bit array + for i, val := range bf.bitArray { + binary.LittleEndian.PutUint64(bytes[bitArrayOffset+i*8:], val) + } + } + + return bytes, nil +} diff --git a/filters/bloom_filter_builder.go b/filters/bloom_filter_builder.go new file mode 100644 index 0000000..73fe651 --- /dev/null +++ b/filters/bloom_filter_builder.go @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package filters + +import ( + "crypto/rand" + "encoding/binary" + "fmt" + "math" +) + +// Default seed value for hash functions +const DefaultSeed = uint64(9001) + +// bloomFilterOptions holds optional parameters for filter construction. +type bloomFilterOptions struct { + seed uint64 +} + +// BloomFilterOption is a functional option for configuring a BloomFilter. +type BloomFilterOption func(*bloomFilterOptions) + +// WithSeed sets a custom seed for the hash functions. +func WithSeed(seed uint64) BloomFilterOption { + return func(opts *bloomFilterOptions) { + opts.seed = seed + } +} + +// NewBloomFilterBySize creates a new Bloom filter with explicit size parameters. +// +// Parameters: +// - numBits: The number of bits in the filter (will be rounded up to multiple of 64) +// - numHashes: The number of hash functions to use +// - opts: Optional configuration (seed) +// +// Returns an error if parameters are invalid. +func NewBloomFilterBySize(numBits uint64, numHashes uint16, opts ...BloomFilterOption) (BloomFilter, error) { + if numBits == 0 { + return nil, fmt.Errorf("numBits must be positive") + } + if numHashes == 0 { + return nil, fmt.Errorf("numHashes must be positive") + } + + // Check for overflow + maxBits := uint64(math.MaxInt32-32) * 64 + if numBits > maxBits { + return nil, fmt.Errorf("numBits exceeds maximum allowed size") + } + + // Apply options + options := &bloomFilterOptions{ + seed: DefaultSeed, + } + for _, opt := range opts { + opt(options) + } + + // Round capacity to multiple of 64 + capacityBits := roundCapacity(numBits) + numLongs := capacityBits / 64 + + return &bloomFilterImpl{ + seed: options.seed, + numHashes: numHashes, + isDirty: false, + capacityBits: capacityBits, + numBitsSet: 0, + bitArray: make([]uint64, numLongs), + }, nil +} + +// NewBloomFilterByAccuracy creates a new Bloom filter optimized for target accuracy. +// +// The filter is sized to achieve the specified false positive probability for the +// given number of expected items. The optimal number of bits and hash functions +// are calculated automatically. +// +// Parameters: +// - maxDistinctItems: Expected number of distinct items to be inserted +// - targetFpp: Target false positive probability (between 0 and 1) +// - opts: Optional configuration (seed) +// +// Returns an error if parameters are invalid. +func NewBloomFilterByAccuracy(maxDistinctItems uint64, targetFpp float64, opts ...BloomFilterOption) (BloomFilter, error) { + if maxDistinctItems == 0 { + return nil, fmt.Errorf("maxDistinctItems must be positive") + } + if targetFpp <= 0.0 || targetFpp >= 1.0 { + return nil, fmt.Errorf("targetFpp must be between 0 and 1") + } + + // Calculate optimal parameters + numBits := SuggestNumFilterBits(maxDistinctItems, targetFpp) + numHashes := SuggestNumHashesFromSize(maxDistinctItems, numBits) + + return NewBloomFilterBySize(numBits, numHashes, opts...) +} + +// NewBloomFilterWithDefault creates a new Bloom filter with default parameters. +// Suitable for approximately 10,000 items with 1% false positive rate. +func NewBloomFilterWithDefault() (BloomFilter, error) { + return NewBloomFilterByAccuracy(10000, 0.01) +} + +// SuggestNumFilterBits calculates the optimal number of bits for a Bloom filter. +// +// Formula: m = ceil(-n * ln(p) / (ln(2))^2) +// where n = number of items, p = target false positive probability +func SuggestNumFilterBits(maxDistinctItems uint64, targetFpp float64) uint64 { + n := float64(maxDistinctItems) + p := targetFpp + ln2 := math.Ln2 + + bits := -n * math.Log(p) / (ln2 * ln2) + return uint64(math.Ceil(bits)) +} + +// SuggestNumHashes calculates the optimal number of hash functions from target FPP. +// +// Formula: k = ceil(-ln(p) / ln(2)) +// where p = target false positive probability +func SuggestNumHashes(targetFpp float64) uint16 { + k := -math.Log(targetFpp) / math.Ln2 + return uint16(math.Ceil(k)) +} + +// SuggestNumHashesFromSize calculates optimal number of hash functions from filter size. +// +// Formula: k = ceil((m/n) * ln(2)) +// where m = number of bits, n = number of items +func SuggestNumHashesFromSize(maxDistinctItems, numFilterBits uint64) uint16 { + if maxDistinctItems == 0 { + return 1 + } + ratio := float64(numFilterBits) / float64(maxDistinctItems) + k := ratio * math.Ln2 + result := uint16(math.Ceil(k)) + if result == 0 { + return 1 + } + return result +} + +// GenerateRandomSeed generates a cryptographically random seed value. +func GenerateRandomSeed() (uint64, error) { + buf := make([]byte, 8) + _, err := rand.Read(buf) + if err != nil { + return 0, fmt.Errorf("failed to generate random seed: %w", err) + } + return binary.LittleEndian.Uint64(buf), nil +} + +// NewBloomFilterFromSlice deserializes a Bloom filter from a byte slice. +// +// The byte slice must contain a valid serialized Bloom filter in the +// DataSketches binary format. This format is compatible with C++ and Java +// implementations. +// +// Returns an error if the data is invalid or corrupted. +func NewBloomFilterFromSlice(bytes []byte) (BloomFilter, error) { + // Validate minimum size + if len(bytes) < preambleEmptyBytes { + return nil, fmt.Errorf("insufficient data: need at least %d bytes, got %d", preambleEmptyBytes, len(bytes)) + } + + // Extract and validate preamble fields + pLongs := extractPreambleLongs(bytes) + sVer := extractSerVer(bytes) + fID := extractFamilyID(bytes) + flags := extractFlags(bytes) + numHashes := extractNumHashes(bytes) + seed := extractSeed(bytes) + bitArrayLength := extractBitArrayLength(bytes) + + // Validate serialization version + if sVer != serVer { + return nil, fmt.Errorf("unsupported serialization version: %d (expected %d)", sVer, serVer) + } + + // Validate family ID + if fID != familyID { + return nil, fmt.Errorf("invalid family ID: %d (expected %d)", fID, familyID) + } + + // Validate preamble longs + isEmpty := isEmptyFlag(flags) + expectedPreambleLongs := uint8(preambleLongsEmpty) + if !isEmpty { + expectedPreambleLongs = uint8(preambleLongsStandard) + } + if pLongs != expectedPreambleLongs { + return nil, fmt.Errorf("invalid preamble longs: %d (expected %d for empty=%v)", pLongs, expectedPreambleLongs, isEmpty) + } + + // Validate numHashes + if numHashes == 0 { + return nil, fmt.Errorf("numHashes must be positive") + } + + // Calculate capacity + capacityBits := uint64(bitArrayLength) * 64 + + // Create filter structure + bf := &bloomFilterImpl{ + seed: seed, + numHashes: numHashes, + isDirty: false, + capacityBits: capacityBits, + numBitsSet: 0, + bitArray: make([]uint64, bitArrayLength), + } + + // Handle empty filter + if isEmpty { + if len(bytes) != preambleEmptyBytes { + return nil, fmt.Errorf("empty filter size mismatch: got %d bytes, expected %d", len(bytes), preambleEmptyBytes) + } + return bf, nil + } + + // Handle non-empty filter + expectedSize := preambleBytes + int(bitArrayLength)*8 + if len(bytes) != expectedSize { + return nil, fmt.Errorf("non-empty filter size mismatch: got %d bytes, expected %d", len(bytes), expectedSize) + } + + // Extract num bits set + numBitsSet := extractNumBitsSet(bytes) + if numBitsSet == dirtyBitsValue { + // Need to recount + bf.isDirty = true + } else { + bf.numBitsSet = numBitsSet + } + + // Read bit array + for i := 0; i < int(bitArrayLength); i++ { + offset := bitArrayOffset + i*8 + bf.bitArray[i] = binary.LittleEndian.Uint64(bytes[offset:]) + } + + // Recount if dirty + if bf.isDirty { + bf.numBitsSet = countBitsSet(bf.bitArray) + bf.isDirty = false + } + + return bf, nil +} + +// roundCapacity rounds the number of bits up to the nearest multiple of 64. +func roundCapacity(numBits uint64) uint64 { + return (numBits + 63) & ^uint64(63) +} diff --git a/filters/bloom_filter_serialization_test.go b/filters/bloom_filter_serialization_test.go new file mode 100644 index 0000000..3e82fe6 --- /dev/null +++ b/filters/bloom_filter_serialization_test.go @@ -0,0 +1,534 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package filters + +import ( + "crypto/md5" + "fmt" + "math" + "os" + "testing" + + "github.com/apache/datasketches-go/internal" + "github.com/stretchr/testify/assert" +) + +func TestGenerateGoBinariesForCompatibilityTesting(t *testing.T) { + +} + +func TestGoCompat(t *testing.T) { + if len(os.Getenv(internal.DSketchTestGenerateGo)) == 0 { + t.Skipf("%s not set", internal.DSketchTestGenerateGo) + } + + err := os.MkdirAll(internal.GoPath, os.ModePerm) + assert.NoError(t, err) + + t.Run("bloom filter", func(t *testing.T) { + testCases := []struct { + n int + numHashes uint16 + }{ + {0, 3}, + {0, 5}, + {10000, 3}, + {10000, 5}, + {2000000, 3}, + {2000000, 5}, + } + + for _, tc := range testCases { + // Match Java's configuration: configBits = max(n, 1000) + configBits := uint64(tc.n) + if configBits < 1000 { + configBits = 1000 + } + + // Generate random seed to match Java's approach + seed, err := GenerateRandomSeed() + assert.NoError(t, err) + + // Create filter + bf, err := NewBloomFilterBySize(configBits, tc.numHashes, WithSeed(seed)) + assert.NoError(t, err) + + // Insert items: n/10 items (0 to n/10-1), matching Java + numInserts := tc.n / 10 + for i := 0; i < numInserts; i++ { + err = bf.UpdateInt64(int64(i)) + assert.NoError(t, err) + } + + // If non-empty, also insert NaN (matching Java) + if tc.n > 0 { + err = bf.UpdateFloat64(math.NaN()) + assert.NoError(t, err) + } + + // Verify state + assert.Equal(t, tc.n == 0, bf.IsEmpty()) + if !bf.IsEmpty() { + assert.Greater(t, bf.GetBitsUsed(), uint64(numInserts)) + } + + // Serialize + data, err := bf.ToCompactSlice() + assert.NoError(t, err) + + // Write to file + filename := fmt.Sprintf("%s/bf_n%d_h%d_go.sk", internal.GoPath, tc.n, tc.numHashes) + err = os.WriteFile(filename, data, 0644) + assert.NoError(t, err) + } + }) + + t.Run("Specific Go", func(t *testing.T) { + n := 10000 + numInserts := n / 10 + numHashes := uint16(3) + configBits := uint64(n) + + // Generate for string type + t.Run("string_type", func(t *testing.T) { + seed, _ := GenerateRandomSeed() + bf, _ := NewBloomFilterBySize(configBits, numHashes, WithSeed(seed)) + + for i := 0; i < numInserts; i++ { + bf.UpdateString(fmt.Sprintf("%d", i)) + } + + data, _ := bf.ToCompactSlice() + filename := fmt.Sprintf("%s/bf_string_n%d_h%d_go.sk", internal.GoPath, n, numHashes) + os.WriteFile(filename, data, 0644) + t.Logf("Generated: %s", filename) + }) + + // Generate for double type + t.Run("double_type", func(t *testing.T) { + seed, _ := GenerateRandomSeed() + bf, _ := NewBloomFilterBySize(configBits, numHashes, WithSeed(seed)) + + for i := 0; i < numInserts; i++ { + bf.UpdateFloat64(float64(i)) + } + + data, _ := bf.ToCompactSlice() + filename := fmt.Sprintf("%s/bf_double_n%d_h%d_go.sk", internal.GoPath, n, numHashes) + os.WriteFile(filename, data, 0644) + t.Logf("Generated: %s", filename) + }) + + // Generate for long array type + t.Run("long_array_type", func(t *testing.T) { + seed, _ := GenerateRandomSeed() + bf, _ := NewBloomFilterBySize(configBits, numHashes, WithSeed(seed)) + + for i := 0; i < numInserts; i++ { + arr := []int64{int64(i), int64(i)} + bf.UpdateInt64Array(arr) + } + + data, _ := bf.ToCompactSlice() + filename := fmt.Sprintf("%s/bf_long_array_n%d_h%d_go.sk", internal.GoPath, n, numHashes) + os.WriteFile(filename, data, 0644) + t.Logf("Generated: %s", filename) + }) + + // Generate for double array type + t.Run("double_array_type", func(t *testing.T) { + seed, _ := GenerateRandomSeed() + bf, _ := NewBloomFilterBySize(configBits, numHashes, WithSeed(seed)) + + for i := 0; i < numInserts; i++ { + arr := []float64{float64(i), float64(i)} + bf.UpdateFloat64Array(arr) + } + + data, _ := bf.ToCompactSlice() + filename := fmt.Sprintf("%s/bf_double_array_n%d_h%d_go.sk", internal.GoPath, n, numHashes) + os.WriteFile(filename, data, 0644) + t.Logf("Generated: %s", filename) + }) + + // Generate for byte array type + t.Run("byte_array_type", func(t *testing.T) { + seed, _ := GenerateRandomSeed() + bf, _ := NewBloomFilterBySize(configBits, numHashes, WithSeed(seed)) + + for i := 0; i < numInserts; i++ { + b := byte(i % 256) + arr := []byte{b, b, b, b} + bf.UpdateSlice(arr) + } + + data, _ := bf.ToCompactSlice() + filename := fmt.Sprintf("%s/bf_byte_array_n%d_h%d_go.sk", internal.GoPath, n, numHashes) + os.WriteFile(filename, data, 0644) + t.Logf("Generated: %s", filename) + }) + }) +} + +func TestJavaCompat(t *testing.T) { + t.Run("bloom filter", func(t *testing.T) { + testCases := []struct { + n int + numHashes uint16 + }{ + {0, 3}, + {0, 5}, + {10000, 3}, + {10000, 5}, + {2000000, 3}, + {2000000, 5}, + } + + for _, tc := range testCases { + b, err := os.ReadFile(fmt.Sprintf("%s/bf_n%d_h%d_java.sk", internal.JavaPath, tc.n, tc.numHashes)) + assert.NoError(t, err) + + bf, err := NewBloomFilterFromSlice(b) + assert.NoError(t, err) + + // Verify basic properties + assert.Equal(t, tc.n == 0, bf.IsEmpty()) + assert.Equal(t, tc.numHashes, bf.GetNumHashes()) + + if tc.n > 0 { + // Verify bits used is reasonable + assert.Greater(t, bf.GetBitsUsed(), uint64(0)) + assert.Less(t, bf.GetBitsUsed(), bf.GetCapacity()) + + // Java inserts n/10 items (0 to n/10-1) + numInserted := tc.n / 10 + + // Verify ALL inserted items are found (no false negatives!) + for i := 0; i < numInserted; i++ { + assert.True(t, bf.QueryInt64(int64(i)), + "Item %d should be found in bf_n%d_h%d_java.sk", i, tc.n, tc.numHashes) + } + + // Verify NaN is found + assert.True(t, bf.QueryFloat64(math.NaN()), + "NaN should be found in bf_n%d_h%d_java.sk", tc.n, tc.numHashes) + + // Negative test: verify false positive behavior is reasonable + // Test items that were definitely NOT inserted + negativeTestItems := []int64{-1, -100, int64(numInserted), int64(numInserted + 1), int64(numInserted * 2)} + foundNegativeCount := 0 + for _, item := range negativeTestItems { + if bf.QueryInt64(item) { + foundNegativeCount++ + } + } + + // Should not find ALL negative items (that would indicate a bug) + assert.Less(t, foundNegativeCount, len(negativeTestItems), + "Should not find ALL non-inserted items (found %d/%d)", foundNegativeCount, len(negativeTestItems)) + + // Test false positive rate on a larger sample + falsePositiveCount := 0 + testRange := 1000 + startNegative := int64(numInserted + 1000) + for i := int64(0); i < int64(testRange); i++ { + if bf.QueryInt64(startNegative + i) { + falsePositiveCount++ + } + } + + fppRate := float64(falsePositiveCount) / float64(testRange) + assert.Less(t, fppRate, 0.5, + "False positive rate should be reasonable, got %.2f%%", fppRate*100) + } + } + }) + t.Run("Specific Java", func(t *testing.T) { + n := 10000 + numInserts := n / 10 // 1000 items + numHashes := uint16(3) + + testCases := []struct { + name string + filename string + testItems func(bf BloomFilter) // Function to test items + }{ + { + name: "long_type", + filename: "bf_n10000_h3_java.sk", // Already exists - standard test + testItems: func(bf BloomFilter) { + // Java inserted integers 0-999 + for i := 0; i < numInserts; i++ { + assert.True(t, bf.QueryInt64(int64(i)), + "Should find int64 value %d", i) + } + }, + }, + { + name: "string_type", + filename: "bf_string_n10000_h3_java.sk", + testItems: func(bf BloomFilter) { + // Java inserted strings "0", "1", ..., "999" + for i := 0; i < numInserts; i++ { + assert.True(t, bf.QueryString(fmt.Sprintf("%d", i)), + "Should find string '%d'", i) + } + }, + }, + { + name: "double_type", + filename: "bf_double_n10000_h3_java.sk", + testItems: func(bf BloomFilter) { + // Java inserted doubles 0.0, 1.0, ..., 999.0 + for i := 0; i < numInserts; i++ { + assert.True(t, bf.QueryFloat64(float64(i)), + "Should find double %f", float64(i)) + } + }, + }, + { + name: "long_array_type", + filename: "bf_long_array_n10000_h3_java.sk", + testItems: func(bf BloomFilter) { + // Java inserted long arrays [0,0], [1,1], ..., [999,999] + for i := 0; i < numInserts; i++ { + arr := []int64{int64(i), int64(i)} + assert.True(t, bf.QueryInt64Array(arr), + "Should find long array [%d,%d]", i, i) + } + }, + }, + { + name: "double_array_type", + filename: "bf_double_array_n10000_h3_java.sk", + testItems: func(bf BloomFilter) { + // Java inserted double arrays [0.0,0.0], [1.0,1.0], ..., [999.0,999.0] + for i := 0; i < numInserts; i++ { + arr := []float64{float64(i), float64(i)} + assert.True(t, bf.QueryFloat64Array(arr), + "Should find double array [%f,%f]", float64(i), float64(i)) + } + }, + }, + { + name: "byte_array_type", + filename: "bf_byte_array_n10000_h3_java.sk", + testItems: func(bf BloomFilter) { + // Java inserted byte arrays [i,i,i,i] for i=0-999 (mod 256) + for i := 0; i < numInserts; i++ { + b := byte(i % 256) + arr := []byte{b, b, b, b} + assert.True(t, bf.QuerySlice(arr), + "Should find byte array [%d,%d,%d,%d]", b, b, b, b) + } + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + filepath := fmt.Sprintf("%s/%s", internal.JavaPath, tc.filename) + + // Skip if file doesn't exist + if _, err := os.Stat(filepath); os.IsNotExist(err) { + t.Skipf("Java file not found: %s (needs to be generated)", tc.filename) + return + } + + // Read Java file + data, err := os.ReadFile(filepath) + assert.NoError(t, err) + + // Deserialize + bf, err := NewBloomFilterFromSlice(data) + assert.NoError(t, err) + assert.False(t, bf.IsEmpty()) + assert.Equal(t, numHashes, bf.GetNumHashes()) + + // Test all items of this type + tc.testItems(bf) + + // Compute MD5 for reference + hash := md5.Sum(data) + t.Logf("✅ Java %s: %d items verified, MD5=%x", tc.name, numInserts, hash) + }) + } + }) +} + +func TestCPPCompat(t *testing.T) { + t.Run("bloom filter", func(t *testing.T) { + testCases := []struct { + n int + numHashes uint16 + }{ + {0, 3}, + {0, 5}, + {10000, 3}, + {10000, 5}, + {2000000, 3}, + {2000000, 5}, + } + + for _, tc := range testCases { + filename := fmt.Sprintf("%s/bf_n%d_h%d_cpp.sk", internal.CppPath, tc.n, tc.numHashes) + + // Skip if file doesn't exist + if _, err := os.Stat(filename); os.IsNotExist(err) { + t.Skipf("C++ file not found: %s", filename) + return + } + + b, err := os.ReadFile(filename) + assert.NoError(t, err) + + bf, err := NewBloomFilterFromSlice(b) + assert.NoError(t, err) + + // Verify basic properties + assert.Equal(t, tc.n == 0, bf.IsEmpty()) + assert.Equal(t, tc.numHashes, bf.GetNumHashes()) + + if tc.n > 0 { + // Verify bits used is reasonable + assert.Greater(t, bf.GetBitsUsed(), uint64(0)) + assert.Less(t, bf.GetBitsUsed(), bf.GetCapacity()) + + // C++ inserts n/10 items (0 to n/10-1) + numInserted := tc.n / 10 + + // Verify ALL inserted items are found (no false negatives!) + for i := 0; i < numInserted; i++ { + assert.True(t, bf.QueryInt64(int64(i)), + "Item %d should be found in bf_n%d_h%d_cpp.sk", i, tc.n, tc.numHashes) + } + + // Verify NaN is found + assert.True(t, bf.QueryFloat64(math.NaN()), + "NaN should be found in bf_n%d_h%d_cpp.sk", tc.n, tc.numHashes) + + // Negative test: verify false positive behavior is reasonable + // Test items that were definitely NOT inserted + negativeTestItems := []int64{-1, -100, int64(numInserted), int64(numInserted + 1), int64(numInserted * 2)} + foundNegativeCount := 0 + for _, item := range negativeTestItems { + if bf.QueryInt64(item) { + foundNegativeCount++ + } + } + + // Should not find ALL negative items (that would indicate a bug) + assert.Less(t, foundNegativeCount, len(negativeTestItems), + "Should not find ALL non-inserted items (found %d/%d)", foundNegativeCount, len(negativeTestItems)) + + // Test false positive rate on a larger sample + falsePositiveCount := 0 + testRange := 1000 + startNegative := int64(numInserted + 1000) + for i := int64(0); i < int64(testRange); i++ { + if bf.QueryInt64(startNegative + i) { + falsePositiveCount++ + } + } + + fppRate := float64(falsePositiveCount) / float64(testRange) + assert.Less(t, fppRate, 0.5, + "False positive rate should be reasonable, got %.2f%%", fppRate*100) + } + } + }) + + t.Run("Specific Cpp", func(t *testing.T) { + n := 10000 + numInserts := n / 10 // 1000 items + numHashes := uint16(3) + + testCases := []struct { + name string + filename string + testItems func(bf BloomFilter) + }{ + { + name: "long_type", + filename: "bf_n10000_h3_cpp.sk", // Standard test + testItems: func(bf BloomFilter) { + for i := 0; i < numInserts; i++ { + assert.True(t, bf.QueryInt64(int64(i))) + } + }, + }, + { + name: "string_type", + filename: "bf_string_n10000_h3_cpp.sk", + testItems: func(bf BloomFilter) { + for i := 0; i < numInserts; i++ { + assert.True(t, bf.QueryString(fmt.Sprintf("%d", i))) + } + }, + }, + { + name: "double_type", + filename: "bf_double_n10000_h3_cpp.sk", + testItems: func(bf BloomFilter) { + for i := 0; i < numInserts; i++ { + assert.True(t, bf.QueryFloat64(float64(i))) + } + }, + }, + { + name: "byte_array_type", + filename: "bf_byte_array_n10000_h3_cpp.sk", + testItems: func(bf BloomFilter) { + for i := 0; i < numInserts; i++ { + b := byte(i % 256) + arr := []byte{b, b, b, b} + assert.True(t, bf.QuerySlice(arr)) + } + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + filepath := fmt.Sprintf("%s/%s", internal.CppPath, tc.filename) + + // Skip if file doesn't exist + if _, err := os.Stat(filepath); os.IsNotExist(err) { + t.Skipf("C++ file not found: %s (needs to be generated)", tc.filename) + return + } + + // Read C++ file + data, err := os.ReadFile(filepath) + assert.NoError(t, err) + + // Deserialize + bf, err := NewBloomFilterFromSlice(data) + assert.NoError(t, err) + assert.False(t, bf.IsEmpty()) + assert.Equal(t, numHashes, bf.GetNumHashes()) + + // Test all items + tc.testItems(bf) + + hash := md5.Sum(data) + t.Logf("✅ C++ %s: %d items verified, MD5=%x", tc.name, numInserts, hash) + }) + } + }) +} diff --git a/filters/bloom_filter_test.go b/filters/bloom_filter_test.go new file mode 100644 index 0000000..97ac50c --- /dev/null +++ b/filters/bloom_filter_test.go @@ -0,0 +1,804 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package filters + +import ( + "encoding/binary" + "math" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestInvalidConstructorArguments(t *testing.T) { + // numBits = 0 + _, err := NewBloomFilterBySize(0, 3) + assert.Error(t, err) + + // numHashes = 0 + _, err = NewBloomFilterBySize(64, 0) + assert.Error(t, err) + + // numBits too large (overflow) + _, err = NewBloomFilterBySize(1<<60, 3) + assert.Error(t, err) + + // Invalid FPP + _, err = NewBloomFilterByAccuracy(1000, 0.0) + assert.Error(t, err) + + _, err = NewBloomFilterByAccuracy(1000, 1.0) + assert.Error(t, err) + + // Zero max items + _, err = NewBloomFilterByAccuracy(0, 0.01) + assert.Error(t, err) +} + +func TestStandardConstructors(t *testing.T) { + numItems := uint64(5000) + targetFpp := 0.01 + + // Create by accuracy + bf1, err := NewBloomFilterByAccuracy(numItems, targetFpp) + assert.NoError(t, err) + assert.NotNil(t, bf1) + + // Verify capacity is rounded to multiple of 64 + capacity1 := bf1.GetCapacity() + assert.Equal(t, uint64(0), capacity1%64) + + // Create by size with same parameters + numBits := SuggestNumFilterBits(numItems, targetFpp) + numHashes := SuggestNumHashesFromSize(numItems, numBits) + bf2, err := NewBloomFilterBySize(numBits, numHashes) + assert.NoError(t, err) + + // Both should have same capacity and hash count + assert.Equal(t, bf1.GetCapacity(), bf2.GetCapacity()) + assert.Equal(t, bf1.GetNumHashes(), bf2.GetNumHashes()) + + // Both should be empty + assert.True(t, bf1.IsEmpty()) + assert.True(t, bf2.IsEmpty()) + assert.Equal(t, uint64(0), bf1.GetBitsUsed()) + assert.Equal(t, uint64(0), bf2.GetBitsUsed()) +} + +func TestBasicOperations(t *testing.T) { + numItems := uint64(5000) + targetFpp := 0.01 + seed := uint64(12345) + + bf, err := NewBloomFilterByAccuracy(numItems, targetFpp, WithSeed(seed)) + assert.NoError(t, err) + assert.Equal(t, seed, bf.GetSeed()) + + // Initially empty + assert.True(t, bf.IsEmpty()) + + // Insert items + for i := uint64(0); i < numItems; i++ { + err = bf.UpdateUInt64(i) + assert.NoError(t, err) + } + + // No longer empty + assert.False(t, bf.IsEmpty()) + + // Check bits used is reasonable (should be around 50% for optimal parameters) + bitsUsed := bf.GetBitsUsed() + capacity := bf.GetCapacity() + utilizationPercent := float64(bitsUsed) * 100.0 / float64(capacity) + assert.Greater(t, utilizationPercent, 30.0) + assert.Less(t, utilizationPercent, 70.0) + + // All inserted items should be found + for i := uint64(0); i < numItems; i++ { + assert.True(t, bf.QueryUInt64(i), "Item %d should be found", i) + } + + // Count false positives on non-inserted items + falsePositives := 0 + testSize := 10000 + for i := numItems; i < numItems+uint64(testSize); i++ { + if bf.QueryUInt64(i) { + falsePositives++ + } + } + + // False positive rate should be within reasonable bounds of target + actualFpp := float64(falsePositives) / float64(testSize) + // Allow up to 3x the target FPP (probabilistic structure) + assert.Less(t, actualFpp, targetFpp*3.0, "Actual FPP: %.4f, Target: %.4f", actualFpp, targetFpp) + + // Test Reset + err = bf.Reset() + assert.NoError(t, err) + assert.True(t, bf.IsEmpty()) + assert.Equal(t, uint64(0), bf.GetBitsUsed()) +} + +func TestInversion(t *testing.T) { + bf, err := NewBloomFilterBySize(256, 5, WithSeed(42)) + assert.NoError(t, err) + + // Insert some items + for i := uint64(0); i < 100; i++ { + bf.UpdateUInt64(i) + } + + bitsUsedBefore := bf.GetBitsUsed() + capacity := bf.GetCapacity() + + // Count items that appear present before inversion + presentBefore := 0 + for i := uint64(0); i < 100; i++ { + if bf.QueryUInt64(i) { + presentBefore++ + } + } + assert.Equal(t, 100, presentBefore) + + // Invert + err = bf.Invert() + assert.NoError(t, err) + + // Bits used should be inverted + bitsUsedAfter := bf.GetBitsUsed() + assert.Equal(t, capacity-bitsUsedBefore, bitsUsedAfter) + + // Most original items should now not be present + stillPresent := 0 + for i := uint64(0); i < 100; i++ { + if bf.QueryUInt64(i) { + stillPresent++ + } + } + assert.Less(t, stillPresent, 10) // Very few should still appear present +} + +func TestIncompatibleSetOperations(t *testing.T) { + bf1, _ := NewBloomFilterBySize(256, 5, WithSeed(42)) + + // Different num_bits + bf2, _ := NewBloomFilterBySize(512, 5, WithSeed(42)) + assert.False(t, bf1.IsCompatible(bf2)) + assert.Error(t, bf1.Union(bf2)) + assert.Error(t, bf1.Intersect(bf2)) + + // Different num_hashes + bf3, _ := NewBloomFilterBySize(256, 7, WithSeed(42)) + assert.False(t, bf1.IsCompatible(bf3)) + assert.Error(t, bf1.Union(bf3)) + + // Different seed + bf4, _ := NewBloomFilterBySize(256, 5, WithSeed(99)) + assert.False(t, bf1.IsCompatible(bf4)) + assert.Error(t, bf1.Intersect(bf4)) +} + +func TestBasicUnion(t *testing.T) { + n := uint64(1000) + bf1, _ := NewBloomFilterBySize(12288, 4, WithSeed(123)) + bf2, _ := NewBloomFilterBySize(12288, 4, WithSeed(123)) + + // bf1: items 0 to n-1 + for i := uint64(0); i < n; i++ { + bf1.UpdateUInt64(i) + } + + // bf2: items n/2 to 3n/2-1 (overlap in middle) + for i := n / 2; i < 3*n/2; i++ { + bf2.UpdateUInt64(i) + } + + // Union bf2 into bf1 + err := bf1.Union(bf2) + assert.NoError(t, err) + + // All items from 0 to 3n/2-1 should be present + for i := uint64(0); i < 3*n/2; i++ { + assert.True(t, bf1.QueryUInt64(i), "Item %d should be in union", i) + } + + // Count false positives on items beyond range + falsePositives := 0 + testRange := 1000 + for i := 3 * n / 2; i < 3*n/2+uint64(testRange); i++ { + if bf1.QueryUInt64(i) { + falsePositives++ + } + } + + fppRate := float64(falsePositives) / float64(testRange) + assert.Less(t, fppRate, 0.20) // Should be reasonable +} + +func TestBasicIntersection(t *testing.T) { + n := uint64(1000) + bf1, _ := NewBloomFilterBySize(12288, 4, WithSeed(456)) + bf2, _ := NewBloomFilterBySize(12288, 4, WithSeed(456)) + + // bf1: items 0 to n-1 + for i := uint64(0); i < n; i++ { + bf1.UpdateUInt64(i) + } + + // bf2: items n/2 to 3n/2-1 + for i := n / 2; i < 3*n/2; i++ { + bf2.UpdateUInt64(i) + } + + // Intersect + err := bf1.Intersect(bf2) + assert.NoError(t, err) + + // Items in overlap (n/2 to n-1) should be present + for i := n / 2; i < n; i++ { + assert.True(t, bf1.QueryUInt64(i), "Item %d should be in intersection", i) + } + + // Items outside overlap should mostly not be present + presentOutside := 0 + for i := uint64(0); i < n/2; i++ { + if bf1.QueryUInt64(i) { + presentOutside++ + } + } + // Allow some false positives + assert.Less(t, presentOutside, int(n/10)) +} + +func TestQueryAndUpdate(t *testing.T) { + bf, _ := NewBloomFilterBySize(256, 5, WithSeed(789)) + + // First call should return false (not present) + wasPresent, err := bf.QueryAndUpdateUInt64(42) + assert.NoError(t, err) + assert.False(t, wasPresent) + + // Second call should return true (now present) + wasPresent, err = bf.QueryAndUpdateUInt64(42) + assert.NoError(t, err) + assert.True(t, wasPresent) + + // Regular query should also return true + assert.True(t, bf.QueryUInt64(42)) +} + +func TestMultipleDataTypes(t *testing.T) { + bf, _ := NewBloomFilterBySize(512, 7) + + // Test int64 + bf.UpdateInt64(-123) + assert.True(t, bf.QueryInt64(-123)) + assert.False(t, bf.QueryInt64(-124)) + + // Test string + bf.UpdateString("hello world") + assert.True(t, bf.QueryString("hello world")) + assert.False(t, bf.QueryString("hello")) + + // Test byte slice + data := []byte{1, 2, 3, 4, 5} + bf.UpdateSlice(data) + assert.True(t, bf.QuerySlice(data)) + assert.False(t, bf.QuerySlice([]byte{1, 2, 3})) + + // Test float64 + bf.UpdateFloat64(3.14159) + assert.True(t, bf.QueryFloat64(3.14159)) + assert.False(t, bf.QueryFloat64(2.71828)) + + // Test NaN handling (NaN should be canonicalized) + bf.UpdateFloat64(math.NaN()) + assert.True(t, bf.QueryFloat64(math.NaN())) + + // Test -0.0 and 0.0 are treated the same + bf.UpdateFloat64(0.0) + assert.True(t, bf.QueryFloat64(-0.0)) + assert.True(t, bf.QueryFloat64(0.0)) +} + +func TestSerializationRoundtrip(t *testing.T) { + bf, _ := NewBloomFilterBySize(256, 5, WithSeed(999)) + + // Insert some items + for i := uint64(0); i < 50; i++ { + bf.UpdateUInt64(i) + } + + // Serialize + bytes, err := bf.ToCompactSlice() + assert.NoError(t, err) + assert.NotNil(t, bytes) + + // Verify we can deserialize and properties match + bf2, err := NewBloomFilterFromSlice(bytes) + assert.NoError(t, err) + assert.Equal(t, bf.GetSeed(), bf2.GetSeed()) + assert.Equal(t, bf.GetNumHashes(), bf2.GetNumHashes()) + assert.Equal(t, bf.GetCapacity(), bf2.GetCapacity()) + assert.Equal(t, bf.GetBitsUsed(), bf2.GetBitsUsed()) +} + +func TestEmptySerializationFormat(t *testing.T) { + bf, _ := NewBloomFilterBySize(256, 5, WithSeed(111)) + + // Serialize empty filter + bytes, err := bf.ToCompactSlice() + assert.NoError(t, err) + + // Empty filter should have shorter serialization (24 bytes) + assert.Equal(t, 24, len(bytes)) + + // Should be able to deserialize + bf2, err := NewBloomFilterFromSlice(bytes) + assert.NoError(t, err) + assert.True(t, bf2.IsEmpty()) + assert.Equal(t, bf.GetSeed(), bf2.GetSeed()) + assert.Equal(t, bf.GetNumHashes(), bf2.GetNumHashes()) +} + +func TestSuggestFunctions(t *testing.T) { + // Test SuggestNumFilterBits + bits := SuggestNumFilterBits(10000, 0.01) + assert.Greater(t, bits, uint64(0)) + + // For 10000 items at 1% FPP, should be around 95850 bits + assert.InDelta(t, 95850, bits, 1000) + + // Test SuggestNumHashes + hashes := SuggestNumHashes(0.01) + assert.Greater(t, hashes, uint16(0)) + // For 1% FPP, should be around 7 hashes + assert.InDelta(t, 7, hashes, 1) + + // Test SuggestNumHashesFromSize + hashes2 := SuggestNumHashesFromSize(10000, bits) + assert.Greater(t, hashes2, uint16(0)) + // Should also be around 7 + assert.InDelta(t, 7, hashes2, 1) +} + +func TestCapacityRounding(t *testing.T) { + // Test that capacity is always rounded to multiple of 64 + testCases := []struct { + input uint64 + expected uint64 + }{ + {1, 64}, + {63, 64}, + {64, 64}, + {65, 128}, + {127, 128}, + {128, 128}, + {129, 192}, + {1000, 1024}, + } + + for _, tc := range testCases { + result := roundCapacity(tc.input) + assert.Equal(t, tc.expected, result, "roundCapacity(%d) should be %d", tc.input, tc.expected) + assert.Equal(t, uint64(0), result%64, "Result should be multiple of 64") + } +} + +func TestDeserializationRoundtrip(t *testing.T) { + // Create and populate a filter + bf1, err := NewBloomFilterBySize(512, 7, WithSeed(12345)) + assert.NoError(t, err) + + for i := uint64(0); i < 100; i++ { + bf1.UpdateUInt64(i) + } + + // Serialize + bytes, err := bf1.ToCompactSlice() + assert.NoError(t, err) + + // Deserialize + bf2, err := NewBloomFilterFromSlice(bytes) + assert.NoError(t, err) + + // Verify properties match + assert.Equal(t, bf1.GetSeed(), bf2.GetSeed()) + assert.Equal(t, bf1.GetNumHashes(), bf2.GetNumHashes()) + assert.Equal(t, bf1.GetCapacity(), bf2.GetCapacity()) + assert.Equal(t, bf1.GetBitsUsed(), bf2.GetBitsUsed()) + assert.Equal(t, bf1.IsEmpty(), bf2.IsEmpty()) + + // Verify all inserted items are found + for i := uint64(0); i < 100; i++ { + assert.True(t, bf2.QueryUInt64(i), "Item %d should be found after deserialization", i) + } + + // Verify queries match + for i := uint64(100); i < 200; i++ { + assert.Equal(t, bf1.QueryUInt64(i), bf2.QueryUInt64(i), "Query results should match for item %d", i) + } +} + +func TestDeserializeEmptyFilter(t *testing.T) { + // Create empty filter + bf1, err := NewBloomFilterBySize(256, 5, WithSeed(999)) + assert.NoError(t, err) + + // Serialize + bytes, err := bf1.ToCompactSlice() + assert.NoError(t, err) + assert.Equal(t, 24, len(bytes)) // Empty filter is 24 bytes + + // Deserialize + bf2, err := NewBloomFilterFromSlice(bytes) + assert.NoError(t, err) + + // Verify properties + assert.True(t, bf2.IsEmpty()) + assert.Equal(t, uint64(0), bf2.GetBitsUsed()) + assert.Equal(t, bf1.GetSeed(), bf2.GetSeed()) + assert.Equal(t, bf1.GetNumHashes(), bf2.GetNumHashes()) + assert.Equal(t, bf1.GetCapacity(), bf2.GetCapacity()) +} + +func TestDeserializeInvalidData(t *testing.T) { + // Too small + _, err := NewBloomFilterFromSlice([]byte{1, 2, 3}) + assert.Error(t, err) + assert.Contains(t, err.Error(), "insufficient data") + + // Wrong serialization version + bytes := make([]byte, 24) + insertPreambleLongs(bytes, preambleLongsEmpty) + insertSerVer(bytes) + bytes[serVerOffset] = 99 // Invalid version + insertFamilyID(bytes) + insertFlags(bytes, setEmptyFlag(0)) + _, err = NewBloomFilterFromSlice(bytes) + assert.Error(t, err) + assert.Contains(t, err.Error(), "unsupported serialization version") + + // Wrong family ID + bytes = make([]byte, 24) + insertPreambleLongs(bytes, preambleLongsEmpty) + insertSerVer(bytes) + bytes[familyIDOffset] = 99 // Invalid family + insertFlags(bytes, setEmptyFlag(0)) + _, err = NewBloomFilterFromSlice(bytes) + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid family ID") + + // Zero numHashes + bytes = make([]byte, 24) + insertPreambleLongs(bytes, preambleLongsEmpty) + insertSerVer(bytes) + insertFamilyID(bytes) + insertFlags(bytes, setEmptyFlag(0)) + insertNumHashes(bytes, 0) + _, err = NewBloomFilterFromSlice(bytes) + assert.Error(t, err) + assert.Contains(t, err.Error(), "numHashes must be positive") +} + +func TestDeserializeWithDirtyBits(t *testing.T) { + // Create and populate a filter + bf1, err := NewBloomFilterBySize(256, 5, WithSeed(777)) + assert.NoError(t, err) + + for i := uint64(0); i < 50; i++ { + bf1.UpdateUInt64(i) + } + + // Serialize + bytes, err := bf1.ToCompactSlice() + assert.NoError(t, err) + + // Manually set numBitsSet to dirty value (0xFFFFFFFFFFFFFFFF) + insertNumBitsSet(bytes, dirtyBitsValue) + + // Deserialize - should recount bits + bf2, err := NewBloomFilterFromSlice(bytes) + assert.NoError(t, err) + + // Should have correct bit count (recalculated) + assert.Equal(t, bf1.GetBitsUsed(), bf2.GetBitsUsed()) + assert.Greater(t, bf2.GetBitsUsed(), uint64(0)) +} + +func TestSerializeDeserializeConsistency(t *testing.T) { + // Test multiple serialize-deserialize cycles + bf, err := NewBloomFilterBySize(512, 7, WithSeed(42)) + assert.NoError(t, err) + + for i := uint64(0); i < 100; i++ { + bf.UpdateUInt64(i) + } + + // First cycle + bytes1, err := bf.ToCompactSlice() + assert.NoError(t, err) + + bf2, err := NewBloomFilterFromSlice(bytes1) + assert.NoError(t, err) + + // Second cycle + bytes2, err := bf2.ToCompactSlice() + assert.NoError(t, err) + + // Bytes should be identical + assert.Equal(t, bytes1, bytes2) + + // Third cycle + bf3, err := NewBloomFilterFromSlice(bytes2) + assert.NoError(t, err) + + bytes3, err := bf3.ToCompactSlice() + assert.NoError(t, err) + + // Still identical + assert.Equal(t, bytes1, bytes3) +} + +func TestHashFunctionConsistency(t *testing.T) { + // Test that hash function produces consistent results + seed := uint64(12345) + bf, err := NewBloomFilterBySize(1024, 5, WithSeed(seed)) + assert.NoError(t, err) + + // Insert an item + err = bf.UpdateInt64(42) + assert.NoError(t, err) + + // Query it back immediately + assert.True(t, bf.QueryInt64(42), "Item should be found immediately after insertion") + + // Serialize and deserialize + bytes, err := bf.ToCompactSlice() + assert.NoError(t, err) + + bf2, err := NewBloomFilterFromSlice(bytes) + assert.NoError(t, err) + + // Still found after deserialization + assert.True(t, bf2.QueryInt64(42), "Item should be found after deserialization") + + // Create a new filter with the same seed + bf3, err := NewBloomFilterBySize(1024, 5, WithSeed(seed)) + assert.NoError(t, err) + bf3.UpdateInt64(42) + + // Should produce identical serialization + bytes3, err := bf3.ToCompactSlice() + assert.NoError(t, err) + assert.Equal(t, bytes, bytes3, "Same seed and items should produce identical binary output") +} + +func TestArrayMethods(t *testing.T) { + seed := uint64(99999) + + // Test int64 arrays + intArray := []int64{1, 2, 3, 4, 5} + bf1, _ := NewBloomFilterBySize(512, 5, WithSeed(seed)) + + // Update with array + err := bf1.UpdateInt64Array(intArray) + assert.NoError(t, err) + assert.False(t, bf1.IsEmpty()) + + // Query array should return true + assert.True(t, bf1.QueryInt64Array(intArray), "Should find array after update") + + // QueryAndUpdate should return true (already present) + wasPresent, err := bf1.QueryAndUpdateInt64Array(intArray) + assert.NoError(t, err) + assert.True(t, wasPresent, "Array should be present") + + // Different array should not be found + differentArray := []int64{10, 20, 30} + assert.False(t, bf1.QueryInt64Array(differentArray), "Different array should not be found") + + // Test float64 arrays + floatArray := []float64{1.1, 2.2, 3.3} + bf2, _ := NewBloomFilterBySize(512, 5, WithSeed(seed)) + + err = bf2.UpdateFloat64Array(floatArray) + assert.NoError(t, err) + + assert.True(t, bf2.QueryFloat64Array(floatArray), "Should find float array after update") + + wasPresent, err = bf2.QueryAndUpdateFloat64Array(floatArray) + assert.NoError(t, err) + assert.True(t, wasPresent) + + // Empty arrays should be no-op + bf3, _ := NewBloomFilterBySize(512, 5, WithSeed(seed)) + err = bf3.UpdateInt64Array([]int64{}) + assert.NoError(t, err) + assert.True(t, bf3.IsEmpty(), "Empty array update should not change filter") + + err = bf3.UpdateFloat64Array([]float64{}) + assert.NoError(t, err) + assert.True(t, bf3.IsEmpty(), "Empty array update should not change filter") + + // Nil arrays should be no-op + err = bf3.UpdateInt64Array(nil) + assert.NoError(t, err) + assert.True(t, bf3.IsEmpty(), "Nil array update should not change filter") +} + +// TestBasicUpdateMethods replicates Java's testBasicUpdateMethods +func TestBasicUpdateMethods(t *testing.T) { + numDistinct := uint64(100) + fpp := 1e-6 + bf, err := NewBloomFilterByAccuracy(numDistinct, fpp) + assert.NoError(t, err) + + // Empty string should do nothing (no update) + err = bf.UpdateString("") + assert.NoError(t, err) + // Querying empty string should return false (not inserted) + wasPresent, err := bf.QueryAndUpdateString("") + assert.NoError(t, err) + assert.False(t, wasPresent) + assert.Equal(t, uint64(0), bf.GetBitsUsed()) + + // Update with non-empty string + err = bf.UpdateString("abc") + assert.NoError(t, err) + + // Query different string (should not be present) + wasPresent, err = bf.QueryAndUpdateString("def") + assert.NoError(t, err) + assert.False(t, wasPresent) + + // Update with int + err = bf.UpdateInt64(932) + assert.NoError(t, err) + + // Query different int (should not be present) + wasPresent, err = bf.QueryAndUpdateInt64(543) + assert.NoError(t, err) + assert.False(t, wasPresent) + + // Update with NaN + err = bf.UpdateFloat64(math.NaN()) + assert.NoError(t, err) + + // Query positive infinity (should not be present) + wasPresent, err = bf.QueryAndUpdateFloat64(math.Inf(1)) + assert.NoError(t, err) + assert.False(t, wasPresent) + + // Bits used should be reasonable (at most numHashes * 6 for 6 distinct updates) + assert.LessOrEqual(t, bf.GetBitsUsed(), uint64(bf.GetNumHashes())*6) + assert.False(t, bf.IsEmpty()) +} + +// TestArrayUpdateMethods replicates Java's testArrayUpdateMethods +func TestArrayUpdateMethods(t *testing.T) { + // Test data: 3 doubles = 24 bytes + rawData := []float64{1.414, 2.71, 3.1415926538} + + numDistinct := uint64(100) + fpp := 1e-6 + + // Test UpdateFloat64Array (matches Java's update(double[])) + bfDoubles, err := NewBloomFilterByAccuracy(numDistinct, fpp) + assert.NoError(t, err) + + err = bfDoubles.UpdateFloat64Array(rawData) + assert.NoError(t, err) + + wasPresent, err := bfDoubles.QueryAndUpdateFloat64Array(rawData) + assert.NoError(t, err) + assert.True(t, wasPresent, "Should find array after update") + + found := bfDoubles.QueryFloat64Array(rawData) + assert.True(t, found, "Query should return true") + + numBitsSet := bfDoubles.GetBitsUsed() + seed := bfDoubles.GetSeed() + + // Test with byte slice (matches Java's update(byte[])) + bytes := make([]byte, len(rawData)*8) + for i, val := range rawData { + bits := math.Float64bits(val) + binary.LittleEndian.PutUint64(bytes[i*8:], bits) + } + + bfBytes, err := NewBloomFilterByAccuracy(numDistinct, fpp, WithSeed(seed)) + assert.NoError(t, err) + + err = bfBytes.UpdateSlice(bytes) + assert.NoError(t, err) + + wasPresent, err = bfBytes.QueryAndUpdateSlice(bytes) + assert.NoError(t, err) + assert.True(t, wasPresent) + + found = bfBytes.QuerySlice(bytes) + assert.True(t, found) + + // Both should have same number of bits set (same data, same seed) + assert.Equal(t, numBitsSet, bfBytes.GetBitsUsed()) + + // Test with int64 array (matches Java's update(long[])) + intData := []int64{12345, 67890, -11111} + bfInts, err := NewBloomFilterByAccuracy(numDistinct, fpp, WithSeed(seed)) + assert.NoError(t, err) + + err = bfInts.UpdateInt64Array(intData) + assert.NoError(t, err) + + wasPresent, err = bfInts.QueryAndUpdateInt64Array(intData) + assert.NoError(t, err) + assert.True(t, wasPresent) + + found = bfInts.QueryInt64Array(intData) + assert.True(t, found) + + // Intersect all filters (each with different data but same seed) + bf := &bloomFilterImpl{ + seed: seed, + numHashes: bfDoubles.GetNumHashes(), + isDirty: false, + capacityBits: bfDoubles.GetCapacity(), + numBitsSet: 0, + bitArray: make([]uint64, bfDoubles.GetCapacity()/64), + } + + // Manually create a filter by intersecting + bf.Union(bfDoubles) + bf.Intersect(bfBytes) + + // After intersecting with itself (same data), should have same bit count + assert.Equal(t, numBitsSet, bf.GetBitsUsed(), + "Intersection of identical data should preserve bit count") +} + +// TestNegativeQueries verifies that items NOT inserted are (usually) not found +func TestNegativeQueries(t *testing.T) { + bf, err := NewBloomFilterBySize(1024, 5, WithSeed(42)) + assert.NoError(t, err) + + // Insert items 0-99 + for i := int64(0); i < 100; i++ { + err = bf.UpdateInt64(i) + assert.NoError(t, err) + } + + // Test negative cases - items that were NOT inserted + negativeItems := []int64{-1, -10, -100, 100, 101, 200, 1000, 10000} + + foundCount := 0 + for _, item := range negativeItems { + if bf.QueryInt64(item) { + foundCount++ + } + } + + // Should not find ALL negative items (that would be a bug) + assert.Less(t, foundCount, len(negativeItems), + "Should not find all non-inserted items") + + // Specifically, -1 should almost certainly not be found + // (allowing for small probability of false positive) + found := bf.QueryInt64(-1) + t.Logf("Item -1 found: %v (false positive)", found) +} diff --git a/filters/preamble_utils.go b/filters/preamble_utils.go new file mode 100644 index 0000000..4415189 --- /dev/null +++ b/filters/preamble_utils.go @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package filters + +import "encoding/binary" + +// Preamble constants matching C++ implementation +const ( + // Header sizes + preambleBytes = 32 // Full preamble size for non-empty filter + preambleEmptyBytes = 24 // Preamble size for empty filter + + // Preamble field offsets + preambleLongsOffset = 0 + serVerOffset = 1 + familyIDOffset = 2 + flagsOffset = 3 + numHashesOffset = 4 + seedOffset = 8 + bitArrayLengthOffset = 16 + numBitsSetOffset = 24 + bitArrayOffset = 32 + + // Family and version identifiers + familyID = 21 + serVer = 1 + + // Preamble sizes in longs + preambleLongsEmpty = 3 + preambleLongsStandard = 4 + + // Flag masks + emptyFlagMask = 0x04 + + // Special values + dirtyBitsValue = 0xFFFFFFFFFFFFFFFF +) + +// extractPreambleLongs extracts the preamble longs field from the header. +func extractPreambleLongs(bytes []byte) uint8 { + return bytes[preambleLongsOffset] +} + +// extractSerVer extracts the serialization version from the header. +func extractSerVer(bytes []byte) uint8 { + return bytes[serVerOffset] +} + +// extractFamilyID extracts the family ID from the header. +func extractFamilyID(bytes []byte) uint8 { + return bytes[familyIDOffset] +} + +// extractFlags extracts the flags byte from the header. +func extractFlags(bytes []byte) uint8 { + return bytes[flagsOffset] +} + +// extractNumHashes extracts the number of hash functions from the header. +func extractNumHashes(bytes []byte) uint16 { + return binary.LittleEndian.Uint16(bytes[numHashesOffset:]) +} + +// extractSeed extracts the hash seed from the header. +func extractSeed(bytes []byte) uint64 { + return binary.LittleEndian.Uint64(bytes[seedOffset:]) +} + +// extractBitArrayLength extracts the bit array length (in longs) from the header. +func extractBitArrayLength(bytes []byte) uint32 { + return binary.LittleEndian.Uint32(bytes[bitArrayLengthOffset:]) +} + +// extractNumBitsSet extracts the number of bits set from the header. +// Only valid for non-empty filters. +func extractNumBitsSet(bytes []byte) uint64 { + return binary.LittleEndian.Uint64(bytes[numBitsSetOffset:]) +} + +// insertPreambleLongs inserts the preamble longs field into the header. +func insertPreambleLongs(bytes []byte, val uint8) { + bytes[preambleLongsOffset] = val +} + +// insertSerVer inserts the serialization version into the header. +func insertSerVer(bytes []byte) { + bytes[serVerOffset] = serVer +} + +// insertFamilyID inserts the family ID into the header. +func insertFamilyID(bytes []byte) { + bytes[familyIDOffset] = familyID +} + +// insertFlags inserts the flags byte into the header. +func insertFlags(bytes []byte, flags uint8) { + bytes[flagsOffset] = flags +} + +// insertNumHashes inserts the number of hash functions into the header. +func insertNumHashes(bytes []byte, numHashes uint16) { + binary.LittleEndian.PutUint16(bytes[numHashesOffset:], numHashes) +} + +// insertSeed inserts the hash seed into the header. +func insertSeed(bytes []byte, seed uint64) { + binary.LittleEndian.PutUint64(bytes[seedOffset:], seed) +} + +// insertBitArrayLength inserts the bit array length (in longs) into the header. +func insertBitArrayLength(bytes []byte, length uint32) { + binary.LittleEndian.PutUint32(bytes[bitArrayLengthOffset:], length) +} + +// insertNumBitsSet inserts the number of bits set into the header. +func insertNumBitsSet(bytes []byte, numBitsSet uint64) { + binary.LittleEndian.PutUint64(bytes[numBitsSetOffset:], numBitsSet) +} + +// isEmptyFlag checks if the empty flag is set in the flags byte. +func isEmptyFlag(flags uint8) bool { + return (flags & emptyFlagMask) != 0 +} + +// setEmptyFlag sets the empty flag in the flags byte. +func setEmptyFlag(flags uint8) uint8 { + return flags | emptyFlagMask +} + +// clearEmptyFlag clears the empty flag in the flags byte. +func clearEmptyFlag(flags uint8) uint8 { + return flags &^ emptyFlagMask +} diff --git a/go.mod b/go.mod index b177cad..09bbb2b 100644 --- a/go.mod +++ b/go.mod @@ -24,6 +24,8 @@ require ( github.com/twmb/murmur3 v1.1.8 ) +require github.com/cespare/xxhash/v2 v2.3.0 // indirect + require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/go.sum b/go.sum index 3f83047..f1072fd 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/internal/family.go b/internal/family.go index 4679414..2569330 100644 --- a/internal/family.go +++ b/internal/family.go @@ -25,9 +25,10 @@ type family struct { type families struct { HLL family Frequency family - Kll family `` + Kll family CPC family CountMinSketch family + BloomFilter family } var FamilyEnum = &families{ @@ -51,4 +52,8 @@ var FamilyEnum = &families{ Id: 18, MaxPreLongs: 3, }, + BloomFilter: family{ + Id: 21, + MaxPreLongs: 4, + }, } diff --git a/serialization_test_data/go_generated_files/bf_byte_array_n10000_h3_go.sk b/serialization_test_data/go_generated_files/bf_byte_array_n10000_h3_go.sk new file mode 100644 index 0000000..241719d Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_byte_array_n10000_h3_go.sk differ diff --git a/serialization_test_data/go_generated_files/bf_double_array_n10000_h3_go.sk b/serialization_test_data/go_generated_files/bf_double_array_n10000_h3_go.sk new file mode 100644 index 0000000..49e590c Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_double_array_n10000_h3_go.sk differ diff --git a/serialization_test_data/go_generated_files/bf_double_n10000_h3_go.sk b/serialization_test_data/go_generated_files/bf_double_n10000_h3_go.sk new file mode 100644 index 0000000..77a0319 Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_double_n10000_h3_go.sk differ diff --git a/serialization_test_data/go_generated_files/bf_long_array_n10000_h3_go.sk b/serialization_test_data/go_generated_files/bf_long_array_n10000_h3_go.sk new file mode 100644 index 0000000..9e09163 Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_long_array_n10000_h3_go.sk differ diff --git a/serialization_test_data/go_generated_files/bf_n0_h3_go.sk b/serialization_test_data/go_generated_files/bf_n0_h3_go.sk new file mode 100644 index 0000000..3f94cd8 Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_n0_h3_go.sk differ diff --git a/serialization_test_data/go_generated_files/bf_n0_h5_go.sk b/serialization_test_data/go_generated_files/bf_n0_h5_go.sk new file mode 100644 index 0000000..720e455 Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_n0_h5_go.sk differ diff --git a/serialization_test_data/go_generated_files/bf_n10000_h3_go.sk b/serialization_test_data/go_generated_files/bf_n10000_h3_go.sk new file mode 100644 index 0000000..9f93779 Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_n10000_h3_go.sk differ diff --git a/serialization_test_data/go_generated_files/bf_n10000_h5_go.sk b/serialization_test_data/go_generated_files/bf_n10000_h5_go.sk new file mode 100644 index 0000000..20376e9 Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_n10000_h5_go.sk differ diff --git a/serialization_test_data/go_generated_files/bf_n2000000_h3_go.sk b/serialization_test_data/go_generated_files/bf_n2000000_h3_go.sk new file mode 100644 index 0000000..5503c77 Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_n2000000_h3_go.sk differ diff --git a/serialization_test_data/go_generated_files/bf_n2000000_h5_go.sk b/serialization_test_data/go_generated_files/bf_n2000000_h5_go.sk new file mode 100644 index 0000000..15b7a9d Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_n2000000_h5_go.sk differ diff --git a/serialization_test_data/go_generated_files/bf_string_n10000_h3_go.sk b/serialization_test_data/go_generated_files/bf_string_n10000_h3_go.sk new file mode 100644 index 0000000..8b24eb6 Binary files /dev/null and b/serialization_test_data/go_generated_files/bf_string_n10000_h3_go.sk differ --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
