This is an automated email from the ASF dual-hosted git repository.
tison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 81e619f feat: add halve and decay to countmin sketch of unsigned
values (#71)
81e619f is described below
commit 81e619f3fb37ea1abaf9a8efdd5743d60d3b62e1
Author: Chojan Shang <[email protected]>
AuthorDate: Tue Jan 27 17:24:58 2026 +0800
feat: add halve and decay to countmin sketch of unsigned values (#71)
Co-authored-by: tison <[email protected]>
---
datasketches/src/countmin/mod.rs | 12 ++-
datasketches/src/countmin/sketch.rs | 174 +++++++++++++++++++-----------
datasketches/src/countmin/value.rs | 187 +++++++++++++++++++++++++++++++++
datasketches/src/frequencies/sketch.rs | 2 +-
datasketches/tests/countmin_test.rs | 151 +++++++++++++++++++++-----
5 files changed, 434 insertions(+), 92 deletions(-)
diff --git a/datasketches/src/countmin/mod.rs b/datasketches/src/countmin/mod.rs
index 9b427e9..254907e 100644
--- a/datasketches/src/countmin/mod.rs
+++ b/datasketches/src/countmin/mod.rs
@@ -24,7 +24,7 @@
//!
//! ```rust
//! # use datasketches::countmin::CountMinSketch;
-//! let mut sketch = CountMinSketch::new(5, 256);
+//! let mut sketch = CountMinSketch::<i64>::new(5, 256);
//! sketch.update("apple");
//! sketch.update_with_weight("banana", 3);
//! assert!(sketch.estimate("banana") >= 3);
@@ -34,12 +34,16 @@
//!
//! ```rust
//! # use datasketches::countmin::CountMinSketch;
-//! let buckets = CountMinSketch::suggest_num_buckets(0.01);
-//! let hashes = CountMinSketch::suggest_num_hashes(0.99);
-//! let _sketch = CountMinSketch::new(hashes, buckets);
+//! let buckets = CountMinSketch::<i64>::suggest_num_buckets(0.01);
+//! let hashes = CountMinSketch::<i64>::suggest_num_hashes(0.99);
+//! let _sketch = CountMinSketch::<i64>::new(hashes, buckets);
//! ```
mod serialization;
mod sketch;
pub use self::sketch::CountMinSketch;
+
+mod value;
+pub use self::value::CountMinValue;
+pub use self::value::UnsignedCountMinValue;
diff --git a/datasketches/src/countmin/sketch.rs
b/datasketches/src/countmin/sketch.rs
index 321c836..e2814d1 100644
--- a/datasketches/src/countmin/sketch.rs
+++ b/datasketches/src/countmin/sketch.rs
@@ -17,10 +17,11 @@
use std::hash::Hash;
use std::hash::Hasher;
-use std::mem::size_of;
use crate::codec::SketchBytes;
use crate::codec::SketchSlice;
+use crate::countmin::CountMinValue;
+use crate::countmin::UnsignedCountMinValue;
use crate::countmin::serialization::COUNTMIN_FAMILY_ID;
use crate::countmin::serialization::FLAGS_IS_EMPTY;
use crate::countmin::serialization::LONG_SIZE_BYTES;
@@ -38,16 +39,16 @@ const MAX_TABLE_ENTRIES: usize = 1 << 30;
/// The sketch provides upper and lower bounds on estimated item frequencies
/// with configurable relative error and confidence.
#[derive(Debug, Clone, PartialEq)]
-pub struct CountMinSketch {
+pub struct CountMinSketch<T: CountMinValue> {
num_hashes: u8,
num_buckets: u32,
seed: u64,
- total_weight: i64,
- counts: Vec<i64>,
+ total_weight: T,
+ counts: Vec<T>,
hash_seeds: Vec<u64>,
}
-impl CountMinSketch {
+impl<T: CountMinValue> CountMinSketch<T> {
/// Creates a new Count-Min sketch with the default seed.
///
/// # Panics
@@ -59,7 +60,7 @@ impl CountMinSketch {
///
/// ```rust
/// # use datasketches::countmin::CountMinSketch;
- /// let sketch = CountMinSketch::new(4, 128);
+ /// let sketch = CountMinSketch::<i64>::new(4, 128);
/// assert_eq!(sketch.num_buckets(), 128);
/// ```
pub fn new(num_hashes: u8, num_buckets: u32) -> Self {
@@ -77,7 +78,7 @@ impl CountMinSketch {
///
/// ```rust
/// # use datasketches::countmin::CountMinSketch;
- /// let sketch = CountMinSketch::with_seed(4, 64, 42);
+ /// let sketch = CountMinSketch::<i64>::with_seed(4, 64, 42);
/// assert_eq!(sketch.seed(), 42);
/// ```
pub fn with_seed(num_hashes: u8, num_buckets: u32, seed: u64) -> Self {
@@ -101,7 +102,7 @@ impl CountMinSketch {
}
/// Returns the total weight inserted into the sketch.
- pub fn total_weight(&self) -> i64 {
+ pub fn total_weight(&self) -> T {
self.total_weight
}
@@ -112,7 +113,7 @@ impl CountMinSketch {
/// Returns true if the sketch has not seen any updates.
pub fn is_empty(&self) -> bool {
- self.total_weight == 0
+ self.total_weight == T::ZERO
}
/// Suggests the number of buckets to achieve the given relative error.
@@ -129,7 +130,7 @@ impl CountMinSketch {
///
/// # Panics
///
- /// Panics if `confidence` is not in [0, 1].
+ /// Panics if `confidence` is not in `[0, 1]`.
pub fn suggest_num_hashes(confidence: f64) -> u8 {
assert!(
(0.0..=1.0).contains(&confidence),
@@ -148,12 +149,12 @@ impl CountMinSketch {
///
/// ```rust
/// # use datasketches::countmin::CountMinSketch;
- /// let mut sketch = CountMinSketch::new(4, 128);
+ /// let mut sketch = CountMinSketch::<i64>::new(4, 128);
/// sketch.update("apple");
/// assert!(sketch.estimate("apple") >= 1);
/// ```
- pub fn update<T: Hash>(&mut self, item: T) {
- self.update_with_weight(item, 1);
+ pub fn update<I: Hash>(&mut self, item: I) {
+ self.update_with_weight(item, T::ONE);
}
/// Updates the sketch with the given item and weight.
@@ -162,21 +163,21 @@ impl CountMinSketch {
///
/// ```rust
/// # use datasketches::countmin::CountMinSketch;
- /// let mut sketch = CountMinSketch::new(4, 128);
+ /// let mut sketch = CountMinSketch::<i64>::new(4, 128);
/// sketch.update_with_weight("banana", 3);
/// assert!(sketch.estimate("banana") >= 3);
/// ```
- pub fn update_with_weight<T: Hash>(&mut self, item: T, weight: i64) {
- if weight == 0 {
+ pub fn update_with_weight<I: Hash>(&mut self, item: I, weight: T) {
+ if weight == T::ZERO {
return;
}
- let abs_weight = abs_i64(weight);
- self.total_weight = self.total_weight.wrapping_add(abs_weight);
+ let abs_weight = weight.abs();
+ self.total_weight = self.total_weight.add(abs_weight);
let num_buckets = self.num_buckets as usize;
for (row, seed) in self.hash_seeds.iter().enumerate() {
let bucket = self.bucket_index(&item, *seed);
let index = row * num_buckets + bucket;
- self.counts[index] = self.counts[index].wrapping_add(weight);
+ self.counts[index] = self.counts[index].add(weight);
}
}
@@ -186,13 +187,13 @@ impl CountMinSketch {
///
/// ```rust
/// # use datasketches::countmin::CountMinSketch;
- /// let mut sketch = CountMinSketch::new(4, 128);
+ /// let mut sketch = CountMinSketch::<i64>::new(4, 128);
/// sketch.update_with_weight("pear", 2);
/// assert!(sketch.estimate("pear") >= 2);
/// ```
- pub fn estimate<T: Hash>(&self, item: T) -> i64 {
+ pub fn estimate<I: Hash>(&self, item: I) -> T {
let num_buckets = self.num_buckets as usize;
- let mut min = i64::MAX;
+ let mut min = T::MAX;
for (row, seed) in self.hash_seeds.iter().enumerate() {
let bucket = self.bucket_index(&item, *seed);
let index = row * num_buckets + bucket;
@@ -205,15 +206,15 @@ impl CountMinSketch {
}
/// Returns the lower bound on the true frequency of the given item.
- pub fn lower_bound<T: Hash>(&self, item: T) -> i64 {
+ pub fn lower_bound<I: Hash>(&self, item: I) -> T {
self.estimate(item)
}
/// Returns the upper bound on the true frequency of the given item.
- pub fn upper_bound<T: Hash>(&self, item: T) -> i64 {
+ pub fn upper_bound<I: Hash>(&self, item: I) -> T {
let estimate = self.estimate(item);
- let error = (self.relative_error() * self.total_weight as f64) as i64;
- estimate.wrapping_add(error)
+ let error = T::from_f64(self.relative_error() *
self.total_weight.to_f64());
+ estimate.add(error)
}
/// Merges another sketch into this one.
@@ -226,8 +227,8 @@ impl CountMinSketch {
///
/// ```rust
/// # use datasketches::countmin::CountMinSketch;
- /// let mut left = CountMinSketch::new(4, 128);
- /// let mut right = CountMinSketch::new(4, 128);
+ /// let mut left = CountMinSketch::<i64>::new(4, 128);
+ /// let mut right = CountMinSketch::<i64>::new(4, 128);
///
/// left.update("apple");
/// right.update_with_weight("banana", 2);
@@ -235,20 +236,19 @@ impl CountMinSketch {
/// left.merge(&right);
/// assert!(left.estimate("banana") >= 2);
/// ```
- pub fn merge(&mut self, other: &CountMinSketch) {
+ pub fn merge(&mut self, other: &CountMinSketch<T>) {
if std::ptr::eq(self, other) {
panic!("Cannot merge a sketch with itself.");
}
- if self.num_hashes != other.num_hashes
- || self.num_buckets != other.num_buckets
- || self.seed != other.seed
- {
- panic!("Incompatible sketch configuration.");
+ assert_eq!(self.num_hashes, other.num_hashes);
+ assert_eq!(self.num_buckets, other.num_buckets);
+ assert_eq!(self.seed, other.seed);
+ assert_eq!(self.counts.len(), other.counts.len());
+ let counts_len = self.counts.len();
+ for i in 0..counts_len {
+ self.counts[i] = self.counts[i].add(other.counts[i]);
}
- for (dst, src) in self.counts.iter_mut().zip(other.counts.iter()) {
- *dst = dst.wrapping_add(*src);
- }
- self.total_weight = self.total_weight.wrapping_add(other.total_weight);
+ self.total_weight = self.total_weight.add(other.total_weight);
}
/// Serializes this sketch into the DataSketches Count-Min format.
@@ -257,18 +257,19 @@ impl CountMinSketch {
///
/// ```rust
/// # use datasketches::countmin::CountMinSketch;
- /// # let mut sketch = CountMinSketch::new(4, 128);
+ /// # let mut sketch = CountMinSketch::<i64>::new(4, 128);
/// # sketch.update("apple");
/// let bytes = sketch.serialize();
- /// let decoded = CountMinSketch::deserialize(&bytes).unwrap();
+ /// let decoded = CountMinSketch::<i64>::deserialize(&bytes).unwrap();
/// assert!(decoded.estimate("apple") >= 1);
/// ```
pub fn serialize(&self) -> Vec<u8> {
let header_size = PREAMBLE_LONGS_SHORT as usize * LONG_SIZE_BYTES;
+ let value_size = LONG_SIZE_BYTES;
let payload_size = if self.is_empty() {
0
} else {
- LONG_SIZE_BYTES + (self.counts.len() * size_of::<i64>())
+ value_size + (self.counts.len() * value_size)
};
let mut bytes = SketchBytes::with_capacity(header_size + payload_size);
@@ -287,9 +288,9 @@ impl CountMinSketch {
return bytes.into_bytes();
}
- bytes.write_i64_le(self.total_weight);
- for count in self.counts.iter().copied() {
- bytes.write_i64_le(count);
+ bytes.write(&self.total_weight.to_bytes());
+ for count in &self.counts {
+ bytes.write(&count.to_bytes());
}
bytes.into_bytes()
}
@@ -300,10 +301,10 @@ impl CountMinSketch {
///
/// ```rust
/// # use datasketches::countmin::CountMinSketch;
- /// # let mut sketch = CountMinSketch::new(4, 64);
+ /// # let mut sketch = CountMinSketch::<i64>::new(4, 64);
/// # sketch.update("apple");
/// # let bytes = sketch.serialize();
- /// let decoded = CountMinSketch::deserialize(&bytes).unwrap();
+ /// let decoded = CountMinSketch::<i64>::deserialize(&bytes).unwrap();
/// assert!(decoded.estimate("apple") >= 1);
/// ```
pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
@@ -316,10 +317,10 @@ impl CountMinSketch {
///
/// ```rust
/// # use datasketches::countmin::CountMinSketch;
- /// # let mut sketch = CountMinSketch::with_seed(4, 64, 7);
+ /// # let mut sketch = CountMinSketch::<i64>::with_seed(4, 64, 7);
/// # sketch.update("apple");
/// # let bytes = sketch.serialize();
- /// let decoded = CountMinSketch::deserialize_with_seed(&bytes,
7).unwrap();
+ /// let decoded = CountMinSketch::<i64>::deserialize_with_seed(&bytes,
7).unwrap();
/// assert!(decoded.estimate("apple") >= 1);
/// ```
pub fn deserialize_with_seed(bytes: &[u8], seed: u64) -> Result<Self,
Error> {
@@ -327,6 +328,15 @@ impl CountMinSketch {
move |_| Error::insufficient_data(tag)
}
+ fn read_value<T: CountMinValue>(
+ cursor: &mut SketchSlice<'_>,
+ tag: &'static str,
+ ) -> Result<T, Error> {
+ let mut bs = [0u8; 8];
+ cursor.read_exact(&mut bs).map_err(make_error(tag))?;
+ T::try_from_bytes(bs)
+ }
+
let mut cursor = SketchSlice::new(bytes);
let preamble_longs =
cursor.read_u8().map_err(make_error("preamble_longs"))?;
let serial_version =
cursor.read_u8().map_err(make_error("serial_version"))?;
@@ -372,27 +382,27 @@ impl CountMinSketch {
return Ok(sketch);
}
- sketch.total_weight =
cursor.read_i64_le().map_err(make_error("total_weight"))?;
- for count in sketch.counts.iter_mut() {
- *count = cursor.read_i64_le().map_err(make_error("counts"))?;
+ sketch.total_weight = read_value(&mut cursor, "total_weight")?;
+ for count in &mut sketch.counts {
+ *count = read_value(&mut cursor, "counts")?;
}
Ok(sketch)
}
fn make(num_hashes: u8, num_buckets: u32, seed: u64, entries: usize) ->
Self {
- let counts = vec![0i64; entries];
+ let counts = vec![T::ZERO; entries];
let hash_seeds = make_hash_seeds(seed, num_hashes);
CountMinSketch {
num_hashes,
num_buckets,
seed,
- total_weight: 0,
+ total_weight: T::ZERO,
counts,
hash_seeds,
}
}
- fn bucket_index<T: Hash>(&self, item: &T, seed: u64) -> usize {
+ fn bucket_index<I: Hash>(&self, item: &I, seed: u64) -> usize {
let mut hasher = MurmurHash3X64128::with_seed(seed);
item.hash(&mut hasher);
let (h1, _) = hasher.finish128();
@@ -400,6 +410,54 @@ impl CountMinSketch {
}
}
+impl<T: UnsignedCountMinValue> CountMinSketch<T> {
+ /// Divides every counter by two, truncating toward zero.
+ ///
+ /// Useful for exponential decay where counts represent recent activity.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// let mut sketch = CountMinSketch::<u64>::new(4, 128);
+ /// sketch.update_with_weight("apple", 3);
+ /// sketch.halve();
+ /// assert!(sketch.estimate("apple") >= 1);
+ /// ```
+ pub fn halve(&mut self) {
+ for c in &mut self.counts {
+ *c = c.halve()
+ }
+ self.total_weight = self.total_weight.halve();
+ }
+
+ /// Multiplies every counter by `decay` and truncates back into `T`.
+ ///
+ /// Values are truncated toward zero after multiplication; choose `decay`
in `(0, 1]`.
+ /// The total weight is scaled by the same factor to keep bounds
consistent.
+ ///
+ /// # Examples
+ ///
+ /// ```rust
+ /// # use datasketches::countmin::CountMinSketch;
+ /// let mut sketch = CountMinSketch::<u64>::new(4, 128);
+ /// sketch.update_with_weight("apple", 3);
+ /// sketch.decay(0.5);
+ /// assert!(sketch.estimate("apple") >= 1);
+ /// ```
+ ///
+ /// # Panics
+ ///
+ /// Panics if `decay` is not finite or is outside `(0, 1]`.
+ pub fn decay(&mut self, decay: f64) {
+ assert!(decay > 0.0 && decay <= 1.0, "decay must be within (0, 1]");
+ for c in &mut self.counts {
+ *c = c.decay(decay)
+ }
+ self.total_weight = self.total_weight.decay(decay);
+ }
+}
+
fn entries_for_config(num_hashes: u8, num_buckets: u32) -> usize {
assert!(num_hashes > 0, "num_hashes must be at least 1");
assert!(num_buckets >= 3, "num_buckets must be at least 3");
@@ -443,11 +501,3 @@ fn make_hash_seeds(seed: u64, num_hashes: u8) -> Vec<u64> {
}
seeds
}
-
-fn abs_i64(value: i64) -> i64 {
- if value >= 0 {
- value
- } else {
- value.wrapping_neg()
- }
-}
diff --git a/datasketches/src/countmin/value.rs
b/datasketches/src/countmin/value.rs
new file mode 100644
index 0000000..f236440
--- /dev/null
+++ b/datasketches/src/countmin/value.rs
@@ -0,0 +1,187 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error;
+
+mod private {
+ // Sealed trait to prevent external implementations of CountMinValue.
+ pub trait Sealed {}
+}
+
+/// Value type supported in a Count-Min sketch.
+pub trait CountMinValue: private::Sealed + Copy + Ord {
+ /// Zero value for counters and weights.
+ const ZERO: Self;
+
+ /// One value for unit updates.
+ const ONE: Self;
+
+ /// Maximum representable value for initializing minima.
+ const MAX: Self;
+
+ /// Performs the + operation.
+ fn add(self, other: Self) -> Self;
+
+ /// Computes the absolute value of `self`.
+ fn abs(self) -> Self;
+
+ /// Converts into `f64`.
+ fn to_f64(self) -> f64;
+
+ /// Converts from `f64` by truncating toward zero.
+ fn from_f64(value: f64) -> Self;
+
+ /// Returns the raw transmutation in little-endian 8 bytes.
+ fn to_bytes(self) -> [u8; 8];
+
+ /// Constructs from the raw transmutation in little-endian 8 bytes.
+ fn try_from_bytes(bytes: [u8; 8]) -> Result<Self, Error>;
+}
+
+/// Unsigned value type supported in a Count-Min sketch.
+pub trait UnsignedCountMinValue: CountMinValue {
+ /// Divides the value by two, truncating toward zero.
+ fn halve(self) -> Self;
+
+ /// Multiplies the value by decay and truncates back into `T`.
+ fn decay(self, decay: f64) -> Self;
+}
+
+macro_rules! impl_signed {
+ ($name:ty, $min:expr, $max:expr) => {
+ impl private::Sealed for $name {}
+
+ impl CountMinValue for $name {
+ const ZERO: Self = 0;
+ const ONE: Self = 1;
+ const MAX: Self = $max;
+
+ #[inline(always)]
+ fn add(self, other: Self) -> Self {
+ self + other
+ }
+
+ #[inline(always)]
+ fn abs(self) -> Self {
+ if self >= 0 { self } else { -self }
+ }
+
+ #[inline(always)]
+ fn to_f64(self) -> f64 {
+ self as f64
+ }
+
+ #[inline(always)]
+ fn from_f64(value: f64) -> Self {
+ value.trunc() as $name
+ }
+
+ #[inline(always)]
+ fn to_bytes(self) -> [u8; 8] {
+ let value = self as i64;
+ value.to_le_bytes()
+ }
+
+ #[inline(always)]
+ fn try_from_bytes(bytes: [u8; 8]) -> Result<Self, Error> {
+ let value = i64::from_le_bytes(bytes);
+ if value < $min as i64 || value > $max as i64 {
+ return Err(Error::deserial(format!(
+ "value {} out of range for {}",
+ value,
+ stringify!($name)
+ )));
+ }
+ Ok(value as $name)
+ }
+ }
+ };
+}
+
+impl_signed!(i8, i8::MIN, i8::MAX);
+impl_signed!(i16, i16::MIN, i16::MAX);
+impl_signed!(i32, i32::MIN, i32::MAX);
+impl_signed!(i64, i64::MIN, i64::MAX);
+
+macro_rules! impl_unsigned {
+ ($name:ty, $max:expr) => {
+ impl private::Sealed for $name {}
+
+ impl CountMinValue for $name {
+ const ZERO: Self = 0;
+ const ONE: Self = 1;
+ const MAX: Self = $max;
+
+ #[inline(always)]
+ fn add(self, other: Self) -> Self {
+ self + other
+ }
+
+ #[inline(always)]
+ fn abs(self) -> Self {
+ self
+ }
+
+ #[inline(always)]
+ fn to_f64(self) -> f64 {
+ self as f64
+ }
+
+ #[inline(always)]
+ fn from_f64(value: f64) -> Self {
+ value.trunc() as $name
+ }
+
+ #[inline(always)]
+ fn to_bytes(self) -> [u8; 8] {
+ let value = self as u64;
+ value.to_le_bytes()
+ }
+
+ #[inline(always)]
+ fn try_from_bytes(bytes: [u8; 8]) -> Result<Self, Error> {
+ let value = u64::from_le_bytes(bytes);
+ if value > $max as u64 {
+ return Err(Error::deserial(format!(
+ "value {} out of range for {}",
+ value,
+ stringify!($name)
+ )));
+ }
+ Ok(value as $name)
+ }
+ }
+
+ impl UnsignedCountMinValue for $name {
+ #[inline(always)]
+ fn halve(self) -> Self {
+ self >> 1
+ }
+
+ #[inline(always)]
+ fn decay(self, decay: f64) -> Self {
+ let value = self.to_f64() * decay;
+ Self::from_f64(value)
+ }
+ }
+ };
+}
+
+impl_unsigned!(u8, u8::MAX);
+impl_unsigned!(u16, u16::MAX);
+impl_unsigned!(u32, u32::MAX);
+impl_unsigned!(u64, u64::MAX);
diff --git a/datasketches/src/frequencies/sketch.rs
b/datasketches/src/frequencies/sketch.rs
index f399445..e0d9711 100644
--- a/datasketches/src/frequencies/sketch.rs
+++ b/datasketches/src/frequencies/sketch.rs
@@ -355,7 +355,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
});
}
}
- rows.sort_by(|a, b| b.estimate.cmp(&a.estimate));
+ rows.sort_by_key(|row| std::cmp::Reverse(row.estimate));
rows
}
diff --git a/datasketches/tests/countmin_test.rs
b/datasketches/tests/countmin_test.rs
index b4b3685..dddc72c 100644
--- a/datasketches/tests/countmin_test.rs
+++ b/datasketches/tests/countmin_test.rs
@@ -19,7 +19,7 @@ use datasketches::countmin::CountMinSketch;
#[test]
fn test_init_defaults() {
- let sketch = CountMinSketch::new(3, 5);
+ let sketch = CountMinSketch::<i64>::new(3, 5);
assert_eq!(sketch.num_hashes(), 3);
assert_eq!(sketch.num_buckets(), 5);
assert_eq!(sketch.seed(), 9001);
@@ -30,23 +30,23 @@ fn test_init_defaults() {
#[test]
fn test_parameter_suggestions() {
- assert_eq!(CountMinSketch::suggest_num_buckets(0.2), 14);
- assert_eq!(CountMinSketch::suggest_num_buckets(0.1), 28);
- assert_eq!(CountMinSketch::suggest_num_buckets(0.05), 55);
- assert_eq!(CountMinSketch::suggest_num_buckets(0.01), 272);
+ assert_eq!(CountMinSketch::<i64>::suggest_num_buckets(0.2), 14);
+ assert_eq!(CountMinSketch::<i64>::suggest_num_buckets(0.1), 28);
+ assert_eq!(CountMinSketch::<i64>::suggest_num_buckets(0.05), 55);
+ assert_eq!(CountMinSketch::<i64>::suggest_num_buckets(0.01), 272);
- assert_eq!(CountMinSketch::suggest_num_hashes(0.682689492), 2);
- assert_eq!(CountMinSketch::suggest_num_hashes(0.954499736), 4);
- assert_eq!(CountMinSketch::suggest_num_hashes(0.997300204), 6);
+ assert_eq!(CountMinSketch::<i64>::suggest_num_hashes(0.682689492), 2);
+ assert_eq!(CountMinSketch::<i64>::suggest_num_hashes(0.954499736), 4);
+ assert_eq!(CountMinSketch::<i64>::suggest_num_hashes(0.997300204), 6);
- let buckets = CountMinSketch::suggest_num_buckets(0.1);
- let sketch = CountMinSketch::new(3, buckets);
+ let buckets = CountMinSketch::<i64>::suggest_num_buckets(0.1);
+ let sketch = CountMinSketch::<i64>::new(3, buckets);
assert!(sketch.relative_error() <= 0.1);
}
#[test]
fn test_update_and_bounds() {
- let mut sketch = CountMinSketch::with_seed(3, 128, 123);
+ let mut sketch = CountMinSketch::<i64>::with_seed(3, 128, 123);
sketch.update("x");
sketch.update_with_weight("x", 9);
assert_eq!(sketch.estimate("x"), 10);
@@ -58,9 +58,50 @@ fn test_update_and_bounds() {
assert!(estimate <= upper);
}
+#[test]
+fn test_update_and_bounds_with_scaling() {
+ let mut sketch = CountMinSketch::<u64>::with_seed(3, 128, 123);
+ sketch.update_with_weight("x", 10);
+
+ let estimate = sketch.estimate("x");
+ let upper = sketch.upper_bound("x");
+ let lower = sketch.lower_bound("x");
+ assert_eq!(estimate, 10);
+ assert!(lower <= estimate);
+ assert!(estimate <= upper);
+
+ let eps = sketch.relative_error();
+
+ sketch.halve();
+ let estimate = sketch.estimate("x");
+ let upper = sketch.upper_bound("x");
+ let lower = sketch.lower_bound("x");
+ assert_eq!(sketch.total_weight(), 5);
+ assert_eq!(estimate, 5);
+ assert!(lower <= estimate);
+ assert!(estimate <= upper);
+ assert_eq!(
+ upper,
+ estimate + (eps * sketch.total_weight() as f64) as u64
+ );
+
+ sketch.decay(0.5);
+ let estimate = sketch.estimate("x");
+ let upper = sketch.upper_bound("x");
+ let lower = sketch.lower_bound("x");
+ assert_eq!(sketch.total_weight(), 2);
+ assert_eq!(estimate, 2);
+ assert!(lower <= estimate);
+ assert!(estimate <= upper);
+ assert_eq!(
+ upper,
+ estimate + (eps * sketch.total_weight() as f64) as u64
+ );
+}
+
#[test]
fn test_negative_weights() {
- let mut sketch = CountMinSketch::with_seed(2, 32, 123);
+ let mut sketch = CountMinSketch::<i64>::with_seed(2, 32, 123);
sketch.update_with_weight("y", -1);
assert_eq!(sketch.total_weight(), 1);
assert_eq!(sketch.estimate("y"), -1);
@@ -68,10 +109,58 @@ fn test_negative_weights() {
assert_eq!(sketch.total_weight(), 3);
}
+#[test]
+fn test_halve() {
+ let buckets = CountMinSketch::<u64>::suggest_num_buckets(0.01);
+ let hashes = CountMinSketch::<u64>::suggest_num_hashes(0.9);
+ let mut sketch = CountMinSketch::<u64>::new(hashes, buckets);
+
+ for i in 0..1000usize {
+ for _ in 0..i {
+ sketch.update(i as u64);
+ }
+ }
+
+ for i in 0..1000usize {
+ assert!(sketch.estimate(i as u64) >= i as u64);
+ }
+
+ sketch.halve();
+
+ for i in 0..1000usize {
+ assert!(sketch.estimate(i as u64) >= (i as u64) / 2);
+ }
+}
+
+#[test]
+fn test_decay() {
+ let buckets = CountMinSketch::<u64>::suggest_num_buckets(0.01);
+ let hashes = CountMinSketch::<u64>::suggest_num_hashes(0.9);
+ let mut sketch = CountMinSketch::<u64>::new(hashes, buckets);
+
+ for i in 0..1000usize {
+ for _ in 0..i {
+ sketch.update(i as u64);
+ }
+ }
+
+ for i in 0..1000usize {
+ assert!(sketch.estimate(i as u64) >= i as u64);
+ }
+
+ const FACTOR: f64 = 0.5;
+ sketch.decay(FACTOR);
+
+ for i in 0..1000usize {
+ let expected = ((i as f64) * FACTOR).floor() as u64;
+ assert!(sketch.estimate(i as u64) >= expected);
+ }
+}
+
#[test]
fn test_merge() {
- let mut left = CountMinSketch::new(3, 64);
- let mut right = CountMinSketch::new(3, 64);
+ let mut left = CountMinSketch::<i64>::new(3, 64);
+ let mut right = CountMinSketch::<i64>::new(3, 64);
for _ in 0..10 {
left.update("a");
}
@@ -87,9 +176,9 @@ fn test_merge() {
#[test]
fn test_serialize_deserialize_empty() {
- let sketch = CountMinSketch::with_seed(2, 5, 123);
+ let sketch = CountMinSketch::<i64>::with_seed(2, 5, 123);
let bytes = sketch.serialize();
- let decoded = CountMinSketch::deserialize_with_seed(&bytes, 123).unwrap();
+ let decoded = CountMinSketch::<i64>::deserialize_with_seed(&bytes,
123).unwrap();
assert!(decoded.is_empty());
assert_eq!(decoded.num_hashes(), 2);
assert_eq!(decoded.num_buckets(), 5);
@@ -98,39 +187,51 @@ fn test_serialize_deserialize_empty() {
#[test]
fn test_serialize_deserialize_non_empty() {
- let mut sketch = CountMinSketch::with_seed(3, 32, 123);
+ let mut sketch = CountMinSketch::<i64>::with_seed(3, 32, 123);
for i in 0..100i64 {
sketch.update(i);
}
let bytes = sketch.serialize();
- let decoded = CountMinSketch::deserialize_with_seed(&bytes, 123).unwrap();
+ let decoded = CountMinSketch::<i64>::deserialize_with_seed(&bytes,
123).unwrap();
assert_eq!(decoded.total_weight(), sketch.total_weight());
assert_eq!(decoded.estimate(42i64), sketch.estimate(42i64));
}
+#[test]
+fn test_serialize_deserialize_non_empty_u64() {
+ let mut sketch = CountMinSketch::<u64>::with_seed(3, 32, 123);
+ for i in 0..100u64 {
+ sketch.update(i);
+ }
+ let bytes = sketch.serialize();
+ let decoded = CountMinSketch::<u64>::deserialize_with_seed(&bytes,
123).unwrap();
+ assert_eq!(decoded.total_weight(), sketch.total_weight());
+ assert_eq!(decoded.estimate(42u64), sketch.estimate(42u64));
+}
+
#[test]
#[should_panic(expected = "num_hashes must be at least 1")]
fn test_invalid_hashes() {
- CountMinSketch::new(0, 5);
+ CountMinSketch::<i64>::new(0, 5);
}
#[test]
#[should_panic(expected = "num_buckets must be at least 3")]
fn test_invalid_buckets() {
- CountMinSketch::new(1, 2);
+ CountMinSketch::<i64>::new(1, 2);
}
#[test]
-#[should_panic(expected = "Incompatible sketch configuration.")]
+#[should_panic]
fn test_merge_incompatible() {
- let mut left = CountMinSketch::new(3, 64);
- let right = CountMinSketch::new(2, 64);
+ let mut left = CountMinSketch::<i64>::new(3, 64);
+ let right = CountMinSketch::<i64>::new(2, 64);
left.merge(&right);
}
#[test]
fn test_increment_single_key_like_rust_count_min_sketch() {
- let mut sketch = CountMinSketch::new(4, 32);
+ let mut sketch = CountMinSketch::<i64>::new(4, 32);
for _ in 0..300 {
sketch.update("key");
}
@@ -139,7 +240,7 @@ fn test_increment_single_key_like_rust_count_min_sketch() {
#[test]
fn test_increment_multi_like_rust_count_min_sketch() {
- let mut sketch = CountMinSketch::new(6, 128);
+ let mut sketch = CountMinSketch::<i64>::new(6, 128);
for i in 0..1_000_000u64 {
sketch.update(i % 100);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]