This is an automated email from the ASF dual-hosted git repository.
leerho pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
The following commit(s) were added to refs/heads/main by this push:
new c2cbb7c chore: fine tune frequencies and theta sketches (#51)
c2cbb7c is described below
commit c2cbb7c3786fa08e1d0a00ddefa6182105f90c5f
Author: tison <[email protected]>
AuthorDate: Wed Dec 31 13:53:00 2025 +0800
chore: fine tune frequencies and theta sketches (#51)
* chore: fine tune code
Signed-off-by: tison <[email protected]>
* rework frequencies codec
Signed-off-by: tison <[email protected]>
* fixup
Signed-off-by: tison <[email protected]>
* docs
Signed-off-by: tison <[email protected]>
* resize factor is shared util
Signed-off-by: tison <[email protected]>
---------
Signed-off-by: tison <[email protected]>
---
datasketches/src/frequencies/mod.rs | 1 -
.../src/frequencies/reverse_purge_item_hash_map.rs | 26 ++++----
datasketches/src/frequencies/serde.rs | 10 ---
datasketches/src/frequencies/serialization.rs | 21 +++++-
datasketches/src/frequencies/sketch.rs | 75 ++++++++--------------
datasketches/src/lib.rs | 3 +
datasketches/src/resize.rs | 68 ++++++++++++++++++++
datasketches/src/theta/hash_table.rs | 58 +++++------------
datasketches/src/theta/mod.rs | 1 +
datasketches/src/theta/sketch.rs | 32 ++++-----
.../tests/frequencies_serialization_test.rs | 36 ++++-------
datasketches/tests/frequencies_update_test.rs | 14 ----
12 files changed, 175 insertions(+), 170 deletions(-)
diff --git a/datasketches/src/frequencies/mod.rs
b/datasketches/src/frequencies/mod.rs
index 6bd7987..d04343a 100644
--- a/datasketches/src/frequencies/mod.rs
+++ b/datasketches/src/frequencies/mod.rs
@@ -29,7 +29,6 @@ mod serde;
mod serialization;
mod sketch;
-pub use self::serde::ItemsSerde;
pub use self::sketch::ErrorType;
pub use self::sketch::FrequentItemsSketch;
pub use self::sketch::Row;
diff --git a/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
b/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
index 9c17e58..f934b87 100644
--- a/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
+++ b/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
@@ -35,7 +35,7 @@ pub(super) struct ReversePurgeItemHashMap<T> {
lg_length: u8,
load_threshold: usize,
keys: Vec<Option<T>>,
- values: Vec<i64>,
+ values: Vec<u64>,
states: Vec<u16>,
num_active: usize,
}
@@ -59,7 +59,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
}
/// Returns the value for `key`, or zero if the key is not present.
- pub fn get(&self, key: &T) -> i64 {
+ pub fn get(&self, key: &T) -> u64 {
let probe = self.hash_probe(key);
if self.states[probe] > 0 {
return self.values[probe];
@@ -68,7 +68,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
}
/// Adds `adjust_amount` to the value for `key`, inserting if absent.
- pub fn adjust_or_put_value(&mut self, key: T, adjust_amount: i64) {
+ pub fn adjust_or_put_value(&mut self, key: T, adjust_amount: u64) {
let mask = self.keys.len() - 1;
let mut probe = (hash_item(&key) as usize) & mask;
let mut drift: usize = 1;
@@ -95,20 +95,20 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
}
/// Removes all keys with non-positive counts.
- pub fn keep_only_positive_counts(&mut self) {
+ fn keep_only_positive_counts(&mut self) {
let len = self.keys.len();
let mut first_probe = len - 1;
while self.states[first_probe] > 0 {
first_probe -= 1;
}
for probe in (0..first_probe).rev() {
- if self.states[probe] > 0 && self.values[probe] <= 0 {
+ if self.states[probe] > 0 && self.values[probe] == 0 {
self.hash_delete(probe);
self.num_active -= 1;
}
}
for probe in (first_probe..len).rev() {
- if self.states[probe] > 0 && self.values[probe] <= 0 {
+ if self.states[probe] > 0 && self.values[probe] == 0 {
self.hash_delete(probe);
self.num_active -= 1;
}
@@ -118,16 +118,16 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
/// Shifts all values by `adjust_amount`.
///
/// This is used during purges to decrement counters.
- pub fn adjust_all_values_by(&mut self, adjust_amount: i64) {
- for value in &mut self.values {
- *value += adjust_amount;
+ fn adjust_all_values_by(&mut self, adjust_amount: u64) {
+ for value in self.values.iter_mut() {
+ *value = value.saturating_sub(adjust_amount);
}
}
/// Purges the map by estimating the median count and removing
non-positive entries.
///
/// Returns the estimated median value that was subtracted from all counts.
- pub fn purge(&mut self, sample_size: usize) -> i64 {
+ pub fn purge(&mut self, sample_size: usize) -> u64 {
let limit = sample_size.min(self.num_active).min(MAX_SAMPLE_SIZE);
let mut samples = Vec::with_capacity(limit);
let mut i = 0usize;
@@ -140,7 +140,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
let mid = samples.len() / 2;
samples.select_nth_unstable(mid);
let median = samples[mid];
- self.adjust_all_values_by(-median);
+ self.adjust_all_values_by(median);
self.keep_only_positive_counts();
median
}
@@ -206,7 +206,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
}
/// Returns the active values in the map.
- pub fn active_values(&self) -> Vec<i64> {
+ pub fn active_values(&self) -> Vec<u64> {
if self.num_active == 0 {
return Vec::new();
}
@@ -292,7 +292,7 @@ impl<'a, T> ReversePurgeItemIter<'a, T> {
}
impl<'a, T> Iterator for ReversePurgeItemIter<'a, T> {
- type Item = (&'a T, i64);
+ type Item = (&'a T, u64);
fn next(&mut self) -> Option<Self::Item> {
if self.count >= self.map.num_active {
diff --git a/datasketches/src/frequencies/serde.rs
b/datasketches/src/frequencies/serde.rs
index 376045d..a44e6b0 100644
--- a/datasketches/src/frequencies/serde.rs
+++ b/datasketches/src/frequencies/serde.rs
@@ -23,16 +23,6 @@ use crate::error::Error;
use crate::frequencies::serialization::read_i64_le;
use crate::frequencies::serialization::read_u32_le;
-/// Built-in serializers for frequency sketch items.
-#[non_exhaustive]
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum ItemsSerde {
- /// i64 items compatible with ArrayOfLongsSerDe in Java.
- Int64,
- /// UTF-8 strings compatible with ArrayOfStringsSerDe in Java.
- String,
-}
-
pub(crate) fn serialize_string_items(items: &[String]) -> Vec<u8> {
let total_len: usize = items.iter().map(|item| 4 + item.len()).sum();
let mut out = Vec::with_capacity(total_len);
diff --git a/datasketches/src/frequencies/serialization.rs
b/datasketches/src/frequencies/serialization.rs
index 24e8952..43003c9 100644
--- a/datasketches/src/frequencies/serialization.rs
+++ b/datasketches/src/frequencies/serialization.rs
@@ -50,7 +50,7 @@ pub const STREAM_WEIGHT_LONG: usize = 16;
/// Offset of offset (fourth pre-long).
pub const OFFSET_LONG: usize = 24;
-/// Read a u32 value from bytes at the given offset (little-endian).
+/// Read an u32 value from bytes at the given offset (little-endian).
#[inline]
pub fn read_u32_le(bytes: &[u8], offset: usize) -> u32 {
u32::from_le_bytes([
@@ -76,14 +76,29 @@ pub fn read_i64_le(bytes: &[u8], offset: usize) -> i64 {
])
}
+/// Read an u64 value from bytes at the given offset (little-endian).
+#[inline]
+pub fn read_u64_le(bytes: &[u8], offset: usize) -> u64 {
+ u64::from_le_bytes([
+ bytes[offset],
+ bytes[offset + 1],
+ bytes[offset + 2],
+ bytes[offset + 3],
+ bytes[offset + 4],
+ bytes[offset + 5],
+ bytes[offset + 6],
+ bytes[offset + 7],
+ ])
+}
+
/// Write a u32 value to bytes at the given offset (little-endian).
#[inline]
pub fn write_u32_le(bytes: &mut [u8], offset: usize, value: u32) {
bytes[offset..offset + 4].copy_from_slice(&value.to_le_bytes());
}
-/// Write an i64 value to bytes at the given offset (little-endian).
+/// Write an u64 value to bytes at the given offset (little-endian).
#[inline]
-pub fn write_i64_le(bytes: &mut [u8], offset: usize, value: i64) {
+pub fn write_u64_le(bytes: &mut [u8], offset: usize, value: u64) {
bytes[offset..offset + 8].copy_from_slice(&value.to_le_bytes());
}
diff --git a/datasketches/src/frequencies/sketch.rs
b/datasketches/src/frequencies/sketch.rs
index ac5b884..ee4552b 100644
--- a/datasketches/src/frequencies/sketch.rs
+++ b/datasketches/src/frequencies/sketch.rs
@@ -20,9 +20,7 @@
use std::hash::Hash;
use crate::error::Error;
-use crate::error::ErrorKind;
use crate::frequencies::reverse_purge_item_hash_map::ReversePurgeItemHashMap;
-use crate::frequencies::serde::ItemsSerde;
use crate::frequencies::serde::deserialize_i64_items;
use crate::frequencies::serde::deserialize_string_items;
use crate::frequencies::serde::serialize_i64_items;
@@ -52,8 +50,8 @@ pub enum ErrorType {
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Row<T> {
item: T,
- estimate: i64,
- upper_bound: i64,
+ estimate: u64,
+ upper_bound: u64,
lower_bound: u64,
}
@@ -64,12 +62,12 @@ impl<T> Row<T> {
}
/// Returns the estimated frequency.
- pub fn estimate(&self) -> i64 {
+ pub fn estimate(&self) -> u64 {
self.estimate
}
/// Returns the upper bound for the frequency.
- pub fn upper_bound(&self) -> i64 {
+ pub fn upper_bound(&self) -> u64 {
self.upper_bound
}
@@ -91,8 +89,8 @@ impl<T> Row<T> {
pub struct FrequentItemsSketch<T> {
lg_max_map_size: u8,
cur_map_cap: usize,
- offset: i64,
- stream_weight: i64,
+ offset: u64,
+ stream_weight: u64,
sample_size: usize,
hash_map: ReversePurgeItemHashMap<T>,
}
@@ -124,14 +122,14 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
/// Returns the total weight of the stream.
///
/// This is the sum of all counts passed to `update` and
`update_with_count`.
- pub fn total_weight(&self) -> i64 {
+ pub fn total_weight(&self) -> u64 {
self.stream_weight
}
/// Returns the estimated frequency for an item.
///
/// If the item is tracked, this is `item_count + offset`. Otherwise it is
zero.
- pub fn estimate(&self, item: &T) -> i64 {
+ pub fn estimate(&self, item: &T) -> u64 {
let value = self.hash_map.get(item);
if value > 0 { value + self.offset } else { 0 }
}
@@ -141,15 +139,14 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
/// This value is never negative and is guaranteed to be no larger than
the true frequency.
/// If the item is not tracked, the lower bound is zero.
pub fn lower_bound(&self, item: &T) -> u64 {
- let value = self.hash_map.get(item);
- value.max(0) as u64
+ self.hash_map.get(item)
}
/// Returns the guaranteed upper bound frequency for an item.
///
/// This value is guaranteed to be no smaller than the true frequency.
/// If the item is tracked, this is `item_count + offset`.
- pub fn upper_bound(&self, item: &T) -> i64 {
+ pub fn upper_bound(&self, item: &T) -> u64 {
self.hash_map.get(item) + self.offset
}
@@ -158,7 +155,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
///
/// This is equivalent to the maximum distance between the upper bound and
the lower bound
/// for any item.
- pub fn maximum_error(&self) -> i64 {
+ pub fn maximum_error(&self) -> u64 {
self.offset
}
@@ -208,12 +205,8 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
/// Updates the sketch with an item and count.
///
- /// # Panics
- ///
- /// Panics if `count` is negative.
- ///
/// A count of zero is a no-op.
- pub fn update_with_count(&mut self, item: T, count: i64) {
+ pub fn update_with_count(&mut self, item: T, count: u64) {
if count == 0 {
return;
}
@@ -266,7 +259,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
pub fn frequent_items_with_threshold(
&self,
error_type: ErrorType,
- threshold: i64,
+ threshold: u64,
) -> Vec<Row<T>>
where
T: Clone,
@@ -285,7 +278,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
item: item.clone(),
estimate: upper,
upper_bound: upper,
- lower_bound: lower.max(0) as u64,
+ lower_bound: lower,
});
}
}
@@ -357,12 +350,12 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
out[LG_CUR_MAP_SIZE_BYTE] = self.hash_map.lg_length();
out[FLAGS_BYTE] = 0;
write_u32_le(&mut out, ACTIVE_ITEMS_INT, active_items as u32);
- write_i64_le(&mut out, STREAM_WEIGHT_LONG, self.stream_weight);
- write_i64_le(&mut out, OFFSET_LONG, self.offset);
+ write_u64_le(&mut out, STREAM_WEIGHT_LONG, self.stream_weight);
+ write_u64_le(&mut out, OFFSET_LONG, self.offset);
let mut offset = PREAMBLE_LONGS_NONEMPTY as usize * 8;
for value in values {
- write_i64_le(&mut out, offset, value);
+ write_u64_le(&mut out, offset, value);
offset += 8;
}
out[offset..offset + items_bytes.len()].copy_from_slice(&items_bytes);
@@ -415,8 +408,8 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
return Err(Error::insufficient_data("full preamble"));
}
let active_items = read_u32_le(bytes, ACTIVE_ITEMS_INT) as usize;
- let stream_weight = read_i64_le(bytes, STREAM_WEIGHT_LONG);
- let offset_val = read_i64_le(bytes, OFFSET_LONG);
+ let stream_weight = read_u64_le(bytes, STREAM_WEIGHT_LONG);
+ let offset_val = read_u64_le(bytes, OFFSET_LONG);
let values_offset = PREAMBLE_LONGS_NONEMPTY as usize * 8;
let values_bytes = active_items
.checked_mul(8)
@@ -427,7 +420,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
}
let mut values = Vec::with_capacity(active_items);
for i in 0..active_items {
- values.push(read_i64_le(bytes, values_offset + i * 8));
+ values.push(read_u64_le(bytes, values_offset + i * 8));
}
let (items, consumed) = deserialize_items(&bytes[items_offset..],
active_items)?;
if items.len() != active_items {
@@ -454,17 +447,9 @@ impl FrequentItemsSketch<i64> {
self.serialize_inner(serialize_i64_items)
}
- /// Deserializes a sketch from bytes using the selected serializer.
- ///
- /// Returns an error if `serde` does not match the sketch item type.
- pub fn deserialize(bytes: &[u8], serde: ItemsSerde) -> Result<Self, Error>
{
- match serde {
- ItemsSerde::Int64 => Self::deserialize_inner(bytes,
deserialize_i64_items),
- ItemsSerde::String => Err(Error::new(
- ErrorKind::InvalidArgument,
- "ItemsSerde::String cannot deserialize i64 items",
- )),
- }
+ /// Deserializes a sketch from bytes.
+ pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
+ Self::deserialize_inner(bytes, deserialize_i64_items)
}
}
@@ -474,17 +459,9 @@ impl FrequentItemsSketch<String> {
self.serialize_inner(serialize_string_items)
}
- /// Deserializes a sketch from bytes using the selected serializer.
- ///
- /// Returns an error if `serde` does not match the sketch item type.
- pub fn deserialize(bytes: &[u8], serde: ItemsSerde) -> Result<Self, Error>
{
- match serde {
- ItemsSerde::String => Self::deserialize_inner(bytes,
deserialize_string_items),
- ItemsSerde::Int64 => Err(Error::new(
- ErrorKind::InvalidArgument,
- "ItemsSerde::Int64 cannot deserialize String items",
- )),
- }
+ /// Deserializes a sketch from bytes.
+ pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
+ Self::deserialize_inner(bytes, deserialize_string_items)
}
}
diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs
index 4d779a0..acc051d 100644
--- a/datasketches/src/lib.rs
+++ b/datasketches/src/lib.rs
@@ -39,3 +39,6 @@ pub mod theta;
mod codec;
mod hash;
+mod resize;
+
+pub use self::resize::ResizeFactor;
diff --git a/datasketches/src/resize.rs b/datasketches/src/resize.rs
new file mode 100644
index 0000000..caf87ab
--- /dev/null
+++ b/datasketches/src/resize.rs
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// For the Families that accept this configuration parameter, it controls the
size multiple that
+/// affects how fast the internal cache grows, when more space is required.
+///
+/// For Theta Sketches, the Resize Factor is a dynamic, speed performance vs.
memory size tradeoff.
+/// The sketches created on-heap and configured with a Resize Factor of > X1
start out with an
+/// internal hash table size that is the smallest submultiple of the target
Nominal Entries
+/// and larger than the minimum required hash table size for that sketch.
+///
+/// When the sketch needs to be resized larger, then the Resize Factor is used
as a multiplier of
+/// the current sketch cache array size.
+///
+/// "X1" means no resizing is allowed and the sketch will be initialized at
full size.
+///
+/// "X2" means the internal cache will start very small and double in size
until the target size is
+/// reached.
+///
+/// Similarly, "X4" is a factor of 4 and "X8" is a factor of 8.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ResizeFactor {
+ /// Do not resize. Sketch will be configured to full size.
+ X1,
+ /// Resize by factor of 2
+ X2,
+ /// Resize by factor of 4
+ X4,
+ /// Resize by factor of 8
+ X8,
+}
+
+impl ResizeFactor {
+ /// Returns the Log-base 2 of the Resize Factor
+ pub fn lg_value(self) -> u8 {
+ match self {
+ ResizeFactor::X1 => 0,
+ ResizeFactor::X2 => 1,
+ ResizeFactor::X4 => 2,
+ ResizeFactor::X8 => 3,
+ }
+ }
+
+ /// Returns the Resize Factor.
+ pub fn value(self) -> usize {
+ // 1 << lg_value
+ match self {
+ ResizeFactor::X1 => 1,
+ ResizeFactor::X2 => 2,
+ ResizeFactor::X4 => 4,
+ ResizeFactor::X8 => 8,
+ }
+ }
+}
diff --git a/datasketches/src/theta/hash_table.rs
b/datasketches/src/theta/hash_table.rs
index 634f232..5e5dcb6 100644
--- a/datasketches/src/theta/hash_table.rs
+++ b/datasketches/src/theta/hash_table.rs
@@ -17,6 +17,7 @@
use std::hash::Hash;
+use crate::ResizeFactor;
use crate::hash::MurmurHash3X64128;
/// Maximum theta value (signed max for compatibility with Java)
@@ -31,28 +32,6 @@ pub const MAX_LG_K: u8 = 26;
/// Default log2 of K
pub const DEFAULT_LG_K: u8 = 12;
-/// Hash table resize factor
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
-pub enum ResizeFactor {
- /// Resize by factor of 2
- X2 = 2,
- /// Resize by factor of 4
- X4 = 4,
- /// Resize by factor of 8
- #[default]
- X8 = 8,
-}
-
-impl ResizeFactor {
- pub fn lg_value(&self) -> u8 {
- match self {
- ResizeFactor::X2 => 1,
- ResizeFactor::X4 => 2,
- ResizeFactor::X8 => 3,
- }
- }
-}
-
/// Resize threshold (0.5 = 50% load factor)
const RESIZE_THRESHOLD: f64 = 0.5;
@@ -329,12 +308,10 @@ impl ThetaHashTable {
fn starting_sub_multiple(lg_target: u8, lg_min: u8, lg_resize_factor: u8) ->
u8 {
if lg_target <= lg_min {
lg_min
+ } else if lg_resize_factor == 0 {
+ lg_target
} else {
- if lg_resize_factor == 0 {
- lg_target
- } else {
- ((lg_target - lg_min) % lg_resize_factor) + lg_min
- }
+ ((lg_target - lg_min) % lg_resize_factor) + lg_min
}
}
@@ -423,6 +400,17 @@ mod tests {
#[test]
fn test_resize() {
+ fn populate_values(table: &mut ThetaHashTable, count: usize) -> usize {
+ let mut inserted = 0;
+ for i in 0..count {
+ let hash = table.hash_and_screen(format!("value_{}", i));
+ if hash != 0 && table.try_insert(hash) {
+ inserted += 1;
+ }
+ }
+ inserted
+ }
+
{
let mut table = ThetaHashTable::new(8, ResizeFactor::X2, 1.0,
DEFAULT_UPDATE_SEED);
@@ -430,13 +418,7 @@ mod tests {
// Insert enough values to trigger resize (50% threshold)
// Capacity = 32 * 0.5 = 16
- let mut inserted = 0;
- for i in 0..20 {
- let hash = table.hash_and_screen(format!("value_{}", i));
- if hash != 0 && table.try_insert(hash) {
- inserted += 1;
- }
- }
+ let inserted = populate_values(&mut table, 20);
// Table should have resized and all values should be inserted
assert!(table.num_entries() > 0);
@@ -452,13 +434,7 @@ mod tests {
// Insert enough values to trigger resize (50% threshold)
// Capacity = 32 * 0.5 = 16
- let mut inserted = 0;
- for i in 0..20 {
- let hash = table.hash_and_screen(format!("value_{}", i));
- if hash != 0 && table.try_insert(hash) {
- inserted += 1;
- }
- }
+ let inserted = populate_values(&mut table, 20);
// Table should have resized and all values should be inserted
assert!(table.num_entries() > 0);
diff --git a/datasketches/src/theta/mod.rs b/datasketches/src/theta/mod.rs
index e34d30e..0d50348 100644
--- a/datasketches/src/theta/mod.rs
+++ b/datasketches/src/theta/mod.rs
@@ -33,3 +33,4 @@ mod hash_table;
mod sketch;
pub use self::sketch::ThetaSketch;
+pub use self::sketch::ThetaSketchBuilder;
diff --git a/datasketches/src/theta/sketch.rs b/datasketches/src/theta/sketch.rs
index 5f67a2e..0ad5357 100644
--- a/datasketches/src/theta/sketch.rs
+++ b/datasketches/src/theta/sketch.rs
@@ -22,12 +22,12 @@
use std::hash::Hash;
+use crate::ResizeFactor;
use crate::hash::DEFAULT_UPDATE_SEED;
use crate::theta::hash_table::DEFAULT_LG_K;
use crate::theta::hash_table::MAX_LG_K;
use crate::theta::hash_table::MAX_THETA;
use crate::theta::hash_table::MIN_LG_K;
-use crate::theta::hash_table::ResizeFactor;
use crate::theta::hash_table::ThetaHashTable;
/// Mutable theta sketch for building from input data
@@ -131,7 +131,7 @@ impl Default for ThetaSketchBuilder {
fn default() -> Self {
Self {
lg_k: DEFAULT_LG_K,
- resize_factor: ResizeFactor::default(),
+ resize_factor: ResizeFactor::X8,
sampling_probability: 1.0,
seed: DEFAULT_UPDATE_SEED,
}
@@ -139,7 +139,7 @@ impl Default for ThetaSketchBuilder {
}
impl ThetaSketchBuilder {
- /// Set lg_k (log2 of nominal size k)
+ /// Set lg_k (log2 of nominal size k).
///
/// # Panics
///
@@ -156,34 +156,33 @@ impl ThetaSketchBuilder {
self
}
- /// Set resize factor
- pub fn resize_factor(mut self, rf: ResizeFactor) -> Self {
- self.resize_factor = rf;
+ /// Set resize factor.
+ pub fn resize_factor(mut self, factor: ResizeFactor) -> Self {
+ self.resize_factor = factor;
self
}
- /// Set sampling probability p
+ /// Set sampling probability p.
///
/// # Panics
///
/// If p is not in range [0.0, 1.0]
- pub fn sampling_probability(mut self, p: f32) -> Self {
+ pub fn sampling_probability(mut self, probability: f32) -> Self {
assert!(
- (0.0..=1.0).contains(&p),
- "p must be in [0.0, 1.0], got {}",
- p
+ (0.0..=1.0).contains(&probability),
+ "p must be in [0.0, 1.0], got {probability}"
);
- self.sampling_probability = p;
+ self.sampling_probability = probability;
self
}
- /// Set hash seed
+ /// Set hash seed.
pub fn seed(mut self, seed: u64) -> Self {
self.seed = seed;
self
}
- /// Build the ThetaSketch
+ /// Build the ThetaSketch.
pub fn build(self) -> ThetaSketch {
let table = ThetaHashTable::new(
self.lg_k,
@@ -199,10 +198,11 @@ impl ThetaSketchBuilder {
/// Canonicalize double value for compatibility with Java
fn canonical_double(value: f64) -> i64 {
if value.is_nan() {
- 0x7ff8000000000000i64 // Java's Double.doubleToLongBits() NaN value
+ // Java's Double.doubleToLongBits() NaN value
+ 0x7ff8000000000000i64
} else {
// -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode,
- // which Rust guarantees. Thus by adding a positive zero we
+ // which Rust guarantees. Thus, by adding a positive zero we
// canonicalize signed zero without any branches in one instruction.
(value + 0.0).to_bits() as i64
}
diff --git a/datasketches/tests/frequencies_serialization_test.rs
b/datasketches/tests/frequencies_serialization_test.rs
index d43b4d1..f8d95bc 100644
--- a/datasketches/tests/frequencies_serialization_test.rs
+++ b/datasketches/tests/frequencies_serialization_test.rs
@@ -22,17 +22,15 @@ use std::fs;
use common::serialization_test_data;
use datasketches::error::ErrorKind;
use datasketches::frequencies::FrequentItemsSketch;
-use datasketches::frequencies::ItemsSerde;
#[test]
fn test_longs_round_trip() {
let mut sketch: FrequentItemsSketch<i64> = FrequentItemsSketch::new(32);
for i in 1..=100 {
- sketch.update_with_count(i, i);
+ sketch.update_with_count(i, i as u64);
}
- let serde = ItemsSerde::Int64;
let bytes = sketch.serialize();
- let restored = FrequentItemsSketch::<i64>::deserialize(&bytes,
serde).unwrap();
+ let restored = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
assert_eq!(restored.total_weight(), sketch.total_weight());
assert_eq!(restored.estimate(&42), sketch.estimate(&42));
assert_eq!(restored.maximum_error(), sketch.maximum_error());
@@ -45,9 +43,8 @@ fn test_items_round_trip() {
sketch.update_with_count("beta".to_string(), 5);
sketch.update_with_count("gamma".to_string(), 7);
- let serde = ItemsSerde::String;
let bytes = sketch.serialize();
- let restored = FrequentItemsSketch::<String>::deserialize(&bytes,
serde).unwrap();
+ let restored = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
assert_eq!(restored.total_weight(), sketch.total_weight());
assert_eq!(restored.estimate(&"beta".to_string()), 5);
assert_eq!(restored.maximum_error(), sketch.maximum_error());
@@ -56,19 +53,18 @@ fn test_items_round_trip() {
#[test]
fn test_java_frequent_longs_compatibility() {
let test_cases = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];
- let serde = ItemsSerde::Int64;
for n in test_cases {
let filename = format!("frequent_long_n{}_java.sk", n);
let path = serialization_test_data("java_generated_files", &filename);
let bytes = fs::read(&path).unwrap();
- let sketch = FrequentItemsSketch::<i64>::deserialize(&bytes,
serde).unwrap();
+ let sketch = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
assert_eq!(sketch.is_empty(), n == 0);
if n > 10 {
assert!(sketch.maximum_error() > 0);
} else {
assert_eq!(sketch.maximum_error(), 0);
}
- assert_eq!(sketch.total_weight(), n as i64);
+ assert_eq!(sketch.total_weight(), n);
}
}
@@ -76,8 +72,7 @@ fn test_java_frequent_longs_compatibility() {
fn test_java_frequent_strings_ascii() {
let path = serialization_test_data("java_generated_files",
"frequent_string_ascii_java.sk");
let bytes = fs::read(&path).unwrap();
- let serde = ItemsSerde::String;
- let sketch = FrequentItemsSketch::<String>::deserialize(&bytes,
serde).unwrap();
+ let sketch = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
assert!(!sketch.is_empty());
assert_eq!(sketch.maximum_error(), 0);
assert_eq!(sketch.total_weight(), 10);
@@ -103,8 +98,7 @@ fn test_java_frequent_strings_ascii() {
fn test_java_frequent_strings_utf8() {
let path = serialization_test_data("java_generated_files",
"frequent_string_utf8_java.sk");
let bytes = fs::read(&path).unwrap();
- let serde = ItemsSerde::String;
- let sketch = FrequentItemsSketch::<String>::deserialize(&bytes,
serde).unwrap();
+ let sketch = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
assert!(!sketch.is_empty());
assert_eq!(sketch.maximum_error(), 0);
assert_eq!(sketch.total_weight(), 28);
@@ -120,12 +114,11 @@ fn test_java_frequent_strings_utf8() {
#[test]
fn test_cpp_frequent_longs_compatibility() {
let test_cases = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];
- let serde = ItemsSerde::Int64;
for n in test_cases {
let filename = format!("frequent_long_n{}_cpp.sk", n);
let path = serialization_test_data("cpp_generated_files", &filename);
let bytes = fs::read(&path).unwrap();
- let sketch = FrequentItemsSketch::<i64>::deserialize(&bytes, serde);
+ let sketch = FrequentItemsSketch::<i64>::deserialize(&bytes);
if cfg!(windows) {
if let Err(err) = sketch {
assert_eq!(err.kind(), ErrorKind::InvalidData);
@@ -143,7 +136,7 @@ fn test_cpp_frequent_longs_compatibility() {
} else {
assert_eq!(sketch.maximum_error(), 0);
}
- assert_eq!(sketch.total_weight(), n as i64);
+ assert_eq!(sketch.total_weight(), n);
}
}
@@ -154,15 +147,14 @@ fn test_cpp_frequent_strings_compatibility() {
let filename = format!("frequent_string_n{}_cpp.sk", n);
let path = serialization_test_data("cpp_generated_files", &filename);
let bytes = fs::read(&path).unwrap();
- let serde = ItemsSerde::String;
- let sketch = FrequentItemsSketch::<String>::deserialize(&bytes,
serde).unwrap();
+ let sketch =
FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
assert_eq!(sketch.is_empty(), n == 0);
if n > 10 {
assert!(sketch.maximum_error() > 0);
} else {
assert_eq!(sketch.maximum_error(), 0);
}
- assert_eq!(sketch.total_weight(), n as i64);
+ assert_eq!(sketch.total_weight(), n);
}
}
@@ -170,8 +162,7 @@ fn test_cpp_frequent_strings_compatibility() {
fn test_cpp_frequent_strings_ascii() {
let path = serialization_test_data("cpp_generated_files",
"frequent_string_ascii_cpp.sk");
let bytes = fs::read(&path).unwrap();
- let serde = ItemsSerde::String;
- let sketch = FrequentItemsSketch::<String>::deserialize(&bytes,
serde).unwrap();
+ let sketch = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
assert!(!sketch.is_empty());
assert_eq!(sketch.maximum_error(), 0);
assert_eq!(sketch.total_weight(), 10);
@@ -197,8 +188,7 @@ fn test_cpp_frequent_strings_ascii() {
fn test_cpp_frequent_strings_utf8() {
let path = serialization_test_data("cpp_generated_files",
"frequent_string_utf8_cpp.sk");
let bytes = fs::read(&path).unwrap();
- let serde = ItemsSerde::String;
- let sketch = FrequentItemsSketch::<String>::deserialize(&bytes,
serde).unwrap();
+ let sketch = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
assert!(!sketch.is_empty());
assert_eq!(sketch.maximum_error(), 0);
assert_eq!(sketch.total_weight(), 28);
diff --git a/datasketches/tests/frequencies_update_test.rs
b/datasketches/tests/frequencies_update_test.rs
index 8947793..f2b0001 100644
--- a/datasketches/tests/frequencies_update_test.rs
+++ b/datasketches/tests/frequencies_update_test.rs
@@ -479,20 +479,6 @@ fn test_longs_reset() {
assert_eq!(sketch.lg_max_map_size(), 3);
}
-#[test]
-#[should_panic(expected = "count may not be negative")]
-fn test_longs_negative_count_panics() {
- let mut sketch: FrequentItemsSketch<i64> = FrequentItemsSketch::new(8);
- sketch.update_with_count(1, -1);
-}
-
-#[test]
-#[should_panic(expected = "count may not be negative")]
-fn test_items_negative_count_panics() {
- let mut sketch = FrequentItemsSketch::new(8);
- sketch.update_with_count("a".to_string(), -1);
-}
-
#[test]
#[should_panic(expected = "value must be power of 2")]
fn test_longs_invalid_map_size_panics() {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]