(datasketches-rust) branch main updated: chore: fine tune frequencies and theta sketches (#51)

leerho Tue, 30 Dec 2025 21:53:12 -0800

This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git



The following commit(s) were added to refs/heads/main by this push:
     new c2cbb7c  chore: fine tune frequencies and theta sketches (#51)
c2cbb7c is described below

commit c2cbb7c3786fa08e1d0a00ddefa6182105f90c5f
Author: tison <[email protected]>
AuthorDate: Wed Dec 31 13:53:00 2025 +0800

    chore: fine tune frequencies and theta sketches (#51)
    
    * chore: fine tune code
    
    Signed-off-by: tison <[email protected]>
    
    * rework frequencies codec
    
    Signed-off-by: tison <[email protected]>
    
    * fixup
    
    Signed-off-by: tison <[email protected]>
    
    * docs
    
    Signed-off-by: tison <[email protected]>
    
    * resize factor is shared util
    
    Signed-off-by: tison <[email protected]>
    
    ---------
    
    Signed-off-by: tison <[email protected]>
---
 datasketches/src/frequencies/mod.rs                |  1 -
 .../src/frequencies/reverse_purge_item_hash_map.rs | 26 ++++----
 datasketches/src/frequencies/serde.rs              | 10 ---
 datasketches/src/frequencies/serialization.rs      | 21 +++++-
 datasketches/src/frequencies/sketch.rs             | 75 ++++++++--------------
 datasketches/src/lib.rs                            |  3 +
 datasketches/src/resize.rs                         | 68 ++++++++++++++++++++
 datasketches/src/theta/hash_table.rs               | 58 +++++------------
 datasketches/src/theta/mod.rs                      |  1 +
 datasketches/src/theta/sketch.rs                   | 32 ++++-----
 .../tests/frequencies_serialization_test.rs        | 36 ++++-------
 datasketches/tests/frequencies_update_test.rs      | 14 ----
 12 files changed, 175 insertions(+), 170 deletions(-)

diff --git a/datasketches/src/frequencies/mod.rs 
b/datasketches/src/frequencies/mod.rs
index 6bd7987..d04343a 100644
--- a/datasketches/src/frequencies/mod.rs
+++ b/datasketches/src/frequencies/mod.rs
@@ -29,7 +29,6 @@ mod serde;
 mod serialization;
 mod sketch;
 
-pub use self::serde::ItemsSerde;
 pub use self::sketch::ErrorType;
 pub use self::sketch::FrequentItemsSketch;
 pub use self::sketch::Row;
diff --git a/datasketches/src/frequencies/reverse_purge_item_hash_map.rs 
b/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
index 9c17e58..f934b87 100644
--- a/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
+++ b/datasketches/src/frequencies/reverse_purge_item_hash_map.rs
@@ -35,7 +35,7 @@ pub(super) struct ReversePurgeItemHashMap<T> {
     lg_length: u8,
     load_threshold: usize,
     keys: Vec<Option<T>>,
-    values: Vec<i64>,
+    values: Vec<u64>,
     states: Vec<u16>,
     num_active: usize,
 }
@@ -59,7 +59,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
     }
 
     /// Returns the value for `key`, or zero if the key is not present.
-    pub fn get(&self, key: &T) -> i64 {
+    pub fn get(&self, key: &T) -> u64 {
         let probe = self.hash_probe(key);
         if self.states[probe] > 0 {
             return self.values[probe];
@@ -68,7 +68,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
     }
 
     /// Adds `adjust_amount` to the value for `key`, inserting if absent.
-    pub fn adjust_or_put_value(&mut self, key: T, adjust_amount: i64) {
+    pub fn adjust_or_put_value(&mut self, key: T, adjust_amount: u64) {
         let mask = self.keys.len() - 1;
         let mut probe = (hash_item(&key) as usize) & mask;
         let mut drift: usize = 1;
@@ -95,20 +95,20 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
     }
 
     /// Removes all keys with non-positive counts.
-    pub fn keep_only_positive_counts(&mut self) {
+    fn keep_only_positive_counts(&mut self) {
         let len = self.keys.len();
         let mut first_probe = len - 1;
         while self.states[first_probe] > 0 {
             first_probe -= 1;
         }
         for probe in (0..first_probe).rev() {
-            if self.states[probe] > 0 && self.values[probe] <= 0 {
+            if self.states[probe] > 0 && self.values[probe] == 0 {
                 self.hash_delete(probe);
                 self.num_active -= 1;
             }
         }
         for probe in (first_probe..len).rev() {
-            if self.states[probe] > 0 && self.values[probe] <= 0 {
+            if self.states[probe] > 0 && self.values[probe] == 0 {
                 self.hash_delete(probe);
                 self.num_active -= 1;
             }
@@ -118,16 +118,16 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
     /// Shifts all values by `adjust_amount`.
     ///
     /// This is used during purges to decrement counters.
-    pub fn adjust_all_values_by(&mut self, adjust_amount: i64) {
-        for value in &mut self.values {
-            *value += adjust_amount;
+    fn adjust_all_values_by(&mut self, adjust_amount: u64) {
+        for value in self.values.iter_mut() {
+            *value = value.saturating_sub(adjust_amount);
         }
     }
 
     /// Purges the map by estimating the median count and removing 
non-positive entries.
     ///
     /// Returns the estimated median value that was subtracted from all counts.
-    pub fn purge(&mut self, sample_size: usize) -> i64 {
+    pub fn purge(&mut self, sample_size: usize) -> u64 {
         let limit = sample_size.min(self.num_active).min(MAX_SAMPLE_SIZE);
         let mut samples = Vec::with_capacity(limit);
         let mut i = 0usize;
@@ -140,7 +140,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
         let mid = samples.len() / 2;
         samples.select_nth_unstable(mid);
         let median = samples[mid];
-        self.adjust_all_values_by(-median);
+        self.adjust_all_values_by(median);
         self.keep_only_positive_counts();
         median
     }
@@ -206,7 +206,7 @@ impl<T: Eq + Hash> ReversePurgeItemHashMap<T> {
     }
 
     /// Returns the active values in the map.
-    pub fn active_values(&self) -> Vec<i64> {
+    pub fn active_values(&self) -> Vec<u64> {
         if self.num_active == 0 {
             return Vec::new();
         }
@@ -292,7 +292,7 @@ impl<'a, T> ReversePurgeItemIter<'a, T> {
 }
 
 impl<'a, T> Iterator for ReversePurgeItemIter<'a, T> {
-    type Item = (&'a T, i64);
+    type Item = (&'a T, u64);
 
     fn next(&mut self) -> Option<Self::Item> {
         if self.count >= self.map.num_active {
diff --git a/datasketches/src/frequencies/serde.rs 
b/datasketches/src/frequencies/serde.rs
index 376045d..a44e6b0 100644
--- a/datasketches/src/frequencies/serde.rs
+++ b/datasketches/src/frequencies/serde.rs
@@ -23,16 +23,6 @@ use crate::error::Error;
 use crate::frequencies::serialization::read_i64_le;
 use crate::frequencies::serialization::read_u32_le;
 
-/// Built-in serializers for frequency sketch items.
-#[non_exhaustive]
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum ItemsSerde {
-    /// i64 items compatible with ArrayOfLongsSerDe in Java.
-    Int64,
-    /// UTF-8 strings compatible with ArrayOfStringsSerDe in Java.
-    String,
-}
-
 pub(crate) fn serialize_string_items(items: &[String]) -> Vec<u8> {
     let total_len: usize = items.iter().map(|item| 4 + item.len()).sum();
     let mut out = Vec::with_capacity(total_len);
diff --git a/datasketches/src/frequencies/serialization.rs 
b/datasketches/src/frequencies/serialization.rs
index 24e8952..43003c9 100644
--- a/datasketches/src/frequencies/serialization.rs
+++ b/datasketches/src/frequencies/serialization.rs
@@ -50,7 +50,7 @@ pub const STREAM_WEIGHT_LONG: usize = 16;
 /// Offset of offset (fourth pre-long).
 pub const OFFSET_LONG: usize = 24;
 
-/// Read a u32 value from bytes at the given offset (little-endian).
+/// Read an u32 value from bytes at the given offset (little-endian).
 #[inline]
 pub fn read_u32_le(bytes: &[u8], offset: usize) -> u32 {
     u32::from_le_bytes([
@@ -76,14 +76,29 @@ pub fn read_i64_le(bytes: &[u8], offset: usize) -> i64 {
     ])
 }
 
+/// Read an u64 value from bytes at the given offset (little-endian).
+#[inline]
+pub fn read_u64_le(bytes: &[u8], offset: usize) -> u64 {
+    u64::from_le_bytes([
+        bytes[offset],
+        bytes[offset + 1],
+        bytes[offset + 2],
+        bytes[offset + 3],
+        bytes[offset + 4],
+        bytes[offset + 5],
+        bytes[offset + 6],
+        bytes[offset + 7],
+    ])
+}
+
 /// Write a u32 value to bytes at the given offset (little-endian).
 #[inline]
 pub fn write_u32_le(bytes: &mut [u8], offset: usize, value: u32) {
     bytes[offset..offset + 4].copy_from_slice(&value.to_le_bytes());
 }
 
-/// Write an i64 value to bytes at the given offset (little-endian).
+/// Write an u64 value to bytes at the given offset (little-endian).
 #[inline]
-pub fn write_i64_le(bytes: &mut [u8], offset: usize, value: i64) {
+pub fn write_u64_le(bytes: &mut [u8], offset: usize, value: u64) {
     bytes[offset..offset + 8].copy_from_slice(&value.to_le_bytes());
 }
diff --git a/datasketches/src/frequencies/sketch.rs 
b/datasketches/src/frequencies/sketch.rs
index ac5b884..ee4552b 100644
--- a/datasketches/src/frequencies/sketch.rs
+++ b/datasketches/src/frequencies/sketch.rs
@@ -20,9 +20,7 @@
 use std::hash::Hash;
 
 use crate::error::Error;
-use crate::error::ErrorKind;
 use crate::frequencies::reverse_purge_item_hash_map::ReversePurgeItemHashMap;
-use crate::frequencies::serde::ItemsSerde;
 use crate::frequencies::serde::deserialize_i64_items;
 use crate::frequencies::serde::deserialize_string_items;
 use crate::frequencies::serde::serialize_i64_items;
@@ -52,8 +50,8 @@ pub enum ErrorType {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Row<T> {
     item: T,
-    estimate: i64,
-    upper_bound: i64,
+    estimate: u64,
+    upper_bound: u64,
     lower_bound: u64,
 }
 
@@ -64,12 +62,12 @@ impl<T> Row<T> {
     }
 
     /// Returns the estimated frequency.
-    pub fn estimate(&self) -> i64 {
+    pub fn estimate(&self) -> u64 {
         self.estimate
     }
 
     /// Returns the upper bound for the frequency.
-    pub fn upper_bound(&self) -> i64 {
+    pub fn upper_bound(&self) -> u64 {
         self.upper_bound
     }
 
@@ -91,8 +89,8 @@ impl<T> Row<T> {
 pub struct FrequentItemsSketch<T> {
     lg_max_map_size: u8,
     cur_map_cap: usize,
-    offset: i64,
-    stream_weight: i64,
+    offset: u64,
+    stream_weight: u64,
     sample_size: usize,
     hash_map: ReversePurgeItemHashMap<T>,
 }
@@ -124,14 +122,14 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
     /// Returns the total weight of the stream.
     ///
     /// This is the sum of all counts passed to `update` and 
`update_with_count`.
-    pub fn total_weight(&self) -> i64 {
+    pub fn total_weight(&self) -> u64 {
         self.stream_weight
     }
 
     /// Returns the estimated frequency for an item.
     ///
     /// If the item is tracked, this is `item_count + offset`. Otherwise it is 
zero.
-    pub fn estimate(&self, item: &T) -> i64 {
+    pub fn estimate(&self, item: &T) -> u64 {
         let value = self.hash_map.get(item);
         if value > 0 { value + self.offset } else { 0 }
     }
@@ -141,15 +139,14 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
     /// This value is never negative and is guaranteed to be no larger than 
the true frequency.
     /// If the item is not tracked, the lower bound is zero.
     pub fn lower_bound(&self, item: &T) -> u64 {
-        let value = self.hash_map.get(item);
-        value.max(0) as u64
+        self.hash_map.get(item)
     }
 
     /// Returns the guaranteed upper bound frequency for an item.
     ///
     /// This value is guaranteed to be no smaller than the true frequency.
     /// If the item is tracked, this is `item_count + offset`.
-    pub fn upper_bound(&self, item: &T) -> i64 {
+    pub fn upper_bound(&self, item: &T) -> u64 {
         self.hash_map.get(item) + self.offset
     }
 
@@ -158,7 +155,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
     ///
     /// This is equivalent to the maximum distance between the upper bound and 
the lower bound
     /// for any item.
-    pub fn maximum_error(&self) -> i64 {
+    pub fn maximum_error(&self) -> u64 {
         self.offset
     }
 
@@ -208,12 +205,8 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
 
     /// Updates the sketch with an item and count.
     ///
-    /// # Panics
-    ///
-    /// Panics if `count` is negative.
-    ///
     /// A count of zero is a no-op.
-    pub fn update_with_count(&mut self, item: T, count: i64) {
+    pub fn update_with_count(&mut self, item: T, count: u64) {
         if count == 0 {
             return;
         }
@@ -266,7 +259,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
     pub fn frequent_items_with_threshold(
         &self,
         error_type: ErrorType,
-        threshold: i64,
+        threshold: u64,
     ) -> Vec<Row<T>>
     where
         T: Clone,
@@ -285,7 +278,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
                     item: item.clone(),
                     estimate: upper,
                     upper_bound: upper,
-                    lower_bound: lower.max(0) as u64,
+                    lower_bound: lower,
                 });
             }
         }
@@ -357,12 +350,12 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
         out[LG_CUR_MAP_SIZE_BYTE] = self.hash_map.lg_length();
         out[FLAGS_BYTE] = 0;
         write_u32_le(&mut out, ACTIVE_ITEMS_INT, active_items as u32);
-        write_i64_le(&mut out, STREAM_WEIGHT_LONG, self.stream_weight);
-        write_i64_le(&mut out, OFFSET_LONG, self.offset);
+        write_u64_le(&mut out, STREAM_WEIGHT_LONG, self.stream_weight);
+        write_u64_le(&mut out, OFFSET_LONG, self.offset);
 
         let mut offset = PREAMBLE_LONGS_NONEMPTY as usize * 8;
         for value in values {
-            write_i64_le(&mut out, offset, value);
+            write_u64_le(&mut out, offset, value);
             offset += 8;
         }
         out[offset..offset + items_bytes.len()].copy_from_slice(&items_bytes);
@@ -415,8 +408,8 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
             return Err(Error::insufficient_data("full preamble"));
         }
         let active_items = read_u32_le(bytes, ACTIVE_ITEMS_INT) as usize;
-        let stream_weight = read_i64_le(bytes, STREAM_WEIGHT_LONG);
-        let offset_val = read_i64_le(bytes, OFFSET_LONG);
+        let stream_weight = read_u64_le(bytes, STREAM_WEIGHT_LONG);
+        let offset_val = read_u64_le(bytes, OFFSET_LONG);
         let values_offset = PREAMBLE_LONGS_NONEMPTY as usize * 8;
         let values_bytes = active_items
             .checked_mul(8)
@@ -427,7 +420,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
         }
         let mut values = Vec::with_capacity(active_items);
         for i in 0..active_items {
-            values.push(read_i64_le(bytes, values_offset + i * 8));
+            values.push(read_u64_le(bytes, values_offset + i * 8));
         }
         let (items, consumed) = deserialize_items(&bytes[items_offset..], 
active_items)?;
         if items.len() != active_items {
@@ -454,17 +447,9 @@ impl FrequentItemsSketch<i64> {
         self.serialize_inner(serialize_i64_items)
     }
 
-    /// Deserializes a sketch from bytes using the selected serializer.
-    ///
-    /// Returns an error if `serde` does not match the sketch item type.
-    pub fn deserialize(bytes: &[u8], serde: ItemsSerde) -> Result<Self, Error> 
{
-        match serde {
-            ItemsSerde::Int64 => Self::deserialize_inner(bytes, 
deserialize_i64_items),
-            ItemsSerde::String => Err(Error::new(
-                ErrorKind::InvalidArgument,
-                "ItemsSerde::String cannot deserialize i64 items",
-            )),
-        }
+    /// Deserializes a sketch from bytes.
+    pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
+        Self::deserialize_inner(bytes, deserialize_i64_items)
     }
 }
 
@@ -474,17 +459,9 @@ impl FrequentItemsSketch<String> {
         self.serialize_inner(serialize_string_items)
     }
 
-    /// Deserializes a sketch from bytes using the selected serializer.
-    ///
-    /// Returns an error if `serde` does not match the sketch item type.
-    pub fn deserialize(bytes: &[u8], serde: ItemsSerde) -> Result<Self, Error> 
{
-        match serde {
-            ItemsSerde::String => Self::deserialize_inner(bytes, 
deserialize_string_items),
-            ItemsSerde::Int64 => Err(Error::new(
-                ErrorKind::InvalidArgument,
-                "ItemsSerde::Int64 cannot deserialize String items",
-            )),
-        }
+    /// Deserializes a sketch from bytes.
+    pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
+        Self::deserialize_inner(bytes, deserialize_string_items)
     }
 }
 
diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs
index 4d779a0..acc051d 100644
--- a/datasketches/src/lib.rs
+++ b/datasketches/src/lib.rs
@@ -39,3 +39,6 @@ pub mod theta;
 
 mod codec;
 mod hash;
+mod resize;
+
+pub use self::resize::ResizeFactor;
diff --git a/datasketches/src/resize.rs b/datasketches/src/resize.rs
new file mode 100644
index 0000000..caf87ab
--- /dev/null
+++ b/datasketches/src/resize.rs
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// For the Families that accept this configuration parameter, it controls the 
size multiple that
+/// affects how fast the internal cache grows, when more space is required.
+///
+/// For Theta Sketches, the Resize Factor is a dynamic, speed performance vs. 
memory size tradeoff.
+/// The sketches created on-heap and configured with a Resize Factor of > X1 
start out with an
+/// internal hash table size that is the smallest submultiple of the target 
Nominal Entries
+/// and larger than the minimum required hash table size for that sketch.
+///
+/// When the sketch needs to be resized larger, then the Resize Factor is used 
as a multiplier of
+/// the current sketch cache array size.
+///
+/// "X1" means no resizing is allowed and the sketch will be initialized at 
full size.
+///
+/// "X2" means the internal cache will start very small and double in size 
until the target size is
+/// reached.
+///
+/// Similarly, "X4" is a factor of 4 and "X8" is a factor of 8.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ResizeFactor {
+    /// Do not resize. Sketch will be configured to full size.
+    X1,
+    /// Resize by factor of 2
+    X2,
+    /// Resize by factor of 4
+    X4,
+    /// Resize by factor of 8
+    X8,
+}
+
+impl ResizeFactor {
+    /// Returns the Log-base 2 of the Resize Factor
+    pub fn lg_value(self) -> u8 {
+        match self {
+            ResizeFactor::X1 => 0,
+            ResizeFactor::X2 => 1,
+            ResizeFactor::X4 => 2,
+            ResizeFactor::X8 => 3,
+        }
+    }
+
+    /// Returns the Resize Factor.
+    pub fn value(self) -> usize {
+        // 1 << lg_value
+        match self {
+            ResizeFactor::X1 => 1,
+            ResizeFactor::X2 => 2,
+            ResizeFactor::X4 => 4,
+            ResizeFactor::X8 => 8,
+        }
+    }
+}
diff --git a/datasketches/src/theta/hash_table.rs 
b/datasketches/src/theta/hash_table.rs
index 634f232..5e5dcb6 100644
--- a/datasketches/src/theta/hash_table.rs
+++ b/datasketches/src/theta/hash_table.rs
@@ -17,6 +17,7 @@
 
 use std::hash::Hash;
 
+use crate::ResizeFactor;
 use crate::hash::MurmurHash3X64128;
 
 /// Maximum theta value (signed max for compatibility with Java)
@@ -31,28 +32,6 @@ pub const MAX_LG_K: u8 = 26;
 /// Default log2 of K
 pub const DEFAULT_LG_K: u8 = 12;
 
-/// Hash table resize factor
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
-pub enum ResizeFactor {
-    /// Resize by factor of 2
-    X2 = 2,
-    /// Resize by factor of 4
-    X4 = 4,
-    /// Resize by factor of 8
-    #[default]
-    X8 = 8,
-}
-
-impl ResizeFactor {
-    pub fn lg_value(&self) -> u8 {
-        match self {
-            ResizeFactor::X2 => 1,
-            ResizeFactor::X4 => 2,
-            ResizeFactor::X8 => 3,
-        }
-    }
-}
-
 /// Resize threshold (0.5 = 50% load factor)
 const RESIZE_THRESHOLD: f64 = 0.5;
 
@@ -329,12 +308,10 @@ impl ThetaHashTable {
 fn starting_sub_multiple(lg_target: u8, lg_min: u8, lg_resize_factor: u8) -> 
u8 {
     if lg_target <= lg_min {
         lg_min
+    } else if lg_resize_factor == 0 {
+        lg_target
     } else {
-        if lg_resize_factor == 0 {
-            lg_target
-        } else {
-            ((lg_target - lg_min) % lg_resize_factor) + lg_min
-        }
+        ((lg_target - lg_min) % lg_resize_factor) + lg_min
     }
 }
 
@@ -423,6 +400,17 @@ mod tests {
 
     #[test]
     fn test_resize() {
+        fn populate_values(table: &mut ThetaHashTable, count: usize) -> usize {
+            let mut inserted = 0;
+            for i in 0..count {
+                let hash = table.hash_and_screen(format!("value_{}", i));
+                if hash != 0 && table.try_insert(hash) {
+                    inserted += 1;
+                }
+            }
+            inserted
+        }
+
         {
             let mut table = ThetaHashTable::new(8, ResizeFactor::X2, 1.0, 
DEFAULT_UPDATE_SEED);
 
@@ -430,13 +418,7 @@ mod tests {
 
             // Insert enough values to trigger resize (50% threshold)
             // Capacity = 32 * 0.5 = 16
-            let mut inserted = 0;
-            for i in 0..20 {
-                let hash = table.hash_and_screen(format!("value_{}", i));
-                if hash != 0 && table.try_insert(hash) {
-                    inserted += 1;
-                }
-            }
+            let inserted = populate_values(&mut table, 20);
 
             // Table should have resized and all values should be inserted
             assert!(table.num_entries() > 0);
@@ -452,13 +434,7 @@ mod tests {
 
             // Insert enough values to trigger resize (50% threshold)
             // Capacity = 32 * 0.5 = 16
-            let mut inserted = 0;
-            for i in 0..20 {
-                let hash = table.hash_and_screen(format!("value_{}", i));
-                if hash != 0 && table.try_insert(hash) {
-                    inserted += 1;
-                }
-            }
+            let inserted = populate_values(&mut table, 20);
 
             // Table should have resized and all values should be inserted
             assert!(table.num_entries() > 0);
diff --git a/datasketches/src/theta/mod.rs b/datasketches/src/theta/mod.rs
index e34d30e..0d50348 100644
--- a/datasketches/src/theta/mod.rs
+++ b/datasketches/src/theta/mod.rs
@@ -33,3 +33,4 @@ mod hash_table;
 mod sketch;
 
 pub use self::sketch::ThetaSketch;
+pub use self::sketch::ThetaSketchBuilder;
diff --git a/datasketches/src/theta/sketch.rs b/datasketches/src/theta/sketch.rs
index 5f67a2e..0ad5357 100644
--- a/datasketches/src/theta/sketch.rs
+++ b/datasketches/src/theta/sketch.rs
@@ -22,12 +22,12 @@
 
 use std::hash::Hash;
 
+use crate::ResizeFactor;
 use crate::hash::DEFAULT_UPDATE_SEED;
 use crate::theta::hash_table::DEFAULT_LG_K;
 use crate::theta::hash_table::MAX_LG_K;
 use crate::theta::hash_table::MAX_THETA;
 use crate::theta::hash_table::MIN_LG_K;
-use crate::theta::hash_table::ResizeFactor;
 use crate::theta::hash_table::ThetaHashTable;
 
 /// Mutable theta sketch for building from input data
@@ -131,7 +131,7 @@ impl Default for ThetaSketchBuilder {
     fn default() -> Self {
         Self {
             lg_k: DEFAULT_LG_K,
-            resize_factor: ResizeFactor::default(),
+            resize_factor: ResizeFactor::X8,
             sampling_probability: 1.0,
             seed: DEFAULT_UPDATE_SEED,
         }
@@ -139,7 +139,7 @@ impl Default for ThetaSketchBuilder {
 }
 
 impl ThetaSketchBuilder {
-    /// Set lg_k (log2 of nominal size k)
+    /// Set lg_k (log2 of nominal size k).
     ///
     /// # Panics
     ///
@@ -156,34 +156,33 @@ impl ThetaSketchBuilder {
         self
     }
 
-    /// Set resize factor
-    pub fn resize_factor(mut self, rf: ResizeFactor) -> Self {
-        self.resize_factor = rf;
+    /// Set resize factor.
+    pub fn resize_factor(mut self, factor: ResizeFactor) -> Self {
+        self.resize_factor = factor;
         self
     }
 
-    /// Set sampling probability p
+    /// Set sampling probability p.
     ///
     /// # Panics
     ///
     /// If p is not in range [0.0, 1.0]
-    pub fn sampling_probability(mut self, p: f32) -> Self {
+    pub fn sampling_probability(mut self, probability: f32) -> Self {
         assert!(
-            (0.0..=1.0).contains(&p),
-            "p must be in [0.0, 1.0], got {}",
-            p
+            (0.0..=1.0).contains(&probability),
+            "p must be in [0.0, 1.0], got {probability}"
         );
-        self.sampling_probability = p;
+        self.sampling_probability = probability;
         self
     }
 
-    /// Set hash seed
+    /// Set hash seed.
     pub fn seed(mut self, seed: u64) -> Self {
         self.seed = seed;
         self
     }
 
-    /// Build the ThetaSketch
+    /// Build the ThetaSketch.
     pub fn build(self) -> ThetaSketch {
         let table = ThetaHashTable::new(
             self.lg_k,
@@ -199,10 +198,11 @@ impl ThetaSketchBuilder {
 /// Canonicalize double value for compatibility with Java
 fn canonical_double(value: f64) -> i64 {
     if value.is_nan() {
-        0x7ff8000000000000i64 // Java's Double.doubleToLongBits() NaN value
+        // Java's Double.doubleToLongBits() NaN value
+        0x7ff8000000000000i64
     } else {
         // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode,
-        // which Rust guarantees. Thus by adding a positive zero we
+        // which Rust guarantees. Thus, by adding a positive zero we
         // canonicalize signed zero without any branches in one instruction.
         (value + 0.0).to_bits() as i64
     }
diff --git a/datasketches/tests/frequencies_serialization_test.rs 
b/datasketches/tests/frequencies_serialization_test.rs
index d43b4d1..f8d95bc 100644
--- a/datasketches/tests/frequencies_serialization_test.rs
+++ b/datasketches/tests/frequencies_serialization_test.rs
@@ -22,17 +22,15 @@ use std::fs;
 use common::serialization_test_data;
 use datasketches::error::ErrorKind;
 use datasketches::frequencies::FrequentItemsSketch;
-use datasketches::frequencies::ItemsSerde;
 
 #[test]
 fn test_longs_round_trip() {
     let mut sketch: FrequentItemsSketch<i64> = FrequentItemsSketch::new(32);
     for i in 1..=100 {
-        sketch.update_with_count(i, i);
+        sketch.update_with_count(i, i as u64);
     }
-    let serde = ItemsSerde::Int64;
     let bytes = sketch.serialize();
-    let restored = FrequentItemsSketch::<i64>::deserialize(&bytes, 
serde).unwrap();
+    let restored = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
     assert_eq!(restored.total_weight(), sketch.total_weight());
     assert_eq!(restored.estimate(&42), sketch.estimate(&42));
     assert_eq!(restored.maximum_error(), sketch.maximum_error());
@@ -45,9 +43,8 @@ fn test_items_round_trip() {
     sketch.update_with_count("beta".to_string(), 5);
     sketch.update_with_count("gamma".to_string(), 7);
 
-    let serde = ItemsSerde::String;
     let bytes = sketch.serialize();
-    let restored = FrequentItemsSketch::<String>::deserialize(&bytes, 
serde).unwrap();
+    let restored = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
     assert_eq!(restored.total_weight(), sketch.total_weight());
     assert_eq!(restored.estimate(&"beta".to_string()), 5);
     assert_eq!(restored.maximum_error(), sketch.maximum_error());
@@ -56,19 +53,18 @@ fn test_items_round_trip() {
 #[test]
 fn test_java_frequent_longs_compatibility() {
     let test_cases = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];
-    let serde = ItemsSerde::Int64;
     for n in test_cases {
         let filename = format!("frequent_long_n{}_java.sk", n);
         let path = serialization_test_data("java_generated_files", &filename);
         let bytes = fs::read(&path).unwrap();
-        let sketch = FrequentItemsSketch::<i64>::deserialize(&bytes, 
serde).unwrap();
+        let sketch = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
         assert_eq!(sketch.is_empty(), n == 0);
         if n > 10 {
             assert!(sketch.maximum_error() > 0);
         } else {
             assert_eq!(sketch.maximum_error(), 0);
         }
-        assert_eq!(sketch.total_weight(), n as i64);
+        assert_eq!(sketch.total_weight(), n);
     }
 }
 
@@ -76,8 +72,7 @@ fn test_java_frequent_longs_compatibility() {
 fn test_java_frequent_strings_ascii() {
     let path = serialization_test_data("java_generated_files", 
"frequent_string_ascii_java.sk");
     let bytes = fs::read(&path).unwrap();
-    let serde = ItemsSerde::String;
-    let sketch = FrequentItemsSketch::<String>::deserialize(&bytes, 
serde).unwrap();
+    let sketch = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
     assert!(!sketch.is_empty());
     assert_eq!(sketch.maximum_error(), 0);
     assert_eq!(sketch.total_weight(), 10);
@@ -103,8 +98,7 @@ fn test_java_frequent_strings_ascii() {
 fn test_java_frequent_strings_utf8() {
     let path = serialization_test_data("java_generated_files", 
"frequent_string_utf8_java.sk");
     let bytes = fs::read(&path).unwrap();
-    let serde = ItemsSerde::String;
-    let sketch = FrequentItemsSketch::<String>::deserialize(&bytes, 
serde).unwrap();
+    let sketch = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
     assert!(!sketch.is_empty());
     assert_eq!(sketch.maximum_error(), 0);
     assert_eq!(sketch.total_weight(), 28);
@@ -120,12 +114,11 @@ fn test_java_frequent_strings_utf8() {
 #[test]
 fn test_cpp_frequent_longs_compatibility() {
     let test_cases = [0, 1, 10, 100, 1000, 10000, 100000, 1000000];
-    let serde = ItemsSerde::Int64;
     for n in test_cases {
         let filename = format!("frequent_long_n{}_cpp.sk", n);
         let path = serialization_test_data("cpp_generated_files", &filename);
         let bytes = fs::read(&path).unwrap();
-        let sketch = FrequentItemsSketch::<i64>::deserialize(&bytes, serde);
+        let sketch = FrequentItemsSketch::<i64>::deserialize(&bytes);
         if cfg!(windows) {
             if let Err(err) = sketch {
                 assert_eq!(err.kind(), ErrorKind::InvalidData);
@@ -143,7 +136,7 @@ fn test_cpp_frequent_longs_compatibility() {
         } else {
             assert_eq!(sketch.maximum_error(), 0);
         }
-        assert_eq!(sketch.total_weight(), n as i64);
+        assert_eq!(sketch.total_weight(), n);
     }
 }
 
@@ -154,15 +147,14 @@ fn test_cpp_frequent_strings_compatibility() {
         let filename = format!("frequent_string_n{}_cpp.sk", n);
         let path = serialization_test_data("cpp_generated_files", &filename);
         let bytes = fs::read(&path).unwrap();
-        let serde = ItemsSerde::String;
-        let sketch = FrequentItemsSketch::<String>::deserialize(&bytes, 
serde).unwrap();
+        let sketch = 
FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
         assert_eq!(sketch.is_empty(), n == 0);
         if n > 10 {
             assert!(sketch.maximum_error() > 0);
         } else {
             assert_eq!(sketch.maximum_error(), 0);
         }
-        assert_eq!(sketch.total_weight(), n as i64);
+        assert_eq!(sketch.total_weight(), n);
     }
 }
 
@@ -170,8 +162,7 @@ fn test_cpp_frequent_strings_compatibility() {
 fn test_cpp_frequent_strings_ascii() {
     let path = serialization_test_data("cpp_generated_files", 
"frequent_string_ascii_cpp.sk");
     let bytes = fs::read(&path).unwrap();
-    let serde = ItemsSerde::String;
-    let sketch = FrequentItemsSketch::<String>::deserialize(&bytes, 
serde).unwrap();
+    let sketch = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
     assert!(!sketch.is_empty());
     assert_eq!(sketch.maximum_error(), 0);
     assert_eq!(sketch.total_weight(), 10);
@@ -197,8 +188,7 @@ fn test_cpp_frequent_strings_ascii() {
 fn test_cpp_frequent_strings_utf8() {
     let path = serialization_test_data("cpp_generated_files", 
"frequent_string_utf8_cpp.sk");
     let bytes = fs::read(&path).unwrap();
-    let serde = ItemsSerde::String;
-    let sketch = FrequentItemsSketch::<String>::deserialize(&bytes, 
serde).unwrap();
+    let sketch = FrequentItemsSketch::<String>::deserialize(&bytes).unwrap();
     assert!(!sketch.is_empty());
     assert_eq!(sketch.maximum_error(), 0);
     assert_eq!(sketch.total_weight(), 28);
diff --git a/datasketches/tests/frequencies_update_test.rs 
b/datasketches/tests/frequencies_update_test.rs
index 8947793..f2b0001 100644
--- a/datasketches/tests/frequencies_update_test.rs
+++ b/datasketches/tests/frequencies_update_test.rs
@@ -479,20 +479,6 @@ fn test_longs_reset() {
     assert_eq!(sketch.lg_max_map_size(), 3);
 }
 
-#[test]
-#[should_panic(expected = "count may not be negative")]
-fn test_longs_negative_count_panics() {
-    let mut sketch: FrequentItemsSketch<i64> = FrequentItemsSketch::new(8);
-    sketch.update_with_count(1, -1);
-}
-
-#[test]
-#[should_panic(expected = "count may not be negative")]
-fn test_items_negative_count_panics() {
-    let mut sketch = FrequentItemsSketch::new(8);
-    sketch.update_with_count("a".to_string(), -1);
-}
-
 #[test]
 #[should_panic(expected = "value must be power of 2")]
 fn test_longs_invalid_map_size_panics() {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datasketches-rust) branch main updated: chore: fine tune frequencies and theta sketches (#51)

Reply via email to