This is an automated email from the ASF dual-hosted git repository.

tison pushed a commit to branch frequent-items
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git

commit 98082a54c644796b6752860059f0a3c12f88d069
Author: tison <[email protected]>
AuthorDate: Sun Feb 1 15:20:53 2026 +0800

    refactor: support customized FrequentItemValue
    
    Signed-off-by: tison <[email protected]>
---
 datasketches/src/codec.rs                     |  6 +-
 datasketches/src/frequencies/mod.rs           |  1 +
 datasketches/src/frequencies/serialization.rs | 92 ++++++++++++++-------------
 datasketches/src/frequencies/sketch.rs        | 79 ++++++++++++-----------
 datasketches/src/lib.rs                       |  2 +-
 datasketches/tests/frequencies_update_test.rs |  4 +-
 6 files changed, 97 insertions(+), 87 deletions(-)

diff --git a/datasketches/src/codec.rs b/datasketches/src/codec.rs
index 4df7b22..7a13534 100644
--- a/datasketches/src/codec.rs
+++ b/datasketches/src/codec.rs
@@ -15,13 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#![allow(dead_code)]
-
 use std::io;
 use std::io::Cursor;
 use std::io::Read;
 
-pub(crate) struct SketchBytes {
+pub struct SketchBytes {
     bytes: Vec<u8>,
 }
 
@@ -113,7 +111,7 @@ impl SketchBytes {
     }
 }
 
-pub(crate) struct SketchSlice<'a> {
+pub struct SketchSlice<'a> {
     slice: Cursor<&'a [u8]>,
 }
 
diff --git a/datasketches/src/frequencies/mod.rs 
b/datasketches/src/frequencies/mod.rs
index 93fb5e4..8f865d6 100644
--- a/datasketches/src/frequencies/mod.rs
+++ b/datasketches/src/frequencies/mod.rs
@@ -52,6 +52,7 @@ mod reverse_purge_item_hash_map;
 mod serialization;
 mod sketch;
 
+pub use self::serialization::FrequentItemValue;
 pub use self::sketch::ErrorType;
 pub use self::sketch::FrequentItemsSketch;
 pub use self::sketch::Row;
diff --git a/datasketches/src/frequencies/serialization.rs 
b/datasketches/src/frequencies/serialization.rs
index 3f8600b..c02cdef 100644
--- a/datasketches/src/frequencies/serialization.rs
+++ b/datasketches/src/frequencies/serialization.rs
@@ -18,6 +18,7 @@
 use crate::codec::SketchBytes;
 use crate::codec::SketchSlice;
 use crate::error::Error;
+use std::hash::Hash;
 
 /// Family ID for frequency sketches.
 pub const FAMILY_ID: u8 = 10;
@@ -32,66 +33,69 @@ pub const PREAMBLE_LONGS_NONEMPTY: u8 = 4;
 /// Empty flag mask (both bits for compatibility).
 pub const EMPTY_FLAG_MASK: u8 = 5;
 
-pub(crate) fn count_string_items_bytes(items: &[String]) -> usize {
-    items.iter().map(|item| 4 + item.len()).sum()
+/// Trait for serializing and deserializing frequent item values.
+pub trait FrequentItemValue: Sized + Eq + Hash + Clone {
+    /// Returns the size in bytes required to serialize the given item.
+    fn serialize_size(item: &Self) -> usize;
+    /// Serializes the item into the given byte buffer.
+    fn serialize_value(&self, bytes: &mut SketchBytes);
+    /// Deserializes an item from the given byte cursor.
+    fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error>;
 }
 
-pub(crate) fn serialize_string_items(bytes: &mut SketchBytes, items: 
&[String]) {
-    for item in items {
-        let bs = item.as_bytes();
+impl FrequentItemValue for String {
+    fn serialize_size(item: &Self) -> usize {
+        size_of::<u32>() + item.len()
+    }
+
+    fn serialize_value(&self, bytes: &mut SketchBytes) {
+        let bs = self.as_bytes();
         bytes.write_u32_le(bs.len() as u32);
         bytes.write(bs);
     }
-}
 
-pub(crate) fn deserialize_string_items(
-    mut cursor: SketchSlice<'_>,
-    num_items: usize,
-) -> Result<Vec<String>, Error> {
-    let mut items = Vec::with_capacity(num_items);
-    for i in 0..num_items {
+    fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error> {
         let len = cursor.read_u32_le().map_err(|_| {
-            Error::insufficient_data(format!(
-                "expected {num_items} string items, failed to read len at 
index {i}"
-            ))
+            Error::insufficient_data("failed to read string item 
length".to_string())
         })?;
 
         let mut slice = vec![0; len as usize];
         cursor.read_exact(&mut slice).map_err(|_| {
-            Error::insufficient_data(format!(
-                "expected {num_items} string items, failed to read slice at 
index {i}"
-            ))
+            Error::insufficient_data("failed to read string item 
bytes".to_string())
         })?;
 
-        let value = String::from_utf8(slice)
-            .map_err(|_| Error::deserial(format!("invalid UTF-8 string payload 
at index {i}")))?;
-        items.push(value);
+        String::from_utf8(slice)
+            .map_err(|_| Error::deserial("invalid UTF-8 string 
payload".to_string()))
     }
-    Ok(items)
 }
 
-pub(crate) fn count_i64_items_bytes(items: &[i64]) -> usize {
-    items.len() * 8
-}
+macro_rules! impl_primitive {
+    ($name:ty, $read:ident, $write:ident) => {
+        impl FrequentItemValue for $name {
+            fn serialize_size(_item: &Self) -> usize {
+                size_of::<$name>()
+            }
 
-pub(crate) fn serialize_i64_items(bytes: &mut SketchBytes, items: &[i64]) {
-    for item in items.iter().copied() {
-        bytes.write_i64_le(item);
-    }
-}
+            fn serialize_value(&self, bytes: &mut SketchBytes) {
+                bytes.$write(*self);
+            }
 
-pub(crate) fn deserialize_i64_items(
-    mut cursor: SketchSlice<'_>,
-    num_items: usize,
-) -> Result<Vec<i64>, Error> {
-    let mut items = Vec::with_capacity(num_items);
-    for i in 0..num_items {
-        let value = cursor.read_i64_le().map_err(|_| {
-            Error::insufficient_data(format!(
-                "expected {num_items} i64 items, failed at index {i}"
-            ))
-        })?;
-        items.push(value);
-    }
-    Ok(items)
+            fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, 
Error> {
+                cursor.$read().map_err(|_| {
+                    Error::insufficient_data(
+                        concat!("failed to read ", stringify!($name), " item 
bytes").to_string(),
+                    )
+                })
+            }
+        }
+    };
 }
+
+impl_primitive!(i8, read_i8, write_i8);
+impl_primitive!(u8, read_u8, write_u8);
+impl_primitive!(i16, read_i16_le, write_i16_le);
+impl_primitive!(u16, read_u16_le, write_u16_le);
+impl_primitive!(i32, read_i32_le, write_i32_le);
+impl_primitive!(u32, read_u32_le, write_u32_le);
+impl_primitive!(i64, read_i64_le, write_i64_le);
+impl_primitive!(u64, read_u64_le, write_u64_le);
diff --git a/datasketches/src/frequencies/sketch.rs 
b/datasketches/src/frequencies/sketch.rs
index e0d9711..7bf5f65 100644
--- a/datasketches/src/frequencies/sketch.rs
+++ b/datasketches/src/frequencies/sketch.rs
@@ -66,14 +66,12 @@ impl<T> Row<T> {
         self.estimate
     }
 
-    /// Returns the upper bound for the frequency.
+    /// Returns the guaranteed upper bound for the frequency.
     pub fn upper_bound(&self) -> u64 {
         self.upper_bound
     }
 
     /// Returns the guaranteed lower bound for the frequency.
-    ///
-    /// This value is never negative.
     pub fn lower_bound(&self) -> u64 {
         self.lower_bound
     }
@@ -115,7 +113,11 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
     /// assert_eq!(sketch.num_active_items(), 2);
     /// ```
     pub fn new(max_map_size: usize) -> Self {
-        let lg_max_map_size = exact_log2(max_map_size);
+        assert!(
+            max_map_size.is_power_of_two(),
+            "max_map_size must be power of 2"
+        );
+        let lg_max_map_size = max_map_size.trailing_zeros() as u8;
         Self::with_lg_map_sizes(lg_max_map_size, LG_MIN_MAP_SIZE)
     }
 
@@ -530,44 +532,23 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
     }
 }
 
-impl FrequentItemsSketch<i64> {
+impl<T: FrequentItemValue> FrequentItemsSketch<T> {
     /// Serializes this sketch into a byte vector.
     ///
     /// # Examples
     ///
-    /// ```
-    /// # use datasketches::frequencies::FrequentItemsSketch;
-    /// # let mut sketch = FrequentItemsSketch::<i64>::new(64);
-    /// # sketch.update_with_count(7, 2);
-    /// let bytes = sketch.serialize();
-    /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
-    /// assert!(decoded.estimate(&7) >= 2);
-    /// ```
-    pub fn serialize(&self) -> Vec<u8> {
-        self.serialize_inner(count_i64_items_bytes, serialize_i64_items)
-    }
-
-    /// Deserializes a sketch from bytes.
-    ///
-    /// # Examples
+    /// Use with `i64` items:
     ///
     /// ```
     /// # use datasketches::frequencies::FrequentItemsSketch;
     /// # let mut sketch = FrequentItemsSketch::<i64>::new(64);
     /// # sketch.update_with_count(7, 2);
-    /// # let bytes = sketch.serialize();
+    /// let bytes = sketch.serialize();
     /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
     /// assert!(decoded.estimate(&7) >= 2);
     /// ```
-    pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
-        Self::deserialize_inner(bytes, deserialize_i64_items)
-    }
-}
-
-impl FrequentItemsSketch<String> {
-    /// Serializes this sketch into a byte vector.
     ///
-    /// # Examples
+    /// Use with `String` items:
     ///
     /// ```
     /// # use datasketches::frequencies::FrequentItemsSketch;
@@ -579,13 +560,33 @@ impl FrequentItemsSketch<String> {
     /// assert!(decoded.estimate(&apple) >= 2);
     /// ```
     pub fn serialize(&self) -> Vec<u8> {
-        self.serialize_inner(count_string_items_bytes, serialize_string_items)
+        self.serialize_inner(
+            |items| items.iter().map(|item| T::serialize_size(item)).sum(),
+            |bytes, items| {
+                for item in items {
+                    item.serialize_value(bytes);
+                }
+            },
+        )
     }
 
     /// Deserializes a sketch from bytes.
     ///
     /// # Examples
     ///
+    /// Use with `i64` items:
+    ///
+    /// ```
+    /// # use datasketches::frequencies::FrequentItemsSketch;
+    /// # let mut sketch = FrequentItemsSketch::<i64>::new(64);
+    /// # sketch.update_with_count(7, 2);
+    /// # let bytes = sketch.serialize();
+    /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap();
+    /// assert!(decoded.estimate(&7) >= 2);
+    /// ```
+    ///
+    /// Use with `String` items:
+    ///
     /// ```
     /// # use datasketches::frequencies::FrequentItemsSketch;
     /// # let mut sketch = FrequentItemsSketch::<String>::new(64);
@@ -596,11 +597,17 @@ impl FrequentItemsSketch<String> {
     /// assert!(decoded.estimate(&apple) >= 2);
     /// ```
     pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
-        Self::deserialize_inner(bytes, deserialize_string_items)
+        Self::deserialize_inner(bytes, |mut cursor, num_items| {
+            let mut items = Vec::with_capacity(num_items);
+            for i in 0..num_items {
+                let item = T::deserialize_value(&mut cursor).map_err(|_| {
+                    Error::insufficient_data(format!(
+                        "expected {num_items} items, failed to read item at 
index {i}"
+                    ))
+                })?;
+                items.push(item);
+            }
+            Ok(items)
+        })
     }
 }
-
-fn exact_log2(value: usize) -> u8 {
-    assert!(value.is_power_of_two(), "value must be power of 2");
-    value.trailing_zeros() as u8
-}
diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs
index 17701ab..02dc692 100644
--- a/datasketches/src/lib.rs
+++ b/datasketches/src/lib.rs
@@ -31,6 +31,7 @@
 compile_error!("datasketches does not support big-endian targets");
 
 pub mod bloom;
+pub mod codec;
 pub mod common;
 pub mod countmin;
 pub mod cpc;
@@ -40,5 +41,4 @@ pub mod hll;
 pub mod tdigest;
 pub mod theta;
 
-mod codec;
 mod hash;
diff --git a/datasketches/tests/frequencies_update_test.rs 
b/datasketches/tests/frequencies_update_test.rs
index f2b0001..a5a98e1 100644
--- a/datasketches/tests/frequencies_update_test.rs
+++ b/datasketches/tests/frequencies_update_test.rs
@@ -480,13 +480,13 @@ fn test_longs_reset() {
 }
 
 #[test]
-#[should_panic(expected = "value must be power of 2")]
+#[should_panic(expected = "max_map_size must be power of 2")]
 fn test_longs_invalid_map_size_panics() {
     FrequentItemsSketch::<i64>::new(6);
 }
 
 #[test]
-#[should_panic(expected = "value must be power of 2")]
+#[should_panic(expected = "max_map_size must be power of 2")]
 fn test_items_invalid_map_size_panics() {
     let _ = FrequentItemsSketch::<String>::new(6);
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to