This is an automated email from the ASF dual-hosted git repository. tison pushed a commit to branch frequent-items in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
commit 98082a54c644796b6752860059f0a3c12f88d069 Author: tison <[email protected]> AuthorDate: Sun Feb 1 15:20:53 2026 +0800 refactor: support customized FrequentItemValue Signed-off-by: tison <[email protected]> --- datasketches/src/codec.rs | 6 +- datasketches/src/frequencies/mod.rs | 1 + datasketches/src/frequencies/serialization.rs | 92 ++++++++++++++------------- datasketches/src/frequencies/sketch.rs | 79 ++++++++++++----------- datasketches/src/lib.rs | 2 +- datasketches/tests/frequencies_update_test.rs | 4 +- 6 files changed, 97 insertions(+), 87 deletions(-) diff --git a/datasketches/src/codec.rs b/datasketches/src/codec.rs index 4df7b22..7a13534 100644 --- a/datasketches/src/codec.rs +++ b/datasketches/src/codec.rs @@ -15,13 +15,11 @@ // specific language governing permissions and limitations // under the License. -#![allow(dead_code)] - use std::io; use std::io::Cursor; use std::io::Read; -pub(crate) struct SketchBytes { +pub struct SketchBytes { bytes: Vec<u8>, } @@ -113,7 +111,7 @@ impl SketchBytes { } } -pub(crate) struct SketchSlice<'a> { +pub struct SketchSlice<'a> { slice: Cursor<&'a [u8]>, } diff --git a/datasketches/src/frequencies/mod.rs b/datasketches/src/frequencies/mod.rs index 93fb5e4..8f865d6 100644 --- a/datasketches/src/frequencies/mod.rs +++ b/datasketches/src/frequencies/mod.rs @@ -52,6 +52,7 @@ mod reverse_purge_item_hash_map; mod serialization; mod sketch; +pub use self::serialization::FrequentItemValue; pub use self::sketch::ErrorType; pub use self::sketch::FrequentItemsSketch; pub use self::sketch::Row; diff --git a/datasketches/src/frequencies/serialization.rs b/datasketches/src/frequencies/serialization.rs index 3f8600b..c02cdef 100644 --- a/datasketches/src/frequencies/serialization.rs +++ b/datasketches/src/frequencies/serialization.rs @@ -18,6 +18,7 @@ use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::error::Error; +use std::hash::Hash; /// Family ID for frequency sketches. pub const FAMILY_ID: u8 = 10; @@ -32,66 +33,69 @@ pub const PREAMBLE_LONGS_NONEMPTY: u8 = 4; /// Empty flag mask (both bits for compatibility). pub const EMPTY_FLAG_MASK: u8 = 5; -pub(crate) fn count_string_items_bytes(items: &[String]) -> usize { - items.iter().map(|item| 4 + item.len()).sum() +/// Trait for serializing and deserializing frequent item values. +pub trait FrequentItemValue: Sized + Eq + Hash + Clone { + /// Returns the size in bytes required to serialize the given item. + fn serialize_size(item: &Self) -> usize; + /// Serializes the item into the given byte buffer. + fn serialize_value(&self, bytes: &mut SketchBytes); + /// Deserializes an item from the given byte cursor. + fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error>; } -pub(crate) fn serialize_string_items(bytes: &mut SketchBytes, items: &[String]) { - for item in items { - let bs = item.as_bytes(); +impl FrequentItemValue for String { + fn serialize_size(item: &Self) -> usize { + size_of::<u32>() + item.len() + } + + fn serialize_value(&self, bytes: &mut SketchBytes) { + let bs = self.as_bytes(); bytes.write_u32_le(bs.len() as u32); bytes.write(bs); } -} -pub(crate) fn deserialize_string_items( - mut cursor: SketchSlice<'_>, - num_items: usize, -) -> Result<Vec<String>, Error> { - let mut items = Vec::with_capacity(num_items); - for i in 0..num_items { + fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error> { let len = cursor.read_u32_le().map_err(|_| { - Error::insufficient_data(format!( - "expected {num_items} string items, failed to read len at index {i}" - )) + Error::insufficient_data("failed to read string item length".to_string()) })?; let mut slice = vec![0; len as usize]; cursor.read_exact(&mut slice).map_err(|_| { - Error::insufficient_data(format!( - "expected {num_items} string items, failed to read slice at index {i}" - )) + Error::insufficient_data("failed to read string item bytes".to_string()) })?; - let value = String::from_utf8(slice) - .map_err(|_| Error::deserial(format!("invalid UTF-8 string payload at index {i}")))?; - items.push(value); + String::from_utf8(slice) + .map_err(|_| Error::deserial("invalid UTF-8 string payload".to_string())) } - Ok(items) } -pub(crate) fn count_i64_items_bytes(items: &[i64]) -> usize { - items.len() * 8 -} +macro_rules! impl_primitive { + ($name:ty, $read:ident, $write:ident) => { + impl FrequentItemValue for $name { + fn serialize_size(_item: &Self) -> usize { + size_of::<$name>() + } -pub(crate) fn serialize_i64_items(bytes: &mut SketchBytes, items: &[i64]) { - for item in items.iter().copied() { - bytes.write_i64_le(item); - } -} + fn serialize_value(&self, bytes: &mut SketchBytes) { + bytes.$write(*self); + } -pub(crate) fn deserialize_i64_items( - mut cursor: SketchSlice<'_>, - num_items: usize, -) -> Result<Vec<i64>, Error> { - let mut items = Vec::with_capacity(num_items); - for i in 0..num_items { - let value = cursor.read_i64_le().map_err(|_| { - Error::insufficient_data(format!( - "expected {num_items} i64 items, failed at index {i}" - )) - })?; - items.push(value); - } - Ok(items) + fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error> { + cursor.$read().map_err(|_| { + Error::insufficient_data( + concat!("failed to read ", stringify!($name), " item bytes").to_string(), + ) + }) + } + } + }; } + +impl_primitive!(i8, read_i8, write_i8); +impl_primitive!(u8, read_u8, write_u8); +impl_primitive!(i16, read_i16_le, write_i16_le); +impl_primitive!(u16, read_u16_le, write_u16_le); +impl_primitive!(i32, read_i32_le, write_i32_le); +impl_primitive!(u32, read_u32_le, write_u32_le); +impl_primitive!(i64, read_i64_le, write_i64_le); +impl_primitive!(u64, read_u64_le, write_u64_le); diff --git a/datasketches/src/frequencies/sketch.rs b/datasketches/src/frequencies/sketch.rs index e0d9711..7bf5f65 100644 --- a/datasketches/src/frequencies/sketch.rs +++ b/datasketches/src/frequencies/sketch.rs @@ -66,14 +66,12 @@ impl<T> Row<T> { self.estimate } - /// Returns the upper bound for the frequency. + /// Returns the guaranteed upper bound for the frequency. pub fn upper_bound(&self) -> u64 { self.upper_bound } /// Returns the guaranteed lower bound for the frequency. - /// - /// This value is never negative. pub fn lower_bound(&self) -> u64 { self.lower_bound } @@ -115,7 +113,11 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> { /// assert_eq!(sketch.num_active_items(), 2); /// ``` pub fn new(max_map_size: usize) -> Self { - let lg_max_map_size = exact_log2(max_map_size); + assert!( + max_map_size.is_power_of_two(), + "max_map_size must be power of 2" + ); + let lg_max_map_size = max_map_size.trailing_zeros() as u8; Self::with_lg_map_sizes(lg_max_map_size, LG_MIN_MAP_SIZE) } @@ -530,44 +532,23 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> { } } -impl FrequentItemsSketch<i64> { +impl<T: FrequentItemValue> FrequentItemsSketch<T> { /// Serializes this sketch into a byte vector. /// /// # Examples /// - /// ``` - /// # use datasketches::frequencies::FrequentItemsSketch; - /// # let mut sketch = FrequentItemsSketch::<i64>::new(64); - /// # sketch.update_with_count(7, 2); - /// let bytes = sketch.serialize(); - /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap(); - /// assert!(decoded.estimate(&7) >= 2); - /// ``` - pub fn serialize(&self) -> Vec<u8> { - self.serialize_inner(count_i64_items_bytes, serialize_i64_items) - } - - /// Deserializes a sketch from bytes. - /// - /// # Examples + /// Use with `i64` items: /// /// ``` /// # use datasketches::frequencies::FrequentItemsSketch; /// # let mut sketch = FrequentItemsSketch::<i64>::new(64); /// # sketch.update_with_count(7, 2); - /// # let bytes = sketch.serialize(); + /// let bytes = sketch.serialize(); /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap(); /// assert!(decoded.estimate(&7) >= 2); /// ``` - pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> { - Self::deserialize_inner(bytes, deserialize_i64_items) - } -} - -impl FrequentItemsSketch<String> { - /// Serializes this sketch into a byte vector. /// - /// # Examples + /// Use with `String` items: /// /// ``` /// # use datasketches::frequencies::FrequentItemsSketch; @@ -579,13 +560,33 @@ impl FrequentItemsSketch<String> { /// assert!(decoded.estimate(&apple) >= 2); /// ``` pub fn serialize(&self) -> Vec<u8> { - self.serialize_inner(count_string_items_bytes, serialize_string_items) + self.serialize_inner( + |items| items.iter().map(|item| T::serialize_size(item)).sum(), + |bytes, items| { + for item in items { + item.serialize_value(bytes); + } + }, + ) } /// Deserializes a sketch from bytes. /// /// # Examples /// + /// Use with `i64` items: + /// + /// ``` + /// # use datasketches::frequencies::FrequentItemsSketch; + /// # let mut sketch = FrequentItemsSketch::<i64>::new(64); + /// # sketch.update_with_count(7, 2); + /// # let bytes = sketch.serialize(); + /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap(); + /// assert!(decoded.estimate(&7) >= 2); + /// ``` + /// + /// Use with `String` items: + /// /// ``` /// # use datasketches::frequencies::FrequentItemsSketch; /// # let mut sketch = FrequentItemsSketch::<String>::new(64); @@ -596,11 +597,17 @@ impl FrequentItemsSketch<String> { /// assert!(decoded.estimate(&apple) >= 2); /// ``` pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> { - Self::deserialize_inner(bytes, deserialize_string_items) + Self::deserialize_inner(bytes, |mut cursor, num_items| { + let mut items = Vec::with_capacity(num_items); + for i in 0..num_items { + let item = T::deserialize_value(&mut cursor).map_err(|_| { + Error::insufficient_data(format!( + "expected {num_items} items, failed to read item at index {i}" + )) + })?; + items.push(item); + } + Ok(items) + }) } } - -fn exact_log2(value: usize) -> u8 { - assert!(value.is_power_of_two(), "value must be power of 2"); - value.trailing_zeros() as u8 -} diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs index 17701ab..02dc692 100644 --- a/datasketches/src/lib.rs +++ b/datasketches/src/lib.rs @@ -31,6 +31,7 @@ compile_error!("datasketches does not support big-endian targets"); pub mod bloom; +pub mod codec; pub mod common; pub mod countmin; pub mod cpc; @@ -40,5 +41,4 @@ pub mod hll; pub mod tdigest; pub mod theta; -mod codec; mod hash; diff --git a/datasketches/tests/frequencies_update_test.rs b/datasketches/tests/frequencies_update_test.rs index f2b0001..a5a98e1 100644 --- a/datasketches/tests/frequencies_update_test.rs +++ b/datasketches/tests/frequencies_update_test.rs @@ -480,13 +480,13 @@ fn test_longs_reset() { } #[test] -#[should_panic(expected = "value must be power of 2")] +#[should_panic(expected = "max_map_size must be power of 2")] fn test_longs_invalid_map_size_panics() { FrequentItemsSketch::<i64>::new(6); } #[test] -#[should_panic(expected = "value must be power of 2")] +#[should_panic(expected = "max_map_size must be power of 2")] fn test_items_invalid_map_size_panics() { let _ = FrequentItemsSketch::<String>::new(6); } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
