This is an automated email from the ASF dual-hosted git repository. tison pushed a commit to branch frequent-items in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
commit 9044acbaae2dd547935282d5e5837b1a0847b45a Author: tison <[email protected]> AuthorDate: Sun Feb 1 15:20:57 2026 +0800 Revert "refactor: support customized FrequentItemValue" This reverts commit 98082a54c644796b6752860059f0a3c12f88d069. --- datasketches/src/codec.rs | 6 +- datasketches/src/frequencies/mod.rs | 1 - datasketches/src/frequencies/serialization.rs | 92 +++++++++++++-------------- datasketches/src/frequencies/sketch.rs | 79 +++++++++++------------ datasketches/src/lib.rs | 2 +- datasketches/tests/frequencies_update_test.rs | 4 +- 6 files changed, 87 insertions(+), 97 deletions(-) diff --git a/datasketches/src/codec.rs b/datasketches/src/codec.rs index 7a13534..4df7b22 100644 --- a/datasketches/src/codec.rs +++ b/datasketches/src/codec.rs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. +#![allow(dead_code)] + use std::io; use std::io::Cursor; use std::io::Read; -pub struct SketchBytes { +pub(crate) struct SketchBytes { bytes: Vec<u8>, } @@ -111,7 +113,7 @@ impl SketchBytes { } } -pub struct SketchSlice<'a> { +pub(crate) struct SketchSlice<'a> { slice: Cursor<&'a [u8]>, } diff --git a/datasketches/src/frequencies/mod.rs b/datasketches/src/frequencies/mod.rs index 8f865d6..93fb5e4 100644 --- a/datasketches/src/frequencies/mod.rs +++ b/datasketches/src/frequencies/mod.rs @@ -52,7 +52,6 @@ mod reverse_purge_item_hash_map; mod serialization; mod sketch; -pub use self::serialization::FrequentItemValue; pub use self::sketch::ErrorType; pub use self::sketch::FrequentItemsSketch; pub use self::sketch::Row; diff --git a/datasketches/src/frequencies/serialization.rs b/datasketches/src/frequencies/serialization.rs index c02cdef..3f8600b 100644 --- a/datasketches/src/frequencies/serialization.rs +++ b/datasketches/src/frequencies/serialization.rs @@ -18,7 +18,6 @@ use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::error::Error; -use std::hash::Hash; /// Family ID for frequency sketches. pub const FAMILY_ID: u8 = 10; @@ -33,69 +32,66 @@ pub const PREAMBLE_LONGS_NONEMPTY: u8 = 4; /// Empty flag mask (both bits for compatibility). pub const EMPTY_FLAG_MASK: u8 = 5; -/// Trait for serializing and deserializing frequent item values. -pub trait FrequentItemValue: Sized + Eq + Hash + Clone { - /// Returns the size in bytes required to serialize the given item. - fn serialize_size(item: &Self) -> usize; - /// Serializes the item into the given byte buffer. - fn serialize_value(&self, bytes: &mut SketchBytes); - /// Deserializes an item from the given byte cursor. - fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error>; +pub(crate) fn count_string_items_bytes(items: &[String]) -> usize { + items.iter().map(|item| 4 + item.len()).sum() } -impl FrequentItemValue for String { - fn serialize_size(item: &Self) -> usize { - size_of::<u32>() + item.len() - } - - fn serialize_value(&self, bytes: &mut SketchBytes) { - let bs = self.as_bytes(); +pub(crate) fn serialize_string_items(bytes: &mut SketchBytes, items: &[String]) { + for item in items { + let bs = item.as_bytes(); bytes.write_u32_le(bs.len() as u32); bytes.write(bs); } +} - fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error> { +pub(crate) fn deserialize_string_items( + mut cursor: SketchSlice<'_>, + num_items: usize, +) -> Result<Vec<String>, Error> { + let mut items = Vec::with_capacity(num_items); + for i in 0..num_items { let len = cursor.read_u32_le().map_err(|_| { - Error::insufficient_data("failed to read string item length".to_string()) + Error::insufficient_data(format!( + "expected {num_items} string items, failed to read len at index {i}" + )) })?; let mut slice = vec![0; len as usize]; cursor.read_exact(&mut slice).map_err(|_| { - Error::insufficient_data("failed to read string item bytes".to_string()) + Error::insufficient_data(format!( + "expected {num_items} string items, failed to read slice at index {i}" + )) })?; - String::from_utf8(slice) - .map_err(|_| Error::deserial("invalid UTF-8 string payload".to_string())) + let value = String::from_utf8(slice) + .map_err(|_| Error::deserial(format!("invalid UTF-8 string payload at index {i}")))?; + items.push(value); } + Ok(items) } -macro_rules! impl_primitive { - ($name:ty, $read:ident, $write:ident) => { - impl FrequentItemValue for $name { - fn serialize_size(_item: &Self) -> usize { - size_of::<$name>() - } - - fn serialize_value(&self, bytes: &mut SketchBytes) { - bytes.$write(*self); - } +pub(crate) fn count_i64_items_bytes(items: &[i64]) -> usize { + items.len() * 8 +} - fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error> { - cursor.$read().map_err(|_| { - Error::insufficient_data( - concat!("failed to read ", stringify!($name), " item bytes").to_string(), - ) - }) - } - } - }; +pub(crate) fn serialize_i64_items(bytes: &mut SketchBytes, items: &[i64]) { + for item in items.iter().copied() { + bytes.write_i64_le(item); + } } -impl_primitive!(i8, read_i8, write_i8); -impl_primitive!(u8, read_u8, write_u8); -impl_primitive!(i16, read_i16_le, write_i16_le); -impl_primitive!(u16, read_u16_le, write_u16_le); -impl_primitive!(i32, read_i32_le, write_i32_le); -impl_primitive!(u32, read_u32_le, write_u32_le); -impl_primitive!(i64, read_i64_le, write_i64_le); -impl_primitive!(u64, read_u64_le, write_u64_le); +pub(crate) fn deserialize_i64_items( + mut cursor: SketchSlice<'_>, + num_items: usize, +) -> Result<Vec<i64>, Error> { + let mut items = Vec::with_capacity(num_items); + for i in 0..num_items { + let value = cursor.read_i64_le().map_err(|_| { + Error::insufficient_data(format!( + "expected {num_items} i64 items, failed at index {i}" + )) + })?; + items.push(value); + } + Ok(items) +} diff --git a/datasketches/src/frequencies/sketch.rs b/datasketches/src/frequencies/sketch.rs index 7bf5f65..e0d9711 100644 --- a/datasketches/src/frequencies/sketch.rs +++ b/datasketches/src/frequencies/sketch.rs @@ -66,12 +66,14 @@ impl<T> Row<T> { self.estimate } - /// Returns the guaranteed upper bound for the frequency. + /// Returns the upper bound for the frequency. pub fn upper_bound(&self) -> u64 { self.upper_bound } /// Returns the guaranteed lower bound for the frequency. + /// + /// This value is never negative. pub fn lower_bound(&self) -> u64 { self.lower_bound } @@ -113,11 +115,7 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> { /// assert_eq!(sketch.num_active_items(), 2); /// ``` pub fn new(max_map_size: usize) -> Self { - assert!( - max_map_size.is_power_of_two(), - "max_map_size must be power of 2" - ); - let lg_max_map_size = max_map_size.trailing_zeros() as u8; + let lg_max_map_size = exact_log2(max_map_size); Self::with_lg_map_sizes(lg_max_map_size, LG_MIN_MAP_SIZE) } @@ -532,13 +530,11 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> { } } -impl<T: FrequentItemValue> FrequentItemsSketch<T> { +impl FrequentItemsSketch<i64> { /// Serializes this sketch into a byte vector. /// /// # Examples /// - /// Use with `i64` items: - /// /// ``` /// # use datasketches::frequencies::FrequentItemsSketch; /// # let mut sketch = FrequentItemsSketch::<i64>::new(64); @@ -547,8 +543,31 @@ impl<T: FrequentItemValue> FrequentItemsSketch<T> { /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap(); /// assert!(decoded.estimate(&7) >= 2); /// ``` + pub fn serialize(&self) -> Vec<u8> { + self.serialize_inner(count_i64_items_bytes, serialize_i64_items) + } + + /// Deserializes a sketch from bytes. + /// + /// # Examples + /// + /// ``` + /// # use datasketches::frequencies::FrequentItemsSketch; + /// # let mut sketch = FrequentItemsSketch::<i64>::new(64); + /// # sketch.update_with_count(7, 2); + /// # let bytes = sketch.serialize(); + /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap(); + /// assert!(decoded.estimate(&7) >= 2); + /// ``` + pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> { + Self::deserialize_inner(bytes, deserialize_i64_items) + } +} + +impl FrequentItemsSketch<String> { + /// Serializes this sketch into a byte vector. /// - /// Use with `String` items: + /// # Examples /// /// ``` /// # use datasketches::frequencies::FrequentItemsSketch; @@ -560,33 +579,13 @@ impl<T: FrequentItemValue> FrequentItemsSketch<T> { /// assert!(decoded.estimate(&apple) >= 2); /// ``` pub fn serialize(&self) -> Vec<u8> { - self.serialize_inner( - |items| items.iter().map(|item| T::serialize_size(item)).sum(), - |bytes, items| { - for item in items { - item.serialize_value(bytes); - } - }, - ) + self.serialize_inner(count_string_items_bytes, serialize_string_items) } /// Deserializes a sketch from bytes. /// /// # Examples /// - /// Use with `i64` items: - /// - /// ``` - /// # use datasketches::frequencies::FrequentItemsSketch; - /// # let mut sketch = FrequentItemsSketch::<i64>::new(64); - /// # sketch.update_with_count(7, 2); - /// # let bytes = sketch.serialize(); - /// let decoded = FrequentItemsSketch::<i64>::deserialize(&bytes).unwrap(); - /// assert!(decoded.estimate(&7) >= 2); - /// ``` - /// - /// Use with `String` items: - /// /// ``` /// # use datasketches::frequencies::FrequentItemsSketch; /// # let mut sketch = FrequentItemsSketch::<String>::new(64); @@ -597,17 +596,11 @@ impl<T: FrequentItemValue> FrequentItemsSketch<T> { /// assert!(decoded.estimate(&apple) >= 2); /// ``` pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> { - Self::deserialize_inner(bytes, |mut cursor, num_items| { - let mut items = Vec::with_capacity(num_items); - for i in 0..num_items { - let item = T::deserialize_value(&mut cursor).map_err(|_| { - Error::insufficient_data(format!( - "expected {num_items} items, failed to read item at index {i}" - )) - })?; - items.push(item); - } - Ok(items) - }) + Self::deserialize_inner(bytes, deserialize_string_items) } } + +fn exact_log2(value: usize) -> u8 { + assert!(value.is_power_of_two(), "value must be power of 2"); + value.trailing_zeros() as u8 +} diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs index 02dc692..17701ab 100644 --- a/datasketches/src/lib.rs +++ b/datasketches/src/lib.rs @@ -31,7 +31,6 @@ compile_error!("datasketches does not support big-endian targets"); pub mod bloom; -pub mod codec; pub mod common; pub mod countmin; pub mod cpc; @@ -41,4 +40,5 @@ pub mod hll; pub mod tdigest; pub mod theta; +mod codec; mod hash; diff --git a/datasketches/tests/frequencies_update_test.rs b/datasketches/tests/frequencies_update_test.rs index a5a98e1..f2b0001 100644 --- a/datasketches/tests/frequencies_update_test.rs +++ b/datasketches/tests/frequencies_update_test.rs @@ -480,13 +480,13 @@ fn test_longs_reset() { } #[test] -#[should_panic(expected = "max_map_size must be power of 2")] +#[should_panic(expected = "value must be power of 2")] fn test_longs_invalid_map_size_panics() { FrequentItemsSketch::<i64>::new(6); } #[test] -#[should_panic(expected = "max_map_size must be power of 2")] +#[should_panic(expected = "value must be power of 2")] fn test_items_invalid_map_size_panics() { let _ = FrequentItemsSketch::<String>::new(6); } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
