This is an automated email from the ASF dual-hosted git repository. tison pushed a commit to branch compression in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
commit 05a562d54f08cecc8edfab45cdaa1f78111c8b89 Author: tison <[email protected]> AuthorDate: Wed Feb 4 17:09:40 2026 +0800 two way merge Signed-off-by: tison <[email protected]> --- datasketches/src/cpc/compression.rs | 78 +++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/datasketches/src/cpc/compression.rs b/datasketches/src/cpc/compression.rs index ac5b044..4d5cd2a 100644 --- a/datasketches/src/cpc/compression.rs +++ b/datasketches/src/cpc/compression.rs @@ -73,20 +73,46 @@ impl CompressedState { if !pairs.is_empty() { introspective_insertion_sort(&mut pairs); } - let num_pairs_from_table = pairs.len() as u32; - let num_pairs_from_window = source.num_coupons() - num_pairs_from_table; - - let mut all_pairs = tricky_get_pairs_from_window( - &source.sliding_window, - k, - num_pairs_from_window, - num_pairs_from_table, - ); - // u32_table<A>::merge( - // pairs_from_table.data(), 0, pairs_from_table.size(), - // all_pairs.data(), num_pairs_from_table, num_pairs_from_window, - // all_pairs.data(), 0 - // ); // note the overlapping subarray trick + let num_pairs_from_table = pairs.len(); + let num_pairs_from_window = (source.num_coupons() as usize) - num_pairs_from_table; + + let all_pairs_len = num_pairs_from_table + num_pairs_from_window; + let mut all_pairs = vec![0; all_pairs_len]; + // tricky read pairs from sliding_window + { + // The empty space that this leaves at the beginning of the output array will be filled + // later. + let mut idx = num_pairs_from_table; + for row_index in 0..k { + let mut byte = source.sliding_window[row_index]; + while byte != 0 { + let col_index = byte.trailing_zeros(); + byte = byte ^ (1 << col_index); // erase the 1 + all_pairs[idx] = ((row_index << 6) as u32) | col_index; + idx += 1; + } + } + assert_eq!(idx, all_pairs_len); + } + // two-way merge of pairs_from_table and pairs_from_window into all_pairs + { + let mut final_idx = 0; + let mut table_idx = 0; + let mut window_idx = num_pairs_from_table; + + while final_idx < all_pairs_len { + if table_idx < num_pairs_from_table + && (window_idx >= all_pairs_len || pairs[table_idx] <= all_pairs[window_idx]) + { + all_pairs[final_idx] = pairs[table_idx]; + table_idx += 1; + } else { + all_pairs[final_idx] = all_pairs[window_idx]; + window_idx += 1; + } + final_idx += 1; + } + } self.compress_surprising_values(&all_pairs, source.lg_k()); } @@ -201,30 +227,6 @@ pub(super) struct UncompressedState { window: Vec<u8>, } -/// The empty space that this leaves at the beginning of the output array will be filled in later -/// by the caller. -fn tricky_get_pairs_from_window( - window: &[u8], - k: usize, - num_pairs_to_get: u32, - empty_space: u32, -) -> Vec<u32> { - let output_length = empty_space + num_pairs_to_get; - let mut pairs = vec![0; output_length as usize]; - let mut pair_index = empty_space; - for row_index in 0..k { - let mut byte = window[row_index]; - while byte != 0 { - let col_index = byte.trailing_zeros(); - byte = byte ^ (1 << col_index); // erase the 1 - pairs[pair_index as usize] = ((row_index << 6) as u32) | col_index; - pair_index += 1; - } - } - assert_eq!(pair_index, output_length); - pairs -} - fn write_unary( compressed_words: &mut [u32], next_word_index: &mut usize, --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
