This is an automated email from the ASF dual-hosted git repository.
tison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
The following commit(s) were added to refs/heads/main by this push:
new dd12abf feat: impl CpcSketch (#75)
dd12abf is described below
commit dd12abfec488097ebcaef761042317f3e46bc0cf
Author: tison <[email protected]>
AuthorDate: Sun Feb 1 11:18:53 2026 +0800
feat: impl CpcSketch (#75)
Signed-off-by: tison <[email protected]>
---
Cargo.lock | 91 +++++-
Cargo.toml | 3 +
datasketches/Cargo.toml | 1 +
datasketches/src/common/inv_pow2_table.rs | 103 ++++++
datasketches/src/common/mod.rs | 14 +
datasketches/src/common/num_std_dev.rs | 5 +
datasketches/src/cpc/estimator.rs | 403 ++++++++++++++++++++++++
datasketches/src/cpc/kxp_byte_lookup.rs | 76 +++++
datasketches/src/cpc/mod.rs | 100 ++++++
datasketches/src/cpc/pair_table.rs | 346 +++++++++++++++++++++
datasketches/src/cpc/sketch.rs | 499 ++++++++++++++++++++++++++++++
datasketches/src/cpc/union.rs | 409 ++++++++++++++++++++++++
datasketches/src/error.rs | 2 +-
datasketches/src/hll/sketch.rs | 2 +-
datasketches/src/hll/union.rs | 2 +-
datasketches/src/lib.rs | 1 +
datasketches/src/tdigest/sketch.rs | 12 +-
datasketches/src/theta/sketch.rs | 22 +-
datasketches/tests/cpc_union_test.rs | 169 ++++++++++
datasketches/tests/cpc_update_test.rs | 62 ++++
20 files changed, 2280 insertions(+), 42 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index c946f1e..1bbd4f6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -81,9 +81,9 @@ checksum =
"9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "clap"
-version = "4.5.53"
+version = "4.5.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8"
+checksum = "3e34525d5bbbd55da2bb745d34b36121baac88d07619a9a09cfcf4a6c0832785"
dependencies = [
"clap_builder",
"clap_derive",
@@ -91,9 +91,9 @@ dependencies = [
[[package]]
name = "clap_builder"
-version = "4.5.53"
+version = "4.5.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00"
+checksum = "59a20016a20a3da95bef50ec7238dbd09baeef4311dcdd38ec15aba69812fb61"
dependencies = [
"anstream",
"anstyle",
@@ -103,9 +103,9 @@ dependencies = [
[[package]]
name = "clap_derive"
-version = "4.5.49"
+version = "4.5.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
+checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5"
dependencies = [
"heck",
"proc-macro2",
@@ -115,9 +115,9 @@ dependencies = [
[[package]]
name = "clap_lex"
-version = "0.7.6"
+version = "0.7.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
+checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32"
[[package]]
name = "colorchoice"
@@ -143,6 +143,7 @@ version = "0.2.0"
dependencies = [
"googletest",
"insta",
+ "rand",
]
[[package]]
@@ -241,9 +242,9 @@ checksum =
"a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
[[package]]
name = "libc"
-version = "0.2.178"
+version = "0.2.180"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091"
+checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
[[package]]
name = "linux-raw-sys"
@@ -278,20 +279,29 @@ version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+
[[package]]
name = "proc-macro2"
-version = "1.0.104"
+version = "1.0.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
-version = "1.0.42"
+version = "1.0.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
+checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4"
dependencies = [
"proc-macro2",
]
@@ -302,6 +312,35 @@ version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+[[package]]
+name = "rand"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
+dependencies = [
+ "rand_chacha",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
+dependencies = [
+ "getrandom",
+]
+
[[package]]
name = "regex"
version = "1.12.2"
@@ -364,9 +403,9 @@ checksum =
"7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "syn"
-version = "2.0.112"
+version = "2.0.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21f182278bf2d2bcb3c88b1b08a37df029d71ce3d3ae26168e3c653b213b99d4"
+checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
dependencies = [
"proc-macro2",
"quote",
@@ -525,3 +564,23 @@ dependencies = [
"clap",
"which",
]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdea86ddd5568519879b8187e1cf04e24fce28f7fe046ceecbce472ff19a2572"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c15e1b46eff7c6c91195752e0eeed8ef040e391cdece7c25376957d5f15df22"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
diff --git a/Cargo.toml b/Cargo.toml
index d387fbf..cc776fe 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,6 +35,7 @@ datasketches = { path = "datasketches" }
clap = { version = "4.5.20", features = ["derive"] }
insta = { version = "1.46.1" }
googletest = { version = "0.14.2" }
+rand = { version = "0.9.2" }
which = { version = "8.0.0" }
[workspace.lints.rust]
@@ -44,4 +45,6 @@ unused_must_use = "deny"
[workspace.lints.clippy]
dbg_macro = "deny"
+
too_many_arguments = "allow"
+needless_range_loop = "allow"
diff --git a/datasketches/Cargo.toml b/datasketches/Cargo.toml
index 214288e..88b224c 100644
--- a/datasketches/Cargo.toml
+++ b/datasketches/Cargo.toml
@@ -36,6 +36,7 @@ rustdoc-args = ["--cfg", "docsrs"]
[dev-dependencies]
googletest = { workspace = true }
+rand = { workspace = true }
insta = { workspace = true }
[lints]
diff --git a/datasketches/src/common/inv_pow2_table.rs
b/datasketches/src/common/inv_pow2_table.rs
new file mode 100644
index 0000000..580a2fc
--- /dev/null
+++ b/datasketches/src/common/inv_pow2_table.rs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[rustfmt::skip]
+#[allow(clippy::excessive_precision)]
+pub(crate) const INVERSE_POWERS_OF_2: [f64; 256] = [
+ 1.0, 0.5, 0.25, 0.125,
+ 0.0625, 0.03125, 0.015625, 0.0078125,
+ 0.00390625, 0.001953125, 0.0009765625, 0.00048828125,
+ 0.000244140625, 0.0001220703125, 6.103515625e-05, 3.0517578125e-05,
+ 1.52587890625e-05, 7.62939453125e-06, 3.814697265625e-06,
1.9073486328125e-06,
+ 9.5367431640625e-07, 4.76837158203125e-07, 2.384185791015625e-07,
1.1920928955078125e-07,
+ 5.9604644775390625e-08, 2.9802322387695312e-08, 1.4901161193847656e-08,
7.4505805969238281e-09,
+ 3.7252902984619141e-09, 1.862645149230957e-09, 9.3132257461547852e-10,
4.6566128730773926e-10,
+ 2.3283064365386963e-10, 1.1641532182693481e-10, 5.8207660913467407e-11,
2.9103830456733704e-11,
+ 1.4551915228366852e-11, 7.2759576141834259e-12, 3.637978807091713e-12,
1.8189894035458565e-12,
+ 9.0949470177292824e-13, 4.5474735088646412e-13, 2.2737367544323206e-13,
1.1368683772161603e-13,
+ 5.6843418860808015e-14, 2.8421709430404007e-14, 1.4210854715202004e-14,
7.1054273576010019e-15,
+ 3.5527136788005009e-15, 1.7763568394002505e-15, 8.8817841970012523e-16,
4.4408920985006262e-16,
+ 2.2204460492503131e-16, 1.1102230246251565e-16, 5.5511151231257827e-17,
2.7755575615628914e-17,
+ 1.3877787807814457e-17, 6.9388939039072284e-18, 3.4694469519536142e-18,
1.7347234759768071e-18,
+ 8.6736173798840355e-19, 4.3368086899420177e-19, 2.1684043449710089e-19,
1.0842021724855044e-19,
+ 5.4210108624275222e-20, 2.7105054312137611e-20, 1.3552527156068805e-20,
6.7762635780344027e-21,
+ 3.3881317890172014e-21, 1.6940658945086007e-21, 8.4703294725430034e-22,
4.2351647362715017e-22,
+ 2.1175823681357508e-22, 1.0587911840678754e-22, 5.2939559203393771e-23,
2.6469779601696886e-23,
+ 1.3234889800848443e-23, 6.6174449004242214e-24, 3.3087224502121107e-24,
1.6543612251060553e-24,
+ 8.2718061255302767e-25, 4.1359030627651384e-25, 2.0679515313825692e-25,
1.0339757656912846e-25,
+ 5.169878828456423e-26, 2.5849394142282115e-26, 1.2924697071141057e-26,
6.4623485355705287e-27,
+ 3.2311742677852644e-27, 1.6155871338926322e-27, 8.0779356694631609e-28,
4.0389678347315804e-28,
+ 2.0194839173657902e-28, 1.0097419586828951e-28, 5.0487097934144756e-29,
2.5243548967072378e-29,
+ 1.2621774483536189e-29, 6.3108872417680944e-30, 3.1554436208840472e-30,
1.5777218104420236e-30,
+ 7.8886090522101181e-31, 3.944304526105059e-31, 1.9721522630525295e-31,
9.8607613152626476e-32,
+ 4.9303806576313238e-32, 2.4651903288156619e-32, 1.2325951644078309e-32,
6.1629758220391547e-33,
+ 3.0814879110195774e-33, 1.5407439555097887e-33, 7.7037197775489434e-34,
3.8518598887744717e-34,
+ 1.9259299443872359e-34, 9.6296497219361793e-35, 4.8148248609680896e-35,
2.4074124304840448e-35,
+ 1.2037062152420224e-35, 6.018531076210112e-36, 3.009265538105056e-36,
1.504632769052528e-36,
+ 7.5231638452626401e-37, 3.76158192263132e-37, 1.88079096131566e-37,
9.4039548065783001e-38,
+ 4.70197740328915e-38, 2.350988701644575e-38, 1.1754943508222875e-38,
5.8774717541114375e-39,
+ 2.9387358770557188e-39, 1.4693679385278594e-39, 7.3468396926392969e-40,
3.6734198463196485e-40,
+ 1.8367099231598242e-40, 9.1835496157991212e-41, 4.5917748078995606e-41,
2.2958874039497803e-41,
+ 1.1479437019748901e-41, 5.7397185098744507e-42, 2.8698592549372254e-42,
1.4349296274686127e-42,
+ 7.1746481373430634e-43, 3.5873240686715317e-43, 1.7936620343357659e-43,
8.9683101716788293e-44,
+ 4.4841550858394146e-44, 2.2420775429197073e-44, 1.1210387714598537e-44,
5.6051938572992683e-45,
+ 2.8025969286496341e-45, 1.4012984643248171e-45, 7.0064923216240854e-46,
3.5032461608120427e-46,
+ 1.7516230804060213e-46, 8.7581154020301067e-47, 4.3790577010150533e-47,
2.1895288505075267e-47,
+ 1.0947644252537633e-47, 5.4738221262688167e-48, 2.7369110631344083e-48,
1.3684555315672042e-48,
+ 6.8422776578360209e-49, 3.4211388289180104e-49, 1.7105694144590052e-49,
8.5528470722950261e-50,
+ 4.276423536147513e-50, 2.1382117680737565e-50, 1.0691058840368783e-50,
5.3455294201843913e-51,
+ 2.6727647100921956e-51, 1.3363823550460978e-51, 6.6819117752304891e-52,
3.3409558876152446e-52,
+ 1.6704779438076223e-52, 8.3523897190381114e-53, 4.1761948595190557e-53,
2.0880974297595278e-53,
+ 1.0440487148797639e-53, 5.2202435743988196e-54, 2.6101217871994098e-54,
1.3050608935997049e-54,
+ 6.5253044679985245e-55, 3.2626522339992623e-55, 1.6313261169996311e-55,
8.1566305849981557e-56,
+ 4.0783152924990778e-56, 2.0391576462495389e-56, 1.0195788231247695e-56,
5.0978941156238473e-57,
+ 2.5489470578119236e-57, 1.2744735289059618e-57, 6.3723676445298091e-58,
3.1861838222649046e-58,
+ 1.5930919111324523e-58, 7.9654595556622614e-59, 3.9827297778311307e-59,
1.9913648889155653e-59,
+ 9.9568244445778267e-60, 4.9784122222889134e-60, 2.4892061111444567e-60,
1.2446030555722283e-60,
+ 6.2230152778611417e-61, 3.1115076389305709e-61, 1.5557538194652854e-61,
7.7787690973264271e-62,
+ 3.8893845486632136e-62, 1.9446922743316068e-62, 9.7234613716580339e-63,
4.861730685829017e-63,
+ 2.4308653429145085e-63, 1.2154326714572542e-63, 6.0771633572862712e-64,
3.0385816786431356e-64,
+ 1.5192908393215678e-64, 7.596454196607839e-65, 3.7982270983039195e-65,
1.8991135491519597e-65,
+ 9.4955677457597987e-66, 4.7477838728798994e-66, 2.3738919364399497e-66,
1.1869459682199748e-66,
+ 5.9347298410998742e-67, 2.9673649205499371e-67, 1.4836824602749686e-67,
7.4184123013748428e-68,
+ 3.7092061506874214e-68, 1.8546030753437107e-68, 9.2730153767185535e-69,
4.6365076883592767e-69,
+ 2.3182538441796384e-69, 1.1591269220898192e-69, 5.7956346104490959e-70,
2.897817305224548e-70,
+ 1.448908652612274e-70, 7.2445432630613699e-71, 3.6222716315306849e-71,
1.8111358157653425e-71,
+ 9.0556790788267124e-72, 4.5278395394133562e-72, 2.2639197697066781e-72,
1.131959884853339e-72,
+ 5.6597994242666952e-73, 2.8298997121333476e-73, 1.4149498560666738e-73,
7.074749280333369e-74,
+ 3.5373746401666845e-74, 1.7686873200833423e-74, 8.8434366004167113e-75,
4.4217183002083556e-75,
+ 2.2108591501041778e-75, 1.1054295750520889e-75, 5.5271478752604446e-76,
2.7635739376302223e-76,
+ 1.3817869688151111e-76, 6.9089348440755557e-77, 3.4544674220377779e-77,
1.7272337110188889e-77
+];
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn assert_inv_pow2_table() {
+ for i in 0u8..=255 {
+ // expected value
+ let expected = INVERSE_POWERS_OF_2[i as usize];
+
+ // computed value
+ let computed = 2f64.powf(-(i as f64));
+
+ assert_eq!(expected, computed);
+ }
+ }
+}
diff --git a/datasketches/src/common/mod.rs b/datasketches/src/common/mod.rs
index 017ddc9..a43373a 100644
--- a/datasketches/src/common/mod.rs
+++ b/datasketches/src/common/mod.rs
@@ -25,3 +25,17 @@ pub use self::resize::ResizeFactor;
// private to datasketches crate
pub(crate) mod binomial_bounds;
+pub(crate) mod inv_pow2_table;
+
+/// Canonicalize double value for compatibility with Java
+pub(crate) fn canonical_double(value: f64) -> u64 {
+ if value.is_nan() {
+ // Java's Double.doubleToLongBits() NaN value
+ 0x7ff8000000000000u64
+ } else {
+ // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode,
+ // which Rust guarantees. Thus, by adding a positive zero we
+ // canonicalize signed zero without any branches in one instruction.
+ (value + 0.0).to_bits()
+ }
+}
diff --git a/datasketches/src/common/num_std_dev.rs
b/datasketches/src/common/num_std_dev.rs
index 5cae235..92cbddd 100644
--- a/datasketches/src/common/num_std_dev.rs
+++ b/datasketches/src/common/num_std_dev.rs
@@ -50,4 +50,9 @@ impl NumStdDev {
pub const fn tail_probability(&self) -> f64 {
DELTA_OF_NUM_STD_DEVS[*self as usize]
}
+
+ /// Returns the number of standard deviations as an `u8`.
+ pub const fn as_u8(&self) -> u8 {
+ *self as u8
+ }
}
diff --git a/datasketches/src/cpc/estimator.rs
b/datasketches/src/cpc/estimator.rs
new file mode 100644
index 0000000..6d37e6e
--- /dev/null
+++ b/datasketches/src/cpc/estimator.rs
@@ -0,0 +1,403 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::f64::consts::LN_2;
+
+use crate::common::NumStdDev;
+
+const ICON_ERROR_CONSTANT: f64 = LN_2;
+
+const ICON_LOW_SIDE_DATA: [u16; 33] = [
+ //1, 2, 3, kappa
+ // lgK num trials
+ 6037, 5720, 5328, // 4 1000000
+ 6411, 6262, 5682, // 5 1000000
+ 6724, 6403, 6127, // 6 1000000
+ 6665, 6411, 6208, // 7 1000000
+ 6959, 6525, 6427, // 8 1000000
+ 6892, 6665, 6619, // 9 1000000
+ 6792, 6752, 6690, // 10 1000000
+ 6899, 6818, 6708, // 11 1000000
+ 6871, 6845, 6812, // 12 1046369
+ 6909, 6861, 6828, // 13 1043411
+ 6919, 6897, 6842, // 14 1000297
+];
+
+const ICON_HIGH_SIDE_DATA: [u16; 33] = [
+ //1, 2, 3, kappa
+ // lgK num trials
+ 8031, 8559, 9309, // 4 1000000
+ 7084, 7959, 8660, // 5 1000000
+ 7141, 7514, 7876, // 6 1000000
+ 7458, 7430, 7572, // 7 1000000
+ 6892, 7141, 7497, // 8 1000000
+ 6889, 7132, 7290, // 9 1000000
+ 7075, 7118, 7185, // 10 1000000
+ 7040, 7047, 7085, // 11 1000000
+ 6993, 7019, 7053, // 12 1046369
+ 6953, 7001, 6983, // 13 1043411
+ 6944, 6966, 7004, // 14 1000297
+];
+
+#[allow(clippy::excessive_precision)]
+const HIP_ERROR_CONSTANT: f64 = 0.588705011257737332; // (LN_2 / 2.0).sqrt()
+
+const HIP_LOW_SIDE_DATA: [u16; 33] = [
+ //1, 2, 3, kappa
+ // lgK num trials
+ 5871, 5247, 4826, // 4 1000000
+ 5877, 5403, 5070, // 5 1000000
+ 5873, 5533, 5304, // 6 1000000
+ 5878, 5632, 5464, // 7 1000000
+ 5874, 5690, 5564, // 8 1000000
+ 5880, 5745, 5619, // 9 1000000
+ 5875, 5784, 5701, // 10 1000000
+ 5866, 5789, 5742, // 11 1000000
+ 5869, 5827, 5784, // 12 1046369
+ 5876, 5860, 5827, // 13 1043411
+ 5881, 5853, 5842, // 14 1000297
+];
+
+const HIP_HIGH_SIDE_DATA: [u16; 33] = [
+ //1, 2, 3, kappa
+ // lgK num trials
+ 5855, 6688, 7391, // 4 1000000
+ 5886, 6444, 6923, // 5 1000000
+ 5885, 6254, 6594, // 6 1000000
+ 5889, 6134, 6326, // 7 1000000
+ 5900, 6072, 6203, // 8 1000000
+ 5875, 6005, 6089, // 9 1000000
+ 5871, 5980, 6040, // 10 1000000
+ 5889, 5941, 6015, // 11 1000000
+ 5871, 5926, 5973, // 12 1046369
+ 5866, 5901, 5915, // 13 1043411
+ 5880, 5914, 5953, // 14 1000297
+];
+
+pub(super) fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev)
-> f64 {
+ if num_coupons == 0 {
+ return 0.0;
+ }
+
+ let k = (1 << lg_k) as f64;
+ let kappa = kappa.as_u8();
+
+ let mut x = ICON_ERROR_CONSTANT;
+ if lg_k <= 14 {
+ let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+ x = (ICON_HIGH_SIDE_DATA[idx] as f64) / 10000.0;
+ }
+ let rel = x / k.sqrt();
+ let eps = (kappa as f64) * rel;
+ let est = icon_estimate(lg_k, num_coupons);
+ let result = est / (1.0 + eps);
+ if result < (num_coupons as f64) {
+ num_coupons as f64
+ } else {
+ result
+ }
+}
+
+pub(super) fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev)
-> f64 {
+ if num_coupons == 0 {
+ return 0.0;
+ }
+
+ let k = (1 << lg_k) as f64;
+ let kappa = kappa.as_u8();
+
+ let mut x = ICON_ERROR_CONSTANT;
+ if lg_k <= 14 {
+ let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+ x = (ICON_LOW_SIDE_DATA[idx] as f64) / 10000.0;
+ }
+ let rel = x / k.sqrt();
+ let eps = (kappa as f64) * rel;
+ let est = icon_estimate(lg_k, num_coupons);
+ let result = est / (1.0 - eps);
+ result.ceil() // slight widening of interval to be conservative
+}
+
+pub(super) fn hip_confidence_lb(
+ lg_k: u8,
+ num_coupons: u32,
+ hip_est_accum: f64,
+ kappa: NumStdDev,
+) -> f64 {
+ if num_coupons == 0 {
+ return 0.0;
+ }
+
+ let k = (1 << lg_k) as f64;
+ let kappa = kappa.as_u8();
+
+ let mut x = HIP_ERROR_CONSTANT;
+ if lg_k <= 14 {
+ let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+ x = (HIP_HIGH_SIDE_DATA[idx] as f64) / 10000.0;
+ }
+ let rel = x / k.sqrt();
+ let eps = (kappa as f64) * rel;
+ let result = hip_est_accum / (1.0 + eps);
+ if result < (num_coupons as f64) {
+ num_coupons as f64
+ } else {
+ result
+ }
+}
+
+pub(super) fn hip_confidence_ub(
+ lg_k: u8,
+ num_coupons: u32,
+ hip_est_accum: f64,
+ kappa: NumStdDev,
+) -> f64 {
+ if num_coupons == 0 {
+ return 0.0;
+ }
+
+ let k = (1 << lg_k) as f64;
+ let kappa = kappa.as_u8();
+
+ let mut x = HIP_ERROR_CONSTANT;
+ if lg_k <= 14 {
+ let idx = (3 * (lg_k - 4) + (kappa - 1)) as usize;
+ x = (HIP_LOW_SIDE_DATA[idx] as f64) / 10000.0;
+ }
+ let rel = x / k.sqrt();
+ let eps = (kappa as f64) * rel;
+ let result = hip_est_accum / (1.0 - eps);
+ result.ceil() // widening for coverage
+}
+
+// The ICON estimator for FM85 sketches is defined by the arXiv paper.
+//
+// The current file provides exact and approximate implementations of this
estimator. The exact
+// version works for any value of K, but is quite slow.
+//
+// The much faster approximate version works for K values that are powers of
two ranging from 2^4
+// to 2^32.
+//
+// At a high-level, this approximation can be described as using an
exponential approximation when
+// C > K * (5.6 or 5.7), while smaller values of C are handled by a degree-19
polynomial
+// approximation of a pre-conditioned version of the true ICON mapping from C
to N_hat.
+//
+// This file also provides a validation procedure that compares its
approximate and exact
+// implementations of the FM85 ICON estimator.
+
+const ICON_MIN_LOG_K: usize = 4;
+const ICON_MAX_LOG_K: usize = 26;
+const ICON_POLYNOMIAL_DEGREE: usize = 19;
+const ICON_POLYNOMIAL_NUM_COEFFICIENTS: usize = 1 + ICON_POLYNOMIAL_DEGREE;
+const ICON_TABLE_SIZE: usize =
+ ICON_POLYNOMIAL_NUM_COEFFICIENTS * (1 + (ICON_MAX_LOG_K - ICON_MIN_LOG_K));
+
+#[rustfmt::skip]
+#[allow(clippy::excessive_precision)]
+const ICON_POLYNOMIAL_COEFFICIENTS: [f64; ICON_TABLE_SIZE] = [
+ // log K = 4
+ 0.9895027971889700513, 0.3319496644645180128, 0.1242818722715769986,
-0.03324149686026930256, -0.2985637298081619817,
+ 1.366555923595830002, -4.705499366260569971, 11.61506432505530029,
-21.11254986175579873, 28.89421695078809904,
+ -30.1383659011730991, 24.11946778830730054, -14.83391445199539938,
6.983088767267210173, -2.48964120264876998,
+ 0.6593243603602499947, -0.125493534558034997, 0.01620971672896159843,
-0.001271267679036929953, 4.567178653294529745e-05,
+
+ // log K = 5
+ 0.9947713741300230339, 0.3326559581620939787, 0.1250050661634889981,
-0.04130073804472530336, -0.2584095537451129854,
+ 1.218050389433120051, -4.319106696095399656, 10.87175052045090062,
-20.0184979022142997, 27.63210188163320069,
+ -28.97950009664030091, 23.26740804691930009, -14.33375703270860058,
6.751281271241110105, -2.406363094133439962,
+ 0.6367414734718820357, -0.1210468076141379967, 0.01561196698118279963,
-0.001222335432128580056, 4.383502970318410206e-05,
+
+ // log K = 6
+ 0.9973904854982870161, 0.3330148852217920119, 0.125251536589509993,
-0.04434075124043219962, -0.2436238890691720116,
+ 1.163293254754570016, -4.177758779777369647, 10.60301981340099964,
-19.6274507428828997, 27.18420839597660077,
+ -28.56827214174580121, 22.96268674086600114, -14.15234202220280046,
6.665700662642549901, -2.375043356720739851,
+ 0.6280993991240929608, -0.119319019358031006, 0.01537674055733759954,
-0.001202881695730769916, 4.309894633186929849e-05,
+
+ // log K = 7
+ 0.9986963310058679655, 0.3331956705633329907, 0.125337696770523005,
-0.04546817338088020299, -0.2386752211125199863,
+ 1.145927328111949972, -4.135694445582720036, 10.52805060502839929,
-19.52408322548339825, 27.06921653903929936,
+ -28.46207532143190022, 22.88083524357429965, -14.10057147392659971,
6.63958754983273991, -2.364865219283200037,
+ 0.6251341806425250169, -0.1186991327450530043, 0.0152892726403408008,
-0.001195439764873199896, 4.281098416794090072e-05,
+
+ // log K = 8
+ 0.999348600452531044, 0.3332480372393080148, 0.126666900963325002,
-0.06495714694254159371, -0.08376282050638980681,
+ 0.3760158094643630267, -1.568204791601850001, 4.483117719555970382,
-9.119180124379150598, 13.65799293358900002,
+ -15.3100211234349004, 12.97546344654869976, -8.351661538536939489,
4.075022612435580172, -1.49387015887069996,
+ 0.4040976870253379927, -0.07813232681879349328, 0.01020545649538820085,
-0.0008063279210812720381, 2.909334976414100078e-05,
+
+ // log K = 9
+ 0.9996743787297059924, 0.3332925779481850093, 0.1267124599259649986,
-0.06550452970936600228, -0.08191738117533520214,
+ 0.3773034458363569987, -1.604679509609959975, 4.636761898691969641,
-9.487348609558699408, 14.25164235443030059,
+ -15.99674955529870068, 13.56353219046370029, -8.730194904342459594,
4.259010067932120336, -1.56106689792022002,
+ 0.4222540912786589828, -0.08165296504921559784, 0.01066878484925220041,
-0.0008433887618256910015, 3.045339724886519912e-05,
+
+ // log K = 10
+ 0.999837191783945034, 0.3333142252339619804, 0.1267759538087240012,
-0.06631005632753710077, -0.07692759158286699428,
+ 0.3568943956395980166, -1.546598721379510044, 4.51595019978557044,
-9.298431968763770428, 14.02586858080080034,
+ -15.78858959520439953, 13.41484931677589998, -8.647958125130809748,
4.22398017468472009, -1.549708891200570093,
+ 0.419507410264540026, -0.08117411611046250475, 0.01061202286184199928,
-0.000839300527596772007, 3.03185874520205985e-05,
+
+ // log K = 11
+ 0.9999186020796150265, 0.3333249054574359826, 0.126791713589799987,
-0.06662487271699729652, -0.07335552427910230211,
+ 0.3316370184815959909, -1.434143797561290068, 4.180260309967409604,
-8.593906870708760692, 12.95088874800289958,
+ -14.56876092520539956, 12.37074367531410068, -7.969152075707960137,
3.888774396648960074, -1.424923326506990051,
+ 0.385084561785229984, -0.07435541911616409816, 0.009695363567476529554,
-0.0007644375960047160388, 2.75156194717188011e-05,
+
+ // log K = 12
+ 0.9999592955649559967, 0.3333310560725140093, 0.1267379744020450116,
-0.06524495415766619344, -0.08854031542298740343,
+ 0.4244320628874230228, -1.794077789033230008, 5.133875262768450298,
-10.40149374917120007, 15.47808115629240078,
+ -17.2272296137545986, 14.5002173676463002, -9.274819801602760094,
4.500782540026570189, -1.642359389030050076,
+ 0.442596113445525019, -0.0853226219238850947, 0.01111969379054169975,
-0.0008771614088006969611, 3.161668519459719752e-05,
+
+ // log K = 13
+ 0.9999796468102559732, 0.3333336602394039727, 0.126728089053198989,
-0.06503798598282370391, -0.09050261023823169548,
+ 0.4350609244189960201, -1.831274835815670077, 5.223387516985289913,
-10.55574395269979959, 15.67359470222429962,
+ -17.41263416341029924, 14.63297400889229927, -9.346752431221359458,
4.530124905188380069, -1.651245566462089975,
+ 0.444542549250713015, -0.08561720963336499901, 0.01114805146185449992,
-0.0008786251203363140043, 3.16416341644572998e-05,
+
+ // log K = 14
+ 0.9999898187060970445, 0.3333362579300819806, 0.1266984078369459976,
-0.06464561179765909715, -0.09343280886228019777,
+ 0.4490702549264070087, -1.878087608052450008, 5.338004322057390283,
-10.76690603590630069, 15.97069195083200022,
+ -17.73440379943459888, 14.90212518309260048, -9.520506013770420495,
4.616238931978830173, -1.68364817877918993,
+ 0.4536194960681350086, -0.087448605434800597, 0.01139929991331390009,
-0.0008995891451622229631, 3.244407259782900338e-05,
+
+ // log K = 15
+ 0.9999949072549390028, 0.3333376334705290267, 0.126665364358402005,
-0.06411790034705669439, -0.09776009134670660128,
+ 0.4704691112248470253, -1.948021675295769972, 5.497760972696490001,
-11.03165645315390009, 16.29703330781000048,
+ -18.03851029448010124, 15.11836776139680083, -9.638205179917429533,
4.665122328753120051, -1.698980686525759953,
+ 0.4571799506245269873, -0.08804011353783609828, 0.01146553155965330043,
-0.0009040455800659569869, 3.257931866957050274e-05,
+
+ // log K = 16
+ 0.9999974544793589493, 0.3333381337614599871, 0.1266524862971120102,
-0.06391676499117690535, -0.09929616211306059592,
+ 0.4771390820378790254, -1.965762451227349938, 5.526802350376460282,
-11.05703067024660058, 16.29535848023060041,
+ -18.00114005075790047, 15.06214012231560062, -9.58874727382628933,
4.63537541652793017, -1.686222848555620102,
+ 0.4532602373715179933, -0.08719448925964939923, 0.01134365425717459921,
-0.0008934965241274289835, 3.216436244471380105e-05,
+
+ // log K = 17
+ 0.9999987278278800185, 0.3333383411464330148, 0.126642761751724009,
-0.06371042959073920653, -0.1013564516034080043,
+ 0.4891311195679299839, -2.010971712051409899, 5.644390807952309963,
-11.27697253921500042, 16.59957157207080058,
+ -18.31808338317799922, 15.31363518393730061, -9.741451446816620674,
4.706207545519429658, -1.711102469010010063,
+ 0.4597587341089349744, -0.08841670767182820134, 0.01149999225097850068,
-0.0009056651366963050422, 3.259910736274500059e-05,
+
+ // log K = 18
+ 0.9999993637727100371, 0.3333385511608860097, 0.1266341580529160016,
-0.06353272828164230335, -0.103139962850642003,
+ 0.4996216017206500104, -2.05099128585287982, 5.749874086531799655,
-11.47727638570349917, 16.88141587810320132,
+ -18.61744656177490143, 15.55634230427719977, -9.892350736128680211,
4.778033520984200422, -1.737045483861280104,
+ 0.4667410882683730167, -0.08977256212421590165, 0.01167940146667079994,
-0.0009201381242396030127, 3.313600701586759867e-05,
+
+ // log K = 19
+ 0.9999996805376010212, 0.3333372324328989778, 0.1267104737214659882,
-0.06504749929326139601, -0.0882341962464350954,
+ 0.4131871162041140244, -1.725190703567099915, 4.900817515593920426,
-9.883452720776510603, 14.6657081190816001,
+ -16.29398295135089825, 13.69805011761319946, -8.753475239465899449,
4.244072374564439976, -1.547202527706629915,
+ 0.4164770109614310267, -0.08017596922092029565, 0.01043146101701039954,
-0.00082124200571200305, 2.953319493719429935e-05,
+
+ // log K = 20
+ 0.9999998390037539986, 0.3333365859956040067, 0.1267460211029839967,
-0.06569456024647769843, -0.0823070353477164951,
+ 0.3810826463303410017, -1.611983580241109992, 4.624520077758210057,
-9.397308335633589138, 14.03184981378050011,
+ -15.6703191315401007, 13.22992718704790072, -8.484216393184780713,
4.125607133488029987, -1.507690650697159906,
+ 0.4066678517577320129, -0.07842110121777939868, 0.01021780862225150042,
-0.0008054065857047439754, 2.899431830426989844e-05,
+
+ // log K = 21
+ 0.9999999207001479817, 0.3333384953015239849, 0.1266331480396669928,
-0.06345750166298599892, -0.1042341210992499961,
+ 0.5077112908497130039, -2.087398133609810191, 5.858842546192500222,
-11.70620319777190055, 17.23103975433669888,
+ -19.01462552846669851, 15.89674059836560005, -10.11395134034419918,
4.88760796465891989, -1.777886770904629987,
+ 0.4780200178339499839, -0.09200895321782050218, 0.01198029553244219989,
-0.0009447283875782100165, 3.405716775824710232e-05,
+
+ // log K = 22
+ 0.9999999606908690497, 0.3333383929524300071, 0.1266456445096819927,
-0.06373504294081690225, -0.1012834291081849969,
+ 0.4893810690172959998, -2.01391428223606983, 5.656430437473649597,
-11.3067201537791, 16.64980594135310099,
+ -18.3792355790383013, 15.36879753115040081, -9.778831246425049528,
4.725308061988969577, -1.718423596500280093,
+ 0.4618308177809870019, -0.08883675060799739454, 0.01155766944804260087,
-0.0009104695617243750358, 3.278237729674439666e-05,
+
+ // log K = 23
+ 0.9999999794683379628, 0.3333386441751680085, 0.1266463995182049995,
-0.06376031920455070556, -0.1010799540803130059,
+ 0.488540137426137, -2.012048323537570127, 5.654949475342659682,
-11.31023240892979942, 16.66334675284959843,
+ -18.40241452866079896, 15.39443572867130072, -9.798844412838670692,
4.736683907539640082, -1.723168363744929987,
+ 0.463270349018644001, -0.08914619066708899531, 0.01160235936257320022,
-0.0009143600818183229709, 3.293669304679140117e-05,
+
+ // log K = 24
+ 0.9999999911469820146, 0.3333376076934529975, 0.1266944349940530012,
-0.06470524278387919381, -0.09189342220283110152,
+ 0.4359182372694809793, -1.815980282951169977, 5.149474056470340066,
-10.37086570678100017, 15.36962686758569951,
+ -17.05756384717849983, 14.32755177515199918, -9.149944050025640152,
4.434601894497260055, -1.616478926806520056,
+ 0.4351979157055039793, -0.08381768225272340223, 0.01091321820476520016,
-0.0008600264403629039739, 3.09667800347144002e-05,
+
+ // log K = 25
+ 0.9999999968592140354, 0.3333379164881000167, 0.1266782495827009913,
-0.06434163088961859789, -0.09575258124988890451,
+ 0.4597843575354370049, -1.911374431241559924, 5.411856661251520428,
-10.88850084646090011, 16.12298941380269923,
+ -17.88172178487259956, 15.01301780636859995, -9.585542896142529301,
4.645811872761620442, -1.693952293156189892,
+ 0.4563143308861309921, -0.08795976148455289523, 0.01146560428011200033,
-0.0009048442931930629528, 3.26358391497329992e-05,
+
+ // log K = 26
+ 0.9999999970700530483, 0.333338329556315982, 0.126644753076394001,
-0.06372365346512399997, -0.1012760856945769949,
+ 0.4886852278576360176, -2.009005418394389952, 5.638119224137019714,
-11.26276715335160006, 16.57640024218650154,
+ -18.29035093605569884, 15.28892246224570073, -9.724916375991760731,
4.6978877652334603, -1.707974125916829955,
+ 0.4588937864564729963, -0.08824617586088029375, 0.01147732114826570046,
-0.00090384524860747295, 3.253252703695579795e-05,
+];
+
+fn evaluate_polynomial(coefficients: &[f64], start: usize, num: usize, x: f64)
-> f64 {
+ let end = start + num - 1;
+ let mut total = coefficients[end];
+ for i in (start..end).rev() {
+ total *= x;
+ total += coefficients[i];
+ }
+ total
+}
+
+fn icon_exponential_approximation(k: f64, c: f64) -> f64 {
+ 0.7940236163830469 * k * 2f64.powf(c / k)
+}
+
+pub(super) fn icon_estimate(lg_k: u8, num_coupons: u32) -> f64 {
+ let lg_k = lg_k as usize;
+ assert!(
+ (ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k),
+ "lg_k out of range; got {lg_k}",
+ );
+
+ match num_coupons {
+ 0 => return 0.0,
+ 1 => return 1.0,
+ _ => {}
+ }
+
+ let k = (1 << lg_k) as f64;
+ let c = num_coupons as f64;
+
+ // Differing thresholds ensure that the approximated estimator is
monotonically increasing.
+ let threshold_factor = if lg_k < 14 { 5.7 } else { 5.6 };
+ if c > threshold_factor * k {
+ return icon_exponential_approximation(k, c);
+ }
+
+ let factor = evaluate_polynomial(
+ &ICON_POLYNOMIAL_COEFFICIENTS,
+ ICON_POLYNOMIAL_NUM_COEFFICIENTS * (lg_k - ICON_MIN_LOG_K),
+ ICON_POLYNOMIAL_NUM_COEFFICIENTS,
+ // The constant 2.0 is baked into the table
ICON_POLYNOMIAL_COEFFICIENTS.
+ // This factor, although somewhat arbitrary, is based on extensive
characterization studies
+ // and is considered a safe conservative factor.
+ c / (2.0 * k),
+ );
+ let ratio = c / k;
+ // The constant 66.774757 is baked into the table
ICON_POLYNOMIAL_COEFFICIENTS.
+ // This factor, although somewhat arbitrary, is based on extensive
characterization studies
+ // and is considered a safe conservative factor.
+ let term = 1.0 + (ratio * ratio * ratio / 66.774757);
+ let result = c * factor * term;
+ if result >= c { result } else { c }
+}
diff --git a/datasketches/src/cpc/kxp_byte_lookup.rs
b/datasketches/src/cpc/kxp_byte_lookup.rs
new file mode 100644
index 0000000..25fddf6
--- /dev/null
+++ b/datasketches/src/cpc/kxp_byte_lookup.rs
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub(super) const KXP_BYTE_TABLE: [f64; 256] = [
+ 0.99609375, 0.49609375, 0.74609375, 0.24609375, 0.87109375, 0.37109375,
0.62109375, 0.12109375,
+ 0.93359375, 0.43359375, 0.68359375, 0.18359375, 0.80859375, 0.30859375,
0.55859375, 0.05859375,
+ 0.96484375, 0.46484375, 0.71484375, 0.21484375, 0.83984375, 0.33984375,
0.58984375, 0.08984375,
+ 0.90234375, 0.40234375, 0.65234375, 0.15234375, 0.77734375, 0.27734375,
0.52734375, 0.02734375,
+ 0.98046875, 0.48046875, 0.73046875, 0.23046875, 0.85546875, 0.35546875,
0.60546875, 0.10546875,
+ 0.91796875, 0.41796875, 0.66796875, 0.16796875, 0.79296875, 0.29296875,
0.54296875, 0.04296875,
+ 0.94921875, 0.44921875, 0.69921875, 0.19921875, 0.82421875, 0.32421875,
0.57421875, 0.07421875,
+ 0.88671875, 0.38671875, 0.63671875, 0.13671875, 0.76171875, 0.26171875,
0.51171875, 0.01171875,
+ 0.98828125, 0.48828125, 0.73828125, 0.23828125, 0.86328125, 0.36328125,
0.61328125, 0.11328125,
+ 0.92578125, 0.42578125, 0.67578125, 0.17578125, 0.80078125, 0.30078125,
0.55078125, 0.05078125,
+ 0.95703125, 0.45703125, 0.70703125, 0.20703125, 0.83203125, 0.33203125,
0.58203125, 0.08203125,
+ 0.89453125, 0.39453125, 0.64453125, 0.14453125, 0.76953125, 0.26953125,
0.51953125, 0.01953125,
+ 0.97265625, 0.47265625, 0.72265625, 0.22265625, 0.84765625, 0.34765625,
0.59765625, 0.09765625,
+ 0.91015625, 0.41015625, 0.66015625, 0.16015625, 0.78515625, 0.28515625,
0.53515625, 0.03515625,
+ 0.94140625, 0.44140625, 0.69140625, 0.19140625, 0.81640625, 0.31640625,
0.56640625, 0.06640625,
+ 0.87890625, 0.37890625, 0.62890625, 0.12890625, 0.75390625, 0.25390625,
0.50390625, 0.00390625,
+ 0.9921875, 0.4921875, 0.7421875, 0.2421875, 0.8671875, 0.3671875,
0.6171875, 0.1171875,
+ 0.9296875, 0.4296875, 0.6796875, 0.1796875, 0.8046875, 0.3046875,
0.5546875, 0.0546875,
+ 0.9609375, 0.4609375, 0.7109375, 0.2109375, 0.8359375, 0.3359375,
0.5859375, 0.0859375,
+ 0.8984375, 0.3984375, 0.6484375, 0.1484375, 0.7734375, 0.2734375,
0.5234375, 0.0234375,
+ 0.9765625, 0.4765625, 0.7265625, 0.2265625, 0.8515625, 0.3515625,
0.6015625, 0.1015625,
+ 0.9140625, 0.4140625, 0.6640625, 0.1640625, 0.7890625, 0.2890625,
0.5390625, 0.0390625,
+ 0.9453125, 0.4453125, 0.6953125, 0.1953125, 0.8203125, 0.3203125,
0.5703125, 0.0703125,
+ 0.8828125, 0.3828125, 0.6328125, 0.1328125, 0.7578125, 0.2578125,
0.5078125, 0.0078125,
+ 0.984375, 0.484375, 0.734375, 0.234375, 0.859375, 0.359375, 0.609375,
0.109375, 0.921875,
+ 0.421875, 0.671875, 0.171875, 0.796875, 0.296875, 0.546875, 0.046875,
0.953125, 0.453125,
+ 0.703125, 0.203125, 0.828125, 0.328125, 0.578125, 0.078125, 0.890625,
0.390625, 0.640625,
+ 0.140625, 0.765625, 0.265625, 0.515625, 0.015625, 0.96875, 0.46875,
0.71875, 0.21875, 0.84375,
+ 0.34375, 0.59375, 0.09375, 0.90625, 0.40625, 0.65625, 0.15625, 0.78125,
0.28125, 0.53125,
+ 0.03125, 0.9375, 0.4375, 0.6875, 0.1875, 0.8125, 0.3125, 0.5625, 0.0625,
0.875, 0.375, 0.625,
+ 0.125, 0.75, 0.25, 0.5, 0.0,
+];
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2;
+
+ #[test]
+ fn assert_kxp_byte_table() {
+ for byte in 0u8..=255 {
+ // expected value
+ let expected = KXP_BYTE_TABLE[byte as usize];
+
+ // computed value
+ let mut computed = 0.0;
+ for col in 0..8 {
+ let bit = (byte >> col) & 1;
+ if bit == 0 {
+ // note the inverted logic
+ computed += INVERSE_POWERS_OF_2[col + 1]; //note the "+1"
+ }
+ }
+
+ assert_eq!(expected, computed);
+ }
+ }
+}
diff --git a/datasketches/src/cpc/mod.rs b/datasketches/src/cpc/mod.rs
new file mode 100644
index 0000000..4c9fea2
--- /dev/null
+++ b/datasketches/src/cpc/mod.rs
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#![allow(dead_code)]
+
+//! Compressed Probabilistic Counting sketch.
+//!
+//! This is a unique-counting sketch that implements the Compressed
Probabilistic Counting (CPC,
+//! a.k.a. FM85) algorithms developed by Kevin Lang in his paper [Back to the
Future: an Even More
+//! Nearly Optimal Cardinality Estimation
Algorithm](https://arxiv.org/abs/1708.06839).
+//!
+//! This sketch is extremely space-efficient when serialized. In an
apples-to-apples empirical
+//! comparison against compressed HyperLogLog sketches, this new algorithm
simultaneously wins on
+//! the two dimensions of the space/accuracy tradeoff and produces sketches
that are smaller than
+//! the entropy of HLL, so no possible implementation of compressed HLL can
match its space
+//! efficiency for a given accuracy. As described in the paper this sketch
implements a newly
+//! developed ICON estimator algorithm that survives union operations, another
well-known estimator,
+//! the [Historical Inverse Probability
(HIP)](https://arxiv.org/abs/1306.3284) estimator does not.
+//!
+//! The update speed performance of this sketch is quite fast and is
comparable to the speed of HLL.
+//! The union (merging) capability of this sketch also allows for merging of
sketches with different
+//! configurations of K.
+//!
+//! For additional security this sketch can be configured with a
user-specified hash seed.
+
+mod estimator;
+mod kxp_byte_lookup;
+mod pair_table;
+mod sketch;
+mod union;
+
+pub use self::sketch::CpcSketch;
+pub use self::union::CpcUnion;
+
+/// Default log2 of K.
+const DEFAULT_LG_K: u8 = 11;
+/// Min log2 of K.
+const MIN_LG_K: usize = 4;
+/// Max log2 of K.
+const MAX_LG_K: usize = 26;
+
+#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+#[expect(clippy::upper_case_acronyms)]
+enum Flavor {
+ EMPTY, // 0 == C < 1
+ SPARSE, // 1 <= C < 3K/32
+ HYBRID, // 3K/32 <= C < K/2
+ PINNED, // K/2 <= C < 27K/8 [NB: 27/8 = 3 + 3/8]
+ SLIDING, // 27K/8 <= C
+}
+
+fn count_bits_set_in_matrix(matrix: &[u64]) -> u32 {
+ let mut count = 0;
+ for word in matrix {
+ count += word.count_ones();
+ }
+ count
+}
+
+fn determine_flavor(lg_k: u8, num_coupons: u32) -> Flavor {
+ let k = 1 << lg_k;
+ let c2 = num_coupons << 1;
+ let c8 = num_coupons << 3;
+ let c32 = num_coupons << 5;
+ if num_coupons == 0 {
+ Flavor::EMPTY
+ } else if c32 < (3 * k) {
+ Flavor::SPARSE
+ } else if c2 < k {
+ Flavor::HYBRID
+ } else if c8 < (27 * k) {
+ Flavor::PINNED
+ } else {
+ Flavor::SLIDING
+ }
+}
+
+fn determine_correct_offset(lg_k: u8, num_coupons: u32) -> u8 {
+ let k = 1 << lg_k;
+ let tmp = ((num_coupons as i64) << 3) - (19 * k); // 8C - 19K
+ if tmp < 0 {
+ 0
+ } else {
+ (tmp >> (lg_k + 3)) as u8 // tmp / 8K
+ }
+}
diff --git a/datasketches/src/cpc/pair_table.rs
b/datasketches/src/cpc/pair_table.rs
new file mode 100644
index 0000000..2af1859
--- /dev/null
+++ b/datasketches/src/cpc/pair_table.rs
@@ -0,0 +1,346 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+const UPSIZE_NUMERATOR: u32 = 3;
+const UPSIZE_DENOMINATOR: u32 = 4;
+const DOWNSIZE_NUMERATOR: u32 = 1;
+const DOWNSIZE_DENOMINATOR: u32 = 4;
+
+/// A highly specialized hash table used for sparse data.
+///
+/// This table stores `(row, col)` pairs and uses linear probing for collision
resolution. It is
+/// optimized for scenarios where the cardinality of entries is low.
+#[derive(Debug, Clone)]
+pub(super) struct PairTable {
+ /// log2 of number of slots
+ lg_size: u8,
+ num_valid_bits: u8,
+ num_items: u32,
+ slots: Vec<u32>,
+}
+
+impl PairTable {
+ pub fn new(lg_size: u8, num_valid_bits: u8) -> Self {
+ assert!(
+ (2..=26).contains(&lg_size),
+ "lg_size must be in [2, 26], got {lg_size}"
+ );
+ assert!(
+ ((lg_size + 1)..=32).contains(&num_valid_bits),
+ "num_valid_bits must be in [lg_size + 1, 32], got {num_valid_bits}
where lg_size = {lg_size}"
+ );
+ Self {
+ lg_size,
+ num_valid_bits,
+ num_items: 0,
+ slots: vec![u32::MAX; 1 << lg_size],
+ }
+ }
+
+ /// A constructor specifically tailored to be a part of FM85 decompression
scheme.
+ pub fn from_slots(lg_size: u8, num_items: u32, slots: Vec<u32>) -> Self {
+ let mut lg_num_slots = 2;
+ while UPSIZE_DENOMINATOR * num_items > (UPSIZE_NUMERATOR * (1 <<
lg_num_slots)) {
+ lg_num_slots += 1;
+ }
+
+ let mut table = Self::new(lg_num_slots, 6 + lg_size);
+
+ // Note: there is a possible "snowplow effect" here because the caller
is passing in a
+ // sorted pairs array. However, we are starting out with the correct
final table size, so
+ // the problem might not occur.
+
+ for slot in slots {
+ table.must_insert(slot);
+ }
+ table.num_items = num_items;
+ table
+ }
+
+ pub fn num_items(&self) -> u32 {
+ self.num_items
+ }
+
+ pub fn slots(&self) -> &[u32] {
+ &self.slots
+ }
+
+ pub fn clear(&mut self) {
+ self.slots.fill(u32::MAX);
+ self.num_items = 0;
+ }
+
+ pub fn maybe_delete(&mut self, item: u32) -> bool {
+ let index = self.lookup(item) as usize;
+ if self.slots[index] == u32::MAX {
+ return false;
+ }
+ assert_eq!(
+ self.slots[index], item,
+ "item {item} not found at index {index}"
+ );
+ assert!(self.num_items > 0, "no items to delete");
+
+ // delete the item
+ self.slots[index] = u32::MAX;
+ self.num_items -= 1;
+
+ // re-insert all items between the freed slot and the next empty slot
+ let mask = (1 << self.lg_size) - 1;
+ let mut probe = (index + 1) & mask;
+ let mut fetched = self.slots[probe];
+ while fetched != u32::MAX {
+ self.slots[probe] = u32::MAX;
+ self.must_insert(fetched);
+ probe = (probe + 1) & mask;
+ fetched = self.slots[probe];
+ }
+
+ // shrink if necessary
+ while ((DOWNSIZE_DENOMINATOR * self.num_items) < (DOWNSIZE_NUMERATOR *
(1 << self.lg_size)))
+ && (self.lg_size > 2)
+ {
+ self.rebuild(self.lg_size - 1);
+ }
+
+ true
+ }
+
+ pub fn maybe_insert(&mut self, item: u32) -> bool {
+ let index = self.lookup(item) as usize;
+ if self.slots[index] == item {
+ return false;
+ }
+ assert_eq!(
+ self.slots[index],
+ u32::MAX,
+ "no empty slot found for item {item} at index {index}"
+ );
+ self.slots[index] = item;
+ self.num_items += 1;
+ while (UPSIZE_DENOMINATOR * self.num_items) > (UPSIZE_NUMERATOR * (1
<< self.lg_size)) {
+ self.rebuild(self.lg_size + 1);
+ }
+ true
+ }
+
+ /// While extracting the items from a linear probing hashtable, this will
usually undo the
+ /// wrap-around provided that the table isn't too full.
+ ///
+ /// Experiments suggest that for sufficiently large tables the load factor
would have to be
+ /// over 90 percent before this would fail frequently, and even then the
subsequent sort would
+ /// fix things up.
+ ///
+ /// The result is nearly sorted, so make sure to use an efficient sort for
that case.
+ pub fn unwrapping_get_items(&self) -> Vec<u32> {
+ if self.slots.is_empty() {
+ return vec![];
+ }
+
+ let table_size = 1usize << self.lg_size;
+ let mut result = vec![0; self.num_items as usize];
+ let mut i = 0usize;
+ let mut l = 0usize;
+ let mut r = self.num_items as usize - 1;
+
+ // special rules for the region before the first empty slot
+ let hi_bit = 1 << (self.num_valid_bits - 1);
+ while i < table_size && self.slots[i] != u32::MAX {
+ let item = self.slots[i];
+ i += 1;
+ if (item & hi_bit) != 0 {
+ // this item was probably wrapped, so move to end
+ result[r] = item;
+ r -= 1;
+ } else {
+ result[l] = item;
+ l += 1;
+ }
+ }
+
+ // the rest of the table is processed normally
+ while i < table_size {
+ let item = self.slots[i];
+ i += 1;
+ if item != u32::MAX {
+ result[l] = item;
+ l += 1;
+ }
+ }
+
+ assert_eq!(l, r + 1, "number of items mismatch during extraction");
+ result
+ }
+
+ fn must_insert(&mut self, item: u32) {
+ let index = self.lookup(item) as usize;
+ assert_ne!(
+ self.slots[index], item,
+ "item {item} already present in table"
+ );
+ assert_eq!(
+ self.slots[index],
+ u32::MAX,
+ "no empty slot found for item {item} at index {index}"
+ );
+ self.slots[index] = item;
+ // counts and resizing must be handled by the caller.
+ }
+
+ fn lookup(&self, item: u32) -> u32 {
+ let size = 1 << self.lg_size;
+ let mask = size - 1;
+
+ let shift = self.num_valid_bits - self.lg_size;
+
+ // extract high table size bits
+ let mut probe = item >> shift;
+ assert!(probe <= mask, "probe = {probe}, mask = {mask}");
+
+ loop {
+ let slot = self.slots[probe as usize];
+ if slot != item && slot != u32::MAX {
+ probe = (probe + 1) & mask;
+ } else {
+ break;
+ }
+ }
+
+ probe
+ }
+
+ /// Rebuilds to a larger size. `num_items` and `num_valid_bits` remain
unchanged.
+ fn rebuild(&mut self, lg_size: u8) {
+ assert!(
+ (2..=26).contains(&lg_size),
+ "lg_size must be in [2, 26], got {lg_size}"
+ );
+ assert!(
+ ((lg_size + 1)..=32).contains(&self.num_valid_bits),
+ "num_valid_bits must be in [lg_size + 1, 32], got {} where lg_size
= {lg_size}",
+ self.num_valid_bits
+ );
+
+ let new_size = 1u32 << lg_size;
+ assert!(
+ new_size > self.num_items,
+ "new table size ({new_size}) must be larger than number of items
{}",
+ self.num_items
+ );
+
+ let slots = std::mem::replace(&mut self.slots, vec![u32::MAX; new_size
as usize]);
+ self.lg_size = lg_size;
+ for slot in slots {
+ if slot != u32::MAX {
+ self.must_insert(slot);
+ }
+ }
+ }
+}
+
+/// This should be used pair with [`PairTable::unwrapping_get_items`].
+///
+/// In applications where the input array is already nearly sorted, insertion
sort runs in linear
+/// time with a very small constant.
+///
+/// This introspective version of insertion sort protects against the
quadratic cost of sorting bad
+/// input arrays. It keeps track of how much work has been done, and if that
exceeds a constant
+/// times the array length, it switches to a different sorting algorithm.
+pub fn introspective_insertion_sort(a: &mut [u32]) {
+ let cost_limit = 8 * a.len();
+
+ let mut cost = 0;
+ for i in 1..a.len() {
+ let mut j = i;
+ let v = a[i];
+ while j >= 1 && v < a[j - 1] {
+ a[j] = a[j - 1];
+ j -= 1;
+ }
+ a[j] = v;
+ cost += i - j; // distance moved is a measure of work
+ if cost > cost_limit {
+ knuth_shell_sort3(a);
+ return;
+ }
+ }
+}
+
+fn knuth_shell_sort3(a: &mut [u32]) {
+ let len = a.len();
+
+ let mut h = 0;
+ while h < len / 9 {
+ h = 3 * h + 1;
+ }
+
+ while h > 0 {
+ for i in h..len {
+ let v = a[i];
+ let mut j = i;
+ while j >= h && v < a[j - h] {
+ a[j] = a[j - h];
+ j -= h;
+ }
+ a[j] = v;
+ }
+ h /= 3;
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_sort_1() {
+ let data = (0..10)
+ .map(|_| rand::random_range(0..10000))
+ .collect::<Vec<_>>();
+ let mut sorted = data.clone();
+ introspective_insertion_sort(&mut sorted);
+ assert!(
+ sorted.is_sorted(),
+ "data was not sorted correctly: origin={data:?}, sorted={sorted:?}"
+ );
+ }
+
+ #[test]
+ fn test_sort_2() {
+ let data = (0..10)
+ .map(|_| rand::random_range(0..10000) + 3_000_000_000)
+ .collect::<Vec<_>>();
+ let mut sorted = data.clone();
+ introspective_insertion_sort(&mut sorted);
+ assert!(
+ sorted.is_sorted(),
+ "data was not sorted correctly: origin={data:?}, sorted={sorted:?}"
+ );
+ }
+
+ #[test]
+ fn test_sort_3() {
+ let len = 20;
+ let data = (0..len).map(|i| len - i + 1).collect::<Vec<_>>();
+ let mut sorted = data.clone();
+ introspective_insertion_sort(&mut sorted);
+ assert!(
+ sorted.is_sorted(),
+ "data was not sorted correctly: origin={data:?}, sorted={sorted:?}"
+ );
+ }
+}
diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs
new file mode 100644
index 0000000..6af8bfc
--- /dev/null
+++ b/datasketches/src/cpc/sketch.rs
@@ -0,0 +1,499 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::hash::Hash;
+
+use crate::common::NumStdDev;
+use crate::common::canonical_double;
+use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2;
+use crate::cpc::DEFAULT_LG_K;
+use crate::cpc::Flavor;
+use crate::cpc::MAX_LG_K;
+use crate::cpc::MIN_LG_K;
+use crate::cpc::count_bits_set_in_matrix;
+use crate::cpc::determine_correct_offset;
+use crate::cpc::determine_flavor;
+use crate::cpc::estimator::hip_confidence_lb;
+use crate::cpc::estimator::hip_confidence_ub;
+use crate::cpc::estimator::icon_confidence_lb;
+use crate::cpc::estimator::icon_confidence_ub;
+use crate::cpc::estimator::icon_estimate;
+use crate::cpc::kxp_byte_lookup::KXP_BYTE_TABLE;
+use crate::cpc::pair_table::PairTable;
+use crate::hash::DEFAULT_UPDATE_SEED;
+use crate::hash::MurmurHash3X64128;
+
+/// A Compressed Probabilistic Counting sketch.
+///
+/// See the [module level documentation](super) for more.
+#[derive(Debug, Clone)]
+pub struct CpcSketch {
+ // immutable config variables
+ lg_k: u8,
+ seed: u64,
+
+ // sketch state
+ /// Part of a speed optimization.
+ pub(super) first_interesting_column: u8,
+ /// The number of coupons collected so far.
+ pub(super) num_coupons: u32,
+ /// Sparse and surprising values.
+ pub(super) surprising_value_table: Option<PairTable>,
+ /// Derivable from num_coupons, but made explicit for speed.
+ pub(super) window_offset: u8,
+ /// Size K bytes in dense mode (flavor >= HYBRID).
+ pub(super) sliding_window: Vec<u8>,
+
+ // estimator state
+ /// Whether the sketch is a result of merging.
+ ///
+ /// If `false`, the HIP (Historical Inverse Probability) estimator is used.
+ /// If `true`, the ICON (Inter-Column Optimal) Estimator is fallback in
use.
+ pub(super) merge_flag: bool,
+ // the following variables are only valid in HIP estimator
+ /// A pre-calculated probability factor (`k * p`) used to compute the
increment delta.
+ kxp: f64,
+ /// The accumulated cardinality estimate.
+ hip_est_accum: f64,
+}
+
+impl Default for CpcSketch {
+ fn default() -> Self {
+ Self::new(DEFAULT_LG_K)
+ }
+}
+
+impl CpcSketch {
+ /// Creates a new `CpcSketch` with the given `lg_k` and default seed.
+ ///
+ /// # Panics
+ ///
+ /// Panics if `lg_k` is not in the range `[4, 16]`.
+ pub fn new(lg_k: u8) -> Self {
+ Self::with_seed(lg_k, DEFAULT_UPDATE_SEED)
+ }
+
+ /// Creates a new `CpcSketch` with the given `lg_k` and `seed`.
+ ///
+ /// # Panics
+ ///
+ /// Panics if `lg_k` is not in the range `[4, 16]`.
+ pub fn with_seed(lg_k: u8, seed: u64) -> Self {
+ assert!(
+ (MIN_LG_K..=MAX_LG_K).contains(&(lg_k as usize)),
+ "lg_k out of range; got {lg_k}",
+ );
+
+ Self {
+ lg_k,
+ seed,
+ first_interesting_column: 0,
+ num_coupons: 0,
+ surprising_value_table: None,
+ window_offset: 0,
+ sliding_window: vec![],
+ merge_flag: false,
+ kxp: (1 << lg_k) as f64,
+ hip_est_accum: 0.0,
+ }
+ }
+
+ /// Return the parameter lg_k.
+ pub fn lg_k(&self) -> u8 {
+ self.lg_k
+ }
+
+ /// Returns the best estimate of the cardinality of the sketch.
+ pub fn estimate(&self) -> f64 {
+ if !self.merge_flag {
+ self.hip_est_accum
+ } else {
+ icon_estimate(self.lg_k, self.num_coupons)
+ }
+ }
+
+ /// Returns the best estimate of the lower bound of the confidence
interval given `kappa`.
+ pub fn lower_bound(&self, kappa: NumStdDev) -> f64 {
+ if !self.merge_flag {
+ hip_confidence_lb(self.lg_k, self.num_coupons, self.hip_est_accum,
kappa)
+ } else {
+ icon_confidence_lb(self.lg_k, self.num_coupons, kappa)
+ }
+ }
+
+ /// Returns the best estimate of the upper bound of the confidence
interval given `kappa`.
+ pub fn upper_bound(&self, kappa: NumStdDev) -> f64 {
+ if !self.merge_flag {
+ hip_confidence_ub(self.lg_k, self.num_coupons, self.hip_est_accum,
kappa)
+ } else {
+ icon_confidence_ub(self.lg_k, self.num_coupons, kappa)
+ }
+ }
+
+ /// Returns true if the sketch is empty.
+ pub fn is_empty(&self) -> bool {
+ self.num_coupons == 0
+ }
+
+ /// Update the sketch with a hashable value.
+ ///
+ /// For `f32`/`f64` values, use `update_f32`/`update_f64` instead.
+ pub fn update<T: Hash>(&mut self, value: T) {
+ let mut hasher = MurmurHash3X64128::with_seed(self.seed);
+ value.hash(&mut hasher);
+ let (h1, h2) = hasher.finish128();
+
+ let k = 1 << self.lg_k;
+ let col = h2.leading_zeros(); // 0 <= col <= 64
+ let col = if col > 63 { 63 } else { col as u8 }; // clip so that 0 <=
col <= 63
+ let row = (h1 & (k - 1)) as u32;
+ let mut row_col = (row << 6) | (col as u32);
+ // To avoid the hash table's "empty" value, we change the row of the
following pair.
+ // This case is extremely unlikely, but we might as well handle it.
+ if row_col == u32::MAX {
+ row_col ^= 1 << 6;
+ }
+ self.row_col_update(row_col);
+ }
+
+ /// Update the sketch with a f64 value.
+ pub fn update_f64(&mut self, value: f64) {
+ // Canonicalize double for compatibility with Java
+ let canonical = canonical_double(value);
+ self.update(canonical);
+ }
+
+ /// Update the sketch with a f32 value.
+ pub fn update_f32(&mut self, value: f32) {
+ self.update_f64(value as f64);
+ }
+
+ pub(super) fn flavor(&self) -> Flavor {
+ determine_flavor(self.lg_k, self.num_coupons)
+ }
+
+ pub(super) fn row_col_update(&mut self, row_col: u32) {
+ let col = (row_col & 63) as u8;
+ if col < self.first_interesting_column {
+ // important speed optimization
+ return;
+ }
+
+ if self.num_coupons == 0 {
+ // promote EMPTY to SPARSE
+ self.surprising_value_table = Some(PairTable::new(2, 6 +
self.lg_k));
+ }
+
+ if self.sliding_window.is_empty() {
+ self.update_sparse(row_col);
+ } else {
+ self.update_windowed(row_col);
+ }
+ }
+
+ pub(super) fn seed(&self) -> u64 {
+ self.seed
+ }
+
+ pub(super) fn surprising_value_table(&self) -> &PairTable {
+ self.surprising_value_table
+ .as_ref()
+ .expect("surprising value table must be initialized")
+ }
+
+ pub(super) fn mut_surprising_value_table(&mut self) -> &mut PairTable {
+ self.surprising_value_table
+ .as_mut()
+ .expect("surprising value table must be initialized")
+ }
+
+ fn update_hip(&mut self, row_col: u32) {
+ let k = 1 << self.lg_k;
+ let col = (row_col & 63) as usize;
+ let one_over_p = (k as f64) / self.kxp;
+ self.hip_est_accum += one_over_p;
+ self.kxp -= INVERSE_POWERS_OF_2[col + 1] // notice the "+1"
+ }
+
+ fn update_sparse(&mut self, row_col: u32) {
+ let k = 1 << self.lg_k;
+ let c32pre = (self.num_coupons as u64) << 5;
+ debug_assert!(c32pre < 3 * k); // C < 3K/32, in other words, flavor ==
SPARSE
+ let is_novel = self.mut_surprising_value_table().maybe_insert(row_col);
+ if is_novel {
+ self.num_coupons += 1;
+ self.update_hip(row_col);
+ let c32post = (self.num_coupons as u64) << 5;
+ if c32post >= 3 * k {
+ self.promote_sparse_to_windowed();
+ }
+ }
+ }
+
+ fn promote_sparse_to_windowed(&mut self) {
+ debug_assert_eq!(self.window_offset, 0);
+
+ let k = 1 << self.lg_k;
+ let c32 = (self.num_coupons as u64) << 5;
+ debug_assert!((c32 == (3 * k)) || ((self.lg_k == 4) && (c32 > (3 *
k))));
+
+ self.sliding_window.resize(k as usize, 0);
+
+ let old_table = self
+ .surprising_value_table
+ .replace(PairTable::new(2, 6 + self.lg_k))
+ .expect("surprising value table must be initialized");
+ let old_slots = old_table.slots();
+ for &row_col in old_slots {
+ if row_col != u32::MAX {
+ let col = (row_col & 63) as u8;
+ if col < 8 {
+ let row = (row_col >> 6) as usize;
+ self.sliding_window[row] |= 1 << col;
+ } else {
+ // cannot use must_insert(), because it doesn't provide
for growth
+ let is_novel =
self.mut_surprising_value_table().maybe_insert(row_col);
+ debug_assert!(is_novel);
+ }
+ }
+ }
+ }
+
+ fn update_windowed(&mut self, row_col: u32) {
+ debug_assert!(self.window_offset <= 56);
+ let k = 1 << self.lg_k;
+ let c32pre = (self.num_coupons as u64) << 5;
+ debug_assert!(c32pre >= 3 * k); // C >= 3K/32, in other words flavor
>= HYBRID
+ let c8pre = (self.num_coupons as u64) << 3;
+ let w8pre = (self.window_offset as u64) << 3;
+ debug_assert!(c8pre < (27 + w8pre) * k); // C < (K * 27/8) + (K *
windowOffset)
+
+ let mut is_novel = false; // novel if new coupon;
+ let col = (row_col & 63) as u8;
+ if col < self.window_offset {
+ // track the surprising 0's "before" the window
+ is_novel =
self.mut_surprising_value_table().maybe_delete(row_col); // inverted logic
+ } else if col < self.window_offset + 8 {
+ // track the 8 bits inside the window
+ let row = (row_col >> 6) as usize;
+ let old_bits = self.sliding_window[row];
+ let new_bits = old_bits | (1 << (col - self.window_offset));
+ if old_bits != new_bits {
+ self.sliding_window[row] = new_bits;
+ is_novel = true;
+ }
+ } else {
+ // track the surprising 1's "after" the window
+ is_novel =
self.mut_surprising_value_table().maybe_insert(row_col); // normal logic
+ }
+
+ if is_novel {
+ self.num_coupons += 1;
+ self.update_hip(row_col);
+ let c8post = (self.num_coupons as u64) << 3;
+ if c8post >= (27 + w8pre) * k {
+ self.move_window();
+ debug_assert!((1..=56).contains(&self.window_offset));
+ let w8post = (self.window_offset as u64) << 3;
+ debug_assert!(c8post < ((27 + w8post) * k)); // C < (K * 27/8)
+ (K * windowOffset)
+ }
+ }
+ }
+
+ fn move_window(&mut self) {
+ let new_offset = self.window_offset + 1;
+ debug_assert!(new_offset <= 56);
+ debug_assert_eq!(
+ new_offset,
+ determine_correct_offset(self.lg_k, self.num_coupons)
+ );
+
+ let k = 1 << self.lg_k;
+
+ // Construct the full-sized bit matrix that corresponds to the sketch
+ let bit_matrix = self.build_bit_matrix();
+
+ // refresh the KXP register on every 8th window shift.
+ if (new_offset & 0x7) == 0 {
+ self.refresh_kxp(&bit_matrix);
+ }
+
+ self.mut_surprising_value_table().clear(); // the new number of
surprises will be about the same
+
+ let mask_for_clearing_window = (0xFF << new_offset) ^ u64::MAX;
+ let mask_for_flipping_early_zone = (1u64 << new_offset) - 1;
+
+ let mut all_surprises_ored = 0u64;
+ for i in 0..k {
+ let mut pattern = bit_matrix[i];
+ self.sliding_window[i] = ((pattern >> new_offset) & 0xff) as u8;
+ pattern &= mask_for_clearing_window;
+ // The following line converts surprising 0's to 1's in the "early
zone",
+ // (and vice versa, which is essential for this procedure's O(k)
time cost).
+ pattern ^= mask_for_flipping_early_zone;
+ all_surprises_ored |= pattern; // a cheap way to recalculate
first_interesting_column
+ while pattern != 0 {
+ let col = pattern.trailing_zeros();
+ pattern ^= 1 << col; // erase the 1
+ let row_col = ((i as u32) << 6) | col;
+ let is_novel =
self.mut_surprising_value_table().maybe_insert(row_col);
+ debug_assert!(is_novel);
+ }
+ }
+
+ self.window_offset = new_offset;
+ self.first_interesting_column = all_surprises_ored.trailing_zeros() as
u8;
+ if self.first_interesting_column > new_offset {
+ self.first_interesting_column = new_offset; // corner case
+ }
+ }
+
+ /// The KXP register is a double with roughly 50 bits of precision, but it
might need roughly
+ /// 90 bits to track the value with perfect accuracy.
+ ///
+ /// Therefore, we recalculate KXP occasionally from the sketch's full
bit_matrix so that it
+ /// will reflect changes that were previously outside the mantissa.
+ fn refresh_kxp(&mut self, bit_matrix: &[u64]) {
+ // for improved numerical accuracy, we separately sum the bytes of the
u64's
+ let mut byte_sums = [0.0; 8];
+ for &bit in bit_matrix {
+ let mut word = bit;
+ for sum in byte_sums.iter_mut() {
+ let byte = (word & 0xFF) as usize;
+ *sum += KXP_BYTE_TABLE[byte];
+ word >>= 8;
+ }
+ }
+
+ let mut total = 0.0;
+ for i in (0..8).rev() {
+ // the reverse order is important
+ let factor = INVERSE_POWERS_OF_2[i * 8]; // pow (256.0, (-1.0 *
((double) j)))
+ total += factor * byte_sums[i];
+ }
+
+ self.kxp = total;
+ }
+
+ pub(super) fn build_bit_matrix(&self) -> Vec<u64> {
+ let k = 1 << self.lg_k;
+ let offset = self.window_offset;
+ debug_assert!(offset <= 56);
+
+ // Fill the matrix with default rows in which the "early zone" is
filled with ones.
+ // This is essential for the routine's O(k) time cost (as opposed to
O(C)).
+ let default_row = (1u64 << offset) - 1;
+
+ let mut matrix = vec![default_row; k];
+ if self.num_coupons == 0 {
+ return matrix;
+ }
+
+ if !self.sliding_window.is_empty() {
+ // In other words, we are in window mode, not sparse mode
+ for i in 0..k {
+ // set the window bits, trusting the sketch's current offset
+ matrix[i] |= (self.sliding_window[i] as u64) << offset;
+ }
+ }
+
+ for &row_col in self.surprising_value_table().slots() {
+ if row_col != u32::MAX {
+ let col = (row_col & 63) as u8;
+ let row = (row_col >> 6) as usize;
+ // Flip the specified matrix bit from its default value.
+ // In the "early" zone the bit changes from 1 to 0.
+ // In the "late" zone the bit changes from 0 to 1.
+ matrix[row] ^= 1 << col;
+ }
+ }
+
+ matrix
+ }
+}
+
+impl CpcSketch {
+ /// Returns the estimated maximum compressed serialized size of a sketch.
+ ///
+ /// The actual size of a compressed CPC sketch has a small random
variance, but the following
+ /// empirically measured size should be large enough for at least 99.9
percent of sketches.
+ ///
+ /// For small values of `n` the size can be much smaller.
+ ///
+ /// # Panics
+ ///
+ /// Panics if `lg_k` is not in the range `[4, 26]`.
+ pub fn max_serialized_bytes(lg_k: u8) -> usize {
+ let lg_k = lg_k as usize;
+ assert!(
+ (MIN_LG_K..=MAX_LG_K).contains(&lg_k),
+ "lg_k out of range; got {lg_k}",
+ );
+
+ // These empirical values for the 99.9th percentile of size in bytes
were measured using
+ // 100,000 trials. The value for each trial is the maximum of 5*16=80
measurements
+ // that were equally spaced over values of the quantity C/K between
3.0 and 8.0.
+ // This table does not include the worst-case space for the preamble,
which is added
+ // by the function.
+ const EMPIRICAL_SIZE_MAX_LGK: usize = 19;
+ const EMPIRICAL_MAX_SIZE_BYTES: [usize; 16] = [
+ 24, // lg_k = 4
+ 36, // lg_k = 5
+ 56, // lg_k = 6
+ 100, // lg_k = 7
+ 180, // lg_k = 8
+ 344, // lg_k = 9
+ 660, // lg_k = 10
+ 1292, // lg_k = 11
+ 2540, // lg_k = 12
+ 5020, // lg_k = 13
+ 9968, // lg_k = 14
+ 19836, // lg_k = 15
+ 39532, // lg_k = 16
+ 78880, // lg_k = 17
+ 157516, // lg_k = 18
+ 314656, // lg_k = 19
+ ];
+ const EMPIRICAL_MAX_SIZE_FACTOR: f64 = 0.6; // 0.6 = 4.8 / 8.0
+ const MAX_PREAMBLE_SIZE_BYTES: usize = 40;
+
+ if lg_k <= EMPIRICAL_SIZE_MAX_LGK {
+ return EMPIRICAL_MAX_SIZE_BYTES[lg_k - MIN_LG_K] +
MAX_PREAMBLE_SIZE_BYTES;
+ }
+ let k = 1 << lg_k;
+ ((EMPIRICAL_MAX_SIZE_FACTOR * k as f64) as usize) +
MAX_PREAMBLE_SIZE_BYTES
+ }
+}
+
+// testing methods
+impl CpcSketch {
+ /// Validate this sketch is valid.
+ ///
+ /// This is primarily for testing and validation purposes.
+ pub fn validate(&self) -> bool {
+ let bit_matrix = self.build_bit_matrix();
+ let num_bits_set = count_bits_set_in_matrix(&bit_matrix);
+ num_bits_set == self.num_coupons
+ }
+
+ /// Returns the number of coupons in the sketch.
+ ///
+ /// This is primarily for testing and validation purposes.
+ pub fn num_coupons(&self) -> u32 {
+ self.num_coupons
+ }
+}
diff --git a/datasketches/src/cpc/union.rs b/datasketches/src/cpc/union.rs
new file mode 100644
index 0000000..1a5255f
--- /dev/null
+++ b/datasketches/src/cpc/union.rs
@@ -0,0 +1,409 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! The merging logic is somewhat involved, so it will be summarized here.
+//!
+//! First, we compare the K values of the union and the source sketch.
+//!
+//! If `source.K < union.K`, we reduce the union's K to match, which requires
downsampling the
+//! union's internal sketch.
+//!
+//! Here is how to perform the downsampling:
+//!
+//! If the union contains a bitMatrix, downsample it by row-wise ORing.
+//!
+//! If the union contains a sparse sketch, then create a new empty sketch, and
walk the old target
+//! sketch updating the new one (with modulo). At the end, check whether the
new target sketch is
+//! still in sparse mode (it might not be, because downsampling densifies the
set of collected
+//! coupons). If it is NOT in sparse mode, immediately convert it to a
bitMatrix.
+//!
+//! At this point, we have `source.K >= union.K`. (We won't keep mentioning
this, but in all the
+//! following the source's row indices are used mod union.K while updating the
union's sketch.
+//! That takes care of the situation where `source.K < union.K`.)
+//!
+//! Case A: union is Sparse and source is Sparse. We walk the source sketch
updating the union's
+//! sketch. At the end, if the union's sketch is no longer in sparse mode, we
convert it to a
+//! bitMatrix.
+//!
+//! Case B: union is bitMatrix and source is Sparse. We walk the source
sketch, setting bits in the
+//! bitMatrix.
+//!
+//! In the remaining cases, we have flavor(source) > Sparse, so we immediately
convert the union's
+//! sketch to a bitMatrix (even if the union contains very few coupons). Then:
+//!
+//! Case C: union is bitMatrix and source is Hybrid or Pinned. Then we OR the
source's sliding
+//! window into the bitMatrix, and walk the source's table, setting bits in
the bitMatrix.
+//!
+//! Case D: union is bitMatrix, and source is Sliding. Then we convert the
source into a bitMatrix,
+//! and OR it into the union's bitMatrix. (Important note; merely walking the
source wouldn't work
+//! because of the partially inverted Logic in the Sliding flavor, where the
presence of coupons
+//! is sometimes indicated by the ABSENCE of row_col pairs in the surprises
table.)
+//!
+//! How does [`CpcUnion::get_result`] work?
+//!
+//! If the union has an Accumulator state, make a copy of that sketch.
+//!
+//! If the union has a BitMatrix state, then we have to convert the bitMatrix
back into a sketch,
+//! which requires doing some extra work to figure out the values of
num_coupons, offset,
+//! first_interesting_column, and kxp.
+
+use crate::cpc::CpcSketch;
+use crate::cpc::DEFAULT_LG_K;
+use crate::cpc::Flavor;
+use crate::cpc::count_bits_set_in_matrix;
+use crate::cpc::determine_correct_offset;
+use crate::cpc::pair_table::PairTable;
+use crate::hash::DEFAULT_UPDATE_SEED;
+
+/// The union (merge) operation for the CPC sketches.
+#[derive(Debug, Clone)]
+pub struct CpcUnion {
+ // immutable config variables
+ lg_k: u8,
+ seed: u64,
+
+ // union state
+ state: UnionState,
+}
+
+impl Default for CpcUnion {
+ fn default() -> Self {
+ Self::new(DEFAULT_LG_K)
+ }
+}
+
+impl CpcUnion {
+ /// Creates a new `CpcUnion` with the given `lg_k` and default seed.
+ ///
+ /// # Panics
+ ///
+ /// Panics if `lg_k` is not in the range `[4, 16]`.
+ pub fn new(lg_k: u8) -> Self {
+ Self::with_seed(lg_k, DEFAULT_UPDATE_SEED)
+ }
+
+ /// Creates a new `CpcUnion` with the given `lg_k` and `seed`.
+ ///
+ /// # Panics
+ ///
+ /// Panics if `lg_k` is not in the range `[4, 16]`.
+ pub fn with_seed(lg_k: u8, seed: u64) -> Self {
+ // We begin with the accumulator holding an EMPTY_MERGED sketch object.
+ let sketch = CpcSketch::with_seed(lg_k, seed);
+ let state = UnionState::Accumulator(sketch);
+ Self { lg_k, seed, state }
+ }
+
+ /// Return the parameter lg_k.
+ ///
+ /// Note that due to merging with source sketches that may have a lower
value of lg_k, this
+ /// value can be less than what the union object was configured with.
+ pub fn lg_k(&self) -> u8 {
+ self.lg_k
+ }
+
+ /// Returns the result of union operations as a CPC sketch.
+ pub fn get_result(&self) -> CpcSketch {
+ match &self.state {
+ UnionState::Accumulator(sketch) => {
+ if sketch.is_empty() {
+ CpcSketch::with_seed(self.lg_k, self.seed)
+ } else {
+ let mut sketch = sketch.clone();
+ assert_eq!(sketch.flavor(), Flavor::SPARSE);
+ sketch.merge_flag = true;
+ sketch
+ }
+ }
+ UnionState::BitMatrix(matrix) => {
+ let lg_k = self.lg_k;
+
+ let mut sketch = CpcSketch::with_seed(lg_k, self.seed);
+ let num_coupons = count_bits_set_in_matrix(matrix);
+ sketch.num_coupons = num_coupons;
+ let offset = determine_correct_offset(lg_k, num_coupons);
+ sketch.window_offset = offset;
+
+ // Build the window and pair table
+ let k = 1 << lg_k;
+ let mut sliding_window = vec![0u8; k];
+
+ // LgSize = K/16; in some cases this will end up being
oversized
+ let new_table_lg_size = (lg_k - 4).max(2);
+ let mut table = PairTable::new(new_table_lg_size, 6 + lg_k);
+
+ // the following should work even when the offset is zero
+ let mask_for_clearing_window = (0xFFu64 << offset) ^ u64::MAX;
+ let mask_for_flipping_early_zone = (1u64 << offset) - 1;
+ let mut all_surprises_ored = 0;
+
+ // The snowplow effect was caused by processing the rows in
order,
+ // but we have fixed it by using a sufficiently large hash
table.
+ for i in 0..k {
+ let mut pattern = matrix[i];
+ sliding_window[i] = ((pattern >> offset) & 0xFF) as u8;
+ pattern &= mask_for_clearing_window;
+ pattern ^= mask_for_flipping_early_zone; // this flipping
converts surprising 0's to 1's
+ all_surprises_ored |= pattern;
+ while pattern != 0 {
+ let col = pattern.trailing_zeros();
+ pattern ^= 1u64 << col; // erase the 1
+ let row_col = ((i as u32) << 6) | col;
+ let is_novel = table.maybe_insert(row_col);
+ assert!(is_novel);
+ }
+ }
+
+ // at this point we could shrink an oversized hash table, but
the relative waste
+ // isn't very big
+ sketch.first_interesting_column =
all_surprises_ored.trailing_zeros() as u8;
+ if sketch.first_interesting_column > offset {
+ sketch.first_interesting_column = offset; // corner case
+ }
+
+ // HIP-related fields will contain zeros, and that is okay
since merge_flag is true,
+ // and thus the HIP estimator will not be used.
+ sketch.sliding_window = sliding_window;
+ sketch.surprising_value_table = Some(table);
+ sketch.merge_flag = true;
+
+ sketch
+ }
+ }
+ }
+
+ /// Update this union with a CpcSketch.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the seed of the provided sketch does not match the seed of
this union.
+ pub fn update(&mut self, sketch: &CpcSketch) {
+ assert_eq!(self.seed, sketch.seed());
+
+ let flavor = sketch.flavor();
+ if flavor == Flavor::EMPTY {
+ return;
+ }
+
+ if sketch.lg_k() < self.lg_k {
+ self.reduce_k(sketch.lg_k());
+ }
+
+ // if source is past SPARSE mode, make sure that union is a bitMatrix.
+ if flavor > Flavor::SPARSE {
+ if let UnionState::Accumulator(old_sketch) = &self.state {
+ // convert the accumulator to a bit matrix
+ let bit_matrix = old_sketch.build_bit_matrix();
+ self.state = UnionState::BitMatrix(bit_matrix);
+ }
+ }
+
+ match &mut self.state {
+ UnionState::Accumulator(old_sketch) => {
+ // [Case A] Sparse, bitMatrix == null, accumulator valid
+ if flavor == Flavor::SPARSE {
+ let old_flavor = old_sketch.flavor();
+ if old_flavor != Flavor::SPARSE && old_flavor !=
Flavor::EMPTY {
+ unreachable!(
+ "unexpected old flavor in union accumulator: {:?}",
+ old_flavor
+ );
+ }
+
+ // The following partially fixes the snowplow problem
provided that the K's
+ // are equal.
+ if old_flavor == Flavor::EMPTY && self.lg_k ==
sketch.lg_k() {
+ *old_sketch = sketch.clone();
+ return;
+ }
+
+ walk_table_updating_sketch(old_sketch,
sketch.surprising_value_table());
+ let final_flavor = old_sketch.flavor();
+
+ // if the accumulator has graduated beyond sparse, switch
to a bit matrix
+ // representation
+ if final_flavor > Flavor::SPARSE {
+ let bit_matrix = old_sketch.build_bit_matrix();
+ self.state = UnionState::BitMatrix(bit_matrix);
+ }
+
+ return;
+ }
+
+ // If flavor is past SPARSE mode, the state must have been
converted to bitMatrix.
+ // Otherwise, the EMPTY case was handled at the start of this
method, and the
+ // SPARSE case was handled above.
+ unreachable!("unexpected flavor in union accumulator: {:?}",
flavor);
+ }
+ UnionState::BitMatrix(old_matrix) => {
+ if flavor == Flavor::SPARSE {
+ // [Case B] Sparse, bitMatrix valid, accumulator == null
+ or_table_into_matrix(old_matrix, self.lg_k,
sketch.surprising_value_table());
+ return;
+ }
+
+ if matches!(flavor, Flavor::HYBRID | Flavor::PINNED) {
+ // [Case C] Hybrid, bitMatrix valid, accumulator == null
+ // [Case C] Pinned, bitMatrix valid, accumulator == null
+ or_window_into_matrix(
+ old_matrix,
+ self.lg_k,
+ &sketch.sliding_window,
+ sketch.window_offset,
+ sketch.lg_k(),
+ );
+ or_table_into_matrix(old_matrix, self.lg_k,
sketch.surprising_value_table());
+ return;
+ }
+
+ // [Case D] Sliding, bitMatrix valid, accumulator == null
+ // SLIDING mode involves inverted logic, so we cannot just
walk the source sketch.
+ // Instead, we convert it to a bitMatrix that can be ORed into
the destination.
+ assert_eq!(flavor, Flavor::SLIDING);
+ let src_matrix = sketch.build_bit_matrix();
+ or_matrix_into_matrix(old_matrix, self.lg_k, &src_matrix,
sketch.lg_k());
+ }
+ }
+ }
+
+ fn reduce_k(&mut self, new_lg_k: u8) {
+ match &mut self.state {
+ UnionState::Accumulator(sketch) => {
+ if sketch.is_empty() {
+ self.lg_k = new_lg_k;
+ self.state =
UnionState::Accumulator(CpcSketch::with_seed(new_lg_k, self.seed));
+ return;
+ }
+
+ let mut new_sketch = CpcSketch::with_seed(new_lg_k, self.seed);
+ walk_table_updating_sketch(&mut new_sketch,
sketch.surprising_value_table());
+
+ let final_new_flavor = new_sketch.flavor();
+ // SV table had to have something in it
+ assert_ne!(final_new_flavor, Flavor::EMPTY);
+ if final_new_flavor == Flavor::SPARSE {
+ self.lg_k = new_lg_k;
+ self.state = UnionState::Accumulator(new_sketch);
+ return;
+ }
+
+ // the new sketch has graduated beyond sparse, so convert to
bitMatrix
+ self.lg_k = new_lg_k;
+ self.state =
UnionState::BitMatrix(new_sketch.build_bit_matrix());
+ }
+ UnionState::BitMatrix(matrix) => {
+ let new_k = 1 << new_lg_k;
+ let mut new_matrix = vec![0; new_k];
+ or_matrix_into_matrix(&mut new_matrix, new_lg_k, matrix,
self.lg_k);
+ self.lg_k = new_lg_k;
+ self.state = UnionState::BitMatrix(new_matrix);
+ }
+ }
+ }
+}
+
+// testing methods
+impl CpcUnion {
+ /// Returns the number of coupons in the union.
+ ///
+ /// This is primarily for testing and validation purposes.
+ pub fn num_coupons(&self) -> u32 {
+ match &self.state {
+ UnionState::Accumulator(sketch) => sketch.num_coupons,
+ UnionState::BitMatrix(matrix) => count_bits_set_in_matrix(matrix),
+ }
+ }
+}
+
+fn or_window_into_matrix(
+ dst_matrix: &mut [u64],
+ dst_lg_k: u8,
+ src_window: &[u8],
+ src_offset: u8,
+ src_lg_k: u8,
+) {
+ assert!(dst_lg_k <= src_lg_k);
+ let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k
+ let src_k = 1 << src_lg_k;
+ for src_row in 0..src_k {
+ dst_matrix[src_row & dst_mask] |= (src_window[src_row] as u64) <<
src_offset;
+ }
+}
+
+fn or_table_into_matrix(dst_matrix: &mut [u64], dst_lg_k: u8, src_table:
&PairTable) {
+ let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k
+ let slots = src_table.slots();
+ for &row_col in slots.iter() {
+ if row_col != u32::MAX {
+ let src_row = (row_col >> 6) as usize;
+ let src_col = (row_col & 63) as usize;
+ let dst_row = src_row & dst_mask;
+ dst_matrix[dst_row] |= 1u64 << src_col;
+ }
+ }
+}
+
+fn or_matrix_into_matrix(dst_matrix: &mut [u64], dst_lg_k: u8, src_matrix:
&[u64], src_lg_k: u8) {
+ assert!(dst_lg_k <= src_lg_k);
+ let dst_mask = (1 << dst_lg_k) - 1; // downsamples when dst_lg_k < src_lg_k
+ let src_k = 1 << src_lg_k;
+ for src_row in 0..src_k {
+ let dst_row = src_row & dst_mask;
+ dst_matrix[dst_row] |= src_matrix[src_row];
+ }
+}
+
+fn walk_table_updating_sketch(sketch: &mut CpcSketch, table: &PairTable) {
+ assert!(sketch.lg_k() <= 26);
+
+ let slots = table.slots();
+ let num_slots = slots.len() as u32;
+
+ // downsamples when dst lgK < src LgK
+ let dst_mask = (((1u64 << sketch.lg_k()) - 1) << 6) | 63;
+ // Using a golden ratio stride fixes the snowplow effect.
+ let mut stride = (0.6180339887498949 * (num_slots as f64)) as u32;
+ assert!(stride >= 2);
+ if stride == ((stride >> 1) << 1) {
+ // force the stride to be odd
+ stride += 1;
+ }
+ assert!((stride >= 3) && (stride < num_slots));
+
+ let mut k = 0;
+ for _ in 0..num_slots {
+ k &= num_slots - 1;
+ let row_col = slots[k as usize];
+ if row_col != u32::MAX {
+ sketch.row_col_update(row_col & (dst_mask as u32));
+ }
+ k += stride;
+ }
+}
+
+/// The internal State of the Union operation.
+///
+/// At most one of BitMatrix and accumulator will be non-null at any given
moment. Accumulator is a
+/// sketch object that is employed until it graduates out of Sparse mode.
+///
+/// At that point, it is converted into a full-sized bitMatrix, which is
mathematically a sketch,
+/// but doesn't maintain any of the "extra" fields of our sketch objects.
+#[derive(Debug, Clone)]
+enum UnionState {
+ Accumulator(CpcSketch),
+ BitMatrix(Vec<u64>),
+}
diff --git a/datasketches/src/error.rs b/datasketches/src/error.rs
index dcb0592..7e8b663 100644
--- a/datasketches/src/error.rs
+++ b/datasketches/src/error.rs
@@ -89,7 +89,7 @@ impl Error {
}
}
-// Convenience constructors for deserialization errors
+// Convenient constructors used within datasketches crate.
impl Error {
pub(crate) fn invalid_argument(msg: impl Into<String>) -> Self {
Self::new(ErrorKind::InvalidArgument, msg)
diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs
index 8c79625..bc1ce42 100644
--- a/datasketches/src/hll/sketch.rs
+++ b/datasketches/src/hll/sketch.rs
@@ -40,7 +40,7 @@ use crate::hll::serialization::*;
/// A HyperLogLog sketch.
///
-/// See the [hll module level documentation](crate::hll) for more.
+/// See the [module level documentation](super) for more.
#[derive(Debug, Clone, PartialEq)]
pub struct HllSketch {
lg_config_k: u8,
diff --git a/datasketches/src/hll/union.rs b/datasketches/src/hll/union.rs
index 5dba0bb..870c8d8 100644
--- a/datasketches/src/hll/union.rs
+++ b/datasketches/src/hll/union.rs
@@ -45,7 +45,7 @@ use crate::hll::pack_coupon;
/// the union of all input sketches. It automatically handles sketches with
/// different configurations and modes.
///
-/// See the [hll module level documentation](crate::hll) for more.
+/// See the [module level documentation](super) for more.
#[derive(Debug, Clone)]
pub struct HllUnion {
/// Maximum lg_k that this union can handle
diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs
index 009fd9e..17701ab 100644
--- a/datasketches/src/lib.rs
+++ b/datasketches/src/lib.rs
@@ -33,6 +33,7 @@ compile_error!("datasketches does not support big-endian
targets");
pub mod bloom;
pub mod common;
pub mod countmin;
+pub mod cpc;
pub mod error;
pub mod frequencies;
pub mod hll;
diff --git a/datasketches/src/tdigest/sketch.rs
b/datasketches/src/tdigest/sketch.rs
index 91c90c7..a790892 100644
--- a/datasketches/src/tdigest/sketch.rs
+++ b/datasketches/src/tdigest/sketch.rs
@@ -22,7 +22,6 @@ use std::num::NonZeroU64;
use crate::codec::SketchBytes;
use crate::codec::SketchSlice;
use crate::error::Error;
-use crate::error::ErrorKind;
use crate::tdigest::serialization::*;
/// The default value of K if one is not specified.
@@ -34,7 +33,7 @@ const DEFAULT_WEIGHT: NonZeroU64 =
NonZeroU64::new(1).unwrap();
/// T-Digest sketch for estimating quantiles and ranks.
///
-/// See the [tdigest module level documentation](crate::tdigest) for more.
+/// See the [module level documentation](super) for more.
#[derive(Debug, Clone)]
pub struct TDigestMut {
k: u16,
@@ -100,10 +99,9 @@ impl TDigestMut {
/// ```
pub fn try_new(k: u16) -> Result<Self, Error> {
if k < 10 {
- return Err(Error::new(
- ErrorKind::InvalidArgument,
- format!("k must be at least 10, got {k}"),
- ));
+ return Err(Error::invalid_argument(format!(
+ "k must be at least 10, got {k}"
+ )));
}
Ok(Self::make(
@@ -788,7 +786,7 @@ impl TDigestMut {
/// Immutable (frozen) T-Digest sketch for estimating quantiles and ranks.
///
-/// See the [module documentation](super) for more details.
+/// See the [module level documentation](super) for more.
pub struct TDigest {
k: u16,
diff --git a/datasketches/src/theta/sketch.rs b/datasketches/src/theta/sketch.rs
index 506b2b6..07110d4 100644
--- a/datasketches/src/theta/sketch.rs
+++ b/datasketches/src/theta/sketch.rs
@@ -25,6 +25,7 @@ use std::hash::Hash;
use crate::common::NumStdDev;
use crate::common::ResizeFactor;
use crate::common::binomial_bounds;
+use crate::common::canonical_double;
use crate::hash::DEFAULT_UPDATE_SEED;
use crate::theta::hash_table::DEFAULT_LG_K;
use crate::theta::hash_table::MAX_LG_K;
@@ -52,7 +53,9 @@ impl ThetaSketch {
ThetaSketchBuilder::default()
}
- /// Update the sketch with a hashable value
+ /// Update the sketch with a hashable value.
+ ///
+ /// For `f32`/`f64` values, use `update_f32`/`update_f64` instead.
///
/// # Examples
///
@@ -69,7 +72,7 @@ impl ThetaSketch {
}
}
- /// Update the sketch with a f64 value
+ /// Update the sketch with a f64 value.
///
/// # Examples
///
@@ -85,7 +88,7 @@ impl ThetaSketch {
self.update(canonical);
}
- /// Update the sketch with a f32 value
+ /// Update the sketch with a f32 value.
///
/// # Examples
///
@@ -356,16 +359,3 @@ impl ThetaSketchBuilder {
ThetaSketch { table }
}
}
-
-/// Canonicalize double value for compatibility with Java
-fn canonical_double(value: f64) -> i64 {
- if value.is_nan() {
- // Java's Double.doubleToLongBits() NaN value
- 0x7ff8000000000000i64
- } else {
- // -0.0 + 0.0 == +0.0 under IEEE754 roundTiesToEven rounding mode,
- // which Rust guarantees. Thus, by adding a positive zero we
- // canonicalize signed zero without any branches in one instruction.
- (value + 0.0).to_bits() as i64
- }
-}
diff --git a/datasketches/tests/cpc_union_test.rs
b/datasketches/tests/cpc_union_test.rs
new file mode 100644
index 0000000..6e8aa32
--- /dev/null
+++ b/datasketches/tests/cpc_union_test.rs
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datasketches::cpc::CpcSketch;
+use datasketches::cpc::CpcUnion;
+use googletest::assert_that;
+use googletest::prelude::near;
+
+const RELATIVE_ERROR_FOR_LG_K_11: f64 = 0.02;
+
+#[test]
+fn test_empty() {
+ let union = CpcUnion::new(11);
+ let sketch = union.get_result();
+ assert!(sketch.is_empty());
+ assert_eq!(sketch.estimate(), 0.0);
+}
+
+#[test]
+fn test_two_values() {
+ let mut sketch = CpcSketch::new(11);
+ sketch.update(1);
+ let mut union = CpcUnion::new(11);
+ union.update(&sketch);
+
+ let result = union.get_result();
+ assert!(!result.is_empty());
+ assert_eq!(result.estimate(), 1.0);
+
+ sketch.update(2);
+ union.update(&sketch);
+ let result = union.get_result();
+ assert!(!result.is_empty());
+ assert_that!(
+ sketch.estimate(),
+ near(2.0, RELATIVE_ERROR_FOR_LG_K_11 * 2.0)
+ );
+}
+
+#[test]
+fn test_custom_seed() {
+ let mut sketch = CpcSketch::with_seed(11, 123);
+ sketch.update(1);
+ sketch.update(2);
+ sketch.update(3);
+
+ let mut union = CpcUnion::with_seed(11, 123);
+ union.update(&sketch);
+ let result = union.get_result();
+ assert!(!result.is_empty());
+ assert_that!(
+ result.estimate(),
+ near(3.0, RELATIVE_ERROR_FOR_LG_K_11 * 3.0)
+ );
+}
+
+#[test]
+#[should_panic]
+fn test_custom_seed_mismatch() {
+ let mut sketch = CpcSketch::with_seed(11, 123);
+ sketch.update(1);
+ sketch.update(2);
+ sketch.update(3);
+
+ let mut union = CpcUnion::with_seed(11, 234);
+ union.update(&sketch);
+}
+
+#[test]
+fn test_large_values() {
+ let mut key = 0;
+ let mut sketch = CpcSketch::new(11);
+ let mut union = CpcUnion::new(11);
+ for _ in 0..1000 {
+ let mut tmp = CpcSketch::new(11);
+ for _ in 0..10000 {
+ sketch.update(key);
+ tmp.update(key);
+ key += 1;
+ }
+ union.update(&tmp);
+ }
+ let result = union.get_result();
+ assert!(!result.is_empty());
+ assert_eq!(result.num_coupons(), union.num_coupons());
+ let estimate = sketch.estimate();
+ assert_that!(
+ result.estimate(),
+ near(estimate, RELATIVE_ERROR_FOR_LG_K_11 * estimate)
+ );
+}
+
+#[test]
+fn test_reduce_k_empty() {
+ let mut sketch = CpcSketch::new(11);
+ for i in 0..10000 {
+ sketch.update(i);
+ }
+ let mut union = CpcUnion::new(12);
+ union.update(&sketch);
+ let result = union.get_result();
+ assert_eq!(result.lg_k(), 11);
+ assert_that!(
+ result.estimate(),
+ near(10000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0)
+ );
+}
+
+#[test]
+fn test_reduce_k_sparse() {
+ let mut union = CpcUnion::new(12);
+
+ let mut sketch12 = CpcSketch::new(12);
+ for i in 0..100 {
+ sketch12.update(i);
+ }
+ union.update(&sketch12);
+
+ let mut sketch11 = CpcSketch::new(11);
+ for i in 0..1000 {
+ sketch11.update(i);
+ }
+ union.update(&sketch11);
+
+ let result = union.get_result();
+ assert_eq!(result.lg_k(), 11);
+ assert_that!(
+ result.estimate(),
+ near(1000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0)
+ );
+}
+
+#[test]
+fn test_reduce_k_window() {
+ let mut union = CpcUnion::new(12);
+
+ let mut sketch12 = CpcSketch::new(12);
+ for i in 0..500 {
+ sketch12.update(i);
+ }
+ union.update(&sketch12);
+
+ let mut sketch11 = CpcSketch::new(11);
+ for i in 0..1000 {
+ sketch11.update(i);
+ }
+ union.update(&sketch11);
+
+ let result = union.get_result();
+ assert_eq!(result.lg_k(), 11);
+ assert_that!(
+ result.estimate(),
+ near(1000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0)
+ );
+}
diff --git a/datasketches/tests/cpc_update_test.rs
b/datasketches/tests/cpc_update_test.rs
new file mode 100644
index 0000000..7b814f7
--- /dev/null
+++ b/datasketches/tests/cpc_update_test.rs
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datasketches::common::NumStdDev;
+use datasketches::cpc::CpcSketch;
+use googletest::assert_that;
+use googletest::prelude::ge;
+use googletest::prelude::le;
+use googletest::prelude::near;
+
+const RELATIVE_ERROR_FOR_LG_K_11: f64 = 0.02;
+
+#[test]
+fn test_empty() {
+ let sketch = CpcSketch::new(11);
+ assert!(sketch.is_empty());
+ assert_eq!(sketch.estimate(), 0.0);
+ assert_eq!(sketch.lower_bound(NumStdDev::One), 0.0);
+ assert_eq!(sketch.upper_bound(NumStdDev::One), 0.0);
+ assert!(sketch.validate());
+}
+
+#[test]
+fn test_one_value() {
+ let mut sketch = CpcSketch::new(11);
+ sketch.update(1);
+ assert!(!sketch.is_empty());
+ assert_eq!(sketch.estimate(), 1.0);
+ assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One)));
+ assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One)));
+ assert!(sketch.validate());
+}
+
+#[test]
+fn test_many_values() {
+ let mut sketch = CpcSketch::new(11);
+ for i in 0..10000 {
+ sketch.update(i);
+ }
+ assert!(!sketch.is_empty());
+ assert_that!(
+ sketch.estimate(),
+ near(10000.0, RELATIVE_ERROR_FOR_LG_K_11 * 10000.0)
+ );
+ assert_that!(sketch.estimate(), ge(sketch.lower_bound(NumStdDev::One)));
+ assert_that!(sketch.estimate(), le(sketch.upper_bound(NumStdDev::One)));
+ assert!(sketch.validate());
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]