This is an automated email from the ASF dual-hosted git repository.
placave pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-rust.git
The following commit(s) were added to refs/heads/main by this push:
new a081c45 refactor: implement MurmurHash3 internally (#36)
a081c45 is described below
commit a081c458d013c60f05321b45c530b3ec4c98e709
Author: tison <[email protected]>
AuthorDate: Mon Dec 22 17:22:16 2025 +0800
refactor: implement MurmurHash3 internally (#36)
* add tests for murmurhash3
Signed-off-by: tison <[email protected]>
* impl murmurhash3
Signed-off-by: tison <[email protected]>
* test and fix up
Signed-off-by: tison <[email protected]>
* drop mur3 dep
Signed-off-by: tison <[email protected]>
* lint
Signed-off-by: tison <[email protected]>
---------
Signed-off-by: tison <[email protected]>
---
Cargo.lock | 7 --
Cargo.toml | 1 -
src/{lib.rs => hash/mod.rs} | 19 +---
src/hash/murmurhash.rs | 237 ++++++++++++++++++++++++++++++++++++++++++++
src/hll/mod.rs | 6 +-
src/lib.rs | 2 +
6 files changed, 244 insertions(+), 28 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index a2f2c69..cb421e2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -29,7 +29,6 @@ version = "0.1.0"
dependencies = [
"byteorder",
"googletest",
- "mur3",
]
[[package]]
@@ -61,12 +60,6 @@ version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
-[[package]]
-name = "mur3"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97af489e1e21b68de4c390ecca6703318bc1aa16e9733bcb62c089b73c6fbb1b"
-
[[package]]
name = "num-traits"
version = "0.2.19"
diff --git a/Cargo.toml b/Cargo.toml
index 90961f0..46b91f0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,7 +36,6 @@ rustdoc-args = ["--cfg", "docsrs"]
[dependencies]
byteorder = { version = "1.5.0" }
-mur3 = { version = "0.1.0" }
[dev-dependencies]
googletest = { version = "0.14.2" }
diff --git a/src/lib.rs b/src/hash/mod.rs
similarity index 53%
copy from src/lib.rs
copy to src/hash/mod.rs
index d49faa0..7059dbb 100644
--- a/src/lib.rs
+++ b/src/hash/mod.rs
@@ -15,21 +15,6 @@
// specific language governing permissions and limitations
// under the License.
-//! # Apache® DataSketches™ Core Rust Library Component
-//!
-//! The Sketching Core Library provides a range of stochastic streaming
algorithms and closely
-//! related Rust technologies that are particularly useful when integrating
this technology into
-//! systems that must deal with massive data.
-//!
-//! This library is divided into modules that constitute distinct groups of
functionality.
+mod murmurhash;
-#![cfg_attr(docsrs, feature(doc_cfg))]
-#![deny(missing_docs)]
-
-// See https://github.com/apache/datasketches-rust/issues/28 for more
information.
-#[cfg(target_endian = "big")]
-compile_error!("datasketches does not support big-endian targets");
-
-pub mod error;
-pub mod hll;
-pub mod tdigest;
+pub use murmurhash::MurmurHash3X64128;
diff --git a/src/hash/murmurhash.rs b/src/hash/murmurhash.rs
new file mode 100644
index 0000000..b6d5106
--- /dev/null
+++ b/src/hash/murmurhash.rs
@@ -0,0 +1,237 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use byteorder::{ByteOrder, LE};
+use std::hash::Hasher;
+
+const DEFAULT_SEED: u64 = 9001;
+const C1: u64 = 0x87c37b91114253d5;
+const C2: u64 = 0x4cf5ad432745937f;
+
+/// The MurmurHash3 is a fast, non-cryptographic, 128-bit hash function that
has
+/// excellent avalanche and 2-way bit independence properties.
+#[derive(Debug)]
+pub struct MurmurHash3X64128 {
+ h1: u64,
+ h2: u64,
+ total: u64,
+ buf: [u8; 16],
+ buf_len: usize,
+}
+
+impl MurmurHash3X64128 {
+ pub fn with_seed(seed: u64) -> Self {
+ MurmurHash3X64128 {
+ h1: seed,
+ h2: seed,
+ total: 0,
+ buf: [0; 16],
+ buf_len: 0,
+ }
+ }
+
+ pub fn finish128(&self) -> (u64, u64) {
+ let mut h1 = self.h1;
+ let mut h2 = self.h2;
+
+ let total = self.total + self.buf_len as u64;
+ let rem = self.buf_len;
+
+ // tail
+ if rem > 0 {
+ if rem > 8 {
+ // read k2 little endian
+ let mut buf = [0u8; 8];
+ let k2_len = rem - 8;
+ buf[..k2_len].copy_from_slice(&self.buf[8..rem]);
+ // mix k2
+ let mut k2 = u64::from_le_bytes(buf);
+ k2 = k2.wrapping_mul(C2);
+ k2 = k2.rotate_left(33);
+ k2 = k2.wrapping_mul(C1);
+ h2 ^= k2;
+ }
+
+ // read k1 little endian
+ let mut buf = [0u8; 8];
+ let k1_len = rem.min(8);
+ buf[..k1_len].copy_from_slice(&self.buf[..k1_len]);
+ // mix k1
+ let mut k1 = u64::from_le_bytes(buf);
+ k1 = k1.wrapping_mul(C1);
+ k1 = k1.rotate_left(31);
+ k1 = k1.wrapping_mul(C2);
+ h1 ^= k1;
+ }
+
+ h1 ^= total;
+ h2 ^= total;
+ h1 = h1.wrapping_add(h2);
+ h2 = h2.wrapping_add(h1);
+ h1 = fmix64(h1);
+ h2 = fmix64(h2);
+ h1 = h1.wrapping_add(h2);
+ h2 = h2.wrapping_add(h1);
+ (h1, h2)
+ }
+
+ #[inline]
+ fn update(&mut self, mut k1: u64, mut k2: u64) {
+ // k1 *= c1; k1 = MURMUR3_ROTL64(k1, 31); k1 *= c2; out.h1 ^= k1;
+ k1 = k1.wrapping_mul(C1);
+ k1 = k1.rotate_left(31);
+ k1 = k1.wrapping_mul(C2);
+ self.h1 ^= k1;
+
+ // out.h1 = MURMUR3_ROTL64(out.h1, 27); out.h1 += out.h2; out.h1 =
out.h1*5+0x52dce729;
+ self.h1 = self.h1.rotate_left(27);
+ self.h1 = self.h1.wrapping_add(self.h2);
+ self.h1 = self.h1.wrapping_mul(5).wrapping_add(0x52dce729);
+
+ // k2 *= c2; k2 = MURMUR3_ROTL64(k2, 33); k2 *= c1; out.h2 ^= k2;
+ k2 = k2.wrapping_mul(C2);
+ k2 = k2.rotate_left(33);
+ k2 = k2.wrapping_mul(C1);
+ self.h2 ^= k2;
+
+ // out.h2 = MURMUR3_ROTL64(out.h2,31); out.h2 += out.h1; out.h2 =
out.h2*5+c4;
+ self.h2 = self.h2.rotate_left(31);
+ self.h2 = self.h2.wrapping_add(self.h1);
+ self.h2 = self.h2.wrapping_mul(5).wrapping_add(0x38495ab5);
+
+ // accumulate total length
+ self.total += 16;
+ }
+}
+
+impl Default for MurmurHash3X64128 {
+ fn default() -> Self {
+ Self::with_seed(DEFAULT_SEED)
+ }
+}
+
+impl Hasher for MurmurHash3X64128 {
+ fn finish(&self) -> u64 {
+ self.finish128().0
+ }
+
+ fn write(&mut self, mut bytes: &[u8]) {
+ if self.buf_len + bytes.len() < 16 {
+ self.buf[self.buf_len..self.buf_len +
bytes.len()].copy_from_slice(bytes);
+ self.buf_len += bytes.len();
+ return;
+ }
+
+ if self.buf_len != 0 {
+ let wanted = 16 - self.buf_len;
+ self.buf[self.buf_len..].copy_from_slice(&bytes[..wanted]);
+
+ let k1 = LE::read_u64(&self.buf[0..8]);
+ let k2 = LE::read_u64(&self.buf[8..16]);
+ self.update(k1, k2);
+
+ bytes = &bytes[wanted..];
+ self.buf_len = 0;
+ }
+
+ // Number of full 128-bit blocks of 16 bytes.
+ // Possible exclusion of a remainder of up to 15 bytes.
+ let blocks = bytes.len() >> 4; // bytes / 16
+
+ // Process the 128-bit blocks (the body) into the hash
+ for i in 0..blocks {
+ let lo = i << 4;
+ let mi = lo + 8;
+ let hi = mi + 8;
+ let k1 = LE::read_u64(&bytes[lo..mi]);
+ let k2 = LE::read_u64(&bytes[mi..hi]);
+ self.update(k1, k2);
+ }
+
+ // remain bytes
+ let len = bytes.len() % 16;
+ if len > 0 {
+ self.buf[0..len].copy_from_slice(&bytes[blocks << 4..]);
+ self.buf_len = len;
+ }
+ }
+}
+
+/// Finalization mix: force all bits of a hash block to avalanche.
+#[inline]
+fn fmix64(mut k: u64) -> u64 {
+ k ^= k >> 33;
+ k = k.wrapping_mul(0xff51afd7ed558ccd);
+ k ^= k >> 33;
+ k = k.wrapping_mul(0xc4ceb9fe1a85ec53);
+ k ^ (k >> 33)
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ fn murmurhash3_x64_128(key: &[u8], seed: u64) -> (u64, u64) {
+ let mut hasher = MurmurHash3X64128::with_seed(seed);
+ hasher.write(key);
+ hasher.finish128()
+ }
+
+ #[test]
+ fn test_remainder() {
+ // remainder > 8
+ let key = "The quick brown fox jumps over the lazy dog";
+ let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0);
+ assert_eq!(h1, 0xe34bbc7bbc071b6c);
+ assert_eq!(h2, 0x7a433ca9c49a9347);
+
+ // change one bit
+ let key = "The quick brown fox jumps over the lazy eog";
+ let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0);
+ assert_eq!(h1, 0x362108102c62d1c9);
+ assert_eq!(h2, 0x3285cd100292b305);
+
+ // test a remainder < 8
+ let key = "The quick brown fox jumps over the lazy dogdogdog";
+ let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0);
+ assert_eq!(h1, 0x9c8205300e612fc4);
+ assert_eq!(h2, 0xcbc0af6136aa3df9);
+
+ // test a remainder = 8
+ let key = "The quick brown fox jumps over the lazy1";
+ let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0);
+ assert_eq!(h1, 0xe3301a827e5cdfe3);
+ assert_eq!(h2, 0xbdbf05f8da0f0392);
+
+ // test a remainder = 0
+ let key = "The quick brown fox jumps over t";
+ let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0);
+ assert_eq!(h1, 0xdf6af91bb29bdacf);
+ assert_eq!(h2, 0x91a341c58df1f3a6);
+
+ // test a ones byte and a zeros byte
+ let key = [
+ 0x54, 0x68, 0x65, 0x20, 0x71, 0x75, 0x69, 0x63, 0x6b, 0x20, 0x62,
0x72, 0x6f, 0x77,
+ 0x6e, 0x20, 0x66, 0x6f, 0x78, 0x20, 0x6a, 0x75, 0x6d, 0x70, 0x73,
0x20, 0x6f, 0x76,
+ 0x65, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x7a, 0x79,
0x20, 0x64, 0x6f,
+ 0x67, 0xff, 0x64, 0x6f, 0x67, 0x00,
+ ];
+ let (h1, h2) = murmurhash3_x64_128(&key, 0);
+ assert_eq!(h1, 0xe88abda785929c9e);
+ assert_eq!(h2, 0x96b98587cacc83d6);
+ }
+}
diff --git a/src/hll/mod.rs b/src/hll/mod.rs
index 40d1b31..d156649 100644
--- a/src/hll/mod.rs
+++ b/src/hll/mod.rs
@@ -74,6 +74,8 @@
use std::hash::Hash;
+use crate::hash::MurmurHash3X64128;
+
mod array4;
mod array6;
mod array8;
@@ -147,9 +149,7 @@ fn pack_coupon(slot: u32, value: u8) -> u32 {
/// Generate a coupon from a hashable value.
fn coupon<H: Hash>(v: H) -> u32 {
- const DEFAULT_SEED: u32 = 9001;
-
- let mut hasher = mur3::Hasher128::with_seed(DEFAULT_SEED);
+ let mut hasher = MurmurHash3X64128::default();
v.hash(&mut hasher);
let (lo, hi) = hasher.finish128();
diff --git a/src/lib.rs b/src/lib.rs
index d49faa0..c566727 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -33,3 +33,5 @@ compile_error!("datasketches does not support big-endian
targets");
pub mod error;
pub mod hll;
pub mod tdigest;
+
+mod hash;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]