signore662-beep opened a new issue, #66:
URL: https://github.com/apache/datasketches-python/issues/66
import numpy as np
import math
import numba
from numba import njit, prange, uint8, uint32, uint64, float64, config
@njit(fastmath=True, inline='always')
def _clz_v2300(n: uint64) -> int:
    """Count the leading zero bits of a 64-bit unsigned value.

    Works as a branch-based binary search: the top 32, 16, 8, 4, 2 and
    finally 1 bit(s) are tested in turn, and whenever the tested window is
    empty the value is shifted left by the window width so the next test
    always looks at the current most-significant bits.

    Args:
        n: Value to inspect; expected to fit in 64 unsigned bits.

    Returns:
        The number of zero bits above the highest set bit, or 64 when
        ``n`` is zero.
    """
    if n == 0:
        return 64

    leading = 0

    # Upper 32 bits empty -> skip them.
    if (n & 0xFFFFFFFF00000000) == 0:
        leading += 32
        n <<= 32

    # Upper 16 bits of what remains.
    if (n & 0xFFFF000000000000) == 0:
        leading += 16
        n <<= 16

    # Upper 8 bits.
    if (n & 0xFF00000000000000) == 0:
        leading += 8
        n <<= 8

    # Upper nibble.
    if (n & 0xF000000000000000) == 0:
        leading += 4
        n <<= 4

    # Upper two bits.
    if (n & 0xC000000000000000) == 0:
        leading += 2
        n <<= 2

    # Final bit: no further shift is needed because n is not read again.
    if (n & 0x8000000000000000) == 0:
        leading += 1

    return leading
@njit(parallel=True, fastmath=True)
def v2300_core_engine(data, p, salt):
    """Build HyperLogLog-style registers from ``data`` in two parallel phases.

    Phase 1 walks ``data`` with ``prange``; each iteration writes only to the
    scratch row selected by ``numba.get_thread_id()``, so threads never share
    a row. Phase 2 takes the column-wise maximum over all scratch rows.

    For every element the 64-bit value ``uint64(data[i]) ^ salt`` is split
    into a register index (its top ``p`` bits) and a rank (leading-zero count
    of the remaining bits, plus one).

    NOTE(review): no hash mixing is applied beyond the XOR with ``salt``.
    This presumably expects ``data`` to already contain well-distributed
    64-bit hashes -- confirm against callers.

    Args:
        data: Indexable sequence of values convertible to ``uint64``.
        p: Precision; the result has ``2**p`` registers. Must be in 1..32.
        salt: Integer XOR-ed into every value; coerced to ``uint64``.

    Returns:
        A ``uint8`` array of length ``2**p`` holding the maximum rank seen
        for each register (0 where no element mapped to the register).

    Raises:
        ValueError: If ``p`` is outside 1..32.
    """
    # Outside this range the shift widths below are undefined (p <= 0 or
    # p >= 64) or the index no longer fits the uint32 cast (p > 32).
    if p < 1 or p > 32:
        raise ValueError("p must be between 1 and 32")

    n_threads = config.NUMBA_NUM_THREADS
    m = 1 << p

    # Coerce every operand of the bit arithmetic to uint64 up front. A plain
    # Python int salt is typed as a signed integer, which would make the XOR
    # result signed and turn the right shift below into an arithmetic shift:
    # values with the top bit set would then produce a negative index that
    # wraps to a huge uint32 and writes outside the scratchpad.
    salt_u = uint64(salt)
    index_shift = uint64(64 - p)
    rank_shift = uint64(p)
    sentinel = uint64(1)

    local_scratchpad = np.zeros((n_threads, m), dtype=uint8)

    # --- PHASE 1: Local Mapping (Parallel) ---
    for i in prange(len(data)):
        tid = numba.get_thread_id()
        h = uint64(data[i]) ^ salt_u
        idx = uint32(h >> index_shift)

        # Rank calculation: drop the index bits, then set bit 0 so the
        # value is never zero and the leading-zero count stays below 64.
        w = (h << rank_shift) | sentinel
        rank = uint8(_clz_v2300(w) + 1)

        # Update thread-local maximum
        if rank > local_scratchpad[tid, idx]:
            local_scratchpad[tid, idx] = rank

    # --- PHASE 2: Global Reduction (Parallel) ---
    global_registers = np.zeros(m, dtype=uint8)
    for j in prange(m):
        m_max = uint8(0)
        for t in range(n_threads):
            if local_scratchpad[t, j] > m_max:
                m_max = local_scratchpad[t, j]
        global_registers[j] = m_max

    return global_registers
@njit(fastmath=True)
def v2300_estimate(registers, p):
    """Estimate cardinality from HyperLogLog registers.

    Computes the raw HyperLogLog estimate ``alpha * m**2 / sum(2**-reg)``
    over the first ``m = 2**p`` registers and, when that estimate is at most
    ``2.5 * m`` and at least one register is still zero, returns the linear
    counting estimate ``m * log(m / zeros)`` instead. No large-range or
    HLL++ bias correction is applied.

    Args:
        registers: Indexable sequence of register values; only the first
            ``2**p`` entries are read.
        p: Precision used to build the registers.

    Returns:
        The estimated number of distinct elements as a float.

    Raises:
        ValueError: If ``registers`` holds fewer than ``2**p`` entries.
    """
    m = 1 << p
    # Reading past the end of the array is unchecked in compiled code.
    if len(registers) < m:
        raise ValueError("registers must hold at least 2**p entries")

    v_zeros = 0
    z_inv = 0.0
    for i in range(m):
        val = registers[i]
        # Use a floating-point power instead of 1.0 / (1 << val): the
        # integer shift overflows a 64-bit integer once val reaches 63,
        # which yields a negative or undefined term.
        z_inv += 2.0 ** (-float(val))
        if val == 0:
            v_zeros += 1

    # The closed-form alpha is only valid for m >= 128; smaller register
    # counts use the constants published with the algorithm.
    if m == 16:
        alpha = 0.673
    elif m == 32:
        alpha = 0.697
    elif m == 64:
        alpha = 0.709
    else:
        alpha = 0.7213 / (1.0 + 1.079 / m)

    raw_est = alpha * (float(m) ** 2) / z_inv

    # Linear Counting for small sets
    if raw_est <= 2.5 * m and v_zeros > 0:
        return m * math.log(m / v_zeros)
    return raw_est
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]