This is an automated email from the ASF dual-hosted git repository. placave pushed a commit to branch repro-hll-merge-dependence in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit 7be51a62f2cfbf8bb89ad6f9ef45eecb9440f601 Author: Pierre Lacave <[email protected]> AuthorDate: Thu Jul 31 23:11:11 2025 +0200 Add test showing HLL merge dependence --- .../datasketches/hll/HllSketchMergeOrderTest.java | 116 +++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/src/test/java/org/apache/datasketches/hll/HllSketchMergeOrderTest.java b/src/test/java/org/apache/datasketches/hll/HllSketchMergeOrderTest.java new file mode 100644 index 000000000..d99de1c18 --- /dev/null +++ b/src/test/java/org/apache/datasketches/hll/HllSketchMergeOrderTest.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.hll; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; + +import java.util.Random; +import org.testng.annotations.Test; + +/** + * This test demonstrates that DataSketch HLL merging is order-dependent with powers-of-2 data patterns. + * + * The test proves that merging 3 sketches in different orders produces different cardinality estimates, + * violating the mathematical expectation that (A ∪ B) ∪ C = A ∪ (B ∪ C). + * + * KEY FINDINGS: + * - DataSketch HLL merge operations are NOT always commutative/associative + * - Order dependency occurs with specific data patterns (e.g., powers of 2) + * - The variance in estimates can be significant + * + * IMPLICATIONS: + * - Applications using DataSketch HLL must ensure consistent merge ordering + * - Results may vary depending on the order sketches are merged in distributed systems + */ +public class HllSketchMergeOrderTest { + + private static final int LOG2M = 11; + + @Test + public void testDataSketchHLLMergeOrderDependency() { + // Create 3 sketches with powers-of-2 pattern that triggers order dependency + HllSketch sketch1 = createPowersOf2Sketch(5000, 0); + HllSketch sketch2 = createPowersOf2Sketch(3000, 100000000L); + HllSketch sketch3 = createPowersOf2Sketch(7000, 200000000L); + + // Test two different merge orders: ABC vs CBA + double estimateABC = mergeThreeSketches(sketch1, sketch2, sketch3); + double estimateCBA = mergeThreeSketches(sketch3, sketch2, sketch1); + + // Test third order: BAC + double estimateBAC = mergeThreeSketches(sketch2, sketch1, sketch3); + + System.out.println("Merge order ABC: " + estimateABC); + System.out.println("Merge order CBA: " + estimateCBA); + System.out.println("Merge order BAC: " + estimateBAC); + + // Check for differences + boolean hasDifferences = estimateABC != estimateCBA || + estimateABC != estimateBAC || + estimateCBA != estimateBAC; + + if (!hasDifferences) { + System.out.println("All estimates are identical: " + estimateABC); + // Still pass the test but note that no order dependency was found + assertEquals(estimateABC, estimateCBA, 0.0); + } else { + System.out.println("SUCCESS: Proved DataSketch HLL merge is order-dependent"); + double maxDiff = Math.max(Math.abs(estimateABC - estimateCBA), + Math.max(Math.abs(estimateABC - estimateBAC), + Math.abs(estimateCBA - estimateBAC))); + System.out.println("Maximum difference: " + maxDiff); + + // Test passes when we find order dependency + assertFalse(hasDifferences, "Difference we found between the three merges"); + } + } + + /** + * Creates a DataSketch HLL with powers-of-2 values that can trigger order dependency + */ + private HllSketch createPowersOf2Sketch(int numValues, long baseValue) { + HllSketch sketch = new HllSketch(LOG2M); + Random rng = new Random(42); // Fixed seed for reproducibility + + for (int i = 0; i < numValues; i++) { + // Create powers of 2 with small random offsets + long power = 1L << (i % 62); // Powers of 2, avoid overflow + long value = baseValue + power + (rng.nextInt(3) - 1); // Add small offset + sketch.update(value); + } + + return sketch; + } + + /** + * Merges three sketches in the specified order and returns the cardinality estimate + */ + private double mergeThreeSketches(HllSketch s1, HllSketch s2, HllSketch s3) { + Union union = new Union(LOG2M); + + union.update(s1); + union.update(s2); + union.update(s3); + + return union.getEstimate(); + } + +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
