[ 
https://issues.apache.org/jira/browse/HIVE-26277?focusedWorklogId=808314&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-808314
 ]

ASF GitHub Bot logged work on HIVE-26277:
-----------------------------------------

                Author: ASF GitHub Bot
            Created on: 13/Sep/22 14:19
            Start Date: 13/Sep/22 14:19
    Worklog Time Spent: 10m 
      Work Description: asolimando commented on code in PR #3339:
URL: https://github.com/apache/hive/pull/3339#discussion_r969692288


##########
standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregatorTest.java:
##########
@@ -0,0 +1,279 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.Date;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder;
+import 
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static 
org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo;
+
+@Category(MetastoreUnitTest.class)
+public class DateColumnStatsAggregatorTest {
+
+  private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0,
+      0, null, null, Collections.emptyMap(), null, null,
+      TableType.MANAGED_TABLE.toString());
+  private static final FieldSchema COL = new FieldSchema("col", "int", "");
+
+  private static final Date DATE_1 = new Date(1);
+  private static final Date DATE_2 = new Date(2);
+  private static final Date DATE_3 = new Date(3);
+  private static final Date DATE_4 = new Date(4);
+  private static final Date DATE_5 = new Date(5);
+  private static final Date DATE_6 = new Date(6);
+  private static final Date DATE_7 = new Date(7);
+  private static final Date DATE_8 = new Date(8);
+  private static final Date DATE_9 = new Date(9);
+
+  @Test
+  public void testAggregateSingleStat() throws MetaException {
+    List<String> partitions = Collections.singletonList("part1");
+
+    ColumnStatisticsData data1 = new 
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2).low(DATE_1).high(DATE_4)
+        .hll(DATE_1.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch()).build();
+    List<ColStatsObjWithSourceInfo> statsList =
+        Collections.singletonList(createStatsWithInfo(data1, TABLE, COL, 
partitions.get(0)));
+
+    DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+    ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, 
partitions, true);
+
+    Assert.assertEquals(data1, computedStatsObj.getStatsData());
+  }
+
+  @Test
+  public void testAggregateSingleStatWhenNullValues() throws MetaException {
+    List<String> partitions = Collections.singletonList("part1");
+
+    ColumnStatisticsData data1 = new 
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2).build();
+    List<ColStatsObjWithSourceInfo> statsList =
+        Collections.singletonList(createStatsWithInfo(data1, TABLE, COL, 
partitions.get(0)));
+
+    DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+    ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, 
partitions, true);
+    Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+    aggregator.useDensityFunctionForNDVEstimation = true;
+    computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+    Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+    aggregator.useDensityFunctionForNDVEstimation = false;
+    aggregator.ndvTuner = 1;
+    // ndv tuner does not have any effect because min numDVs and max numDVs 
coincide (we have a single stats)
+    computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+    Assert.assertEquals(data1, computedStatsObj.getStatsData());
+  }
+
+  @Test
+  public void testAggregateMultipleStatsWhenSomeNullValues() throws 
MetaException {
+    List<String> partitions = Arrays.asList("part1", "part2");
+
+    long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch() 
};
+    ColumnStatisticsData data1 = new 
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2)
+        .low(DATE_1).high(DATE_2).hll(values1).build();
+    ColumnStatisticsData data2 = new 
ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3).build();
+
+    List<ColStatsObjWithSourceInfo> statsList = 
Arrays.asList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+        createStatsWithInfo(data2, TABLE, COL, partitions.get(1)));
+
+    DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+
+    ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, 
partitions, true);
+    ColumnStatisticsData expectedStats = new 
ColStatsBuilder<>(Date.class).numNulls(3).numDVs(3)
+        .low(DATE_1).high(DATE_2).hll(values1).build();
+    Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+    aggregator.useDensityFunctionForNDVEstimation = true;
+    computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+    expectedStats = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(4)
+        .low(DATE_1).high(DATE_2).hll(values1).build();
+    Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+    aggregator.useDensityFunctionForNDVEstimation = false;
+    aggregator.ndvTuner = 1;
+    computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+    expectedStats = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(5)
+        .low(DATE_1).high(DATE_2).hll(values1).build();
+    Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+  }
+
+  @Test
+  public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
+    List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+    long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(), 
DATE_3.getDaysSinceEpoch() };
+    ColumnStatisticsData data1 = new 
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3)
+        .low(DATE_1).high(DATE_3).hll(values1).build();
+
+    long[] values2 = { DATE_3.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch(), 
DATE_5.getDaysSinceEpoch() };
+    ColumnStatisticsData data2 = new 
ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3)
+        .low(DATE_3).high(DATE_5).hll(values2).build();
+
+    long[] values3 = { DATE_6.getDaysSinceEpoch(), DATE_7.getDaysSinceEpoch() 
};
+    ColumnStatisticsData data3 = new 
ColStatsBuilder<>(Date.class).numNulls(3).numDVs(2)
+        .low(DATE_6).high(DATE_7).hll(values3).build();
+
+    List<ColStatsObjWithSourceInfo> statsList = 
Arrays.asList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+        createStatsWithInfo(data2, TABLE, COL, partitions.get(1)), 
createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+    DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+    ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, 
partitions, true);
+
+    // the aggregation does not update hll, only numDVs is updated; it keeps the 
first hll
+    ColumnStatisticsData expectedStats = new 
ColStatsBuilder<>(Date.class).numNulls(6).numDVs(7)
+        .low(DATE_1).high(DATE_7).hll(values1).build();
+
+    Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+  }
+
+  @Test
+  public void testAggregateMultiStatsWhenUnmergeableBitVectors() throws 
MetaException {
+    List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+    long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(), 
DATE_3.getDaysSinceEpoch() };
+    ColumnStatisticsData data1 = new 
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3)
+        .low(DATE_1).high(DATE_3).fmSketch(values1).build();
+    long[] values2 = { DATE_3.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch(), 
DATE_5.getDaysSinceEpoch() };
+    ColumnStatisticsData data2 = new 
ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3)
+        .low(DATE_3).high(DATE_5).hll(values2).build();
+    long[] values3 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(), 
DATE_6.getDaysSinceEpoch(),
+        DATE_8.getDaysSinceEpoch() };
+    ColumnStatisticsData data3 = new 
ColStatsBuilder<>(Date.class).numNulls(3).numDVs(4)
+        .low(DATE_1).high(DATE_8).hll(values3).build();
+
+    List<ColStatsObjWithSourceInfo> statsList = 
Arrays.asList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+        createStatsWithInfo(data2, TABLE, COL, partitions.get(1)), 
createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+    DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+
+    ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, 
partitions, true);
+    // the aggregation does not update the bitvector, only numDVs is, it keeps 
the first bitvector;
+    // numDVs is set to the maximum among all stats when non-mergeable 
bitvectors are detected
+    ColumnStatisticsData expectedStats = new 
ColStatsBuilder<>(Date.class).numNulls(6).numDVs(4)
+        .low(DATE_1).high(DATE_8).fmSketch(values1).build();
+    Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+    aggregator.useDensityFunctionForNDVEstimation = true;
+    computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+    // the use of the density function leads to a different estimation for 
numDVs
+    expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(6)
+        .low(DATE_1).high(DATE_8).fmSketch(values1).build();
+    Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+    // here the ndv lower bound is 4 (the highest individual numDVs), the 
higher bound is 10 (3 + 3 + 4, that is the
+    // sum of all the numDVs for all partitions), ndv tuner influences the 
choice between the lower bound
+    // (ndvTuner = 0) and the higher bound (ndvTuner = 1), and intermediate 
values for ndvTuner in the range (0, 1)
+    aggregator.useDensityFunctionForNDVEstimation = false;
+
+    aggregator.ndvTuner = 0;
+    computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+    expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(4)
+        .low(DATE_1).high(DATE_8).fmSketch(values1).build();
+    Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+    aggregator.ndvTuner = 0.5;
+    computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+    expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(7)
+        .low(DATE_1).high(DATE_8).fmSketch(values1).build();
+    Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+    aggregator.ndvTuner = 0.75;
+    computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+    expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(8)
+        .low(DATE_1).high(DATE_8).fmSketch(values1).build();
+    Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+    aggregator.ndvTuner = 1;
+    computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+    expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(10)
+        .low(DATE_1).high(DATE_8).fmSketch(values1).build();
+    Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+  }
+
+  @Test
+  public void testAggregateMultiStatsWhenOnlySomeAvailable() throws 
MetaException {
+    List<String> partitions = Arrays.asList("part1", "part2", "part3", 
"part4");
+
+    long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(), 
DATE_3.getDaysSinceEpoch() };
+    ColumnStatisticsData data1 = new 
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3)
+        .low(DATE_1).high(DATE_3).hll(values1).build();
+
+    ColumnStatisticsData data3 = new 
ColStatsBuilder<>(Date.class).numNulls(3).numDVs(1).low(DATE_7).high(DATE_7)
+        .hll(DATE_7.getDaysSinceEpoch()).build();
+
+    long[] values4 = { DATE_3.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch(), 
DATE_5.getDaysSinceEpoch() };
+    ColumnStatisticsData data4 = new 
ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3)
+        .low(DATE_3).high(DATE_5).hll(values4).build();
+
+    List<ColStatsObjWithSourceInfo> statsList = 
Arrays.asList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+        createStatsWithInfo(data3, TABLE, COL, partitions.get(2)), 
createStatsWithInfo(data4, TABLE, COL, partitions.get(3)));
+
+    DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+    ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, 
partitions, false);
+
+    // hll in case of missing stats is left as null, only numDVs is updated
+    ColumnStatisticsData expectedStats = new 
ColStatsBuilder<>(Date.class).numNulls(8).numDVs(4)
+        .low(DATE_1).high(DATE_9).build();

Review Comment:
   Also, `low` and `high` are computed by interpolation, so they are estimated 
values; it is therefore OK for them not to appear in the input in this specific case.





Issue Time Tracking
-------------------

    Worklog Id:     (was: 808314)
    Time Spent: 6h  (was: 5h 50m)

> NPEs and rounding issues in ColumnStatsAggregator classes
> ---------------------------------------------------------
>
>                 Key: HIVE-26277
>                 URL: https://issues.apache.org/jira/browse/HIVE-26277
>             Project: Hive
>          Issue Type: Bug
>          Components: Standalone Metastore, Statistics, Tests
>    Affects Versions: 4.0.0-alpha-2
>            Reporter: Alessandro Solimando
>            Assignee: Alessandro Solimando
>            Priority: Major
>              Labels: pull-request-available
>          Time Spent: 6h
>  Remaining Estimate: 0h
>
> Fix NPEs and rounding errors in _ColumnStatsAggregator_ classes, add 
> unit-tests for all the involved classes.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to