[ https://issues.apache.org/jira/browse/HIVE-26277?focusedWorklogId=808303&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-808303 ]
ASF GitHub Bot logged work on HIVE-26277: ----------------------------------------- Author: ASF GitHub Bot Created on: 13/Sep/22 14:06 Start Date: 13/Sep/22 14:06 Worklog Time Spent: 10m Work Description: zabetak commented on code in PR #3339: URL: https://github.com/apache/hive/pull/3339#discussion_r969676044 ########## standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregatorTest.java: ########## @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.hadoop.hive.metastore.columnstats.aggr; + +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Date; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder; +import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo; +import org.junit.Assert; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo; + +@Category(MetastoreUnitTest.class) +public class DateColumnStatsAggregatorTest { + + private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0, + 0, null, null, Collections.emptyMap(), null, null, + TableType.MANAGED_TABLE.toString()); + private static final FieldSchema COL = new FieldSchema("col", "int", ""); + + private static final Date DATE_1 = new Date(1); + private static final Date DATE_2 = new Date(2); + private static final Date DATE_3 = new Date(3); + private static final Date DATE_4 = new Date(4); + private static final Date DATE_5 = new Date(5); + private static final Date DATE_6 = new Date(6); + private static final Date DATE_7 = new Date(7); + private static final Date DATE_8 = new Date(8); + private static final Date DATE_9 = new Date(9); + + @Test + public void testAggregateSingleStat() throws MetaException { + List<String> partitions = Collections.singletonList("part1"); + + ColumnStatisticsData data1 = new 
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2).low(DATE_1).high(DATE_4) + .hll(DATE_1.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch()).build(); + List<ColStatsObjWithSourceInfo> statsList = + Collections.singletonList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0))); + + DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator(); + ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true); + + Assert.assertEquals(data1, computedStatsObj.getStatsData()); + } + + @Test + public void testAggregateSingleStatWhenNullValues() throws MetaException { + List<String> partitions = Collections.singletonList("part1"); + + ColumnStatisticsData data1 = new ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2).build(); + List<ColStatsObjWithSourceInfo> statsList = + Collections.singletonList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0))); + + DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator(); + ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true); + Assert.assertEquals(data1, computedStatsObj.getStatsData()); + + aggregator.useDensityFunctionForNDVEstimation = true; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + Assert.assertEquals(data1, computedStatsObj.getStatsData()); + + aggregator.useDensityFunctionForNDVEstimation = false; + aggregator.ndvTuner = 1; + // ndv tuner does not have any effect because min numDVs and max numDVs coincide (we have a single stats) + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + Assert.assertEquals(data1, computedStatsObj.getStatsData()); + } + + @Test + public void testAggregateMultipleStatsWhenSomeNullValues() throws MetaException { + List<String> partitions = Arrays.asList("part1", "part2"); + + long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch() }; + ColumnStatisticsData data1 = new ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2) + 
.low(DATE_1).high(DATE_2).hll(values1).build(); + ColumnStatisticsData data2 = new ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3).build(); + + List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0)), + createStatsWithInfo(data2, TABLE, COL, partitions.get(1))); + + DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator(); + + ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true); + ColumnStatisticsData expectedStats = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(3) + .low(DATE_1).high(DATE_2).hll(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + + aggregator.useDensityFunctionForNDVEstimation = true; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + expectedStats = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(4) + .low(DATE_1).high(DATE_2).hll(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + + aggregator.useDensityFunctionForNDVEstimation = false; + aggregator.ndvTuner = 1; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + expectedStats = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(5) + .low(DATE_1).high(DATE_2).hll(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + } + + @Test + public void testAggregateMultiStatsWhenAllAvailable() throws MetaException { + List<String> partitions = Arrays.asList("part1", "part2", "part3"); + + long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(), DATE_3.getDaysSinceEpoch() }; + ColumnStatisticsData data1 = new ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3) + .low(DATE_1).high(DATE_3).hll(values1).build(); + + long[] values2 = { DATE_3.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch(), DATE_5.getDaysSinceEpoch() }; + ColumnStatisticsData data2 = new ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3) + 
.low(DATE_3).high(DATE_5).hll(values2).build(); + + long[] values3 = { DATE_6.getDaysSinceEpoch(), DATE_7.getDaysSinceEpoch() }; + ColumnStatisticsData data3 = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(2) + .low(DATE_6).high(DATE_7).hll(values3).build(); + + List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0)), + createStatsWithInfo(data2, TABLE, COL, partitions.get(1)), createStatsWithInfo(data3, TABLE, COL, partitions.get(2))); + + DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator(); + ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true); + + // the aggregation does not update hll, only numNDVs is, it keeps the first hll + ColumnStatisticsData expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(7) + .low(DATE_1).high(DATE_7).hll(values1).build(); + + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + } + + @Test + public void testAggregateMultiStatsWhenUnmergeableBitVectors() throws MetaException { + List<String> partitions = Arrays.asList("part1", "part2", "part3"); + + long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(), DATE_3.getDaysSinceEpoch() }; + ColumnStatisticsData data1 = new ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3) + .low(DATE_1).high(DATE_3).fmSketch(values1).build(); + long[] values2 = { DATE_3.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch(), DATE_5.getDaysSinceEpoch() }; + ColumnStatisticsData data2 = new ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3) + .low(DATE_3).high(DATE_5).hll(values2).build(); + long[] values3 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(), DATE_6.getDaysSinceEpoch(), + DATE_8.getDaysSinceEpoch() }; + ColumnStatisticsData data3 = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(4) + .low(DATE_1).high(DATE_8).hll(values3).build(); + + List<ColStatsObjWithSourceInfo> statsList = 
Arrays.asList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0)), + createStatsWithInfo(data2, TABLE, COL, partitions.get(1)), createStatsWithInfo(data3, TABLE, COL, partitions.get(2))); + + DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator(); + + ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true); + // the aggregation does not update the bitvector, only numDVs is, it keeps the first bitvector; + // numDVs is set to the maximum among all stats when non-mergeable bitvectors are detected + ColumnStatisticsData expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(4) + .low(DATE_1).high(DATE_8).fmSketch(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + + aggregator.useDensityFunctionForNDVEstimation = true; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + // the use of the density function leads to a different estimation for numNDV + expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(6) + .low(DATE_1).high(DATE_8).fmSketch(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + + // here the ndv lower bound is 4 (the highest individual numDVs), the higher bound is 10 (3 + 3 + 4, that is the + // sum of all the numDVs for all partitions), ndv tuner influences the choice between the lower bound + // (ndvTuner = 0) and the higher bound (ndvTuner = 1), and intermediate values for ndvTuner in the range (0, 1) + aggregator.useDensityFunctionForNDVEstimation = false; + + aggregator.ndvTuner = 0; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(4) + .low(DATE_1).high(DATE_8).fmSketch(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + + aggregator.ndvTuner = 0.5; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + expectedStats = 
new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(7) + .low(DATE_1).high(DATE_8).fmSketch(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + + aggregator.ndvTuner = 0.75; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(8) + .low(DATE_1).high(DATE_8).fmSketch(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + + aggregator.ndvTuner = 1; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(10) + .low(DATE_1).high(DATE_8).fmSketch(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + } + + @Test + public void testAggregateMultiStatsWhenOnlySomeAvailable() throws MetaException { + List<String> partitions = Arrays.asList("part1", "part2", "part3", "part4"); + + long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(), DATE_3.getDaysSinceEpoch() }; + ColumnStatisticsData data1 = new ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3) + .low(DATE_1).high(DATE_3).hll(values1).build(); + + ColumnStatisticsData data3 = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(1).low(DATE_7).high(DATE_7) + .hll(DATE_7.getDaysSinceEpoch()).build(); + + long[] values4 = { DATE_3.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch(), DATE_5.getDaysSinceEpoch() }; + ColumnStatisticsData data4 = new ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3) + .low(DATE_3).high(DATE_5).hll(values4).build(); + + List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0)), + createStatsWithInfo(data3, TABLE, COL, partitions.get(2)), createStatsWithInfo(data4, TABLE, COL, partitions.get(3))); + + DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator(); + ColumnStatisticsObj computedStatsObj = 
aggregator.aggregate(statsList, partitions, false); + + // hll in case of missing stats is left as null, only numDVs is updated + ColumnStatisticsData expectedStats = new ColStatsBuilder<>(Date.class).numNulls(8).numDVs(4) Review Comment: Why is numNulls 8? I was expecting this to be 6. Issue Time Tracking ------------------- Worklog Id: (was: 808303) Time Spent: 4h 50m (was: 4h 40m) > NPEs and rounding issues in ColumnStatsAggregator classes > --------------------------------------------------------- > > Key: HIVE-26277 > URL: https://issues.apache.org/jira/browse/HIVE-26277 > Project: Hive > Issue Type: Bug > Components: Standalone Metastore, Statistics, Tests > Affects Versions: 4.0.0-alpha-2 > Reporter: Alessandro Solimando > Assignee: Alessandro Solimando > Priority: Major > Labels: pull-request-available > Time Spent: 4h 50m > Remaining Estimate: 0h > > Fix NPEs and rounding errors in _ColumnStatsAggregator_ classes, add > unit-tests for all the involved classes. -- This message was sent by Atlassian Jira (v8.20.10#820010)