[ https://issues.apache.org/jira/browse/HIVE-26277?focusedWorklogId=808351&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-808351 ]
ASF GitHub Bot logged work on HIVE-26277: ----------------------------------------- Author: ASF GitHub Bot Created on: 13/Sep/22 15:37 Start Date: 13/Sep/22 15:37 Worklog Time Spent: 10m Work Description: asolimando commented on code in PR #3339: URL: https://github.com/apache/hive/pull/3339#discussion_r969789332 ########## standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregatorTest.java: ########## @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.hadoop.hive.metastore.columnstats.aggr; + +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Date; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder; +import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo; +import org.junit.Assert; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo; + +@Category(MetastoreUnitTest.class) +public class DateColumnStatsAggregatorTest { + + private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0, + 0, null, null, Collections.emptyMap(), null, null, + TableType.MANAGED_TABLE.toString()); + private static final FieldSchema COL = new FieldSchema("col", "int", ""); + + private static final Date DATE_1 = new Date(1); + private static final Date DATE_2 = new Date(2); + private static final Date DATE_3 = new Date(3); + private static final Date DATE_4 = new Date(4); + private static final Date DATE_5 = new Date(5); + private static final Date DATE_6 = new Date(6); + private static final Date DATE_7 = new Date(7); + private static final Date DATE_8 = new Date(8); + private static final Date DATE_9 = new Date(9); + + @Test + public void testAggregateSingleStat() throws MetaException { + List<String> partitions = Collections.singletonList("part1"); + + ColumnStatisticsData data1 = new 
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2).low(DATE_1).high(DATE_4) + .hll(DATE_1.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch()).build(); + List<ColStatsObjWithSourceInfo> statsList = + Collections.singletonList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0))); + + DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator(); + ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true); + + Assert.assertEquals(data1, computedStatsObj.getStatsData()); + } + + @Test + public void testAggregateSingleStatWhenNullValues() throws MetaException { + List<String> partitions = Collections.singletonList("part1"); + + ColumnStatisticsData data1 = new ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2).build(); + List<ColStatsObjWithSourceInfo> statsList = + Collections.singletonList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0))); + + DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator(); + ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true); + Assert.assertEquals(data1, computedStatsObj.getStatsData()); + + aggregator.useDensityFunctionForNDVEstimation = true; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + Assert.assertEquals(data1, computedStatsObj.getStatsData()); + + aggregator.useDensityFunctionForNDVEstimation = false; + aggregator.ndvTuner = 1; + // ndv tuner does not have any effect because min numDVs and max numDVs coincide (we have a single stats) + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + Assert.assertEquals(data1, computedStatsObj.getStatsData()); + } + + @Test + public void testAggregateMultipleStatsWhenSomeNullValues() throws MetaException { + List<String> partitions = Arrays.asList("part1", "part2"); + + long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch() }; + ColumnStatisticsData data1 = new ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2) + 
.low(DATE_1).high(DATE_2).hll(values1).build(); + ColumnStatisticsData data2 = new ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3).build(); + + List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0)), + createStatsWithInfo(data2, TABLE, COL, partitions.get(1))); + + DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator(); + + ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true); + ColumnStatisticsData expectedStats = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(3) + .low(DATE_1).high(DATE_2).hll(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + + aggregator.useDensityFunctionForNDVEstimation = true; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + expectedStats = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(4) + .low(DATE_1).high(DATE_2).hll(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + + aggregator.useDensityFunctionForNDVEstimation = false; + aggregator.ndvTuner = 1; + computedStatsObj = aggregator.aggregate(statsList, partitions, true); + expectedStats = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(5) + .low(DATE_1).high(DATE_2).hll(values1).build(); + Assert.assertEquals(expectedStats, computedStatsObj.getStatsData()); + } + + @Test + public void testAggregateMultiStatsWhenAllAvailable() throws MetaException { + List<String> partitions = Arrays.asList("part1", "part2", "part3"); + + long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(), DATE_3.getDaysSinceEpoch() }; + ColumnStatisticsData data1 = new ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3) + .low(DATE_1).high(DATE_3).hll(values1).build(); + + long[] values2 = { DATE_3.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch(), DATE_5.getDaysSinceEpoch() }; + ColumnStatisticsData data2 = new ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3) + 
.low(DATE_3).high(DATE_5).hll(values2).build(); + + long[] values3 = { DATE_6.getDaysSinceEpoch(), DATE_7.getDaysSinceEpoch() }; + ColumnStatisticsData data3 = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(2) + .low(DATE_6).high(DATE_7).hll(values3).build(); + + List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(createStatsWithInfo(data1, TABLE, COL, partitions.get(0)), + createStatsWithInfo(data2, TABLE, COL, partitions.get(1)), createStatsWithInfo(data3, TABLE, COL, partitions.get(2))); + + DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator(); + ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true); + + // the aggregation does not update hll, only numNDVs is, it keeps the first hll + ColumnStatisticsData expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(7) Review Comment: It's a pretty crucial point, I agree it deserves a better comment in the assert part, I will improve it. Issue Time Tracking ------------------- Worklog Id: (was: 808351) Time Spent: 7.5h (was: 7h 20m) > NPEs and rounding issues in ColumnStatsAggregator classes > --------------------------------------------------------- > > Key: HIVE-26277 > URL: https://issues.apache.org/jira/browse/HIVE-26277 > Project: Hive > Issue Type: Bug > Components: Standalone Metastore, Statistics, Tests > Affects Versions: 4.0.0-alpha-2 > Reporter: Alessandro Solimando > Assignee: Alessandro Solimando > Priority: Major > Labels: pull-request-available > Time Spent: 7.5h > Remaining Estimate: 0h > > Fix NPEs and rounding errors in _ColumnStatsAggregator_ classes, add > unit-tests for all the involved classes. -- This message was sent by Atlassian Jira (v8.20.10#820010)