This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit e35f8183cb1ba069ae00ee93e71451eccd505d0a
Author: stiga-huang <[email protected]>
AuthorDate: Tue May 21 16:10:18 2024 +0800

    IMPALA-13102: Normalize invalid column stats from HMS
    
    Column stats like numDVs, numNulls in HMS could have arbitrary values.
    Impala expects them to be non-negative or -1 for unknown. So loading
    tables with invalid stats values (<-1) will fail.
    
    This patch adds logic to normalize the stats values. If the value < -1,
    use -1 for it and add corresponding warning logs. Also refactor some
    redundant codes in ColumnStats.
    
    Tests:
     - Add e2e test
    
    Change-Id: If6216e3d6e73a529a9b3a8c0ea9d22727ab43f1a
    Reviewed-on: http://gerrit.cloudera.org:8080/21445
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../impala/analysis/AlterTableSetColumnStats.java  |   4 +-
 .../java/org/apache/impala/catalog/Column.java     |   2 +-
 .../org/apache/impala/catalog/ColumnStats.java     | 168 ++++++++++++---------
 .../org/apache/impala/catalog/FeCatalogUtils.java  |   1 -
 tests/metadata/test_compute_stats.py               |  45 +++++-
 5 files changed, 147 insertions(+), 73 deletions(-)

diff --git 
a/fe/src/main/java/org/apache/impala/analysis/AlterTableSetColumnStats.java 
b/fe/src/main/java/org/apache/impala/analysis/AlterTableSetColumnStats.java
index b642f44c3..a249a7d68 100644
--- a/fe/src/main/java/org/apache/impala/analysis/AlterTableSetColumnStats.java
+++ b/fe/src/main/java/org/apache/impala/analysis/AlterTableSetColumnStats.java
@@ -125,7 +125,7 @@ public class AlterTableSetColumnStats extends 
AlterTableStmt {
       if (statsVal == null || statsVal < -1) {
         throw new AnalysisException(String.format(
             "Invalid stats value '%s' for column stats key: %s\n" +
-            "Expected a positive integer or -1 for unknown.",
+            "Expected a non-negative integer or -1 for unknown.",
             statsValue, statsKey));
       }
       stats.update(col.getType(), statsKey, statsVal);
@@ -139,7 +139,7 @@ public class AlterTableSetColumnStats extends 
AlterTableStmt {
           statsVal.isNaN() || statsVal.isInfinite()) {
         throw new AnalysisException(String.format(
             "Invalid stats value '%s' for column stats key: %s\n" +
-            "Expected a positive floating-point number or -1 for unknown.",
+            "Expected a non-negative floating-point number or -1 for unknown.",
             statsValue, statsKey));
       }
       stats.update(col.getType(), statsKey, statsVal);
diff --git a/fe/src/main/java/org/apache/impala/catalog/Column.java 
b/fe/src/main/java/org/apache/impala/catalog/Column.java
index 5f82acc74..4251f14d9 100644
--- a/fe/src/main/java/org/apache/impala/catalog/Column.java
+++ b/fe/src/main/java/org/apache/impala/catalog/Column.java
@@ -70,7 +70,7 @@ public class Column {
   public boolean isVirtual() { return false; }
 
   public boolean updateStats(ColumnStatisticsData statsData) {
-    boolean statsDataCompatibleWithColType = stats_.update(type_, statsData);
+    boolean statsDataCompatibleWithColType = stats_.update(name_, type_, 
statsData);
     if (LOG.isTraceEnabled()) {
       LOG.trace("col stats: " + name_ + " #distinct=" + 
stats_.getNumDistinctValues());
     }
diff --git a/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java 
b/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
index cd76a9c59..d60d2ce8e 100644
--- a/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
+++ b/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
@@ -42,12 +42,12 @@ import org.apache.impala.util.MetaStoreUtil;
 import org.apache.impala.thrift.TColumnStats;
 import org.apache.impala.thrift.TColumnValue;
 
-import org.apache.log4j.Logger;
-
 import com.google.common.base.MoreObjects;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.Sets;
 import com.google.common.math.LongMath;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Statistics for a single column.
@@ -61,7 +61,7 @@ public class ColumnStats {
       PrimitiveType.VARCHAR, PrimitiveType.STRING, PrimitiveType.TIMESTAMP,
       PrimitiveType.TINYINT, PrimitiveType.DECIMAL);
 
-  private static final Logger LOG = Logger.getLogger(ColumnStats.class);
+  private final static Logger LOG = LoggerFactory.getLogger(ColumnStats.class);
 
   public enum StatsKey {
     NUM_DISTINCT_VALUES("numDVs"),
@@ -246,7 +246,7 @@ public class ColumnStats {
    */
   public String getTColumnValueAsString(TColumnValue value) {
     if (value==null) return "-1";
-    StringBuilder sb = new StringBuilder("");
+    StringBuilder sb = new StringBuilder();
 
     if (value.isSetBool_val()) {
       sb.append(value.bool_val);
@@ -336,7 +336,7 @@ public class ColumnStats {
       } else {
         BigDecimal value = literal.getValue();
         BigDecimal lValue = new BigDecimal(new 
String(lowValue_.getDecimal_val()));
-        if (value.compareTo(lValue) == -1) {
+        if (value.compareTo(lValue) < 0) {
           lowValue_.setDecimal_val(value.toString().getBytes());
         }
       }
@@ -380,7 +380,7 @@ public class ColumnStats {
       } else {
         BigDecimal value = literal.getValue();
         BigDecimal hValue = new BigDecimal(new 
String(highValue_.getDecimal_val()));
-        if (value.compareTo(hValue) == 1) {
+        if (value.compareTo(hValue) > 0) {
           highValue_.setDecimal_val(value.toString().getBytes());
         }
       }
@@ -435,56 +435,54 @@ public class ColumnStats {
     if (!longStats.isSetLowValue()) {
       lowValue_ = null;
     } else {
-      Long value = Long.valueOf(longStats.getLowValue());
+      long value = longStats.getLowValue();
       lowValue_ = new TColumnValue();
       switch (type) {
         case TINYINT:
-          lowValue_.setByte_val(value.byteValue());
+          lowValue_.setByte_val((byte) value);
           break;
         case SMALLINT:
-          lowValue_.setShort_val(value.shortValue());
+          lowValue_.setShort_val((short) value);
           break;
         case INT:
-          lowValue_.setInt_val(value.intValue());
+          lowValue_.setInt_val((int) value);
           break;
         case BIGINT:
-          lowValue_.setLong_val(value.longValue());
+          lowValue_.setLong_val(value);
           break;
         case TIMESTAMP:
-          Preconditions.checkState(
-              false, "TIMESTAMP columns are not supported by 
setLowAndHighValue()");
-          break;
+          throw new IllegalStateException(
+              "TIMESTAMP columns are not supported by setLowAndHighValue()");
         default:
-          Preconditions.checkState(
-              false, "Unsupported type encountered in setLowAndHighValue()");
+          throw new IllegalStateException(
+              "Unsupported type encountered in setLowAndHighValue()");
       }
     }
 
     if (!longStats.isSetHighValue()) {
       highValue_ = null;
     } else {
-      Long value = Long.valueOf(longStats.getHighValue());
+      long value = longStats.getHighValue();
       highValue_ = new TColumnValue();
       switch (type) {
         case TINYINT:
-          highValue_.setByte_val(value.byteValue());
+          highValue_.setByte_val((byte) value);
           break;
         case SMALLINT:
-          highValue_.setShort_val(value.shortValue());
+          highValue_.setShort_val((short) value);
           break;
         case INT:
-          highValue_.setInt_val(value.intValue());
+          highValue_.setInt_val((int) value);
           break;
         case BIGINT:
-          highValue_.setLong_val(value.longValue());
+          highValue_.setLong_val(value);
           break;
         case TIMESTAMP:
-          Preconditions.checkState(
-              false, "TIMESTAMP columns are not supported by 
setLowAndHighValue()");
-          break;
+          throw new IllegalStateException(
+              "TIMESTAMP columns are not supported by setLowAndHighValue()");
         default:
-          Preconditions.checkState(
-              false, "Unsupported type encountered in setLowAndHighValue()");
+          throw new IllegalStateException(
+              "Unsupported type encountered in setLowAndHighValue()");
       }
     }
   }
@@ -546,6 +544,22 @@ public class ColumnStats {
     }
   }
 
+  private long normalizeValue(String colName, StatsKey key, long value) {
+    if (value < -1) {
+      LOG.warn("Invalid {} of column {}: {}. Normalized to -1.", key, colName, 
value);
+      return -1;
+    }
+    return value;
+  }
+
+  private float normalizeAvgSize(String colName, float value) {
+    if (value < -1) {
+      LOG.warn("Invalid avgSize of column {}: {}. Normalized to -1.", colName, 
value);
+      return -1;
+    }
+    return value;
+  }
+
   /**
    * Updates the stats with the given ColumnStatisticsData. If the 
ColumnStatisticsData
    * is not compatible with the given colType, all stats are initialized based 
on
@@ -553,7 +567,7 @@ public class ColumnStats {
    * Returns false if the ColumnStatisticsData data was incompatible with the 
given
    * column type, otherwise returns true.
    */
-  public boolean update(Type colType, ColumnStatisticsData statsData) {
+  public boolean update(String colName, Type colType, ColumnStatisticsData 
statsData) {
     Preconditions.checkState(isSupportedColType(colType));
     initColStats(colType);
     boolean isCompatible = false;
@@ -572,7 +586,8 @@ public class ColumnStats {
         isCompatible = statsData.isSetBooleanStats();
         if (isCompatible) {
           BooleanColumnStatsData boolStats = statsData.getBooleanStats();
-          numNulls_ = boolStats.getNumNulls();
+          numNulls_ = normalizeValue(colName, StatsKey.NUM_NULLS,
+              boolStats.getNumNulls());
           // If we have numNulls, we can infer NDV from that.
           if (numNulls_ > 0) {
             numDistinctValues_ = 3;
@@ -581,8 +596,10 @@ public class ColumnStats {
           } else {
             numDistinctValues_ = -1;
           }
-          numTrues_ = boolStats.getNumTrues();
-          numFalses_ = boolStats.getNumFalses();
+          numTrues_ = normalizeValue(colName, StatsKey.NUM_TRUES,
+              boolStats.getNumTrues());
+          numFalses_ = normalizeValue(colName, StatsKey.NUM_FALSES,
+              boolStats.getNumFalses());
         }
         break;
       case TINYINT:
@@ -593,8 +610,10 @@ public class ColumnStats {
         isCompatible = statsData.isSetLongStats();
         if (isCompatible) {
           LongColumnStatsData longStats = statsData.getLongStats();
-          numDistinctValues_ = longStats.getNumDVs();
-          numNulls_ = longStats.getNumNulls();
+          numDistinctValues_ = normalizeValue(colName, 
StatsKey.NUM_DISTINCT_VALUES,
+              longStats.getNumDVs());
+          numNulls_ = normalizeValue(colName, StatsKey.NUM_NULLS,
+              longStats.getNumNulls());
           if (colType.getPrimitiveType() != PrimitiveType.TIMESTAMP) {
             // Low/high value handling is not yet implemented for timestamps.
             setLowAndHighValue(colType.getPrimitiveType(), longStats);
@@ -605,8 +624,10 @@ public class ColumnStats {
         isCompatible = statsData.isSetDateStats();
         if (isCompatible) {
           DateColumnStatsData dateStats = statsData.getDateStats();
-          numDistinctValues_ = dateStats.getNumDVs();
-          numNulls_ = dateStats.getNumNulls();
+          numDistinctValues_ = normalizeValue(colName, 
StatsKey.NUM_DISTINCT_VALUES,
+              dateStats.getNumDVs());
+          numNulls_ = normalizeValue(colName, StatsKey.NUM_NULLS,
+              dateStats.getNumNulls());
           setLowAndHighValue(dateStats);
         }
         break;
@@ -615,8 +636,10 @@ public class ColumnStats {
         isCompatible = statsData.isSetDoubleStats();
         if (isCompatible) {
           DoubleColumnStatsData doubleStats = statsData.getDoubleStats();
-          numDistinctValues_ = doubleStats.getNumDVs();
-          numNulls_ = doubleStats.getNumNulls();
+          numDistinctValues_ = normalizeValue(colName, 
StatsKey.NUM_DISTINCT_VALUES,
+              doubleStats.getNumDVs());
+          numNulls_ = normalizeValue(colName, StatsKey.NUM_NULLS,
+              doubleStats.getNumNulls());
           setLowAndHighValue(doubleStats);
         }
         break;
@@ -625,8 +648,10 @@ public class ColumnStats {
         isCompatible = statsData.isSetStringStats();
         if (isCompatible) {
           StringColumnStatsData stringStats = statsData.getStringStats();
-          numDistinctValues_ = stringStats.getNumDVs();
-          numNulls_ = stringStats.getNumNulls();
+          numDistinctValues_ = normalizeValue(colName, 
StatsKey.NUM_DISTINCT_VALUES,
+              stringStats.getNumDVs());
+          numNulls_ = normalizeValue(colName, StatsKey.NUM_NULLS,
+              stringStats.getNumNulls());
         }
         break;
       case VARCHAR:
@@ -634,10 +659,14 @@ public class ColumnStats {
         isCompatible = statsData.isSetStringStats();
         if (isCompatible) {
           StringColumnStatsData stringStats = statsData.getStringStats();
-          numDistinctValues_ = stringStats.getNumDVs();
-          numNulls_ = stringStats.getNumNulls();
-          maxSize_ = stringStats.getMaxColLen();
-          avgSize_ = Double.valueOf(stringStats.getAvgColLen()).floatValue();
+          numDistinctValues_ = normalizeValue(colName, 
StatsKey.NUM_DISTINCT_VALUES,
+              stringStats.getNumDVs());
+          numNulls_ = normalizeValue(colName, StatsKey.NUM_NULLS,
+              stringStats.getNumNulls());
+          maxSize_ = normalizeValue(colName, StatsKey.MAX_SIZE,
+              stringStats.getMaxColLen());
+          avgSize_ = normalizeAvgSize(colName,
+              Double.valueOf(stringStats.getAvgColLen()).floatValue());
           if (avgSize_ >= 0) {
             avgSerializedSize_ = avgSize_ + PrimitiveType.STRING.getSlotSize();
           } else {
@@ -649,9 +678,12 @@ public class ColumnStats {
         isCompatible = statsData.isSetBinaryStats();
         if (isCompatible) {
           BinaryColumnStatsData binaryStats = statsData.getBinaryStats();
-          numNulls_ = binaryStats.getNumNulls();
-          maxSize_ = binaryStats.getMaxColLen();
-          avgSize_ = Double.valueOf(binaryStats.getAvgColLen()).floatValue();
+          numNulls_ = normalizeValue(colName, StatsKey.NUM_NULLS,
+              binaryStats.getNumNulls());
+          maxSize_ = normalizeValue(colName, StatsKey.MAX_SIZE,
+              binaryStats.getMaxColLen());
+          avgSize_ = normalizeAvgSize(colName,
+              Double.valueOf(binaryStats.getAvgColLen()).floatValue());
           if (avgSize_ >= 0) {
             avgSerializedSize_ = avgSize_ + PrimitiveType.BINARY.getSlotSize();
           } else {
@@ -663,15 +695,15 @@ public class ColumnStats {
         isCompatible = statsData.isSetDecimalStats();
         if (isCompatible) {
           DecimalColumnStatsData decimalStats = statsData.getDecimalStats();
-          numNulls_ = decimalStats.getNumNulls();
-          numDistinctValues_ = decimalStats.getNumDVs();
+          numNulls_ = normalizeValue(colName, StatsKey.NUM_NULLS,
+              decimalStats.getNumNulls());
+          numDistinctValues_ = normalizeValue(colName, 
StatsKey.NUM_DISTINCT_VALUES,
+              decimalStats.getNumDVs());
           setLowAndHighValue(decimalStats);
         }
         break;
       default:
-        Preconditions.checkState(false,
-            "Unexpected column type: " + colType.toString());
-        break;
+        throw new IllegalStateException("Unexpected column type: " + colType);
     }
     validate(colType);
     return isCompatible;
@@ -683,12 +715,12 @@ public class ColumnStats {
   public static void updateLowAndHighForHiveColumnStatsData(
       Long low_value, Long high_value, LongColumnStatsData longColStatsData) {
     if (low_value != null) {
-      longColStatsData.setLowValue(low_value.longValue());
+      longColStatsData.setLowValue(low_value);
     } else {
       longColStatsData.unsetLowValue();
     }
     if (high_value != null) {
-      longColStatsData.setHighValue(high_value.longValue());
+      longColStatsData.setHighValue(high_value);
     } else {
       longColStatsData.unsetHighValue();
     }
@@ -700,12 +732,12 @@ public class ColumnStats {
   public static void updateLowAndHighForHiveColumnStatsData(
       Double low_value, Double high_value, DoubleColumnStatsData 
doubleColStatsData) {
     if (low_value != null) {
-      doubleColStatsData.setLowValue(low_value.doubleValue());
+      doubleColStatsData.setLowValue(low_value);
     } else {
       doubleColStatsData.unsetLowValue();
     }
     if (high_value != null) {
-      doubleColStatsData.setHighValue(high_value.doubleValue());
+      doubleColStatsData.setHighValue(high_value);
     } else {
       doubleColStatsData.unsetHighValue();
     }
@@ -747,7 +779,7 @@ public class ColumnStats {
 
   /**
    * Convert the statistics back into an HMS-compatible ColumnStatisticsData 
object.
-   * This is essentially the inverse of {@link #update(Type, 
ColumnStatisticsData)
+   * This is essentially the inverse of {@link #update(String, Type, 
ColumnStatisticsData)
    * above.
    *
    * Returns null if statistics for the specified type are not supported.
@@ -778,10 +810,10 @@ public class ColumnStats {
           Long lowValue = null;
           Long highValue = null;
           if (isLowValueSet && colStats.low_value.isSetByte_val()) {
-            lowValue = Long.valueOf(colStats.low_value.getByte_val());
+            lowValue = (long) colStats.low_value.getByte_val();
           }
           if (isHighValueSet && colStats.high_value.isSetByte_val()) {
-            highValue = Long.valueOf(colStats.high_value.getByte_val());
+            highValue = (long) colStats.high_value.getByte_val();
           }
           updateLowAndHighForHiveColumnStatsData(lowValue, highValue, 
longColStatsData);
           colStatsData.setLongStats(longColStatsData);
@@ -795,10 +827,10 @@ public class ColumnStats {
           Long lowValue = null;
           Long highValue = null;
           if (isLowValueSet && colStats.low_value.isSetShort_val()) {
-            lowValue = Long.valueOf(colStats.low_value.getShort_val());
+            lowValue = (long) colStats.low_value.getShort_val();
           }
           if (isHighValueSet && colStats.high_value.isSetShort_val()) {
-            highValue = Long.valueOf(colStats.high_value.getShort_val());
+            highValue = (long) colStats.high_value.getShort_val();
           }
           updateLowAndHighForHiveColumnStatsData(lowValue, highValue, 
longColStatsData);
 
@@ -813,10 +845,10 @@ public class ColumnStats {
           Long lowValue = null;
           Long highValue = null;
           if (isLowValueSet && colStats.low_value.isSetInt_val()) {
-            lowValue = Long.valueOf(colStats.low_value.getInt_val());
+            lowValue = (long) colStats.low_value.getInt_val();
           }
           if (isHighValueSet && colStats.high_value.isSetInt_val()) {
-            highValue = Long.valueOf(colStats.high_value.getInt_val());
+            highValue = (long) colStats.high_value.getInt_val();
           }
           updateLowAndHighForHiveColumnStatsData(lowValue, highValue, 
longColStatsData);
 
@@ -832,10 +864,10 @@ public class ColumnStats {
           Date lowValue = null;
           Date highValue = null;
           if (isLowValueSet && colStats.low_value.isSetDate_val()) {
-            lowValue = new 
Date(Long.valueOf(colStats.low_value.getDate_val()));
+            lowValue = new Date(colStats.low_value.getDate_val());
           }
           if (isHighValueSet && colStats.high_value.isSetDate_val()) {
-            highValue = new 
Date(Long.valueOf(colStats.high_value.getDate_val()));
+            highValue = new Date(colStats.high_value.getDate_val());
           }
           updateLowAndHighForHiveColumnStatsData(lowValue, highValue, 
dateColStatsData);
           colStatsData.setDateStats(dateColStatsData);
@@ -848,10 +880,10 @@ public class ColumnStats {
           Long lowValue = null;
           Long highValue = null;
           if (isLowValueSet && colStats.low_value.isSetLong_val()) {
-            lowValue = Long.valueOf(colStats.low_value.getLong_val());
+            lowValue = colStats.low_value.getLong_val();
           }
           if (isHighValueSet && colStats.high_value.isSetLong_val()) {
-            highValue = Long.valueOf(colStats.high_value.getLong_val());
+            highValue = colStats.high_value.getLong_val();
           }
           updateLowAndHighForHiveColumnStatsData(lowValue, highValue, 
longColStatsData);
 
@@ -870,10 +902,10 @@ public class ColumnStats {
           Double lowValue = null;
           Double highValue = null;
           if (isLowValueSet && colStats.low_value.isSetDouble_val()) {
-            lowValue = Double.valueOf(colStats.low_value.getDouble_val());
+            lowValue = colStats.low_value.getDouble_val();
           }
           if (isHighValueSet && colStats.high_value.isSetDouble_val()) {
-            highValue = Double.valueOf(colStats.high_value.getDouble_val());
+            highValue = colStats.high_value.getDouble_val();
           }
           updateLowAndHighForHiveColumnStatsData(lowValue, highValue, 
doubleColStatsData);
 
@@ -972,7 +1004,7 @@ public class ColumnStats {
         numFalses_ = (Long) value;
         break;
       }
-      default: Preconditions.checkState(false);
+      default: throw new IllegalStateException("Unknown StatsKey " + key);
     }
     validate(colType);
   }
diff --git a/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java 
b/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java
index bb1f4a66e..833234523 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java
@@ -186,7 +186,6 @@ public abstract class FeCatalogUtils {
             "incompatible with column type %s. Consider regenerating 
statistics " +
             "for %s.", table.getFullName(), col.getName(), col.getType(),
             table.getFullName()));
-        continue;
       }
     }
   }
diff --git a/tests/metadata/test_compute_stats.py 
b/tests/metadata/test_compute_stats.py
index 370f3eef0..3ec53fdb6 100644
--- a/tests/metadata/test_compute_stats.py
+++ b/tests/metadata/test_compute_stats.py
@@ -18,7 +18,9 @@
 from __future__ import absolute_import, division, print_function
 from builtins import range
 import pytest
-from subprocess import check_call
+from hive_metastore.ttypes import (
+    ColumnStatistics, ColumnStatisticsDesc, ColumnStatisticsData,
+    ColumnStatisticsObj, StringColumnStatsData)
 
 from tests.common.environ import ImpalaTestClusterProperties
 from tests.common.impala_cluster import ImpalaCluster
@@ -436,3 +438,44 @@ class TestParquetComputeColumnMinMax(ImpalaTestSuite):
 
   def test_compute_stats(self, vector, unique_database):
     self.run_test_case('QueryTest/compute-stats-column-minmax', vector, 
unique_database)
+
+
+class TestInvalidStatsFromHms(ImpalaTestSuite):
+  @classmethod
+  def get_workload(self):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestInvalidStatsFromHms, cls).add_test_dimensions()
+    cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
+    cls.ImpalaTestMatrix.add_constraint(
+        lambda v: v.get_value('table_format').file_format == 'text'
+        and v.get_value('table_format').compression_codec == 'none')
+
+  def test_invalid_col_stats(self, unique_database):
+    """Test that invalid column stats, i.e. values < -1, are normalized in 
Impala"""
+    tbl = unique_database + ".tbl"
+    self.execute_query("create table {} as select 1 as id, 'aaa' as 
name".format(tbl))
+    # Add invalid stats in HMS
+    hms_client, _ = ImpalaTestSuite.create_hive_client(9083)
+    cs = ColumnStatistics()
+    cs.engine = "impala"
+    isTblLevel = True
+    cs.statsDesc = ColumnStatisticsDesc(isTblLevel, unique_database, "tbl")
+    cs_data = ColumnStatisticsData()
+    maxColLen = -100
+    avgColLen = -200
+    numNulls = -300
+    numDVs = -400
+    cs_data.stringStats = StringColumnStatsData(maxColLen, avgColLen, 
numNulls, numDVs)
+    cs_obj = ColumnStatisticsObj("name", "string", cs_data)
+    cs.statsObj = [cs_obj]
+    assert hms_client.update_table_column_statistics(cs)
+    # REFRESH to reload the stats
+    self.execute_query("refresh " + tbl)
+    # Verify the invalid stats are normalized to -1
+    res = self.execute_query("show column stats " + tbl)
+    assert res.data == [
+      'id\tTINYINT\t-1\t-1\t1\t1\t-1\t-1',
+      'name\tSTRING\t-1\t-1\t-1\t-1\t-1\t-1']

Reply via email to