This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 10e816942ae240401ef2d55863a77181bc169eed Author: Daniel Becker <[email protected]> AuthorDate: Fri Nov 3 13:37:02 2023 +0100 IMPALA-12333: SHOW CREATE TABLE outputs some unnecessary table properties SHOW CREATE TABLE outputs some unnecessary table properties, some of which are Iceberg-specific. This change removes the following table properties from SHOW CREATE TABLE statements: Iceberg-specific: - previous_metadata_location - current-schema - snapshot-count - current-snapshot-id - current-snapshot-summary - current-snapshot-timestamp-ms - default-partition-spec - uuid General: - numRows - numFiles - totalSize - numFilesErasureCoded - last_modified_by - last_modified_time - impala.events.catalogServiceId - impala.events.catalogVersion Testing: - Added a test class in test_show_create_table.py, TestShowCreateTableIcebergProperties, that tests that the above Iceberg-specific table properties (and some others that occur with Iceberg tables) are not included in SHOW CREATE TABLE statements. - Modified test_show_create_table.py::TestShowCreateTable - removed the above properties and some others that should not be included in SHOW CREATE TABLE statements from the ignore list that is used when comparing expected and actual SHOW CREATE TABLE statements - some of these previously ignored properties were listed among the expected table properties in 'show-create-table.test', this change removes them from there. Change-Id: Id6f2cb9194f685a583b0d550532eb2454f119666 Reviewed-on: http://gerrit.cloudera.org:8080/20676 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../org/apache/impala/analysis/ToSqlUtils.java | 24 +++- .../java/org/apache/impala/catalog/FeFsTable.java | 10 ++ .../java/org/apache/impala/catalog/FeTable.java | 16 ++- .../org/apache/impala/catalog/IcebergTable.java | 28 ++++ .../queries/QueryTest/show-create-table.test | 6 +- tests/metadata/test_show_create_table.py | 158 +++++++++++++++++---- 6 files changed, 207 insertions(+), 35 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java b/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java index 0fbb7ef1f..66434a184 100755 --- a/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java +++ b/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java @@ -83,8 +83,19 @@ public class ToSqlUtils { // "CREATE EXTERNAL TABLE <name> ... SORT BY ZORDER (...) ... COMMENT <comment> ..." @VisibleForTesting protected static final ImmutableSet<String> HIDDEN_TABLE_PROPERTIES = ImmutableSet.of( - "EXTERNAL", "comment", AlterTableSortByStmt.TBL_PROP_SORT_COLUMNS, - AlterTableSortByStmt.TBL_PROP_SORT_ORDER, "TRANSLATED_TO_EXTERNAL"); + "EXTERNAL", + "TRANSLATED_TO_EXTERNAL", + "comment", + AlterTableSortByStmt.TBL_PROP_SORT_COLUMNS, + AlterTableSortByStmt.TBL_PROP_SORT_ORDER, + FeFsTable.NUM_ERASURE_CODED_FILES, + FeFsTable.NUM_FILES, + FeFsTable.TOTAL_SIZE, + FeTable.CATALOG_SERVICE_ID, + FeTable.CATALOG_VERSION, + FeTable.LAST_MODIFIED_BY, + FeTable.LAST_MODIFIED_TIME, + FeTable.NUM_ROWS); /** * Removes all hidden properties from the given 'tblProperties' map. @@ -356,6 +367,7 @@ public class ToSqlUtils { TSortingOrder sortingOrder = TSortingOrder.valueOf(getSortingOrder(properties)); String comment = properties.get("comment"); removeHiddenTableProperties(properties); + List<String> colsSql = new ArrayList<>(); List<String> partitionColsSql = new ArrayList<>(); boolean isHbaseTable = table instanceof FeHBaseTable; @@ -426,6 +438,14 @@ public class ToSqlUtils { properties.remove(IcebergTable.KEY_STORAGE_HANDLER); properties.remove(StatsSetupConst.DO_NOT_UPDATE_STATS); properties.remove(IcebergTable.METADATA_LOCATION); + properties.remove(IcebergTable.PREVIOUS_METADATA_LOCATION); + properties.remove(IcebergTable.CURRENT_SCHEMA); + properties.remove(IcebergTable.SNAPSHOT_COUNT); + properties.remove(IcebergTable.CURRENT_SNAPSHOT_ID); + properties.remove(IcebergTable.CURRENT_SNAPSHOT_SUMMARY); + properties.remove(IcebergTable.CURRENT_SNAPSHOT_TIMESTAMP_MS); + properties.remove(IcebergTable.DEFAULT_PARTITION_SPEC); + properties.remove(IcebergTable.UUID); // Fill "PARTITIONED BY SPEC" part if the Iceberg table is partitioned. FeIcebergTable feIcebergTable= (FeIcebergTable)table; diff --git a/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java b/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java index 06e053258..47847348f 100644 --- a/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/FeFsTable.java @@ -78,6 +78,16 @@ public interface FeFsTable extends FeTable { // and its usage in getFileSystem suggests it should be. public static final Configuration CONF = new Configuration(); + // Internal table property that specifies the number of files in the table. + public static final String NUM_FILES = "numFiles"; + + // Internal table property that specifies the total size of the table. + public static final String TOTAL_SIZE = "totalSize"; + + // Internal table property that specifies the number of erasure coded files in the + // table. + public static final String NUM_ERASURE_CODED_FILES = "numFilesErasureCoded"; + /** * @return true if the table and all its partitions reside at locations which * support caching (e.g. HDFS). diff --git a/fe/src/main/java/org/apache/impala/catalog/FeTable.java b/fe/src/main/java/org/apache/impala/catalog/FeTable.java index 679f2a4b4..e50af06a4 100644 --- a/fe/src/main/java/org/apache/impala/catalog/FeTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/FeTable.java @@ -33,7 +33,6 @@ import org.apache.impala.thrift.TTableStats; * Frontend interface for interacting with a table. */ public interface FeTable { - Comparator<FeTable> NAME_COMPARATOR = new Comparator<FeTable>() { @Override public int compare(FeTable t1, FeTable t2) { @@ -41,6 +40,21 @@ public interface FeTable { } }; + // Internal table property that specifies the number of rows in the table. + public static final String NUM_ROWS = "numRows"; + + // Internal table property that specifies which user the table was last modified by. + public static final String LAST_MODIFIED_BY = "last_modified_by"; + + // Internal table property that specifies when the table was last modified. + public static final String LAST_MODIFIED_TIME = "last_modified_time"; + + // Internal table property that specifies the catalog service id. + public static final String CATALOG_SERVICE_ID = "impala.events.catalogServiceId"; + + // Internal table property that specifies the catalog version of the table. + public static final String CATALOG_VERSION = "impala.events.catalogVersion"; + /** @see CatalogObject#isLoaded() */ boolean isLoaded(); diff --git a/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java index 076574023..8ef7793a5 100644 --- a/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java @@ -93,6 +93,34 @@ public class IcebergTable extends Table implements FeIcebergTable { // table metadata. This property is only valid for tables in 'hive.catalog'. public static final String METADATA_LOCATION = "metadata_location"; + // Internal Iceberg table property that specifies the absolute path of the previous + // table metadata. This property is only valid for tables in 'hive.catalog'. + public static final String PREVIOUS_METADATA_LOCATION = "previous_metadata_location"; + + // Internal Iceberg table property that specifies the current schema. + public static final String CURRENT_SCHEMA = "current-schema"; + + // Internal Iceberg table property that specifies the number of snapshots. + public static final String SNAPSHOT_COUNT = "snapshot-count"; + + // Internal Iceberg table property that specifies the current snapshot id. + public static final String CURRENT_SNAPSHOT_ID = "current-snapshot-id"; + + // Internal Iceberg table property that specifies the current snapshot summary. + public static final String CURRENT_SNAPSHOT_SUMMARY = "current-snapshot-summary"; + + // Internal Iceberg table property that specifies the current snapshot timestamp in + // milliseconds. + public static final String CURRENT_SNAPSHOT_TIMESTAMP_MS + = "current-snapshot-timestamp-ms"; + + // Internal Iceberg table property that specifies the current default partition + // specification of the table. + public static final String DEFAULT_PARTITION_SPEC = "default-partition-spec"; + + // Internal Iceberg table property that specifies the UUID of the table. + public static final String UUID = "uuid"; + // Parquet compression codec and compression level table properties. public static final String PARQUET_COMPRESSION_CODEC = "write.parquet.compression-codec"; diff --git a/testdata/workloads/functional-query/queries/QueryTest/show-create-table.test b/testdata/workloads/functional-query/queries/QueryTest/show-create-table.test index a5293bee8..018785666 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/show-create-table.test +++ b/testdata/workloads/functional-query/queries/QueryTest/show-create-table.test @@ -452,7 +452,6 @@ ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ESCAPED BY '\\' WITH SERDEPROPERTIES ('field.delim'=',', 'serialization.format'=',', 'escape.delim'='\\') STORED AS TEXTFILE LOCATION '$$location_uri$$' -TBLPROPERTIES ('transient_lastDdlTime'='1405990341') ==== ---- QUERY SHOW CREATE VIEW functional.alltypes_view @@ -1030,6 +1029,5 @@ CREATE EXTERNAL TABLE show_create_table_test_db.bucketed_sort_test (a INT, b STR CLUSTERED BY (a) SORT BY LEXICAL (b) INTO 4 BUCKETS STORED AS TEXTFILE LOCATION '$$location_uri$$' -TBLPROPERTIES ('bucketing_version'='2', 'sort.columns'='b', 'sort.order'='LEXICAL', -'external.table.purge'='TRUE') -==== \ No newline at end of file +TBLPROPERTIES ('bucketing_version'='2', 'external.table.purge'='TRUE') +==== diff --git a/tests/metadata/test_show_create_table.py b/tests/metadata/test_show_create_table.py index d0f264739..de95d2b7f 100644 --- a/tests/metadata/test_show_create_table.py +++ b/tests/metadata/test_show_create_table.py @@ -28,6 +28,29 @@ from tests.common.environ import HIVE_MAJOR_VERSION from tests.util.filesystem_utils import WAREHOUSE +def properties_map_regex(name): + return "%s \\(([^)]+)\\)" % name + + +def get_properties_map(sql, properties_map_name, exclusions=None): + """ Extracts a dict of key-value pairs from the 'sql' string, which should be a SHOW + CREATE TABLE statement. The 'properties_map_name' is the name of the properties map, + e.g. 'TBLPROPERTIES' or 'SERDEPROPERTIES'. 'exclusions' is a list of properties to + exclude from the result. + """ + map_match = re.search(properties_map_regex(properties_map_name), sql) + if map_match is None: + return dict() + kv_regex = "'([^\']+)'\\s*=\\s*'([^\']+)'" + kv_results = dict(re.findall(kv_regex, map_match.group(1))) + + if exclusions is not None: + for filtered_key in exclusions: + if filtered_key in kv_results: + del kv_results[filtered_key] + return kv_results + + # The purpose of the show create table tests are to ensure that the "SHOW CREATE TABLE" # output can actually be used to recreate the table. A test consists of a table # definition. The table is created, then the output of "SHOW CREATE TABLE" is used to @@ -36,17 +59,7 @@ class TestShowCreateTable(ImpalaTestSuite): VALID_SECTION_NAMES = ["CREATE_TABLE", "CREATE_VIEW", "QUERY", "RESULTS-HIVE", "RESULTS-HIVE-3"] # Properties to filter before comparing results - FILTER_TBL_PROPERTIES = ["transient_lastDdlTime", "numFiles", "numPartitions", - "numRows", "rawDataSize", "totalSize", "COLUMN_STATS_ACCURATE", - "STATS_GENERATED_VIA_STATS_TASK", "last_modified_by", - "last_modified_time", "numFilesErasureCoded", - "bucketing_version", "OBJCAPABILITIES", - "TRANSLATED_TO_EXTERNAL", "previous_metadata_location", - "impala.events.catalogServiceId", - "impala.events.catalogVersion", "uuid", - "current-schema", "snapshot-count", "default-partition-spec", - "current-snapshot-id", "current-snapshot-summary", - "current-snapshot-timestamp-ms", "sort.columns", "sort.order"] + FILTER_TBL_PROPERTIES = ["bucketing_version", "OBJCAPABILITIES"] @classmethod def get_workload(self): @@ -181,28 +194,14 @@ class TestShowCreateTable(ImpalaTestSuite): s = ' '.join(s.split()) return s - def __properties_map_regex(self, name): - return "%s \(([^)]+)\)" % name - def __remove_properties_maps(self, s): """ Removes the tblproperties and serdeproperties from the string """ return re.sub( - self.__properties_map_regex("WITH SERDEPROPERTIES"), "", - re.sub(self.__properties_map_regex("TBLPROPERTIES"), "", s)).strip() + properties_map_regex("WITH SERDEPROPERTIES"), "", + re.sub(properties_map_regex("TBLPROPERTIES"), "", s)).strip() def __get_properties_map(self, s, properties_map_name): - """ Extracts a dict of key-value pairs from the sql string s. The properties_map_name - is the name of the properties map, e.g. 'tblproperties' or 'serdeproperties' - """ - map_match = re.search(self.__properties_map_regex(properties_map_name), s) - if map_match is None: - return dict() - kv_regex = "'([^\']+)'\s*=\s*'([^\']+)'" - kv_results = dict(re.findall(kv_regex, map_match.group(1))) - for filtered_key in self.FILTER_TBL_PROPERTIES: - if filtered_key in kv_results: - del kv_results[filtered_key] - return kv_results + return get_properties_map(s, properties_map_name, self.FILTER_TBL_PROPERTIES) def __replace_uri(self, s, uri): return s if uri is None else s.replace("$$location_uri$$", uri) @@ -310,3 +309,106 @@ class TestInfraCompat(ImpalaTestSuite): table = impala_testinfra_cursor.describe_table(table_primary_keys_map['table']) assert table_primary_keys_map['primary_keys'] == table.primary_key_names assert table_primary_keys_map['updatable_columns'] == table.updatable_column_names + + +def get_tbl_properties_from_describe_formatted(lines): + """Get a map of table properties from the result of a DESCRIBE FORMATTED query.""" + # The table properties are between a line that starts with 'Table Parameters' and the + # next line that doesn't start with a '\t'. + res = dict() + + start_idx = None + for idx, value in enumerate(lines): + if value.startswith('Table Parameters'): + start_idx = idx + 1 + break + if start_idx is None: return res + + past_end_idx = start_idx + for idx, value in enumerate(lines[start_idx:]): + if not value.startswith('\t'): + break + past_end_idx += 1 + + for line in lines[start_idx:past_end_idx]: + [key, value] = line.strip().split('\t') + res[key.strip()] = value.strip() + + return res + + +class TestShowCreateTableIcebergProperties(ImpalaTestSuite): + """ + Test that the SHOW CREATE TABLE statement does not contain irrelevant Iceberg-related + table properties. + """ + + @classmethod + def get_workload(self): + return 'functional-query' + + @classmethod + def add_test_dimensions(cls): + super(TestShowCreateTableIcebergProperties, cls).add_test_dimensions() + # don't use any exec options, running exactly once is fine + cls.ImpalaTestMatrix.clear_dimension('exec_option') + cls.ImpalaTestMatrix.add_constraint( + lambda v: v.get_value('table_format').file_format == 'parquet' + and v.get_value('table_format').compression_codec == 'none') + + def test_iceberg_properties(self, vector, unique_database): + """ + Test that the SHOW CREATE TABLE statement does not contain irrelevant Iceberg-related + table properties. + """ + tbl_name = unique_database + ".a" + + create_sql = "CREATE TABLE {} (i INT, d DATE, s STRING, t TIMESTAMP) \ + PARTITIONED BY SPEC (BUCKET(5, i), MONTH(d), TRUNCATE(3, s), HOUR(t)) \ + SORT BY (i) \ + STORED AS ICEBERG".format(tbl_name) + self.execute_query_expect_success(self.client, create_sql) + + # Some table properties are only added when a new Iceberg snapshot is created. + # insert_sql = "insert into {} values (1)".format(tbl_name) + insert_sql = ("INSERT INTO {} VALUES (1, '2022-01-04', 'some', '2022-01-04 10:00:00')" + .format(tbl_name)) + self.execute_query_expect_success(self.client, insert_sql) + + do_not_update_stats_sql = ("alter table {} set TBLPROPERTIES " + "('DO_NOT_UPDATE_STATS'='true')".format(tbl_name)) + self.execute_query_expect_success(self.client, do_not_update_stats_sql) + + describe_sql = "describe formatted {}".format(tbl_name) + describe_res = self.execute_query_expect_success(self.client, describe_sql) + + show_create_sql = "show create table {}".format(tbl_name) + show_create_res = self.execute_query_expect_success(self.client, show_create_sql) + + tblproperties_to_omit = [ + "storage_handler", + "DO_NOT_UPDATE_STATS", + "metadata_location", + "previous_metadata_location", + "current-schema", + "snapshot-count", + "current-snapshot-id", + "current-snapshot-summary", + "current-snapshot-timestamp-ms", + "default-partition-spec", + "uuid", + "impala.events.catalogServiceId", + "impala.events.catalogVersion", + "sort.columns", + "sort.order", + "numFiles", + "numRows", + "totalSize" + ] + + tbl_props_in_describe = get_tbl_properties_from_describe_formatted(describe_res.data) + tbl_props_in_show_create = get_properties_map( + show_create_res.data[0], 'TBLPROPERTIES', None) + for tbl_prop in tblproperties_to_omit: + assert tbl_prop in tbl_props_in_describe + assert tbl_prop not in tbl_props_in_show_create
