This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch branch-4.1.1 in repository https://gitbox.apache.org/repos/asf/impala.git
commit 1a0b39442992d8e94aef3cb7c34910dd4c0c0cb3 Author: Gergely Fürnstáhl <[email protected]> AuthorDate: Tue Jun 14 14:44:25 2022 +0200 IMPALA-11034: Resolve schema of old data files in migrated Iceberg tables When external tables are converted to Iceberg, the data files remain intact, thus missing field IDs. Previously, Impala used name based column resolution in this case. Added a feature to traverse through the data files before column resolution and assign field IDs the same way as iceberg would, to be able to use field ID based column resolutions. Testing: Default resolution method was changed to field id for migrated tables, existing tests use that from now. Added new tests to cover edge cases with complex types and schema evolution. Change-Id: I77570bbfc2fcc60c2756812d7210110e8cc11ccc Reviewed-on: http://gerrit.cloudera.org:8080/18639 Reviewed-by: Zoltan Borok-Nagy <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Reviewed-on: http://gerrit.cloudera.org:8080/18912 Tested-by: Quanlong Huang <[email protected]> Reviewed-by: Tamas Mate <[email protected]> --- be/src/exec/orc-metadata-utils.cc | 63 ++++- be/src/exec/orc-metadata-utils.h | 9 +- be/src/exec/parquet/parquet-metadata-utils.cc | 69 ++++- be/src/exec/parquet/parquet-metadata-utils.h | 27 +- testdata/data/README | 28 +++ .../iceberg_migrated_alter_test/000000_0 | Bin 0 -> 817 bytes .../c9f83a82-60f4-443b-9ca4-359cad16fe12-m0.avro | Bin 0 -> 3182 bytes ...396-1-c9f83a82-60f4-443b-9ca4-359cad16fe12.avro | Bin 0 -> 1986 bytes .../metadata/v1.metadata.json | 81 ++++++ .../metadata/v2.metadata.json | 105 ++++++++ .../metadata/version-hint.text | 1 + .../iceberg_migrated_alter_test_orc/000000_0 | Bin 0 -> 418 bytes .../340a3b82-71e3-4f50-b030-aecb5a5ea730-m0.avro | Bin 0 -> 3182 bytes ...038-1-340a3b82-71e3-4f50-b030-aecb5a5ea730.avro | Bin 0 -> 1990 bytes .../metadata/v1.metadata.json | 81 ++++++ .../metadata/v2.metadata.json | 105 ++++++++ .../metadata/version-hint.text | 1 + 
.../iceberg_migrated_complex_test/000000_0 | Bin 0 -> 3006 bytes .../152e384f-2851-44b7-9ada-1bfbec74e9fc-m0.avro | Bin 0 -> 3218 bytes ...148-1-152e384f-2851-44b7-9ada-1bfbec74e9fc.avro | Bin 0 -> 1988 bytes .../metadata/v1.metadata.json | 255 +++++++++++++++++++ .../metadata/v2.metadata.json | 279 +++++++++++++++++++++ .../metadata/version-hint.text | 1 + .../iceberg_migrated_complex_test_orc/000000_0 | Bin 0 -> 1217 bytes .../8588fd4b-13c1-4451-80ad-5cf71a959b94-m0.avro | Bin 0 -> 3232 bytes ...504-1-8588fd4b-13c1-4451-80ad-5cf71a959b94.avro | Bin 0 -> 1992 bytes .../metadata/v1.metadata.json | 255 +++++++++++++++++++ .../metadata/v2.metadata.json | 279 +++++++++++++++++++++ .../metadata/version-hint.text | 1 + ...iceberg-migrated-table-field-id-resolution.test | 208 +++++++++++++++ tests/common/file_utils.py | 34 +++ tests/query_test/test_iceberg.py | 13 + 32 files changed, 1874 insertions(+), 21 deletions(-) diff --git a/be/src/exec/orc-metadata-utils.cc b/be/src/exec/orc-metadata-utils.cc index b08e5c61f..a8fbf58c6 100644 --- a/be/src/exec/orc-metadata-utils.cc +++ b/be/src/exec/orc-metadata-utils.cc @@ -17,6 +17,8 @@ #include "exec/orc-metadata-utils.h" +#include <stack> + #include <boost/algorithm/string.hpp> #include "util/debug-util.h" @@ -45,13 +47,12 @@ OrcSchemaResolver::OrcSchemaResolver(const HdfsTableDescriptor& tbl_desc, filename_(filename), is_table_full_acid_(is_table_acid) { DetermineFullAcidSchema(); - if (tbl_desc_.IsIcebergTable() && root_->getSubtypeCount() > 0) { - // Use FIELD_ID-based column resolution for Iceberg tables if possible. 
- const orc::Type* first_child = root_->getSubtype(0); - if (first_child->hasAttributeKey(ICEBERG_FIELD_ID)) { - schema_resolution_strategy_ = TSchemaResolutionStrategy::FIELD_ID; - } else { - schema_resolution_strategy_ = TSchemaResolutionStrategy::NAME; + if (tbl_desc_.IsIcebergTable()) { + schema_resolution_strategy_ = TSchemaResolutionStrategy::FIELD_ID; + + if (root_->getSubtypeCount() > 0 + && !root_->getSubtype(0)->hasAttributeKey(ICEBERG_FIELD_ID)) { + GenerateFieldIDs(); } } } @@ -303,15 +304,57 @@ const orc::Type* OrcSchemaResolver::FindChildWithFieldId(const orc::Type* node, for (int i = 0; i < node->getSubtypeCount(); ++i) { const orc::Type* child = node->getSubtype(i); DCHECK(child != nullptr); - if (!child->hasAttributeKey(ICEBERG_FIELD_ID)) return nullptr; - std::string field_id_str = child->getAttributeValue(ICEBERG_FIELD_ID); - int64_t child_field_id = GetFieldIdFromStr(field_id_str); + + int child_field_id = 0; + + if (LIKELY(child->hasAttributeKey(ICEBERG_FIELD_ID))) { + std::string field_id_str = child->getAttributeValue(ICEBERG_FIELD_ID); + child_field_id = GetFieldIdFromStr(field_id_str); + } else { + child_field_id = GetGeneratedFieldID(child); + } + if (child_field_id == -1) return nullptr; if (child_field_id == field_id) return child; } return nullptr; } +void OrcSchemaResolver::GenerateFieldIDs() { + std::stack<const orc::Type*> nodes; + + nodes.push(root_); + + int fieldID = 1; + + while (!nodes.empty()) { + const orc::Type* current = nodes.top(); + nodes.pop(); + + uint64_t size = current->getSubtypeCount(); + + for (uint64_t i = 0; i < size; i++) { + auto retval = orc_type_to_field_id_.emplace(current->getSubtype(i), fieldID++); + + // Emplace has to be successful, otherwise we visited the same node twice + DCHECK(retval.second); + + // Push children in reverse order to the stack so they are processed in the original + // order + nodes.push(current->getSubtype(size - i - 1)); + } + } +} + +int 
OrcSchemaResolver::GetGeneratedFieldID(const orc::Type* type) const { + auto it = orc_type_to_field_id_.find(type); + + // First column has field ID, this one does not, file is corrupted + if (UNLIKELY(it == orc_type_to_field_id_.end())) return -1; + + return it->second; +} + SchemaPath OrcSchemaResolver::GetCanonicalSchemaPath(const SchemaPath& col_path, int current_idx) const { DCHECK_LT(current_idx, col_path.size()); diff --git a/be/src/exec/orc-metadata-utils.h b/be/src/exec/orc-metadata-utils.h index aaab7d46f..d9e3f0612 100644 --- a/be/src/exec/orc-metadata-utils.h +++ b/be/src/exec/orc-metadata-utils.h @@ -18,7 +18,6 @@ #pragma once #include <orc/OrcFile.hh> -#include <queue> #include "runtime/descriptors.h" @@ -112,6 +111,13 @@ class OrcSchemaResolver { /// Finds child of 'node' that has Iceberg field id equals to 'field_id'. const orc::Type* FindChildWithFieldId(const orc::Type* node, const int field_id) const; + /// Generates field ids for the columns in the same order as Iceberg. The traversal is + /// preorder, but the assigned field IDs are not in that order. When a node is + /// processed, its child nodes are assigned an ID, hence the difference. + void GenerateFieldIDs(); + + inline int GetGeneratedFieldID(const orc::Type* type) const; + SchemaPath GetCanonicalSchemaPath(const SchemaPath& col_path, int last_idx) const; /// Sets 'is_file_full_acid_' based on the file schema. 
@@ -122,6 +128,7 @@ class OrcSchemaResolver { const char* const filename_ = nullptr; const bool is_table_full_acid_; bool is_file_full_acid_; + std::unordered_map<const orc::Type*, int> orc_type_to_field_id_; /// Validate whether the ColumnType is compatible with the orc type Status ValidateType(const ColumnType& type, const orc::Type& orc_type, diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc index bf905b789..b463f2c35 100644 --- a/be/src/exec/parquet/parquet-metadata-utils.cc +++ b/be/src/exec/parquet/parquet-metadata-utils.cc @@ -19,6 +19,7 @@ #include <strings.h> #include <sstream> +#include <stack> #include <string> #include <vector> @@ -810,6 +811,13 @@ SchemaNode* ParquetSchemaResolver::NextSchemaNode( *missing_field = true; return NULL; } + + if (UNLIKELY(file_idx == INVALID_ID)) { + VLOG_FILE << Substitute("File '$0' is corrupted", filename_); + *missing_field = true; + return NULL; + } + return &node->children[file_idx]; } @@ -826,7 +834,17 @@ int ParquetSchemaResolver::FindChildWithFieldId(SchemaNode* node, const int& field_id) const { int idx; for (idx = 0; idx < node->children.size(); ++idx) { - if (node->children[idx].element->field_id == field_id) break; + SchemaNode* child = &node->children[idx]; + + int child_field_id = 0; + + if (LIKELY(child->element->__isset.field_id)) { + child_field_id = child->element->field_id; + } else { + child_field_id = GetGeneratedFieldID(child); + } + if (child_field_id == field_id) return idx; + if (UNLIKELY(child_field_id == INVALID_ID)) return INVALID_ID; } return idx; } @@ -950,4 +968,53 @@ Status ParquetSchemaResolver::ValidateScalarNode(const SchemaNode& node, return Status::OK(); } +void ParquetSchemaResolver::GenerateFieldIDs() { + std::stack<SchemaNode*> nodes; + + nodes.push(&schema_); + + int fieldID = 1; + + while (!nodes.empty()) { + SchemaNode* current = nodes.top(); + nodes.pop(); + + uint64_t size = current->children.size(); + + for (uint64_t i = 
0; i < size; i++) { + auto retval = schema_node_to_field_id_.emplace(&current->children[i], fieldID++); + + // Emplace has to be successful, otherwise we visited the same node twice + DCHECK(retval.second); + + // Push children in reverse order to the stack so they are processed in the original + // order + const uint64_t reverse_idx = size - i - 1; + + SchemaNode& current_child = current->children[reverse_idx]; + + const parquet::ConvertedType::type child_type = + current_child.element->converted_type; + + if (child_type == parquet::ConvertedType::type::LIST + || child_type == parquet::ConvertedType::type::MAP) { + // Skip middle level + DCHECK(current_child.children.size() == 1); + + nodes.push(&current_child.children[0]); + } else { + nodes.push(&current_child); + } + } + } +} + +int ParquetSchemaResolver::GetGeneratedFieldID(SchemaNode* node) const { + auto it = schema_node_to_field_id_.find(node); + + // First column has field ID, this one does not, file is corrupted + if (UNLIKELY(it == schema_node_to_field_id_.end())) return INVALID_ID; + + return it->second; +} } diff --git a/be/src/exec/parquet/parquet-metadata-utils.h b/be/src/exec/parquet/parquet-metadata-utils.h index df9656d16..1043bca86 100644 --- a/be/src/exec/parquet/parquet-metadata-utils.h +++ b/be/src/exec/parquet/parquet-metadata-utils.h @@ -155,17 +155,14 @@ class ParquetSchemaResolver { filename_ = filename; // Use FIELD_ID-based column resolution for Iceberg tables if possible. const auto& schema = file_metadata->schema; - if (tbl_desc_.IsIcebergTable() && schema.size() > 1) { + Status status = CreateSchemaTree(file_metadata->schema, &schema_); + if (tbl_desc_.IsIcebergTable()) { + fallback_schema_resolution_ = TSchemaResolutionStrategy::type::FIELD_ID; + // schema[0] is the 'root', schema[1] is the first column. 
- const parquet::SchemaElement& first_column = schema[1]; - if (first_column.__isset.field_id) { - fallback_schema_resolution_ = TSchemaResolutionStrategy::type::FIELD_ID; - } else { - // Use Name-based schema resolution in case of missing field ids. - fallback_schema_resolution_ = TSchemaResolutionStrategy::type::NAME; - } + if (schema.size() > 1 && !schema[1].__isset.field_id) GenerateFieldIDs(); } - return CreateSchemaTree(file_metadata->schema, &schema_); + return status; } /// Traverses 'schema_' according to 'path', returning the result in 'node'. If 'path' @@ -195,6 +192,9 @@ class ParquetSchemaResolver { /// Used to sanity-check Parquet schemas. static const int SCHEMA_NODE_CHILDREN_SANITY_LIMIT = 64 * 1024; + /// Invalid ID used to signal corrupted file + static const int INVALID_ID = -1; + /// Maps from the array-resolution policy to the ordered array encodings that should /// be tried during path resolution. All entries have the ONE_LEVEL encoding at the end /// because there is no ambiguity between the one-level and the other encodings (there @@ -229,6 +229,7 @@ class ParquetSchemaResolver { /// then the index of the first match is returned. int FindChildWithName(SchemaNode* node, const std::string& name) const; /// Returns the index of 'node's child with 'field id' for Iceberg tables. + /// Return -1 if the file is corrupted int FindChildWithFieldId(SchemaNode* node, const int& field_id) const; /// The ResolvePathHelper() logic for arrays. @@ -244,10 +245,18 @@ class ParquetSchemaResolver { Status ValidateScalarNode(const SchemaNode& node, const ColumnType& col_type, const SchemaPath& path, int idx) const; + /// Generates field ids for the columns in the same order as Iceberg. The traversal is + /// preorder, but the assigned field IDs are not. When a node is visited, its child + /// nodes are assigned an ID, hence the difference. 
+ void GenerateFieldIDs(); + + inline int GetGeneratedFieldID(SchemaNode* node) const; + const HdfsTableDescriptor& tbl_desc_; TSchemaResolutionStrategy::type fallback_schema_resolution_; const TParquetArrayResolution::type array_resolution_; const char* filename_; + std::unordered_map<SchemaNode*, int32_t> schema_node_to_field_id_; /// Root node of our internal schema representation populated in Init(). SchemaNode schema_; diff --git a/testdata/data/README b/testdata/data/README index 568152329..13ff1162c 100644 --- a/testdata/data/README +++ b/testdata/data/README @@ -735,3 +735,31 @@ Generated by Spark 3.2 + Iceberg 0.13. Then the JSON and AVRO files were manuall to make these tables correspond to an Iceberg table in a HadoopCatalog instead of HiveCatalog. The table has a positional delete file. + +iceberg_test/iceberg_migrated_alter_test +Generated and migrated by Hive +CREATE TABLE iceberg_migrated_alter_test (int_col int, string_col string, double_col double) stored as parquet; +insert into table iceberg_migrated_alter_test values (0, "A", 0.5), (1, "B", 1.5), (2, "C", 2.5); +ALTER TABLE iceberg_migrated_alter_test SET TBLPROPERTIES ('storage_handler'='org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'); +Then extracted from hdfs and modified to be able to load as an external hadoop table + +iceberg_test/iceberg_migrated_alter_test_orc +Generated and migrated by Hive +CREATE TABLE iceberg_migrated_alter_test_orc (int_col int, string_col string, double_col double) stored as orc; +insert into table iceberg_migrated_alter_test_orc values (0, "A", 0.5), (1, "B", 1.5), (2, "C", 2.5); +ALTER TABLE iceberg_migrated_alter_test_orc SET TBLPROPERTIES ('storage_handler'='org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'); +Then extracted from hdfs and modified to be able to load as an external hadoop table + +iceberg_test/iceberg_migrated_complex_test +Generated and migrated by Hive +CREATE TABLE iceberg_migrated_complex_test (struct_1_col struct<int_array_col: 
array<int>, string_col: string, bool_int_map_col: map<boolean, int>>, int_bigint_map_col map<int, bigint>, struct_2_col struct<struct_3_col: struct<float_col: float, string_double_map_col: map<string, double>, bigint_array_col: array<bigint>>, int_int_map_col: map<int, int>>) stored as parquet; +insert into table iceberg_migrated_complex_test values (named_struct("int_array_col", array(0), "string_col", "A", "bool_int_map_col", map(True, 1 )), map(2,CAST(3 as bigint)), named_struct("struct_3_col", named_struct("float_col", cast(0.5 as float), "string_double_map_col", map("B", cast(1.5 as double)), "bigint_array_col", array(cast(4 as bigint))), "int_int_map_col", map(5,6))); +ALTER TABLE iceberg_migrated_complex_test SET TBLPROPERTIES ('storage_handler'='org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'); +Then extracted from hdfs and modified to be able to load as an external hadoop table + +iceberg_test/iceberg_migrated_complex_test_orc +Generated and migrated by Hive +CREATE TABLE iceberg_migrated_complex_test_orc (struct_1_col struct<int_array_col: array<int>, string_col: string, bool_int_map_col: map<boolean, int>>, int_bigint_map_col map<int, bigint>, struct_2_col struct<struct_3_col: struct<float_col: float, string_double_map_col: map<string, double>, bigint_array_col: array<bigint>>, int_int_map_col: map<int, int>>) stored as orc; +insert into table iceberg_migrated_complex_test_orc values (named_struct("int_array_col", array(0), "string_col", "A", "bool_int_map_col", map(True, 1 )), map(2,CAST(3 as bigint)), named_struct("struct_3_col", named_struct("float_col", cast(0.5 as float), "string_double_map_col", map("B", cast(1.5 as double)), "bigint_array_col", array(cast(4 as bigint))), "int_int_map_col", map(5,6))); +ALTER TABLE iceberg_migrated_complex_test_orc SET TBLPROPERTIES ('storage_handler'='org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'); +Then extracted from hdfs and modified to be able to load as an external hadoop table \ No newline at 
end of file diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test/000000_0 b/testdata/data/iceberg_test/iceberg_migrated_alter_test/000000_0 new file mode 100644 index 000000000..9edbb251b Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_alter_test/000000_0 differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/c9f83a82-60f4-443b-9ca4-359cad16fe12-m0.avro b/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/c9f83a82-60f4-443b-9ca4-359cad16fe12-m0.avro new file mode 100644 index 000000000..4d0699eb1 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/c9f83a82-60f4-443b-9ca4-359cad16fe12-m0.avro differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/snap-2941076094076108396-1-c9f83a82-60f4-443b-9ca4-359cad16fe12.avro b/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/snap-2941076094076108396-1-c9f83a82-60f4-443b-9ca4-359cad16fe12.avro new file mode 100644 index 000000000..065af0120 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/snap-2941076094076108396-1-c9f83a82-60f4-443b-9ca4-359cad16fe12.avro differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/v1.metadata.json b/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/v1.metadata.json new file mode 100644 index 000000000..976f04593 --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/v1.metadata.json @@ -0,0 +1,81 @@ +{ + "format-version" : 1, + "table-uuid" : "7068d1a4-ba6e-4f14-ba63-4014606f91fe", + "location" : "/test-warehouse/iceberg_migrated_alter_test", + "last-updated-ms" : 1656338172830, + "last-column-id" : 3, + "schema" : { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "int_col", + "required" : false, + "type" : "int" + }, { + "id" : 2, + "name" : "string_col", + "required" : false, + "type" : 
"string" + }, { + "id" : 3, + "name" : "double_col", + "required" : false, + "type" : "double" + } ] + }, + "current-schema-id" : 0, + "schemas" : [ { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "int_col", + "required" : false, + "type" : "int" + }, { + "id" : 2, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 3, + "name" : "double_col", + "required" : false, + "type" : "double" + } ] + } ], + "partition-spec" : [ ], + "default-spec-id" : 0, + "partition-specs" : [ { + "spec-id" : 0, + "fields" : [ ] + } ], + "last-partition-id" : 999, + "default-sort-order-id" : 0, + "sort-orders" : [ { + "order-id" : 0, + "fields" : [ ] + } ], + "properties" : { + "last_modified_time" : "1656338172", + "gc.enabled" : "TRUE", + "bucketing_version" : "2", + "schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"int_col\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"string_col\" ]\n}, {\n \"field-id\" : 3,\n \"names\" : [ \"double_col\" ]\n} ]", + "last_modified_by" : "gfurnstahl", + "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", + "numFilesErasureCoded" : "0", + "engine.hive.enabled" : "true", + "MIGRATED_TO_ICEBERG" : "true", + "totalSize" : "817", + "EXTERNAL" : "TRUE", + "write.format.default" : "parquet", + "numFiles" : "1", + "TRANSLATED_TO_EXTERNAL" : "TRUE", + "table_type" : "ICEBERG" + }, + "current-snapshot-id" : -1, + "snapshots" : [ ], + "snapshot-log" : [ ], + "metadata-log" : [ ] +} diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/v2.metadata.json b/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/v2.metadata.json new file mode 100644 index 000000000..1eb3292e3 --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/v2.metadata.json @@ -0,0 +1,105 @@ +{ + "format-version" : 1, + "table-uuid" : "7068d1a4-ba6e-4f14-ba63-4014606f91fe", + "location" : 
"/test-warehouse/iceberg_migrated_alter_test", + "last-updated-ms" : 1656338172947, + "last-column-id" : 3, + "schema" : { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "int_col", + "required" : false, + "type" : "int" + }, { + "id" : 2, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 3, + "name" : "double_col", + "required" : false, + "type" : "double" + } ] + }, + "current-schema-id" : 0, + "schemas" : [ { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "int_col", + "required" : false, + "type" : "int" + }, { + "id" : 2, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 3, + "name" : "double_col", + "required" : false, + "type" : "double" + } ] + } ], + "partition-spec" : [ ], + "default-spec-id" : 0, + "partition-specs" : [ { + "spec-id" : 0, + "fields" : [ ] + } ], + "last-partition-id" : 999, + "default-sort-order-id" : 0, + "sort-orders" : [ { + "order-id" : 0, + "fields" : [ ] + } ], + "properties" : { + "last_modified_time" : "1656338172", + "gc.enabled" : "TRUE", + "bucketing_version" : "2", + "schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"int_col\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"string_col\" ]\n}, {\n \"field-id\" : 3,\n \"names\" : [ \"double_col\" ]\n} ]", + "last_modified_by" : "gfurnstahl", + "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", + "numFilesErasureCoded" : "0", + "engine.hive.enabled" : "true", + "MIGRATED_TO_ICEBERG" : "true", + "totalSize" : "817", + "EXTERNAL" : "TRUE", + "write.format.default" : "parquet", + "numFiles" : "1", + "TRANSLATED_TO_EXTERNAL" : "TRUE", + "table_type" : "ICEBERG" + }, + "current-snapshot-id" : 2941076094076108396, + "snapshots" : [ { + "snapshot-id" : 2941076094076108396, + "timestamp-ms" : 1656338172947, + "summary" : { + "operation" : "append", + "added-data-files" : "1", + "added-records" : "3", + 
"added-files-size" : "817", + "changed-partition-count" : "1", + "total-records" : "3", + "total-files-size" : "817", + "total-data-files" : "1", + "total-delete-files" : "0", + "total-position-deletes" : "0", + "total-equality-deletes" : "0" + }, + "manifest-list" : "/test-warehouse/iceberg_migrated_alter_test/metadata/snap-2941076094076108396-1-c9f83a82-60f4-443b-9ca4-359cad16fe12.avro", + "schema-id" : 0 + } ], + "snapshot-log" : [ { + "timestamp-ms" : 1656338172947, + "snapshot-id" : 2941076094076108396 + } ], + "metadata-log" : [ { + "timestamp-ms" : 1656338172830, + "metadata-file" : "/test-warehouse/iceberg_migrated_alter_test/metadata/v1.metadata.json" + } ] +} diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/version-hint.text b/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/version-hint.text new file mode 100644 index 000000000..0cfbf0888 --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_alter_test/metadata/version-hint.text @@ -0,0 +1 @@ +2 diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/000000_0 b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/000000_0 new file mode 100644 index 000000000..670731972 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/000000_0 differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/340a3b82-71e3-4f50-b030-aecb5a5ea730-m0.avro b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/340a3b82-71e3-4f50-b030-aecb5a5ea730-m0.avro new file mode 100644 index 000000000..b598a1652 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/340a3b82-71e3-4f50-b030-aecb5a5ea730-m0.avro differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/snap-2205107170480729038-1-340a3b82-71e3-4f50-b030-aecb5a5ea730.avro 
b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/snap-2205107170480729038-1-340a3b82-71e3-4f50-b030-aecb5a5ea730.avro new file mode 100644 index 000000000..84ba7e922 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/snap-2205107170480729038-1-340a3b82-71e3-4f50-b030-aecb5a5ea730.avro differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/v1.metadata.json b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/v1.metadata.json new file mode 100644 index 000000000..087907c8b --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/v1.metadata.json @@ -0,0 +1,81 @@ +{ + "format-version" : 1, + "table-uuid" : "c7c2b112-b353-4a15-a194-97230ea9ea8b", + "location" : "/test-warehouse/iceberg_migrated_alter_test_orc", + "last-updated-ms" : 1656404671678, + "last-column-id" : 3, + "schema" : { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "int_col", + "required" : false, + "type" : "int" + }, { + "id" : 2, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 3, + "name" : "double_col", + "required" : false, + "type" : "double" + } ] + }, + "current-schema-id" : 0, + "schemas" : [ { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "int_col", + "required" : false, + "type" : "int" + }, { + "id" : 2, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 3, + "name" : "double_col", + "required" : false, + "type" : "double" + } ] + } ], + "partition-spec" : [ ], + "default-spec-id" : 0, + "partition-specs" : [ { + "spec-id" : 0, + "fields" : [ ] + } ], + "last-partition-id" : 999, + "default-sort-order-id" : 0, + "sort-orders" : [ { + "order-id" : 0, + "fields" : [ ] + } ], + "properties" : { + "last_modified_time" : "1656404671", + "gc.enabled" : "TRUE", + "bucketing_version" : "2", + "schema.name-mapping.default" 
: "[ {\n \"field-id\" : 1,\n \"names\" : [ \"int_col\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"string_col\" ]\n}, {\n \"field-id\" : 3,\n \"names\" : [ \"double_col\" ]\n} ]", + "last_modified_by" : "gfurnstahl", + "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", + "numFilesErasureCoded" : "0", + "engine.hive.enabled" : "true", + "MIGRATED_TO_ICEBERG" : "true", + "totalSize" : "418", + "EXTERNAL" : "TRUE", + "write.format.default" : "orc", + "numFiles" : "1", + "TRANSLATED_TO_EXTERNAL" : "TRUE", + "table_type" : "ICEBERG" + }, + "current-snapshot-id" : -1, + "snapshots" : [ ], + "snapshot-log" : [ ], + "metadata-log" : [ ] +} diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/v2.metadata.json b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/v2.metadata.json new file mode 100644 index 000000000..3800fe04b --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/v2.metadata.json @@ -0,0 +1,105 @@ +{ + "format-version" : 1, + "table-uuid" : "c7c2b112-b353-4a15-a194-97230ea9ea8b", + "location" : "/test-warehouse/iceberg_migrated_alter_test_orc", + "last-updated-ms" : 1656404671775, + "last-column-id" : 3, + "schema" : { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "int_col", + "required" : false, + "type" : "int" + }, { + "id" : 2, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 3, + "name" : "double_col", + "required" : false, + "type" : "double" + } ] + }, + "current-schema-id" : 0, + "schemas" : [ { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "int_col", + "required" : false, + "type" : "int" + }, { + "id" : 2, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 3, + "name" : "double_col", + "required" : false, + "type" : "double" + } ] + } ], + "partition-spec" : [ ], + "default-spec-id" : 0, + "partition-specs" : [ { 
+ "spec-id" : 0, + "fields" : [ ] + } ], + "last-partition-id" : 999, + "default-sort-order-id" : 0, + "sort-orders" : [ { + "order-id" : 0, + "fields" : [ ] + } ], + "properties" : { + "last_modified_time" : "1656404671", + "gc.enabled" : "TRUE", + "bucketing_version" : "2", + "schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"int_col\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"string_col\" ]\n}, {\n \"field-id\" : 3,\n \"names\" : [ \"double_col\" ]\n} ]", + "last_modified_by" : "gfurnstahl", + "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", + "numFilesErasureCoded" : "0", + "engine.hive.enabled" : "true", + "MIGRATED_TO_ICEBERG" : "true", + "totalSize" : "418", + "EXTERNAL" : "TRUE", + "write.format.default" : "orc", + "numFiles" : "1", + "TRANSLATED_TO_EXTERNAL" : "TRUE", + "table_type" : "ICEBERG" + }, + "current-snapshot-id" : 2205107170480729038, + "snapshots" : [ { + "snapshot-id" : 2205107170480729038, + "timestamp-ms" : 1656404671775, + "summary" : { + "operation" : "append", + "added-data-files" : "1", + "added-records" : "3", + "added-files-size" : "418", + "changed-partition-count" : "1", + "total-records" : "3", + "total-files-size" : "418", + "total-data-files" : "1", + "total-delete-files" : "0", + "total-position-deletes" : "0", + "total-equality-deletes" : "0" + }, + "manifest-list" : "/test-warehouse/iceberg_migrated_alter_test_orc/metadata/snap-2205107170480729038-1-340a3b82-71e3-4f50-b030-aecb5a5ea730.avro", + "schema-id" : 0 + } ], + "snapshot-log" : [ { + "timestamp-ms" : 1656404671775, + "snapshot-id" : 2205107170480729038 + } ], + "metadata-log" : [ { + "timestamp-ms" : 1656404671678, + "metadata-file" : "/test-warehouse/iceberg_migrated_alter_test_orc/metadata/00000-87733f1d-9cc3-4427-9451-09ea25c0f4cd.metadata.json" + } ] +} diff --git a/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/version-hint.text 
b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/version-hint.text new file mode 100644 index 000000000..0cfbf0888 --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_alter_test_orc/metadata/version-hint.text @@ -0,0 +1 @@ +2 diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test/000000_0 b/testdata/data/iceberg_test/iceberg_migrated_complex_test/000000_0 new file mode 100644 index 000000000..8eb82dad1 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_complex_test/000000_0 differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/152e384f-2851-44b7-9ada-1bfbec74e9fc-m0.avro b/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/152e384f-2851-44b7-9ada-1bfbec74e9fc-m0.avro new file mode 100644 index 000000000..15b963fb4 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/152e384f-2851-44b7-9ada-1bfbec74e9fc-m0.avro differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/snap-3911840040574896148-1-152e384f-2851-44b7-9ada-1bfbec74e9fc.avro b/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/snap-3911840040574896148-1-152e384f-2851-44b7-9ada-1bfbec74e9fc.avro new file mode 100644 index 000000000..8fa357984 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/snap-3911840040574896148-1-152e384f-2851-44b7-9ada-1bfbec74e9fc.avro differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/v1.metadata.json b/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/v1.metadata.json new file mode 100644 index 000000000..881fc1565 --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/v1.metadata.json @@ -0,0 +1,255 @@ +{ + "format-version" : 1, + "table-uuid" : "61beb841-ed4d-40f3-8be1-7b994886212f", + "location" : "/test-warehouse/iceberg_migrated_complex_test", + 
"last-updated-ms" : 1656501596629, + "last-column-id" : 21, + "schema" : { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "struct_1_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 4, + "name" : "int_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 7, + "element" : "int", + "element-required" : false + } + }, { + "id" : 5, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 6, + "name" : "bool_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 8, + "key" : "boolean", + "value-id" : 9, + "value" : "int", + "value-required" : false + } + } ] + } + }, { + "id" : 2, + "name" : "int_bigint_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 10, + "key" : "int", + "value-id" : 11, + "value" : "long", + "value-required" : false + } + }, { + "id" : 3, + "name" : "struct_2_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 12, + "name" : "struct_3_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 14, + "name" : "float_col", + "required" : false, + "type" : "float" + }, { + "id" : 15, + "name" : "string_double_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 17, + "key" : "string", + "value-id" : 18, + "value" : "double", + "value-required" : false + } + }, { + "id" : 16, + "name" : "bigint_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 19, + "element" : "long", + "element-required" : false + } + } ] + } + }, { + "id" : 13, + "name" : "int_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 20, + "key" : "int", + "value-id" : 21, + "value" : "int", + "value-required" : false + } + } ] + } + } ] + }, + "current-schema-id" : 0, + "schemas" : [ { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : 
"struct_1_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 4, + "name" : "int_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 7, + "element" : "int", + "element-required" : false + } + }, { + "id" : 5, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 6, + "name" : "bool_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 8, + "key" : "boolean", + "value-id" : 9, + "value" : "int", + "value-required" : false + } + } ] + } + }, { + "id" : 2, + "name" : "int_bigint_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 10, + "key" : "int", + "value-id" : 11, + "value" : "long", + "value-required" : false + } + }, { + "id" : 3, + "name" : "struct_2_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 12, + "name" : "struct_3_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 14, + "name" : "float_col", + "required" : false, + "type" : "float" + }, { + "id" : 15, + "name" : "string_double_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 17, + "key" : "string", + "value-id" : 18, + "value" : "double", + "value-required" : false + } + }, { + "id" : 16, + "name" : "bigint_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 19, + "element" : "long", + "element-required" : false + } + } ] + } + }, { + "id" : 13, + "name" : "int_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 20, + "key" : "int", + "value-id" : 21, + "value" : "int", + "value-required" : false + } + } ] + } + } ] + } ], + "partition-spec" : [ ], + "default-spec-id" : 0, + "partition-specs" : [ { + "spec-id" : 0, + "fields" : [ ] + } ], + "last-partition-id" : 999, + "default-sort-order-id" : 0, + "sort-orders" : [ { + "order-id" : 0, + "fields" : [ ] + } ], + "properties" : { + 
"last_modified_time" : "1656501596", + "gc.enabled" : "TRUE", + "bucketing_version" : "2", + "schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"struct_1_col\" ],\n \"fields\" : [ {\n \"field-id\" : 2,\n \"names\" : [ \"int_array_col\" ],\n \"fields\" : [ {\n \"field-id\" : 3,\n \"names\" : [ \"element\" ]\n } ]\n }, {\n \"field-id\" : 4,\n \"names\" : [ \"string_col\" ]\n }, {\n \"field-id\" : 5,\n \"names\" : [ \"bool_int_map_col\" ],\n \"fields\" : [ {\n \"field-id\" : 6,\n \"names\" : [ \"key\" ]\ [...] + "last_modified_by" : "gfurnstahl", + "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", + "numFilesErasureCoded" : "0", + "engine.hive.enabled" : "true", + "MIGRATED_TO_ICEBERG" : "true", + "totalSize" : "3006", + "EXTERNAL" : "TRUE", + "write.format.default" : "parquet", + "numFiles" : "1", + "TRANSLATED_TO_EXTERNAL" : "TRUE", + "table_type" : "ICEBERG" + }, + "current-snapshot-id" : -1, + "snapshots" : [ ], + "snapshot-log" : [ ], + "metadata-log" : [ ] +} diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/v2.metadata.json b/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/v2.metadata.json new file mode 100644 index 000000000..20a649b7c --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/v2.metadata.json @@ -0,0 +1,279 @@ +{ + "format-version" : 1, + "table-uuid" : "61beb841-ed4d-40f3-8be1-7b994886212f", + "location" : "/test-warehouse/iceberg_migrated_complex_test", + "last-updated-ms" : 1656501596751, + "last-column-id" : 21, + "schema" : { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "struct_1_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 4, + "name" : "int_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 7, + "element" : "int", + "element-required" : false + } + }, { + "id" : 5, + "name" : "string_col", + "required" : 
false, + "type" : "string" + }, { + "id" : 6, + "name" : "bool_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 8, + "key" : "boolean", + "value-id" : 9, + "value" : "int", + "value-required" : false + } + } ] + } + }, { + "id" : 2, + "name" : "int_bigint_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 10, + "key" : "int", + "value-id" : 11, + "value" : "long", + "value-required" : false + } + }, { + "id" : 3, + "name" : "struct_2_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 12, + "name" : "struct_3_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 14, + "name" : "float_col", + "required" : false, + "type" : "float" + }, { + "id" : 15, + "name" : "string_double_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 17, + "key" : "string", + "value-id" : 18, + "value" : "double", + "value-required" : false + } + }, { + "id" : 16, + "name" : "bigint_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 19, + "element" : "long", + "element-required" : false + } + } ] + } + }, { + "id" : 13, + "name" : "int_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 20, + "key" : "int", + "value-id" : 21, + "value" : "int", + "value-required" : false + } + } ] + } + } ] + }, + "current-schema-id" : 0, + "schemas" : [ { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "struct_1_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 4, + "name" : "int_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 7, + "element" : "int", + "element-required" : false + } + }, { + "id" : 5, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 6, + "name" : "bool_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 8, + "key" : 
"boolean", + "value-id" : 9, + "value" : "int", + "value-required" : false + } + } ] + } + }, { + "id" : 2, + "name" : "int_bigint_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 10, + "key" : "int", + "value-id" : 11, + "value" : "long", + "value-required" : false + } + }, { + "id" : 3, + "name" : "struct_2_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 12, + "name" : "struct_3_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 14, + "name" : "float_col", + "required" : false, + "type" : "float" + }, { + "id" : 15, + "name" : "string_double_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 17, + "key" : "string", + "value-id" : 18, + "value" : "double", + "value-required" : false + } + }, { + "id" : 16, + "name" : "bigint_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 19, + "element" : "long", + "element-required" : false + } + } ] + } + }, { + "id" : 13, + "name" : "int_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 20, + "key" : "int", + "value-id" : 21, + "value" : "int", + "value-required" : false + } + } ] + } + } ] + } ], + "partition-spec" : [ ], + "default-spec-id" : 0, + "partition-specs" : [ { + "spec-id" : 0, + "fields" : [ ] + } ], + "last-partition-id" : 999, + "default-sort-order-id" : 0, + "sort-orders" : [ { + "order-id" : 0, + "fields" : [ ] + } ], + "properties" : { + "last_modified_time" : "1656501596", + "gc.enabled" : "TRUE", + "bucketing_version" : "2", + "schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"struct_1_col\" ],\n \"fields\" : [ {\n \"field-id\" : 2,\n \"names\" : [ \"int_array_col\" ],\n \"fields\" : [ {\n \"field-id\" : 3,\n \"names\" : [ \"element\" ]\n } ]\n }, {\n \"field-id\" : 4,\n \"names\" : [ \"string_col\" ]\n }, {\n \"field-id\" : 5,\n \"names\" : [ \"bool_int_map_col\" ],\n \"fields\" : [ {\n 
\"field-id\" : 6,\n \"names\" : [ \"key\" ]\ [...] + "last_modified_by" : "gfurnstahl", + "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", + "numFilesErasureCoded" : "0", + "engine.hive.enabled" : "true", + "MIGRATED_TO_ICEBERG" : "true", + "totalSize" : "3006", + "EXTERNAL" : "TRUE", + "write.format.default" : "parquet", + "numFiles" : "1", + "TRANSLATED_TO_EXTERNAL" : "TRUE", + "table_type" : "ICEBERG" + }, + "current-snapshot-id" : 3911840040574896148, + "snapshots" : [ { + "snapshot-id" : 3911840040574896148, + "timestamp-ms" : 1656501596751, + "summary" : { + "operation" : "append", + "added-data-files" : "1", + "added-records" : "1", + "added-files-size" : "3006", + "changed-partition-count" : "1", + "total-records" : "1", + "total-files-size" : "3006", + "total-data-files" : "1", + "total-delete-files" : "0", + "total-position-deletes" : "0", + "total-equality-deletes" : "0" + }, + "manifest-list" : "/test-warehouse/iceberg_migrated_complex_test/metadata/snap-3911840040574896148-1-152e384f-2851-44b7-9ada-1bfbec74e9fc.avro", + "schema-id" : 0 + } ], + "snapshot-log" : [ { + "timestamp-ms" : 1656501596751, + "snapshot-id" : 3911840040574896148 + } ], + "metadata-log" : [ { + "timestamp-ms" : 1656501596629, + "metadata-file" : "/test-warehouse/iceberg_migrated_complex_test/metadata/00000-050bc482-2885-4c5f-82a5-db316f892552.metadata.json" + } ] +} diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/version-hint.text b/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/version-hint.text new file mode 100644 index 000000000..0cfbf0888 --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_complex_test/metadata/version-hint.text @@ -0,0 +1 @@ +2 diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/000000_0 b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/000000_0 new file mode 100644 index 000000000..30bfddb87 Binary files /dev/null and 
b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/000000_0 differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/8588fd4b-13c1-4451-80ad-5cf71a959b94-m0.avro b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/8588fd4b-13c1-4451-80ad-5cf71a959b94-m0.avro new file mode 100644 index 000000000..e9aa6c0b6 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/8588fd4b-13c1-4451-80ad-5cf71a959b94-m0.avro differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/snap-3622599918649152504-1-8588fd4b-13c1-4451-80ad-5cf71a959b94.avro b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/snap-3622599918649152504-1-8588fd4b-13c1-4451-80ad-5cf71a959b94.avro new file mode 100644 index 000000000..c5c01ff72 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/snap-3622599918649152504-1-8588fd4b-13c1-4451-80ad-5cf71a959b94.avro differ diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/v1.metadata.json b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/v1.metadata.json new file mode 100644 index 000000000..7b69388c5 --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/v1.metadata.json @@ -0,0 +1,255 @@ +{ + "format-version" : 1, + "table-uuid" : "4025e262-f92e-4adb-800d-c42be2a65940", + "location" : "/test-warehouse/iceberg_migrated_complex_test_orc", + "last-updated-ms" : 1656496318199, + "last-column-id" : 21, + "schema" : { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "struct_1_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 4, + "name" : "int_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 7, + "element" : "int", + "element-required" : false + } + }, { + "id" : 5, + "name" : 
"string_col", + "required" : false, + "type" : "string" + }, { + "id" : 6, + "name" : "bool_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 8, + "key" : "boolean", + "value-id" : 9, + "value" : "int", + "value-required" : false + } + } ] + } + }, { + "id" : 2, + "name" : "int_bigint_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 10, + "key" : "int", + "value-id" : 11, + "value" : "long", + "value-required" : false + } + }, { + "id" : 3, + "name" : "struct_2_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 12, + "name" : "struct_3_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 14, + "name" : "float_col", + "required" : false, + "type" : "float" + }, { + "id" : 15, + "name" : "string_double_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 17, + "key" : "string", + "value-id" : 18, + "value" : "double", + "value-required" : false + } + }, { + "id" : 16, + "name" : "bigint_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 19, + "element" : "long", + "element-required" : false + } + } ] + } + }, { + "id" : 13, + "name" : "int_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 20, + "key" : "int", + "value-id" : 21, + "value" : "int", + "value-required" : false + } + } ] + } + } ] + }, + "current-schema-id" : 0, + "schemas" : [ { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "struct_1_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 4, + "name" : "int_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 7, + "element" : "int", + "element-required" : false + } + }, { + "id" : 5, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 6, + "name" : "bool_int_map_col", + "required" : false, + "type" : { + "type" : "map", 
+ "key-id" : 8, + "key" : "boolean", + "value-id" : 9, + "value" : "int", + "value-required" : false + } + } ] + } + }, { + "id" : 2, + "name" : "int_bigint_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 10, + "key" : "int", + "value-id" : 11, + "value" : "long", + "value-required" : false + } + }, { + "id" : 3, + "name" : "struct_2_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 12, + "name" : "struct_3_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 14, + "name" : "float_col", + "required" : false, + "type" : "float" + }, { + "id" : 15, + "name" : "string_double_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 17, + "key" : "string", + "value-id" : 18, + "value" : "double", + "value-required" : false + } + }, { + "id" : 16, + "name" : "bigint_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 19, + "element" : "long", + "element-required" : false + } + } ] + } + }, { + "id" : 13, + "name" : "int_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 20, + "key" : "int", + "value-id" : 21, + "value" : "int", + "value-required" : false + } + } ] + } + } ] + } ], + "partition-spec" : [ ], + "default-spec-id" : 0, + "partition-specs" : [ { + "spec-id" : 0, + "fields" : [ ] + } ], + "last-partition-id" : 999, + "default-sort-order-id" : 0, + "sort-orders" : [ { + "order-id" : 0, + "fields" : [ ] + } ], + "properties" : { + "last_modified_time" : "1656496318", + "gc.enabled" : "TRUE", + "bucketing_version" : "2", + "schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"struct_1_col\" ],\n \"fields\" : [ {\n \"field-id\" : 2,\n \"names\" : [ \"int_array_col\" ],\n \"fields\" : [ {\n \"field-id\" : 3,\n \"names\" : [ \"element\" ]\n } ]\n }, {\n \"field-id\" : 4,\n \"names\" : [ \"string_col\" ]\n }, {\n \"field-id\" : 5,\n \"names\" : [ \"bool_int_map_col\" 
],\n \"fields\" : [ {\n \"field-id\" : 6,\n \"names\" : [ \"key\" ]\ [...] + "last_modified_by" : "gfurnstahl", + "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", + "numFilesErasureCoded" : "0", + "engine.hive.enabled" : "true", + "MIGRATED_TO_ICEBERG" : "true", + "totalSize" : "1217", + "EXTERNAL" : "TRUE", + "write.format.default" : "orc", + "numFiles" : "1", + "TRANSLATED_TO_EXTERNAL" : "TRUE", + "table_type" : "ICEBERG" + }, + "current-snapshot-id" : -1, + "snapshots" : [ ], + "snapshot-log" : [ ], + "metadata-log" : [ ] +} diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/v2.metadata.json b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/v2.metadata.json new file mode 100644 index 000000000..b4024c224 --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/v2.metadata.json @@ -0,0 +1,279 @@ +{ + "format-version" : 1, + "table-uuid" : "4025e262-f92e-4adb-800d-c42be2a65940", + "location" : "/test-warehouse/iceberg_migrated_complex_test_orc", + "last-updated-ms" : 1656496318303, + "last-column-id" : 21, + "schema" : { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "struct_1_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 4, + "name" : "int_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 7, + "element" : "int", + "element-required" : false + } + }, { + "id" : 5, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 6, + "name" : "bool_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 8, + "key" : "boolean", + "value-id" : 9, + "value" : "int", + "value-required" : false + } + } ] + } + }, { + "id" : 2, + "name" : "int_bigint_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 10, + "key" : "int", + "value-id" : 11, + "value" : "long", + "value-required" : false + 
} + }, { + "id" : 3, + "name" : "struct_2_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 12, + "name" : "struct_3_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 14, + "name" : "float_col", + "required" : false, + "type" : "float" + }, { + "id" : 15, + "name" : "string_double_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 17, + "key" : "string", + "value-id" : 18, + "value" : "double", + "value-required" : false + } + }, { + "id" : 16, + "name" : "bigint_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 19, + "element" : "long", + "element-required" : false + } + } ] + } + }, { + "id" : 13, + "name" : "int_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 20, + "key" : "int", + "value-id" : 21, + "value" : "int", + "value-required" : false + } + } ] + } + } ] + }, + "current-schema-id" : 0, + "schemas" : [ { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "struct_1_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 4, + "name" : "int_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 7, + "element" : "int", + "element-required" : false + } + }, { + "id" : 5, + "name" : "string_col", + "required" : false, + "type" : "string" + }, { + "id" : 6, + "name" : "bool_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 8, + "key" : "boolean", + "value-id" : 9, + "value" : "int", + "value-required" : false + } + } ] + } + }, { + "id" : 2, + "name" : "int_bigint_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 10, + "key" : "int", + "value-id" : 11, + "value" : "long", + "value-required" : false + } + }, { + "id" : 3, + "name" : "struct_2_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 12, + "name" : 
"struct_3_col", + "required" : false, + "type" : { + "type" : "struct", + "fields" : [ { + "id" : 14, + "name" : "float_col", + "required" : false, + "type" : "float" + }, { + "id" : 15, + "name" : "string_double_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 17, + "key" : "string", + "value-id" : 18, + "value" : "double", + "value-required" : false + } + }, { + "id" : 16, + "name" : "bigint_array_col", + "required" : false, + "type" : { + "type" : "list", + "element-id" : 19, + "element" : "long", + "element-required" : false + } + } ] + } + }, { + "id" : 13, + "name" : "int_int_map_col", + "required" : false, + "type" : { + "type" : "map", + "key-id" : 20, + "key" : "int", + "value-id" : 21, + "value" : "int", + "value-required" : false + } + } ] + } + } ] + } ], + "partition-spec" : [ ], + "default-spec-id" : 0, + "partition-specs" : [ { + "spec-id" : 0, + "fields" : [ ] + } ], + "last-partition-id" : 999, + "default-sort-order-id" : 0, + "sort-orders" : [ { + "order-id" : 0, + "fields" : [ ] + } ], + "properties" : { + "last_modified_time" : "1656496318", + "gc.enabled" : "TRUE", + "bucketing_version" : "2", + "schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"struct_1_col\" ],\n \"fields\" : [ {\n \"field-id\" : 2,\n \"names\" : [ \"int_array_col\" ],\n \"fields\" : [ {\n \"field-id\" : 3,\n \"names\" : [ \"element\" ]\n } ]\n }, {\n \"field-id\" : 4,\n \"names\" : [ \"string_col\" ]\n }, {\n \"field-id\" : 5,\n \"names\" : [ \"bool_int_map_col\" ],\n \"fields\" : [ {\n \"field-id\" : 6,\n \"names\" : [ \"key\" ]\ [...] 
+ "last_modified_by" : "gfurnstahl", + "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", + "numFilesErasureCoded" : "0", + "engine.hive.enabled" : "true", + "MIGRATED_TO_ICEBERG" : "true", + "totalSize" : "1217", + "EXTERNAL" : "TRUE", + "write.format.default" : "orc", + "numFiles" : "1", + "TRANSLATED_TO_EXTERNAL" : "TRUE", + "table_type" : "ICEBERG" + }, + "current-snapshot-id" : 3622599918649152504, + "snapshots" : [ { + "snapshot-id" : 3622599918649152504, + "timestamp-ms" : 1656496318303, + "summary" : { + "operation" : "append", + "added-data-files" : "1", + "added-records" : "1", + "added-files-size" : "1217", + "changed-partition-count" : "1", + "total-records" : "1", + "total-files-size" : "1217", + "total-data-files" : "1", + "total-delete-files" : "0", + "total-position-deletes" : "0", + "total-equality-deletes" : "0" + }, + "manifest-list" : "/test-warehouse/iceberg_migrated_complex_test_orc/metadata/snap-3622599918649152504-1-8588fd4b-13c1-4451-80ad-5cf71a959b94.avro", + "schema-id" : 0 + } ], + "snapshot-log" : [ { + "timestamp-ms" : 1656496318303, + "snapshot-id" : 3622599918649152504 + } ], + "metadata-log" : [ { + "timestamp-ms" : 1656496318199, + "metadata-file" : "/test-warehouse/iceberg_migrated_complex_test_orc/metadata/00000-9ddb4fd1-dcfa-43a7-8eb5-24835fe1b8b7.metadata.json" + } ] +} diff --git a/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/version-hint.text b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/version-hint.text new file mode 100644 index 000000000..0cfbf0888 --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_migrated_complex_test_orc/metadata/version-hint.text @@ -0,0 +1 @@ +2 diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test new file mode 100644 index 000000000..42cc98936 --- /dev/null 
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test @@ -0,0 +1,208 @@ +==== +---- QUERY +select * from iceberg_migrated_alter_test; +---- RESULTS +0,'A',0.5 +1,'B',1.5 +2,'C',2.5 +---- TYPES +INT, STRING, DOUBLE +==== +---- QUERY +alter table iceberg_migrated_alter_test drop column string_col; +select * from iceberg_migrated_alter_test; +---- RESULTS +0,0.5 +1,1.5 +2,2.5 +---- TYPES +INT, DOUBLE +==== +---- QUERY +alter table iceberg_migrated_alter_test add column string_col string; +select * from iceberg_migrated_alter_test; +---- RESULTS +0,0.5,'NULL' +1,1.5,'NULL' +2,2.5,'NULL' +---- TYPES +INT, DOUBLE, STRING +==== +---- QUERY +alter table iceberg_migrated_alter_test change column double_col renamed_double_col double; +select * from iceberg_migrated_alter_test; +---- RESULTS +0,0.5,'NULL' +1,1.5,'NULL' +2,2.5,'NULL' +---- TYPES +INT, DOUBLE, STRING +==== +---- QUERY +alter table iceberg_migrated_alter_test add column double_col double; +select * from iceberg_migrated_alter_test; +---- RESULTS +0,0.5,'NULL',NULL +1,1.5,'NULL',NULL +2,2.5,'NULL',NULL +---- TYPES +INT, DOUBLE, STRING, DOUBLE +==== +---- QUERY +insert into iceberg_migrated_alter_test values (3,3.5,"D",3.8); +select * from iceberg_migrated_alter_test; +---- RESULTS +3,3.5,'D',3.8 +0,0.5,'NULL',NULL +1,1.5,'NULL',NULL +2,2.5,'NULL',NULL +---- TYPES +INT, DOUBLE, STRING, DOUBLE +==== +---- QUERY +select * from iceberg_migrated_alter_test_orc; +---- RESULTS +0,'A',0.5 +1,'B',1.5 +2,'C',2.5 +---- TYPES +INT, STRING, DOUBLE +==== +---- QUERY +alter table iceberg_migrated_alter_test_orc drop column string_col; +select * from iceberg_migrated_alter_test_orc; +---- RESULTS +0,0.5 +1,1.5 +2,2.5 +---- TYPES +INT, DOUBLE +==== +---- QUERY +alter table iceberg_migrated_alter_test_orc add column string_col string; +select * from iceberg_migrated_alter_test_orc; +---- RESULTS +0,0.5,'NULL' +1,1.5,'NULL' +2,2.5,'NULL' +---- TYPES +INT, DOUBLE, STRING +==== 
+---- QUERY +alter table iceberg_migrated_alter_test_orc change column double_col renamed_double_col double; +select * from iceberg_migrated_alter_test_orc; +---- RESULTS +0,0.5,'NULL' +1,1.5,'NULL' +2,2.5,'NULL' +---- TYPES +INT, DOUBLE, STRING +==== +---- QUERY +alter table iceberg_migrated_alter_test_orc add column double_col double; +select * from iceberg_migrated_alter_test_orc; +---- RESULTS +0,0.5,'NULL',NULL +1,1.5,'NULL',NULL +2,2.5,'NULL',NULL +---- TYPES +INT, DOUBLE, STRING, DOUBLE +==== +---- QUERY +select struct_1_col.string_col, struct_2_col.struct_3_col.float_col from iceberg_migrated_complex_test; +---- RESULTS +'A',0.5 +---- TYPES +STRING, FLOAT +==== +---- QUERY +select my_array_1.pos, my_array_1.item from iceberg_migrated_complex_test, iceberg_migrated_complex_test.struct_1_col.int_array_col as my_array_1; +---- RESULTS +0,0 +---- TYPES +BIGINT, INT +==== +---- QUERY +select my_array_2.pos, my_array_2.item from iceberg_migrated_complex_test, iceberg_migrated_complex_test.struct_2_col.struct_3_col.bigint_array_col as my_array_2; +---- RESULTS +0,4 +---- TYPES +BIGINT, BIGINT +==== +---- QUERY +select my_map_1.key, my_map_1.value from iceberg_migrated_complex_test, iceberg_migrated_complex_test.struct_1_col.bool_int_map_col as my_map_1; +---- RESULTS +true,1 +---- TYPES +BOOLEAN, INT +==== +---- QUERY +select my_map_2.key, my_map_2.value from iceberg_migrated_complex_test, iceberg_migrated_complex_test.int_bigint_map_col as my_map_2; +---- RESULTS +2,3 +---- TYPES +INT, BIGINT +==== +---- QUERY +select my_map_3.key, my_map_3.value from iceberg_migrated_complex_test, iceberg_migrated_complex_test.struct_2_col.struct_3_col.string_double_map_col as my_map_3; +---- RESULTS +'B',1.5 +---- TYPES +STRING, DOUBLE +==== +---- QUERY +select my_map_4.key, my_map_4.value from iceberg_migrated_complex_test, iceberg_migrated_complex_test.struct_2_col.int_int_map_col as my_map_4; +---- RESULTS +5,6 +---- TYPES +INT, INT +==== +---- QUERY +select 
struct_1_col.string_col, struct_2_col.struct_3_col.float_col from iceberg_migrated_complex_test_orc; +---- RESULTS +'A',0.5 +---- TYPES +STRING, FLOAT +==== +---- QUERY +select my_array_1.pos, my_array_1.item from iceberg_migrated_complex_test_orc, iceberg_migrated_complex_test_orc.struct_1_col.int_array_col as my_array_1; +---- RESULTS +0,0 +---- TYPES +BIGINT, INT +==== +---- QUERY +select my_array_2.pos, my_array_2.item from iceberg_migrated_complex_test_orc, iceberg_migrated_complex_test_orc.struct_2_col.struct_3_col.bigint_array_col as my_array_2; +---- RESULTS +0,4 +---- TYPES +BIGINT, BIGINT +==== +---- QUERY +select my_map_1.key, my_map_1.value from iceberg_migrated_complex_test_orc, iceberg_migrated_complex_test_orc.struct_1_col.bool_int_map_col as my_map_1; +---- RESULTS +true,1 +---- TYPES +BOOLEAN, INT +==== +---- QUERY +select my_map_2.key, my_map_2.value from iceberg_migrated_complex_test_orc, iceberg_migrated_complex_test_orc.int_bigint_map_col as my_map_2; +---- RESULTS +2,3 +---- TYPES +INT, BIGINT +==== +---- QUERY +select my_map_3.key, my_map_3.value from iceberg_migrated_complex_test_orc, iceberg_migrated_complex_test_orc.struct_2_col.struct_3_col.string_double_map_col as my_map_3; +---- RESULTS +'B',1.5 +---- TYPES +STRING, DOUBLE +==== +---- QUERY +select my_map_4.key, my_map_4.value from iceberg_migrated_complex_test_orc, iceberg_migrated_complex_test_orc.struct_2_col.int_int_map_col as my_map_4; +---- RESULTS +5,6 +---- TYPES +INT, INT +==== \ No newline at end of file diff --git a/tests/common/file_utils.py b/tests/common/file_utils.py index 2de55494f..de276cae7 100644 --- a/tests/common/file_utils.py +++ b/tests/common/file_utils.py @@ -26,6 +26,40 @@ from subprocess import check_call from tests.util.filesystem_utils import get_fs_path +def create_iceberg_table_from_directory(impala_client, unique_database, table_name, + file_format): + """Utility function to create an iceberg table from a directory. 
The directory must + exist in $IMPALA_HOME/testdata/data/iceberg_test with the name 'table_name'""" + + # Only orc and parquet tested/supported + assert file_format == "orc" or file_format == "parquet" + + local_dir = os.path.join( + os.environ['IMPALA_HOME'], 'testdata/data/iceberg_test/{0}'.format(table_name)) + assert os.path.isdir(local_dir) + + # Put the directory in the database's directory (not the table directory) + hdfs_parent_dir = get_fs_path("/test-warehouse") + + hdfs_dir = os.path.join(hdfs_parent_dir, table_name) + + # Purge existing files if any + check_call(['hdfs', 'dfs', '-rm', '-f', '-r', hdfs_dir]) + + # Note: -d skips a staging copy + check_call(['hdfs', 'dfs', '-put', '-d', local_dir, hdfs_parent_dir]) + + # Create external table + qualified_table_name = '{0}.{1}'.format(unique_database, table_name) + impala_client.execute("""create external table {0} stored as iceberg location '{1}' + tblproperties('write.format.default'='{2}', 'iceberg.catalog'= + 'hadoop.tables')""".format(qualified_table_name, hdfs_dir, + file_format)) + + # Automatic clean up after drop table + impala_client.execute("""alter table {0} set tblproperties ('external.table.purge'= + 'True');""".format(qualified_table_name)) + def create_table_from_parquet(impala_client, unique_database, table_name): """Utility function to create a database table from a Parquet file. 
A Parquet file must exist in $IMPALA_HOME/testdata/data with the name 'table_name'.parquet""" diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py index aab727fb6..230938dee 100644 --- a/tests/query_test/test_iceberg.py +++ b/tests/query_test/test_iceberg.py @@ -33,6 +33,7 @@ from tests.common.skip import SkipIf from tests.util.filesystem_utils import get_fs_path, IS_HDFS from tests.util.get_parquet_metadata import get_parquet_metadata +from tests.common.file_utils import create_iceberg_table_from_directory class TestIcebergTable(ImpalaTestSuite): """Tests related to Iceberg tables.""" @@ -102,6 +103,18 @@ class TestIcebergTable(ImpalaTestSuite): def test_migrated_tables(self, vector, unique_database): self.run_test_case('QueryTest/iceberg-migrated-tables', vector, unique_database) + def test_migrated_table_field_id_resolution(self, vector, unique_database): + create_iceberg_table_from_directory(self.client, unique_database, + "iceberg_migrated_alter_test", "parquet") + create_iceberg_table_from_directory(self.client, unique_database, + "iceberg_migrated_complex_test", "parquet") + create_iceberg_table_from_directory(self.client, unique_database, + "iceberg_migrated_alter_test_orc", "orc") + create_iceberg_table_from_directory(self.client, unique_database, + "iceberg_migrated_complex_test_orc", "orc") + self.run_test_case('QueryTest/iceberg-migrated-table-field-id-resolution', + vector, unique_database) + def test_describe_history(self, vector, unique_database): self.run_test_case('QueryTest/iceberg-table-history', vector, use_db=unique_database)
