This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3a861500b669f6cfd7283a43b542ff126ad44cea
Author: Zoltan Borok-Nagy <[email protected]>
AuthorDate: Mon Sep 9 18:15:31 2024 +0200

    IMPALA-13364: Schema resolution doesn't work for migrated partitioned
    Iceberg tables that have complex types
    
    Schema resolution doesn't work correctly for migrated partitioned
    Iceberg tables that have complex types. When we face a Parquet/ORC file
    in an Iceberg table that doesn't have field IDs in the file metadata, we
    assume that it is an old data file before migration, and the schema is
    the very first one, hence we can mimic Iceberg's field ID generation to
    assign field IDs to the file schema elements.
    
    This process didn't take the partition columns into account. Partition
    columns are not part of the data file but they still get field IDs. This
    only matters when there are complex types in the table, as partition
    columns are always the last columns in legacy Hive tables, and field IDs
    are assigned via a "BFS-like" traversal. I.e. if there are only primitive
    types in the table we don't have any problems, but the children of
    complex types columns are assigned incorrectly.
    
    This patch fixes field ID generation by taking the number of partition
    columns into account. If none of the partition columns are included in
    the data file (common case) we adjust the file-level field IDs accordingly. It is
    also OK to have all the partition columns in the data files (it is not
    common, but we've seen such data files). We raise an error in other
    cases (some partition columns are in the data file, while others
    aren't).
    
    Testing:
     * e2e tests added
     * added negative tests
    
    Change-Id: Ie32952021b63d6b55b8820489e434bfc2a91580b
    Reviewed-on: http://gerrit.cloudera.org:8080/21761
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exec/file-metadata-utils.cc                 |  69 ++++-
 be/src/exec/file-metadata-utils.h                  |  11 +
 be/src/exec/orc/hdfs-orc-scanner.cc                |   6 +-
 be/src/exec/orc/orc-metadata-utils.cc              |  22 +-
 be/src/exec/orc/orc-metadata-utils.h               |  11 +-
 be/src/exec/parquet/hdfs-parquet-scanner.cc        |   1 +
 be/src/exec/parquet/parquet-metadata-utils.cc      |  11 +-
 be/src/exec/parquet/parquet-metadata-utils.h       |  10 +-
 testdata/migrated_iceberg/README                   |  18 ++
 ...extypes_and_partition_columns_in_data_files.orc | Bin 0 -> 860 bytes
 ...pes_and_partition_columns_in_data_files.parquet | Bin 0 -> 1549 bytes
 ...migrated-table-field-id-resolution-complex.test | 327 +++++++++++++++++++++
 tests/query_test/test_iceberg.py                   |  59 +++-
 13 files changed, 530 insertions(+), 15 deletions(-)

diff --git a/be/src/exec/file-metadata-utils.cc 
b/be/src/exec/file-metadata-utils.cc
index b63379abe..11fe936fe 100644
--- a/be/src/exec/file-metadata-utils.cc
+++ b/be/src/exec/file-metadata-utils.cc
@@ -65,7 +65,7 @@ void FileMetadataUtils::AddFileLevelVirtualColumns(MemPool* 
mem_pool,
       int len = strlen(filename);
       char* filename_copy = reinterpret_cast<char*>(mem_pool->Allocate(len));
       Ubsan::MemCpy(filename_copy, filename, len);
-    slot->Assign(filename_copy, len);
+      slot->Assign(filename_copy, len);
       template_tuple->SetNotNull(slot_desc->null_indicator_offset());
     } else if (slot_desc->virtual_column_type() ==
         TVirtualColumnType::ICEBERG_DATA_SEQUENCE_NUMBER) {
@@ -86,6 +86,18 @@ void FileMetadataUtils::AddFileLevelVirtualColumns(MemPool* 
mem_pool,
   }
 }
 
+auto FileMetadataUtils::IcebergPartitionTransforms() const {
+  DCHECK(file_desc_ != nullptr);
+  DCHECK(scan_node_->hdfs_table()->IsIcebergTable());
+
+  using namespace org::apache::impala::fb;
+  const FbFileMetadata* file_metadata = file_desc_->file_metadata;
+  DCHECK(file_metadata != nullptr);
+  const FbIcebergMetadata* ice_metadata = file_metadata->iceberg_metadata();
+  DCHECK(ice_metadata != nullptr);
+  return ice_metadata->partition_keys();
+}
+
 void FileMetadataUtils::AddIcebergColumns(MemPool* mem_pool, Tuple** 
template_tuple,
     std::map<const SlotId, const SlotDescriptor*>* slot_descs_written) {
   using namespace org::apache::impala::fb;
@@ -208,9 +220,7 @@ bool FileMetadataUtils::IsValuePartitionCol(const 
SlotDescriptor* slot_desc) {
   if (path.size() != 1) return false;
 
   int field_id = 
scan_node_->hdfs_table()->col_descs()[path.front()].field_id();
-  const FbFileMetadata* file_metadata = file_desc_->file_metadata;
-  const FbIcebergMetadata* ice_metadata = file_metadata->iceberg_metadata();
-  auto transforms = ice_metadata->partition_keys();
+  auto transforms = IcebergPartitionTransforms();
   if (transforms == nullptr) return false;
   for (int i = 0; i < transforms->size(); ++i) {
     auto transform = transforms->Get(i);
@@ -223,6 +233,57 @@ bool FileMetadataUtils::IsValuePartitionCol(const 
SlotDescriptor* slot_desc) {
   return false;
 }
 
+Status FileMetadataUtils::AdjustFieldIdForMigratedPartitionedTables(int 
*fieldID) const {
+  DCHECK(file_desc_ != nullptr);
+  DCHECK(scan_node_->hdfs_table()->IsIcebergTable());
+  DCHECK(fieldID != nullptr);
+
+  using namespace org::apache::impala::fb;
+
+  auto transforms = IcebergPartitionTransforms();
+  if (transforms == nullptr || transforms->size() == 0) return Status::OK();
+
+  vector<int> sourceIDs;
+  sourceIDs.reserve(transforms->size());
+  for (auto transform : *transforms) {
+    if (transform->transform_type() !=
+        FbIcebergTransformType::FbIcebergTransformType_IDENTITY) {
+      return Status(Substitute("$0 has invalid partition transform: $1",
+          file_desc_->filename, transform->transform_type()));
+    }
+    sourceIDs.push_back(transform->source_id());
+  }
+
+  DCHECK_EQ(sourceIDs.size(), transforms->size());
+  sort(sourceIDs.begin(), sourceIDs.end());
+
+  string errorMsg = Substitute(
+      "Migrated file $0 has unexpected schema or partitioning.", 
file_desc_->filename);
+
+  // Field IDs must be consecutive.
+  for (int i = 0; i < sourceIDs.size() - 1; ++i) {
+    if (sourceIDs[i+1] - sourceIDs[i] != 1) {
+      return Status(errorMsg);
+    }
+  }
+
+  int fileFieldID = *fieldID;
+
+  if (sourceIDs.front() == fileFieldID) {
+    // Partition columns are not stored in the data file (as expected).
+    // In this case we need to adjust fieldID (as it was calculated based on
+    // the file schema only).
+    *fieldID += sourceIDs.size();
+  } else if (sourceIDs.back() == fileFieldID - 1) {
+    // Partition columns are stored in the data file, no need to adjust 
fieldID.
+  } else {
+    // Some partitions are stored, some aren't let's raise an error for this 
mess.
+    return Status(errorMsg);
+  }
+
+  return Status::OK();
+}
+
 bool FileMetadataUtils::NeedDataInFile(const SlotDescriptor* slot_desc) {
   if (IsValuePartitionCol(slot_desc)) return false;
   if (slot_desc->IsVirtual()) return false;
diff --git a/be/src/exec/file-metadata-utils.h 
b/be/src/exec/file-metadata-utils.h
index d2a6675d0..bf929e166 100644
--- a/be/src/exec/file-metadata-utils.h
+++ b/be/src/exec/file-metadata-utils.h
@@ -35,6 +35,7 @@ class MemPool;
 class RuntimeState;
 class ScannerContext;
 class SlotDescriptor;
+class Status;
 class Tuple;
 class TupleDescriptor;
 
@@ -63,6 +64,12 @@ public:
   /// partition columns, virtual columns.
   bool NeedDataInFile(const SlotDescriptor* slot_desc);
 
+  /// Adjusts the file-level fieldID for data files residing in migrated 
partitioned
+  /// Iceberg tables. It is OK to have none of the partition columns in the 
data
+  /// file, or to have all of the partition columns, in any other case this 
method
+  /// returns an error.
+  Status AdjustFieldIdForMigratedPartitionedTables(int *fieldID) const;
+
 private:
   void AddFileLevelVirtualColumns(MemPool* mem_pool, Tuple* template_tuple);
 
@@ -77,6 +84,10 @@ private:
       const org::apache::impala::fb::FbIcebergMetadata& ice_metadata,
       const SlotDescriptor* slot_desc);
 
+  /// Returns a pointer to the container of partition transforms. Only valid 
to call
+  /// for Iceberg tables.
+  auto IcebergPartitionTransforms() const;
+
   HdfsScanNodeBase* const scan_node_;
 
   // Members below are set in Open()
diff --git a/be/src/exec/orc/hdfs-orc-scanner.cc 
b/be/src/exec/orc/hdfs-orc-scanner.cc
index 64369ae97..631c89ea8 100644
--- a/be/src/exec/orc/hdfs-orc-scanner.cc
+++ b/be/src/exec/orc/hdfs-orc-scanner.cc
@@ -363,8 +363,10 @@ Status HdfsOrcScanner::Open(ScannerContext* context) {
 
   bool is_table_full_acid = scan_node_->hdfs_table()->IsTableFullAcid();
   schema_resolver_.reset(
-      new OrcSchemaResolver(*scan_node_->hdfs_table(), &reader_->getType(), 
filename(),
-          is_table_full_acid, state_->query_options().orc_schema_resolution));
+      new OrcSchemaResolver(*scan_node_->hdfs_table(), file_metadata_utils_,
+          &reader_->getType(), filename(), is_table_full_acid,
+          state_->query_options().orc_schema_resolution));
+  RETURN_IF_ERROR(schema_resolver_->Init());
   bool is_file_full_acid = schema_resolver_->HasFullAcidV2Schema();
   acid_original_file_ = is_table_full_acid && !is_file_full_acid;
   if (is_table_full_acid) {
diff --git a/be/src/exec/orc/orc-metadata-utils.cc 
b/be/src/exec/orc/orc-metadata-utils.cc
index c8d4c78e9..9297acf8c 100644
--- a/be/src/exec/orc/orc-metadata-utils.cc
+++ b/be/src/exec/orc/orc-metadata-utils.cc
@@ -21,6 +21,7 @@
 
 #include <boost/algorithm/string.hpp>
 
+#include "exec/file-metadata-utils.h"
 #include "util/debug-util.h"
 #include "common/names.h"
 
@@ -39,22 +40,28 @@ inline int GetFieldIdFromStr(const std::string& str) {
 }
 
 OrcSchemaResolver::OrcSchemaResolver(const HdfsTableDescriptor& tbl_desc,
+    const FileMetadataUtils& file_metadata_utils,
     const orc::Type* root, const char* filename, bool is_table_acid,
     TSchemaResolutionStrategy::type schema_resolution)
   : schema_resolution_strategy_(schema_resolution),
     tbl_desc_(tbl_desc),
+    file_metadata_utils_(file_metadata_utils),
     root_(root),
     filename_(filename),
-    is_table_full_acid_(is_table_acid) {
+    is_table_full_acid_(is_table_acid) {}
+
+
+Status OrcSchemaResolver::Init() {
   DetermineFullAcidSchema();
   if (tbl_desc_.IsIcebergTable()) {
     schema_resolution_strategy_ = TSchemaResolutionStrategy::FIELD_ID;
 
     if (root_->getSubtypeCount() > 0
         && !root_->getSubtype(0)->hasAttributeKey(ICEBERG_FIELD_ID)) {
-      GenerateFieldIDs();
+      RETURN_IF_ERROR(GenerateFieldIDs());
     }
   }
+  return Status::OK();
 }
 
 Status OrcSchemaResolver::ResolveColumn(const SchemaPath& col_path,
@@ -320,7 +327,7 @@ const orc::Type* 
OrcSchemaResolver::FindChildWithFieldId(const orc::Type* node,
   return nullptr;
 }
 
-void OrcSchemaResolver::GenerateFieldIDs() {
+Status OrcSchemaResolver::GenerateFieldIDs() {
   std::stack<const orc::Type*> nodes;
 
   nodes.push(root_);
@@ -343,7 +350,16 @@ void OrcSchemaResolver::GenerateFieldIDs() {
       // order
       nodes.push(current->getSubtype(size - i - 1));
     }
+
+    if (current == root_ && !nodes.empty()) {
+      // Partition columns are not stored in file metadata, but they get field 
IDs
+      // from Iceberg. Check if there are partition columns and adjust field ID
+      // generation. It is only relevant for tables that have complex types.
+      RETURN_IF_ERROR(
+          
file_metadata_utils_.AdjustFieldIdForMigratedPartitionedTables(&fieldID));
+    }
   }
+  return Status::OK();
 }
 
 int OrcSchemaResolver::GetGeneratedFieldID(const orc::Type* type) const {
diff --git a/be/src/exec/orc/orc-metadata-utils.h 
b/be/src/exec/orc/orc-metadata-utils.h
index d9e3f0612..0b87f2162 100644
--- a/be/src/exec/orc/orc-metadata-utils.h
+++ b/be/src/exec/orc/orc-metadata-utils.h
@@ -23,6 +23,8 @@
 
 namespace impala {
 
+class FileMetadataUtils;
+
 // Key of Hive ACID version in ORC metadata.
 const string HIVE_ACID_VERSION_KEY = "hive.acid.version";
 
@@ -40,10 +42,14 @@ constexpr int CURRENT_TRANSCACTION_TYPE_ID = 5;
 /// Util class to resolve SchemaPaths of TupleDescriptors/SlotDescriptors into 
orc::Type.
 class OrcSchemaResolver {
  public:
-  OrcSchemaResolver(const HdfsTableDescriptor& tbl_desc, const orc::Type* root,
+  OrcSchemaResolver(const HdfsTableDescriptor& tbl_desc,
+      const FileMetadataUtils& file_metadata_utils,
+      const orc::Type* root,
       const char* filename, bool is_table_acid,
       TSchemaResolutionStrategy::type schema_resolution);
 
+  Status Init();
+
   /// Resolve SchemaPath into orc::Type (ORC column representation)
   /// 'pos_field' is set to true if 'col_path' reference the index field of an 
array
   /// column. '*node' will be the array node if 'pos_field' is set to true.
@@ -114,7 +120,7 @@ class OrcSchemaResolver {
   /// Generates field ids for the columns in the same order as Iceberg. The 
traversal is
   /// preorder, but the assigned field IDs are not in that order. When a node 
is
   /// processed, its child nodes are assigned an ID, hence the difference.
-  void GenerateFieldIDs();
+  Status GenerateFieldIDs();
 
   inline int GetGeneratedFieldID(const orc::Type* type) const;
 
@@ -124,6 +130,7 @@ class OrcSchemaResolver {
   void DetermineFullAcidSchema();
 
   const HdfsTableDescriptor& tbl_desc_;
+  const FileMetadataUtils& file_metadata_utils_;
   const orc::Type* const root_;
   const char* const filename_ = nullptr;
   const bool is_table_full_acid_;
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc 
b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index 208d0e14a..ad9d0c3ed 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -214,6 +214,7 @@ Status HdfsParquetScanner::Open(ScannerContext* context) {
 
   // Parse the file schema into an internal representation for schema 
resolution.
   schema_resolver_.reset(new ParquetSchemaResolver(*scan_node_->hdfs_table(),
+      file_metadata_utils_,
       state_->query_options().parquet_fallback_schema_resolution,
       state_->query_options().parquet_array_resolution));
   RETURN_IF_ERROR(schema_resolver_->Init(&file_metadata_, filename()));
diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc 
b/be/src/exec/parquet/parquet-metadata-utils.cc
index 2d4ade20d..db6bedfc2 100644
--- a/be/src/exec/parquet/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet/parquet-metadata-utils.cc
@@ -28,6 +28,7 @@
 
 #include "common/logging.h"
 #include "common/status.h"
+#include "exec/file-metadata-utils.h"
 #include "exec/parquet/parquet-column-stats.h"
 #include "exec/parquet/parquet-common.h"
 #include "runtime/runtime-state.h"
@@ -979,7 +980,7 @@ Status ParquetSchemaResolver::ValidateScalarNode(const 
SchemaNode& node,
   return Status::OK();
 }
 
-void ParquetSchemaResolver::GenerateFieldIDs() {
+Status ParquetSchemaResolver::GenerateFieldIDs() {
   std::stack<SchemaNode*> nodes;
 
   nodes.push(&schema_);
@@ -1017,7 +1018,15 @@ void ParquetSchemaResolver::GenerateFieldIDs() {
         nodes.push(&current_child);
       }
     }
+    if (current == &schema_ && !nodes.empty()) {
+      // Partition columns are not stored in file metadata, but they get field 
IDs
+      // from Iceberg. Check if there are partition columns and adjust field ID
+      // generation. It is only relevant for tables that have complex types.
+      RETURN_IF_ERROR(
+          
file_metadata_utils_.AdjustFieldIdForMigratedPartitionedTables(&fieldID));
+    }
   }
+  return Status::OK();
 }
 
 int ParquetSchemaResolver::GetGeneratedFieldID(SchemaNode* node) const {
diff --git a/be/src/exec/parquet/parquet-metadata-utils.h 
b/be/src/exec/parquet/parquet-metadata-utils.h
index 74ab9e730..5c4bd5817 100644
--- a/be/src/exec/parquet/parquet-metadata-utils.h
+++ b/be/src/exec/parquet/parquet-metadata-utils.h
@@ -25,6 +25,7 @@
 
 namespace impala {
 
+class FileMetadataUtils;
 class RuntimeState;
 class TQueryOptions;
 
@@ -140,9 +141,11 @@ struct SchemaNode {
 class ParquetSchemaResolver {
  public:
   ParquetSchemaResolver(const HdfsTableDescriptor& tbl_desc,
+      const FileMetadataUtils& file_metadata_utils,
       TSchemaResolutionStrategy::type fallback_schema_resolution,
       TParquetArrayResolution::type array_resolution)
     : tbl_desc_(tbl_desc),
+      file_metadata_utils_(file_metadata_utils),
       fallback_schema_resolution_(fallback_schema_resolution),
       array_resolution_(array_resolution),
       filename_(NULL) {}
@@ -160,7 +163,9 @@ class ParquetSchemaResolver {
       fallback_schema_resolution_ = TSchemaResolutionStrategy::type::FIELD_ID;
 
       // schema[0] is the 'root', schema[1] is the first column.
-      if (schema.size() > 1 && !schema[1].__isset.field_id) GenerateFieldIDs();
+      if (schema.size() > 1 && !schema[1].__isset.field_id) {
+        RETURN_IF_ERROR(GenerateFieldIDs());
+      }
     }
     return status;
   }
@@ -252,11 +257,12 @@ class ParquetSchemaResolver {
   /// Generates field ids for the columns in the same order as Iceberg. The 
traversal is
   /// preorder, but the assigned field IDs are not. When a node is visited, 
its child
   /// nodes are assigned an ID, hence the difference.
-  void GenerateFieldIDs();
+  Status GenerateFieldIDs();
 
   inline int GetGeneratedFieldID(SchemaNode* node) const;
 
   const HdfsTableDescriptor& tbl_desc_;
+  const FileMetadataUtils& file_metadata_utils_;
   TSchemaResolutionStrategy::type fallback_schema_resolution_;
   const TParquetArrayResolution::type array_resolution_;
   const char* filename_;
diff --git a/testdata/migrated_iceberg/README b/testdata/migrated_iceberg/README
new file mode 100644
index 000000000..b0b32526c
--- /dev/null
+++ b/testdata/migrated_iceberg/README
@@ -0,0 +1,18 @@
+iceberg_migrated_complextypes_and_partition_columns_in_data_files.parquet:
+iceberg_migrated_complextypes_and_partition_columns_in_data_files.orc:
+
+The data file in result_date=2024-08-26 was originally part of a 
non-partitioned
+legacy table, i.e. it includes the column result_date and does not have Iceberg
+field IDs.
+
+They were written via Hive using the following commands:
+
+CREATE TABLE array_struct_table_test_negative (id INT, name STRING, teststeps
+    ARRAY<STRUCT<step_number:INT,step_description:STRING>>, result_date STRING)
+STORED AS PARQUET;
+
+INSERT INTO array_struct_table_test_negative VALUES
+(1, 'Test 1', `ARRAY`(NAMED_STRUCT('step_number', 1, 'step_description', 'Step 
1 description'), NAMED_STRUCT('step_number', 2, 'step_description', 'Step 2 
description')), '2024-08-26'),
+(2, 'Test 2', `ARRAY`(NAMED_STRUCT('step_number', 1, 'step_description', 'Step 
1 description'), NAMED_STRUCT('step_number', 2, 'step_description', 'Step 2 
description'), NAMED_STRUCT('step_number', 3, 'step_description', 'Step 3 
description')), '2024-08-26');
+
+Same for ORC.
diff --git 
a/testdata/migrated_iceberg/complextypes_and_partition_columns_in_data_files.orc
 
b/testdata/migrated_iceberg/complextypes_and_partition_columns_in_data_files.orc
new file mode 100644
index 000000000..bf6b59a7b
Binary files /dev/null and 
b/testdata/migrated_iceberg/complextypes_and_partition_columns_in_data_files.orc
 differ
diff --git 
a/testdata/migrated_iceberg/complextypes_and_partition_columns_in_data_files.parquet
 
b/testdata/migrated_iceberg/complextypes_and_partition_columns_in_data_files.parquet
new file mode 100644
index 000000000..e21f1748f
Binary files /dev/null and 
b/testdata/migrated_iceberg/complextypes_and_partition_columns_in_data_files.parquet
 differ
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution-complex.test
 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution-complex.test
new file mode 100644
index 000000000..a184a1bb3
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution-complex.test
@@ -0,0 +1,327 @@
+====
+---- HIVE_QUERY
+USE $DATABASE;
+CREATE TABLE array_struct_table_test (id INT, name STRING,
+    teststeps ARRAY<STRUCT<step_number:INT,step_description:STRING>>)
+PARTITIONED BY (result_date STRING)
+STORED AS PARQUET;
+
+INSERT INTO array_struct_table_test VALUES
+(1, 'Test 1', `ARRAY`(NAMED_STRUCT('step_number', 1, 'step_description', 'Step 
1 description'),
+                      NAMED_STRUCT('step_number', 2, 'step_description', 'Step 
2 description')),
+    '2024-08-26'),
+(2, 'Test 2', `ARRAY`(NAMED_STRUCT('step_number', 1, 'step_description', 'Step 
1 description'),
+                      NAMED_STRUCT('step_number', 2, 'step_description', 'Step 
2 description'),
+                      NAMED_STRUCT('step_number', 3, 'step_description', 'Step 
3 description')),
+    '2024-08-26');
+-- Create similar table with ORC.
+CREATE TABLE array_struct_table_test_orc (id INT, name STRING,
+    teststeps ARRAY<STRUCT<step_number:INT,step_description:STRING>>)
+PARTITIONED BY (result_date STRING)
+STORED AS ORC;
+INSERT INTO array_struct_table_test_orc SELECT * FROM array_struct_table_test;
+====
+---- QUERY
+invalidate metadata array_struct_table_test;
+alter table array_struct_table_test convert to iceberg;
+select teststeps.step_description, teststeps.step_number from 
array_struct_table_test m, m.teststeps;
+---- RESULTS
+'Step 1 description',1
+'Step 2 description',2
+'Step 1 description',1
+'Step 2 description',2
+'Step 3 description',3
+---- TYPES
+STRING, INT
+====
+---- QUERY
+select * from array_struct_table_test;
+---- RESULTS
+1,'Test 1','2024-08-26'
+2,'Test 2','2024-08-26'
+---- TYPES
+INT, STRING, STRING
+====
+---- QUERY
+set expand_complex_types=true;
+select * from array_struct_table_test;
+---- RESULTS
+1,'Test 1','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"}]','2024-08-26'
+2,'Test 2','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"},{"step_number":3,"step_description":"Step 3 
description"}]','2024-08-26'
+---- TYPES
+INT, STRING, STRING, STRING
+====
+---- QUERY
+alter table array_struct_table_test set partition spec (id, name);
+====
+---- QUERY
+select teststeps.step_description, teststeps.step_number from 
array_struct_table_test m, m.teststeps;
+---- RESULTS
+'Step 1 description',1
+'Step 2 description',2
+'Step 1 description',1
+'Step 2 description',2
+'Step 3 description',3
+---- TYPES
+STRING, INT
+====
+---- QUERY
+select * from array_struct_table_test;
+---- RESULTS
+1,'Test 1','2024-08-26'
+2,'Test 2','2024-08-26'
+---- TYPES
+INT, STRING, STRING
+====
+---- QUERY
+set expand_complex_types=true;
+select * from array_struct_table_test;
+---- RESULTS
+1,'Test 1','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"}]','2024-08-26'
+2,'Test 2','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"},{"step_number":3,"step_description":"Step 3 
description"}]','2024-08-26'
+---- TYPES
+INT, STRING, STRING, STRING
+====
+---- HIVE_QUERY
+USE $DATABASE;
+INSERT INTO array_struct_table_test VALUES
+(3, 'Test 3', `ARRAY`(NAMED_STRUCT('step_number', 1, 'step_description', 'Step 
1 description'),
+                      NAMED_STRUCT('step_number', 2, 'step_description', 'Step 
2 description')),
+    '2024-08-27'),
+(4, 'Test 4', `ARRAY`(NAMED_STRUCT('step_number', 1, 'step_description', 'Step 
1 description'),
+                      NAMED_STRUCT('step_number', 2, 'step_description', 'Step 
2 description'),
+                      NAMED_STRUCT('step_number', 3, 'step_description', 'Step 
3 description')),
+    '2024-08-27');
+====
+---- QUERY
+refresh array_struct_table_test;
+select m.result_date, teststeps.step_description, teststeps.step_number from 
array_struct_table_test m, m.teststeps;
+---- RESULTS
+'2024-08-27','Step 1 description',1
+'2024-08-27','Step 2 description',2
+'2024-08-26','Step 1 description',1
+'2024-08-26','Step 2 description',2
+'2024-08-26','Step 1 description',1
+'2024-08-26','Step 2 description',2
+'2024-08-26','Step 3 description',3
+'2024-08-27','Step 1 description',1
+'2024-08-27','Step 2 description',2
+'2024-08-27','Step 3 description',3
+---- TYPES
+STRING, STRING, INT
+====
+---- QUERY
+select * from array_struct_table_test;
+---- RESULTS
+3,'Test 3','2024-08-27'
+1,'Test 1','2024-08-26'
+2,'Test 2','2024-08-26'
+4,'Test 4','2024-08-27'
+---- TYPES
+INT, STRING, STRING
+====
+---- QUERY
+set expand_complex_types=true;
+select * from array_struct_table_test;
+---- RESULTS
+4,'Test 4','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"},{"step_number":3,"step_description":"Step 3 
description"}]','2024-08-27'
+3,'Test 3','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"}]','2024-08-27'
+1,'Test 1','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"}]','2024-08-26'
+2,'Test 2','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"},{"step_number":3,"step_description":"Step 3 
description"}]','2024-08-26'
+---- TYPES
+INT, STRING, STRING, STRING
+====
+---- QUERY
+invalidate metadata array_struct_table_test_orc;
+alter table array_struct_table_test_orc convert to iceberg;
+select teststeps.step_description, teststeps.step_number from 
array_struct_table_test_orc m, m.teststeps;
+---- RESULTS
+'Step 1 description',1
+'Step 2 description',2
+'Step 1 description',1
+'Step 2 description',2
+'Step 3 description',3
+---- TYPES
+STRING, INT
+====
+---- QUERY
+select * from array_struct_table_test_orc;
+---- RESULTS
+1,'Test 1','2024-08-26'
+2,'Test 2','2024-08-26'
+---- TYPES
+INT, STRING, STRING
+====
+---- QUERY
+set expand_complex_types=true;
+select * from array_struct_table_test_orc;
+---- RESULTS
+1,'Test 1','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"}]','2024-08-26'
+2,'Test 2','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"},{"step_number":3,"step_description":"Step 3 
description"}]','2024-08-26'
+---- TYPES
+INT, STRING, STRING, STRING
+====
+---- QUERY
+alter table array_struct_table_test_orc set partition spec (id, name);
+====
+---- QUERY
+select teststeps.step_description, teststeps.step_number from 
array_struct_table_test_orc m, m.teststeps;
+---- RESULTS
+'Step 1 description',1
+'Step 2 description',2
+'Step 1 description',1
+'Step 2 description',2
+'Step 3 description',3
+---- TYPES
+STRING, INT
+====
+---- QUERY
+select * from array_struct_table_test_orc;
+---- RESULTS
+1,'Test 1','2024-08-26'
+2,'Test 2','2024-08-26'
+---- TYPES
+INT, STRING, STRING
+====
+---- QUERY
+set expand_complex_types=true;
+select * from array_struct_table_test_orc;
+---- RESULTS
+1,'Test 1','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"}]','2024-08-26'
+2,'Test 2','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"},{"step_number":3,"step_description":"Step 3 
description"}]','2024-08-26'
+---- TYPES
+INT, STRING, STRING, STRING
+====
+---- HIVE_QUERY
+USE $DATABASE;
+INSERT INTO array_struct_table_test_orc VALUES
+(3, 'Test 3', `ARRAY`(NAMED_STRUCT('step_number', 1, 'step_description', 'Step 
1 description'),
+                      NAMED_STRUCT('step_number', 2, 'step_description', 'Step 
2 description')),
+    '2024-08-27'),
+(4, 'Test 4', `ARRAY`(NAMED_STRUCT('step_number', 1, 'step_description', 'Step 
1 description'),
+                      NAMED_STRUCT('step_number', 2, 'step_description', 'Step 
2 description'),
+                      NAMED_STRUCT('step_number', 3, 'step_description', 'Step 
3 description')),
+    '2024-08-27');
+====
+---- QUERY
+refresh array_struct_table_test_orc;
+select m.result_date, teststeps.step_description, teststeps.step_number from 
array_struct_table_test_orc m, m.teststeps;
+---- RESULTS
+'2024-08-27','Step 1 description',1
+'2024-08-27','Step 2 description',2
+'2024-08-26','Step 1 description',1
+'2024-08-26','Step 2 description',2
+'2024-08-26','Step 1 description',1
+'2024-08-26','Step 2 description',2
+'2024-08-26','Step 3 description',3
+'2024-08-27','Step 1 description',1
+'2024-08-27','Step 2 description',2
+'2024-08-27','Step 3 description',3
+---- TYPES
+STRING, STRING, INT
+====
+---- QUERY
+select * from array_struct_table_test_orc;
+---- RESULTS
+4,'Test 4','2024-08-27'
+1,'Test 1','2024-08-26'
+2,'Test 2','2024-08-26'
+3,'Test 3','2024-08-27'
+---- TYPES
+INT, STRING, STRING
+====
+---- QUERY
+set expand_complex_types=true;
+select * from array_struct_table_test_orc;
+---- RESULTS
+4,'Test 4','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"},{"step_number":3,"step_description":"Step 3 
description"}]','2024-08-27'
+3,'Test 3','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"}]','2024-08-27'
+1,'Test 1','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"}]','2024-08-26'
+2,'Test 2','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"},{"step_number":3,"step_description":"Step 3 
description"}]','2024-08-26'
+---- TYPES
+INT, STRING, STRING, STRING
+====
+---- QUERY
+select * from $DATABASE.array_struct_table_test.`partitions`;
+---- RESULTS
+'{"result_date":null,"id":4,"name":"Test 4"}',1,1,1,0,0,0,0
+'{"result_date":"2024-08-26","id":null,"name":null}',0,2,1,0,0,0,0
+'{"result_date":null,"id":3,"name":"Test 3"}',1,1,1,0,0,0,0
+---- TYPES
+STRING, INT, BIGINT, INT, BIGINT, INT, BIGINT, INT
+====
+---- QUERY
+select * from $DATABASE.array_struct_table_test_orc.`partitions`;
+---- RESULTS
+'{"result_date":null,"id":4,"name":"Test 4"}',1,1,1,0,0,0,0
+'{"result_date":"2024-08-26","id":null,"name":null}',0,2,1,0,0,0,0
+'{"result_date":null,"id":3,"name":"Test 3"}',1,1,1,0,0,0,0
+---- TYPES
+STRING, INT, BIGINT, INT, BIGINT, INT, BIGINT, INT
+====
+---- QUERY
+select teststeps.step_description, teststeps.step_number from 
all_part_cols_stored_parquet m, m.teststeps;
+---- RESULTS
+'Step 1 description',1
+'Step 2 description',2
+'Step 1 description',1
+'Step 2 description',2
+'Step 3 description',3
+---- TYPES
+STRING, INT
+====
+---- QUERY
+select * from all_part_cols_stored_parquet;
+---- RESULTS
+1,'Test 1','2024-08-26'
+2,'Test 2','2024-08-26'
+---- TYPES
+INT, STRING, STRING
+====
+---- QUERY
+set expand_complex_types=true;
+select * from all_part_cols_stored_parquet;
+---- RESULTS
+1,'Test 1','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"}]','2024-08-26'
+2,'Test 2','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"},{"step_number":3,"step_description":"Step 3 
description"}]','2024-08-26'
+---- TYPES
+INT, STRING, STRING, STRING
+====
+---- QUERY
+select * from not_all_part_cols_stored_parquet;
+---- CATCH
+has unexpected schema or partitioning
+====
+---- QUERY
+select teststeps.step_description, teststeps.step_number from 
all_part_cols_stored_orc m, m.teststeps;
+---- RESULTS
+'Step 1 description',1
+'Step 2 description',2
+'Step 1 description',1
+'Step 2 description',2
+'Step 3 description',3
+---- TYPES
+STRING, INT
+====
+---- QUERY
+select * from all_part_cols_stored_orc;
+---- RESULTS
+1,'Test 1','2024-08-26'
+2,'Test 2','2024-08-26'
+---- TYPES
+INT, STRING, STRING
+====
+---- QUERY
+set expand_complex_types=true;
+select * from all_part_cols_stored_orc;
+---- RESULTS
+1,'Test 1','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"}]','2024-08-26'
+2,'Test 2','[{"step_number":1,"step_description":"Step 1 
description"},{"step_number":2,"step_description":"Step 2 
description"},{"step_number":3,"step_description":"Step 3 
description"}]','2024-08-26'
+---- TYPES
+INT, STRING, STRING, STRING
+====
+---- QUERY
+select * from not_all_part_cols_stored_orc;
+---- CATCH
+has unexpected schema or partitioning
+====
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index 86a602659..4dab443e8 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -45,7 +45,7 @@ from tests.common.file_utils import (
   create_iceberg_table_from_directory,
   create_table_from_parquet)
 from tests.shell.util import run_impala_shell_cmd
-from tests.util.filesystem_utils import get_fs_path, IS_HDFS
+from tests.util.filesystem_utils import get_fs_path, IS_HDFS, WAREHOUSE
 from tests.util.get_parquet_metadata import get_parquet_metadata
 from tests.util.iceberg_util import cast_ts, quote, get_snapshots, IcebergCatalogs
 
@@ -271,6 +271,63 @@ class TestIcebergTable(IcebergTestSuite):
     self.run_test_case('QueryTest/iceberg-migrated-table-field-id-resolution',
                        vector, unique_database)
 
+  @SkipIfFS.hive
+  def test_migrated_table_field_id_resolution_complex(self, vector, unique_database):
+    def get_table_loc(tbl_name):
+      return '%s/%s.db/%s/' % (WAREHOUSE, unique_database, tbl_name)
+
+    def create_table(tbl_name, file_format, partition_cols):
+      self.execute_query("""CREATE TABLE IF NOT EXISTS {}.{} (
+            id INT,
+            name STRING,
+            teststeps array<struct<step_number:int,step_description:string>>)
+          PARTITIONED BY ({})
+          STORED AS {}
+          """.format(unique_database, tbl_name, partition_cols, file_format))
+
+    def add_file_to_table_partition(tbl_name, part_dir, local_filename):
+      tbl_loc = get_table_loc(tbl_name)
+      part_dir = os.path.join(tbl_loc, part_dir)
+      self.filesystem_client.make_dir(part_dir)
+      data_file_path = os.path.join(os.environ['IMPALA_HOME'], "testdata",
+          "migrated_iceberg", local_filename)
+      self.filesystem_client.copy_from_local(data_file_path, part_dir)
+
+    def finalize_table(tbl_name):
+      self.execute_query("ALTER TABLE {}.{} RECOVER PARTITIONS".format(
+          unique_database, tbl_name))
+      self.execute_query("ALTER TABLE {}.{} CONVERT TO ICEBERG".format(
+          unique_database, tbl_name))
+
+    def prepare_test_table(tbl_name, file_format, partition_cols, part_dir, datafile):
+      create_table(tbl_name, file_format, partition_cols)
+      add_file_to_table_partition(tbl_name, part_dir, datafile)
+      finalize_table(tbl_name)
+
+    prepare_test_table('all_part_cols_stored_parquet',
+        "PARQUET",
+        "result_date STRING",
+        "result_date=2024-08-26",
+        "complextypes_and_partition_columns_in_data_files.parquet")
+    prepare_test_table('not_all_part_cols_stored_parquet',
+        "PARQUET",
+        "result_date STRING, p INT",
+        "result_date=2024-08-26/p=3",
+        "complextypes_and_partition_columns_in_data_files.parquet")
+    prepare_test_table('all_part_cols_stored_orc',
+        "ORC",
+        "result_date STRING",
+        "result_date=2024-08-26",
+        "complextypes_and_partition_columns_in_data_files.orc")
+    prepare_test_table('not_all_part_cols_stored_orc',
+        "ORC",
+        "result_date STRING, p INT",
+        "result_date=2024-08-26/p=3",
+        "complextypes_and_partition_columns_in_data_files.orc")
+
+    self.run_test_case('QueryTest/iceberg-migrated-table-field-id-resolution-complex',
+                       vector, unique_database)
+
   def test_describe_history(self, vector, unique_database):
     self.run_test_case('QueryTest/iceberg-table-history', vector, use_db=unique_database)
 


Reply via email to