(impala) 02/02: IMPALA-13053: Update test to use ORC files

wzhou Sat, 04 May 2024 12:12:28 -0700

This is an automated email from the ASF dual-hosted git repository.

wzhou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 0d01f5e82967b84b182d40de1ce213367e0b6181
Author: Michael Smith <[email protected]>
AuthorDate: Thu May 2 15:08:31 2024 -0700

    IMPALA-13053: Update test to use ORC files
    
    Updates test_max_nesting_depth to use precreated ORC files like the
    Parquet version to reduce runtime rather than using Hive to generate ORC
    from the Parquet files. This reduces each test run by almost 3 minutes.
    
    Change-Id: I2f5bdbb86af0e651d189217a18882d5eda1098d5
    Reviewed-on: http://gerrit.cloudera.org:8080/21391
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 testdata/max_nesting_depth/README                  |   8 ++---
 testdata/max_nesting_depth/orc/int_array/file.orc  | Bin 0 -> 2882 bytes
 testdata/max_nesting_depth/orc/int_map/file.orc    | Bin 0 -> 9864 bytes
 testdata/max_nesting_depth/orc/struct/file.orc     | Bin 0 -> 1651 bytes
 .../max_nesting_depth/orc/struct_array/file.orc    | Bin 0 -> 2344 bytes
 testdata/max_nesting_depth/orc/struct_map/file.orc | Bin 0 -> 5870 bytes
 .../{ => parquet}/int_array/file.parq              | Bin
 .../{ => parquet}/int_map/file.parq                | Bin
 .../{ => parquet}/struct/file.parq                 | Bin
 .../{ => parquet}/struct_array/file.parq           | Bin
 .../{ => parquet}/struct_map/file.parq             | Bin
 tests/query_test/test_nested_types.py              |  33 ++++++++-------------
 12 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/testdata/max_nesting_depth/README b/testdata/max_nesting_depth/README
index a44fd42cb..7dff46361 100644
--- a/testdata/max_nesting_depth/README
+++ b/testdata/max_nesting_depth/README
@@ -1,7 +1,7 @@
-This folder contains Parquet files for testing the maximum nesting depth
-of complex types. The maximum nesting depth Types.MAX_NESTING_DEPTH (100).
-All Parquet files contain the integer 42 as the only data value in a single
-Parquet column.
+This folder contains Parquet and ORC files for testing the maximum nesting
+depth of complex types. The maximum nesting depth is Types.MAX_NESTING_DEPTH
+(100). All files contain the integer 42 as the only data value in a single
+column.
 
 The folders contain files for the following complex types with nesting
 depth of exactly 100.
diff --git a/testdata/max_nesting_depth/orc/int_array/file.orc b/testdata/max_nesting_depth/orc/int_array/file.orc
new file mode 100644
index 000000000..fb03d4ba6
Binary files /dev/null and b/testdata/max_nesting_depth/orc/int_array/file.orc differ
diff --git a/testdata/max_nesting_depth/orc/int_map/file.orc b/testdata/max_nesting_depth/orc/int_map/file.orc
new file mode 100644
index 000000000..c13581fad
Binary files /dev/null and b/testdata/max_nesting_depth/orc/int_map/file.orc differ
diff --git a/testdata/max_nesting_depth/orc/struct/file.orc b/testdata/max_nesting_depth/orc/struct/file.orc
new file mode 100644
index 000000000..046d486ba
Binary files /dev/null and b/testdata/max_nesting_depth/orc/struct/file.orc differ
diff --git a/testdata/max_nesting_depth/orc/struct_array/file.orc b/testdata/max_nesting_depth/orc/struct_array/file.orc
new file mode 100644
index 000000000..f7b729dfd
Binary files /dev/null and b/testdata/max_nesting_depth/orc/struct_array/file.orc differ
diff --git a/testdata/max_nesting_depth/orc/struct_map/file.orc b/testdata/max_nesting_depth/orc/struct_map/file.orc
new file mode 100644
index 000000000..1fbe6113e
Binary files /dev/null and b/testdata/max_nesting_depth/orc/struct_map/file.orc differ
diff --git a/testdata/max_nesting_depth/int_array/file.parq b/testdata/max_nesting_depth/parquet/int_array/file.parq
similarity index 100%
rename from testdata/max_nesting_depth/int_array/file.parq
rename to testdata/max_nesting_depth/parquet/int_array/file.parq
diff --git a/testdata/max_nesting_depth/int_map/file.parq b/testdata/max_nesting_depth/parquet/int_map/file.parq
similarity index 100%
rename from testdata/max_nesting_depth/int_map/file.parq
rename to testdata/max_nesting_depth/parquet/int_map/file.parq
diff --git a/testdata/max_nesting_depth/struct/file.parq b/testdata/max_nesting_depth/parquet/struct/file.parq
similarity index 100%
rename from testdata/max_nesting_depth/struct/file.parq
rename to testdata/max_nesting_depth/parquet/struct/file.parq
diff --git a/testdata/max_nesting_depth/struct_array/file.parq b/testdata/max_nesting_depth/parquet/struct_array/file.parq
similarity index 100%
rename from testdata/max_nesting_depth/struct_array/file.parq
rename to testdata/max_nesting_depth/parquet/struct_array/file.parq
diff --git a/testdata/max_nesting_depth/struct_map/file.parq b/testdata/max_nesting_depth/parquet/struct_map/file.parq
similarity index 100%
rename from testdata/max_nesting_depth/struct_map/file.parq
rename to testdata/max_nesting_depth/parquet/struct_map/file.parq
diff --git a/tests/query_test/test_nested_types.py b/tests/query_test/test_nested_types.py
index 12179701e..a21224bcf 100644
--- a/tests/query_test/test_nested_types.py
+++ b/tests/query_test/test_nested_types.py
@@ -848,7 +848,6 @@ class TestMaxNestingDepth(ImpalaTestSuite):
   # Should be kept in sync with the FE's Type.MAX_NESTING_DEPTH
   MAX_NESTING_DEPTH = 100
   TABLES = ['struct', 'int_array', 'struct_array', 'int_map', 'struct_map']
-  TEMP_TABLE_SUFFIX = '_parquet'
 
   @classmethod
   def get_workload(self):
@@ -866,41 +865,35 @@ class TestMaxNestingDepth(ImpalaTestSuite):
     """Tests that Impala can scan Parquet and ORC files having complex types of
     the maximum nesting depth."""
     file_format = vector.get_value('table_format').file_format
-    if file_format == 'orc' and not IS_HDFS:
-      pytest.skip('Orc table loading needs Hive and thus only works with HDFS.')
-
     if file_format == 'parquet':
       self.__create_parquet_tables(unique_database)
     elif file_format == 'orc':
       self.__create_orc_tables(unique_database)
     self.run_test_case('QueryTest/max-nesting-depth', vector, unique_database)
 
-  def __create_parquet_tables(self, unique_database, as_target=True):
-    """Create Parquet tables from files. If 'as_target' is False, the Parquet tables will
-     be used to create ORC tables, so we add a suffix in the table names."""
+  def __create_parquet_tables(self, unique_database):
+    """Create Parquet tables from files."""
     self.filesystem_client.copy_from_local(
-      "%s/testdata/max_nesting_depth" % os.environ['IMPALA_HOME'],
+      "%s/testdata/max_nesting_depth/parquet" % os.environ['IMPALA_HOME'],
       "%s/%s.db/" % (WAREHOUSE, unique_database))
-    tbl_suffix = '' if as_target else self.TEMP_TABLE_SUFFIX
     for tbl in self.TABLES:
-      tbl_name = "%s.%s_tbl%s" % (unique_database, tbl, tbl_suffix)
-      tbl_location = "%s/%s.db/max_nesting_depth/%s/" % (WAREHOUSE, unique_database, tbl)
+      tbl_name = "%s.%s_tbl" % (unique_database, tbl)
+      tbl_location = "%s/%s.db/parquet/%s/" % (WAREHOUSE, unique_database, tbl)
       create_table = "CREATE EXTERNAL TABLE %s LIKE PARQUET '%s' STORED AS PARQUET" \
           " location '%s'" % (tbl_name, tbl_location + 'file.parq', tbl_location)
       self.client.execute(create_table)
 
   def __create_orc_tables(self, unique_database):
-    # Creating ORC tables from ORC files (IMPALA-8046) has not been supported.
-    # We create the Parquet tables first and then transform them into ORC tables.
-    self.__create_parquet_tables(unique_database, False)
+    """Create ORC tables from files."""
+    self.filesystem_client.copy_from_local(
+      "%s/testdata/max_nesting_depth/orc" % os.environ['IMPALA_HOME'],
+      "%s/%s.db/" % (WAREHOUSE, unique_database))
     for tbl in self.TABLES:
       tbl_name = "%s.%s_tbl" % (unique_database, tbl)
-      from_tbl_name = tbl_name + self.TEMP_TABLE_SUFFIX
-      create_table = "CREATE TABLE %s LIKE %s STORED AS ORC" % (tbl_name, from_tbl_name)
-      insert_table = "INSERT INTO %s SELECT * FROM %s" % (tbl_name, from_tbl_name)
-      self.run_stmt_in_hive(create_table)
-      self.run_stmt_in_hive(insert_table)
-      self.client.execute("INVALIDATE METADATA %s" % tbl_name)
+      tbl_location = "%s/%s.db/orc/%s/" % (WAREHOUSE, unique_database, tbl)
+      create_table = "CREATE EXTERNAL TABLE %s LIKE ORC '%s' STORED AS ORC" \
+          " location '%s'" % (tbl_name, tbl_location + 'file.orc', tbl_location)
+      self.client.execute(create_table)
 
   @SkipIfFS.hive
   def test_load_hive_table(self, vector, unique_database):

(impala) 02/02: IMPALA-13053: Update test to use ORC files

Reply via email to