This is an automated email from the ASF dual-hosted git repository. wzhou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 0d01f5e82967b84b182d40de1ce213367e0b6181 Author: Michael Smith <[email protected]> AuthorDate: Thu May 2 15:08:31 2024 -0700 IMPALA-13053: Update test to use ORC files Updates test_max_nesting_depth to use precreated ORC files like the Parquet version to reduce runtime rather than using Hive to generate ORC from the Parquet files. This reduces each test run by almost 3 minutes. Change-Id: I2f5bdbb86af0e651d189217a18882d5eda1098d5 Reviewed-on: http://gerrit.cloudera.org:8080/21391 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- testdata/max_nesting_depth/README | 8 ++--- testdata/max_nesting_depth/orc/int_array/file.orc | Bin 0 -> 2882 bytes testdata/max_nesting_depth/orc/int_map/file.orc | Bin 0 -> 9864 bytes testdata/max_nesting_depth/orc/struct/file.orc | Bin 0 -> 1651 bytes .../max_nesting_depth/orc/struct_array/file.orc | Bin 0 -> 2344 bytes testdata/max_nesting_depth/orc/struct_map/file.orc | Bin 0 -> 5870 bytes .../{ => parquet}/int_array/file.parq | Bin .../{ => parquet}/int_map/file.parq | Bin .../{ => parquet}/struct/file.parq | Bin .../{ => parquet}/struct_array/file.parq | Bin .../{ => parquet}/struct_map/file.parq | Bin tests/query_test/test_nested_types.py | 33 ++++++++------------- 12 files changed, 17 insertions(+), 24 deletions(-) diff --git a/testdata/max_nesting_depth/README b/testdata/max_nesting_depth/README index a44fd42cb..7dff46361 100644 --- a/testdata/max_nesting_depth/README +++ b/testdata/max_nesting_depth/README @@ -1,7 +1,7 @@ -This folder contains Parquet files for testing the maximum nesting depth -of complex types. The maximum nesting depth Types.MAX_NESTING_DEPTH (100). -All Parquet files contain the integer 42 as the only data value in a single -Parquet column. +This folder contains Parquet and ORC files for testing the maximum nesting +depth of complex types. The maximum nesting depth Types.MAX_NESTING_DEPTH +(100). All files contain the integer 42 as the only data value in a single +column. The folders contain files for the following complex types with nesting depth of exactly 100. diff --git a/testdata/max_nesting_depth/orc/int_array/file.orc b/testdata/max_nesting_depth/orc/int_array/file.orc new file mode 100644 index 000000000..fb03d4ba6 Binary files /dev/null and b/testdata/max_nesting_depth/orc/int_array/file.orc differ diff --git a/testdata/max_nesting_depth/orc/int_map/file.orc b/testdata/max_nesting_depth/orc/int_map/file.orc new file mode 100644 index 000000000..c13581fad Binary files /dev/null and b/testdata/max_nesting_depth/orc/int_map/file.orc differ diff --git a/testdata/max_nesting_depth/orc/struct/file.orc b/testdata/max_nesting_depth/orc/struct/file.orc new file mode 100644 index 000000000..046d486ba Binary files /dev/null and b/testdata/max_nesting_depth/orc/struct/file.orc differ diff --git a/testdata/max_nesting_depth/orc/struct_array/file.orc b/testdata/max_nesting_depth/orc/struct_array/file.orc new file mode 100644 index 000000000..f7b729dfd Binary files /dev/null and b/testdata/max_nesting_depth/orc/struct_array/file.orc differ diff --git a/testdata/max_nesting_depth/orc/struct_map/file.orc b/testdata/max_nesting_depth/orc/struct_map/file.orc new file mode 100644 index 000000000..1fbe6113e Binary files /dev/null and b/testdata/max_nesting_depth/orc/struct_map/file.orc differ diff --git a/testdata/max_nesting_depth/int_array/file.parq b/testdata/max_nesting_depth/parquet/int_array/file.parq similarity index 100% rename from testdata/max_nesting_depth/int_array/file.parq rename to testdata/max_nesting_depth/parquet/int_array/file.parq diff --git a/testdata/max_nesting_depth/int_map/file.parq b/testdata/max_nesting_depth/parquet/int_map/file.parq similarity index 100% rename from testdata/max_nesting_depth/int_map/file.parq rename to testdata/max_nesting_depth/parquet/int_map/file.parq diff --git a/testdata/max_nesting_depth/struct/file.parq b/testdata/max_nesting_depth/parquet/struct/file.parq similarity index 100% rename from testdata/max_nesting_depth/struct/file.parq rename to testdata/max_nesting_depth/parquet/struct/file.parq diff --git a/testdata/max_nesting_depth/struct_array/file.parq b/testdata/max_nesting_depth/parquet/struct_array/file.parq similarity index 100% rename from testdata/max_nesting_depth/struct_array/file.parq rename to testdata/max_nesting_depth/parquet/struct_array/file.parq diff --git a/testdata/max_nesting_depth/struct_map/file.parq b/testdata/max_nesting_depth/parquet/struct_map/file.parq similarity index 100% rename from testdata/max_nesting_depth/struct_map/file.parq rename to testdata/max_nesting_depth/parquet/struct_map/file.parq diff --git a/tests/query_test/test_nested_types.py b/tests/query_test/test_nested_types.py index 12179701e..a21224bcf 100644 --- a/tests/query_test/test_nested_types.py +++ b/tests/query_test/test_nested_types.py @@ -848,7 +848,6 @@ class TestMaxNestingDepth(ImpalaTestSuite): # Should be kept in sync with the FE's Type.MAX_NESTING_DEPTH MAX_NESTING_DEPTH = 100 TABLES = ['struct', 'int_array', 'struct_array', 'int_map', 'struct_map'] - TEMP_TABLE_SUFFIX = '_parquet' @classmethod def get_workload(self): @@ -866,41 +865,35 @@ class TestMaxNestingDepth(ImpalaTestSuite): """Tests that Impala can scan Parquet and ORC files having complex types of the maximum nesting depth.""" file_format = vector.get_value('table_format').file_format - if file_format == 'orc' and not IS_HDFS: - pytest.skip('Orc table loading needs Hive and thus only works with HDFS.') - if file_format == 'parquet': self.__create_parquet_tables(unique_database) elif file_format == 'orc': self.__create_orc_tables(unique_database) self.run_test_case('QueryTest/max-nesting-depth', vector, unique_database) - def __create_parquet_tables(self, unique_database, as_target=True): - """Create Parquet tables from files. If 'as_target' is False, the Parquet tables will - be used to create ORC tables, so we add a suffix in the table names.""" + def __create_parquet_tables(self, unique_database): + """Create Parquet tables from files.""" self.filesystem_client.copy_from_local( - "%s/testdata/max_nesting_depth" % os.environ['IMPALA_HOME'], + "%s/testdata/max_nesting_depth/parquet" % os.environ['IMPALA_HOME'], "%s/%s.db/" % (WAREHOUSE, unique_database)) - tbl_suffix = '' if as_target else self.TEMP_TABLE_SUFFIX for tbl in self.TABLES: - tbl_name = "%s.%s_tbl%s" % (unique_database, tbl, tbl_suffix) - tbl_location = "%s/%s.db/max_nesting_depth/%s/" % (WAREHOUSE, unique_database, tbl) + tbl_name = "%s.%s_tbl" % (unique_database, tbl) + tbl_location = "%s/%s.db/parquet/%s/" % (WAREHOUSE, unique_database, tbl) create_table = "CREATE EXTERNAL TABLE %s LIKE PARQUET '%s' STORED AS PARQUET" \ " location '%s'" % (tbl_name, tbl_location + 'file.parq', tbl_location) self.client.execute(create_table) def __create_orc_tables(self, unique_database): - # Creating ORC tables from ORC files (IMPALA-8046) has not been supported. - # We create the Parquet tables first and then transform them into ORC tables. - self.__create_parquet_tables(unique_database, False) + """Create ORC tables from files.""" + self.filesystem_client.copy_from_local( + "%s/testdata/max_nesting_depth/orc" % os.environ['IMPALA_HOME'], + "%s/%s.db/" % (WAREHOUSE, unique_database)) for tbl in self.TABLES: tbl_name = "%s.%s_tbl" % (unique_database, tbl) - from_tbl_name = tbl_name + self.TEMP_TABLE_SUFFIX - create_table = "CREATE TABLE %s LIKE %s STORED AS ORC" % (tbl_name, from_tbl_name) - insert_table = "INSERT INTO %s SELECT * FROM %s" % (tbl_name, from_tbl_name) - self.run_stmt_in_hive(create_table) - self.run_stmt_in_hive(insert_table) - self.client.execute("INVALIDATE METADATA %s" % tbl_name) + tbl_location = "%s/%s.db/orc/%s/" % (WAREHOUSE, unique_database, tbl) + create_table = "CREATE EXTERNAL TABLE %s LIKE ORC '%s' STORED AS ORC" \ + " location '%s'" % (tbl_name, tbl_location + 'file.orc', tbl_location) + self.client.execute(create_table) @SkipIfFS.hive def test_load_hive_table(self, vector, unique_database):
