This is an automated email from the ASF dual-hosted git repository. michaelsmith pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 8aea57fc77c14dccb8fcd227eaa52b0cbe3fe077 Author: Gabriella Gyorgyevics <[email protected]> AuthorDate: Tue Nov 26 15:15:33 2024 +0100 IMPALA-13211: Add negative test for Parquet Byte Stream Split encoding This change adds EE tests in test_parquet_byte_stream_split_encoding.py that check that Impala returns the correct error message when it encounters a table that contains a parquet file with Byte Stream Split encoding. To regenerate the test files, run the parquet_files_generator.py script in the testdata/parquet_byte_stream_split_encoding/ folder. Change-Id: If5eff8bf51fe246a9d0250e38c470b821fec75d9 Reviewed-on: http://gerrit.cloudera.org:8080/22124 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- testdata/parquet_byte_stream_split_encoding/README | 6 +++ .../doubles_byte_stream_split.parquet | Bin 0 -> 481 bytes .../floats_byte_stream_split.parquet | Bin 0 -> 451 bytes .../parquet_files_generator.py | 40 +++++++++++++++ .../test_parquet_byte_stream_split_encoding.py | 57 +++++++++++++++++++++ 5 files changed, 103 insertions(+) diff --git a/testdata/parquet_byte_stream_split_encoding/README b/testdata/parquet_byte_stream_split_encoding/README new file mode 100644 index 000000000..8c6b336aa --- /dev/null +++ b/testdata/parquet_byte_stream_split_encoding/README @@ -0,0 +1,6 @@ +The doubles_byte_stream_split.parquet and floats_byte_stream_split.parquet files were +generated with the parquet_files_generator.py script. The script is using PyArrow +(https://arrow.apache.org/docs/python). + +To regenerate the files, run: +python3 parquet_files_generator.py \ No newline at end of file diff --git a/testdata/parquet_byte_stream_split_encoding/doubles_byte_stream_split.parquet b/testdata/parquet_byte_stream_split_encoding/doubles_byte_stream_split.parquet new file mode 100644 index 000000000..82df206bd Binary files /dev/null and b/testdata/parquet_byte_stream_split_encoding/doubles_byte_stream_split.parquet differ diff --git a/testdata/parquet_byte_stream_split_encoding/floats_byte_stream_split.parquet b/testdata/parquet_byte_stream_split_encoding/floats_byte_stream_split.parquet new file mode 100644 index 000000000..6cb361c84 Binary files /dev/null and b/testdata/parquet_byte_stream_split_encoding/floats_byte_stream_split.parquet differ diff --git a/testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py b/testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py new file mode 100644 index 000000000..7d4c9a5f6 --- /dev/null +++ b/testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +import os + +import pyarrow as pa +import pyarrow.parquet as pq + +test_file_dir = "testdata/parquet_byte_stream_split_encoding" + +nums_to_encode = [1.45, 4.256, 6.3573, 4.235, 7.5198463, 10.57956, 100.68491, + 0.54987623514, 1.0] + +floats = pa.array(nums_to_encode, type=pa.float32()) +floats_table = pa.table([floats], names=["floats"]) +pq.write_table(floats_table, os.path.join(test_file_dir, + 'floats_byte_stream_split.parquet'), use_dictionary=False, + use_byte_stream_split=True) + +doubles = pa.array(nums_to_encode, type=pa.float64()) +doubles_table = pa.table([doubles], names=["doubles"]) +pq.write_table(doubles_table, os.path.join(test_file_dir, + 'doubles_byte_stream_split.parquet'), use_dictionary=False, + use_byte_stream_split=True) diff --git a/tests/query_test/test_parquet_byte_stream_split_encoding.py b/tests/query_test/test_parquet_byte_stream_split_encoding.py new file mode 100644 index 000000000..7bf8aec58 --- /dev/null +++ b/tests/query_test/test_parquet_byte_stream_split_encoding.py @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +import os + +from tests.common.file_utils import create_table_and_copy_files +from tests.common.impala_test_suite import ImpalaTestSuite + + +class TestParquetEncodings(ImpalaTestSuite): + + TEST_FILE_DIRECTORY = "testdata/parquet_byte_stream_split_encoding" + + @classmethod + def get_workload(cls): + return 'functional-query' + + @classmethod + def add_test_dimensions(cls): + super(TestParquetEncodings, cls).add_test_dimensions() + cls.ImpalaTestMatrix.add_constraint( + lambda v: v.get_value('table_format').file_format == 'parquet') + + def test_parquet_byte_stream_split_encoding_float(self, vector, unique_database): + self._parquet_byte_stream_split_encoding_helper(vector, unique_database, "float", + os.path.join(self.TEST_FILE_DIRECTORY, "floats_byte_stream_split.parquet")) + + def test_parquet_byte_stream_split_encoding_double(self, vector, unique_database): + self._parquet_byte_stream_split_encoding_helper(vector, unique_database, "double", + os.path.join(self.TEST_FILE_DIRECTORY, "doubles_byte_stream_split.parquet")) + + def _parquet_byte_stream_split_encoding_helper(self, vector, unique_database, col_type, + filename): + table_name = "parquet_byte_stream_split_negative_test" + create_stmt = "create table {}.{} (numbers {}) stored as parquet".format( + unique_database, table_name, col_type) + create_table_and_copy_files(self.client, create_stmt, unique_database, table_name, + [filename]) + query_stmt = "select * from {}.{}".format(unique_database, table_name) + result = self.execute_query_expect_failure(self.client, query_stmt) + assert "unsupported encoding: BYTE_STREAM_SPLIT" in str(result)
