(impala) 02/03: IMPALA-13211: Add negative test for Parquet Byte Stream Split encoding

michaelsmith Thu, 02 Jan 2025 14:45:59 -0800

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 8aea57fc77c14dccb8fcd227eaa52b0cbe3fe077
Author: Gabriella Gyorgyevics <[email protected]>
AuthorDate: Tue Nov 26 15:15:33 2024 +0100

    IMPALA-13211: Add negative test for Parquet Byte Stream Split encoding
    
    This change adds EE tests in test_parquet_byte_stream_split_encoding.py
    that check that Impala returns the correct error message when it
    encounters a table that contains a parquet file with Byte Stream Split
    encoding.
    
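    To regenerate the test files, run the parquet_files_generator.py
    script (located in the testdata/parquet_byte_stream_split_encoding/
    folder) from the repository root; it writes its output to that folder
    using a path relative to the current working directory.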
    
    Change-Id: If5eff8bf51fe246a9d0250e38c470b821fec75d9
    Reviewed-on: http://gerrit.cloudera.org:8080/22124
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 testdata/parquet_byte_stream_split_encoding/README |   6 +++
 .../doubles_byte_stream_split.parquet              | Bin 0 -> 481 bytes
 .../floats_byte_stream_split.parquet               | Bin 0 -> 451 bytes
 .../parquet_files_generator.py                     |  40 +++++++++++++++
 .../test_parquet_byte_stream_split_encoding.py     |  57 +++++++++++++++++++++
 5 files changed, 103 insertions(+)

diff --git a/testdata/parquet_byte_stream_split_encoding/README 
b/testdata/parquet_byte_stream_split_encoding/README
new file mode 100644
index 000000000..8c6b336aa
--- /dev/null
+++ b/testdata/parquet_byte_stream_split_encoding/README
@@ -0,0 +1,6 @@
+The doubles_byte_stream_split.parquet and floats_byte_stream_split.parquet 
files were
+generated with the parquet_files_generator.py script. The script is using 
PyArrow
+(https://arrow.apache.org/docs/python).
+
+To regenerate the files, run from the repository root:
+python3 testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py
\ No newline at end of file
diff --git 
a/testdata/parquet_byte_stream_split_encoding/doubles_byte_stream_split.parquet 
b/testdata/parquet_byte_stream_split_encoding/doubles_byte_stream_split.parquet
new file mode 100644
index 000000000..82df206bd
Binary files /dev/null and 
b/testdata/parquet_byte_stream_split_encoding/doubles_byte_stream_split.parquet 
differ
diff --git 
a/testdata/parquet_byte_stream_split_encoding/floats_byte_stream_split.parquet 
b/testdata/parquet_byte_stream_split_encoding/floats_byte_stream_split.parquet
new file mode 100644
index 000000000..6cb361c84
Binary files /dev/null and 
b/testdata/parquet_byte_stream_split_encoding/floats_byte_stream_split.parquet 
differ
diff --git 
a/testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py 
b/testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py
new file mode 100644
index 000000000..7d4c9a5f6
--- /dev/null
+++ b/testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import
+
+import os
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+test_file_dir = "testdata/parquet_byte_stream_split_encoding"
+
+nums_to_encode = [1.45, 4.256, 6.3573, 4.235, 7.5198463, 10.57956, 100.68491,
+    0.54987623514, 1.0]
+
+floats = pa.array(nums_to_encode, type=pa.float32())
+floats_table = pa.table([floats], names=["floats"])
+pq.write_table(floats_table, os.path.join(test_file_dir,
+    'floats_byte_stream_split.parquet'), use_dictionary=False,
+    use_byte_stream_split=True)
+
+doubles = pa.array(nums_to_encode, type=pa.float64())
+doubles_table = pa.table([doubles], names=["doubles"])
+pq.write_table(doubles_table, os.path.join(test_file_dir,
+    'doubles_byte_stream_split.parquet'), use_dictionary=False,
+    use_byte_stream_split=True)
diff --git a/tests/query_test/test_parquet_byte_stream_split_encoding.py 
b/tests/query_test/test_parquet_byte_stream_split_encoding.py
new file mode 100644
index 000000000..7bf8aec58
--- /dev/null
+++ b/tests/query_test/test_parquet_byte_stream_split_encoding.py
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import
+
+import os
+
+from tests.common.file_utils import create_table_and_copy_files
+from tests.common.impala_test_suite import ImpalaTestSuite
+
+
+class TestParquetEncodings(ImpalaTestSuite):
+
+  TEST_FILE_DIRECTORY = "testdata/parquet_byte_stream_split_encoding"
+
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestParquetEncodings, cls).add_test_dimensions()
+    cls.ImpalaTestMatrix.add_constraint(
+        lambda v: v.get_value('table_format').file_format == 'parquet')
+
+  def test_parquet_byte_stream_split_encoding_float(self, vector, 
unique_database):
+    self._parquet_byte_stream_split_encoding_helper(vector, unique_database, 
"float",
+        os.path.join(self.TEST_FILE_DIRECTORY, 
"floats_byte_stream_split.parquet"))
+
+  def test_parquet_byte_stream_split_encoding_double(self, vector, 
unique_database):
+    self._parquet_byte_stream_split_encoding_helper(vector, unique_database, 
"double",
+        os.path.join(self.TEST_FILE_DIRECTORY, 
"doubles_byte_stream_split.parquet"))
+
+  def _parquet_byte_stream_split_encoding_helper(self, vector, 
unique_database, col_type,
+      filename):
+    table_name = "parquet_byte_stream_split_negative_test"
+    create_stmt = "create table {}.{} (numbers {}) stored as parquet".format(
+        unique_database, table_name, col_type)
+    create_table_and_copy_files(self.client, create_stmt, unique_database, 
table_name,
+                                [filename])
+    query_stmt = "select * from {}.{}".format(unique_database, table_name)
+    result = self.execute_query_expect_failure(self.client, query_stmt)
+    assert "unsupported encoding: BYTE_STREAM_SPLIT" in str(result)

(impala) 02/03: IMPALA-13211: Add negative test for Parquet Byte Stream Split encoding

Reply via email to