This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3586292 ARROW-9424: [C++][Parquet] Disable writing files with LZ4
codec
3586292 is described below
commit 3586292d62c8c348e9fb85676eb524cde53179cf
Author: Wes McKinney <[email protected]>
AuthorDate: Tue Jul 14 21:39:47 2020 -0500
ARROW-9424: [C++][Parquet] Disable writing files with LZ4 codec
Due to ongoing LZ4 problems with Parquet files, this patch disables writing
files with LZ4 codec by throwing a `ParquetException`.
In progress: adding exceptions for pyarrow when using LZ4 to write files
and updating relevant pytests
Mailing list discussion:
https://mail-archives.apache.org/mod_mbox/arrow-dev/202007.mbox/%3CCAJPUwMCM4ZaJB720%2BuoM1aSA2oD9jSEnzuwWjJiw6vwXxHk7nw%40mail.gmail.com%3E
Jira ticket: https://issues.apache.org/jira/browse/ARROW-9424
Closes #7757 from patrickpai/ARROW-9424
Lead-authored-by: Wes McKinney <[email protected]>
Co-authored-by: Patrick Pai <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
---
cpp/src/parquet/column_reader.cc | 2 +-
cpp/src/parquet/column_writer.cc | 2 +-
cpp/src/parquet/column_writer_test.cc | 10 ++++++----
cpp/src/parquet/file_deserialize_test.cc | 5 ++---
cpp/src/parquet/file_serialize_test.cc | 2 +-
cpp/src/parquet/thrift_internal.h | 1 +
cpp/src/parquet/types.cc | 33 ++++++++++++++++++++++++++++----
cpp/src/parquet/types.h | 9 +++++++++
python/pyarrow/tests/test_parquet.py | 16 ++++++++++++++--
9 files changed, 64 insertions(+), 16 deletions(-)
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 0bfc303..bc462ad 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -182,7 +182,7 @@ class SerializedPageReader : public PageReader {
InitDecryption();
}
max_page_header_size_ = kDefaultMaxPageHeaderSize;
- decompressor_ = GetCodec(codec);
+ decompressor_ = internal::GetReadCodec(codec);
}
// Implement the PageReader interface
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 13f91e3..f9cf37c 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -172,7 +172,7 @@ class SerializedPageWriter : public PageWriter {
if (data_encryptor_ != nullptr || meta_encryptor_ != nullptr) {
InitEncryption();
}
- compressor_ = GetCodec(codec, compression_level);
+ compressor_ = internal::GetWriteCodec(codec, compression_level);
thrift_serializer_.reset(new ThriftSerializer);
}
diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc
index 23554aa..a92d4d2 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -488,13 +488,15 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndGzipCompression) {
#ifdef ARROW_WITH_LZ4
TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithLz4Compression) {
-  this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, false,
-                                 LARGE_SIZE);
+  ASSERT_THROW(this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false,
+                                              false, LARGE_SIZE),
+               ParquetException);
}
TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndLz4Compression) {
-  this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false, true,
-                                 LARGE_SIZE);
+  ASSERT_THROW(this->TestRequiredWithSettings(Encoding::PLAIN, Compression::LZ4, false,
+                                              true, LARGE_SIZE),
+               ParquetException);
}
#endif
diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc
index 3fe2230..1dd3492 100644
--- a/cpp/src/parquet/file_deserialize_test.cc
+++ b/cpp/src/parquet/file_deserialize_test.cc
@@ -249,9 +249,8 @@ TEST_F(TestPageSerde, Compression) {
codec_types.push_back(Compression::GZIP);
#endif
-#ifdef ARROW_WITH_LZ4
- codec_types.push_back(Compression::LZ4);
-#endif
+  // TODO: Add LZ4 compression type after PARQUET-1878 is complete.
+  // Testing for deserializing LZ4 is hard without writing enabled, so it is not included.
#ifdef ARROW_WITH_ZSTD
codec_types.push_back(Compression::ZSTD);
diff --git a/cpp/src/parquet/file_serialize_test.cc b/cpp/src/parquet/file_serialize_test.cc
index c5c4df2..72d7d6f 100644
--- a/cpp/src/parquet/file_serialize_test.cc
+++ b/cpp/src/parquet/file_serialize_test.cc
@@ -309,7 +309,7 @@ TYPED_TEST(TestSerialize, SmallFileGzip) {
#ifdef ARROW_WITH_LZ4
TYPED_TEST(TestSerialize, SmallFileLz4) {
- ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::LZ4));
+ ASSERT_THROW(this->FileSerializeTest(Compression::LZ4), ParquetException);
}
#endif
diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h
index 103dc650..e1959e7 100644
--- a/cpp/src/parquet/thrift_internal.h
+++ b/cpp/src/parquet/thrift_internal.h
@@ -101,6 +101,7 @@ static inline Compression::type FromThriftUnsafe(format::CompressionCodec::type
case format::CompressionCodec::BROTLI:
return Compression::BROTLI;
case format::CompressionCodec::LZ4:
+ // ARROW-9424: Existing files use LZ4_RAW but this may need to change
return Compression::LZ4;
case format::CompressionCodec::ZSTD:
return Compression::ZSTD;
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index 7e22da4..1b4bb4a 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -49,12 +49,19 @@ bool IsCodecSupported(Compression::type codec) {
}
}
-std::unique_ptr<Codec> GetCodec(Compression::type codec) {
- return GetCodec(codec, Codec::UseDefaultCompressionLevel());
-}
+namespace internal {
-std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) {
+std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level,
+                                bool for_writing) {
std::unique_ptr<Codec> result;
+  if (for_writing && (codec == Compression::LZ4 || codec == Compression::LZ4_FRAME)) {
+ throw ParquetException(
+ "Per ARROW-9424, writing files with LZ4 compression has been "
+ "disabled until implementation issues have been resolved. "
+ "It is recommended to read any existing files and rewrite them "
+ "using a different compression.");
+ }
+
if (!IsCodecSupported(codec)) {
std::stringstream ss;
ss << "Codec type " << Codec::GetCodecAsString(codec)
@@ -66,6 +73,24 @@ std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level)
return result;
}
+std::unique_ptr<Codec> GetReadCodec(Compression::type codec) {
+  return GetCodec(codec, Codec::UseDefaultCompressionLevel(), /*for_writing=*/false);
+}
+
+std::unique_ptr<Codec> GetWriteCodec(Compression::type codec, int compression_level) {
+ return GetCodec(codec, compression_level, /*for_writing=*/true);
+}
+
+} // namespace internal
+
+std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) {
+ return internal::GetCodec(codec, compression_level, /*for_writing=*/false);
+}
+
+std::unique_ptr<Codec> GetCodec(Compression::type codec) {
+ return GetCodec(codec, Codec::UseDefaultCompressionLevel());
+}
+
std::string FormatStatValue(Type::type parquet_type,
::arrow::util::string_view val) {
std::stringstream result;
diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h
index a1586b6..f7f7bb7 100644
--- a/cpp/src/parquet/types.h
+++ b/cpp/src/parquet/types.h
@@ -465,6 +465,15 @@ struct Encoding {
PARQUET_EXPORT
bool IsCodecSupported(Compression::type codec);
+namespace internal {
+
+// ARROW-9424: Separate functions for reading and writing so we can disable LZ4
+// on writing
+std::unique_ptr<Codec> GetReadCodec(Compression::type codec);
+std::unique_ptr<Codec> GetWriteCodec(Compression::type codec, int compression_level);
+
+} // namespace internal
+
PARQUET_EXPORT
std::unique_ptr<Codec> GetCodec(Compression::type codec);
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 410eee1..aea3c07 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -698,6 +698,10 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
tm.assert_frame_equal(df, df_read)
+# ARROW-9424: LZ4 support is currently disabled
+SUPPORTED_COMPRESSIONS = ['NONE', 'SNAPPY', 'GZIP', 'ZSTD']
+
+
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
@@ -735,7 +739,7 @@ def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
df_read = table_read.to_pandas()
tm.assert_frame_equal(df, df_read)
- for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']:
+ for compression in SUPPORTED_COMPRESSIONS:
if (compression != 'NONE' and
not pa.lib.Codec.is_available(compression)):
continue
@@ -747,6 +751,13 @@ def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
tm.assert_frame_equal(df, df_read)
+# ARROW-9424: LZ4 support is currently disabled
+def test_lz4_compression_disabled():
+ table = pa.table([pa.array([1, 2, 3, 4, 5])], names=['f0'])
+ with pytest.raises(IOError):
+ pq.write_table(table, pa.BufferOutputStream(), compression='lz4')
+
+
def make_sample_file(table_or_df):
if isinstance(table_or_df, pa.Table):
a_table = table_or_df
@@ -828,8 +839,9 @@ def test_compression_level(use_legacy_dataset):
# level.
# GZIP (zlib) allows for specifying a compression level but as of up
# to version 1.2.11 the valid range is [-1, 9].
- invalid_combinations = [("snappy", 4), ("lz4", 5), ("gzip", -1337),
+ invalid_combinations = [("snappy", 4), ("gzip", -1337),
("None", 444), ("lzo", 14)]
+ # ARROW-9424: lz4 is disabled for now ("lz4", 5),
buf = io.BytesIO()
for (codec, level) in invalid_combinations:
with pytest.raises((ValueError, OSError)):