pitrou commented on code in PR #3148: URL: https://github.com/apache/parquet-java/pull/3148#discussion_r1944913520
########## parquet-column/src/main/java/org/apache/parquet/column/page/DataPageV2.java: ########## @@ -163,6 +170,33 @@ public DataPageV2( this.isCompressed = isCompressed; } + public DataPageV2( + int rowCount, + int nullCount, + int valueCount, + BytesInput repetitionLevels, + BytesInput definitionLevels, + Encoding dataEncoding, + BytesInput data, + int compressedSize, + int uncompressedSize, + Statistics<?> statistics, + boolean isCompressed) { + super(compressedSize, uncompressedSize, valueCount); + if (!isCompressed && compressedSize != 0) { + throw new IllegalArgumentException("compressedSize must be 0 if page is not compressed"); + } Review Comment: In Parquet C++, compressed_size is always written out. ########## parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java: ########## @@ -2123,6 +2123,9 @@ private PageHeader newDataPageV2Header( int dlByteLength) { DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2( valueCount, nullCount, rowCount, getEncoding(dataEncoding), dlByteLength, rlByteLength); + if (compressedSize == 0) { + dataPageHeaderV2.setIs_compressed(false); + } Review Comment: I think you should take an explicit `bool isCompressed` parameter instead. ########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java: ########## @@ -1150,8 +1150,11 @@ public void writeDataPageV2( int rlByteLength = toIntWithCheck(repetitionLevels.size(), "page repetition levels"); int dlByteLength = toIntWithCheck(definitionLevels.size(), "page definition levels"); - int compressedSize = - toIntWithCheck(compressedData.size() + repetitionLevels.size() + definitionLevels.size(), "page"); + int compressedSize = 0; + if (compressedData.size() > 0) { + compressedSize = + toIntWithCheck(compressedData.size() + repetitionLevels.size() + definitionLevels.size(), "page"); + } Review Comment: So this class is duplicating code from `ColumnChunkPageWriteStore.java` above? Do you know why that is? 
########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java: ########## @@ -295,14 +295,19 @@ public void writePageV2( int rlByteLength = toIntWithCheck(repetitionLevels.size()); int dlByteLength = toIntWithCheck(definitionLevels.size()); int uncompressedSize = toIntWithCheck(data.size() + repetitionLevels.size() + definitionLevels.size()); - // TODO: decide if we compress - BytesInput compressedData = compressor.compress(data); - if (null != pageBlockEncryptor) { - AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal); - compressedData = BytesInput.from(pageBlockEncryptor.encrypt(compressedData.toByteArray(), dataPageAAD)); + BytesInput compressedData = BytesInput.empty(); + int compressedSize = 0; + if (data.size() > 0) { + // TODO: decide if we compress + compressedData = compressor.compress(data); + if (null != pageBlockEncryptor) { + AesCipher.quickUpdatePageAAD(dataPageAAD, pageOrdinal); + compressedData = + BytesInput.from(pageBlockEncryptor.encrypt(compressedData.toByteArray(), dataPageAAD)); + } + compressedSize = + toIntWithCheck(compressedData.size() + repetitionLevels.size() + definitionLevels.size()); } Review Comment: So encryption wouldn't be handled properly if the uncompressed size is 0? Also, `compressedSize` should always contain the total size of the data page (excluding header, but including encryption). ########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java: ########## @@ -1969,6 +1969,7 @@ public ColumnChunkPageReader readAllPages( definitionLevels, converter.getEncoding(dataHeaderV2.getEncoding()), values, + dataHeaderV2.isIs_compressed() ? compressedPageSize : 0, Review Comment: Why this condition? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscribe@parquet.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@parquet.apache.org For additional commands, e-mail: issues-help@parquet.apache.org