This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 3ecf1cab23 GH-48334: [C++][Parquet] Support reading encrypted bloom filters (#49334)
3ecf1cab23 is described below
commit 3ecf1cab23e0b34400e63974b8d16c0acc5e6c5c
Author: fenfeng9 <[email protected]>
AuthorDate: Fri Mar 27 23:54:20 2026 +0800
GH-48334: [C++][Parquet] Support reading encrypted bloom filters (#49334)
### Rationale for this change
Reading bloom filters from encrypted Parquet files previously raised an
exception. This change implements encrypted bloom filter deserialization by
decrypting the Thrift header (module id 8) and bitset (module id 9) separately,
and adds the necessary validation and tests.
### What changes are included in this PR?
- Wire metadata decryptor creation into the bloom filter reader
- Add BlockSplitBloomFilter::DeserializeEncrypted(...) for encrypted bloom
filters
- Remove the fuzzer workaround that swallowed encrypted bloom filter
exceptions
### Are these changes tested?
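Yes. A new test, `encryption/bloom_filter_encryption_test.cc`, reads the bloom filters of an encrypted parquet-testing file and probes them for values known to be present.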
### Are there any user-facing changes?
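Yes. Bloom filters in encrypted Parquet files can now be read instead of raising an exception, and `BlockSplitBloomFilter::DeserializeEncrypted(...)` is added to `bloom_filter.h`.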
* GitHub Issue: #48334
Authored-by: fenfeng9 <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
cpp/src/parquet/CMakeLists.txt | 1 +
cpp/src/parquet/arrow/fuzz_internal.cc | 48 +++---
cpp/src/parquet/bloom_filter.cc | 162 ++++++++++++++++++++-
cpp/src/parquet/bloom_filter.h | 19 +++
cpp/src/parquet/bloom_filter_reader.cc | 60 ++++++--
.../encryption/bloom_filter_encryption_test.cc | 94 ++++++++++++
cpp/submodules/parquet-testing | 2 +-
7 files changed, 336 insertions(+), 50 deletions(-)
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index b707b21e60..07cf9f9c50 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -416,6 +416,7 @@ add_parquet_test(arrow-metadata-test SOURCES
arrow/arrow_metadata_test.cc
if(PARQUET_REQUIRE_ENCRYPTION)
add_parquet_test(encryption-test
SOURCES
+ encryption/bloom_filter_encryption_test.cc
encryption/encryption_internal_test.cc
encryption/write_configurations_test.cc
encryption/read_configurations_test.cc
diff --git a/cpp/src/parquet/arrow/fuzz_internal.cc b/cpp/src/parquet/arrow/fuzz_internal.cc
index 384749dd48..dfbb8ae161 100644
--- a/cpp/src/parquet/arrow/fuzz_internal.cc
+++ b/cpp/src/parquet/arrow/fuzz_internal.cc
@@ -155,6 +155,23 @@ Status FuzzReadPageIndex(RowGroupPageIndexReader* reader,
const SchemaDescriptor
return st;
}
+// Fuzz the bloom filter of one column: fetch it from the row group reader and,
+// when one is returned, probe it with 100 hashes drawn from `hash_dist`.
+// Exceptions raised between the BEGIN/END_PARQUET_CATCH_EXCEPTIONS macros are
+// handled by those macros (defined outside this patch); otherwise the
+// default-constructed Status is returned.
+Status FuzzReadBloomFilter(RowGroupBloomFilterReader* reader, int column,
+                           std::uniform_int_distribution<uint64_t>& hash_dist,
+                           std::default_random_engine& rng) {
+  Status result;
+  BEGIN_PARQUET_CATCH_EXCEPTIONS
+  const std::unique_ptr<BloomFilter> filter = reader->GetColumnBloomFilter(column);
+  // A null filter means the column has no bloom filter; there is nothing to probe.
+  if (filter) {
+    int remaining = 100;
+    while (remaining-- > 0) {
+      filter->FindHash(hash_dist(rng));
+    }
+  }
+  END_PARQUET_CATCH_EXCEPTIONS
+  return result;
+}
+
ReaderProperties MakeFuzzReaderProperties(MemoryPool* pool) {
FileDecryptionProperties::Builder builder;
builder.key_retriever(MakeKeyRetriever());
@@ -207,31 +224,12 @@ Status FuzzReader(const uint8_t* data, int64_t size) {
}
{
// Read and decode bloom filters
- try {
- auto& bloom_reader = pq_file_reader->GetBloomFilterReader();
- std::uniform_int_distribution<uint64_t> hash_dist;
- for (int i = 0; i < num_row_groups; ++i) {
- auto bloom_rg = bloom_reader.RowGroup(i);
- for (int j = 0; j < num_columns; ++j) {
- std::unique_ptr<BloomFilter> bloom;
- bloom = bloom_rg->GetColumnBloomFilter(j);
- // If the column has a bloom filter, find a bunch of random hashes
- if (bloom != nullptr) {
- for (int k = 0; k < 100; ++k) {
- bloom->FindHash(hash_dist(rng));
- }
- }
- }
- }
- } catch (const ParquetException& exc) {
- // XXX we just want to ignore encrypted bloom filters and validate the
- // rest of the file; there is no better way of doing this until
GH-46597
- // is done.
- // (also see GH-48334 for reading encrypted bloom filters)
- if (std::string_view(exc.what())
- .find("BloomFilter decryption is not yet supported") ==
- std::string_view::npos) {
- throw;
+ auto& bloom_reader = pq_file_reader->GetBloomFilterReader();
+ std::uniform_int_distribution<uint64_t> hash_dist;
+ for (int i = 0; i < num_row_groups; ++i) {
+ auto bloom_rg = bloom_reader.RowGroup(i);
+ for (int j = 0; j < num_columns; ++j) {
+ st &= FuzzReadBloomFilter(bloom_rg.get(), j, hash_dist, rng);
}
}
}
diff --git a/cpp/src/parquet/bloom_filter.cc b/cpp/src/parquet/bloom_filter.cc
index e8011b5fc8..577d26fe00 100644
--- a/cpp/src/parquet/bloom_filter.cc
+++ b/cpp/src/parquet/bloom_filter.cc
@@ -17,8 +17,10 @@
#include <cstdint>
#include <cstring>
+#include <limits>
#include <memory>
+#include "arrow/io/memory.h"
#include "arrow/result.h"
#include "arrow/util/logging_internal.h"
#include "arrow/util/macros.h"
@@ -26,11 +28,41 @@
#include "generated/parquet_types.h"
#include "parquet/bloom_filter.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_decryptor.h"
#include "parquet/exception.h"
#include "parquet/thrift_internal.h"
#include "parquet/xxhasher.h"
namespace parquet {
+namespace {
+
+constexpr int32_t kCiphertextLengthSize = 4;
+
+/// Parse the 4-byte little-endian length prefix and return the total ciphertext size,
+/// including the 4-byte length field itself.
+///
+/// @param data Buffer that starts with the length prefix.
+/// @param length Number of readable bytes in `data`.
+/// @return The prefix value plus kCiphertextLengthSize, as a 64-bit integer.
+/// @throws ParquetException if `length` is smaller than kCiphertextLengthSize.
+int64_t ParseCiphertextTotalLength(const uint8_t* data, int64_t length) {
+  if (length < kCiphertextLengthSize) {
+    throw ParquetException("Ciphertext length buffer is too small");
+  }
+  // Assemble the value byte by byte so the result does not depend on host endianness.
+  const uint32_t buffer_size =
+      (static_cast<uint32_t>(data[3]) << 24) | (static_cast<uint32_t>(data[2]) << 16) |
+      (static_cast<uint32_t>(data[1]) << 8) | (static_cast<uint32_t>(data[0]));
+  // Widen to 64 bits before adding so a prefix near UINT32_MAX cannot wrap around.
+  return static_cast<int64_t>(buffer_size) + kCiphertextLengthSize;
+}
+
+/// Throw a ParquetException when a read returned fewer bytes than were requested.
+///
+/// @param expected Number of bytes that were requested.
+/// @param actual Number of bytes that were actually obtained.
+/// @param context Prefix of the error message naming what was being read.
+void CheckBloomFilterShortRead(int64_t expected, int64_t actual,
+                               std::string_view context) {
+  const bool short_read = actual < expected;
+  if (!ARROW_PREDICT_FALSE(short_read)) {
+    return;
+  }
+  std::stringstream message;
+  message << context << " read failed: expected " << expected << " bytes, got "
+          << actual;
+  throw ParquetException(message.str());
+}
+
+} // namespace
+
constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
BlockSplitBloomFilter::BlockSplitBloomFilter(::arrow::MemoryPool* pool)
@@ -75,10 +107,11 @@ void BlockSplitBloomFilter::Init(const uint8_t* bitset,
uint32_t num_bytes) {
this->hasher_ = std::make_unique<XxHasher>();
}
-static constexpr uint32_t kBloomFilterHeaderSizeGuess = 256;
+namespace {
+
+constexpr uint32_t kBloomFilterHeaderSizeGuess = 256;
-static ::arrow::Status ValidateBloomFilterHeader(
- const format::BloomFilterHeader& header) {
+::arrow::Status ValidateBloomFilterHeader(const format::BloomFilterHeader&
header) {
if (!header.algorithm.__isset.BLOCK) {
return ::arrow::Status::Invalid(
"Unsupported Bloom filter algorithm: ", header.algorithm, ".");
@@ -104,6 +137,122 @@ static ::arrow::Status ValidateBloomFilterHeader(
return ::arrow::Status::OK();
}
+/// Deserialize an encrypted Bloom filter from `input`.
+///
+/// The stream is expected to hold a length-prefixed encrypted Thrift header followed
+/// by the encrypted bitset. The header is decrypted with the kBloomFilterHeader module
+/// AAD and the bitset with the kBloomFilterBitset module AAD, using the same
+/// `decryptor`.
+///
+/// @param properties Reader properties (memory pool, Thrift limits).
+/// @param input Stream positioned at the start of the encrypted Bloom filter.
+/// @param bloom_filter_length Total serialized length if known; when set it must equal
+/// the header ciphertext length plus the bitset ciphertext length.
+/// @param decryptor Decryptor to use; the caller guarantees it is non-null.
+/// @param row_group_ordinal Row group ordinal passed to UpdateDecryptor.
+/// @param column_ordinal Column ordinal passed to UpdateDecryptor.
+/// @throws ParquetException on short reads, length mismatches, an invalid header, or
+/// a failed decryption.
+BlockSplitBloomFilter DeserializeEncryptedFromStream(
+    const ReaderProperties& properties, ArrowInputStream* input,
+    std::optional<int64_t> bloom_filter_length, Decryptor* decryptor,
+    int16_t row_group_ordinal, int16_t column_ordinal) {
+  ThriftDeserializer deserializer(properties);
+  format::BloomFilterHeader header;
+
+  // Read the length-prefixed ciphertext for the header.
+  PARQUET_ASSIGN_OR_THROW(auto length_buf, input->Read(kCiphertextLengthSize));
+  CheckBloomFilterShortRead(kCiphertextLengthSize, length_buf->size(),
+                            "Bloom filter header length");
+
+  const int64_t header_cipher_total_len =
+      ParseCiphertextTotalLength(length_buf->data(), length_buf->size());
+  // The length is narrowed to uint32_t below, so reject anything beyond int32 range.
+  if (ARROW_PREDICT_FALSE(header_cipher_total_len >
+                          std::numeric_limits<int32_t>::max())) {
+    throw ParquetException("Bloom filter header ciphertext length overflows int32");
+  }
+  if (bloom_filter_length && header_cipher_total_len > *bloom_filter_length) {
+    throw ParquetException(
+        "Bloom filter length less than encrypted bloom filter header length");
+  }
+
+  // Read the full header ciphertext and decrypt the Thrift header.
+  auto header_cipher_buf =
+      AllocateBuffer(properties.memory_pool(), header_cipher_total_len);
+  // The length prefix is part of the ciphertext handed to the deserializer, so copy
+  // it back in front of the bytes that are still to be read.
+  std::memcpy(header_cipher_buf->mutable_data(), length_buf->data(),
+              kCiphertextLengthSize);
+  const int64_t header_cipher_remaining = header_cipher_total_len - kCiphertextLengthSize;
+  PARQUET_ASSIGN_OR_THROW(auto read_size, input->Read(header_cipher_remaining,
+                                                      header_cipher_buf->mutable_data() +
+                                                          kCiphertextLengthSize));
+  CheckBloomFilterShortRead(header_cipher_remaining, read_size, "Bloom filter header");
+
+  // Bloom filter header and bitset are separate encrypted modules with different AADs.
+  UpdateDecryptor(decryptor, row_group_ordinal, column_ordinal,
+                  encryption::kBloomFilterHeader);
+  // Passed by pointer to the deserializer and compared against the expected total
+  // afterwards.
+  auto header_cipher_len = static_cast<uint32_t>(header_cipher_total_len);
+  try {
+    deserializer.DeserializeMessage(header_cipher_buf->data(), &header_cipher_len,
+                                    &header, decryptor);
+  } catch (const std::exception& e) {
+    std::stringstream ss;
+    ss << "Deserializing bloom filter header failed.\n" << e.what();
+    throw ParquetException(ss.str());
+  }
+  if (ARROW_PREDICT_FALSE(header_cipher_len != header_cipher_total_len)) {
+    std::stringstream ss;
+    ss << "Encrypted bloom filter header length mismatch: expected "
+       << header_cipher_total_len << " bytes, got " << header_cipher_len;
+    throw ParquetException(ss.str());
+  }
+  PARQUET_THROW_NOT_OK(ValidateBloomFilterHeader(header));
+
+  const int32_t bloom_filter_size = header.numBytes;
+  UpdateDecryptor(decryptor, row_group_ordinal, column_ordinal,
+                  encryption::kBloomFilterBitset);
+  const int32_t bitset_cipher_len = decryptor->CiphertextLength(bloom_filter_size);
+  const int64_t total_cipher_len =
+      header_cipher_total_len + static_cast<int64_t>(bitset_cipher_len);
+  if (bloom_filter_length && *bloom_filter_length != total_cipher_len) {
+    std::stringstream ss;
+    ss << "Bloom filter length (" << bloom_filter_length.value()
+       << ") does not match the actual bloom filter (size: " << total_cipher_len << ").";
+    throw ParquetException(ss.str());
+  }
+
+  // Read and decrypt the bitset bytes.
+  PARQUET_ASSIGN_OR_THROW(auto bitset_cipher_buf, input->Read(bitset_cipher_len));
+  CheckBloomFilterShortRead(bitset_cipher_len, bitset_cipher_buf->size(),
+                            "Bloom filter bitset");
+
+  const int32_t bitset_plain_len = decryptor->PlaintextLength(bitset_cipher_len);
+  if (ARROW_PREDICT_FALSE(bitset_plain_len != bloom_filter_size)) {
+    throw ParquetException("Bloom filter bitset size does not match header");
+  }
+
+  auto bitset_plain_buf = AllocateBuffer(properties.memory_pool(), bitset_plain_len);
+  int32_t decrypted_len =
+      decryptor->Decrypt(bitset_cipher_buf->span_as<const uint8_t>(),
+                         bitset_plain_buf->mutable_span_as<uint8_t>());
+  if (ARROW_PREDICT_FALSE(decrypted_len != bitset_plain_len)) {
+    throw ParquetException("Bloom filter bitset decryption failed");
+  }
+
+  // Initialize the bloom filter from the decrypted bitset.
+  BlockSplitBloomFilter bloom_filter(properties.memory_pool());
+  bloom_filter.Init(bitset_plain_buf->data(), bloom_filter_size);
+  return bloom_filter;
+}
+
+} // namespace
+
+// Entry point for reading an encrypted Bloom filter. Rejects a null decryptor, then
+// delegates to DeserializeEncryptedFromStream. When the total length is known, the
+// whole payload is read once and parsed from an in-memory BufferReader; otherwise the
+// caller's stream is consumed directly.
+BlockSplitBloomFilter BlockSplitBloomFilter::DeserializeEncrypted(
+    const ReaderProperties& properties, ArrowInputStream* input,
+    std::optional<int64_t> bloom_filter_length, Decryptor* decryptor,
+    int16_t row_group_ordinal, int16_t column_ordinal) {
+  if (decryptor == nullptr) {
+    throw ParquetException("Bloom filter decryptor must be provided");
+  }
+
+  // Read the full Bloom filter payload up front when the total length is known.
+  if (bloom_filter_length.has_value()) {
+    PARQUET_ASSIGN_OR_THROW(auto bloom_filter_buf, input->Read(*bloom_filter_length));
+    CheckBloomFilterShortRead(*bloom_filter_length, bloom_filter_buf->size(),
+                              "Bloom filter");
+    ::arrow::io::BufferReader reader(bloom_filter_buf);
+    return DeserializeEncryptedFromStream(properties, &reader, bloom_filter_length,
+                                          decryptor, row_group_ordinal, column_ordinal);
+  }
+
+  return DeserializeEncryptedFromStream(properties, input, bloom_filter_length, decryptor,
+                                        row_group_ordinal, column_ordinal);
+}
+
BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
const ReaderProperties& properties, ArrowInputStream* input,
std::optional<int64_t> bloom_filter_length) {
@@ -126,8 +275,7 @@ BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
// This gets used, then set by DeserializeThriftMsg
uint32_t header_size = static_cast<uint32_t>(header_buf->size());
try {
- deserializer.DeserializeMessage(reinterpret_cast<const
uint8_t*>(header_buf->data()),
- &header_size, &header);
+ deserializer.DeserializeMessage(header_buf->data(), &header_size, &header);
DCHECK_LE(header_size, header_buf->size());
} catch (std::exception& e) {
std::stringstream ss;
@@ -166,9 +314,7 @@ BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
PARQUET_ASSIGN_OR_THROW(
auto read_size, input->Read(required_read_size,
buffer->mutable_data() +
bloom_filter_bytes_in_header));
- if (ARROW_PREDICT_FALSE(read_size < required_read_size)) {
- throw ParquetException("Bloom Filter read failed: not enough data");
- }
+ CheckBloomFilterShortRead(required_read_size, read_size, "Bloom filter");
BlockSplitBloomFilter bloom_filter(properties.memory_pool());
bloom_filter.Init(buffer->data(), bloom_filter_size);
return bloom_filter;
diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h
index f1a74e4f5e..e14e0558d3 100644
--- a/cpp/src/parquet/bloom_filter.h
+++ b/cpp/src/parquet/bloom_filter.h
@@ -23,6 +23,7 @@
#include "arrow/util/bit_util.h"
#include "arrow/util/logging.h"
+#include "parquet/encryption/type_fwd.h"
#include "parquet/hasher.h"
#include "parquet/platform.h"
#include "parquet/types.h"
@@ -328,6 +329,24 @@ class PARQUET_EXPORT BlockSplitBloomFilter : public
BloomFilter {
const ReaderProperties& properties, ArrowInputStream* input_stream,
std::optional<int64_t> bloom_filter_length = std::nullopt);
+  /// Deserialize an encrypted Bloom filter from an input stream.
+  ///
+  /// The same metadata decryptor is used for both the serialized header and bitset,
+  /// while switching module AADs between the two encrypted modules.
+  ///
+  /// @param properties The parquet reader properties.
+  /// @param input_stream The input stream from which to construct the bloom filter.
+  /// @param bloom_filter_length The length of the serialized bloom filter including
+  /// header.
+  /// @param decryptor Decryptor for encrypted Bloom filter modules. Must not be null.
+  /// @param row_group_ordinal Ordinal of the row group containing this Bloom filter.
+  /// @param column_ordinal Ordinal of the column containing this Bloom filter.
+  /// @return The BlockSplitBloomFilter.
+  /// @throws ParquetException if `decryptor` is null, the stream ends early, the
+  /// lengths are inconsistent, or the header or bitset cannot be decrypted.
+  static BlockSplitBloomFilter DeserializeEncrypted(
+      const ReaderProperties& properties, ArrowInputStream* input_stream,
+      std::optional<int64_t> bloom_filter_length, Decryptor* decryptor,
+      int16_t row_group_ordinal, int16_t column_ordinal);
+
private:
inline void InsertHashImpl(uint64_t hash);
diff --git a/cpp/src/parquet/bloom_filter_reader.cc b/cpp/src/parquet/bloom_filter_reader.cc
index 0b1bc556b4..1be7748c4d 100644
--- a/cpp/src/parquet/bloom_filter_reader.cc
+++ b/cpp/src/parquet/bloom_filter_reader.cc
@@ -17,6 +17,7 @@
#include "parquet/bloom_filter_reader.h"
#include "parquet/bloom_filter.h"
+#include "parquet/encryption/internal_file_decryptor.h"
#include "parquet/exception.h"
#include "parquet/metadata.h"
@@ -26,10 +27,14 @@ class RowGroupBloomFilterReaderImpl final : public
RowGroupBloomFilterReader {
public:
RowGroupBloomFilterReaderImpl(std::shared_ptr<::arrow::io::RandomAccessFile>
input,
std::shared_ptr<RowGroupMetaData>
row_group_metadata,
- const ReaderProperties& properties)
+ const ReaderProperties& properties,
+ int32_t row_group_ordinal,
+ std::shared_ptr<InternalFileDecryptor>
file_decryptor)
: input_(std::move(input)),
row_group_metadata_(std::move(row_group_metadata)),
- properties_(properties) {}
+ properties_(properties),
+ row_group_ordinal_(row_group_ordinal),
+ file_decryptor_(std::move(file_decryptor)) {}
std::unique_ptr<BloomFilter> GetColumnBloomFilter(int i) override;
@@ -42,6 +47,12 @@ class RowGroupBloomFilterReaderImpl final : public
RowGroupBloomFilterReader {
/// Reader properties used to deserialize thrift object.
const ReaderProperties& properties_;
+
+ /// The ordinal of the row group in the file.
+ int32_t row_group_ordinal_;
+
+ /// File-level decryptor.
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
};
std::unique_ptr<BloomFilter>
RowGroupBloomFilterReaderImpl::GetColumnBloomFilter(int i) {
@@ -50,11 +61,6 @@ std::unique_ptr<BloomFilter>
RowGroupBloomFilterReaderImpl::GetColumnBloomFilter
}
auto col_chunk = row_group_metadata_->ColumnChunk(i);
- std::unique_ptr<ColumnCryptoMetaData> crypto_metadata =
col_chunk->crypto_metadata();
- if (crypto_metadata != nullptr) {
- ParquetException::NYI("BloomFilter decryption is not yet supported");
- }
-
auto bloom_filter_offset = col_chunk->bloom_filter_offset();
if (!bloom_filter_offset.has_value()) {
return nullptr;
@@ -76,10 +82,32 @@ std::unique_ptr<BloomFilter>
RowGroupBloomFilterReaderImpl::GetColumnBloomFilter
"bloom filter length + bloom filter offset greater than file size");
}
}
- auto stream = ::arrow::io::RandomAccessFile::GetStream(
- input_, *bloom_filter_offset, file_size - *bloom_filter_offset);
+
+ std::unique_ptr<ColumnCryptoMetaData> crypto_metadata =
col_chunk->crypto_metadata();
+ std::unique_ptr<Decryptor> decryptor =
+
InternalFileDecryptor::GetColumnMetaDecryptorFactory(file_decryptor_.get(),
+
crypto_metadata.get())();
+ if (decryptor != nullptr) {
+ constexpr auto kEncryptedOrdinalLimit = 32767;
+ if (ARROW_PREDICT_FALSE(row_group_ordinal_ > kEncryptedOrdinalLimit)) {
+ throw ParquetException("Encrypted files cannot contain more than 32767
row groups");
+ }
+ if (ARROW_PREDICT_FALSE(i > kEncryptedOrdinalLimit)) {
+ throw ParquetException("Encrypted files cannot contain more than 32767
columns");
+ }
+ }
+
+ const int64_t stream_length =
+ bloom_filter_length ? *bloom_filter_length : file_size -
*bloom_filter_offset;
+ auto stream = ::arrow::io::RandomAccessFile::GetStream(input_,
*bloom_filter_offset,
+ stream_length);
auto bloom_filter =
- BlockSplitBloomFilter::Deserialize(properties_, stream->get(),
bloom_filter_length);
+ decryptor != nullptr
+ ? BlockSplitBloomFilter::DeserializeEncrypted(
+ properties_, stream->get(), bloom_filter_length,
decryptor.get(),
+ static_cast<int16_t>(row_group_ordinal_),
static_cast<int16_t>(i))
+ : BlockSplitBloomFilter::Deserialize(properties_, stream->get(),
+ bloom_filter_length);
return std::make_unique<BlockSplitBloomFilter>(std::move(bloom_filter));
}
@@ -91,11 +119,8 @@ class BloomFilterReaderImpl final : public
BloomFilterReader {
std::shared_ptr<InternalFileDecryptor> file_decryptor)
: input_(std::move(input)),
file_metadata_(std::move(file_metadata)),
- properties_(properties) {
- if (file_decryptor != nullptr) {
- ParquetException::NYI("BloomFilter decryption is not yet supported");
- }
- }
+ properties_(properties),
+ file_decryptor_(std::move(file_decryptor)) {}
std::shared_ptr<RowGroupBloomFilterReader> RowGroup(int i) {
if (i < 0 || i >= file_metadata_->num_row_groups()) {
@@ -104,7 +129,7 @@ class BloomFilterReaderImpl final : public
BloomFilterReader {
auto row_group_metadata = file_metadata_->RowGroup(i);
return std::make_shared<RowGroupBloomFilterReaderImpl>(
- input_, std::move(row_group_metadata), properties_);
+ input_, std::move(row_group_metadata), properties_, i,
file_decryptor_);
}
private:
@@ -116,6 +141,9 @@ class BloomFilterReaderImpl final : public
BloomFilterReader {
/// Reader properties used to deserialize thrift object.
const ReaderProperties& properties_;
+
+ /// File-level decryptor, if any.
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
};
std::unique_ptr<BloomFilterReader> BloomFilterReader::Make(
diff --git a/cpp/src/parquet/encryption/bloom_filter_encryption_test.cc b/cpp/src/parquet/encryption/bloom_filter_encryption_test.cc
new file mode 100644
index 0000000000..9a49e7277b
--- /dev/null
+++ b/cpp/src/parquet/encryption/bloom_filter_encryption_test.cc
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "arrow/io/file.h"
+
+#include "parquet/bloom_filter.h"
+#include "parquet/bloom_filter_reader.h"
+#include "parquet/encryption/test_encryption_util.h"
+#include "parquet/file_reader.h"
+#include "parquet/properties.h"
+
+namespace parquet::encryption::test {
+namespace {
+
+// Build decryption properties whose key retriever knows the footer key and the two
+// column keys used by the encrypted test file.
+std::shared_ptr<parquet::FileDecryptionProperties> BuildDecryptionProperties() {
+  // Map test key ids to fixed test keys for decrypting the file footer and columns.
+  auto kr = std::make_shared<parquet::StringKeyIdRetriever>();
+  kr->PutKey(kFooterMasterKeyId, kFooterEncryptionKey);
+  kr->PutKey(kColumnMasterKeyIds[0], kColumnEncryptionKey1);
+  kr->PutKey(kColumnMasterKeyIds[1], kColumnEncryptionKey2);
+
+  parquet::FileDecryptionProperties::Builder builder;
+  return builder
+      .key_retriever(std::static_pointer_cast<parquet::DecryptionKeyRetriever>(kr))
+      ->build();
+}
+
+} // namespace
+
+// Read Bloom filters from an encrypted parquet-testing file.
+// The test data enables Bloom filters for double_field and float_field only.
+TEST(EncryptedBloomFilterReader, ReadEncryptedBloomFilter) {
+  const std::string file_path =
+      data_file("encrypt_columns_and_footer_bloom_filter.parquet.encrypted");
+
+  parquet::ReaderProperties reader_properties = parquet::default_reader_properties();
+  reader_properties.file_decryption_properties(BuildDecryptionProperties());
+
+  PARQUET_ASSIGN_OR_THROW(auto source, ::arrow::io::ReadableFile::Open(
+                                           file_path, reader_properties.memory_pool()));
+  auto file_reader = parquet::ParquetFileReader::Open(source, reader_properties);
+  auto file_metadata = file_reader->metadata();
+
+  // Sanity-check the file layout before indexing into columns and row groups.
+  ASSERT_EQ(file_metadata->num_columns(), 4);
+  ASSERT_GE(file_metadata->num_row_groups(), 1);
+
+  auto& bloom_filter_reader = file_reader->GetBloomFilterReader();
+  auto row_group_0 = bloom_filter_reader.RowGroup(0);
+  ASSERT_NE(nullptr, row_group_0);
+
+  auto double_filter = row_group_0->GetColumnBloomFilter(0);
+  auto float_filter = row_group_0->GetColumnBloomFilter(1);
+  auto int32_filter = row_group_0->GetColumnBloomFilter(2);
+  auto name_filter = row_group_0->GetColumnBloomFilter(3);
+
+  // double_field and float_field have Bloom filters; the others do not.
+  ASSERT_NE(nullptr, double_filter);
+  ASSERT_NE(nullptr, float_filter);
+  ASSERT_EQ(nullptr, int32_filter);
+  ASSERT_EQ(nullptr, name_filter);
+
+  // Values follow a simple pattern in the test data.
+  for (int i : {0, 1, 7, 42}) {
+    const double value = static_cast<double>(i) + 0.5;
+    EXPECT_TRUE(double_filter->FindHash(double_filter->Hash(value)));
+  }
+
+  for (int i : {0, 2, 5, 10}) {
+    const float value = static_cast<float>(i) + 0.25f;
+    EXPECT_TRUE(float_filter->FindHash(float_filter->Hash(value)));
+  }
+}
+
+} // namespace parquet::encryption::test
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index a3d96a65e1..e74785d85a 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit a3d96a65e11e2bbca7d22a894e8313ede90a33a3
+Subproject commit e74785d85a4ecee829e1e405444d6a1b24b8bc9c