corwinjoy commented on code in PR #16237: URL: https://github.com/apache/datafusion/pull/16237#discussion_r2125241447
########## datafusion-examples/examples/parquet_encryption_with_kms.rs: ########## @@ -0,0 +1,205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray}; +use datafusion::config::TableParquetOptions; +use datafusion::dataframe::DataFrameWriteOptions; +use datafusion::datasource::file_format::parquet::ParquetFormat; +use datafusion::datasource::listing::ListingOptions; +use datafusion::datasource::physical_plan::FileSinkConfig; +use datafusion::error::{DataFusionError, Result}; +use datafusion::physical_plan::execute_stream; +use datafusion::prelude::SessionContext; +use futures::StreamExt; +use parquet::encryption::decrypt::FileDecryptionProperties; +use parquet::encryption::encrypt::FileEncryptionProperties; +use parquet_key_management::crypto_factory::{ + CryptoFactory, DecryptionConfiguration, EncryptionConfiguration, +}; +use parquet_key_management::kms::KmsConnectionConfig; +use parquet_key_management::test_kms::TestKmsClientFactory; +use std::collections::HashMap; +use std::sync::Arc; +use tempfile::TempDir; + +/// This example demonstrates reading and writing Parquet files that +/// are encrypted using Parquet Modular Encryption, and uses the 
+/// parquet-key-management crate to integrate with a Key Management Server (KMS). + +const ENCRYPTION_FACTORY_ID: &'static str = "example.inmem_kms_encryption"; + +#[tokio::main] +async fn main() -> Result<()> { + let ctx = SessionContext::new(); + + // Register an `EncryptionFactory` implementation to be used for Parquet encryption + // in the session context. + // This example uses an in-memory test KMS from the `parquet_key_management` crate with + // a custom `KmsEncryptionFactory` wrapper type to integrate with DataFusion. + // `EncryptionFactory` instances are registered with a name to identify them so + // they can be later referenced in configuration options, and it's possible to register + // multiple different factories to handle different ways of encrypting Parquet. + // In future it could be possible to have built-in implementations in DataFusion. + let crypto_factory = CryptoFactory::new(TestKmsClientFactory::with_default_keys()); + let encryption_factory = KmsEncryptionFactory { crypto_factory }; + ctx.register_parquet_encryption_factory( + ENCRYPTION_FACTORY_ID, + Arc::new(encryption_factory), + ); + + let tmpdir = TempDir::new()?; + write_encrypted(&ctx, &tmpdir).await?; + read_encrypted(&ctx, &tmpdir).await?; + Ok(()) +} + +/// Write an encrypted Parquet file +async fn write_encrypted(ctx: &SessionContext, tmpdir: &TempDir) -> Result<()> { + let a: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c", "d"])); + let b: ArrayRef = Arc::new(Int32Array::from(vec![1, 10, 10, 100])); + let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?; + + ctx.register_batch("test_data", batch)?; + let df = ctx.table("test_data").await?; + + let mut parquet_options = TableParquetOptions::new(); + // We specify that we want to use Parquet encryption by setting the identifier of the + // encryption factory to use. 
+ parquet_options.encryption.factory_id = ENCRYPTION_FACTORY_ID.to_owned(); Review Comment: I think this needs to be under `parquet_options.global...` since it affects all columns. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org