This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch rust-parquet-arrow-writer
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/rust-parquet-arrow-writer by
this push:
new e456dfc ARROW-10191: [Rust] [Parquet] Add roundtrip Arrow -> Parquet
tests for all supported Arrow DataTypes
e456dfc is described below
commit e456dfc6f2d4519a1bf2ccca9531a75065216c2f
Author: Carol (Nichols || Goulding) <[email protected]>
AuthorDate: Tue Oct 6 08:44:26 2020 -0600
ARROW-10191: [Rust] [Parquet] Add roundtrip Arrow -> Parquet tests for all
supported Arrow DataTypes
Note that this PR goes to the rust-parquet-arrow-writer branch, not master.
Inspired by tests in cpp/src/parquet/arrow/arrow_reader_writer_test.cc
These perform round-trip Arrow -> Parquet -> Arrow of a single RecordBatch
with a single column of values of each of the supported data types and some of
the unsupported ones.
Tests that currently fail are either marked with `#[should_panic]` (if the
reason they fail is because of a panic) or `#[ignore]` (if the reason
they fail is because the values don't match).
I am comparing the RecordBatch's column's data before and after the round
trip directly; I'm not sure whether this is appropriate, because for some
data types the `null_bitmap` isn't matching and I'm not sure if it's supposed
to or not.
So I would love advice on that front, and I would love to know if these
tests are useful or not!
Closes #8330 from carols10cents/roundtrip-tests
Lead-authored-by: Carol (Nichols || Goulding) <[email protected]>
Co-authored-by: Neville Dipale <[email protected]>
Signed-off-by: Andy Grove <[email protected]>
---
rust/arrow/src/compute/kernels/cast.rs | 20 +-
rust/parquet/src/arrow/array_reader.rs | 102 +++++---
rust/parquet/src/arrow/arrow_writer.rs | 413 ++++++++++++++++++++++++++++++++-
rust/parquet/src/arrow/converter.rs | 25 +-
4 files changed, 523 insertions(+), 37 deletions(-)
diff --git a/rust/arrow/src/compute/kernels/cast.rs
b/rust/arrow/src/compute/kernels/cast.rs
index d8cb480..a1142b4 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -327,11 +327,27 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) ->
Result<ArrayRef> {
// temporal casts
(Int32, Date32(_)) => cast_array_data::<Date32Type>(array,
to_type.clone()),
- (Int32, Time32(_)) => cast_array_data::<Date32Type>(array,
to_type.clone()),
+ (Int32, Time32(unit)) => match unit {
+ TimeUnit::Second => {
+ cast_array_data::<Time32SecondType>(array, to_type.clone())
+ }
+ TimeUnit::Millisecond => {
+ cast_array_data::<Time32MillisecondType>(array,
to_type.clone())
+ }
+ _ => unreachable!(),
+ },
(Date32(_), Int32) => cast_array_data::<Int32Type>(array,
to_type.clone()),
(Time32(_), Int32) => cast_array_data::<Int32Type>(array,
to_type.clone()),
(Int64, Date64(_)) => cast_array_data::<Date64Type>(array,
to_type.clone()),
- (Int64, Time64(_)) => cast_array_data::<Date64Type>(array,
to_type.clone()),
+ (Int64, Time64(unit)) => match unit {
+ TimeUnit::Microsecond => {
+ cast_array_data::<Time64MicrosecondType>(array,
to_type.clone())
+ }
+ TimeUnit::Nanosecond => {
+ cast_array_data::<Time64NanosecondType>(array, to_type.clone())
+ }
+ _ => unreachable!(),
+ },
(Date64(_), Int64) => cast_array_data::<Int64Type>(array,
to_type.clone()),
(Time64(_), Int64) => cast_array_data::<Int64Type>(array,
to_type.clone()),
(Date32(DateUnit::Day), Date64(DateUnit::Millisecond)) => {
diff --git a/rust/parquet/src/arrow/array_reader.rs
b/rust/parquet/src/arrow/array_reader.rs
index 14bf7d2..4fbc54d 100644
--- a/rust/parquet/src/arrow/array_reader.rs
+++ b/rust/parquet/src/arrow/array_reader.rs
@@ -35,9 +35,10 @@ use crate::arrow::converter::{
BinaryArrayConverter, BinaryConverter, BoolConverter,
BooleanArrayConverter,
Converter, Date32Converter, FixedLenBinaryConverter,
FixedSizeArrayConverter,
Float32Converter, Float64Converter, Int16Converter, Int32Converter,
Int64Converter,
- Int8Converter, Int96ArrayConverter, Int96Converter,
TimestampMicrosecondConverter,
- TimestampMillisecondConverter, UInt16Converter, UInt32Converter,
UInt64Converter,
- UInt8Converter, Utf8ArrayConverter, Utf8Converter,
+ Int8Converter, Int96ArrayConverter, Int96Converter,
Time32MillisecondConverter,
+ Time32SecondConverter, Time64MicrosecondConverter,
Time64NanosecondConverter,
+ TimestampMicrosecondConverter, TimestampMillisecondConverter,
UInt16Converter,
+ UInt32Converter, UInt64Converter, UInt8Converter, Utf8ArrayConverter,
Utf8Converter,
};
use crate::arrow::record_reader::RecordReader;
use crate::arrow::schema::parquet_to_arrow_field;
@@ -196,11 +197,27 @@ impl<T: DataType> ArrayReader for PrimitiveArrayReader<T>
{
.convert(self.record_reader.cast::<Int32Type>()),
_ => Err(general_err!("No conversion from parquet type to
arrow type for date with unit {:?}", unit)),
}
- (ArrowType::Time32(_), PhysicalType::INT32) => {
-
UInt32Converter::new().convert(self.record_reader.cast::<Int32Type>())
+ (ArrowType::Time32(unit), PhysicalType::INT32) => {
+ match unit {
+ TimeUnit::Second => {
+
Time32SecondConverter::new().convert(self.record_reader.cast::<Int32Type>())
+ }
+ TimeUnit::Millisecond => {
+
Time32MillisecondConverter::new().convert(self.record_reader.cast::<Int32Type>())
+ }
+ _ => Err(general_err!("Invalid or unsupported arrow
array with datatype {:?}", self.get_data_type()))
+ }
}
- (ArrowType::Time64(_), PhysicalType::INT64) => {
-
UInt64Converter::new().convert(self.record_reader.cast::<Int64Type>())
+ (ArrowType::Time64(unit), PhysicalType::INT64) => {
+ match unit {
+ TimeUnit::Microsecond => {
+
Time64MicrosecondConverter::new().convert(self.record_reader.cast::<Int64Type>())
+ }
+ TimeUnit::Nanosecond => {
+
Time64NanosecondConverter::new().convert(self.record_reader.cast::<Int64Type>())
+ }
+ _ => Err(general_err!("Invalid or unsupported arrow
array with datatype {:?}", self.get_data_type()))
+ }
}
(ArrowType::Interval(IntervalUnit::YearMonth),
PhysicalType::INT32) => {
UInt32Converter::new().convert(self.record_reader.cast::<Int32Type>())
@@ -941,10 +958,12 @@ mod tests {
use crate::util::test_common::{get_test_file, make_pages};
use arrow::array::{Array, ArrayRef, PrimitiveArray, StringArray,
StructArray};
use arrow::datatypes::{
- DataType as ArrowType, Date32Type as ArrowDate32, Field, Int32Type as
ArrowInt32,
+ ArrowPrimitiveType, DataType as ArrowType, Date32Type as ArrowDate32,
Field,
+ Int32Type as ArrowInt32, Int64Type as ArrowInt64,
+ Time32MillisecondType as ArrowTime32MillisecondArray,
+ Time64MicrosecondType as ArrowTime64MicrosecondArray,
TimestampMicrosecondType as ArrowTimestampMicrosecondType,
TimestampMillisecondType as ArrowTimestampMillisecondType,
- UInt32Type as ArrowUInt32, UInt64Type as ArrowUInt64,
};
use rand::distributions::uniform::SampleUniform;
use rand::{thread_rng, Rng};
@@ -1101,7 +1120,7 @@ mod tests {
}
macro_rules! test_primitive_array_reader_one_type {
- ($arrow_parquet_type:ty, $physical_type:expr, $logical_type_str:expr,
$result_arrow_type:ty, $result_primitive_type:ty) => {{
+ ($arrow_parquet_type:ty, $physical_type:expr, $logical_type_str:expr,
$result_arrow_type:ty, $result_arrow_cast_type:ty, $result_primitive_type:ty)
=> {{
let message_type = format!(
"
message test_schema {{
@@ -1112,7 +1131,7 @@ mod tests {
);
let schema = parse_message_type(&message_type)
.map(|t| Rc::new(SchemaDescriptor::new(Rc::new(t))))
- .unwrap();
+ .expect("Unable to parse message type into a schema
descriptor");
let column_desc = schema.column(0);
@@ -1142,24 +1161,48 @@ mod tests {
Box::new(page_iterator),
column_desc.clone(),
)
- .unwrap();
+ .expect("Unable to get array reader");
- let array = array_reader.next_batch(50).unwrap();
+ let array = array_reader
+ .next_batch(50)
+ .expect("Unable to get batch from reader");
+ let result_data_type = <$result_arrow_type>::get_data_type();
let array = array
.as_any()
.downcast_ref::<PrimitiveArray<$result_arrow_type>>()
- .unwrap();
-
- assert_eq!(
- &PrimitiveArray::<$result_arrow_type>::from(
- data[0..50]
- .iter()
- .map(|x| *x as $result_primitive_type)
- .collect::<Vec<$result_primitive_type>>()
- ),
- array
+ .expect(
+ format!(
+ "Unable to downcast {:?} to {:?}",
+ array.data_type(),
+ result_data_type
+ )
+ .as_str(),
+ );
+
+ // create expected array as primitive, and cast to result type
+ let expected = PrimitiveArray::<$result_arrow_cast_type>::from(
+ data[0..50]
+ .iter()
+ .map(|x| *x as $result_primitive_type)
+ .collect::<Vec<$result_primitive_type>>(),
);
+ let expected = Arc::new(expected) as ArrayRef;
+ let expected = arrow::compute::cast(&expected,
&result_data_type)
+ .expect("Unable to cast expected array");
+ assert_eq!(expected.data_type(), &result_data_type);
+ let expected = expected
+ .as_any()
+ .downcast_ref::<PrimitiveArray<$result_arrow_type>>()
+ .expect(
+ format!(
+ "Unable to downcast expected {:?} to {:?}",
+ expected.data_type(),
+ result_data_type
+ )
+ .as_str(),
+ );
+ assert_eq!(expected, array);
}
}};
}
@@ -1171,27 +1214,31 @@ mod tests {
PhysicalType::INT32,
"DATE",
ArrowDate32,
+ ArrowInt32,
i32
);
test_primitive_array_reader_one_type!(
Int32Type,
PhysicalType::INT32,
"TIME_MILLIS",
- ArrowUInt32,
- u32
+ ArrowTime32MillisecondArray,
+ ArrowInt32,
+ i32
);
test_primitive_array_reader_one_type!(
Int64Type,
PhysicalType::INT64,
"TIME_MICROS",
- ArrowUInt64,
- u64
+ ArrowTime64MicrosecondArray,
+ ArrowInt64,
+ i64
);
test_primitive_array_reader_one_type!(
Int64Type,
PhysicalType::INT64,
"TIMESTAMP_MILLIS",
ArrowTimestampMillisecondType,
+ ArrowInt64,
i64
);
test_primitive_array_reader_one_type!(
@@ -1199,6 +1246,7 @@ mod tests {
PhysicalType::INT64,
"TIMESTAMP_MICROS",
ArrowTimestampMicrosecondType,
+ ArrowInt64,
i64
);
}
diff --git a/rust/parquet/src/arrow/arrow_writer.rs
b/rust/parquet/src/arrow/arrow_writer.rs
index e0ad207..cf7b9a2 100644
--- a/rust/parquet/src/arrow/arrow_writer.rs
+++ b/rust/parquet/src/arrow/arrow_writer.rs
@@ -136,7 +136,6 @@ fn write_leaves(
| ArrowDataType::UInt16
| ArrowDataType::UInt32
| ArrowDataType::UInt64
- | ArrowDataType::Float16
| ArrowDataType::Float32
| ArrowDataType::Float64
| ArrowDataType::Timestamp(_, _)
@@ -176,6 +175,9 @@ fn write_leaves(
}
Ok(())
}
+ ArrowDataType::Float16 => Err(ParquetError::ArrowError(
+ "Float16 arrays not supported".to_string(),
+ )),
ArrowDataType::FixedSizeList(_, _)
| ArrowDataType::Null
| ArrowDataType::Boolean
@@ -493,7 +495,7 @@ mod tests {
use arrow::array::*;
use arrow::datatypes::ToByteSlice;
use arrow::datatypes::{DataType, Field, Schema};
- use arrow::record_batch::{RecordBatch, RecordBatchReader};
+ use arrow::record_batch::RecordBatch;
use crate::arrow::{ArrowReader, ParquetFileArrowReader};
use crate::file::{metadata::KeyValue, reader::SerializedFileReader};
@@ -597,7 +599,7 @@ mod tests {
let mut arrow_reader =
ParquetFileArrowReader::new(Rc::new(file_reader));
let mut record_batch_reader =
arrow_reader.get_record_reader(1024).unwrap();
- let batch = record_batch_reader.next_batch().unwrap().unwrap();
+ let batch = record_batch_reader.next().unwrap().unwrap();
let string_col = batch
.column(0)
.as_any()
@@ -688,4 +690,409 @@ mod tests {
writer.write(&batch).unwrap();
writer.close().unwrap();
}
+
+ const SMALL_SIZE: usize = 100;
+
+ fn roundtrip(filename: &str, expected_batch: RecordBatch) {
+ let file = get_temp_file(filename, &[]);
+
+ let mut writer = ArrowWriter::try_new(
+ file.try_clone().unwrap(),
+ expected_batch.schema(),
+ None,
+ )
+ .unwrap();
+ writer.write(&expected_batch).unwrap();
+ writer.close().unwrap();
+
+ let reader = SerializedFileReader::new(file).unwrap();
+ let mut arrow_reader = ParquetFileArrowReader::new(Rc::new(reader));
+ let mut record_batch_reader =
arrow_reader.get_record_reader(1024).unwrap();
+
+ let actual_batch = record_batch_reader.next().unwrap().unwrap();
+
+ assert_eq!(expected_batch.schema(), actual_batch.schema());
+ assert_eq!(expected_batch.num_columns(), actual_batch.num_columns());
+ assert_eq!(expected_batch.num_rows(), actual_batch.num_rows());
+ for i in 0..expected_batch.num_columns() {
+ let expected_data = expected_batch.column(i).data();
+ let actual_data = actual_batch.column(i).data();
+
+ assert_eq!(expected_data.data_type(), actual_data.data_type());
+ assert_eq!(expected_data.len(), actual_data.len());
+ assert_eq!(expected_data.null_count(), actual_data.null_count());
+ assert_eq!(expected_data.offset(), actual_data.offset());
+ assert_eq!(expected_data.buffers(), actual_data.buffers());
+ assert_eq!(expected_data.child_data(), actual_data.child_data());
+ assert_eq!(expected_data.null_bitmap(), actual_data.null_bitmap());
+ }
+ }
+
+ fn one_column_roundtrip(filename: &str, values: ArrayRef, nullable: bool) {
+ let schema = Schema::new(vec![Field::new(
+ "col",
+ values.data_type().clone(),
+ nullable,
+ )]);
+ let expected_batch =
+ RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap();
+
+ roundtrip(filename, expected_batch);
+ }
+
+ fn values_required<A, I>(iter: I, filename: &str)
+ where
+ A: From<Vec<I::Item>> + Array + 'static,
+ I: IntoIterator,
+ {
+ let raw_values: Vec<_> = iter.into_iter().collect();
+ let values = Arc::new(A::from(raw_values));
+ one_column_roundtrip(filename, values, false);
+ }
+
+ fn values_optional<A, I>(iter: I, filename: &str)
+ where
+ A: From<Vec<Option<I::Item>>> + Array + 'static,
+ I: IntoIterator,
+ {
+ let optional_raw_values: Vec<_> = iter
+ .into_iter()
+ .enumerate()
+ .map(|(i, v)| if i % 2 == 0 { None } else { Some(v) })
+ .collect();
+ let optional_values = Arc::new(A::from(optional_raw_values));
+ one_column_roundtrip(filename, optional_values, true);
+ }
+
+ fn required_and_optional<A, I>(iter: I, filename: &str)
+ where
+ A: From<Vec<I::Item>> + From<Vec<Option<I::Item>>> + Array + 'static,
+ I: IntoIterator + Clone,
+ {
+ values_required::<A, I>(iter.clone(), filename);
+ values_optional::<A, I>(iter, filename);
+ }
+
+ #[test]
+ #[should_panic(expected = "Null arrays not supported")]
+ fn null_single_column() {
+ let values = Arc::new(NullArray::new(SMALL_SIZE));
+ one_column_roundtrip("null_single_column", values.clone(), true);
+ one_column_roundtrip("null_single_column", values, false);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Attempting to write an Arrow type that is not yet
implemented"
+ )]
+ fn bool_single_column() {
+ required_and_optional::<BooleanArray, _>(
+ [true, false].iter().cycle().copied().take(SMALL_SIZE),
+ "bool_single_column",
+ );
+ }
+
+ #[test]
+ fn i8_single_column() {
+ required_and_optional::<Int8Array, _>(0..SMALL_SIZE as i8,
"i8_single_column");
+ }
+
+ #[test]
+ fn i16_single_column() {
+ required_and_optional::<Int16Array, _>(0..SMALL_SIZE as i16,
"i16_single_column");
+ }
+
+ #[test]
+ fn i32_single_column() {
+ required_and_optional::<Int32Array, _>(0..SMALL_SIZE as i32,
"i32_single_column");
+ }
+
+ #[test]
+ fn i64_single_column() {
+ required_and_optional::<Int64Array, _>(0..SMALL_SIZE as i64,
"i64_single_column");
+ }
+
+ #[test]
+ fn u8_single_column() {
+ required_and_optional::<UInt8Array, _>(0..SMALL_SIZE as u8,
"u8_single_column");
+ }
+
+ #[test]
+ fn u16_single_column() {
+ required_and_optional::<UInt16Array, _>(
+ 0..SMALL_SIZE as u16,
+ "u16_single_column",
+ );
+ }
+
+ #[test]
+ fn u32_single_column() {
+ required_and_optional::<UInt32Array, _>(
+ 0..SMALL_SIZE as u32,
+ "u32_single_column",
+ );
+ }
+
+ #[test]
+ fn u64_single_column() {
+ required_and_optional::<UInt64Array, _>(
+ 0..SMALL_SIZE as u64,
+ "u64_single_column",
+ );
+ }
+
+ #[test]
+ fn f32_single_column() {
+ required_and_optional::<Float32Array, _>(
+ (0..SMALL_SIZE).map(|i| i as f32),
+ "f32_single_column",
+ );
+ }
+
+ #[test]
+ fn f64_single_column() {
+ required_and_optional::<Float64Array, _>(
+ (0..SMALL_SIZE).map(|i| i as f64),
+ "f64_single_column",
+ );
+ }
+
+ // The timestamp array types don't implement From<Vec<T>> because they
need the timezone
+ // argument, and they also doesn't support building from a Vec<Option<T>>,
so call
+ // one_column_roundtrip manually instead of calling required_and_optional
for these tests.
+
+ #[test]
+ #[ignore] // Timestamp support isn't correct yet
+ fn timestamp_second_single_column() {
+ let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
+ let values = Arc::new(TimestampSecondArray::from_vec(raw_values,
None));
+
+ one_column_roundtrip("timestamp_second_single_column", values, false);
+ }
+
+ #[test]
+ fn timestamp_millisecond_single_column() {
+ let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
+ let values = Arc::new(TimestampMillisecondArray::from_vec(raw_values,
None));
+
+ one_column_roundtrip("timestamp_millisecond_single_column", values,
false);
+ }
+
+ #[test]
+ fn timestamp_microsecond_single_column() {
+ let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
+ let values = Arc::new(TimestampMicrosecondArray::from_vec(raw_values,
None));
+
+ one_column_roundtrip("timestamp_microsecond_single_column", values,
false);
+ }
+
+ #[test]
+ #[ignore] // Timestamp support isn't correct yet
+ fn timestamp_nanosecond_single_column() {
+ let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
+ let values = Arc::new(TimestampNanosecondArray::from_vec(raw_values,
None));
+
+ one_column_roundtrip("timestamp_nanosecond_single_column", values,
false);
+ }
+
+ #[test]
+ fn date32_single_column() {
+ required_and_optional::<Date32Array, _>(
+ 0..SMALL_SIZE as i32,
+ "date32_single_column",
+ );
+ }
+
+ #[test]
+ #[ignore] // Date support isn't correct yet
+ fn date64_single_column() {
+ required_and_optional::<Date64Array, _>(
+ 0..SMALL_SIZE as i64,
+ "date64_single_column",
+ );
+ }
+
+ #[test]
+ #[ignore] // DateUnit resolution mismatch
+ fn time32_second_single_column() {
+ required_and_optional::<Time32SecondArray, _>(
+ 0..SMALL_SIZE as i32,
+ "time32_second_single_column",
+ );
+ }
+
+ #[test]
+ fn time32_millisecond_single_column() {
+ required_and_optional::<Time32MillisecondArray, _>(
+ 0..SMALL_SIZE as i32,
+ "time32_millisecond_single_column",
+ );
+ }
+
+ #[test]
+ fn time64_microsecond_single_column() {
+ required_and_optional::<Time64MicrosecondArray, _>(
+ 0..SMALL_SIZE as i64,
+ "time64_microsecond_single_column",
+ );
+ }
+
+ #[test]
+ #[ignore] // DateUnit resolution mismatch
+ fn time64_nanosecond_single_column() {
+ required_and_optional::<Time64NanosecondArray, _>(
+ 0..SMALL_SIZE as i64,
+ "time64_nanosecond_single_column",
+ );
+ }
+
+ #[test]
+ #[should_panic(expected = "Converting Duration to parquet not supported")]
+ fn duration_second_single_column() {
+ required_and_optional::<DurationSecondArray, _>(
+ 0..SMALL_SIZE as i64,
+ "duration_second_single_column",
+ );
+ }
+
+ #[test]
+ #[should_panic(expected = "Converting Duration to parquet not supported")]
+ fn duration_millisecond_single_column() {
+ required_and_optional::<DurationMillisecondArray, _>(
+ 0..SMALL_SIZE as i64,
+ "duration_millisecond_single_column",
+ );
+ }
+
+ #[test]
+ #[should_panic(expected = "Converting Duration to parquet not supported")]
+ fn duration_microsecond_single_column() {
+ required_and_optional::<DurationMicrosecondArray, _>(
+ 0..SMALL_SIZE as i64,
+ "duration_microsecond_single_column",
+ );
+ }
+
+ #[test]
+ #[should_panic(expected = "Converting Duration to parquet not supported")]
+ fn duration_nanosecond_single_column() {
+ required_and_optional::<DurationNanosecondArray, _>(
+ 0..SMALL_SIZE as i64,
+ "duration_nanosecond_single_column",
+ );
+ }
+
+ #[test]
+ #[should_panic(expected = "Currently unreachable because data type not
supported")]
+ fn interval_year_month_single_column() {
+ required_and_optional::<IntervalYearMonthArray, _>(
+ 0..SMALL_SIZE as i32,
+ "interval_year_month_single_column",
+ );
+ }
+
+ #[test]
+ #[should_panic(expected = "Currently unreachable because data type not
supported")]
+ fn interval_day_time_single_column() {
+ required_and_optional::<IntervalDayTimeArray, _>(
+ 0..SMALL_SIZE as i64,
+ "interval_day_time_single_column",
+ );
+ }
+
+ #[test]
+ #[ignore] // Binary support isn't correct yet - null_bitmap doesn't match
+ fn binary_single_column() {
+ let one_vec: Vec<u8> = (0..SMALL_SIZE as u8).collect();
+ let many_vecs: Vec<_> =
std::iter::repeat(one_vec).take(SMALL_SIZE).collect();
+ let many_vecs_iter = many_vecs.iter().map(|v| v.as_slice());
+
+ // BinaryArrays can't be built from Vec<Option<&str>>, so only call
`values_required`
+ values_required::<BinaryArray, _>(many_vecs_iter,
"binary_single_column");
+ }
+
+ #[test]
+ #[ignore] // Large Binary support isn't correct yet
+ fn large_binary_single_column() {
+ let one_vec: Vec<u8> = (0..SMALL_SIZE as u8).collect();
+ let many_vecs: Vec<_> =
std::iter::repeat(one_vec).take(SMALL_SIZE).collect();
+ let many_vecs_iter = many_vecs.iter().map(|v| v.as_slice());
+
+ // LargeBinaryArrays can't be built from Vec<Option<&str>>, so only
call `values_required`
+ values_required::<LargeBinaryArray, _>(
+ many_vecs_iter,
+ "large_binary_single_column",
+ );
+ }
+
+ #[test]
+ #[ignore] // String support isn't correct yet - null_bitmap doesn't match
+ fn string_single_column() {
+ let raw_values: Vec<_> = (0..SMALL_SIZE).map(|i|
i.to_string()).collect();
+ let raw_strs = raw_values.iter().map(|s| s.as_str());
+
+ required_and_optional::<StringArray, _>(raw_strs,
"string_single_column");
+ }
+
+ #[test]
+ #[ignore] // Large String support isn't correct yet - null_bitmap and
buffers don't match
+ fn large_string_single_column() {
+ let raw_values: Vec<_> = (0..SMALL_SIZE).map(|i|
i.to_string()).collect();
+ let raw_strs = raw_values.iter().map(|s| s.as_str());
+
+ required_and_optional::<LargeStringArray, _>(
+ raw_strs,
+ "large_string_single_column",
+ );
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Reading parquet list array into arrow is not supported
yet!"
+ )]
+ fn list_single_column() {
+ let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+ let a_value_offsets =
+ arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice());
+ let a_list_data =
ArrayData::builder(DataType::List(Box::new(DataType::Int32)))
+ .len(5)
+ .add_buffer(a_value_offsets)
+ .add_child_data(a_values.data())
+ .build();
+ let a = ListArray::from(a_list_data);
+
+ let values = Arc::new(a);
+ one_column_roundtrip("list_single_column", values, false);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Reading parquet list array into arrow is not supported
yet!"
+ )]
+ fn large_list_single_column() {
+ let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+ let a_value_offsets =
+ arrow::buffer::Buffer::from(&[0i64, 1, 3, 3, 6,
10].to_byte_slice());
+ let a_list_data =
+ ArrayData::builder(DataType::LargeList(Box::new(DataType::Int32)))
+ .len(5)
+ .add_buffer(a_value_offsets)
+ .add_child_data(a_values.data())
+ .build();
+ let a = LargeListArray::from(a_list_data);
+
+ let values = Arc::new(a);
+ one_column_roundtrip("large_list_single_column", values, false);
+ }
+
+ #[test]
+ #[ignore] // Struct support isn't correct yet - null_bitmap doesn't match
+ fn struct_single_column() {
+ let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+ let struct_field_a = Field::new("f", DataType::Int32, false);
+ let s = StructArray::from(vec![(struct_field_a, Arc::new(a_values) as
ArrayRef)]);
+
+ let values = Arc::new(s);
+ one_column_roundtrip("struct_single_column", values, false);
+ }
}
diff --git a/rust/parquet/src/arrow/converter.rs
b/rust/parquet/src/arrow/converter.rs
index 9fbfa33..c988aae 100644
--- a/rust/parquet/src/arrow/converter.rs
+++ b/rust/parquet/src/arrow/converter.rs
@@ -17,12 +17,19 @@
use crate::arrow::record_reader::RecordReader;
use crate::data_type::{ByteArray, DataType, Int96};
-use arrow::array::{
- Array, ArrayRef, BinaryBuilder, BooleanArray, BooleanBufferBuilder,
- BufferBuilderTrait, FixedSizeBinaryBuilder, StringBuilder,
- TimestampNanosecondBuilder,
+// TODO: clean up imports (best done when there are few moving parts)
+use arrow::{
+ array::{
+ Array, ArrayRef, BinaryBuilder, BooleanArray, BooleanBufferBuilder,
+ BufferBuilderTrait, FixedSizeBinaryBuilder, StringBuilder,
+ TimestampNanosecondBuilder,
+ },
+ datatypes::Time32MillisecondType,
+};
+use arrow::{
+ compute::cast, datatypes::Time32SecondType,
datatypes::Time64MicrosecondType,
+ datatypes::Time64NanosecondType,
};
-use arrow::compute::cast;
use std::convert::From;
use std::sync::Arc;
@@ -226,6 +233,14 @@ pub type TimestampMillisecondConverter =
CastConverter<ParquetInt64Type, TimestampMillisecondType,
TimestampMillisecondType>;
pub type TimestampMicrosecondConverter =
CastConverter<ParquetInt64Type, TimestampMicrosecondType,
TimestampMicrosecondType>;
+pub type Time32SecondConverter =
+ CastConverter<ParquetInt32Type, Time32SecondType, Time32SecondType>;
+pub type Time32MillisecondConverter =
+ CastConverter<ParquetInt32Type, Time32MillisecondType,
Time32MillisecondType>;
+pub type Time64MicrosecondConverter =
+ CastConverter<ParquetInt64Type, Time64MicrosecondType,
Time64MicrosecondType>;
+pub type Time64NanosecondConverter =
+ CastConverter<ParquetInt64Type, Time64NanosecondType,
Time64NanosecondType>;
pub type UInt64Converter = CastConverter<ParquetInt64Type, UInt64Type,
UInt64Type>;
pub type Float32Converter = CastConverter<ParquetFloatType, Float32Type,
Float32Type>;
pub type Float64Converter = CastConverter<ParquetDoubleType, Float64Type,
Float64Type>;