This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 1f1c3a4cea Support `ListView` codec in arrow-json (#9503)
1f1c3a4cea is described below
commit 1f1c3a4cea6972ade7ff73a7765521c21a992e4f
Author: Liam Bao <[email protected]>
AuthorDate: Thu Mar 26 18:15:41 2026 -0400
Support `ListView` codec in arrow-json (#9503)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Closes #9340.
# Rationale for this change
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
# What changes are included in this PR?
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
Support `ListView` codec in arrow-json, using the `ListLikeArray` trait to
simplify the implementation.
# Are these changes tested?
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->
Yes — roundtrip, reader, and writer tests are added.
# Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
Yes — a new encoder and decoder for `ListView` / `LargeListView` are added to the public arrow-json read/write paths.
---
arrow-json/src/lib.rs | 66 +++++++++++++++++++++++-------
arrow-json/src/reader/list_array.rs | 43 ++++++++++++++------
arrow-json/src/reader/mod.rs | 80 ++++++++++++++++++++++++++++++++++++-
arrow-json/src/writer/encoder.rs | 79 ++++++++++--------------------------
arrow-json/src/writer/mod.rs | 48 ++++++++++++++++++++++
5 files changed, 228 insertions(+), 88 deletions(-)
diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs
index 1b18e00947..201c3cd807 100644
--- a/arrow-json/src/lib.rs
+++ b/arrow-json/src/lib.rs
@@ -179,17 +179,17 @@ impl JsonSerializable for f64 {
#[cfg(test)]
mod tests {
- use std::sync::Arc;
-
- use crate::writer::JsonArray;
-
use super::*;
-
+ use crate::writer::JsonArray;
+ use crate::writer::LineDelimited;
use arrow_array::{
- ArrayRef, GenericBinaryArray, GenericByteViewArray, RecordBatch,
RecordBatchWriter,
- builder::FixedSizeBinaryBuilder, types::BinaryViewType,
+ ArrayRef, GenericBinaryArray, GenericByteViewArray,
GenericListViewArray, RecordBatch,
+ RecordBatchWriter, builder::FixedSizeBinaryBuilder,
types::BinaryViewType,
};
+ use arrow_schema::{DataType, Field, Fields, Schema};
use serde_json::Value::{Bool, Number as VNumber, String as VString};
+ use std::io::Cursor;
+ use std::sync::Arc;
#[test]
fn test_arrow_native_type_to_json() {
@@ -216,13 +216,6 @@ mod tests {
#[test]
fn test_json_roundtrip_structs() {
- use crate::writer::LineDelimited;
- use arrow_schema::DataType;
- use arrow_schema::Field;
- use arrow_schema::Fields;
- use arrow_schema::Schema;
- use std::sync::Arc;
-
let schema = Arc::new(Schema::new(vec![
Field::new(
"c1",
@@ -352,4 +345,49 @@ mod tests {
assert_eq!(batch, decoded);
}
+
+ fn assert_list_view_roundtrip<O: arrow_array::OffsetSizeTrait>() {
+ let flat_field = Arc::new(Field::new("item", DataType::Int32, true));
+ let flat_dt =
GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(flat_field);
+
+ let nested_inner = Arc::new(Field::new("item", DataType::Int32,
false));
+ let nested_inner_dt =
GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(nested_inner);
+ let nested_outer = Arc::new(Field::new("item", nested_inner_dt, true));
+ let nested_dt =
GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(nested_outer);
+
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("flat", flat_dt, true),
+ Field::new("nested", nested_dt, true),
+ ]));
+
+ let input = r#"{"flat":[1,2,3],"nested":[[1,2],[3]]}
+{"flat":[4,null]}
+{}
+{"flat":[6],"nested":[[4,5,6]]}
+{"flat":[]}
+"#
+ .as_bytes();
+
+ let batches: Vec<RecordBatch> = ReaderBuilder::new(schema.clone())
+ .with_batch_size(1024)
+ .build(Cursor::new(input))
+ .unwrap()
+ .collect::<Result<Vec<_>, _>>()
+ .unwrap();
+
+ let mut output = Vec::new();
+ let mut writer = WriterBuilder::new().build::<_, LineDelimited>(&mut
output);
+ for batch in &batches {
+ writer.write(batch).unwrap();
+ }
+ writer.finish().unwrap();
+
+ assert_eq!(input, &output);
+ }
+
+ #[test]
+ fn test_json_roundtrip_list_view() {
+ assert_list_view_roundtrip::<i32>();
+ assert_list_view_roundtrip::<i64>();
+ }
}
diff --git a/arrow-json/src/reader/list_array.rs
b/arrow-json/src/reader/list_array.rs
index d363b6be97..ea23403c4b 100644
--- a/arrow-json/src/reader/list_array.rs
+++ b/arrow-json/src/reader/list_array.rs
@@ -18,28 +18,33 @@
use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{ArrayDecoder, DecoderContext};
use arrow_array::OffsetSizeTrait;
-use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
-use arrow_buffer::buffer::NullBuffer;
+use arrow_array::builder::BooleanBufferBuilder;
+use arrow_buffer::{Buffer, buffer::NullBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType};
use std::marker::PhantomData;
-pub struct ListArrayDecoder<O> {
+pub type ListArrayDecoder<O> = ListLikeArrayDecoder<O, false>;
+pub type ListViewArrayDecoder<O> = ListLikeArrayDecoder<O, true>;
+
+pub struct ListLikeArrayDecoder<O, const IS_VIEW: bool> {
data_type: DataType,
decoder: Box<dyn ArrayDecoder>,
phantom: PhantomData<O>,
is_nullable: bool,
}
-impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
+impl<O: OffsetSizeTrait, const IS_VIEW: bool> ListLikeArrayDecoder<O, IS_VIEW>
{
pub fn new(
ctx: &DecoderContext,
data_type: &DataType,
is_nullable: bool,
) -> Result<Self, ArrowError> {
- let field = match data_type {
- DataType::List(f) if !O::IS_LARGE => f,
- DataType::LargeList(f) if O::IS_LARGE => f,
+ let field = match (IS_VIEW, data_type) {
+ (false, DataType::List(f)) if !O::IS_LARGE => f,
+ (false, DataType::LargeList(f)) if O::IS_LARGE => f,
+ (true, DataType::ListView(f)) if !O::IS_LARGE => f,
+ (true, DataType::LargeListView(f)) if O::IS_LARGE => f,
_ => unreachable!(),
};
let decoder = ctx.make_decoder(field.data_type(),
field.is_nullable())?;
@@ -53,11 +58,11 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
}
}
-impl<O: OffsetSizeTrait> ArrayDecoder for ListArrayDecoder<O> {
+impl<O: OffsetSizeTrait, const IS_VIEW: bool> ArrayDecoder for
ListLikeArrayDecoder<O, IS_VIEW> {
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData,
ArrowError> {
let mut child_pos = Vec::with_capacity(pos.len());
- let mut offsets = BufferBuilder::<O>::new(pos.len() + 1);
- offsets.append(O::from_usize(0).unwrap());
+ let mut offsets = Vec::with_capacity(pos.len() + 1);
+ offsets.push(O::from_usize(0).unwrap());
let mut nulls = self
.is_nullable
@@ -88,18 +93,30 @@ impl<O: OffsetSizeTrait> ArrayDecoder for
ListArrayDecoder<O> {
let offset = O::from_usize(child_pos.len()).ok_or_else(|| {
ArrowError::JsonError(format!("offset overflow decoding {}",
self.data_type))
})?;
- offsets.append(offset)
+ offsets.push(offset);
}
let child_data = self.decoder.decode(tape, &child_pos)?;
let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish()));
- let data = ArrayDataBuilder::new(self.data_type.clone())
+ let mut data = ArrayDataBuilder::new(self.data_type.clone())
.len(pos.len())
.nulls(nulls)
- .add_buffer(offsets.finish())
.child_data(vec![child_data]);
+ if IS_VIEW {
+ let mut sizes = Vec::with_capacity(offsets.len() - 1);
+ for i in 1..offsets.len() {
+ sizes.push(offsets[i] - offsets[i - 1]);
+ }
+ offsets.pop();
+ data = data
+ .add_buffer(Buffer::from_vec(offsets))
+ .add_buffer(Buffer::from_vec(sizes));
+ } else {
+ data = data.add_buffer(Buffer::from_vec(offsets));
+ }
+
// Safety
// Validated lengths above
Ok(unsafe { data.build_unchecked() })
diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs
index 04271368a4..7039d3500e 100644
--- a/arrow-json/src/reader/mod.rs
+++ b/arrow-json/src/reader/mod.rs
@@ -154,7 +154,7 @@ pub use value_iter::ValueIter;
use crate::reader::boolean_array::BooleanArrayDecoder;
use crate::reader::decimal_array::DecimalArrayDecoder;
-use crate::reader::list_array::ListArrayDecoder;
+use crate::reader::list_array::{ListArrayDecoder, ListViewArrayDecoder};
use crate::reader::map_array::MapArrayDecoder;
use crate::reader::null_array::NullArrayDecoder;
use crate::reader::primitive_array::PrimitiveArrayDecoder;
@@ -792,6 +792,8 @@ fn make_decoder(
DataType::LargeUtf8 =>
Ok(Box::new(StringArrayDecoder::<i64>::new(coerce_primitive))),
DataType::List(_) => Ok(Box::new(ListArrayDecoder::<i32>::new(ctx,
data_type, is_nullable)?)),
DataType::LargeList(_) =>
Ok(Box::new(ListArrayDecoder::<i64>::new(ctx, data_type, is_nullable)?)),
+ DataType::ListView(_) =>
Ok(Box::new(ListViewArrayDecoder::<i32>::new(ctx, data_type, is_nullable)?)),
+ DataType::LargeListView(_) =>
Ok(Box::new(ListViewArrayDecoder::<i64>::new(ctx, data_type, is_nullable)?)),
DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(ctx,
data_type, is_nullable)?)),
DataType::Binary => Ok(Box::new(BinaryArrayDecoder::<i32>::default())),
DataType::LargeBinary =>
Ok(Box::new(BinaryArrayDecoder::<i64>::default())),
@@ -815,7 +817,10 @@ mod tests {
use std::io::{BufReader, Cursor, Seek};
use arrow_array::cast::AsArray;
- use arrow_array::{Array, BooleanArray, Float64Array, ListArray,
StringArray, StringViewArray};
+ use arrow_array::{
+ Array, BooleanArray, Float64Array, GenericListViewArray, ListArray,
OffsetSizeTrait,
+ StringArray, StringViewArray,
+ };
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_cast::display::{ArrayFormatter, FormatOptions};
use arrow_data::ArrayDataBuilder;
@@ -2192,6 +2197,77 @@ mod tests {
assert_eq!(read, expected);
}
+ fn assert_read_list_view<O: OffsetSizeTrait>() {
+ let field = Arc::new(Field::new("item", DataType::Int32, true));
+ let data_type =
GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(field.clone());
+ let schema = Arc::new(Schema::new(vec![Field::new("lv", data_type,
true)]));
+
+ let buf = r#"
+ {"lv": [1, 2, 3]}
+ {"lv": [4, null]}
+ {"lv": null}
+ {"lv": [6]}
+ {"lv": []}
+ "#;
+
+ let batches = do_read(buf, 1024, false, false, schema);
+ assert_eq!(batches.len(), 1);
+ let batch = &batches[0];
+ let col = batch.column(0);
+ let list_view = col
+ .as_any()
+ .downcast_ref::<GenericListViewArray<O>>()
+ .unwrap();
+
+ assert_eq!(list_view.len(), 5);
+
+ // Check offsets and sizes
+ let expected_offsets: Vec<O> = vec![0, 3, 5, 5, 6]
+ .into_iter()
+ .map(|v| O::usize_as(v))
+ .collect();
+ let expected_sizes: Vec<O> = vec![3, 2, 0, 1, 0]
+ .into_iter()
+ .map(|v| O::usize_as(v))
+ .collect();
+ assert_eq!(list_view.value_offsets(), &expected_offsets);
+ assert_eq!(list_view.value_sizes(), &expected_sizes);
+
+ // Row 0: [1, 2, 3]
+ assert!(list_view.is_valid(0));
+ let vals = list_view.value(0);
+ let ints = vals.as_primitive::<Int32Type>();
+ assert_eq!(ints.values(), &[1, 2, 3]);
+
+ // Row 1: [4, null]
+ assert!(list_view.is_valid(1));
+ let vals = list_view.value(1);
+ let ints = vals.as_primitive::<Int32Type>();
+ assert_eq!(ints.len(), 2);
+ assert_eq!(ints.value(0), 4);
+ assert!(ints.is_null(1));
+
+ // Row 2: null
+ assert!(list_view.is_null(2));
+
+ // Row 3: [6]
+ assert!(list_view.is_valid(3));
+ let vals = list_view.value(3);
+ let ints = vals.as_primitive::<Int32Type>();
+ assert_eq!(ints.values(), &[6]);
+
+ // Row 4: []
+ assert!(list_view.is_valid(4));
+ let vals = list_view.value(4);
+ assert_eq!(vals.len(), 0);
+ }
+
+ #[test]
+ fn test_read_list_view() {
+ assert_read_list_view::<i32>();
+ assert_read_list_view::<i64>();
+ }
+
#[test]
fn test_skip_empty_lines() {
let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
diff --git a/arrow-json/src/writer/encoder.rs b/arrow-json/src/writer/encoder.rs
index d7c3fbbe2e..45055c5a36 100644
--- a/arrow-json/src/writer/encoder.rs
+++ b/arrow-json/src/writer/encoder.rs
@@ -352,15 +352,23 @@ pub fn make_encoder<'a>(
}
DataType::List(_) => {
let array = array.as_list::<i32>();
- NullableEncoder::new(Box::new(ListEncoder::try_new(field, array,
options)?), array.nulls().cloned())
+ NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field,
array, options)?), array.nulls().cloned())
}
DataType::LargeList(_) => {
let array = array.as_list::<i64>();
- NullableEncoder::new(Box::new(ListEncoder::try_new(field, array,
options)?), array.nulls().cloned())
+ NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field,
array, options)?), array.nulls().cloned())
+ }
+ DataType::ListView(_) => {
+ let array = array.as_list_view::<i32>();
+ NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field,
array, options)?), array.nulls().cloned())
+ }
+ DataType::LargeListView(_) => {
+ let array = array.as_list_view::<i64>();
+ NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field,
array, options)?), array.nulls().cloned())
}
DataType::FixedSizeList(_, _) => {
let array = array.as_fixed_size_list();
- NullableEncoder::new(Box::new(FixedSizeListEncoder::try_new(field,
array, options)?), array.nulls().cloned())
+ NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field,
array, options)?), array.nulls().cloned())
}
DataType::Dictionary(_, _) => downcast_dictionary_array! {
@@ -639,77 +647,30 @@ impl Encoder for BinaryViewEncoder<'_> {
}
}
-struct ListEncoder<'a, O: OffsetSizeTrait> {
- offsets: OffsetBuffer<O>,
- encoder: NullableEncoder<'a>,
-}
-
-impl<'a, O: OffsetSizeTrait> ListEncoder<'a, O> {
- fn try_new(
- field: &'a FieldRef,
- array: &'a GenericListArray<O>,
- options: &'a EncoderOptions,
- ) -> Result<Self, ArrowError> {
- let encoder = make_encoder(field, array.values().as_ref(), options)?;
- Ok(Self {
- offsets: array.offsets().clone(),
- encoder,
- })
- }
-}
-
-impl<O: OffsetSizeTrait> Encoder for ListEncoder<'_, O> {
- fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
- let end = self.offsets[idx + 1].as_usize();
- let start = self.offsets[idx].as_usize();
- out.push(b'[');
-
- if self.encoder.has_nulls() {
- for idx in start..end {
- if idx != start {
- out.push(b',')
- }
- if self.encoder.is_null(idx) {
- out.extend_from_slice(b"null");
- } else {
- self.encoder.encode(idx, out);
- }
- }
- } else {
- for idx in start..end {
- if idx != start {
- out.push(b',')
- }
- self.encoder.encode(idx, out);
- }
- }
- out.push(b']');
- }
-}
-
-struct FixedSizeListEncoder<'a> {
- value_length: usize,
+struct ListLikeEncoder<'a, L: ListLikeArray> {
+ list_array: &'a L,
encoder: NullableEncoder<'a>,
}
-impl<'a> FixedSizeListEncoder<'a> {
+impl<'a, L: ListLikeArray> ListLikeEncoder<'a, L> {
fn try_new(
field: &'a FieldRef,
- array: &'a FixedSizeListArray,
+ array: &'a L,
options: &'a EncoderOptions,
) -> Result<Self, ArrowError> {
let encoder = make_encoder(field, array.values().as_ref(), options)?;
Ok(Self {
+ list_array: array,
encoder,
- value_length: array.value_length().as_usize(),
})
}
}
-impl Encoder for FixedSizeListEncoder<'_> {
+impl<L: ListLikeArray> Encoder for ListLikeEncoder<'_, L> {
fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
- let start = idx * self.value_length;
- let end = start + self.value_length;
+ let range = self.list_array.element_range(idx);
+ let start = range.start;
+ let end = range.end;
out.push(b'[');
if self.encoder.has_nulls() {
for idx in start..end {
diff --git a/arrow-json/src/writer/mod.rs b/arrow-json/src/writer/mod.rs
index 2fac5ab623..04cc8c9e2a 100644
--- a/arrow-json/src/writer/mod.rs
+++ b/arrow-json/src/writer/mod.rs
@@ -1241,6 +1241,54 @@ mod tests {
);
}
+ fn assert_write_list_view<O: OffsetSizeTrait>() {
+ let field = Arc::new(Field::new("item", DataType::Int32, true));
+ let data_type =
GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(field.clone());
+ let schema = Schema::new(vec![Field::new("lv", data_type, true)]);
+
+ // rows: [1, 2, 3], [4, null], null, [6]
+ let values = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4),
None, Some(6)]);
+ let offsets = [0, 3, 0, 5]
+ .iter()
+ .map(|&v| O::from_usize(v).unwrap())
+ .collect::<Vec<_>>();
+ let sizes = [3, 2, 0, 1]
+ .iter()
+ .map(|&v| O::from_usize(v).unwrap())
+ .collect::<Vec<_>>();
+ let list_view = GenericListViewArray::<O>::try_new(
+ field,
+ ScalarBuffer::from(offsets),
+ ScalarBuffer::from(sizes),
+ Arc::new(values),
+ Some(NullBuffer::from_iter([true, true, false, true])),
+ )
+ .unwrap();
+
+ let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(list_view)]).unwrap();
+
+ let mut buf = Vec::new();
+ {
+ let mut writer = LineDelimitedWriter::new(&mut buf);
+ writer.write_batches(&[&batch]).unwrap();
+ }
+
+ assert_json_eq(
+ &buf,
+ r#"{"lv":[1,2,3]}
+{"lv":[4,null]}
+{}
+{"lv":[6]}
+"#,
+ );
+ }
+
+ #[test]
+ fn write_list_view() {
+ assert_write_list_view::<i32>();
+ assert_write_list_view::<i64>();
+ }
+
fn test_write_for_file(test_file: &str, remove_nulls: bool) {
let file = File::open(test_file).unwrap();
let mut reader = BufReader::new(file);