This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 309659152 add test for skip_values in DictionaryDecoder and fix it (#2105)
309659152 is described below

commit 3096591520d303eb34a432c82733e86f34999232
Author: Yang Jiang <[email protected]>
AuthorDate: Wed Jul 20 22:22:53 2022 +0800

    add test for skip_values in DictionaryDecoder and fix it (#2105)
---
 .../arrow/array_reader/byte_array_dictionary.rs    | 81 +++++++++++++++++++++-
 1 file changed, 79 insertions(+), 2 deletions(-)

diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
index 181af09e8..39d920ef1 100644
--- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
+++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -346,6 +346,7 @@ where
                         // Keys will be validated on conversion to arrow
                         let keys_slice = keys.spare_capacity_mut(range.start + len);
                         let len = decoder.get_batch(&mut keys_slice[range.start..])?;
+                        *max_remaining_values -= len;
                         Ok(len)
                     }
                     None => {
@@ -368,7 +369,7 @@ where
                             dict_offsets,
                             dict_values,
                         )?;
-
+                        *max_remaining_values -= len;
                         Ok(len)
                     }
                 }
@@ -476,6 +477,68 @@ mod tests {
         )
     }
 
+    #[test]
+    fn test_dictionary_preservation_skip() {
+        let data_type = utf8_dictionary();
+
+        let data: Vec<_> = vec!["0", "1", "0", "1", "2", "1", "2"]
+            .into_iter()
+            .map(ByteArray::from)
+            .collect();
+        let (dict, encoded) = encode_dictionary(&data);
+
+        let column_desc = utf8_column();
+        let mut decoder = DictionaryDecoder::<i32, i32>::new(&column_desc);
+
+        decoder
+            .set_dict(dict, 3, Encoding::RLE_DICTIONARY, false)
+            .unwrap();
+
+        decoder
+            .set_data(Encoding::RLE_DICTIONARY, encoded, 7, Some(data.len()))
+            .unwrap();
+
+        let mut output = DictionaryBuffer::<i32, i32>::default();
+
+        // read two skip one
+        assert_eq!(decoder.read(&mut output, 0..2).unwrap(), 2);
+        assert_eq!(decoder.skip_values(1).unwrap(), 1);
+
+        assert!(matches!(output, DictionaryBuffer::Dict { .. }));
+
+        // read two skip one
+        assert_eq!(decoder.read(&mut output, 2..4).unwrap(), 2);
+        assert_eq!(decoder.skip_values(1).unwrap(), 1);
+
+        // read one and test on skip at the end
+        assert_eq!(decoder.read(&mut output, 4..5).unwrap(), 1);
+        assert_eq!(decoder.skip_values(4).unwrap(), 0);
+
+        let valid = vec![true, true, true, true, true];
+        let valid_buffer = Buffer::from_iter(valid.iter().cloned());
+        output.pad_nulls(0, 5, 5, valid_buffer.as_slice());
+
+        assert!(matches!(output, DictionaryBuffer::Dict { .. }));
+
+        let array = output.into_array(Some(valid_buffer), &data_type).unwrap();
+        assert_eq!(array.data_type(), &data_type);
+
+        let array = cast(&array, &ArrowType::Utf8).unwrap();
+        let strings = array.as_any().downcast_ref::<StringArray>().unwrap();
+        assert_eq!(strings.len(), 5);
+
+        assert_eq!(
+            strings.iter().collect::<Vec<_>>(),
+            vec![
+                Some("0"),
+                Some("1"),
+                Some("1"),
+                Some("2"),
+                Some("2"),
+            ]
+        )
+    }
+
     #[test]
     fn test_dictionary_fallback() {
         let data_type = utf8_dictionary();
@@ -599,7 +662,7 @@ mod tests {
             .set_dict(encoded_dictionary, 4, Encoding::PLAIN_DICTIONARY, false)
             .unwrap();
 
-        for (encoding, page) in pages {
+        for (encoding, page) in pages.clone() {
             let mut output = DictionaryBuffer::<i32, i32>::default();
             decoder.set_data(encoding, page, 8, None).unwrap();
             assert_eq!(decoder.read(&mut output, 0..1024).unwrap(), 0);
@@ -612,5 +675,19 @@ mod tests {
             assert_eq!(array.len(), 8);
             assert_eq!(array.null_count(), 8);
         }
+
+        for (encoding, page) in pages {
+            let mut output = DictionaryBuffer::<i32, i32>::default();
+            decoder.set_data(encoding, page, 8, None).unwrap();
+            assert_eq!(decoder.skip_values(1024).unwrap(), 0);
+
+            output.pad_nulls(0, 0, 8, &[0]);
+            let array = output
+                .into_array(Some(Buffer::from(&[0])), &data_type)
+                .unwrap();
+
+            assert_eq!(array.len(), 8);
+            assert_eq!(array.null_count(), 8);
+        }
     }
 }

Reply via email to