[ https://issues.apache.org/jira/browse/ARROW-5231?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Rok Mihevc updated ARROW-5231: ------------------------------ External issue URL: https://github.com/apache/arrow/issues/21704 > [Java] Arrow Java can't read union vector from ArrowStreamReader written by > its own bugs > ----------------------------------------------------------------------------------------- > > Key: ARROW-5231 > URL: https://issues.apache.org/jira/browse/ARROW-5231 > Project: Apache Arrow > Issue Type: Bug > Components: Java > Environment: Mac OS 10.13.6, Arrow 0.13.0, JDK8 > Reporter: Shawn Yang > Priority: Major > > When writing union data using ArrowStreamWriter in java, I can't read it > back using ArrowStreamReader in java. The exception is: > {quote}Exception in thread "main" java.lang.IllegalArgumentException: not all > nodes and buffers were consumed. nodes: [ArrowFieldNode [length=100, > nullCount=0]] buffers: [ArrowBuf[14], udle: [7 104..117], ArrowBuf[15], udle: > [7 120..520]] > at org.apache.arrow.vector.VectorLoader.load(VectorLoader.java:64) > at > org.apache.arrow.vector.ipc.ArrowReader.loadRecordBatch(ArrowReader.java:219) > at > org.apache.arrow.vector.ipc.ArrowStreamReader.loadNextBatch(ArrowStreamReader.java:121) > {quote} > The code to reproduce this exception is: > > {code:java} > import org.apache.arrow.memory.RootAllocator; > import org.apache.arrow.vector.FieldVector; > import org.apache.arrow.vector.VectorSchemaRoot; > import org.apache.arrow.vector.complex.UnionVector; > import org.apache.arrow.vector.dictionary.DictionaryProvider; > import org.apache.arrow.vector.holders.NullableIntHolder; > import org.apache.arrow.vector.ipc.ArrowStreamReader; > import org.apache.arrow.vector.ipc.ArrowStreamWriter; > import org.apache.arrow.vector.types.UnionMode; > import org.apache.arrow.vector.types.pojo.ArrowType; > import org.apache.arrow.vector.types.pojo.Field; > import org.apache.arrow.vector.types.pojo.FieldType; > import org.apache.arrow.vector.types.pojo.Schema; > import java.io.ByteArrayInputStream; > import 
java.io.FileOutputStream; > import java.io.IOException; > import java.io.OutputStream; > import java.nio.file.Files; > import java.nio.file.Paths; > import java.util.Collections; > import java.util.List; > public class UnionTest { > public static void writeUnionBatch(OutputStream os) throws IOException { > int[] typeIds = new int[]{ArrowType.ArrowTypeID.Int.ordinal()}; > ArrowType.Union union = new ArrowType.Union(UnionMode.Sparse, > typeIds); > List<Field> childList = Collections.singletonList( > new Field("s1", FieldType.nullable(new ArrowType.Int(32, true)), null) > ); > Field field = new Field("f1", FieldType.nullable(union), childList); > List<Field> fields = Collections.singletonList(field); > Schema schema = new Schema(fields); > VectorSchemaRoot root = VectorSchemaRoot.create(schema, new > RootAllocator(Integer.MAX_VALUE)); > DictionaryProvider.MapDictionaryProvider provider = new > DictionaryProvider.MapDictionaryProvider(); > ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, os); > writer.start(); > for (int i = 0; i < 2; i++) { > root.setRowCount(100); > List<FieldVector> vectors = root.getFieldVectors(); > UnionVector vector = (UnionVector) vectors.get(0); > fillVector(vector, 100); > for (int j = 0; j < 100; j++) { > if (!vector.isNull(j)) { > System.out.println(vector.getObject(j)); > } > } > writer.writeBatch(); > } > writer.end(); > writer.close(); > } > private static void fillVector(UnionVector vector, int batchSize) { > vector.setInitialCapacity(batchSize); > vector.allocateNew(); > for (int i = 0; i < batchSize; i++) { > NullableIntHolder intHolder = new NullableIntHolder(); > intHolder.isSet = 1; > intHolder.value = i; > vector.setSafe(i, intHolder); > } > vector.setValueCount(batchSize); > } > public static void main(String[] args) throws IOException { > try(FileOutputStream fos = new > FileOutputStream("result/union.arrow")) { > writeUnionBatch(fos); > System.out.println("write succeed"); > fos.flush(); > } > RootAllocator 
allocator = new RootAllocator(1000000000); > ByteArrayInputStream in = new > ByteArrayInputStream(Files.readAllBytes(Paths.get("result/union.arrow"))); > ArrowStreamReader reader = new ArrowStreamReader(in, allocator); > reader.loadNextBatch(); > } > } > {code} > Arrow Java also can't read union data generated by Python, as reported in > https://issues.apache.org/jira/browse/ARROW-1692. > It seems strange that Arrow Java can't read union data that it wrote itself. Is > there a format mismatch between how Arrow Java writes and reads a UnionVector? > -- This message was sent by Atlassian Jira (v8.20.10#820010)