Github user twalthr commented on a diff in the pull request: https://github.com/apache/flink/pull/5043#discussion_r152513215 --- Diff: flink-connectors/flink-orc/src/main/java/org/apache/flink/orc/OrcUtils.java --- @@ -0,0 +1,1511 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.orc; + +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.types.Row; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +import org.apache.orc.TypeDescription; + +import java.lang.reflect.Array; +import java.math.BigDecimal; +import java.sql.Date; +import java.sql.Timestamp; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.TimeZone; +import java.util.function.DoubleFunction; +import java.util.function.IntFunction; +import java.util.function.LongFunction; + +/** + * A class that provides utility methods for orc file reading. + */ +class OrcUtils { + + private static final long MILLIS_PER_DAY = 86400000; // = 24 * 60 * 60 * 1000 + private static final TimeZone LOCAL_TZ = TimeZone.getDefault(); + + /** + * Converts an ORC schema to a Flink TypeInformation. 
+ * + * @param schema The ORC schema. + * @return The TypeInformation that corresponds to the ORC schema. + */ + static TypeInformation schemaToTypeInfo(TypeDescription schema) { + switch (schema.getCategory()) { + case BOOLEAN: + return BasicTypeInfo.BOOLEAN_TYPE_INFO; + case BYTE: + return BasicTypeInfo.BYTE_TYPE_INFO; + case SHORT: + return BasicTypeInfo.SHORT_TYPE_INFO; + case INT: + return BasicTypeInfo.INT_TYPE_INFO; + case LONG: + return BasicTypeInfo.LONG_TYPE_INFO; + case FLOAT: + return BasicTypeInfo.FLOAT_TYPE_INFO; + case DOUBLE: + return BasicTypeInfo.DOUBLE_TYPE_INFO; + case DECIMAL: + return BasicTypeInfo.BIG_DEC_TYPE_INFO; + case STRING: + case CHAR: + case VARCHAR: + return BasicTypeInfo.STRING_TYPE_INFO; + case DATE: + return SqlTimeTypeInfo.DATE; + case TIMESTAMP: + return SqlTimeTypeInfo.TIMESTAMP; + case BINARY: + return PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO; + case STRUCT: + List<TypeDescription> fieldSchemas = schema.getChildren(); + TypeInformation[] fieldTypes = new TypeInformation[fieldSchemas.size()]; + for (int i = 0; i < fieldSchemas.size(); i++) { + fieldTypes[i] = schemaToTypeInfo(fieldSchemas.get(i)); + } + String[] fieldNames = schema.getFieldNames().toArray(new String[]{}); + return new RowTypeInfo(fieldTypes, fieldNames); + case LIST: + TypeDescription elementSchema = schema.getChildren().get(0); + TypeInformation<?> elementType = schemaToTypeInfo(elementSchema); + // arrays of primitive types are handled as object arrays to support null values + return ObjectArrayTypeInfo.getInfoFor(elementType); + case MAP: + TypeDescription keySchema = schema.getChildren().get(0); + TypeDescription valSchema = schema.getChildren().get(1); + TypeInformation<?> keyType = schemaToTypeInfo(keySchema); + TypeInformation<?> valType = schemaToTypeInfo(valSchema); + return new MapTypeInfo<>(keyType, valType); + case UNION: + throw new UnsupportedOperationException("UNION type is not supported yet."); + default: + throw new 
IllegalArgumentException("Unknown type " + schema); + } + } + + /** + * Fills an ORC batch into an array of Row. + * + * @param rows The batch of rows need to be filled. + * @param schema The schema of the ORC data. + * @param batch The ORC data. + * @param selectedFields The list of selected ORC fields. + * @return The number of rows that were filled. + */ + static int fillRows(Row[] rows, TypeDescription schema, VectorizedRowBatch batch, int[] selectedFields) { + + int rowsToRead = Math.min((int) batch.count(), rows.length); + + List<TypeDescription> fieldTypes = schema.getChildren(); + // read each selected field + for (int rowIdx = 0; rowIdx < selectedFields.length; rowIdx++) { + int orcIdx = selectedFields[rowIdx]; + readField(rows, rowIdx, fieldTypes.get(orcIdx), batch.cols[orcIdx], null, rowsToRead); + } + return rowsToRead; + } + + /** + * Reads a vector of data into an array of objects. + * + * @param vals The array that needs to be filled. + * @param fieldIdx If the vals array is an array of Row, the index of the field that needs to be filled. + * Otherwise a -1 must be passed and the data is directly filled into the array. + * @param schema The schema of the vector to read. + * @param vector The vector to read. + * @param lengthVector If the vector is of type List or Map, the number of sub-elements to read for each field. Otherwise, it must be null. + * @param childCount The number of vector entries to read. + */ + private static void readField(Object[] vals, int fieldIdx, TypeDescription schema, ColumnVector vector, long[] lengthVector, int childCount) { + + // check the type of the vector to decide how to read it. 
+ switch (schema.getCategory()) { + case BOOLEAN: + if (vector.noNulls) { + readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount, OrcUtils::readBoolean, OrcUtils::boolArray); + } else { + readLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount, OrcUtils::readBoolean, OrcUtils::boolArray); + } + break; + case BYTE: + if (vector.noNulls) { + readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount, OrcUtils::readByte, OrcUtils::byteArray); + } else { + readLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount, OrcUtils::readByte, OrcUtils::byteArray); + } + break; + case SHORT: + if (vector.noNulls) { + readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount, OrcUtils::readShort, OrcUtils::shortArray); + } else { + readLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount, OrcUtils::readShort, OrcUtils::shortArray); + } + break; + case INT: + if (vector.noNulls) { + readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount, OrcUtils::readInt, OrcUtils::intArray); + } else { + readLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount, OrcUtils::readInt, OrcUtils::intArray); + } + break; + case LONG: + if (vector.noNulls) { + readNonNullLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount, OrcUtils::readLong, OrcUtils::longArray); + } else { + readLongColumn(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount, OrcUtils::readLong, OrcUtils::longArray); + } + break; + case FLOAT: + if (vector.noNulls) { + readNonNullDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, lengthVector, childCount, OrcUtils::readFloat, OrcUtils::floatArray); + } else { + readDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, lengthVector, childCount, OrcUtils::readFloat, OrcUtils::floatArray); + } + break; 
+ case DOUBLE: + if (vector.noNulls) { + readNonNullDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, lengthVector, childCount, OrcUtils::readDouble, OrcUtils::doubleArray); + } else { + readDoubleColumn(vals, fieldIdx, (DoubleColumnVector) vector, lengthVector, childCount, OrcUtils::readDouble, OrcUtils::doubleArray); + } + break; + case CHAR: + case VARCHAR: + case STRING: + if (vector.noNulls) { + readNonNullBytesColumnAsString(vals, fieldIdx, (BytesColumnVector) vector, lengthVector, childCount); + } else { + readBytesColumnAsString(vals, fieldIdx, (BytesColumnVector) vector, lengthVector, childCount); + } + break; + case DATE: + if (vector.noNulls) { + readNonNullLongColumnAsDate(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount); + } else { + readLongColumnAsDate(vals, fieldIdx, (LongColumnVector) vector, lengthVector, childCount); + } + break; + case TIMESTAMP: + if (vector.noNulls) { + readNonNullTimestampColumn(vals, fieldIdx, (TimestampColumnVector) vector, lengthVector, childCount); + } else { + readTimestampColumn(vals, fieldIdx, (TimestampColumnVector) vector, lengthVector, childCount); + } + break; + case BINARY: + if (vector.noNulls) { + readNonNullBytesColumnAsBinary(vals, fieldIdx, (BytesColumnVector) vector, lengthVector, childCount); + } else { + readBytesColumnAsBinary(vals, fieldIdx, (BytesColumnVector) vector, lengthVector, childCount); + } + break; + case DECIMAL: + if (vector.noNulls) { + readNonNullDecimalColumn(vals, fieldIdx, (DecimalColumnVector) vector, lengthVector, childCount); + } + else { + readDecimalColumn(vals, fieldIdx, (DecimalColumnVector) vector, lengthVector, childCount); + } + break; + case STRUCT: + if (vector.noNulls) { + readNonNullStructColumn(vals, fieldIdx, (StructColumnVector) vector, schema, lengthVector, childCount); + } else { + readStructColumn(vals, fieldIdx, (StructColumnVector) vector, schema, lengthVector, childCount); + } + break; + case LIST: + if (vector.noNulls) { + 
readNonNullListColumn(vals, fieldIdx, (ListColumnVector) vector, schema, lengthVector, childCount); + } + else { + readListColumn(vals, fieldIdx, (ListColumnVector) vector, schema, lengthVector, childCount); + } + break; + case MAP: + if (vector.noNulls) { + readNonNullMapColumn(vals, fieldIdx, (MapColumnVector) vector, schema, lengthVector, childCount); + } + else { + readMapColumn(vals, fieldIdx, (MapColumnVector) vector, schema, lengthVector, childCount); + } + break; + case UNION: + throw new UnsupportedOperationException("UNION type not supported yet"); + default: + throw new IllegalArgumentException("Unknown type " + schema); + } + } + + private static <T> void readNonNullLongColumn(Object[] vals, int fieldIdx, LongColumnVector vector, long[] lengthVector, int childCount, + LongFunction<T> reader, IntFunction<T[]> array) { + + // check if the values need to be read into lists or as single values + if (lengthVector == null) { + if (vector.isRepeating) { // fill complete column with first value + T repeatingValue = reader.apply(vector.vector[0]); + fillColumnWithRepeatingValue(vals, fieldIdx, repeatingValue, childCount); + } else { + if (fieldIdx == -1) { // set as an object + for (int i = 0; i < childCount; i++) { + vals[i] = reader.apply(vector.vector[i]); + } + } else { // set as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + rows[i].setField(fieldIdx, reader.apply(vector.vector[i])); + } + } + } + } else { // in a list + T[] temp; + int offset = 0; + if (vector.isRepeating) { // fill complete list with first value + T repeatingValue = reader.apply(vector.vector[0]); + for (int i = 0; offset < childCount; i++) { + temp = array.apply((int) lengthVector[i]); + Arrays.fill(temp, repeatingValue); + offset += temp.length; + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } else { + for (int i = 0; offset < childCount; i++) { + temp = array.apply((int) 
lengthVector[i]); + for (int j = 0; j < temp.length; j++) { + temp[j] = reader.apply(vector.vector[offset++]); + } + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } + } + } + + private static <T> void readNonNullDoubleColumn(Object[] vals, int fieldIdx, DoubleColumnVector vector, long[] lengthVector, int childCount, + DoubleFunction<T> reader, IntFunction<T[]> array) { + + // check if the values need to be read into lists or as single values + if (lengthVector == null) { + if (vector.isRepeating) { // fill complete column with first value + T repeatingValue = reader.apply(vector.vector[0]); + fillColumnWithRepeatingValue(vals, fieldIdx, repeatingValue, childCount); + } else { + if (fieldIdx == -1) { // set as an object + for (int i = 0; i < childCount; i++) { + vals[i] = reader.apply(vector.vector[i]); + } + } else { // set as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + rows[i].setField(fieldIdx, reader.apply(vector.vector[i])); + } + } + } + } else { // in a list + T[] temp; + int offset = 0; + if (vector.isRepeating) { // fill complete list with first value + T repeatingValue = reader.apply(vector.vector[0]); + for (int i = 0; offset < childCount; i++) { + temp = array.apply((int) lengthVector[i]); + Arrays.fill(temp, repeatingValue); + offset += temp.length; + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } else { + for (int i = 0; offset < childCount; i++) { + temp = array.apply((int) lengthVector[i]); + for (int j = 0; j < temp.length; j++) { + temp[j] = reader.apply(vector.vector[offset++]); + } + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } + } + } + + private static void readNonNullBytesColumnAsString(Object[] vals, int fieldIdx, BytesColumnVector bytes, long[] lengthVector, int childCount) { + // check if the values need to be read into 
lists or as single values + if (lengthVector == null) { + if (bytes.isRepeating) { // fill complete column with first value + String repeatingValue = new String(bytes.vector[0], bytes.start[0], bytes.length[0]); + fillColumnWithRepeatingValue(vals, fieldIdx, repeatingValue, childCount); + } else { + if (fieldIdx == -1) { // set as an object + for (int i = 0; i < childCount; i++) { + vals[i] = new String(bytes.vector[i], bytes.start[i], bytes.length[i]); + } + } else { // set as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + rows[i].setField(fieldIdx, new String(bytes.vector[i], bytes.start[i], bytes.length[i])); + } + } + } + } else { + String[] temp; + int offset = 0; + if (bytes.isRepeating) { // fill complete list with first value + String repeatingValue = new String(bytes.vector[0], bytes.start[0], bytes.length[0]); + for (int i = 0; offset < childCount; i++) { + temp = new String[(int) lengthVector[i]]; + Arrays.fill(temp, repeatingValue); + offset += temp.length; + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } else { + for (int i = 0; offset < childCount; i++) { + temp = new String[(int) lengthVector[i]]; + for (int j = 0; j < temp.length; j++) { + temp[j] = new String(bytes.vector[offset], bytes.start[offset], bytes.length[offset]); + offset++; + } + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } + } + } + + private static void readNonNullBytesColumnAsBinary(Object[] vals, int fieldIdx, BytesColumnVector bytes, long[] lengthVector, int childCount) { + // check if the values need to be read into lists or as single values + if (lengthVector == null) { + if (bytes.isRepeating) { // fill complete column with first value + if (fieldIdx == -1) { // set as an object + for (int i = 0; i < childCount; i++) { + // don't reuse repeating val to avoid object mutation + vals[i] = readBinary(bytes.vector[0], 
bytes.start[0], bytes.length[0]); + } + } else { // set as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + // don't reuse repeating val to avoid object mutation + rows[i].setField(fieldIdx, readBinary(bytes.vector[0], bytes.start[0], bytes.length[0])); + } + } + } else { + if (fieldIdx == -1) { // set as an object + for (int i = 0; i < childCount; i++) { + vals[i] = readBinary(bytes.vector[i], bytes.start[i], bytes.length[i]); + } + } else { // set as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + rows[i].setField(fieldIdx, readBinary(bytes.vector[i], bytes.start[i], bytes.length[i])); + } + } + } + } else { + byte[][] temp; + int offset = 0; + if (bytes.isRepeating) { // fill complete list with first value + for (int i = 0; offset < childCount; i++) { + temp = new byte[(int) lengthVector[i]][]; + for (int j = 0; j < temp.length; j++) { + temp[j] = readBinary(bytes.vector[0], bytes.start[0], bytes.length[0]); + } + offset += temp.length; + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } else { + for (int i = 0; offset < childCount; i++) { + temp = new byte[(int) lengthVector[i]][]; + for (int j = 0; j < temp.length; j++) { + temp[j] = readBinary(bytes.vector[offset], bytes.start[offset], bytes.length[offset]); + offset++; + } + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } + } + } + + private static void readNonNullLongColumnAsDate(Object[] vals, int fieldIdx, LongColumnVector vector, long[] lengthVector, int childCount) { + + // check if the values need to be read into lists or as single values + if (lengthVector == null) { + if (vector.isRepeating) { // fill complete column with first value + if (fieldIdx == -1) { // set as an object + for (int i = 0; i < childCount; i++) { + // do not reuse repeated value due to mutability of Date + vals[i] = 
readDate(vector.vector[0]); + } + } else { // set as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + // do not reuse repeated value due to mutability of Date + rows[i].setField(fieldIdx, readDate(vector.vector[0])); + } + } + } else { + if (fieldIdx == -1) { // set as an object + for (int i = 0; i < childCount; i++) { + vals[i] = readDate(vector.vector[i]); + } + } else { // set as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + rows[i].setField(fieldIdx, readDate(vector.vector[i])); + } + } + } + } else { // in a list + Date[] temp; + int offset = 0; + if (vector.isRepeating) { // fill complete list with first value + for (int i = 0; offset < childCount; i++) { + temp = new Date[(int) lengthVector[i]]; + for (int j = 0; j < temp.length; j++) { + temp[j] = readDate(vector.vector[0]); + } + offset += temp.length; + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } else { + for (int i = 0; offset < childCount; i++) { + temp = new Date[(int) lengthVector[i]]; + for (int j = 0; j < temp.length; j++) { + temp[j] = readDate(vector.vector[offset++]); + } + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } + } + } + + private static void readNonNullTimestampColumn(Object[] vals, int fieldIdx, TimestampColumnVector vector, long[] lengthVector, int childCount) { + + // check if the timestamps need to be read into lists or as single values + if (lengthVector == null) { + if (vector.isRepeating) { // fill complete column with first value + if (fieldIdx == -1) { // set as an object + for (int i = 0; i < childCount; i++) { + // do not reuse value to prevent object mutation + vals[i] = readTimestamp(vector.time[0], vector.nanos[0]); + } + } else { // set as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + // do not reuse value to prevent object mutation 
+ rows[i].setField(fieldIdx, readTimestamp(vector.time[0], vector.nanos[0])); + } + } + } else { + if (fieldIdx == -1) { // set as an object + for (int i = 0; i < childCount; i++) { + vals[i] = readTimestamp(vector.time[i], vector.nanos[i]); + } + } else { // set as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + rows[i].setField(fieldIdx, readTimestamp(vector.time[i], vector.nanos[i])); + } + } + } + } else { + Timestamp[] temp; + int offset = 0; + if (vector.isRepeating) { // fill complete list with first value + for (int i = 0; offset < childCount; i++) { + temp = new Timestamp[(int) lengthVector[i]]; + for (int j = 0; j < temp.length; j++) { + // do not reuse value to prevent object mutation + temp[j] = readTimestamp(vector.time[0], vector.nanos[0]); + } + offset += temp.length; + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } else { + for (int i = 0; offset < childCount; i++) { + temp = new Timestamp[(int) lengthVector[i]]; + for (int j = 0; j < temp.length; j++) { + temp[j] = readTimestamp(vector.time[offset], vector.nanos[offset]); + offset++; + } + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } + } + } + + private static void readNonNullDecimalColumn(Object[] vals, int fieldIdx, DecimalColumnVector vector, long[] lengthVector, int childCount) { + + // check if the decimals need to be read into lists or as single values + if (lengthVector == null) { + if (vector.isRepeating) { // fill complete column with first value + fillColumnWithRepeatingValue(vals, fieldIdx, readBigDecimal(vector.vector[0]), childCount); + } else { + if (fieldIdx == -1) { // set as an object + for (int i = 0; i < childCount; i++) { + vals[i] = readBigDecimal(vector.vector[i]); + } + } else { // set as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + rows[i].setField(fieldIdx, 
readBigDecimal(vector.vector[i])); + } + } + } + } else { + BigDecimal[] temp; + int offset = 0; + if (vector.isRepeating) { // fill complete list with first value + BigDecimal repeatingValue = readBigDecimal(vector.vector[0]); + for (int i = 0; offset < childCount; i++) { + temp = new BigDecimal[(int) lengthVector[i]]; + Arrays.fill(temp, repeatingValue); + offset += temp.length; + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } else { + for (int i = 0; offset < childCount; i++) { + temp = new BigDecimal[(int) lengthVector[i]]; + for (int j = 0; j < temp.length; j++) { + temp[j] = readBigDecimal(vector.vector[offset++]); + } + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } + } + + } + + private static void readNonNullStructColumn(Object[] vals, int fieldIdx, StructColumnVector structVector, TypeDescription schema, long[] lengthVector, int childCount) { + + List<TypeDescription> childrenTypes = schema.getChildren(); + + int numFields = childrenTypes.size(); + // create a batch of Rows to read the structs + Row[] structs = new Row[childCount]; + // TODO: possible improvement: reuse existing Row objects + for (int i = 0; i < childCount; i++) { + structs[i] = new Row(numFields); + } + + // read struct fields + for (int i = 0; i < numFields; i++) { + readField(structs, i, childrenTypes.get(i), structVector.fields[i], null, childCount); + } + + // check if the structs need to be read into lists or as single values + if (lengthVector == null) { + if (fieldIdx == -1) { // set struct as an object + System.arraycopy(structs, 0, vals, 0, childCount); + } else { // set struct as a field of Row + Row[] rows = (Row[]) vals; + for (int i = 0; i < childCount; i++) { + rows[i].setField(fieldIdx, structs[i]); + } + } + } else { // struct in a list + int offset = 0; + Row[] temp; + for (int i = 0; offset < childCount; i++) { + temp = new Row[(int) lengthVector[i]]; 
+ System.arraycopy(structs, offset, temp, 0, temp.length); + offset = offset + temp.length; + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } + } + + private static void readNonNullListColumn(Object[] vals, int fieldIdx, ListColumnVector list, TypeDescription schema, long[] lengthVector, int childCount) { + + TypeDescription fieldType = schema.getChildren().get(0); + // check if the list need to be read into lists or as single values + if (lengthVector == null) { + long[] lengthVectorNested = list.lengths; + readField(vals, fieldIdx, fieldType, list.child, lengthVectorNested, list.childCount); + } else { // list in a list + Object[] nestedLists = new Object[childCount]; + // length vector for nested list + long[] lengthVectorNested = list.lengths; + // read nested list + readField(nestedLists, -1, fieldType, list.child, lengthVectorNested, list.childCount); + // get type of nestedList + Class<?> classType = nestedLists[0].getClass(); + + // fill outer list with nested list + int offset = 0; + int length; + for (int i = 0; offset < childCount; i++) { + length = (int) lengthVector[i]; + Object[] temp = (Object[]) Array.newInstance(classType, length); + System.arraycopy(nestedLists, offset, temp, 0, length); + offset = offset + length; + if (fieldIdx == -1) { + vals[i] = temp; + } else { + ((Row) vals[i]).setField(fieldIdx, temp); + } + } + } + } + + private static void readNonNullMapColumn(Object[] vals, int fieldIdx, MapColumnVector mapsVector, TypeDescription schema, long[] lengthVector, int childCount) { + + List<TypeDescription> fieldType = schema.getChildren(); + TypeDescription keyType = fieldType.get(0); + TypeDescription valueType = fieldType.get(1); + + ColumnVector keys = mapsVector.keys; + ColumnVector values = mapsVector.values; + Object[] keyRows = new Object[mapsVector.childCount]; + Object[] valueRows = new Object[mapsVector.childCount]; + + // read map kes and values --- End diff -- Typo in the code comment above: "kes" should be "keys".
---