Github user haohui commented on a diff in the pull request:

    https://github.com/apache/flink/pull/4670#discussion_r140700434

    --- Diff: flink-connectors/flink-orc/src/main/java/org/apache/flink/orc/OrcUtils.java ---
    @@ -0,0 +1,2229 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements. See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership. The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License. You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.flink.orc;
    +
    +import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
    +import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
    +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo;
    +import org.apache.flink.api.common.typeinfo.TypeInformation;
    +import org.apache.flink.api.java.typeutils.MapTypeInfo;
    +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo;
    +import org.apache.flink.api.java.typeutils.RowTypeInfo;
    +import org.apache.flink.types.Row;
    +
    +import org.apache.hadoop.hive.common.type.HiveDecimal;
    +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
    +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
    +
    +import org.apache.orc.TypeDescription;
    +
    +import java.lang.reflect.Array;
    +import java.math.BigDecimal;
    +import java.sql.Date;
    +import java.sql.Timestamp;
    +import java.util.Arrays;
    +import java.util.HashMap;
    +import java.util.List;
    +
    +/**
    + * A class that provides utility methods for orc file reading.
    + */
    +public class OrcUtils {
    +
    +	/**
    +	 * Convert ORC schema types to Flink types.
    +	 *
    +	 * @param schema schema of the ORC file
    +	 */
    +	public static TypeInformation schemaToTypeInfo(TypeDescription schema) {
    +		switch (schema.getCategory()) {
    +			case BOOLEAN:
    +				return BasicTypeInfo.BOOLEAN_TYPE_INFO;
    +			case BYTE:
    +				return BasicTypeInfo.BYTE_TYPE_INFO;
    +			case SHORT:
    +				return BasicTypeInfo.SHORT_TYPE_INFO;
    +			case INT:
    +				return BasicTypeInfo.INT_TYPE_INFO;
    +			case LONG:
    +				return BasicTypeInfo.LONG_TYPE_INFO;
    +			case FLOAT:
    +				return BasicTypeInfo.FLOAT_TYPE_INFO;
    +			case DOUBLE:
    +				return BasicTypeInfo.DOUBLE_TYPE_INFO;
    +			case STRING:
    +			case CHAR:
    +			case VARCHAR:
    +				return BasicTypeInfo.STRING_TYPE_INFO;
    +			case DATE:
    +				return SqlTimeTypeInfo.DATE;
    +			case TIMESTAMP:
    +				return SqlTimeTypeInfo.TIMESTAMP;
    +			case BINARY:
    +				return PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO;
    +			case STRUCT:
    +				List<TypeDescription> fieldSchemas = schema.getChildren();
    +				TypeInformation[] fieldTypes = new TypeInformation[fieldSchemas.size()];
    +				for (int i = 0; i < fieldSchemas.size(); i++) {
    +					fieldTypes[i] = schemaToTypeInfo(fieldSchemas.get(i));
    +				}
    +				String[] fieldNames = schema.getFieldNames().toArray(new String[]{});
    +				return new RowTypeInfo(fieldTypes, fieldNames);
    +			case LIST:
    +				TypeDescription elementSchema = schema.getChildren().get(0);
    +				TypeInformation elementType = schemaToTypeInfo(elementSchema);
    +				return ObjectArrayTypeInfo.getInfoFor(elementType);
    +			case MAP:
    +				TypeDescription keySchema = schema.getChildren().get(0);
    +				TypeDescription valSchema = schema.getChildren().get(1);
    +				TypeInformation keyType = schemaToTypeInfo(keySchema);
    +				TypeInformation valType = schemaToTypeInfo(valSchema);
    +				return new MapTypeInfo(keyType, valType);
    +			case DECIMAL:
    +				return BasicTypeInfo.BIG_DEC_TYPE_INFO;
    +			case UNION:
    +				throw new UnsupportedOperationException("UNION type not supported yet.");
    +			default:
    +				throw new IllegalArgumentException("Unknown type " + schema);
    +		}
    +	}
    +
    +	/**
    +	 * Fill rows from an ORC batch.
    +	 *
    +	 * @param rows the rows to be filled
    +	 * @param schema schema of the ORC file
    +	 * @param batch current ORC batch data used to fill the rows
    +	 * @param fieldMapping field mapping
    +	 */
    +	public static void fillRows(Object[] rows, TypeDescription schema, VectorizedRowBatch batch, int[] fieldMapping) {
    +
    +		int totalRowsInBatch = (int) batch.count();
    +
    +		List<TypeDescription> fieldTypes = schema.getChildren();
    +		for (int outIdx = 0; outIdx < fieldMapping.length; outIdx++) {
    +			int inIdx = fieldMapping[outIdx];
    +			readField(rows, outIdx, fieldTypes.get(inIdx), batch.cols[inIdx], null, Math.min(totalRowsInBatch, rows.length));
    +		}
    +	}
    +
    +	private static void readField(Object[] rows, int fieldIdx, TypeDescription schema, ColumnVector vector, long[] lengthVector, int childCount) {
    --- End diff --

There is significant duplication across these helper functions. Some refactoring is needed to clean things up.
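
As one illustration of the refactoring, a minimal sketch only: the repeated isRepeating/isNull handling could move into a single generic loop. OrcReadHelpers, ValueReader, and readColumn below are hypothetical names that do not exist in this PR, and the loop is simplified to set top-level Row fields.

    package org.apache.flink.orc;

    import org.apache.flink.types.Row;

    import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

    /** Hypothetical sketch of deduplicating the per-type column readers. */
    public final class OrcReadHelpers {

        /** Extracts the Java value at position {@code idx} of a column vector. */
        @FunctionalInterface
        interface ValueReader {
            Object read(ColumnVector vector, int idx);
        }

        /**
         * Shared null-aware copy loop. The isRepeating and isNull cases are
         * handled here once, so each per-type helper shrinks to a lambda.
         */
        static void readColumn(Row[] rows, int fieldIdx, ColumnVector vector, int childCount, ValueReader reader) {
            if (vector.isRepeating) {
                // All rows carry the value (or null) found at position 0.
                Object value = (!vector.noNulls && vector.isNull[0]) ? null : reader.read(vector, 0);
                for (int i = 0; i < childCount; i++) {
                    rows[i].setField(fieldIdx, value);
                }
            } else {
                for (int i = 0; i < childCount; i++) {
                    Object value = (!vector.noNulls && vector.isNull[i]) ? null : reader.read(vector, i);
                    rows[i].setField(fieldIdx, value);
                }
            }
        }

        // The per-type helpers then reduce to one-line value extractors:
        static void readLongColumn(Row[] rows, int fieldIdx, LongColumnVector vector, int childCount) {
            readColumn(rows, fieldIdx, vector, childCount, (v, i) -> ((LongColumnVector) v).vector[i]);
        }

        static void readDoubleColumn(Row[] rows, int fieldIdx, DoubleColumnVector vector, int childCount) {
            readColumn(rows, fieldIdx, vector, childCount, (v, i) -> ((DoubleColumnVector) v).vector[i]);
        }
    }

The same shape should extend to the STRUCT, LIST, and MAP readers, though those also need the length vectors, so the real refactoring would be somewhat more involved.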
---