[ https://issues.apache.org/jira/browse/HIVE-24503?focusedWorklogId=523733&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-523733 ]
ASF GitHub Bot logged work on HIVE-24503: ----------------------------------------- Author: ASF GitHub Bot Created on: 14/Dec/20 07:43 Start Date: 14/Dec/20 07:43 Worklog Time Spent: 10m Work Description: maheshk114 commented on a change in pull request #1753: URL: https://github.com/apache/hive/pull/1753#discussion_r542159449 ########## File path: ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorDeserializeRow.java ########## @@ -564,173 +621,337 @@ public void init() throws HiveException { init(0); } - private void storePrimitiveRowColumn(ColumnVector colVector, Field field, - int batchIndex, boolean canRetainByteRef) throws IOException { - - switch (field.getPrimitiveCategory()) { - case VOID: + class VectorVoidDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex); - return; - case BOOLEAN: - ((LongColumnVector) colVector).vector[batchIndex] = (deserializeRead.currentBoolean ? 1 : 0); - break; - case BYTE: + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertVoid(); + } + } + + class VectorBooleanDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { + ((LongColumnVector) colVector).vector[batchIndex] = + (deserializeRead.currentBoolean ? 
1 : 0); + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertBoolean(field.getConversionWritable()); + } + } + + class VectorByteDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { ((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentByte; - break; - case SHORT: + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertByte(field.getConversionWritable()); + } + } + + class VectorShortDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { ((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentShort; - break; - case INT: + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertShort(field.getConversionWritable()); + } + } + + class VectorIntDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { ((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentInt; - break; - case LONG: + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertInt(field.getConversionWritable()); + } + } + + class VectorLongDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { ((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentLong; - break; - case TIMESTAMP: + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return 
convertLong(field.getConversionWritable()); + } + } + + class VectorTimestampDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { ((TimestampColumnVector) colVector).set( - batchIndex, deserializeRead.currentTimestampWritable.getTimestamp().toSqlTimestamp()); - break; - case DATE: - ((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentDateWritable.getDays(); - break; - case FLOAT: + batchIndex, deserializeRead.currentTimestampWritable.getTimestamp().toSqlTimestamp()); + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertTimestamp(field.getConversionWritable()); + } + } + + class VectorDateDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { + ((LongColumnVector) colVector).vector[batchIndex] = + deserializeRead.currentDateWritable.getDays(); + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertDate(field.getConversionWritable()); + } + } + + class VectorFloatDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { ((DoubleColumnVector) colVector).vector[batchIndex] = deserializeRead.currentFloat; - break; - case DOUBLE: + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertFloat(field.getConversionWritable()); + } + } + + class VectorDoubleDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { ((DoubleColumnVector) colVector).vector[batchIndex] = 
deserializeRead.currentDouble; - break; - case BINARY: - case STRING: - { - final BytesColumnVector bytesColVec = ((BytesColumnVector) colVector); - if (deserializeRead.currentExternalBufferNeeded) { - bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen); - deserializeRead.copyToExternalBuffer( + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertDouble(field.getConversionWritable()); + } + } + + private void storeString(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { + final BytesColumnVector bytesColVec = ((BytesColumnVector) colVector); + if (deserializeRead.currentExternalBufferNeeded) { + bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen); + deserializeRead.copyToExternalBuffer( bytesColVec.getValPreallocatedBytes(), bytesColVec.getValPreallocatedStart()); - bytesColVec.setValPreallocated( + bytesColVec.setValPreallocated( batchIndex, deserializeRead.currentExternalBufferNeededLen); - } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) { - bytesColVec.setRef( + } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) { + bytesColVec.setRef( batchIndex, deserializeRead.currentBytes, deserializeRead.currentBytesStart, deserializeRead.currentBytesLength); - } else { - bytesColVec.setVal( + } else { + bytesColVec.setVal( batchIndex, deserializeRead.currentBytes, deserializeRead.currentBytesStart, deserializeRead.currentBytesLength); - } - } - break; - case VARCHAR: - { - // Use the basic STRING bytes read to get access, then use our optimal truncate/trim method - // that does not use Java String objects. - final BytesColumnVector bytesColVec = ((BytesColumnVector) colVector); - if (deserializeRead.currentExternalBufferNeeded) { - // Write directly into our BytesColumnVector value buffer. 
- bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen); - final byte[] convertBuffer = bytesColVec.getValPreallocatedBytes(); - final int convertBufferStart = bytesColVec.getValPreallocatedStart(); - deserializeRead.copyToExternalBuffer( - convertBuffer, - convertBufferStart); - bytesColVec.setValPreallocated( - batchIndex, - StringExpr.truncate( - convertBuffer, - convertBufferStart, - deserializeRead.currentExternalBufferNeededLen, - field.getMaxLength())); - } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) { - bytesColVec.setRef( - batchIndex, - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - StringExpr.truncate( - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesLength, - field.getMaxLength())); - } else { - bytesColVec.setVal( - batchIndex, - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - StringExpr.truncate( - deserializeRead.currentBytes, - deserializeRead.currentBytesStart, - deserializeRead.currentBytesLength, - field.getMaxLength())); - } + } + } + + class VectorBinaryDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { + storeString(colVector, field, batchIndex, canRetainByteRef); + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertBinary(field.getConversionWritable(), batchIndex); + } + } + + class VectorStringDeserializer extends VectorBatchDeserializer { + @Override + void store(ColumnVector colVector, Field field, int batchIndex, boolean canRetainByteRef) + throws IOException { + storeString(colVector, field, batchIndex, canRetainByteRef); + } + + @Override + Object convert(ColumnVector batch, int batchIndex, Field field) throws IOException { + return convertString(field.getConversionWritable(), batchIndex); + } + } + + 
class VectorVarcharDeserializer extends VectorBatchDeserializer { Review comment: char uses StringExpr.rightTrimAndTruncate and varchar uses StringExpr.truncate ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 523733) Time Spent: 1h 10m (was: 1h) > Optimize vector row serde by avoiding type check at run time > ------------------------------------------------------------- > > Key: HIVE-24503 > URL: https://issues.apache.org/jira/browse/HIVE-24503 > Project: Hive > Issue Type: Bug > Components: Hive > Reporter: mahesh kumar behera > Assignee: mahesh kumar behera > Priority: Major > Labels: pull-request-available > Time Spent: 1h 10m > Remaining Estimate: 0h > > Serialization/Deserialization of vectorized batch done at VectorSerializeRow > and VectorDeserializeRow does type checking for each column of each row. > This becomes very costly when there are billions of rows to read/write. This > can be optimized if the type check is done during init time and specific > reader/writer classes are created. These classes can be stored directly > in the field structure to avoid run-time type checks. -- This message was sent by Atlassian Jira (v8.3.4#803005)