[ https://issues.apache.org/jira/browse/HIVE-26496?focusedWorklogId=808771&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-808771 ]
ASF GitHub Bot logged work on HIVE-26496:
-----------------------------------------

                Author: ASF GitHub Bot
            Created on: 14/Sep/22 14:54
            Start Date: 14/Sep/22 14:54
    Worklog Time Spent: 10m
      Work Description: difin commented on code in PR #3559:
URL: https://github.com/apache/hive/pull/3559#discussion_r970928460


##########
ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSplit.java:
##########
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.io.orc;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.ValidReadTxnList;
+import org.apache.hadoop.hive.common.ValidTxnList;
+import org.apache.hadoop.hive.common.ValidWriteIdList;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.hadoop.hive.ql.io.*;
+import org.apache.hadoop.hive.ql.io.AcidUtils.Directory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.orc.OrcConf;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.util.*;
+
+import static org.junit.Assert.*;
+
+/**
+ * Tests for OrcSplit class
+ */
+public class TestOrcSplit {
+
+  private JobConf conf;
+  private FileSystem fs;
+  private Path root;
+  private ObjectInspector inspector;
+
+  public static class DummyRow {
+    LongWritable field;
+    RecordIdentifier ROW__ID;
+
+    DummyRow(long val, long rowId, long origTxn, int bucket) {
+      field = new LongWritable(val);
+      bucket = BucketCodec.V1.encode(new AcidOutputFormat.Options(null).bucket(bucket));
+      ROW__ID = new RecordIdentifier(origTxn, bucket, rowId);
+    }
+
+    static String getColumnNamesProperty() {
+      return "field";
+    }
+
+    static String getColumnTypesProperty() {
+      return "bigint";
+    }
+  }
+
+  @Before
+  public void setup() throws Exception {
+    conf = new JobConf();
+    conf.set(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, "true");
+    conf.setBoolean(HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, true);
+    conf.set(hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES, "default");
+    conf.setInt(HiveConf.ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES.varname,
+        AcidUtils.AcidOperationalProperties.getDefault().toInt());
+    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, DummyRow.getColumnNamesProperty());
+    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, DummyRow.getColumnTypesProperty());
+    conf.setBoolean(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.varname, true);
+    conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI");
+    OrcConf.ROWS_BETWEEN_CHECKS.setLong(conf, 1);
+
+    Path workDir = new Path(System.getProperty("test.tmp.dir",
+        "target" + File.separator + "test" + File.separator + "tmp"));
+    root = new Path(workDir, "TestOrcSplit.testDump");
+    fs = root.getFileSystem(conf);
+    root = fs.makeQualified(root);
+    fs.delete(root, true);
+    synchronized (TestOrcFile.class) {
+      inspector = ObjectInspectorFactory.getReflectionObjectInspector
+          (DummyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+    }
+  }
+
+  // Runs ORC split generation over the test table directory and returns
+  // the split strategies that OrcInputFormat chooses for it.
+  private List<OrcInputFormat.SplitStrategy<?>> getSplitStrategies() throws Exception {
+    conf.setInt(HiveConf.ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES.varname,
+        AcidUtils.AcidOperationalProperties.getDefault().toInt());
+    OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
+    OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(
+        context, () -> fs, root, false, null);
+    Directory adi = gen.call();
+    return OrcInputFormat.determineSplitStrategies(
+        null, context, adi.getFs(), adi.getPath(), adi.getFiles(), adi.getDeleteDeltas(),
+        null, null, true);
+  }
+
+  /**
+   * This test checks that a split filters out delete_delta folders which only have
+   * transactions that happened in the past relative to the current split.
+   */
+  @Test
+  public void testDeleteDeltasFiltering() throws Exception {
+
+    int bucket = 0;
+    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
+        .filesystem(fs)
+        .bucket(bucket)
+        .writingBase(false)
+        .minimumWriteId(1)
+        .maximumWriteId(1)
+        .inspector(inspector)
+        .reporter(Reporter.NULL)
+        .recordIdColumn(1)
+        .finalDestination(root);
+
+    RecordUpdater updater = new OrcRecordUpdater(root, options);
+
+    // Inserting a new record and then deleting it, 3 times.
+    // Every insertion/deletion is in a separate transaction.
+    // When reading, this will generate 3 splits, one per insert event.

Review Comment:
   fixed typo


Issue Time Tracking
-------------------

    Worklog Id:     (was: 808771)
    Time Spent: 3h 10m  (was: 3h)

> FetchOperator scans delete_delta folders multiple times causing slowness
> ------------------------------------------------------------------------
>
>                 Key: HIVE-26496
>                 URL: https://issues.apache.org/jira/browse/HIVE-26496
>             Project: Hive
>          Issue Type: Bug
>          Components: HiveServer2
>            Reporter: Rajesh Balamohan
>            Assignee: Dmitriy Fingerman
>            Priority: Major
>              Labels: pull-request-available
>          Time Spent: 3h 10m
>  Remaining Estimate: 0h
>
> FetchOperator scans far more files/directories than needed.
> For example, here is the layout of a table after a set of updates and
> deletes; a set of "delta" and "delete_delta" folders has been created.
> {noformat}
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/base_0000001
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000002_0000002_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000003_0000003_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000004_0000004_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000005_0000005_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000006_0000006_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000007_0000007_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000008_0000008_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000009_0000009_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000010_0000010_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000011_0000011_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000012_0000012_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000013_0000013_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000014_0000014_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000015_0000015_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000016_0000016_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000017_0000017_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000018_0000018_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000019_0000019_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000020_0000020_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000021_0000021_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delete_delta_0000022_0000022_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000002_0000002_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000003_0000003_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000004_0000004_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000005_0000005_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000006_0000006_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000007_0000007_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000008_0000008_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000009_0000009_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000010_0000010_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000011_0000011_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000012_0000012_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000013_0000013_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000014_0000014_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000015_0000015_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000016_0000016_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000017_0000017_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000018_0000018_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000019_0000019_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000020_0000020_0000
> s3a://bucket-name/warehouse/tablespace/managed/hive/test.db/date_dim/delta_0000021_0000021_0000
> {noformat}
>
> When a user runs *{color:#0747a6}{{select * from date_dim}}{color}* from
> beeline, FetchOperator computes splits for "date_dim": it scans the "base"
> and "delta" folders and computes 21 splits.
> However, for each of the 21 splits, it then loads and scans the entire set
> of "delete_delta" folders unnecessarily. This multiplies the scan work by
> "21 splits * 21 delete_delta folders" (i.e. 441). This makes the statement
> execution very slow, even when the table holds only a minimal dataset.
> It would be better to scan only the delete_delta folders relevant to each
> split, instead of loading all delete_delta folders in every split.
>
> [https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java#L1142|https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java#L1142]
> [https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java#L1172|https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java#L1172]
>
> [https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java#L402|https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java#L402]
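
For context, the delete_delta pruning that the test above exercises can be sketched in
isolation. The sketch below is a hypothetical illustration, not Hive's actual API: the
DeltaRange class stands in for a parsed delta directory name (e.g.
delete_delta_0000003_0000003_0000 covers write IDs 3..3), and filterDeleteDeltasForSplit
is an invented helper. The underlying idea matches the test's javadoc: a delete event can
only target a row that already existed, so a delete_delta whose entire write-ID range
lies in the past relative to the split's inserts can never apply to that split and is
safe to drop from that split's reader.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class DeleteDeltaPruningSketch {

  // Hypothetical stand-in for a parsed delta/delete_delta directory name,
  // e.g. delete_delta_0000004_0000004_0000 -> minWriteId=4, maxWriteId=4.
  static final class DeltaRange {
    final long minWriteId;
    final long maxWriteId;

    DeltaRange(long minWriteId, long maxWriteId) {
      this.minWriteId = minWriteId;
      this.maxWriteId = maxWriteId;
    }
  }

  /**
   * Keeps only the delete deltas that can contain delete events for rows
   * written by the given split. A delete delta whose whole write-ID range
   * precedes the split's minimum write ID was written before the split's
   * rows existed, so none of its events can target them.
   */
  static List<DeltaRange> filterDeleteDeltasForSplit(
      DeltaRange split, List<DeltaRange> deleteDeltas) {
    List<DeltaRange> relevant = new ArrayList<>();
    for (DeltaRange dd : deleteDeltas) {
      if (dd.maxWriteId >= split.minWriteId) {
        relevant.add(dd);
      }
    }
    return relevant;
  }

  public static void main(String[] args) {
    // Split backed by delta_0000004_0000004 (rows inserted at write ID 4).
    DeltaRange split = new DeltaRange(4, 4);
    List<DeltaRange> deleteDeltas = Arrays.asList(
        new DeltaRange(3, 3),   // in the past: cannot delete rows of write ID 4
        new DeltaRange(5, 5),   // later: may delete the split's rows, keep it
        new DeltaRange(7, 7));  // later: keep it

    // Prints 2: only the delete deltas with max write ID >= 4 survive.
    System.out.println(filterDeleteDeltasForSplit(split, deleteDeltas).size());
  }
}

Under this kind of pruning, each of the 21 splits in the listing above would open only
the delete_delta directories that are not entirely in its past, instead of all 21 of
them, which is what removes the "21 splits * 21 delete_delta folders" blow-up.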