[ https://issues.apache.org/jira/browse/HUDI-3066?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17463528#comment-17463528 ]

Harsha Teja Kanna commented on HUDI-3066:
-----------------------------------------

I made the following changes to the sync job:

1) Clustering every 10 commits

2) Removed hoodie.metadata.clean.async=true

3) Removed hoodie.clean.async=true

With these changes, I do see compaction kick off, and file listing is much faster.
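For reference, a minimal sketch of the writer options after these changes (key names as in Hudi 0.10.0; `df`, the table name, and `basePath` are placeholders, and values are assumptions for illustration):

{code:java}
// Sketch only: writer options reflecting the changes above (values assumed).
df.write.format("org.apache.hudi")
  .option("hoodie.table.name", "sessions")               // hypothetical table name
  .option("hoodie.metadata.enable", "true")
  .option("hoodie.clustering.inline", "true")
  .option("hoodie.clustering.inline.max.commits", "10")  // cluster every 10 commits
  // hoodie.clean.async and hoodie.metadata.clean.async are no longer set,
  // i.e. both are left at their default (false), so cleaning runs inline.
  .mode("append")
  .save(basePath)
{code}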

[^metadata_files_compacted.txt]

[^metadata_timeline_compacted.txt]

However, for a different table I see the exception below:

org.apache.hudi.exception.HoodieException: org.apache.hudi.exception.HoodieException: java.util.concurrent.ExecutionException: org.apache.hudi.exception.HoodieUpsertException: Failed to merge old record into new file for key from old file s3a://bucket/.hoodie/metadata/files/files-0000_0-45-650_20211221205951077001.hfile to new file s3a://bucket/.hoodie/metadata/files/files-0000_0-67-713_20211222000526106001.hfile with writerSchema {
"type" : "record",
"name" : "HoodieMetadataRecord",
"namespace" : "org.apache.hudi.avro.model",
"doc" : "A record saved within the Metadata Table",
"fields" : [ {
"name" : "_hoodie_commit_time",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_commit_seqno",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_record_key",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_partition_path",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_file_name",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "key",
"type" : {
"type" : "string",
"avro.java.string" : "String"
}
}, {
"name" : "type",
"type" : "int",
"doc" : "Type of the metadata record"
}, {
"name" : "filesystemMetadata",
"type" : [ "null", {
"type" : "map",
"values" : {
"type" : "record",
"name" : "HoodieMetadataFileInfo",
"fields" : [ {
"name" : "size",
"type" : "long",
"doc" : "Size of the file"
}, {
"name" : "isDeleted",
"type" : "boolean",
"doc" : "True if this file has been deleted"
} ]
},
"avro.java.string" : "String"
} ],
"doc" : "Contains information about partitions and files within the dataset"
} ]
}
at org.apache.hudi.table.action.commit.SparkMergeHelper.runMerge(SparkMergeHelper.java:102)
at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.handleUpdateInternal(HoodieSparkCopyOnWriteTable.java:292)
at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.handleUpdate(HoodieSparkCopyOnWriteTable.java:283)
at org.apache.hudi.table.action.compact.HoodieCompactor.compact(HoodieCompactor.java:197)
at org.apache.hudi.table.action.compact.HoodieCompactor.lambda$compact$57154431$1(HoodieCompactor.java:133)
at org.apache.spark.api.java.JavaPairRDD$.$anonfun$toScalaFunction$1(JavaPairRDD.scala:1070)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsBytes(MemoryStore.scala:349)
at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1440)
at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1350)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1414)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1237)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:384)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:335)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hudi.exception.HoodieException: java.util.concurrent.ExecutionException: org.apache.hudi.exception.HoodieUpsertException: Failed to merge old record into new file for key from old file s3a://bucket/.hoodie/metadata/files/files-0000_0-45-650_20211221205951077001.hfile to new file s3a://bucket/.hoodie/metadata/files/files-0000_0-67-713_20211222000526106001.hfile with writerSchema {
"type" : "record",
"name" : "HoodieMetadataRecord",
"namespace" : "org.apache.hudi.avro.model",
"doc" : "A record saved within the Metadata Table",
"fields" : [ {
"name" : "_hoodie_commit_time",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_commit_seqno",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_record_key",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_partition_path",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_file_name",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "key",
"type" : {
"type" : "string",
"avro.java.string" : "String"
}
}, {
"name" : "type",
"type" : "int",
"doc" : "Type of the metadata record"
}, {
"name" : "filesystemMetadata",
"type" : [ "null", {
"type" : "map",
"values" : {
"type" : "record",
"name" : "HoodieMetadataFileInfo",
"fields" : [ {
"name" : "size",
"type" : "long",
"doc" : "Size of the file"
}, {
"name" : "isDeleted",
"type" : "boolean",
"doc" : "True if this file has been deleted"
} ]
},
"avro.java.string" : "String"
} ],
"doc" : "Contains information about partitions and files within the dataset"
} ]
}
at org.apache.hudi.common.util.queue.BoundedInMemoryExecutor.execute(BoundedInMemoryExecutor.java:147)
at org.apache.hudi.table.action.commit.SparkMergeHelper.runMerge(SparkMergeHelper.java:100)
... 28 more
Caused by: org.apache.hudi.exception.HoodieUpsertException: Failed to merge old record into new file for key from old file s3a://bucket/.hoodie/metadata/files/files-0000_0-45-650_20211221205951077001.hfile to new file s3a://bucket/.hoodie/metadata/files/files-0000_0-67-713_20211222000526106001.hfile with writerSchema {
"type" : "record",
"name" : "HoodieMetadataRecord",
"namespace" : "org.apache.hudi.avro.model",
"doc" : "A record saved within the Metadata Table",
"fields" : [ {
"name" : "_hoodie_commit_time",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_commit_seqno",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_record_key",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_partition_path",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "_hoodie_file_name",
"type" : [ "null", "string" ],
"doc" : "",
"default" : null
}, {
"name" : "key",
"type" : {
"type" : "string",
"avro.java.string" : "String"
}
}, {
"name" : "type",
"type" : "int",
"doc" : "Type of the metadata record"
}, {
"name" : "filesystemMetadata",
"type" : [ "null", {
"type" : "map",
"values" : {
"type" : "record",
"name" : "HoodieMetadataFileInfo",
"fields" : [ {
"name" : "size",
"type" : "long",
"doc" : "Size of the file"
}, {
"name" : "isDeleted",
"type" : "boolean",
"doc" : "True if this file has been deleted"
} ]
},
"avro.java.string" : "String"
} ],
"doc" : "Contains information about partitions and files within the dataset"
} ]
}
at org.apache.hudi.io.HoodieMergeHandle.write(HoodieMergeHandle.java:356)
at org.apache.hudi.io.HoodieSortedMergeHandle.write(HoodieSortedMergeHandle.java:105)
at org.apache.hudi.table.action.commit.AbstractMergeHelper$UpdateHandler.consumeOneRecord(AbstractMergeHelper.java:122)
at org.apache.hudi.table.action.commit.AbstractMergeHelper$UpdateHandler.consumeOneRecord(AbstractMergeHelper.java:112)
at org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer.consume(BoundedInMemoryQueueConsumer.java:37)
at org.apache.hudi.common.util.queue.BoundedInMemoryExecutor.lambda$null$2(BoundedInMemoryExecutor.java:121)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
... 3 more
Caused by: java.lang.IllegalArgumentException: key length must be > 0
at org.apache.hadoop.util.bloom.HashFunction.hash(HashFunction.java:114)
at org.apache.hadoop.util.bloom.BloomFilter.add(BloomFilter.java:122)
at org.apache.hudi.common.bloom.InternalDynamicBloomFilter.add(InternalDynamicBloomFilter.java:94)
at org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter.add(HoodieDynamicBoundedBloomFilter.java:81)
at org.apache.hudi.io.storage.HoodieHFileWriter.writeAvro(HoodieHFileWriter.java:130)
at org.apache.hudi.io.HoodieMergeHandle.write(HoodieMergeHandle.java:351)
... 9 more

ApplicationMaster host: ip-10-0-4-51.
ApplicationMaster RPC port: 43575
queue: default
start time: 1640132568582
final status: FAILED
tracking URL: http://ip-10-0-4-84:20888/proxy/application_1639427377371_5540/
user: hadoop
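
The root cause is the bloom filter rejecting an empty key while the compacted HFile is written, which appears to suggest a metadata record with an empty record key is reaching HoodieHFileWriter.writeAvro. A minimal standalone sketch of the failing precondition, using the same Hadoop bloom classes (an illustration only, not Hudi's actual call path):

{code:java}
// HashFunction.hash() rejects zero-length keys, so an empty record key
// reaching BloomFilter.add() fails with "key length must be > 0".
import org.apache.hadoop.util.bloom.{BloomFilter, Key}
import org.apache.hadoop.util.hash.Hash

val bloom = new BloomFilter(1000, 5, Hash.MURMUR_HASH)
bloom.add(new Key("record-key-1".getBytes("UTF-8")))  // fine
bloom.add(new Key(Array.emptyByteArray))              // throws IllegalArgumentException: key length must be > 0
{code}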

 

> Very slow file listing after enabling metadata for existing tables in 0.10.0 release
> ------------------------------------------------------------------------------------
>
>                 Key: HUDI-3066
>                 URL: https://issues.apache.org/jira/browse/HUDI-3066
>             Project: Apache Hudi
>          Issue Type: Bug
>    Affects Versions: 0.10.0
>         Environment: EMR 6.4.0
> Hudi version : 0.10.0
>            Reporter: Harsha Teja Kanna
>            Assignee: Manoj Govindassamy
>            Priority: Blocker
>              Labels: performance, pull-request-available
>             Fix For: 0.11.0
>
>         Attachments: Screen Shot 2021-12-18 at 6.16.29 PM.png, Screen Shot 
> 2021-12-20 at 10.05.50 PM.png, Screen Shot 2021-12-20 at 10.17.44 PM.png, 
> metadata_files.txt, metadata_files_compacted.txt, metadata_timeline.txt, 
> metadata_timeline_archived.txt, metadata_timeline_compacted.txt, 
> stderr_part1.txt, stderr_part2.txt, timeline.txt, writer_log.txt
>
>
> After the metadata table is enabled, file listing takes a long time.
> If metadata is also enabled on the reader side (as shown below), each file listing task takes even more time:
> {code:java}
> import org.apache.hudi.DataSourceReadOptions
> import org.apache.hudi.common.config.HoodieMetadataConfig
> 
> val hadoopConf = spark.conf
> hadoopConf.set(HoodieMetadataConfig.ENABLE.key(), "true")
> 
> val basePath = "s3a://datalake-hudi"
> val sessions = spark
>   .read
>   .format("org.apache.hudi")
>   .option(DataSourceReadOptions.QUERY_TYPE.key(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
>   .option(DataSourceReadOptions.READ_PATHS.key(), s"${basePath}/sessions_by_entrydate/entrydate=2021/*/*/*")
>   .load()
> 
> sessions.createOrReplaceTempView("sessions")
> {code}
> Existing tables (COW) have inline clustering enabled, so the timeline has many replace commits.
> The logs suggest the delay is in the view.AbstractTableFileSystemView resetFileGroupsReplaced function or in metadata.HoodieBackedTableMetadata, and there are also many log messages from AbstractHoodieLogRecordReader (excerpt below).
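> One way to confirm the volume of replace commits is to count them on the active timeline; a minimal sketch, assuming the 0.10.0 meta-client API and using the sessions table path as an example:
> {code:java}
> // Sketch: count completed replace commits on the active timeline.
> import org.apache.hudi.common.table.HoodieTableMetaClient
> 
> val metaClient = HoodieTableMetaClient.builder()
>   .setConf(spark.sparkContext.hadoopConfiguration)
>   .setBasePath("s3a://datalake-hudi/sessions")  // table path, adjust as needed
>   .build()
> 
> val replaceCommits = metaClient.getActiveTimeline.getCompletedReplaceTimeline.countInstants()
> println(s"completed replace commits: $replaceCommits")
> {code}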
>  
> 2021-12-18 23:17:54,056 INFO view.AbstractTableFileSystemView: Took 4118 ms 
> to read  136 instants, 9731 replaced file groups
> 2021-12-18 23:37:46,086 INFO log.AbstractHoodieLogRecordReader: Number of 
> remaining logblocks to merge 1
> 2021-12-18 23:37:46,090 INFO log.AbstractHoodieLogRecordReader: Reading a 
> data block from file 
> s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.76_0-20-515
>  at instant 20211217035105329
> 2021-12-18 23:37:46,090 INFO log.AbstractHoodieLogRecordReader: Number of 
> remaining logblocks to merge 1
> 2021-12-18 23:37:46,094 INFO log.HoodieLogFormatReader: Moving to the next 
> reader for logfile 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.121_0-57-663',
>  fileLen=0}
> 2021-12-18 23:37:46,095 INFO log.AbstractHoodieLogRecordReader: Scanning log 
> file 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.20_0-35-613',
>  fileLen=0}
> 2021-12-18 23:37:46,095 INFO s3a.S3AInputStream: Switching to Random IO seek 
> policy
> 2021-12-18 23:37:46,096 INFO log.AbstractHoodieLogRecordReader: Reading a 
> data block from file 
> s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.62_0-34-377
>  at instant 20211217022049877
> 2021-12-18 23:37:46,096 INFO log.AbstractHoodieLogRecordReader: Number of 
> remaining logblocks to merge 1
> 2021-12-18 23:37:46,105 INFO log.HoodieLogFormatReader: Moving to the next 
> reader for logfile 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.86_0-20-362',
>  fileLen=0}
> 2021-12-18 23:37:46,109 INFO log.AbstractHoodieLogRecordReader: Scanning log 
> file 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.121_0-57-663',
>  fileLen=0}
> 2021-12-18 23:37:46,109 INFO s3a.S3AInputStream: Switching to Random IO seek 
> policy
> 2021-12-18 23:37:46,110 INFO log.HoodieLogFormatReader: Moving to the next 
> reader for logfile 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.77_0-35-590',
>  fileLen=0}
> 2021-12-18 23:37:46,112 INFO log.AbstractHoodieLogRecordReader: Reading a 
> data block from file 
> s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.20_0-35-613
>  at instant 20211216183448389
> 2021-12-18 23:37:46,112 INFO log.AbstractHoodieLogRecordReader: Number of 
> remaining logblocks to merge 1
> 2021-12-18 23:37:46,118 INFO log.HoodieLogFormatReader: Moving to the next 
> reader for logfile 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.63_0-56-519',
>  fileLen=0}
> 2021-12-18 23:37:46,122 INFO log.AbstractHoodieLogRecordReader: Scanning log 
> file 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.86_0-20-362',
>  fileLen=0}
> 2021-12-18 23:37:46,122 INFO s3a.S3AInputStream: Switching to Random IO seek 
> policy
> 2021-12-18 23:37:46,123 INFO log.AbstractHoodieLogRecordReader: Reading a 
> data block from file 
> s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.121_0-57-663
>  at instant 20211217090337935
> 2021-12-18 23:37:46,123 INFO log.AbstractHoodieLogRecordReader: Number of 
> remaining logblocks to merge 1
> 2021-12-18 23:37:46,127 INFO log.AbstractHoodieLogRecordReader: Scanning log 
> file 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.77_0-35-590',
>  fileLen=0}
> 2021-12-18 23:37:46,127 INFO s3a.S3AInputStream: Switching to Random IO seek 
> policy
> 2021-12-18 23:37:46,141 INFO log.HoodieLogFormatReader: Moving to the next 
> reader for logfile 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.21_0-56-795',
>  fileLen=0}
> 2021-12-18 23:37:46,143 INFO log.AbstractHoodieLogRecordReader: Scanning log 
> file 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.63_0-56-519',
>  fileLen=0}
> 2021-12-18 23:37:46,143 INFO s3a.S3AInputStream: Switching to Random IO seek 
> policy
> 2021-12-18 23:37:46,144 INFO log.AbstractHoodieLogRecordReader: Reading a 
> data block from file 
> s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.86_0-20-362
>  at instant 20211217045206861
> 2021-12-18 23:37:46,144 INFO log.AbstractHoodieLogRecordReader: Number of 
> remaining logblocks to merge 1
> 2021-12-18 23:37:46,145 INFO log.AbstractHoodieLogRecordReader: Reading a 
> data block from file 
> s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.77_0-35-590
>  at instant 20211217035058289
> 2021-12-18 23:37:46,145 INFO log.AbstractHoodieLogRecordReader: Number of 
> remaining logblocks to merge 1
> 2021-12-18 23:37:46,162 INFO log.AbstractHoodieLogRecordReader: Scanning log 
> file 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.21_0-56-795',
>  fileLen=0}
> 2021-12-18 23:37:46,162 INFO log.AbstractHoodieLogRecordReader: Reading a 
> data block from file 
> s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.63_0-56-519
>  at instant 20211217022306173
> 2021-12-18 23:37:46,162 INFO s3a.S3AInputStream: Switching to Random IO seek 
> policy
> 2021-12-18 23:37:46,162 INFO log.AbstractHoodieLogRecordReader: Number of 
> remaining logblocks to merge 1
> 2021-12-18 23:37:46,170 INFO log.HoodieLogFormatReader: Moving to the next 
> reader for logfile 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.87_0-35-433',
>  fileLen=0}
> 2021-12-18 23:37:46,171 INFO log.HoodieLogFormatReader: Moving to the next 
> reader for logfile 
> HoodieLogFile\{pathStr='s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.78_0-56-705',
>  fileLen=0}
> 2021-12-18 23:37:46,178 INFO log.AbstractHoodieLogRecordReader: Reading a 
> data block from file 
> s3a://datalake-hudi/sessions/.hoodie/metadata/files/.files-0000_20211216144130775001.log.21_0-56-795
>  at instant 20211216184106961
> 2021-12-18 23:37:46,178 INFO log.AbstractHoodieLogRecordReader: Number of 
> remaining logblocks to merge
> 2021-12-18 23:38:35,380 INFO log.HoodieMergedLogRecordScanner: Number of log 
> files scanned => 437
> 2021-12-18 23:38:35,380 INFO log.HoodieMergedLogRecordScanner: 
> MaxMemoryInBytes allowed for compaction => 1073741824
> 2021-12-18 23:38:35,380 INFO log.HoodieMergedLogRecordScanner: Number of 
> entries in MemoryBasedMap in ExternalSpillableMap => 165
> 2021-12-18 23:38:35,380 INFO log.HoodieMergedLogRecordScanner: Total size in 
> bytes of MemoryBasedMap in ExternalSpillableMap => 259380
> 2021-12-18 23:38:35,380 INFO log.HoodieMergedLogRecordScanner: Number of 
> entries in BitCaskDiskMap in ExternalSpillableMap => 0
> 2021-12-18 23:38:35,380 INFO log.HoodieMergedLogRecordScanner: Size of file 
> spilled to disk => 0
> 2021-12-18 23:38:35,380 INFO metadata.HoodieBackedTableMetadata: Opened 437 
> metadata log files (dataset instant=20211218233649435, metadata 
> instant=20211218233649435) in 22935 ms
> 2021-12-18 23:38:37,193 INFO metadata.HoodieBackedTableMetadata: Opened 437 
> metadata log files (dataset instant=20211218233649435, metadata 
> instant=20211218233649435) in 22802 ms
> Sample file listing tasks
> !Screen Shot 2021-12-18 at 6.16.29 PM.png!
>  


