[ https://issues.apache.org/jira/browse/HIVE-28165?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
László Bodor updated HIVE-28165: -------------------------------- Description: After some investigations regarding hive iceberg issues, it turned out that in the presence of delete files, the serialized payload might be huge, like 1-4MB / split, which might lead to extreme memory pressure in the Tez AM, getting worse as more and more splits are generated. Optimizing the payload is always the best option but it's not that obvious: instead we should make hive and tez together take care of such situations without running into OOMs like this below: {code} ERROR : FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.tez.TezTask. Vertex failed, vertexName=Map 1, vertexId=vertex_1711290808080_0000_4_00, diagnostics=[Vertex vertex_1711290808080_0000_4_00 [Map 1] killed/failed due to:ROOT_INPUT_INIT_FAILURE, Vertex Input: web_sales_1 initializer failed, vertex=vertex_1711290808080_0000_4_00 [Map 1], java.lang.OutOfMemoryError: Java heap space at com.google.protobuf.ByteString$CodedBuilder.<init>(ByteString.java:907) at com.google.protobuf.ByteString$CodedBuilder.<init>(ByteString.java:902) at com.google.protobuf.ByteString.newCodedBuilder(ByteString.java:898) at com.google.protobuf.AbstractMessageLite.toByteString(AbstractMessageLite.java:49) at org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator.createEventList(HiveSplitGenerator.java:378) at org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator.initialize(HiveSplitGenerator.java:337) at org.apache.tez.dag.app.dag.RootInputInitializerManager.lambda$runInitializer$3(RootInputInitializerManager.java:199) at org.apache.tez.dag.app.dag.RootInputInitializerManager$$Lambda$319/0x0000000840942440.run(Unknown Source) at java.base/java.security.AccessController.doPrivileged(Native Method) at java.base/javax.security.auth.Subject.doAs(Subject.java:423) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) at 
org.apache.tez.dag.app.dag.RootInputInitializerManager.runInitializer(RootInputInitializerManager.java:192) at org.apache.tez.dag.app.dag.RootInputInitializerManager.runInitializerAndProcessResult(RootInputInitializerManager.java:173) at org.apache.tez.dag.app.dag.RootInputInitializerManager.lambda$createAndStartInitializing$2(RootInputInitializerManager.java:167) at org.apache.tez.dag.app.dag.RootInputInitializerManager$$Lambda$318/0x0000000840942040.run(Unknown Source) at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515) at com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125) at com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:69) at com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:829) {code} > HiveSplitGenerator: send splits through filesystem instead of RPC in case of > big payload > ---------------------------------------------------------------------------------------- > > Key: HIVE-28165 > URL: https://issues.apache.org/jira/browse/HIVE-28165 > Project: Hive > Issue Type: Improvement > Reporter: László Bodor > Assignee: László Bodor > Priority: Major > > After some investigations regarding hive iceberg issues, it turned out that > in the presence of delete files, the serialized payload might be huge, like > 1-4MB / split, which might lead to extreme memory pressure in the Tez AM, > getting worse as more and more splits are generated. 
> Optimizing the payload is always the best option but it's not that obvious: > instead we should make hive and tez together take care of such situations > without running into OOMs like this below: > {code} > ERROR : FAILED: Execution Error, return code 2 from > org.apache.hadoop.hive.ql.exec.tez.TezTask. Vertex failed, vertexName=Map 1, > vertexId=vertex_1711290808080_0000_4_00, diagnostics=[Vertex > vertex_1711290808080_0000_4_00 [Map 1] killed/failed due > to:ROOT_INPUT_INIT_FAILURE, Vertex Input: web_sales_1 initializer failed, > vertex=vertex_1711290808080_0000_4_00 [Map 1], java.lang.OutOfMemoryError: > Java heap space > at > com.google.protobuf.ByteString$CodedBuilder.<init>(ByteString.java:907) > at > com.google.protobuf.ByteString$CodedBuilder.<init>(ByteString.java:902) > at com.google.protobuf.ByteString.newCodedBuilder(ByteString.java:898) > at > com.google.protobuf.AbstractMessageLite.toByteString(AbstractMessageLite.java:49) > at > org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator.createEventList(HiveSplitGenerator.java:378) > at > org.apache.hadoop.hive.ql.exec.tez.HiveSplitGenerator.initialize(HiveSplitGenerator.java:337) > at > org.apache.tez.dag.app.dag.RootInputInitializerManager.lambda$runInitializer$3(RootInputInitializerManager.java:199) > at > org.apache.tez.dag.app.dag.RootInputInitializerManager$$Lambda$319/0x0000000840942440.run(Unknown > Source) > at java.base/java.security.AccessController.doPrivileged(Native Method) > at java.base/javax.security.auth.Subject.doAs(Subject.java:423) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) > at > org.apache.tez.dag.app.dag.RootInputInitializerManager.runInitializer(RootInputInitializerManager.java:192) > at > org.apache.tez.dag.app.dag.RootInputInitializerManager.runInitializerAndProcessResult(RootInputInitializerManager.java:173) > at > 
org.apache.tez.dag.app.dag.RootInputInitializerManager.lambda$createAndStartInitializing$2(RootInputInitializerManager.java:167) > at > org.apache.tez.dag.app.dag.RootInputInitializerManager$$Lambda$318/0x0000000840942040.run(Unknown > Source) > at > java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515) > at > com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125) > at > com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:69) > at > com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.base/java.lang.Thread.run(Thread.java:829) > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010)