Yu Yang created FLINK-20575:
-------------------------------
Summary: flink application failed to restore from check-point
Key: FLINK-20575
URL: https://issues.apache.org/jira/browse/FLINK-20575
Project: Flink
Issue Type: Bug
Affects Versions: 1.9.1
Reporter: Yu Yang
Our Flink application failed to restore from a checkpoint due to
com.amazonaws.AbortedException (we use the s3a file system). Initially we thought
that the S3 file itself had some issue, but it turned out that we can download
the file fine. Any insights on this issue will be very welcome.
|| |6674 2020-12-11 07:02:40,018 ERROR
org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder -
Caught unexpected exception.|
|| |6675 java.io.InterruptedIOException: getFileStatus on
s3a://bucket/prod/checkpoints/u/tango/910d2ff2b2c7e01e99a9588d11385e92/shared/f245da83-fc01-424d-9719-d48b99a1ed35:
org.apache.flink.fs.s3base.shaded.com.amazonaws.AbortedException:|
|| |6676 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AUtils.translateInterruptedException(S3AUtils.java:340)|
|| |6677 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:171)|
|| |6678 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:145)|
|| |6679 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2187)|
|| |6680 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:2149)|
|| |6681 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:2088)|
|| |6682 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.open(S3AFileSystem.java:699)|
|| |6683 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.FileSystem.open(FileSystem.java:950)|
|| |6684 at
org.apache.flink.fs.s3.common.hadoop.HadoopFileSystem.open(HadoopFileSystem.java:120)|
|| |6685 at
org.apache.flink.fs.s3.common.hadoop.HadoopFileSystem.open(HadoopFileSystem.java:37)|
|| |6686 at
org.apache.flink.core.fs.SafetyNetWrapperFileSystem.open(SafetyNetWrapperFileSystem.java:85)|
|| |6687 at
org.apache.flink.runtime.state.filesystem.FileStateHandle.openInputStream(FileStateHandle.java:68)|
|| |6688 at
org.apache.flink.contrib.streaming.state.RocksDBStateDownloader.downloadDataForStateHandle(RocksDBStateDownloader.java:127)|
|| |6689 at
org.apache.flink.contrib.streaming.state.RocksDBStateDownloader.lambda$createDownloadRunnables$0(RocksDBStateDownloader.java:109)|
|| |6690 at
org.apache.flink.util.function.ThrowingRunnable.lambda$unchecked$0(ThrowingRunnable.java:50)|
|| |6691 at
java.util.concurrent.CompletableFuture$AsyncRun.run(CompletableFuture.java:1626)|
|| |6692 at
org.apache.flink.runtime.concurrent.DirectExecutorService.execute(DirectExecutorService.java:211)|
|| |6693 at
java.util.concurrent.CompletableFuture.asyncRunStage(CompletableFuture.java:1640)|
|| |6694 at
java.util.concurrent.CompletableFuture.runAsync(CompletableFuture.java:1858)|
|| |6695 at
org.apache.flink.contrib.streaming.state.RocksDBStateDownloader.downloadDataForAllStateHandles(RocksDBStateDownloader.java:83)|
|| |6696 at
org.apache.flink.contrib.streaming.state.RocksDBStateDownloader.transferAllStateDataToDirectory(RocksDBStateDownloader.java:66)|
|| |6697 at
org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreDBInstanceFromStateHandle(RocksDBIncrementalRestoreOperation.java:406)|
|| |6698 at
org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreWithRescaling(RocksDBIncrementalRestoreOperation.java:294)|
|| |6699 at
org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restore(RocksDBIncrementalRestoreOperation.java:146)|
|| |6700 at
org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder.build(RocksDBKeyedStateBackendBuilder.java:270)|
|| |6701 at
org.apache.flink.contrib.streaming.state.RocksDBStateBackend.createKeyedStateBackend(RocksDBStateBackend.java:520)|
|| |6702 at
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.lambda$keyedStatedBackend$1(StreamTaskStateInitializerImpl.java:291)|
|| |6703 at
org.apache.flink.streaming.api.operators.BackendRestorerProcedure.attemptCreateAndRestore(BackendRestorerProcedure.java:142)|
|| |6704 at
org.apache.flink.streaming.api.operators.BackendRestorerProcedure.createAndRestore(BackendRestorerProcedure.java:121)|
|| |6705 at
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.keyedStatedBackend(StreamTaskStateInitializerImpl.java:307)|
|| |6706 at
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.streamOperatorStateContext(StreamTaskStateInitializerImpl.java:135)|
|| |6707 at
org.apache.flink.streaming.api.operators.AbstractStreamOperator.initializeState(AbstractStreamOperator.java:253)|
|| |6708 at
org.apache.flink.streaming.runtime.tasks.StreamTask.initializeState(StreamTask.java:901)|
|| |6709 at
org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:415)|
|| |6710 at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:705)|
|| |6711 at org.apache.flink.runtime.taskmanager.Task.run(Task.java:530)|
|| |6712 at java.lang.Thread.run(Thread.java:748)|
|| |6713 Caused by:
org.apache.flink.fs.s3base.shaded.com.amazonaws.AbortedException:|
|| |6714 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleInterruptedException(AmazonHttpClient.java:795)|
|| |6715 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:701)|
|| |6716 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667)|
|| |6717 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649)|
|| |6718 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513)|
|| |6719 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4325)|
|| |6720 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4272)|
|| |6721 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1264)|
|| |6722 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getObjectMetadata$4(S3AFileSystem.java:1235)|
|| |6723 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:317)|
|| |6724 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:280)|
|| |6725 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:1232)|
|| |6726 at
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2169)|
|| |6727 ... 33 more|
|| |6728 Caused by:
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.timers.client.SdkInterruptedException|
|| |6729 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.checkInterrupted(AmazonHttpClient.java:840)|
|| |6730 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.checkInterrupted(AmazonHttpClient.java:826)|
|| |6731 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1019)|
|| |6732 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743)|
|| |6733 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717)|
|| |6734 at
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699)|
|| |6735 ... 44 more|
|| |6736 2020-12-11 07:02:40,023 WARN
org.apache.flink.streaming.api.operators.BackendRestorerProcedure - Exception
while restoring keyed state backend for
KeyedProcessOperator_95b82ed71c3c71dd5de19c1705a2f620_(24/24) from alternative
(1/1), will retry while more alternatives are available.|
|| |6737 org.apache.kafka.common.errors.InterruptException:
java.lang.InterruptedException|
|| |6738 at
org.apache.kafka.clients.producer.KafkaProducer.doSend(KafkaProducer.java:884)|
|| |6739 at
org.apache.kafka.clients.producer.KafkaProducer.send(KafkaProducer.java:803)|
|| |6740 at
org.apache.kafka.clients.producer.KafkaProducer.send(KafkaProducer.java:690)|
|| |6741 at
com.pinterest.xenon.log4jappender.KafkaLog4jAppender.append(KafkaLog4jAppender.java:338)|
|| |6742 at
org.apache.log4j.AppenderSkeleton.doAppend(AppenderSkeleton.java:251)|
|| |6743 at
org.apache.log4j.helpers.AppenderAttachableImpl.appendLoopOnAppenders(AppenderAttachableImpl.java:66)|
|| |6744 at org.apache.log4j.Category.callAppenders(Category.java:206)|
|| |6745 at org.apache.log4j.Category.forcedLog(Category.java:391)|
|| |6746 at org.apache.log4j.Category.log(Category.java:856)|
|| |6747 at
org.slf4j.impl.Log4jLoggerAdapter.error(Log4jLoggerAdapter.java:576)|
|| |6748 at
org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder.build(RocksDBKeyedStateBackendBuilder.java:325)|
|| |6749 at
org.apache.flink.contrib.streaming.state.RocksDBStateBackend.createKeyedStateBackend(RocksDBStateBackend.java:520)|
|| |6750 at
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.lambda$keyedStatedBackend$1(StreamTaskStateInitializerImpl.java:291)|
|| |6751 at
org.apache.flink.streaming.api.operators.BackendRestorerProcedure.attemptCreateAndRestore(BackendRestorerProcedure.java:142)|
|| |6752 at
org.apache.flink.streaming.api.operators.BackendRestorerProcedure.createAndRestore(BackendRestorerProcedure.java:121)|
|| |6753 at
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.keyedStatedBackend(StreamTaskStateInitializerImpl.java:307)|
|| |6754 at
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.streamOperatorStateContext(StreamTaskStateInitializerImpl.java:135)|
|| |6755 at
org.apache.flink.streaming.api.operators.AbstractStreamOperator.initializeState(AbstractStreamOperator.java:253)|
|| |6756 at
org.apache.flink.streaming.runtime.tasks.StreamTask.initializeState(StreamTask.java:901)|
|| |6757 at
org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:415)|
|| |6758 at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:705)|
|| |6759 at org.apache.flink.runtime.taskmanager.Task.run(Task.java:530)|
|| |6760 at java.lang.Thread.run(Thread.java:748)|
|| |6761 Caused by: java.lang.InterruptedException|
|| |6762 at java.lang.Object.wait(Native Method)|
|| |6763 at org.apache.kafka.clients.Metadata.awaitUpdate(Metadata.java:193)|
|| |6764 at
org.apache.kafka.clients.producer.KafkaProducer.waitOnMetadata(KafkaProducer.java:938)|
|| |6765 at
org.apache.kafka.clients.producer.KafkaProducer.doSend(KafkaProducer.java:823)|
|| |6766 ... 22 more|
|| ||
--
This message was sent by Atlassian Jira
(v8.3.4#803005)