Yu Yang created FLINK-20575:
-------------------------------

             Summary: Flink application failed to restore from checkpoint
                 Key: FLINK-20575
                 URL: https://issues.apache.org/jira/browse/FLINK-20575
             Project: Flink
          Issue Type: Bug
    Affects Versions: 1.9.1
            Reporter: Yu Yang


Our Flink application failed to restore from a checkpoint due to 
com.amazonaws.AbortedException (we use the s3a file system). Initially we 
suspected a problem with the S3 file itself, but it turned out that we can 
download the file without any issue. Any insights on this issue would be very welcome. 

 
|| |6674 2020-12-11 07:02:40,018 ERROR 
org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder - 
Caught unexpected exception.|
|| |6675 java.io.InterruptedIOException: getFileStatus on 
s3a://bucket/prod/checkpoints/u/tango/910d2ff2b2c7e01e99a9588d11385e92/shared/f245da83-fc01-424d-9719-d48b99a1ed35:
 org.apache.flink.fs.s3base.shaded.com.amazonaws.AbortedException:|
|| |6676 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AUtils.translateInterruptedException(S3AUtils.java:340)|
|| |6677 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:171)|
|| |6678 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:145)|
|| |6679 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2187)|
|| |6680 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:2149)|
|| |6681 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:2088)|
|| |6682 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.open(S3AFileSystem.java:699)|
|| |6683 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.FileSystem.open(FileSystem.java:950)|
|| |6684 at 
org.apache.flink.fs.s3.common.hadoop.HadoopFileSystem.open(HadoopFileSystem.java:120)|
|| |6685 at 
org.apache.flink.fs.s3.common.hadoop.HadoopFileSystem.open(HadoopFileSystem.java:37)|
|| |6686 at 
org.apache.flink.core.fs.SafetyNetWrapperFileSystem.open(SafetyNetWrapperFileSystem.java:85)|
|| |6687 at 
org.apache.flink.runtime.state.filesystem.FileStateHandle.openInputStream(FileStateHandle.java:68)|
|| |6688 at 
org.apache.flink.contrib.streaming.state.RocksDBStateDownloader.downloadDataForStateHandle(RocksDBStateDownloader.java:127)|
|| |6689 at 
org.apache.flink.contrib.streaming.state.RocksDBStateDownloader.lambda$createDownloadRunnables$0(RocksDBStateDownloader.java:109)|
|| |6690 at 
org.apache.flink.util.function.ThrowingRunnable.lambda$unchecked$0(ThrowingRunnable.java:50)|
|| |6691 at 
java.util.concurrent.CompletableFuture$AsyncRun.run(CompletableFuture.java:1626)|
|| |6692 at 
org.apache.flink.runtime.concurrent.DirectExecutorService.execute(DirectExecutorService.java:211)|
|| |6693 at 
java.util.concurrent.CompletableFuture.asyncRunStage(CompletableFuture.java:1640)|
|| |6694 at 
java.util.concurrent.CompletableFuture.runAsync(CompletableFuture.java:1858)|
|| |6695 at 
org.apache.flink.contrib.streaming.state.RocksDBStateDownloader.downloadDataForAllStateHandles(RocksDBStateDownloader.java:83)|
|| |6696 at 
org.apache.flink.contrib.streaming.state.RocksDBStateDownloader.transferAllStateDataToDirectory(RocksDBStateDownloader.java:66)|
|| |6697 at 
org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreDBInstanceFromStateHandle(RocksDBIncrementalRestoreOperation.java:406)|
|| |6698 at 
org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreWithRescaling(RocksDBIncrementalRestoreOperation.java:294)|
|| |6699 at 
org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restore(RocksDBIncrementalRestoreOperation.java:146)|
|| |6700 at 
org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder.build(RocksDBKeyedStateBackendBuilder.java:270)|
|| |6701 at 
org.apache.flink.contrib.streaming.state.RocksDBStateBackend.createKeyedStateBackend(RocksDBStateBackend.java:520)|
|| |6702 at 
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.lambda$keyedStatedBackend$1(StreamTaskStateInitializerImpl.java:291)|
|| |6703 at 
org.apache.flink.streaming.api.operators.BackendRestorerProcedure.attemptCreateAndRestore(BackendRestorerProcedure.java:142)|
|| |6704 at 
org.apache.flink.streaming.api.operators.BackendRestorerProcedure.createAndRestore(BackendRestorerProcedure.java:121)|
|| |6705 at 
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.keyedStatedBackend(StreamTaskStateInitializerImpl.java:307)|
|| |6706 at 
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.streamOperatorStateContext(StreamTaskStateInitializerImpl.java:135)|
|| |6707 at 
org.apache.flink.streaming.api.operators.AbstractStreamOperator.initializeState(AbstractStreamOperator.java:253)|
|| |6708 at 
org.apache.flink.streaming.runtime.tasks.StreamTask.initializeState(StreamTask.java:901)|
|| |6709 at 
org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:415)|
|| |6710 at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:705)|
|| |6711 at org.apache.flink.runtime.taskmanager.Task.run(Task.java:530)|
|| |6712 at java.lang.Thread.run(Thread.java:748)|
|| |6713 Caused by: 
org.apache.flink.fs.s3base.shaded.com.amazonaws.AbortedException:|
|| |6714 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleInterruptedException(AmazonHttpClient.java:795)|
|| |6715 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:701)|
|| |6716 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667)|
|| |6717 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649)|
|| |6718 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513)|
|| |6719 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4325)|
|| |6720 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4272)|
|| |6721 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1264)|
|| |6722 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getObjectMetadata$4(S3AFileSystem.java:1235)|
|| |6723 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:317)|
|| |6724 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:280)|
|| |6725 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:1232)|
|| |6726 at 
org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2169)|
|| |6727 ... 33 more|
|| |6728 Caused by: 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.timers.client.SdkInterruptedException|
|| |6729 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.checkInterrupted(AmazonHttpClient.java:840)|
|| |6730 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.checkInterrupted(AmazonHttpClient.java:826)|
|| |6731 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1019)|
|| |6732 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743)|
|| |6733 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717)|
|| |6734 at 
org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699)|
|| |6735 ... 44 more|
|| |6736 2020-12-11 07:02:40,023 WARN 
org.apache.flink.streaming.api.operators.BackendRestorerProcedure - Exception 
while restoring keyed state backend for 
KeyedProcessOperator_95b82ed71c3c71dd5de19c1705a2f620_(24/24) from alternative (1/1), will retry 
while more alternatives are available.|
|| |6737 org.apache.kafka.common.errors.InterruptException: 
java.lang.InterruptedException|
|| |6738 at 
org.apache.kafka.clients.producer.KafkaProducer.doSend(KafkaProducer.java:884)|
|| |6739 at 
org.apache.kafka.clients.producer.KafkaProducer.send(KafkaProducer.java:803)|
|| |6740 at 
org.apache.kafka.clients.producer.KafkaProducer.send(KafkaProducer.java:690)|
|| |6741 at 
com.pinterest.xenon.log4jappender.KafkaLog4jAppender.append(KafkaLog4jAppender.java:338)|
|| |6742 at 
org.apache.log4j.AppenderSkeleton.doAppend(AppenderSkeleton.java:251)|
|| |6743 at 
org.apache.log4j.helpers.AppenderAttachableImpl.appendLoopOnAppenders(AppenderAttachableImpl.java:66)|
|| |6744 at org.apache.log4j.Category.callAppenders(Category.java:206)|
|| |6745 at org.apache.log4j.Category.forcedLog(Category.java:391)|
|| |6746 at org.apache.log4j.Category.log(Category.java:856)|
|| |6747 at 
org.slf4j.impl.Log4jLoggerAdapter.error(Log4jLoggerAdapter.java:576)|
|| |6748 at 
org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder.build(RocksDBKeyedStateBackendBuilder.java:325)|
|| |6749 at 
org.apache.flink.contrib.streaming.state.RocksDBStateBackend.createKeyedStateBackend(RocksDBStateBackend.java:520)|
|| |6750 at 
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.lambda$keyedStatedBackend$1(StreamTaskStateInitializerImpl.java:291)|
|| |6751 at 
org.apache.flink.streaming.api.operators.BackendRestorerProcedure.attemptCreateAndRestore(BackendRestorerProcedure.java:142)|
|| |6752 at 
org.apache.flink.streaming.api.operators.BackendRestorerProcedure.createAndRestore(BackendRestorerProcedure.java:121)|
|| |6753 at 
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.keyedStatedBackend(StreamTaskStateInitializerImpl.java:307)|
|| |6754 at 
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.streamOperatorStateContext(StreamTaskStateInitializerImpl.java:135)|
|| |6755 at 
org.apache.flink.streaming.api.operators.AbstractStreamOperator.initializeState(AbstractStreamOperator.java:253)|
|| |6756 at 
org.apache.flink.streaming.runtime.tasks.StreamTask.initializeState(StreamTask.java:901)|
|| |6757 at 
org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:415)|
|| |6758 at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:705)|
|| |6759 at org.apache.flink.runtime.taskmanager.Task.run(Task.java:530)|
|| |6760 at java.lang.Thread.run(Thread.java:748)|
|| |6761 Caused by: java.lang.InterruptedException|
|| |6762 at java.lang.Object.wait(Native Method)|
|| |6763 at org.apache.kafka.clients.Metadata.awaitUpdate(Metadata.java:193)|
|| |6764 at 
org.apache.kafka.clients.producer.KafkaProducer.waitOnMetadata(KafkaProducer.java:938)|
|| |6765 at 
org.apache.kafka.clients.producer.KafkaProducer.doSend(KafkaProducer.java:823)|
|| |6766 ... 22 more|
|| ||

 

 

 



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to