rkhachatryan commented on a change in pull request #12670: URL: https://github.com/apache/flink/pull/12670#discussion_r441449694
########## File path: flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/CheckpointCoordinator.java ########## @@ -538,51 +542,61 @@ private void startTriggeringCheckpoint(CheckpointTriggerRequest request) { coordinatorsToCheckpoint, pendingCheckpoint, timer), timer); - FutureUtils.assertNoException( - CompletableFuture.allOf(masterStatesComplete, coordinatorCheckpointsComplete) - .handleAsync( - (ignored, throwable) -> { - final PendingCheckpoint checkpoint = - FutureUtils.getWithoutException(pendingCheckpointCompletableFuture); - - Preconditions.checkState( - checkpoint != null || throwable != null, - "Either the pending checkpoint needs to be created or an error must have been occurred."); - - if (throwable != null) { - // the initialization might not be finished yet - if (checkpoint == null) { - onTriggerFailure(request, throwable); - } else { - onTriggerFailure(checkpoint, throwable); - } + FutureUtils.waitForAll(asList(masterStatesComplete, coordinatorCheckpointsComplete)) + .handleAsync( + (ignored, throwable) -> { + final PendingCheckpoint checkpoint = + FutureUtils.getWithoutException(pendingCheckpointCompletableFuture); + + Preconditions.checkState( + checkpoint != null || throwable != null, + "Either the pending checkpoint needs to be created or an error must have been occurred."); + + if (throwable != null) { + // the initialization might not be finished yet + if (checkpoint == null) { + onTriggerFailure(request, throwable); } else { - if (checkpoint.isDiscarded()) { - onTriggerFailure( - checkpoint, - new CheckpointException( - CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, - checkpoint.getFailureCause())); - } else { - // no exception, no discarding, everything is OK - final long checkpointId = checkpoint.getCheckpointId(); - snapshotTaskState( - timestamp, - checkpointId, - checkpoint.getCheckpointStorageLocation(), - request.props, - executions, - request.advanceToEndOfTime); - - coordinatorsToCheckpoint.forEach((ctx) -> 
ctx.afterSourceBarrierInjection(checkpointId)); - - onTriggerSuccess(); - } + onTriggerFailure(checkpoint, throwable); } + } else { + if (checkpoint.isDiscarded()) { + onTriggerFailure( + checkpoint, + new CheckpointException( + CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, + checkpoint.getFailureCause())); + } else { + // no exception, no discarding, everything is OK + final long checkpointId = checkpoint.getCheckpointId(); + snapshotTaskState( + timestamp, + checkpointId, + checkpoint.getCheckpointStorageLocation(), + request.props, + executions, + request.advanceToEndOfTime); + + coordinatorsToCheckpoint.forEach((ctx) -> ctx.afterSourceBarrierInjection(checkpointId)); + + onTriggerSuccess(); + } + } - return null; - }, - timer)); + return null; + }, + timer) + .whenComplete((unused, error) -> { + if (error != null) { + if (!isShutdown()) { + failureManager.handleJobLevelCheckpointException(new CheckpointException(EXCEPTION, error), Optional.empty()); Review comment: I've reverted to `System.exit` after an offline discussion with @pnowojski (the motivation is to have a simpler behavior and changeset given the time pressure of 1.11) ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org