[ 
https://issues.apache.org/jira/browse/FLINK-4348?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15430798#comment-15430798
 ] 

ASF GitHub Bot commented on FLINK-4348:
---------------------------------------

Github user tillrohrmann commented on a diff in the pull request:

    https://github.com/apache/flink/pull/2389#discussion_r75681431
  
    --- Diff: 
flink-runtime/src/main/java/org/apache/flink/runtime/rpc/resourcemanager/ResourceManagerToTaskExecutorHeartbeatScheduler.java
 ---
    @@ -0,0 +1,242 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.flink.runtime.rpc.resourcemanager;
    +
    +import akka.dispatch.OnFailure;
    +import akka.dispatch.OnSuccess;
    +import org.apache.flink.runtime.clusterframework.types.ResourceID;
    +import org.apache.flink.runtime.rpc.taskexecutor.SlotReport;
    +import org.apache.flink.runtime.rpc.taskexecutor.TaskExecutorGateway;
    +import org.slf4j.Logger;
    +import scala.concurrent.Future;
    +import scala.concurrent.duration.FiniteDuration;
    +
    +import java.util.UUID;
    +import java.util.concurrent.TimeUnit;
    +import java.util.concurrent.TimeoutException;
    +
    +import static org.apache.flink.util.Preconditions.checkState;
    +
    +/**
    + * heartbeat between ResourceManager and TaskManager, it is responsible 
for schedule heartbeat and handle
    + * heartbeat lost cases
    + */
    +public class ResourceManagerToTaskExecutorHeartbeatScheduler {
    +
    +   /** default heartbeat interval time in millisecond */
    +   private static final long INITIAL_HEARTBEAT_INTERVAL_MILLIS = 5000;
    +
    +   /** default heartbeat timeout in millisecond */
    +   private static final long INITIAL_HEARTBEAT_TIMEOUT_MILLIS = 200;
    +
    +   /** max heartbeat interval time in millisecond (which is used in retry 
heartbeat case) */
    +   private static final long MAX_HEARTBEAT_TIMEOUT_MILLIS = 30000;
    +
    +   /** if a failure except for timeout exception happened when trigger 
heartbeat from resourceManager to taskManager , next attemp will start after  
ERROR_HEARTBEAT_DELAY_MILLIS millisecond */
    +   private static final long ERROR_HEARTBEAT_DELAY_MILLIS = 2000;
    +
    +   /** max heartbeat retry times when lost heartbeat */
    +   private static final int MAX_ATTEMPT_TIMES = 8;
    +
    +   private final long heartbeatInterval;
    +
    +   private final long heartbeatTimeout;
    +
    +   private final long maxHeartbeatTimeout;
    +
    +   private final long delayOnError;
    +
    +   private final int maxAttempt;
    +
    +
    +   /** taskManagerGateway to receive the heartbeat and report slot 
allocation */
    +   private final TaskExecutorGateway taskExecutorGateway;
    +
    +   /** the taskManager address */
    +   private final String taskExecutorAddress;
    +
    +   /** identify the taskManager resourceID */
    +   private final ResourceID resourceID;
    +
    +   /** identify the resourceManager rpc endpoint */
    +   private final ResourceManager resourceManager;
    +
    +   private final UUID resourceManagerLeaderSessionID;
    +
    +   private final Logger log;
    +
    +   private volatile boolean closed;
    +
    +   /**
    +    * ResourceManagerToTaskExecutorHeartbeatScheduler constructor
    +    *
    +    * @param resourceManager         resourceManager which handles 
heartbeat communication with taskManager
    +    * @param taskExecutorGateway     taskManager which receives heartbeat 
from resourceManager and report its slot
    +    *                                allocation to resourceManager
    +    * @param taskExecutorAddress     taskManager's address
    +    * @param taskExecutorResourceID  taskManager's resourceID
    +    * @param log
    +    */
    +   public ResourceManagerToTaskExecutorHeartbeatScheduler(
    --- End diff --
    
    This would also make it easier to test this component.


> Implement communication from ResourceManager to TaskManager
> -----------------------------------------------------------
>
>                 Key: FLINK-4348
>                 URL: https://issues.apache.org/jira/browse/FLINK-4348
>             Project: Flink
>          Issue Type: Sub-task
>          Components: Cluster Management
>            Reporter: Kurt Young
>            Assignee: zhangjing
>
> There are mainly 3 kinds of interactions initiated from RM to TM:
> * Heartbeat: RM uses heartbeats to sync with the TM's slot status.
> * SlotRequest: when RM decides to assign a slot to JM, it should first try to send a 
> request to the TM for the slot. The TM can either accept or reject this request.
> * FailureNotify: in some corner cases, the TM will be marked as invalid by the 
> cluster manager master (e.g. YARN master), but the TM itself does not realize it. RM 
> should send a failure notification to the TM so that the TM can terminate itself.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to