[ https://issues.apache.org/jira/browse/FLINK-8910?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16423797#comment-16423797 ]
ASF GitHub Bot commented on FLINK-8910: --------------------------------------- Github user StefanRRichter commented on a diff in the pull request: https://github.com/apache/flink/pull/5676#discussion_r178771812 --- Diff: flink-end-to-end-tests/src/main/java/org/apache/flink/streaming/tests/StickyAllocationAndLocalRecoveryTestJob.java --- @@ -0,0 +1,451 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.streaming.tests; + +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.common.functions.RichFlatMapFunction; +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.api.common.restartstrategy.RestartStrategies; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.state.ValueState; +import org.apache.flink.api.common.state.ValueStateDescriptor; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.api.java.utils.ParameterTool; +import org.apache.flink.contrib.streaming.state.RocksDBStateBackend; +import org.apache.flink.runtime.state.CheckpointListener; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.runtime.state.FunctionSnapshotContext; +import org.apache.flink.runtime.state.filesystem.FsStateBackend; +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; +import org.apache.flink.streaming.api.environment.CheckpointConfig; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction; +import org.apache.flink.streaming.api.functions.source.RichSourceFunction; +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; +import org.apache.flink.util.Collector; +import org.apache.flink.util.Preconditions; + +import org.apache.commons.lang3.RandomStringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +/** + * Automatic end-to-end test for local recovery (including sticky allocation). + */ +public class StickyAllocationAndLocalRecoveryTestJob { + + private static final Logger LOG = LoggerFactory.getLogger(StickyAllocationAndLocalRecoveryTestJob.class); + + public static void main(String[] args) throws Exception { + + final ParameterTool pt = ParameterTool.fromArgs(args); + + final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + env.setParallelism(pt.getInt("parallelism", 1)); + env.setMaxParallelism(pt.getInt("maxParallelism", pt.getInt("parallelism", 1))); + env.enableCheckpointing(pt.getInt("checkpointInterval", 1000)); + env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, pt.getInt("restartDelay", 0))); + if (pt.has("externalizedCheckpoints") && pt.getBoolean("externalizedCheckpoints", false)) { + env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + } + + String stateBackend = pt.get("stateBackend", "file"); + String checkpointDir = pt.getRequired("checkpointDir"); + + boolean killJvmOnFail = pt.getBoolean("killJvmOnFail", false); + + if ("file".equals(stateBackend)) { + boolean asyncCheckpoints = pt.getBoolean("asyncCheckpoints", false); + env.setStateBackend(new FsStateBackend(checkpointDir, asyncCheckpoints)); + } else if ("rocks".equals(stateBackend)) { + boolean incrementalCheckpoints = pt.getBoolean("incrementalCheckpoints", false); + env.setStateBackend(new RocksDBStateBackend(checkpointDir, incrementalCheckpoints)); + } else { + throw new IllegalArgumentException("Unknown backend: " + stateBackend); + } + + // make parameters available in the web interface + env.getConfig().setGlobalJobParameters(pt); + + // delay to throttle down the production of the source + long delay = pt.has("delay") ? pt.getLong("delay") : 0L; + + // the maximum number of attempts, before the job finishes with success + int maxAttempts = pt.has("maxAttempts") ? pt.getInt("maxAttempts") : 3; + + // size of one artificial value + int valueSize = pt.has("valueSize") ? pt.getInt("valueSize") : 10; + + env.addSource(new RandomLongSource(maxAttempts, delay)) + .keyBy((KeySelector<Long, Long>) aLong -> aLong) + .flatMap(new StateCreatingFlatMap(valueSize, killJvmOnFail)) + .map((MapFunction<String, String>) value -> value) + .addSink(new PrintSinkFunction<>()); + + env.execute("Sticky Allocation And Local Recovery Test"); + } + + /** + * Source function that produces a long sequence. + */ + private static final class RandomLongSource extends RichSourceFunction<Long> implements CheckpointedFunction { + + private static final long serialVersionUID = 1L; + + /** + * Generator delay between two events. + */ + final long delay; + + /** + * Maximum restarts before shutting down this source. + */ + final int maxAttempts; + + /** + * State that holds the current key for recovery. + */ + ListState<Long> sourceCurrentKeyState; + + /** + * Generator's current key. + */ + long currentKey; + + /** + * Generator runs while this is true. + */ + volatile boolean running; + + RandomLongSource(int maxAttempts, long delay) { + this.delay = delay; + this.maxAttempts = maxAttempts; + this.running = true; + } + + @Override + public void run(SourceContext<Long> sourceContext) throws Exception { + + int numberOfParallelSubtasks = getRuntimeContext().getNumberOfParallelSubtasks(); + int subtaskIdx = getRuntimeContext().getIndexOfThisSubtask(); + + // the source emits one final event and shuts down once we have reached max attempts. + if (getRuntimeContext().getAttemptNumber() > maxAttempts) { + sourceContext.collect(Long.MAX_VALUE - subtaskIdx); + return; + } + + while (running) { + sourceContext.collect(currentKey); --- End diff -- 👍 > Introduce automated end-to-end test for local recovery (including sticky > scheduling) > ------------------------------------------------------------------------------------ > > Key: FLINK-8910 > URL: https://issues.apache.org/jira/browse/FLINK-8910 > Project: Flink > Issue Type: Sub-task > Components: State Backends, Checkpointing > Affects Versions: 1.5.0 > Reporter: Stefan Richter > Assignee: Stefan Richter > Priority: Major > Fix For: 1.5.0 > > > We should have an automated end-to-end test that can run nightly to check > that sticky allocation and local recovery work as expected. -- This message was sent by Atlassian JIRA (v7.6.3#76005)