[ https://issues.apache.org/jira/browse/FLINK-10063?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16574455#comment-16574455 ]
ASF GitHub Bot commented on FLINK-10063: ---------------------------------------- asfgit closed pull request #6496: [FLINK-10063][tests] Use runit to supervise mesos processes. URL: https://github.com/apache/flink/pull/6496 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/flink-jepsen/docker/Dockerfile-db b/flink-jepsen/docker/Dockerfile-db index 1555329af3f..cb60efce2e5 100644 --- a/flink-jepsen/docker/Dockerfile-db +++ b/flink-jepsen/docker/Dockerfile-db @@ -21,7 +21,7 @@ FROM debian:jessie RUN echo "deb http://http.debian.net/debian jessie-backports main" >> /etc/apt/sources.list && \ apt-get update && \ apt-get install -y -t jessie-backports openjdk-8-jdk && \ - apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget + apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog runit sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget RUN apt-get update && \ apt-get -y install openssh-server && \ @@ -35,5 +35,12 @@ RUN mkdir -p /root/.ssh/ && \ chmod 600 /root/.ssh/authorized_keys && \ cat /root/id_rsa.pub >> /root/.ssh/authorized_keys +COPY sshd-run /etc/sv/service/sshd/run +RUN chmod +x /etc/sv/service/sshd/run && \ + ln -sf /etc/sv/service/sshd /etc/service + EXPOSE 22 -CMD exec /usr/sbin/sshd -D + +# Start runit process supervisor which will bring up sshd. +# In our tests we can use runit to supervise more processes, e.g., Mesos. 
+CMD runsvdir -P /etc/service /dev/null > /dev/null diff --git a/flink-jepsen/src/jepsen/flink/db.clj b/flink-jepsen/src/jepsen/flink/db.clj index 9a725d7149a..becc551e2cf 100644 --- a/flink-jepsen/src/jepsen/flink/db.clj +++ b/flink-jepsen/src/jepsen/flink/db.clj @@ -97,7 +97,7 @@ (if (cu/exists? log-dir) (cu/ls-full log-dir) [])) (defn flink-db - [test] + [] (reify db/DB (setup! [_ test node] (c/su @@ -131,7 +131,7 @@ [] (let [zk (zk/db deb-zookeeper-package) hadoop (hadoop/db hadoop-dist-url) - flink (flink-db test)] + flink (flink-db)] (combined-db [hadoop zk flink]))) (defn exec-flink! @@ -192,7 +192,7 @@ (let [zk (zk/db deb-zookeeper-package) hadoop (hadoop/db hadoop-dist-url) mesos (mesos/db deb-mesos-package deb-marathon-package) - flink (flink-db test)] + flink (flink-db)] (combined-db [hadoop zk mesos flink]))) (defn submit-job-with-retry! @@ -209,24 +209,25 @@ (let [r (fu/retry (fn [] (http/post (str (mesos/marathon-base-url test) "/v2/apps") - {:form-params {:id "flink" - :cmd (str "HADOOP_CLASSPATH=`" hadoop/install-dir "/bin/hadoop classpath` " - "HADOOP_CONF_DIR=" hadoop/hadoop-conf-dir " " - install-dir "/bin/mesos-appmaster.sh " - "-Dmesos.master=" (zookeeper-uri - test - mesos/zk-namespace) " " - "-Djobmanager.rpc.address=$(hostname -f) " - "-Djobmanager.heap.mb=2048 " - "-Djobmanager.rpc.port=6123 " - "-Djobmanager.web.port=8081 " - "-Dmesos.resourcemanager.tasks.mem=2048 " - "-Dtaskmanager.heap.mb=2048 " - "-Dtaskmanager.numberOfTaskSlots=2 " - "-Dmesos.resourcemanager.tasks.cpus=1 " - "-Drest.bind-address=$(hostname -f) ") - :cpus 1.0 - :mem 2048} + {:form-params {:id "flink" + :cmd (str "HADOOP_CLASSPATH=`" hadoop/install-dir "/bin/hadoop classpath` " + "HADOOP_CONF_DIR=" hadoop/hadoop-conf-dir " " + install-dir "/bin/mesos-appmaster.sh " + "-Dmesos.master=" (zookeeper-uri + test + mesos/zk-namespace) " " + "-Djobmanager.rpc.address=$(hostname -f) " + "-Djobmanager.heap.mb=2048 " + "-Djobmanager.rpc.port=6123 " + "-Djobmanager.web.port=8081 " 
+ "-Dmesos.resourcemanager.tasks.mem=2048 " + "-Dtaskmanager.heap.mb=2048 " + "-Dtaskmanager.numberOfTaskSlots=2 " + "-Dmesos.resourcemanager.tasks.cpus=1 " + "-Drest.bind-address=$(hostname -f) ") + :cpus 1.0 + :mem 2048 + :maxLaunchDelaySeconds 3} :content-type :json})))] (info "Submitted Flink Application via Marathon" r) (c/on (-> test :nodes sort first) diff --git a/flink-jepsen/src/jepsen/flink/mesos.clj b/flink-jepsen/src/jepsen/flink/mesos.clj index fd75991bdf5..a73f25fd489 100644 --- a/flink-jepsen/src/jepsen/flink/mesos.clj +++ b/flink-jepsen/src/jepsen/flink/mesos.clj @@ -24,11 +24,37 @@ [jepsen.os.debian :as debian] [jepsen.flink.zookeeper :refer [zookeeper-uri]])) +;;; runit process supervisor (http://smarden.org/runit/) +;;; +;;; We use runit to supervise Mesos processes because Mesos uses a "fail-fast" approach to +;;; error handling, e.g., the Mesos master will exit when it discovers it has been partitioned away +;;; from the Zookeeper quorum. + +(def runit-version "2.1.2-3") + +(defn create-supervised-service! + "Registers a service with the process supervisor and starts it." + [service-name cmd] + (let [service-dir (str "/etc/sv/" service-name) + run-script (str service-dir "/run")] + (c/su + (c/exec :mkdir :-p service-dir) + (c/exec :echo (clojure.string/join "\n" ["#!/bin/sh" + "exec 2>&1" + (str "exec " cmd)]) :> run-script) + (c/exec :chmod :+x run-script) + (c/exec :ln :-sf service-dir (str "/etc/service/" service-name))))) + +(defn stop-supervised-service! + "Stops a service and removes it from supervision." 
+ [service-name] + (c/su + (c/exec :sv :down service-name) + (c/exec :rm :-f (str "/etc/service/" service-name)))) + ;;; Mesos (def master-count 1) -(def master-pidfile "/var/run/mesos/master.pid") -(def slave-pidfile "/var/run/mesos/slave.pid") (def master-dir "/var/lib/mesos/master") (def slave-dir "/var/lib/mesos/slave") (def log-dir "/var/log/mesos") @@ -40,115 +66,130 @@ (def marathon-bin "/usr/bin/marathon") (def zk-marathon-namespace "marathon") -(def marathon-pidfile "/var/run/mesos/marathon.pid") (def marathon-rest-port 8080) -(defn install! - [test node mesos-version marathon-version] - (c/su - (debian/add-repo! :mesosphere - "deb http://repos.mesosphere.com/debian jessie main" - "keyserver.ubuntu.com" - "E56151BF") - (debian/install {:mesos mesos-version - :marathon marathon-version}) - (c/exec :mkdir :-p "/var/run/mesos") - (c/exec :mkdir :-p master-dir) - (c/exec :mkdir :-p slave-dir))) - ;;; Mesos functions +(defn mesos-master-cmd + "Returns the command to run the mesos master." + [test node] + (clojure.string/join " " + ["env GLOG_v=1" + master-bin + (str "--hostname=" (name node)) + (str "--log_dir=" log-dir) + (str "--offer_timeout=30secs") + (str "--quorum=" (util/majority master-count)) + (str "--registry_fetch_timeout=120secs") + (str "--registry_store_timeout=5secs") + (str "--work_dir=" master-dir) + (str "--zk=" (zookeeper-uri test zk-namespace))])) + +(defn mesos-slave-cmd + "Returns the command to run the mesos agent." + [test node] + (clojure.string/join " " + ["env GLOG_v=1" + slave-bin + (str "--hostname=" (name node)) + (str "--log_dir=" log-dir) + (str "--master=" (zookeeper-uri test zk-namespace)) + (str "--recovery_timeout=30secs") + (str "--work_dir=" slave-dir)])) + +(defn create-mesos-master-supervised-service! + [test node] + (create-supervised-service! "mesos-master" + (mesos-master-cmd test node))) + +(defn create-mesos-slave-supervised-service! + [test node] + (create-supervised-service! 
"mesos-slave" + (mesos-slave-cmd test node))) + +(defn master-node? + "Returns a truthy value if the node should run the mesos master." + [test node] + (some #{node} (take master-count (sort (:nodes test))))) + (defn start-master! [test node] - (when (some #{node} (take master-count (sort (:nodes test)))) + (when (master-node? test node) (info node "Starting mesos master") (c/su - (c/exec :start-stop-daemon - :--background - :--chdir master-dir - :--exec "/usr/bin/env" - :--make-pidfile - :--no-close - :--oknodo - :--pidfile master-pidfile - :--start - :-- - "GLOG_v=1" - master-bin - (str "--hostname=" (name node)) - (str "--log_dir=" log-dir) - (str "--offer_timeout=30secs") - (str "--quorum=" (util/majority master-count)) - (str "--registry_fetch_timeout=120secs") - (str "--registry_store_timeout=5secs") - (str "--work_dir=" master-dir) - (str "--zk=" (zookeeper-uri test zk-namespace)) - :>> (str log-dir "/master.stdout") - (c/lit "2>&1"))))) + (create-mesos-master-supervised-service! test node)))) (defn start-slave! [test node] - (when-not (some #{node} (take master-count (sort (:nodes test)))) + (when-not (master-node? test node) (info node "Starting mesos slave") (c/su - (c/exec :start-stop-daemon :--start - :--background - :--chdir slave-dir - :--exec slave-bin - :--make-pidfile - :--no-close - :--pidfile slave-pidfile - :--oknodo - :-- - (str "--hostname=" (name node)) - (str "--log_dir=" log-dir) - (str "--master=" (zookeeper-uri test zk-namespace)) - (str "--recovery_timeout=30secs") - (str "--work_dir=" slave-dir) - :>> (str log-dir "/slave.stdout") - (c/lit "2>&1"))))) + (create-mesos-slave-supervised-service! test node)))) (defn stop-master! - [node] - (info node "Stopping mesos master") - (meh (cu/grepkill! :mesos-master)) - (meh (c/exec :rm :-rf master-pidfile)) - (meh (c/exec :rm :-rf - (c/lit (str log-dir "/*")) - (c/lit (str master-dir "/*"))))) + [test node] + (when (master-node? 
test node) + (info node "Stopping mesos master") + (stop-supervised-service! "mesos-master") + (meh (c/exec :rm :-rf + (c/lit (str log-dir "/*")) + (c/lit (str master-dir "/*")))))) (defn stop-slave! - [node] - (info node "Stopping mesos slave") - (meh (cu/grepkill! :mesos-slave)) - (meh (c/exec :rm :-rf slave-pidfile)) - (meh (c/exec :rm :-rf - (c/lit (str log-dir "/*")) - (c/lit (str slave-dir "/*"))))) + [test node] + (when-not (master-node? test node) + (info node "Stopping mesos slave") + (stop-supervised-service! "mesos-slave") + (meh (c/exec :rm :-rf + (c/lit (str log-dir "/*")) + (c/lit (str slave-dir "/*")))))) ;;; Marathon functions +(defn install! + [test node mesos-version marathon-version] + (c/su + (debian/add-repo! :mesosphere + "deb http://repos.mesosphere.com/debian jessie main" + "keyserver.ubuntu.com" + "E56151BF") + (debian/install {:mesos mesos-version + :marathon marathon-version + :runit runit-version}) + (c/exec :mkdir :-p "/var/run/mesos") + (c/exec :mkdir :-p master-dir) + (c/exec :mkdir :-p slave-dir))) + +(defn marathon-cmd + "Returns the command to run the marathon." + [test node] + (clojure.string/join " " + [marathon-bin + (str "--hostname " node) + (str "--master " (zookeeper-uri test zk-namespace)) + (str "--zk " (zookeeper-uri test zk-marathon-namespace)) + (str ">> " log-dir "/marathon.out")])) + +(defn create-marathon-supervised-service! + [test node] + (create-supervised-service! "marathon" + (marathon-cmd test node))) + +(defn marathon-node? + [test node] + (= node (first (sort (:nodes test))))) + (defn start-marathon! [test node] - (when (= node (first (sort (:nodes test)))) + (when (marathon-node? 
test node) (info "Start marathon") (c/su - (c/exec :start-stop-daemon :--start - :--background - :--exec marathon-bin - :--make-pidfile - :--no-close - :--pidfile marathon-pidfile - :-- - (c/lit (str "--hostname " node)) - (c/lit (str "--master " (zookeeper-uri test zk-namespace))) - (c/lit (str "--zk " (zookeeper-uri test zk-marathon-namespace))) - :>> (str log-dir "/marathon.stdout") - (c/lit "2>&1"))))) + (create-marathon-supervised-service! test node)))) (defn stop-marathon! - [] - (cu/grepkill! "marathon")) + [test node] + (when (marathon-node? test node) + (stop-supervised-service! "marathon"))) (defn marathon-base-url [test] @@ -163,9 +204,9 @@ (start-slave! test node) (start-marathon! test node)) (teardown! [this test node] - (stop-slave! node) - (stop-master! node) - (stop-marathon!)) + (stop-slave! test node) + (stop-master! test node) + (stop-marathon! test node)) db/LogFiles (log-files [_ test node] (if (cu/exists? log-dir) (cu/ls-full log-dir) [])))) ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Jepsen: Automatically restart Mesos Processes > --------------------------------------------- > > Key: FLINK-10063 > URL: https://issues.apache.org/jira/browse/FLINK-10063 > Project: Flink > Issue Type: Bug > Components: Tests > Affects Versions: 1.6.0 > Reporter: Gary Yao > Assignee: Gary Yao > Priority: Critical > Labels: pull-request-available > Fix For: 1.6.1, 1.7.0 > > > Use a process supervisor to automatically restart Mesos processes. This is > needed because Mesos uses a "fail-fast" approach to error handling, e.g., the > Mesos master will exit when it discovers it has been partitioned away from > the Zookeeper quorum. 
Currently, some of the tests cannot pass because the > Mesos processes exit. > *Acceptance Criteria* > * Running tests with {{--deployment-mode mesos-session}} should not fail due > to reasons related to the Mesos setup. -- This message was sent by Atlassian JIRA (v7.6.3#76005)