Hello, I am trying to install Slurm on a small test cluster. Right after installation the nodes were up and running, but after rebooting the machines the following error appears:
slurmd: debug: switch NONE plugin loaded
slurmd: error: Error binding slurm stream socket: Address already in use
slurmd: error: Unable to bind listen port (192.168.70.213:8018): Address already in use

Below are the details of my installation history and configuration files. If you have the time to take a look and give me an idea of how to solve the problem, please let me know. Thank you.

Kind regards,
Alseny

-------------------------------------------------------------------------------
# INSTALLATION HISTORY DETAILS
# mini cluster test set-up:
# master node    : hostname -> toklap124, IP 192.168.70.214
# compute node 1 : hostname -> tokwor112, IP 192.168.70.212
# compute node 2 : hostname -> toklap120, IP 192.168.70.213
# All the following commands have been run as root on each of the 3 machines

############################################################### AT MASTER NODE
# checking IP and hostname
ip route get 8.8.8.8 | awk '{print $NF; exit}'
# 192.168.70.214
# checking that the hostname is correct
vi /etc/hostname
# toklap124
vi /etc/hosts
# 127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
# ::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
# 192.168.70.214 toklap124

############################################################### AT COMPUTE NODE1
# checking IP and hostname
ip route get 8.8.8.8 | awk '{print $NF; exit}'
# 192.168.70.212
vi /etc/hostname
# tokwor112
vi /etc/hosts
# 127.0.0.1   localhost.localdomain localhost4 localhost4.localdomain4
# ::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
# 192.168.70.212 tokwor112

############################################################### AT COMPUTE NODE2
# checking IP and hostname
ip route get 8.8.8.8 | awk '{print $NF; exit}'
# 192.168.70.213
vi /etc/hostname
# toklap120
vi /etc/hosts
# 127.0.0.1   localhost.localdomain localhost4 localhost4.localdomain4
# ::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
# 192.168.70.213 toklap120

-------------------------------------------------------------- REMOVE PREVIOUS INSTALLATION
########################################################## ON ALL NODES
yum remove mariadb-server mariadb-devel -y
yum remove slurm munge munge-libs munge-devel -y
userdel -r slurm
userdel -r munge

# install mariadb
yum install mariadb-server mariadb-devel -y

cd /
# create the new group and user "munge" (-g assigns the numerical group ID)
export MUNGEUSER=1127
groupadd -g $MUNGEUSER munge
useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGEUSER -g munge -s /sbin/nologin munge

# create the new group and user "slurm"
export SLURMUSER=1128
groupadd -g $SLURMUSER slurm
useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm -s /bin/bash slurm

# check
grep '1127' /etc/passwd
# OUTPUT -> munge:x:1127:1127:MUNGE Uid 'N' Gid Emporium:/var/lib/munge:/sbin/nologin
grep '1128' /etc/passwd
# OUTPUT -> slurm:x:1128:1128:SLURM workload manager:/var/lib/slurm:/bin/bash

############################################################# BACK TO MASTER NODE
# at master node
yum install epel-release -y
yum install munge munge-libs munge-devel -y
yum install rng-tools -y
/usr/sbin/create-munge-key -r
# overwrite key? yes
dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key

############################################################# AT COMPUTE NODES
# at compute nodes
yum install epel-release -y
yum install munge munge-libs munge-devel -y
rm -rf /etc/munge/munge.key

############################################################# AT MASTER NODE
# at master node
# sending the key to each of the compute nodes
# compute node 1 tokwor112:
scp /etc/munge/munge.key root@192.168.70.212:/etc/munge
# compute node 2 toklap120:
scp /etc/munge/munge.key root@192.168.70.213:/etc/munge

# starting munge services
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
chown -R munge: /etc/munge/ /var/log/munge/
chmod 0700 /etc/munge/ /var/log/munge/
systemctl enable munge
systemctl start munge

############################################################## AT COMPUTE NODES
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
chown -R munge: /etc/munge/ /var/log/munge/
chmod 0700 /etc/munge/ /var/log/munge/
systemctl enable munge
systemctl start munge

############################################################## AT MASTER
# testing munge
[root@toklap124 /]# munge -n | unmunge
STATUS: Success (0)
ENCODE_HOST: toklap124 (192.168.70.214)
ENCODE_TIME: 2018-12-27 14:44:19 +0900 (1545889459)
DECODE_TIME: 2018-12-27 14:44:19 +0900 (1545889459)

[root@toklap124 /]# munge -n | ssh 192.168.70.212 unmunge
root@192.168.70.212's password:
STATUS: Success (0)
ENCODE_HOST: ??? (192.168.70.214)
ENCODE_TIME: 2018-12-27 14:44:52 +0900 (1545889492)
DECODE_TIME: 2018-12-27 14:44:57 +0900 (1545889497)

[root@toklap124 /]# munge -n | ssh 192.168.70.213 unmunge
root@192.168.70.213's password:
STATUS: Success (0)
ENCODE_HOST: ??? (192.168.70.214)
ENCODE_TIME: 2018-12-27 14:46:08 +0900 (1545889568)
DECODE_TIME: 2018-12-27 14:46:13 +0900 (1545889573)

# OK, it is working, but is that ENCODE_HOST right?
# "???" does not look quite right; anyway, let's keep going.
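# (A check I have not actually run yet: my guess is that "???" appears because
#  the compute nodes have no /etc/hosts entry for the master's address, so the
#  decoding side cannot resolve 192.168.70.214 to a name. Something like the
#  following, run on a compute node, should confirm whether the name resolves:)
getent hosts 192.168.70.214
# if this prints nothing, adding "192.168.70.214 toklap124" to /etc/hosts on the
# compute nodes would presumably make ENCODE_HOST show the hostname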
# at compute node 192.168.70.212 :
[root@tokwor112 /]# munge -n | unmunge
STATUS: Success (0)
ENCODE_HOST: tokwor112 (192.168.70.212)
ENCODE_TIME: 2018-12-27 14:48:40 +0900 (1545889720)
DECODE_TIME: 2018-12-27 14:48:40 +0900 (1545889720)

[root@toklap120 /]# munge -n | unmunge
STATUS: Success (0)
ENCODE_HOST: toklap120 (192.168.70.213)
ENCODE_TIME: 2018-12-27 14:49:39 +0900 (1545889779)
DECODE_TIME: 2018-12-27 14:49:39 +0900 (1545889779)

###################################################### SLURM INSTALLATION
###################################################### IN EACH NODE (both master and compute)
yum install gcc gcc-c++ gcc-gfortran kernel-devel -y
yum install openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad cpanm* -y
yum install wget gcc gcc-c++ hdf5 hdf5-devel -y
yum install libcurl-devel json-c-devel lz4-devel libibmad-devel libssh2-devel glibc-devel glib2-devel gtk2-devel -y
yum install rpmdevtools -y
cd ~

######################## AT MASTER NODE:
rm -rf rpmbuild
rpmbuild -ta slurm-17.11.5.tar.bz2
libtool --finish /lib64/security
rm -rf ~/slurm_rpms/
mkdir ~/slurm_rpms
mv rpmbuild/RPMS/x86_64/slurm*.rpm ~/slurm_rpms

# sending the rpms to the compute nodes:
scp -r ~/slurm_rpms root@192.168.70.212:~/
scp -r ~/slurm_rpms root@192.168.70.213:~/

########################################################## IN EACH NODE
yum install ntp -y
yum install mailx -y
yum install ~/slurm_rpms/*.rpm -y

vi /etc/slurm/slurm.conf
# and inside we copy the following:
############################################################################ /etc/slurm/slurm.conf STARTS
# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ControlMachine=toklap124
ControlAddr=192.168.70.214
#
# additional suggestions from https://wiki.fysik.dtu.dk/niflheim/Slurm_configuration#reboot-option
RebootProgram="/usr/sbin/reboot"
UnkillableStepTimeout=120
# end additional suggestions
#
MailProg=/bin/mail
MpiDefault=none
#MpiParams=ports=#-#
ProctrackType=proctrack/cgroup
ReturnToService=1
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmctldPort=8017
SlurmdPidFile=/var/run/slurm/slurmd.pid
SlurmdPort=8018
SlurmdSpoolDir=/var/spool/slurm
SlurmUser=slurm
#SlurmdUser=root
StateSaveLocation=/var/spool/slurm
SwitchType=switch/none
TaskPlugin=task/affinity
#
#
# TIMERS
#KillWait=30
#MinJobAge=300
#SlurmctldTimeout=120
#SlurmdTimeout=300
#
#
# SCHEDULING
FastSchedule=1
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_Core
#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/none
ClusterName=cluster
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
#SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
#SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
#
#
# COMPUTE NODES
NodeName=tokwor112 NodeAddr=192.168.70.212 CPUs=8 Sockets=1 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN
NodeName=toklap120 NodeAddr=192.168.70.213 CPUs=4 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 State=UNKNOWN
PartitionName=production Nodes=ALL Default=YES MaxTime=INFINITE State=UP
#################################################################### /etc/slurm/slurm.conf ENDS

# we also set cgroup.conf as follows on each of the nodes
############################################################## /etc/slurm/cgroup.conf STARTS
CgroupAutomount=yes
CgroupReleaseAgentDir="/etc/slurm/cgroup"
ConstrainCores=no
ConstrainRAMSpace=yes
TaskAffinity=no
ConstrainSwapSpace=yes
AllowedSwapSpace=0
############################################################## /etc/slurm/cgroup.conf ENDS

########################################### both on master and on compute nodes
mkdir /var/run/slurm
chown slurm: /var/run/slurm
chmod 755 /var/run/slurm
mkdir /var/spool/slurm
chown slurm: /var/spool/slurm
chmod 755 /var/spool/slurm

slurmd -C
# OUTPUT @ master node
NodeName=toklap124 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=2 ThreadsPerCore=2 RealMemory=7727
# OUTPUT @ compute node 1
NodeName=tokwor112 CPUs=8 Boards=1 SocketsPerBoard=1 CoresPerSocket=4 ThreadsPerCore=2 RealMemory=15811
# OUTPUT @ compute node 2
NodeName=toklap120 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=2 ThreadsPerCore=2 RealMemory=7728

sed -i -e 's/PIDFile=.*/PIDFile=\/var\/run\/slurm\/slurmctld.pid/g' /usr/lib/systemd/system/slurmctld.service
sed -i -e 's/PIDFile=.*/PIDFile=\/var\/run\/slurm\/slurmd.pid/g' /usr/lib/systemd/system/slurmd.service

# start services
############################## MASTER NODE ONLY
systemctl enable slurmctld
systemctl start slurmctld
systemctl status slurmctld.service
# OK, the master node is now up and running
# OUTPUT
# ● slurmctld.service - Slurm controller daemon
#    Loaded: loaded (/usr/lib/systemd/system/slurmctld.service; enabled; vendor preset: disabled)
#    Active: active (running) since Tue 2019-01-08 14:04:45 JST; 491ms ago
#   Process: 30750 ExecStart=/usr/sbin/slurmctld $SLURMCTLD_OPTIONS (code=exited, status=0/SUCCESS)
#  Main PID: 30753 (slurmctld)
#     Tasks: 7
#    CGroup: /system.slice/slurmctld.service
#            └─30753 /usr/sbin/slurmctld
#
# Jan 08 14:04:45 toklap124 systemd[1]: Starting Slurm controller daemon...
# Jan 08 14:04:45 toklap124 systemd[1]: Started Slurm controller daemon.
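# (A verification step that is not part of the original history: since the
#  configuration uses SlurmctldPort=8017 and SlurmdPort=8018, something like
#  the following, run on the master, should confirm that the controller answers
#  and show which ports are actually bound:)
scontrol ping
ss -tlnp | grep -E ':(8017|8018)'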
# start services
############################## COMPUTE NODES ONLY
systemctl enable slurmd.service
systemctl start slurmd.service
systemctl status slurmd.service

# OUTPUT
# COMPUTE NODE 1
# ● slurmd.service - Slurm node daemon
#    Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor preset: disabled)
#    Active: active (running) since Tue 2019-01-08 14:05:09 JST; 453ms ago
#   Process: 22335 ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS (code=exited, status=0/SUCCESS)
#  Main PID: 22340 (slurmd)
#     Tasks: 2
#    CGroup: /system.slice/slurmd.service
#            ├─ 4960 /usr/sbin/slurmd
#            └─22340 /usr/sbin/slurmd
#
# Jan 08 14:05:09 tokwor112 systemd[1]: Starting Slurm node daemon...
# Jan 08 14:05:09 tokwor112 systemd[1]: PID file /var/run/slurm/slurmd.pid not readable (yet?) after start.
# Jan 08 14:05:09 tokwor112 systemd[1]: Started Slurm node daemon.

# COMPUTE NODE 2
# ● slurmd.service - Slurm node daemon
#    Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor preset: disabled)
#    Active: active (running) since Tue 2019-01-08 14:05:17 JST; 541ms ago
#   Process: 7873 ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS (code=exited, status=0/SUCCESS)
#  Main PID: 7878 (slurmd)
#     Tasks: 3
#    CGroup: /system.slice/slurmd.service
#            ├─1236 /usr/sbin/slurmd
#            └─7878 /usr/sbin/slurmd
#
# Jan 08 14:05:17 toklap120 systemd[1]: Starting Slurm node daemon...
# Jan 08 14:05:17 toklap120 systemd[1]: Started Slurm node daemon.

############################################# AT NODE 1
# now compute node 1 is up and running and sinfo works
sinfo
# OUTPUT
# PARTITION   AVAIL  TIMELIMIT  NODES  STATE NODELIST
# production*    up   infinite      2   idle toklap120,tokwor112

############################################# AT NODE 2
sinfo
# OUTPUT
# PARTITION   AVAIL  TIMELIMIT  NODES  STATE NODELIST
# production*    up   infinite      2   idle toklap120,tokwor112

################################################################### COMPUTE NODE 2 job test
[root@toklap120 ~]# srun --ntasks=1 --label /bin/hostname && pwd && whoami
# OUTPUT
# 0: toklap120
# /root/testSlurm
# root

cd ~
mkdir testSlurm
cd testSlurm/
vi job.slurm
# and inside we copy:
#!/bin/bash
#SBATCH -J pbe_delta       # Job name
#SBATCH -o pbe_delta.o%j   # Name of stdout output file (%j expands to jobId)
#SBATCH -e pbe_delta.o%j   # Name of stderr output file (%j expands to jobId)
#SBATCH -N 1               # Total number of nodes requested (16 cores/node)
#SBATCH -n 1
#SBATCH -t 48:00:00        # Run time (hh:mm:ss)
date > output.out
pwd >> output.out
hostname >> output.out
ls -lah

# launching the job
sbatch job.slurm
# getting the message:
# Submitted batch job 17
# inside the directory 2 new files are present: output.out pbe_delta.o17
# output.out contains the date, pwd and hostname as requested in the submitted job:
# Tue 8 Jan 14:22:57 JST 2019
# /root/testSlurm
# toklap120

############################## >> REBOOT MACHINE TESTING
# rebooting each node
reboot now

# AFTER RESTART COMPLETES
################################################## IN EACH COMPUTE NODE
# disabling the firewall on the compute nodes
systemctl stop firewalld
systemctl disable firewalld
# OK, no errors

# synchronizing clocks
chkconfig ntpd on
ntpdate pool.ntp.org
systemctl start ntpd
# clocks should now be synchronized

systemctl enable munge
systemctl start munge
systemctl enable slurmd
systemctl stop slurmd
systemctl start slurmd
systemctl status slurmd

slurmd -D -vvv
# OUTPUT
# from NODE 1
slurmd: debug: switch NONE plugin loaded
slurmd: error: Error binding slurm stream socket: Address already in use
slurmd: error: Unable to bind listen port (192.168.70.212:8018): Address already in use
# from NODE 2
slurmd: debug: switch NONE plugin loaded
slurmd: error: Error binding slurm stream socket: Address already in use
slurmd: error: Unable to bind listen port (192.168.70.213:8018): Address already in use
# not working anymore
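# (A check I have not run yet: since "systemctl start slurmd" is issued just
#  before the manual "slurmd -D -vvv", my guess is that the slurmd started by
#  systemd is already bound to port 8018, so the manually launched foreground
#  slurmd cannot bind. Something like the following, run on a compute node,
#  should show which process currently owns the port:)
ss -tlnp | grep 8018
pgrep -a slurmd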