Hi all, I am trying to hold a job with scontrol, but I am not able to hold it, and I do not understand why. Can anyone please explain the concepts of Hold and Release, and of Suspend and Resume?
Please find the below steps which i have tried. [root@master ~]# cat test.sh #!/bin/bash #SBATCH -N 1 #SBATCH -n 1 #SBATCH -p hpc #SBATCH -t 01:00:00 #SBATCH -J testjob #SBATCH -o testjob.o%j #SBATCH -e testjob.e%j cd $SLURM_SUBMIT_DIR /bin/hostname date sleep 120 [root@master ~]# sbatch test.sh Submitted batch job 28 [root@master ~]# sbatch test.sh Submitted batch job 29 [root@master ~]# sbatch test.sh Submitted batch job 30 [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root PD 0:00 1 (Resources) 28 hpc testjob root R 0:06 1 master 29 hpc testjob root R 0:05 1 master [root@master ~]# sinfo -Nl Sun May 23 11:16:55 2021 NODELIST NODES PARTITION STATE CPUS S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON master 1 hpc* allocated 2 2:1:1 1024 0 1 (null) none [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root PD 0:00 1 (Resources) 28 hpc testjob root R 0:39 1 master 29 hpc testjob root R 0:38 1 master [root@master ~]# scontrol hold 28 [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root PD 0:00 1 (Resources) 29 hpc testjob root R 1:04 1 master 28 hpc testjob root R 1:05 1 master [root@master ~]# scontrol hold 28 [root@master ~]# scontrol hold 28 [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root PD 0:00 1 (Resources) 29 hpc testjob root R 1:14 1 master 28 hpc testjob root R 1:15 1 master [root@master ~]# scontrol suspend 28 [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 29 hpc testjob root R 1:38 1 master 30 hpc testjob root R 0:01 1 master 28 hpc testjob root S 1:37 1 master [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 29 hpc testjob root R 1:59 1 master 30 hpc testjob root R 0:22 1 master 28 hpc testjob root S 1:37 1 master [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 
hpc testjob root R 0:41 1 master 28 hpc testjob root S 1:37 1 master [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root R 0:55 1 master 28 hpc testjob root S 1:37 1 master [root@master ~]# scontrol release 28 [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root R 1:20 1 master 28 hpc testjob root S 1:37 1 master [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root R 1:22 1 master 28 hpc testjob root S 1:37 1 master [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root R 1:23 1 master 28 hpc testjob root S 1:37 1 master [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root R 1:25 1 master 28 hpc testjob root S 1:37 1 master [root@master ~]# scontrol resume 28 [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root R 1:40 1 master [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 30 hpc testjob root R 2:00 1 master [root@master ~]# squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) [root@master ~]# -- *Regards* *Zain*