Hi All,
I am trying to hold a job with scontrol, but I am not able to hold it.
I do not understand why. Can anyone please explain the concepts of hold
and release, and of suspend and resume?

Please find below the steps that I have tried.

[root@master ~]# cat test.sh
#!/bin/bash
# Slurm batch test job: prints the execution host and the current time,
# then sleeps for two minutes.

# Resource request: one node (-N), one task (-n), partition "hpc" (-p),
# and a wall-clock limit of one hour (-t).
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -p hpc
#SBATCH -t 01:00:00
# Job name as displayed in the NAME column of squeue.
#SBATCH -J testjob
# Standard output and standard error files; %j is replaced by the job ID.
#SBATCH -o testjob.o%j
#SBATCH -e testjob.e%j

# Change to the directory from which sbatch was invoked.
# NOTE(review): the expansion is unquoted and the result of cd is not
# checked, so the commands below still run if the cd fails.
cd $SLURM_SUBMIT_DIR
# Record which host ran the job and when it started.
/bin/hostname
date
# Keep the job running for 120 seconds (presumably to leave time to try
# scontrol/squeue commands against it).
sleep 120

[root@master ~]# sbatch test.sh
Submitted batch job 28
[root@master ~]# sbatch test.sh
Submitted batch job 29
[root@master ~]# sbatch test.sh
Submitted batch job 30
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root PD       0:00      1
(Resources)
                28       hpc  testjob     root  R       0:06      1 master
                29       hpc  testjob     root  R       0:05      1 master
[root@master ~]# sinfo -Nl
Sun May 23 11:16:55 2021
NODELIST   NODES PARTITION       STATE CPUS    S:C:T MEMORY TMP_DISK WEIGHT
AVAIL_FE REASON
master         1      hpc*   allocated 2       2:1:1   1024        0      1
  (null) none
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root PD       0:00      1
(Resources)
                28       hpc  testjob     root  R       0:39      1 master
                29       hpc  testjob     root  R       0:38      1 master
[root@master ~]# scontrol hold 28
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root PD       0:00      1
(Resources)
                29       hpc  testjob     root  R       1:04      1 master
                28       hpc  testjob     root  R       1:05      1 master
[root@master ~]# scontrol hold 28
[root@master ~]# scontrol hold 28
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root PD       0:00      1
(Resources)
                29       hpc  testjob     root  R       1:14      1 master
                28       hpc  testjob     root  R       1:15      1 master
[root@master ~]# scontrol suspend 28
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                29       hpc  testjob     root  R       1:38      1 master
                30       hpc  testjob     root  R       0:01      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                29       hpc  testjob     root  R       1:59      1 master
                30       hpc  testjob     root  R       0:22      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       0:41      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       0:55      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root@master ~]# scontrol release 28
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       1:20      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       1:22      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       1:23      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       1:25      1 master
                28       hpc  testjob     root  S       1:37      1 master
[root@master ~]# scontrol resume 28
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       1:40      1 master
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
                30       hpc  testjob     root  R       2:00      1 master
[root@master ~]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES
NODELIST(REASON)
[root@master ~]#
-- 
*Regards*

*Zain*

Reply via email to