Hi Chris,

I changed the initial state a bit (the number of cores per node was misconfigured):
https://raw.githubusercontent.com/psteinb/docker-centos7-slurm/18.08.5-with-gres/slurm.conf
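(For reference, a node definition consistent with the scontrol output below would look something like this. This is a sketch reconstructed from the 'scontrol show node' output further down, not copied from the linked file; the linked file is authoritative:)

# slurm.conf sketch -- values taken from 'scontrol show node -dd g1' below
NodeName=g[1-3] NodeAddr=127.0.0.1 CPUs=4 Sockets=1 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=4000 Gres=gpu:titanxp:2
PartitionName=gpu Nodes=g[1-3] Default=YES State=UP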
But that doesn't change things. Initially, I see this:

# sinfo -N -l
Wed Mar 20 09:03:26 2019
NODELIST   NODES PARTITION       STATE CPUS    S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON
g1             1      gpu*     unknown    4    1:4:1   1000        0      1   (null) none
g2             1      gpu*     unknown    4    1:4:1   1000        0      1   (null) none
g3             1      gpu*     unknown    4    1:4:1   1000        0      1   (null) none
[root@ernie /]# sbatch --wrap="sleep 600 && env" -o non-gres.log -c 3
Submitted batch job 11
[root@ernie /]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
                11       gpu     wrap     root  R       0:03      1 g1
[root@ernie /]# scontrol show job -dd 11
JobId=11 JobName=wrap
   UserId=root(0) GroupId=root(0) MCS_label=N/A
   Priority=4294901750 Nice=0 Account=(null) QOS=normal
   JobState=RUNNING Reason=None Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   DerivedExitCode=0:0
   RunTime=00:00:12 TimeLimit=5-00:00:00 TimeMin=N/A
   SubmitTime=2019-03-20T09:12:45 EligibleTime=2019-03-20T09:12:45
   AccrueTime=Unknown
   StartTime=2019-03-20T09:12:47 EndTime=2019-03-25T09:12:47 Deadline=N/A
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   LastSchedEval=2019-03-20T09:12:47
   Partition=gpu AllocNode:Sid=ernie:1
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=g1 BatchHost=localhost
   NumNodes=1 NumCPUs=3 NumTasks=1 CPUs/Task=3 ReqB:S:C:T=0:0:*:*
   TRES=cpu=3,mem=4000M,node=1,billing=3
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   Nodes=g1 CPU_IDs=0-2 Mem=4000 GRES_IDX=
   MinCPUsNode=3 MinMemoryNode=4000M MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=(null)
   WorkDir=/
   StdErr=//non-gres.log
   StdIn=/dev/null
   StdOut=//non-gres.log
   Power=

[root@ernie /]# scontrol show node -dd g1
NodeName=g1 CoresPerSocket=4
   CPUAlloc=3 CPUTot=4 CPULoad=N/A
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:titanxp:2
   GresDrain=N/A
   GresUsed=gpu:titanxp:0(IDX:N/A)
   NodeAddr=127.0.0.1 NodeHostName=localhost Port=0
   RealMemory=4000 AllocMem=4000 FreeMem=N/A Sockets=1 Boards=1
   State=MIXED ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=gpu
   BootTime=2019-03-18T10:14:18 SlurmdStartTime=2019-03-20T09:07:45
   CfgTRES=cpu=4,mem=4000M,billing=4
   AllocTRES=cpu=3,mem=4000M
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

I now filled the 'cluster' with non-gres jobs.
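(That is, two more copies of the same three-core filler job; the output filenames here are illustrative, only the resulting job IDs 12 and 13 in the squeue output below are certain:)

sbatch --wrap="sleep 600 && env" -o non-gres-2.log -c 3   # became job 12, runs on g2
sbatch --wrap="sleep 600 && env" -o non-gres-3.log -c 3   # became job 13, runs on g3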
Then I submitted a GPU job:

[root@ernie /]# sbatch --wrap="sleep 6 && env" -o gres.log -c 1 --gres=gpu:1 --mem=100
Submitted batch job 15
[root@ernie /]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
                15       gpu     wrap     root PD       0:00      1 (Resources)
                11       gpu     wrap     root  R       2:02      1 g1
                12       gpu     wrap     root  R       1:01      1 g2
                13       gpu     wrap     root  R       0:58      1 g3
[root@ernie /]# scontrol show job -dd 15
JobId=15 JobName=wrap
   UserId=root(0) GroupId=root(0) MCS_label=N/A
   Priority=4294901746 Nice=0 Account=(null) QOS=normal
   JobState=PENDING Reason=Resources Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   DerivedExitCode=0:0
   RunTime=00:00:00 TimeLimit=5-00:00:00 TimeMin=N/A
   SubmitTime=2019-03-20T09:14:45 EligibleTime=2019-03-20T09:14:45
   AccrueTime=Unknown
   StartTime=Unknown EndTime=Unknown Deadline=N/A
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   LastSchedEval=2019-03-20T09:14:47
   Partition=gpu AllocNode:Sid=ernie:1
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=(null)
   NumNodes=1 NumCPUs=1 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   TRES=cpu=1,mem=100M,node=1,billing=1
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=100M MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=(null)
   WorkDir=/
   StdErr=//gres.log
   StdIn=/dev/null
   StdOut=//gres.log
   Power=
   TresPerNode=gpu:1

I am curious what you think!
Peter

PS. I submitted another single-core job; this one doesn't get in either, now because of priority:

[root@ernie /]# sbatch --wrap="sleep 100 && env" -o singlecpu.log -c 1 -J single-cpu --mem=100
Submitted batch job 16
[root@ernie /]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
                15       gpu     wrap     root PD       0:00      1 (Resources)
                16       gpu single-c     root PD       0:00      1 (Priority)
                11       gpu     wrap     root  R       5:01      1 g1
                12       gpu     wrap     root  R       4:00      1 g2
                13       gpu     wrap     root  R       3:57      1 g3
[root@ernie /]# scontrol show job -dd 16
JobId=16 JobName=single-cpu
   UserId=root(0) GroupId=root(0) MCS_label=N/A
   Priority=4294901745 Nice=0 Account=(null) QOS=normal
   JobState=PENDING Reason=Priority Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   DerivedExitCode=0:0
   RunTime=00:00:00 TimeLimit=5-00:00:00 TimeMin=N/A
   SubmitTime=2019-03-20T09:17:32 EligibleTime=2019-03-20T09:17:32
   AccrueTime=Unknown
   StartTime=2019-03-25T09:12:47 EndTime=2019-03-30T09:12:47 Deadline=N/A
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   LastSchedEval=2019-03-20T09:18:44
   Partition=gpu AllocNode:Sid=ernie:1
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=(null)
   NumNodes=1 NumCPUs=1 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   TRES=cpu=1,mem=100M,node=1,billing=1
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=100M MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=(null)
   WorkDir=/
   StdErr=//singlecpu.log
   StdIn=/dev/null
   StdOut=//singlecpu.log
   Power=
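(In case it helps with reproducing this: the GPU submission above, expressed as a regular batch script instead of --wrap, would be something like the following sketch; the script name is arbitrary:)

#!/bin/bash
#SBATCH -c 1
#SBATCH --gres=gpu:1
#SBATCH --mem=100
#SBATCH -o gres.log
# same payload as the --wrap variant above
sleep 6 && env

# submitted with: sbatch gres-job.sh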