Hi Chris,

I changed the initial state a bit (the number of cores per node was misconfigured):
https://raw.githubusercontent.com/psteinb/docker-centos7-slurm/18.08.5-with-gres/slurm.conf
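For reference, the relevant lines should now look roughly like this. This is a sketch reconstructed from the scontrol output further down, not copied verbatim from that file:

# slurm.conf (sketch, values taken from 'scontrol show node' below)
GresTypes=gpu
NodeName=g1 NodeAddr=127.0.0.1 Sockets=1 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=4000 Gres=gpu:titanxp:2
# ... g2 and g3 defined the same way ...
PartitionName=gpu Nodes=g1,g2,g3 Default=YES MaxTime=5-00:00:00 State=UP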

But that doesn't change things. Initially, I see this:

# sinfo -N -l
Wed Mar 20 09:03:26 2019
NODELIST   NODES PARTITION       STATE CPUS    S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON
g1             1      gpu*     unknown    4    1:4:1   1000        0      1   (null) none
g2             1      gpu*     unknown    4    1:4:1   1000        0      1   (null) none
g3             1      gpu*     unknown    4    1:4:1   1000        0      1   (null) none

[root@ernie /]# sbatch --wrap="sleep 600 && env" -o non-gres.log -c 3
Submitted batch job 11
[root@ernie /]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
                11       gpu     wrap     root  R       0:03      1 g1
[root@ernie /]# scontrol show job -dd 11
JobId=11 JobName=wrap
   UserId=root(0) GroupId=root(0) MCS_label=N/A
   Priority=4294901750 Nice=0 Account=(null) QOS=normal
   JobState=RUNNING Reason=None Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   DerivedExitCode=0:0
   RunTime=00:00:12 TimeLimit=5-00:00:00 TimeMin=N/A
   SubmitTime=2019-03-20T09:12:45 EligibleTime=2019-03-20T09:12:45
   AccrueTime=Unknown
   StartTime=2019-03-20T09:12:47 EndTime=2019-03-25T09:12:47 Deadline=N/A
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   LastSchedEval=2019-03-20T09:12:47
   Partition=gpu AllocNode:Sid=ernie:1
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=g1
   BatchHost=localhost
   NumNodes=1 NumCPUs=3 NumTasks=1 CPUs/Task=3 ReqB:S:C:T=0:0:*:*
   TRES=cpu=3,mem=4000M,node=1,billing=3
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
     Nodes=g1 CPU_IDs=0-2 Mem=4000 GRES_IDX=
   MinCPUsNode=3 MinMemoryNode=4000M MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=(null)
   WorkDir=/
   StdErr=//non-gres.log
   StdIn=/dev/null
   StdOut=//non-gres.log
   Power=

[root@ernie /]# scontrol show node -dd g1
NodeName=g1 CoresPerSocket=4
   CPUAlloc=3 CPUTot=4 CPULoad=N/A
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:titanxp:2
   GresDrain=N/A
   GresUsed=gpu:titanxp:0(IDX:N/A)
   NodeAddr=127.0.0.1 NodeHostName=localhost Port=0
   RealMemory=4000 AllocMem=4000 FreeMem=N/A Sockets=1 Boards=1
   State=MIXED ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=gpu
   BootTime=2019-03-18T10:14:18 SlurmdStartTime=2019-03-20T09:07:45
   CfgTRES=cpu=4,mem=4000M,billing=4
   AllocTRES=cpu=3,mem=4000M
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

I now filled the 'cluster' with non-gres jobs (see the sketch below) and then submitted a GPU job:
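(For completeness: judging from the squeue output below, the fill was presumably just two more submissions like job 11 above, ending up as jobs 12 and 13 on g2 and g3, e.g.:

[root@ernie /]# sbatch --wrap="sleep 600 && env" -o non-gres.log -c 3   # -> job 12 on g2
[root@ernie /]# sbatch --wrap="sleep 600 && env" -o non-gres.log -c 3   # -> job 13 on g3
)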

[root@ernie /]# sbatch --wrap="sleep 6 && env" -o gres.log -c 1 --gres=gpu:1 --mem=100
Submitted batch job 15

[root@ernie /]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
                15       gpu     wrap     root PD       0:00      1 (Resources)
                11       gpu     wrap     root  R       2:02      1 g1
                12       gpu     wrap     root  R       1:01      1 g2
                13       gpu     wrap     root  R       0:58      1 g3


[root@ernie /]# scontrol show job -dd 15
JobId=15 JobName=wrap
   UserId=root(0) GroupId=root(0) MCS_label=N/A
   Priority=4294901746 Nice=0 Account=(null) QOS=normal
   JobState=PENDING Reason=Resources Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   DerivedExitCode=0:0
   RunTime=00:00:00 TimeLimit=5-00:00:00 TimeMin=N/A
   SubmitTime=2019-03-20T09:14:45 EligibleTime=2019-03-20T09:14:45
   AccrueTime=Unknown
   StartTime=Unknown EndTime=Unknown Deadline=N/A
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   LastSchedEval=2019-03-20T09:14:47
   Partition=gpu AllocNode:Sid=ernie:1
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=(null)
   NumNodes=1 NumCPUs=1 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   TRES=cpu=1,mem=100M,node=1,billing=1
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=100M MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=(null)
   WorkDir=/
   StdErr=//gres.log
   StdIn=/dev/null
   StdOut=//gres.log
   Power=
   TresPerNode=gpu:1

I am curious what you think!
Peter

PS. I submitted another single-core job; this one doesn't get in either, now due to Priority:

[root@ernie /]# sbatch --wrap="sleep 100 && env" -o singlecpu.log -c 1 -J single-cpu --mem=100
Submitted batch job 16
[root@ernie /]# squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
                15       gpu     wrap     root PD       0:00      1 (Resources)
                16       gpu single-c     root PD       0:00      1 (Priority)
                11       gpu     wrap     root  R       5:01      1 g1
                12       gpu     wrap     root  R       4:00      1 g2
                13       gpu     wrap     root  R       3:57      1 g3
[root@ernie /]# scontrol show job -dd 16
JobId=16 JobName=single-cpu
   UserId=root(0) GroupId=root(0) MCS_label=N/A
   Priority=4294901745 Nice=0 Account=(null) QOS=normal
   JobState=PENDING Reason=Priority Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   DerivedExitCode=0:0
   RunTime=00:00:00 TimeLimit=5-00:00:00 TimeMin=N/A
   SubmitTime=2019-03-20T09:17:32 EligibleTime=2019-03-20T09:17:32
   AccrueTime=Unknown
   StartTime=2019-03-25T09:12:47 EndTime=2019-03-30T09:12:47 Deadline=N/A
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   LastSchedEval=2019-03-20T09:18:44
   Partition=gpu AllocNode:Sid=ernie:1
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=(null)
   NumNodes=1 NumCPUs=1 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   TRES=cpu=1,mem=100M,node=1,billing=1
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=100M MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=(null)
   WorkDir=/
   StdErr=//singlecpu.log
   StdIn=/dev/null
   StdOut=//singlecpu.log
   Power=
