Hello all.
A user reported that a job wasn't starting, so I tried to replicate the request and I got:
-8<--
[root@ophfe1 root.old]# scontrol show job 113936
JobId=113936 JobName=test.sh
   UserId=root(0) GroupId=root(0) MCS_label=N/A
   Priority=1 Nice=0 Account=root QOS=long
   JobState=PENDING Reason=Priority Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   RunTime=00:00:00 TimeLimit=2-00:00:00 TimeMin=N/A
   SubmitTime=2024-12-06T13:19:36 EligibleTime=2024-12-06T13:19:36
   AccrueTime=2024-12-06T13:19:36
   StartTime=Unknown EndTime=Unknown Deadline=N/A
SuspendTime=None SecsPreSuspend=0 LastSchedEval=2024-12-06T13:21:32 Scheduler=Backfill:*
   Partition=m3 AllocNode:Sid=ophfe1:855189
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=
   NumNodes=1-1 NumCPUs=96 NumTasks=96 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   TRES=cpu=96,mem=95000M,node=1,billing=1296
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=95000M MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=/home/root.old/test.sh
   WorkDir=/home/root.old
   StdErr=/home/root.old/%N-%J.err
   StdIn=/dev/null
   StdOut=/home/root.old/%N-%J.out
   Power=


[root@ophfe1 root.old]# scontrol sho partition m3
PartitionName=m3
   AllowGroups=ALL DenyAccounts=formazione AllowQos=ALL
   AllocNodes=ALL Default=NO QoS=N/A
DefaultTime=NONE DisableRootJobs=NO ExclusiveUser=NO GraceTime=0 Hidden=NO MaxNodes=UNLIMITED MaxTime=UNLIMITED MinNodes=0 LLN=NO MaxCPUsPerNode=UNLIMITED
   Nodes=mtx20
PriorityJobFactor=1 PriorityTier=1 RootOnly=NO ReqResv=NO OverSubscribe=NO
   OverTimeLimit=NONE PreemptMode=CANCEL
State=UP TotalCPUs=192 TotalNodes=1 SelectTypeParameters=CR_SOCKET_MEMORY
   JobDefaults=(null)
   DefMemPerNode=UNLIMITED MaxMemPerNode=UNLIMITED
   TRES=cpu=192,mem=1150000M,node=1,billing=2592
   TRESBillingWeights=CPU=13.500,Mem=2.2378G

[root@ophfe1 root.old]# scontrol show node mtx20
NodeName=mtx20 Arch=x86_64 CoresPerSocket=24
   CPUAlloc=0 CPUEfctv=192 CPUTot=192 CPULoad=0.00
   AvailableFeatures=ib,matrix,intel,avx
   ActiveFeatures=ib,matrix,intel,avx
   Gres=(null)
   NodeAddr=mtx20 NodeHostName=mtx20 Version=22.05.6
   OS=Linux 4.18.0-372.9.1.el8.x86_64 #1 SMP Tue May 10 14:48:47 UTC 2022
   RealMemory=1150000 AllocMem=0 FreeMem=1156606 Sockets=4 Boards=1
   MemSpecLimit=2048
   State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=8 Owner=N/A MCS_label=N/A
   Partitions=m3
   BootTime=2024-12-06T10:01:42 SlurmdStartTime=2024-12-06T10:02:54
   LastBusyTime=2024-12-06T10:51:58
   CfgTRES=cpu=192,mem=1150000M,billing=2592
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 AveWatts=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

-8<--

So the node is free and the partition does not impose extra limits (it is used only for accounting factors), but the job does not start.

Any hints?

Thanks

--
Diego Zuccato
DIFA - Dip. di Fisica e Astronomia
Servizi Informatici
Alma Mater Studiorum - Università di Bologna
V.le Berti-Pichat 6/2 - 40127 Bologna - Italy
tel.: +39 051 20 95786


--
slurm-users mailing list -- slurm-users@lists.schedmd.com
To unsubscribe send an email to slurm-users-le...@lists.schedmd.com

Reply via email to