Hi,

When I start an interactive job like this:

srun --pty --mem=3G -c2 bash

I then schedule and run other jobs, which can be interactive or non-interactive (see the illustration below). When one of these jobs, running on the same node as the interactive job, terminates, the interactive job gets killed with this message:

srun: error: node01.abc.at: task 0: Killed
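
The other jobs are nothing special; as an illustration only (the real scripts vary), they are submitted roughly like this:

sbatch --mem=4G -c2 --wrap "sleep 600"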

I have attached our slurm.conf. Does anybody have an idea what is going on here, or where I could look to debug this? I'm quite new to Slurm, so I don't know all the places to look...
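
In case it helps, these are the commands I was planning to run to gather more information. I'm assuming the slurmd messages end up in journald, since we only configure SlurmdSyslogDebug, and <jobid> below is just a placeholder for the killed interactive job:

# accounting record of the killed interactive job (state, exit code, memory high-water mark)
sacct -j <jobid> --format=JobID,JobName,State,ExitCode,Elapsed,MaxRSS

# slurmd messages on the node where the task was killed
ssh node01.abc.at journalctl -u slurmd --since "1 hour ago"

# check whether the kernel OOM killer was involved
ssh node01.abc.at dmesg -T | grep -iE "oom|killed process"

# relevant settings as slurmctld actually sees them
scontrol show config | grep -iE "ProctrackType|TaskPlugin|KillOnBadExit|JobAcctGather"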

Thanks a lot in advance!

Thomas
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=openhpc
SlurmctldHost=abc.at
#DisableRootJobs=NO
#EnforcePartLimits=NO
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm # NB: not OpenHPC default!
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurm
SwitchType=switch/none
#TaskEpilog=
#TaskPlugin=task/affinity
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=300
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_Core_Memory
#
#
# JOB PRIORITY
#PriorityFlags=
PriorityType=priority/multifactor
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
PriorityWeightPartition=1000
#PriorityWeightQOS=
PreemptType=preempt/qos
PreemptMode=requeue
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
AccountingStorageHost=slurmdb.abc.at
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageUser=slurm_db
#AccountingStoreFlags=
#JobCompHost=
JobCompLoc=/var/log/slurm_jobacct.log
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup

# By default, SLURM will log to syslog, which is what we want
SlurmctldSyslogDebug=info
SlurmdSyslogDebug=info
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES - NOT SUPPORTED IN THIS APPLIANCE VERSION

# LOGIN-ONLY NODES
# Define slurmd nodes not in partitions for login-only nodes in "configless" mode:

# COMPUTE NODES
# OpenHPC default configuration
PropagateResourceLimitsExcept=MEMLOCK
Epilog=/etc/slurm/slurm.epilog.clean
        
# openhpc_slurm_partitions group: openhpc_interactive

NodeName=node01.abc.at State=UNKNOWN RealMemory=420202 Sockets=2 CoresPerSocket=30 ThreadsPerCore=1

NodeName=node02.abc.at State=UNKNOWN RealMemory=420202 Sockets=2 CoresPerSocket=30 ThreadsPerCore=1

NodeName=node03.abc.at State=UNKNOWN RealMemory=420202 Sockets=2 CoresPerSocket=30 ThreadsPerCore=1

NodeName=node04.abc.at State=UNKNOWN RealMemory=420202 Sockets=2 CoresPerSocket=30 ThreadsPerCore=1

NodeName=node05.abc.at State=UNKNOWN RealMemory=420202 Sockets=2 CoresPerSocket=30 ThreadsPerCore=1

NodeName=node06.abc.at State=UNKNOWN RealMemory=420202 Sockets=2 CoresPerSocket=30 ThreadsPerCore=1

PartitionName=interactive Default=YES MaxTime=2-08:00:00 State=UP Nodes=node01.abc.at,node02.abc.at,node03.abc.at,node04.abc.at,node05.abc.at,node06.abc.at Priority=100

# Define a non-existent node, in no partition, so that slurmctld starts even with all partitions empty
NodeName=nonesuch

SlurmctldParameters=enable_configless
ReturnToService=2
PrologFlags=contain,x11
TaskPlugin=task/cgroup,task/affinity
PriorityFavorSmall=YES
PriorityDecayHalfLife=14-0
PriorityWeightAge=1000
PriorityWeightFairshare=10000
PriorityWeightJobSize=1000
PriorityWeightQOS=1000000
