Hey SLURM user group,

We're seeing intermittent timeouts contacting the controller (slurmctld) when 
submitting with sbatch in a non-interactive / non-blocking fashion, even with 
MessageTimeout increased to 60s. Our slurmctld does not have a slurmdbd 
configured, CPU load and memory usage on the controller are barely registering, 
and we're not seeing any latency in storage I/O on that host. This particular 
cluster has fewer than 50 nodes, and we are not submitting more than tens of 
jobs per second.
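
To be concrete about the submission pattern, a stripped-down way to reproduce 
what we mean is something like the loop below (the --wrap payload and the count 
are just stand-ins for our real job scripts):

# for i in $(seq 1 20); do time sbatch --wrap "sleep 60" >/dev/null; done

Most submissions return right away, but every so often one hangs and eventually 
fails with the timeout contacting slurmctld.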

Any ideas on how we can troubleshoot this issue?
We have done packet captures on both the controller and the submission nodes, 
and see no obvious issues with the network.
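
For reference, the captures were along these lines (interface name and output 
file are just illustrative; 6817 is the SlurmctldPort from the config below):

# tcpdump -i eth0 -w sbatch-timeout.pcap port 6817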

What do you have your MessageTimeout set to?
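For comparison, we're asking about the value reported by:

# scontrol show config | grep MessageTimeout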

# sinfo --version
slurm 18.08.9

# scontrol show config

Configuration data as of 2023-11-17T10:14:25

AccountingStorageBackupHost = (null)

AccountingStorageEnforce = none

AccountingStorageHost   = localhost

AccountingStorageLoc    = <<REDACTED>>/accounting.log

AccountingStoragePort   = 0

AccountingStorageTRES   = cpu,mem,energy,node,billing,fs/disk,vmem,pages

AccountingStorageType   = accounting_storage/filetxt

AccountingStorageUser   = root

AccountingStoreJobComment = Yes

AcctGatherEnergyType    = acct_gather_energy/none

AcctGatherFilesystemType = acct_gather_filesystem/none

AcctGatherInterconnectType = acct_gather_interconnect/none

AcctGatherNodeFreq      = 10 sec

AcctGatherProfileType   = acct_gather_profile/none

AllowSpecResourcesUsage = 0

AuthInfo                = (null)

AuthType                = auth/none

BatchStartTimeout       = 240 sec

BOOT_TIME               = 2023-11-15T14:24:09

BurstBufferType         = (null)

CheckpointType          = checkpoint/none

ClusterName             = <<REDACTED>>/

CommunicationParameters = (null)

CompleteWait            = 0 sec

CoreSpecPlugin          = core_spec/none

CpuFreqDef              = Unknown

CpuFreqGovernors        = Performance,OnDemand,UserSpace

CryptoType              = crypto/openssl

DebugFlags              = (null)

DefMemPerCPU            = 1024

DisableRootJobs         = No

EioTimeout              = 60

EnforcePartLimits       = NO

Epilog                  = (null)

EpilogMsgTime           = 2000 usec

EpilogSlurmctld         = (null)

ExtSensorsType          = ext_sensors/none

ExtSensorsFreq          = 0 sec

FairShareDampeningFactor = 1

FastSchedule            = 1

FederationParameters    = (null)

FirstJobId              = 1

GetEnvTimeout           = 2 sec

GresTypes               = (null)

GroupUpdateForce        = 1

GroupUpdateTime         = 600 sec

HASH_VAL                = Match

HealthCheckInterval     = 60 sec

HealthCheckNodeState    = ANY

HealthCheckProgram      = /bin/ls

InactiveLimit           = 3600 sec

JobAcctGatherFrequency  = task=5

JobAcctGatherType       = jobacct_gather/none

JobAcctGatherParams     = NoOverMemoryKill

JobCheckpointDir        = <<REDACTED>>/checkpoint

JobCompHost             = localhost

JobCompLoc              = <<REDACTED>>/job_completions.log

JobCompPort             = 0

JobCompType             = jobcomp/filetxt

JobCompUser             = root

JobContainerType        = job_container/none

JobCredentialPrivateKey = /etc/slurm/pki/slurm.key

JobCredentialPublicCertificate = /etc/slurm/pki/slurm.crt

JobDefaults             = (null)

JobFileAppend           = 1

JobRequeue              = 1

JobSubmitPlugins        = (null)

KeepAliveTime           = SYSTEM_DEFAULT

KillOnBadExit           = 1

KillWait                = 60 sec

LaunchParameters        = slurmstepd_memlock

LaunchType              = launch/slurm

Layouts                 =

Licenses                = <<REDACTED>>

LicensesUsed            = <<REDACTED>>

LogTimeFormat           = iso8601_ms

MailDomain              = <<REDACTED>>

MailProg                = /bin/mail

MaxArraySize            = 1001

MaxJobCount             = 10000

MaxJobId                = 67043328

MaxMemPerNode           = UNLIMITED

MaxStepCount            = 40000

MaxTasksPerNode         = 512

MCSPlugin               = mcs/none

MCSParameters           = (null)

MemLimitEnforce         = No

MessageTimeout          = 60 sec

MinJobAge               = 600 sec

MpiDefault              = none

MpiParams               = (null)

MsgAggregationParams    = (null)

NEXT_JOB_ID             = 7262059

NodeFeaturesPlugins     = (null)

OverTimeLimit           = 10 min

PluginDir               = <<REDACTED>>/slurm

PlugStackConfig         = /etc/slurm/plugstack.conf

PowerParameters         = (null)

PowerPlugin             =

PreemptMode             = OFF

PreemptType             = preempt/none

PriorityParameters      = (null)

PriorityDecayHalfLife   = 7-00:00:00

PriorityCalcPeriod      = 00:05:00

PriorityFavorSmall      = Yes

PriorityFlags           =

PriorityMaxAge          = 1-00:00:00

PriorityUsageResetPeriod = NONE

PriorityType            = priority/multifactor

PriorityWeightAge       = 1000

PriorityWeightFairShare = 0

PriorityWeightJobSize   = 500

PriorityWeightPartition = 1000

PriorityWeightQOS       = 0

PriorityWeightTRES      = (null)

PrivateData             = none

ProctrackType           = proctrack/linuxproc

Prolog                  = (null)

PrologEpilogTimeout     = 2700

PrologSlurmctld         = (null)

PrologFlags             = (null)

PropagatePrioProcess    = 0

PropagateResourceLimits = NONE

PropagateResourceLimitsExcept = (null)

RebootProgram           = (null)

ReconfigFlags           = (null)

RequeueExit             = (null)

RequeueExitHold         = (null)

ResumeFailProgram       = (null)

ResumeProgram           = (null)

ResumeRate              = 300 nodes/min

ResumeTimeout           = 60 sec

ResvEpilog              = (null)

ResvOverRun             = 0 min

ResvProlog              = (null)

ReturnToService         = 2

RoutePlugin             = route/default

SallocDefaultCommand    = (null)

SbcastParameters        = (null)

SchedulerParameters     = nohold_on_prolog_fail

SchedulerTimeSlice      = 30 sec

SchedulerType           = sched/backfill

SelectType              = select/cons_res

SelectTypeParameters    = CR_CPU_MEMORY,CR_LLN

SlurmUser               = slurm(64030)

SlurmctldAddr           = (null)

SlurmctldDebug          = info

SlurmctldHost[0]        = <<REDACTED>>

SlurmctldLogFile        = <<REDACTED>>/slurmctld.log

SlurmctldPort           = 6817

SlurmctldSyslogDebug    = debug2

SlurmctldPrimaryOffProg = (null)

SlurmctldPrimaryOnProg  = (null)

SlurmctldTimeout        = 240 sec

SlurmctldParameters     = (null)

SlurmdDebug             = info

SlurmdLogFile           = /var/log/slurm/slurmd.log

SlurmdParameters        = (null)

SlurmdPidFile           = /run/slurmd.pid

SlurmdPort              = 6818

SlurmdSpoolDir          = /var/lib/slurm/slurmd

SlurmdSyslogDebug       = error

SlurmdTimeout           = 300 sec

SlurmdUser              = root(0)

SlurmSchedLogFile       = <<REDACTED>>/slurmsched.log

SlurmSchedLogLevel      = 3

SlurmctldPidFile        = /run/slurmctld.pid

SlurmctldPlugstack      = (null)

SLURM_CONF              = /etc/slurm/slurm.conf

SLURM_VERSION           = 18.08.9

SrunEpilog              = (null)

SrunPortRange           = 0-0

SrunProlog              = (null)

StateSaveLocation       = <<REDACTED>>/slurmctld

SuspendExcNodes         = (null)

SuspendExcParts         = (null)

SuspendProgram          = (null)

SuspendRate             = 60 nodes/min

SuspendTime             = NONE

SuspendTimeout          = 30 sec

SwitchType              = switch/none

TaskEpilog              = (null)

TaskPlugin              = task/none

TaskPluginParam         = (null type)

TaskProlog              = (null)

TCPTimeout              = 60 sec

TmpFS                   = /tmp

TopologyParam           = (null)

TopologyPlugin          = topology/none

TrackWCKey              = No

TreeWidth               = 50

UsePam                  = 0

UnkillableStepProgram   = (null)

UnkillableStepTimeout   = 240 sec

VSizeFactor             = 0 percent

WaitTime                = 300 sec

X11Parameters           = (null)



Slurmctld(primary) at <<REDACTED>> is UP


Anthony Altemara
IT Infrastructure Associate Director
Office: +1 919.491.2220
anthony.altem...@q2labsolutions.com  |  www.Q2LabSolutions.com


