Hi all,
We observe a lot of jobs that remain in the completing state until we kill the sleep process inside the step_extern cgroup. In these cases, what we see on the involved nodes is a defunct slurmd: [root@r113c18s01 ~]# ps --forest -lfe | egrep '[s]leep|[s]lurm' 1 S root 26867 1 0 80 0 - 891256 inet_c Jan23 ? 00:03:48 /usr/sbin/slurmd 1 Z root 25518 26867 0 80 0 - 0 exit 12:59 ? 00:00:00 \_ [slurmd] <defunct> 0 S root 25525 1 0 80 0 - 26974 hrtime 12:59 ? 00:00:00 sleep 1000000 [root@r113c18s01 ~]# cat /sys/fs/cgroup/cpuset/slurm/uid_29592/job_62379/step_extern/tasks 25525 We see from the UNIX accounting logs that the step_extern slurmstepd died immediately: [root@r113c18s01 ~]# lastcomm --command slurmstepd | grep D slurmstepd DX root __ 0.89 secs Tue Jan 30 12:59 [root@r113c18s01 ~]# dump-acct /var/account/pacct | grep 'Tue Jan 30 12:59' | grep slurm slurmd |v3| 0.00| 0.00| 0.00| 0| 0|3565056.00| 0.00| 25518 26867|Tue Jan 30 12:59:48 2018 slurmstepd |v3| 31.00| 58.00| 93.00| 0| 0|199680.00| 0.00| 25519 1|Tue Jan 30 12:59:49 2018 So both the sleep and slurmstepd processes turn out to be children of systemd (pid 1). 
Slurmd reports [root@r113c18s01 ~]# journalctl -u slurmd | grep 62379 Jan 30 12:59:48 r113c18s01 slurmd[26867]: task_p_slurmd_batch_request: 62379 Jan 30 12:59:48 r113c18s01 slurmd[26867]: task/affinity: job 62379 CPU input mask for node: 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF Jan 30 12:59:48 r113c18s01 slurmd[26867]: task/affinity: job 62379 CPU final HW mask for node: 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF Jan 30 12:59:48 r113c18s01 slurmd[26867]: debug: Waiting for job 62379's prolog to complete Jan 30 12:59:48 r113c18s01 slurmd[26867]: debug: [job 62379] attempting to run prolog [/etc/slurm/prolog.d/create_local_tmpdir.sh] Jan 30 12:59:48 r113c18s01 slurmd[26867]: _run_prolog: prolog with lock for job 62379 ran for 0 seconds Jan 30 12:59:49 r113c18s01 slurmd[26867]: debug: _step_connect: connect() failed dir /var/spool/slurmd node r113c18s01 step 62379.4294967295 Connection refused Jan 30 15:11:24 r113c18s01 slurmd[26867]: debug: _step_connect: connect() failed dir /var/spool/slurmd node r113c18s01 step 62379.4294967295 Connection refused Jan 30 15:11:24 r113c18s01 slurmd[26867]: debug: Cleaned up stray socket /var/spool/slurmd/r113c18s01_62379.4294967295 Jan 30 17:00:13 r113c18s01 slurmd[26867]: Job 62379: timeout: sent SIGTERM to 0 active steps Jan 30 17:00:13 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:00:13 r113c18s01 slurmd[26867]: debug: credential for job 62379 revoked Jan 30 17:00:13 r113c18s01 slurmd[26867]: debug: Waiting for job 62379's prolog to complete Jan 30 17:04:33 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:08:39 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:12:57 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:16:55 r113c18s01 slurmd[26867]: debug: 
task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:21:02 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:25:10 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:29:11 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:33:21 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:37:24 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:41:26 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:45:28 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:49:35 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 Jan 30 17:53:37 r113c18s01 slurmd[26867]: debug: task_p_slurmd_release_resources: affinity jobid 62379 We tried to set up an UnkillableStepProgram to kill the sleep process, but the script is not invoked; we guess this is because the slurmd is defunct. Any ideas? Thanks ale -- Alessandro Federico HPC System Management Group System & Technology Department CINECA www.cineca.it Via dei Tizii 6, 00185 Rome - Italy phone: +39 06 44486708 All work and no play makes Jack a dull boy. All work and no play makes Jack a dull boy. All work and no play makes Jack...