I modified my slurm.conf like this:
 
NodeName=GO[1-5]
 
PartitionName=party Default=yes Nodes=GO[1-5]
 
and I restarted slurmctld and slurmd services.
 
[root@GO1]~# systemctl start slurmctld 
[root@GO1]~# systemctl status slurmctld
● slurmctld.service - Slurm controller daemon
   Loaded: loaded (/etc/systemd/system/slurmctld.service; enabled; vendor 
preset: disabled)
   Active: failed (Result: exit-code) since 금 2017-07-28 13:26:27 KST; 1s ago
  Process: 19583 ExecStart=/usr/sbin/slurmctld (code=exited, status=0/SUCCESS)
 Main PID: 19586 (code=exited, status=1/FAILURE)
 
 7월 28 13:26:27 GO1 systemd[1]: Starting Slurm controller daemon...
 7월 28 13:26:27 GO1 systemd[1]: PID file /var/run/slurmd/slurmctld.pid not 
readable (yet?) after start.
 7월 28 13:26:27 GO1 systemd[1]: Started Slurm controller daemon.
 7월 28 13:26:27 GO1 slurmctld[19586]: fatal: Frontend not configured correctly 
in slurm.conf.  See man slurm.conf look for frontendname.
 7월 28 13:26:27 GO1 systemd[1]: slurmctld.service: main process exited, 
code=exited, status=1/FAILURE
 7월 28 13:26:27 GO1 systemd[1]: Unit slurmctld.service entered failed state.
 7월 28 13:26:27 GO1 systemd[1]: slurmctld.service failed.
 
[root@GO1]~# systemctl restart slurmd
Job for slurmd.service failed because the control process exited with error 
code. See "systemctl status slurmd.service" and "journalctl -xe" for details.
[root@GO1]~# systemctl status slurmd
● slurmd.service - Slurm Node daemon
   Loaded: loaded (/etc/systemd/system/slurmd.service; enabled; vendor preset: 
disabled)
   Active: failed (Result: exit-code) since 금 2017-07-28 13:27:47 KST; 7s ago
  Process: 19922 ExecStart=/usr/sbin/slurmd (code=exited, status=1/FAILURE)
 Main PID: 24228 (code=exited, status=0/SUCCESS)
 
 7월 28 13:27:47 GO1 systemd[1]: Starting Slurm Node daemon...
 7월 28 13:27:47 GO1 systemd[1]: slurmd.service: control process exited, 
code=exited status=1
 7월 28 13:27:47 GO1 systemd[1]: Failed to start Slurm Node daemon.
 7월 28 13:27:47 GO1 systemd[1]: Unit slurmd.service entered failed state.
 7월 28 13:27:47 GO1 systemd[1]: slurmd.service failed.

[root@GO1]~# /usr/sbin/slurmd
slurmd: fatal: Frontend not configured correctly 
in slurm.conf.  See man slurm.conf look for frontendname.

 
-----Original Message-----
From: "Gilles Gouaillardet"<[email protected]> 
To: "slurm-dev"<[email protected]>; 
Cc: 
Sent: 2017-07-28 (금) 11:32:26
Subject: [slurm-dev] Re: Why my slurm is running on only one node?
 

what if you use this in your slurm.conf instead ?


# COMPUTE NODES
NodeName=GO[1-5]


# PARTITIONS
PartitionName=party Default=yes Nodes=GO[1-5]


On 7/28/2017 9:28 AM, 허웅 wrote:
> Why my slurm is running on only one node? I 
> have 5 nodes, including the control node.
>
> and my nodes are looking like this
>
> Control Node : GO1
> Compute Nodes : GO[1-5]
>
> when I try to allocate a job to multiple nodes, only one node 
> works.
>
> example]
>
> $ srun -N5 hostname
> GO1
> GO1
> GO1
> GO1
> GO1
>
> even I expected like this
>
> $ srun -N5 hostname
> GO1
> GO2
> GO3
> GO4
> GO5
>
> What should I do?
>
> there are some my configures.
>
> $ scontrol show frontend
> FrontendName=GO1 State=IDLE Version=17.02 Reason=(null)
> BootTime=2017-06-02T20:14:39 SlurmdStartTime=2017-07-27T16:29:46
>
> FrontendName=GO2 State=IDLE Version=17.02 Reason=(null)
> BootTime=2017-07-05T17:54:13 SlurmdStartTime=2017-07-27T16:30:07
>
> FrontendName=GO3 State=IDLE Version=17.02 Reason=(null)
> BootTime=2017-07-05T17:22:58 SlurmdStartTime=2017-07-27T16:30:08
>
> FrontendName=GO4 State=IDLE Version=17.02 Reason=(null)
> BootTime=2017-07-05T17:21:40 SlurmdStartTime=2017-07-27T16:30:08
>
> FrontendName=GO5 State=IDLE Version=17.02 Reason=(null)
> BootTime=2017-07-05T17:21:39 SlurmdStartTime=2017-07-27T16:30:09
>
> $ scontrol ping
> Slurmctld(primary/backup) at GO1/(NULL) are UP/DOWN
>
> [slurm.conf]
> # slurm.conf
> #
> # See the slurm.conf man page for more information.
> #
> ClusterName=linux
> ControlMachine=GO1
> ControlAddr=192.168.30.74
> #
> SlurmUser=slurm
> SlurmctldPort=6817
> SlurmdPort=6818
> AuthType=auth/munge
> StateSaveLocation=/var/lib/slurmd
> SlurmdSpoolDir=/var/spool/slurmd
> SwitchType=switch/none
> MpiDefault=none
> SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
> SlurmdPidFile=/var/run/slurmd/slurmd.pid
> ProctrackType=proctrack/pgid
> ReturnToService=0
> TreeWidth=50
> #
> # TIMERS
> SlurmctldTimeout=300
> SlurmdTimeout=300
> InactiveLimit=0
> MinJobAge=300
> KillWait=30
> Waittime=0
> #
> # SCHEDULING
> SchedulerType=sched/backfill
> FastSchedule=1
> #
> # LOGGING
> SlurmctldDebug=7
> SlurmctldLogFile=/var/log/slurmctld.log
> SlurmdDebug=7
> SlurmdLogFile=/var/log/slurmd.log
> JobCompType=jobcomp/none
> #
> # COMPUTE NODES
> NodeName=sgo[1-5] NodeHostName=GO[1-5] 
> #NodeAddr=192.168.30.[74,141,68,70,72]
> #
> # PARTITIONS
> PartitionName=party Default=yes Nodes=ALL
>

Reply via email to