Hi Ralph and Gilles,
it's strange that the program works with "--host" and "--slot-list"
in your environment but not in mine. I get the following output if
I run the program in gdb without a breakpoint.
loki spawn 142 gdb /usr/local/openmpi-1.10.3_64_gcc/bin/mpiexec
GNU gdb (GDB; SUSE Linux Enterprise 12) 7.9.1
...
(gdb) set args -np 1 --host loki --slot-list 0:0-1,1:0-1 simple_spawn
(gdb) run
Starting program: /usr/local/openmpi-1.10.3_64_gcc/bin/mpiexec -np 1 --host
loki --slot-list 0:0-1,1:0-1 simple_spawn
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Detaching after fork from child process 18031.
[pid 18031] starting up!
0 completed MPI_Init
Parent [pid 18031] about to spawn!
Detaching after fork from child process 18033.
Detaching after fork from child process 18034.
[pid 18033] starting up!
[pid 18034] starting up!
[loki:18034] *** Process received signal ***
[loki:18034] Signal: Segmentation fault (11)
...
I get different output if I run the program in gdb with
a breakpoint.
gdb /usr/local/openmpi-1.10.3_64_gcc/bin/mpiexec
(gdb) set args -np 1 --host loki --slot-list 0:0-1,1:0-1 simple_spawn
(gdb) set follow-fork-mode child
(gdb) break ompi_proc_self
(gdb) run
(gdb) next
Repeating "next" very often results in the following output.
...
Starting program:
/home/fd1026/work/skripte/master/parallel/prog/mpi/spawn/simple_spawn
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[pid 13277] starting up!
[New Thread 0x7ffff42ef700 (LWP 13289)]
Breakpoint 1, ompi_proc_self (size=0x7fffffffc060)
at ../../openmpi-1.10.3rc3/ompi/proc/proc.c:413
413 ompi_proc_t **procs = (ompi_proc_t**) malloc(sizeof(ompi_proc_t*));
(gdb) n
414 if (NULL == procs) {
(gdb)
423 OBJ_RETAIN(ompi_proc_local_proc);
(gdb)
424 *procs = ompi_proc_local_proc;
(gdb)
425 *size = 1;
(gdb)
426 return procs;
(gdb)
427 }
(gdb)
ompi_comm_init () at ../../openmpi-1.10.3rc3/ompi/communicator/comm_init.c:138
138 group->grp_my_rank = 0;
(gdb)
139 group->grp_proc_count = (int)size;
...
193 ompi_comm_reg_init();
(gdb)
196 ompi_comm_request_init ();
(gdb)
198 return OMPI_SUCCESS;
(gdb)
199 }
(gdb)
ompi_mpi_init (argc=0, argv=0x0, requested=0, provided=0x7fffffffc21c)
at ../../openmpi-1.10.3rc3/ompi/runtime/ompi_mpi_init.c:738
738 if (OMPI_SUCCESS != (ret = ompi_file_init())) {
(gdb)
744 if (OMPI_SUCCESS != (ret = ompi_win_init())) {
(gdb)
750 if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
...
988 ompi_mpi_initialized = true;
(gdb)
991 if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) {
(gdb)
999 return MPI_SUCCESS;
(gdb)
1000 }
(gdb)
PMPI_Init (argc=0x0, argv=0x0) at pinit.c:94
94 if (MPI_SUCCESS != err) {
(gdb)
104 return MPI_SUCCESS;
(gdb)
105 }
(gdb)
0x0000000000400d0c in main ()
(gdb)
Single stepping until exit from function main,
which has no line number information.
0 completed MPI_Init
Parent [pid 13277] about to spawn!
[New process 13472]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
process 13472 is executing new program:
/usr/local/openmpi-1.10.3_64_gcc/bin/orted
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New process 13474]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
process 13474 is executing new program:
/home/fd1026/work/skripte/master/parallel/prog/mpi/spawn/simple_spawn
[pid 13475] starting up!
[pid 13476] starting up!
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[pid 13474] starting up!
[New Thread 0x7ffff491b700 (LWP 13480)]
[Switching to Thread 0x7ffff7ff1740 (LWP 13474)]
Breakpoint 1, ompi_proc_self (size=0x7fffffffba30)
at ../../openmpi-1.10.3rc3/ompi/proc/proc.c:413
413 ompi_proc_t **procs = (ompi_proc_t**) malloc(sizeof(ompi_proc_t*));
(gdb)
414 if (NULL == procs) {
...
426 return procs;
(gdb)
427 }
(gdb)
ompi_comm_init () at ../../openmpi-1.10.3rc3/ompi/communicator/comm_init.c:138
138 group->grp_my_rank = 0;
(gdb)
139 group->grp_proc_count = (int)size;
(gdb)
140 OMPI_GROUP_SET_INTRINSIC (group);
...
193 ompi_comm_reg_init();
(gdb)
196 ompi_comm_request_init ();
(gdb)
198 return OMPI_SUCCESS;
(gdb)
199 }
(gdb)
ompi_mpi_init (argc=0, argv=0x0, requested=0, provided=0x7fffffffbbec)
at ../../openmpi-1.10.3rc3/ompi/runtime/ompi_mpi_init.c:738
738 if (OMPI_SUCCESS != (ret = ompi_file_init())) {
(gdb)
744 if (OMPI_SUCCESS != (ret = ompi_win_init())) {
(gdb)
750 if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
...
863 if (OMPI_SUCCESS != (ret = ompi_pubsub_base_select())) {
(gdb)
869 if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_dpm_base_framework, 0))) {
(gdb)
873 if (OMPI_SUCCESS != (ret = ompi_dpm_base_select())) {
(gdb)
884 if ( OMPI_SUCCESS !=
(gdb)
894 if (OMPI_SUCCESS !=
(gdb)
900 if (OMPI_SUCCESS !=
(gdb)
911 if (OMPI_SUCCESS != (ret = ompi_dpm.dyn_init())) {
(gdb)
Parent done with spawn
Parent sending message to child
2 completed MPI_Init
Hello from the child 2 of 3 on host loki pid 13476
1 completed MPI_Init
Hello from the child 1 of 3 on host loki pid 13475
921 if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
(gdb)
931 opal_progress_event_users_decrement();
(gdb)
934 opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle);
(gdb)
937 if (ompi_mpi_event_tick_rate >= 0) {
(gdb)
946 if (OMPI_SUCCESS != (ret = ompi_mpiext_init())) {
(gdb)
953 if (ret != OMPI_SUCCESS) {
(gdb)
972 OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);
(gdb)
977 OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
(gdb)
978 opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);
(gdb)
980 OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
(gdb)
981 opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);
(gdb)
983 OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
(gdb)
984 opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);
(gdb)
988 ompi_mpi_initialized = true;
(gdb)
991 if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) {
(gdb)
999 return MPI_SUCCESS;
(gdb)
1000 }
(gdb)
PMPI_Init (argc=0x0, argv=0x0) at pinit.c:94
94 if (MPI_SUCCESS != err) {
(gdb)
104 return MPI_SUCCESS;
(gdb)
105 }
(gdb)
0x0000000000400d0c in main ()
(gdb)
Single stepping until exit from function main,
which has no line number information.
0 completed MPI_Init
Hello from the child 0 of 3 on host loki pid 13474
Child 2 disconnected
Child 1 disconnected
Child 0 received msg: 38
Parent disconnected
13277: exiting
Program received signal SIGTERM, Terminated.
0x0000000000400f0a in main ()
(gdb)
Single stepping until exit from function main,
which has no line number information.
[tcsetpgrp failed in terminal_inferior: No such process]
[Thread 0x7ffff491b700 (LWP 13480) exited]
Program terminated with signal SIGTERM, Terminated.
The program no longer exists.
(gdb)
The program is not being run.
(gdb)
The program is not being run.
(gdb) info break
Num     Type           Disp Enb Address            What
1       breakpoint     keep y   0x00007ffff7aa35c7 in ompi_proc_self
                                at ../../openmpi-1.10.3rc3/ompi/proc/proc.c:413 inf 8, 7, 6, 5, 4, 3, 2, 1
        breakpoint already hit 2 times
(gdb) delete 1
(gdb) r
Starting program:
/home/fd1026/work/skripte/master/parallel/prog/mpi/spawn/simple_spawn
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[pid 16708] starting up!
0 completed MPI_Init
Parent [pid 16708] about to spawn!
[New process 16720]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
process 16720 is executing new program:
/usr/local/openmpi-1.10.3_64_gcc/bin/orted
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New process 16722]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
process 16722 is executing new program:
/home/fd1026/work/skripte/master/parallel/prog/mpi/spawn/simple_spawn
[pid 16723] starting up!
[pid 16724] starting up!
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[pid 16722] starting up!
Parent done with spawn
Parent sending message to child
1 completed MPI_Init
Hello from the child 1 of 3 on host loki pid 16723
2 completed MPI_Init
Hello from the child 2 of 3 on host loki pid 16724
0 completed MPI_Init
Hello from the child 0 of 3 on host loki pid 16722
Child 0 received msg: 38
Child 0 disconnected
Parent disconnected
Child 1 disconnected
Child 2 disconnected
16708: exiting
16724: exiting
16723: exiting
[New Thread 0x7ffff491b700 (LWP 16729)]
Program received signal SIGTERM, Terminated.
[Switching to Thread 0x7ffff7ff1740 (LWP 16722)]
__GI__dl_debug_state () at dl-debug.c:74
74 dl-debug.c: No such file or directory.
(gdb)
--------------------------------------------------------------------------
WARNING: A process refused to die despite all the efforts!
This process may still be running and/or consuming resources.
Host: loki
PID: 16722
--------------------------------------------------------------------------
The following simple_spawn processes still exist afterwards.
loki spawn 171 ps -aef | grep simple_spawn
fd1026 11079 11053 0 14:00 pts/0 00:00:00
/usr/local/openmpi-1.10.3_64_gcc/bin/mpiexec -np 1 --host loki --slot-list
0:0-1,1:0-1 simple_spawn
fd1026 11095 11079 29 14:01 pts/0 00:09:37 [simple_spawn] <defunct>
fd1026 16722 1 0 14:31 ? 00:00:00 [simple_spawn] <defunct>
fd1026 17271 29963 0 14:33 pts/2 00:00:00 grep simple_spawn
loki spawn 172
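Perhaps this observation helps: the backtraces quoted below always fail at
address 0x8 inside ompi_proc_self(), and line 423 of proc.c is
"OBJ_RETAIN(ompi_proc_local_proc);". It looks as if ompi_proc_local_proc is
still NULL in the crashing child, because touching a field at offset 8 of a
NULL object pointer yields exactly this fault address. A minimal sketch (the
struct layout is hypothetical, only to illustrate the fault address):

#include <stdint.h>

/* hypothetical object header layout on a 64-bit build */
struct obj {
    void    *cls;       /* offset 0: class pointer   */
    int64_t  refcount;  /* offset 8: reference count */
};

int main(void)
{
    struct obj *p = 0;  /* stands in for a NULL ompi_proc_local_proc */
    p->refcount++;      /* segfaults with fault address 0x8          */
    return 0;
}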
Is it possible that there is a race condition? What can I do to help
find a solution for my problem?
Kind regards
Siegmar
On 24.05.2016 16:54, Ralph Castain wrote:
Works perfectly for me, so I believe this must be an environment issue - I am
using gcc 6.0.0 on CentOS7 with x86:
$ mpirun -n 1 -host bend001 --slot-list 0:0-1,1:0-1 --report-bindings
./simple_spawn
[bend001:17599] MCW rank 0 bound to socket 0[core 0[hwt 0-1]], socket 0[core
1[hwt 0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[pid 17601] starting up!
0 completed MPI_Init
Parent [pid 17601] about to spawn!
[pid 17603] starting up!
[bend001:17599] MCW rank 0 bound to socket 0[core 0[hwt 0-1]], socket 0[core
1[hwt 0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[bend001:17599] MCW rank 1 bound to socket 0[core 0[hwt 0-1]], socket 0[core
1[hwt 0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[bend001:17599] MCW rank 2 bound to socket 0[core 0[hwt 0-1]], socket 0[core
1[hwt 0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[pid 17604] starting up!
[pid 17605] starting up!
Parent done with spawn
Parent sending message to child
0 completed MPI_Init
Hello from the child 0 of 3 on host bend001 pid 17603
Child 0 received msg: 38
1 completed MPI_Init
Hello from the child 1 of 3 on host bend001 pid 17604
2 completed MPI_Init
Hello from the child 2 of 3 on host bend001 pid 17605
Child 0 disconnected
Child 2 disconnected
Parent disconnected
Child 1 disconnected
17603: exiting
17605: exiting
17601: exiting
17604: exiting
$
On May 24, 2016, at 7:18 AM, Siegmar Gross
<siegmar.gr...@informatik.hs-fulda.de> wrote:
Hi Ralph and Gilles,
the program breaks only if I combine "--host" and "--slot-list". Perhaps this
information is helpful. I am using a different machine now, so you can see that
the problem is not restricted to "loki".
pc03 spawn 115 ompi_info | grep -e "OPAL repo revision:" -e "C compiler
absolute:"
OPAL repo revision: v1.10.2-201-gd23dda8
C compiler absolute: /usr/local/gcc-6.1.0/bin/gcc
pc03 spawn 116 uname -a
Linux pc03 3.12.55-52.42-default #1 SMP Thu Mar 3 10:35:46 UTC 2016 (4354e1d)
x86_64 x86_64 x86_64 GNU/Linux
pc03 spawn 117 cat host_pc03.openmpi
pc03.informatik.hs-fulda.de slots=12 max_slots=12
pc03 spawn 118 mpicc simple_spawn.c
pc03 spawn 119 mpiexec -np 1 --report-bindings a.out
[pc03:03711] MCW rank 0 bound to socket 0[core 0[hwt 0-1]]:
[BB/../../../../..][../../../../../..]
[pid 3713] starting up!
0 completed MPI_Init
Parent [pid 3713] about to spawn!
[pc03:03711] MCW rank 0 bound to socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt
0-1]], socket 1[core 8[hwt 0-1]], socket 1[core 9[hwt 0-1]], socket 1[core
10[hwt 0-1]], socket 1[core 11[hwt 0-1]]: [../../../../../..][BB/BB/BB/BB/BB/BB]
[pc03:03711] MCW rank 1 bound to socket 0[core 0[hwt 0-1]], socket 0[core 1[hwt
0-1]], socket 0[core 2[hwt 0-1]], socket 0[core 3[hwt 0-1]], socket 0[core
4[hwt 0-1]], socket 0[core 5[hwt 0-1]]: [BB/BB/BB/BB/BB/BB][../../../../../..]
[pc03:03711] MCW rank 2 bound to socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt
0-1]], socket 1[core 8[hwt 0-1]], socket 1[core 9[hwt 0-1]], socket 1[core
10[hwt 0-1]], socket 1[core 11[hwt 0-1]]: [../../../../../..][BB/BB/BB/BB/BB/BB]
[pid 3715] starting up!
[pid 3716] starting up!
[pid 3717] starting up!
Parent done with spawn
Parent sending message to child
0 completed MPI_Init
Hello from the child 0 of 3 on host pc03 pid 3715
1 completed MPI_Init
Hello from the child 1 of 3 on host pc03 pid 3716
2 completed MPI_Init
Hello from the child 2 of 3 on host pc03 pid 3717
Child 0 received msg: 38
Child 0 disconnected
Child 2 disconnected
Parent disconnected
Child 1 disconnected
3713: exiting
3715: exiting
3716: exiting
3717: exiting
pc03 spawn 120 mpiexec -np 1 --hostfile host_pc03.openmpi --slot-list
0:0-1,1:0-1 --report-bindings a.out
[pc03:03729] MCW rank 0 bound to socket 0[core 0[hwt 0-1]], socket 0[core 1[hwt
0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[pid 3731] starting up!
0 completed MPI_Init
Parent [pid 3731] about to spawn!
[pc03:03729] MCW rank 0 bound to socket 0[core 0[hwt 0-1]], socket 0[core 1[hwt
0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[pc03:03729] MCW rank 1 bound to socket 0[core 0[hwt 0-1]], socket 0[core 1[hwt
0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[pc03:03729] MCW rank 2 bound to socket 0[core 0[hwt 0-1]], socket 0[core 1[hwt
0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[pid 3733] starting up!
[pid 3734] starting up!
[pid 3735] starting up!
Parent done with spawn
Parent sending message to child
2 completed MPI_Init
Hello from the child 2 of 3 on host pc03 pid 3735
1 completed MPI_Init
Hello from the child 1 of 3 on host pc03 pid 3734
0 completed MPI_Init
Hello from the child 0 of 3 on host pc03 pid 3733
Child 0 received msg: 38
Child 0 disconnected
Child 2 disconnected
Child 1 disconnected
Parent disconnected
3731: exiting
3734: exiting
3733: exiting
3735: exiting
pc03 spawn 121 mpiexec -np 1 --host pc03 --slot-list 0:0-1,1:0-1
--report-bindings a.out
[pc03:03744] MCW rank 0 bound to socket 0[core 0[hwt 0-1]], socket 0[core 1[hwt
0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[pid 3746] starting up!
0 completed MPI_Init
Parent [pid 3746] about to spawn!
[pc03:03744] MCW rank 0 bound to socket 0[core 0[hwt 0-1]], socket 0[core 1[hwt
0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[pc03:03744] MCW rank 2 bound to socket 0[core 0[hwt 0-1]], socket 0[core 1[hwt
0-1]], socket 1[core 6[hwt 0-1]], socket 1[core 7[hwt 0-1]]:
[BB/BB/../../../..][BB/BB/../../../..]
[pid 3748] starting up!
[pid 3749] starting up!
[pc03:03749] *** Process received signal ***
[pc03:03749] Signal: Segmentation fault (11)
[pc03:03749] Signal code: Address not mapped (1)
[pc03:03749] Failing at address: 0x8
[pc03:03749] [ 0] /lib64/libpthread.so.0(+0xf870)[0x7fe6f0d1f870]
[pc03:03749] [ 1]
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(ompi_proc_self+0x35)[0x7fe6f0f825b0]
[pc03:03749] [ 2]
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(ompi_comm_init+0x68b)[0x7fe6f0f61b08]
[pc03:03749] [ 3]
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(ompi_mpi_init+0xa90)[0x7fe6f0f87e8a]
[pc03:03749] [ 4]
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(MPI_Init+0x1a0)[0x7fe6f0fc42ae]
[pc03:03749] [ 5] a.out[0x400d0c]
[pc03:03749] [ 6] /lib64/libc.so.6(__libc_start_main+0xf5)[0x7fe6f0989b05]
[pc03:03749] [ 7] a.out[0x400bf9]
[pc03:03749] *** End of error message ***
--------------------------------------------------------------------------
mpiexec noticed that process rank 2 with PID 3749 on node pc03 exited on signal
11 (Segmentation fault).
--------------------------------------------------------------------------
pc03 spawn 122
Kind regards
Siegmar
On 05/24/16 15:44, Ralph Castain wrote:
On May 24, 2016, at 6:21 AM, Siegmar Gross
<siegmar.gr...@informatik.hs-fulda.de> wrote:
Hi Ralph,
I have copied the relevant lines here, so that it is easier to see what
happens. "a.out" is your program, which I compiled with mpicc.
loki spawn 153 ompi_info | grep -e "OPAL repo revision:" -e "C compiler
absolute:"
OPAL repo revision: v1.10.2-201-gd23dda8
C compiler absolute: /usr/local/gcc-6.1.0/bin/gcc
loki spawn 154 mpicc simple_spawn.c
loki spawn 155 mpiexec -np 1 a.out
[pid 24008] starting up!
0 completed MPI_Init
...
"mpiexec -np 1 a.out" works.
I don’t know what “a.out” is, but it looks like there is some memory
corruption there.
"a.out" is still your program. I get the same error on different
machines, so that it is not very likely, that the (hardware) memory
is corrupted.
loki spawn 156 mpiexec -np 1 --host loki --slot-list 0-5 a.out
[pid 24102] starting up!
0 completed MPI_Init
Parent [pid 24102] about to spawn!
[pid 24104] starting up!
[pid 24105] starting up!
[loki:24105] *** Process received signal ***
[loki:24105] Signal: Segmentation fault (11)
[loki:24105] Signal code: Address not mapped (1)
...
"mpiexec -np 1 --host loki --slot-list 0-5 a.out" breaks with a segmentation
faUlt. Can I do something, so that you can find out, what happens?
I honestly have no idea - perhaps Gilles can help, as I have no access to that
kind of environment. We aren’t seeing such problems elsewhere, so it is likely
something local.
Kind regards
Siegmar
On 05/24/16 15:07, Ralph Castain wrote:
On May 24, 2016, at 4:19 AM, Siegmar Gross
<siegmar.gr...@informatik.hs-fulda.de> wrote:
Hi Ralph,
thank you very much for your answer and your example program.
On 05/23/16 17:45, Ralph Castain wrote:
I cannot replicate the problem - both scenarios work fine for me. I'm not
convinced your test code is correct, however, as you call Comm_free on the
inter-communicator but didn't call Comm_disconnect. Check out the attached
program for a corrected version and see if it works for you.
I thought that I would only need MPI_Comm_disconnect if I had established a
connection with MPI_Comm_connect before. The man page for MPI_Comm_free states
"This operation marks the communicator object for deallocation. The
handle is set to MPI_COMM_NULL. Any pending operations that use this
communicator will complete normally; the object is actually deallocated only
if there are no other active references to it.".
The man page for MPI_Comm_disconnect states
"MPI_Comm_disconnect waits for all pending communication on comm to complete
internally, deallocates the communicator object, and sets the handle to
MPI_COMM_NULL. It is a collective operation.".
I don't see a difference for my spawned processes, because both functions
"wait" until all pending operations have finished before the object is
destroyed. Nevertheless, perhaps my small example program has worked all
these years by chance.
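So that we are talking about the same pattern, here is a minimal sketch of the
parent side with MPI_Comm_disconnect (the child count 3 and the message value
38 are taken from the simple_spawn output in this thread; the child executable
name is a placeholder, and the real test also handles the child side via
MPI_Comm_get_parent):

#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Comm children;
    int msg = 38;                /* value seen in "Child 0 received msg: 38" */

    MPI_Init(&argc, &argv);
    /* Spawn three children from the parent alone (MPI_COMM_SELF). */
    MPI_Comm_spawn("./spawn_child", MPI_ARGV_NULL, 3, MPI_INFO_NULL,
                   0, MPI_COMM_SELF, &children, MPI_ERRCODES_IGNORE);
    MPI_Send(&msg, 1, MPI_INT, 0, 0, children);  /* to child rank 0 */
    /* MPI_Comm_disconnect waits for pending communication and severs the
       connection; MPI_Comm_free only marks the handle for deallocation. */
    MPI_Comm_disconnect(&children);
    MPI_Finalize();
    return 0;
}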
However, I don't understand why my program works with
"mpiexec -np 1 --host loki,loki,loki,loki,loki spawn_master" and breaks with
"mpiexec -np 1 --host loki --slot-list 0:0-5,1:0-5 spawn_master". You are right,
my slot-list is equivalent to "--bind-to none". I could also have used
"mpiexec -np 1 --host loki --oversubscribe spawn_master", which works as well.
Well, you are only giving us one slot when you specify "--host loki", and then
you are trying to launch multiple processes into it. The "slot-list" option only
tells us what cpus to bind each process to - it doesn't allocate process slots.
So you have to tell us how many processes are allowed to run on this node.
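For example, a hostfile entry like "loki slots=5 max_slots=5" (the same
"slots=" syntax as in the host_pc03.openmpi file in this thread) gives the
runtime five process slots on loki - enough for the parent plus the spawned
children.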
The program breaks with "There are not enough slots available in the system
to satisfy ..." if I only use "--host loki" (or other host names) without
mentioning five host names, using "--slot-list", or using "--oversubscribe".
Unfortunately "--host <host name>:<number of slots>" isn't available in
openmpi-1.10.3rc2 to specify the number of available slots.
Correct - we did not backport the new syntax
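(If I remember right, the newer series accepts "--host loki:5" to assign five
slots directly; on 1.10 the hostfile "slots=" syntax is the way to get them.)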
Your program behaves the same way as mine, so MPI_Comm_disconnect
will not solve my problem. I had to modify your program in a negligible way
to get it to compile.
loki spawn 153 ompi_info | grep -e "OPAL repo revision:" -e "C compiler
absolute:"
OPAL repo revision: v1.10.2-201-gd23dda8
C compiler absolute: /usr/local/gcc-6.1.0/bin/gcc
loki spawn 154 mpicc simple_spawn.c
loki spawn 155 mpiexec -np 1 a.out
[pid 24008] starting up!
0 completed MPI_Init
Parent [pid 24008] about to spawn!
[pid 24010] starting up!
[pid 24011] starting up!
[pid 24012] starting up!
Parent done with spawn
Parent sending message to child
0 completed MPI_Init
Hello from the child 0 of 3 on host loki pid 24010
1 completed MPI_Init
Hello from the child 1 of 3 on host loki pid 24011
2 completed MPI_Init
Hello from the child 2 of 3 on host loki pid 24012
Child 0 received msg: 38
Child 0 disconnected
Child 1 disconnected
Child 2 disconnected
Parent disconnected
24012: exiting
24010: exiting
24008: exiting
24011: exiting
Is something wrong with my command line? I haven't used slot-list before, so
I'm not sure whether I'm using it in the intended way.
I don’t know what “a.out” is, but it looks like there is some memory corruption
there.
loki spawn 156 mpiexec -np 1 --host loki --slot-list 0-5 a.out
[pid 24102] starting up!
0 completed MPI_Init
Parent [pid 24102] about to spawn!
[pid 24104] starting up!
[pid 24105] starting up!
[loki:24105] *** Process received signal ***
[loki:24105] Signal: Segmentation fault (11)
[loki:24105] Signal code: Address not mapped (1)
[loki:24105] Failing at address: 0x8
[loki:24105] [ 0] /lib64/libpthread.so.0(+0xf870)[0x7f39aa76f870]
[loki:24105] [ 1]
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(ompi_proc_self+0x35)[0x7f39aa9d25b0]
[loki:24105] [ 2]
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(ompi_comm_init+0x68b)[0x7f39aa9b1b08]
[loki:24105] [ 3] *** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
*** and potentially your MPI job)
[loki:24104] Local abort before MPI_INIT completed successfully; not able to
aggregate error messages, and not able to guarantee that all other processes
were killed!
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(ompi_mpi_init+0xa90)[0x7f39aa9d7e8a]
[loki:24105] [ 4]
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(MPI_Init+0x1a0)[0x7f39aaa142ae]
[loki:24105] [ 5] a.out[0x400d0c]
[loki:24105] [ 6] /lib64/libc.so.6(__libc_start_main+0xf5)[0x7f39aa3d9b05]
[loki:24105] [ 7] a.out[0x400bf9]
[loki:24105] *** End of error message ***
-------------------------------------------------------
Child job 2 terminated normally, but 1 process returned
a non-zero exit code.. Per user-direction, the job has been aborted.
-------------------------------------------------------
--------------------------------------------------------------------------
mpiexec detected that one or more processes exited with non-zero status, thus
causing
the job to be terminated. The first process to do so was:
Process name: [[49560,2],0]
Exit code: 1
--------------------------------------------------------------------------
loki spawn 157
Hopefully you can find out what happens. Please let me know if I can
help you in any way.
Kind regards
Siegmar
FWIW: I don't know how many cores you have on your sockets, but if you
have 6 cores/socket, then your slot-list is equivalent to "--bind-to none",
as the slot-list applies to every process being launched.
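(In other words: "0:0-5,1:0-5" selects cores 0-5 on socket 0 plus cores 0-5 on
socket 1, i.e. every core of a 2 x 6-core machine, so the binding does not
restrict anything.)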
On May 23, 2016, at 6:26 AM, Siegmar Gross
<siegmar.gr...@informatik.hs-fulda.de> wrote:
Hi,
I installed openmpi-1.10.3rc2 on my "SUSE Linux Enterprise Server
12 (x86_64)" system with Sun C 5.13 and gcc-6.1.0. Unfortunately, I get
a segmentation fault with "--slot-list" for one of my small programs.
loki spawn 119 ompi_info | grep -e "OPAL repo revision:" -e "C compiler
absolute:"
OPAL repo revision: v1.10.2-201-gd23dda8
C compiler absolute: /usr/local/gcc-6.1.0/bin/gcc
loki spawn 120 mpiexec -np 1 --host loki,loki,loki,loki,loki spawn_master
Parent process 0 running on loki
I create 4 slave processes
Parent process 0: tasks in MPI_COMM_WORLD: 1
tasks in COMM_CHILD_PROCESSES local group: 1
tasks in COMM_CHILD_PROCESSES remote group: 4
Slave process 0 of 4 running on loki
Slave process 1 of 4 running on loki
Slave process 2 of 4 running on loki
spawn_slave 2: argv[0]: spawn_slave
Slave process 3 of 4 running on loki
spawn_slave 0: argv[0]: spawn_slave
spawn_slave 1: argv[0]: spawn_slave
spawn_slave 3: argv[0]: spawn_slave
loki spawn 121 mpiexec -np 1 --host loki --slot-list 0:0-5,1:0-5 spawn_master
Parent process 0 running on loki
I create 4 slave processes
[loki:17326] *** Process received signal ***
[loki:17326] Signal: Segmentation fault (11)
[loki:17326] Signal code: Address not mapped (1)
[loki:17326] Failing at address: 0x8
[loki:17326] [ 0] /lib64/libpthread.so.0(+0xf870)[0x7f4e469b3870]
[loki:17326] [ 1] *** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
*** and potentially your MPI job)
[loki:17324] Local abort before MPI_INIT completed successfully; not able to
aggregate error messages, and not able to guarantee that all other processes
were killed!
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(ompi_proc_self+0x35)[0x7f4e46c165b0]
[loki:17326] [ 2]
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(ompi_comm_init+0x68b)[0x7f4e46bf5b08]
[loki:17326] [ 3] *** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
*** and potentially your MPI job)
[loki:17325] Local abort before MPI_INIT completed successfully; not able to
aggregate error messages, and not able to guarantee that all other processes
were killed!
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(ompi_mpi_init+0xa90)[0x7f4e46c1be8a]
[loki:17326] [ 4]
/usr/local/openmpi-1.10.3_64_gcc/lib64/libmpi.so.12(MPI_Init+0x180)[0x7f4e46c5828e]
[loki:17326] [ 5] spawn_slave[0x40097e]
[loki:17326] [ 6] /lib64/libc.so.6(__libc_start_main+0xf5)[0x7f4e4661db05]
[loki:17326] [ 7] spawn_slave[0x400a54]
[loki:17326] *** End of error message ***
-------------------------------------------------------
Child job 2 terminated normally, but 1 process returned
a non-zero exit code.. Per user-direction, the job has been aborted.
-------------------------------------------------------
--------------------------------------------------------------------------
mpiexec detected that one or more processes exited with non-zero status,
thus causing
the job to be terminated. The first process to do so was:
Process name: [[56340,2],0]
Exit code: 1
--------------------------------------------------------------------------
loki spawn 122
I would be grateful if somebody could fix the problem. Thank you
very much in advance for any help.
Kind regards
Siegmar
<simple_spawn_modified.c>