OpenMPI 1.2.4

mpirun noticed that job rank 0 with PID 19021 on node pc801 exited on signal 15 (Terminated).
11 additional processes aborted (not shown)

(gdb) bt
#0  0x411b776c in mca_pml_ob1_recv_frag_match () from /usr/local/openmpi//lib/openmpi/mca_pml_ob1.so
#1  0x411ce010 in mca_btl_sm_component_progress () from /usr/local/openmpi//lib/openmpi/mca_btl_sm.so
#2  0x411c2df9 in mca_bml_r2_progress () from /usr/local/openmpi//lib/openmpi/mca_bml_r2.so
#3  0x404fb549 in opal_progress () from /usr/local/openmpi/lib/libopen-pal.so.0
#4  0x411b87cb in mca_pml_ob1_recv_frag_match () from /usr/local/openmpi//lib/openmpi/mca_pml_ob1.so
#5  0x411ce010 in mca_btl_sm_component_progress () from /usr/local/openmpi//lib/openmpi/mca_btl_sm.so
#6  0x411c2df9 in mca_bml_r2_progress () from /usr/local/openmpi//lib/openmpi/mca_bml_r2.so
#7  0x404fb549 in opal_progress () from /usr/local/openmpi/lib/libopen-pal.so.0
(... the same four-frame cycle repeats up to frame #19667 ...)
#19664 0x411b87cb in mca_pml_ob1_recv_frag_match () from /usr/local/openmpi//lib/openmpi/mca_pml_ob1.so
#19665 0x411ce010 in mca_btl_sm_component_progress () from /usr/local/openmpi//lib/openmpi/mca_btl_sm.so
#19666 0x411c2df9 in mca_bml_r2_progress () from /usr/local/openmpi//lib/openmpi/mca_bml_r2.so
#19667 0x404fb549 in opal_progress () from /usr/local/openmpi/lib/libopen-pal.so.0
#19668 0x400d9bb5 in ompi_request_wait_all () from /usr/local/openmpi/lib/libmpi.so.0
#19669 0x411f57a3 in ompi_coll_tuned_bcast_intra_generic () from /usr/local/openmpi//lib/openmpi/mca_coll_tuned.so
#19670 0x411f5e55 in ompi_coll_tuned_bcast_intra_binomial () from /usr/local/openmpi//lib/openmpi/mca_coll_tuned.so
#19671 0x411efb3f in ompi_coll_tuned_bcast_intra_dec_fixed () from /usr/local/openmpi//lib/openmpi/mca_coll_tuned.so
#19672 0x400ee239 in PMPI_Bcast () from /usr/local/openmpi/lib/libmpi.so.0
#19673 0x081009a3 in CProcessing::postProcessWorker (this=0x843a3c8) at CProcessing.cpp:403
#19674 0x081042ee in CInputSetMap::postProcessWorker (this=0x843a260) at CInputSetMap.cpp:554
#19675 0x0812f0f5 in CInputSetMap::processWorker (this=0x843a3f8) at CInputSetMap.cpp:580
#19676 0x080b0945 in CLS_WorkerStart () at CLS_WorkerStartup.cpp:11
#19677 0x080ac2e9 in CLS_Worker () at CLS_Worker.cpp:44
#19678 0x0813706f in main (argc=1, argv=0xbfae84d4) at SYS_Main.cpp:201

Looks like an endless recursive loop to me...
Unfortunately I have to broadcast one double per MPI_Bcast call (not the whole vector, for example), because the later processing requires that approach (don't ask why). I commented out everything that could be dangerous; in fact I'm only broadcasting data now, and that alone is enough to crash. It only happens on a large input set; the whole code works perfectly on smaller datasets.

code:

HEAD:
       // rank 0 broadcasts one alpha value per MPI_Bcast call
       for(i=0; i < numAlphaSets; i++)
       {
           CAlphaSet *alphaSet = *alphaSetIterator;
           for(cols=0; cols < numCols; cols++)
           {
               double alpha = alphaSet->alpha[cols];
               MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
           }
           ++alphaSetIterator;
       }

WORKER:
       double alpha;
       for(i=0; i < numAlphaSets; i++)
       {
           for(cols=0; cols < numCols; cols++)
           {
               MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
               // do something with alpha, commented out for debug
           }
       }

I'm trying to broadcast around 820,000 MPI_DOUBLEs that way. Obviously I will rewrite this to send the data in bigger chunks and split it up on the workers, but it still seems strange... could it be some buffer issue, or...?
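For what it's worth, this is a minimal sketch of the chunked variant I have in mind (assuming alphaSet->alpha is a contiguous array of numCols doubles and rank 0 is the root, as in the snippets above):

       // HEAD (rank 0): one MPI_Bcast per alpha set instead of one per double
       for(i=0; i < numAlphaSets; i++)
       {
           CAlphaSet *alphaSet = *alphaSetIterator;
           MPI_Bcast(alphaSet->alpha, numCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);
           ++alphaSetIterator;
       }

       // WORKER: receive the whole row, then use the values one by one
       std::vector<double> alphaRow(numCols);   // needs #include <vector>
       for(i=0; i < numAlphaSets; i++)
       {
           MPI_Bcast(&alphaRow[0], numCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);
           for(cols=0; cols < numCols; cols++)
           {
               double alpha = alphaRow[cols];
               // do something with alpha
           }
       }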

greets, Marcin
