It's the failure on readv that's the source of the trouble. What
happens if you only if_include eth2? Does it work then?

Still hangs, details follow ... tx!

o Using only eth2 with verbosity gives:

[roberpj@bro127:~/samples/mpi_test] /opt/sharcnet/openmpi/1.6.5/intel-debug/bin/mpirun -np 2 --mca btl tcp,sm,self --mca btl_tcp_if_include eth2 --mca btl_base_verbose 100 --host bro127,bro128 ./a.out
[bro127:30120] mca: base: components_open: Looking for btl components
[bro128:00923] mca: base: components_open: Looking for btl components
[bro127:30120] mca: base: components_open: opening btl components
[bro127:30120] mca: base: components_open: found loaded component self
[bro127:30120] mca: base: components_open: component self has no register function [bro127:30120] mca: base: components_open: component self open function successful
[bro127:30120] mca: base: components_open: found loaded component sm
[bro127:30120] mca: base: components_open: component sm has no register function [bro127:30120] mca: base: components_open: component sm open function successful
[bro127:30120] mca: base: components_open: found loaded component tcp
[bro127:30120] mca: base: components_open: component tcp register function successful [bro127:30120] mca: base: components_open: component tcp open function successful
[bro128:00923] mca: base: components_open: opening btl components
[bro128:00923] mca: base: components_open: found loaded component self
[bro128:00923] mca: base: components_open: component self has no register function [bro128:00923] mca: base: components_open: component self open function successful
[bro128:00923] mca: base: components_open: found loaded component sm
[bro128:00923] mca: base: components_open: component sm has no register function [bro128:00923] mca: base: components_open: component sm open function successful
[bro128:00923] mca: base: components_open: found loaded component tcp
[bro128:00923] mca: base: components_open: component tcp register function successful [bro128:00923] mca: base: components_open: component tcp open function successful
[bro127:30120] select: initializing btl component self
[bro127:30120] select: init of component self returned success
[bro127:30120] select: initializing btl component sm
[bro127:30120] select: init of component sm returned success
[bro127:30120] select: initializing btl component tcp
[bro127:30120] select: init of component tcp returned success
[bro128:00923] select: initializing btl component self
[bro128:00923] select: init of component self returned success
[bro128:00923] select: initializing btl component sm
[bro128:00923] select: init of component sm returned success
[bro128:00923] select: initializing btl component tcp
[bro128:00923] select: init of component tcp returned success
Number of processes = 2
Test repeated 3 times for reliability
[bro127:30120] btl: tcp: attempting to connect() to address 10.29.4.128 on port 4 [bro128:00923] btl: tcp: attempting to connect() to address 10.29.4.127 on port 4
I am process 0 on node bro127
Run 1 of 3
P0: Sending to P1
P0: Waiting to receive from P1
I am process 1 on node bro128
P1: Waiting to receive from to P0
[bro127][[6346,1],0][../../../../../../openmpi-1.6.5/ompi/mca/btl/tcp/btl_tcp_frag.c:215:mca_btl_tcp_frag_recv] mca_btl_tcp_frag_recv: readv failed: Connection timed out (110)

o The top output and gdb full backtraces of both the mpirun (pid=30111)
and a.out (pid=30120) processes spinning on master node bro127 follow:

[root@bro127:~] top -u roberpj
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
30120 roberpj   20   0  115m 5220 3228 R 100.0  0.0  12:39.25 ./a.out
29890 roberpj 20 0 115m 2428 1168 S 0.0 0.0 0:00.01 sshd: roberpj@pts/0
29895 roberpj   20   0  116m 2164 1604 S  0.0  0.0   0:00.04 -bash
30111 roberpj 20 0 83760 3660 2724 S 0.0 0.0 0:00.03 /opt/sharcnet/openmpi/1.6.5/intel-debug/bin/mpirun -np 2 --mca btl tcp,sm,self --mca btl_tcp_if_include eth2 --mca btl_base_verbose 100 --host bro127,bro128 ./a.out

[root@bro127:~] gdb -p 30111
(gdb) bt full
#0  0x00002b8b385b6a28 in poll () from /lib64/libc.so.6
No symbol table info available.
#1 0x00002b8b36dcafbf in poll_dispatch (base=0x1e59970, arg=0xd, tv=0x3e8) at ../../../../openmpi-1.6.5/opal/event/poll.c:167
        j = 31707344
        msec = 1000
        nfds = 13
#2 0x00002b8b36dc8ee6 in opal_event_base_loop (base=0x1e59970, flags=13) at ../../../../openmpi-1.6.5/opal/event/event.c:838
        evsel = 0x1e3d6c8
        evbase = 0x1e1b660
        tv = {tv_sec = 1, tv_usec = 0}
#3 0x00002b8b36dc8d92 in opal_event_loop (flags=31824240) at ../../../../openmpi-1.6.5/opal/event/event.c:766
No locals.
#4 0x00002b8b36dc8d78 in opal_event_dispatch () at ../../../../openmpi-1.6.5/opal/event/event.c:702
No locals.
#5 0x000000000040749d in orterun (argc=15, argv=0x7fff92d55be8) at ../../../../../openmpi-1.6.5/orte/tools/orterun/orterun.c:925
        rc = 0
cmd_line = {super = {obj_magic_id = 0, obj_class = 0x2b8b37099b00, obj_reference_count = 1, cls_init_file_name = 0x41f490 "../../../../../openmpi-1.6.5/orte/tools/orterun/orterun.c", cls_init_lineno = 636}, lcl_mutex = {super = {obj_magic_id = 0, obj_class = 0x628860, obj_reference_count = 1, cls_init_file_name = 0x2b8b36e49f78 "../../../../openmpi-1.6.5/opal/util/cmd_line.c", cls_init_lineno = 944}, m_lock_pthread = {__data = { __lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = -1, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 16 times>"\377, \377\377\377", '\000' <repeats 19 times>, __align = 0}, m_lock_atomic = {u = {lock = 0, sparc_lock = 0 '\000', padding = "\000\000\000"}}}, lcl_options = {super = {obj_magic_id = 0, obj_class = 0x628660, obj_reference_count = 1, cls_init_file_name = 0x2b8b36e49f78 "../../../../openmpi-1.6.5/opal/util/cmd_line.c", cls_init_lineno = 939}, opal_list_sentinel = {super = {obj_magic_id = 16046253926196952813, obj_class = 0x2b8b37098ea0, obj_reference_count = 1, cls_init_file_name = 0x2b8b36e40620 "../../../openmpi-1.6.5/opal/class/opal_list.c", cls_init_lineno = 88}, opal_list_next = 0x7fff92d556d8, opal_list_prev = 0x7fff92d556d8, item_free = 1, opal_list_item_refcount = 1, opal_list_item_belong_to = 0x7fff92d556b0}, opal_list_length = 0}, lcl_argc = 0, lcl_argv = 0x0, lcl_params = {super = {obj_magic_id = 0, obj_class = 0x628660, obj_reference_count = 1, cls_init_file_name = 0x2b8b36e49f78 "../../../../openmpi-1.6.5/opal/util/cmd_line.c", cls_init_lineno = 940}, opal_list_sentinel = {super = {obj_magic_id = 16046253926196952813, obj_class = 0x2b8b37098ea0, obj_reference_count = 1, cls_init_file_name = 0x2b8b36e40620 "../../../openmpi-1.6.5/opal/class/opal_list.c", cls_init_lineno = 88}, opal_list_next = 0x7fff92d55760, opal_list_prev = 0x7fff92d55760, item_free = 1, opal_list_item_refcount = 1, opal_list_item_belong_to = 0x7fff92d55738}, opal_list_length = 0}, lcl_tail_argc = 0, lcl_tail_argv = 0x0}
        tmp_env_var = 0x0
        param = 0x1e23db0 "\260\017\324\066\213+"
        app = 0x1e1cf10
#6 0x0000000000405354 in main (argc=15, argv=0x7fff92d55be8) at ../../../../../openmpi-1.6.5/orte/tools/orterun/main.c:13
No locals.
(gdb)

[root@bro127:~] gdb -p 30120
(gdb) bt full
#0  0x00002b27ae8f06f3 in __epoll_wait_nocancel () from /lib64/libc.so.6
No symbol table info available.
#1 0x00002b27ad085edc in epoll_dispatch (base=0x3, arg=0xa58850, tv=0x20) at ../../../../openmpi-1.6.5/opal/event/epoll.c:215
        events = 0xa58850
        evep = 0xa58850
#2 0x00002b27ad087276 in opal_event_base_loop (base=0x3, flags=10848336) at ../../../../openmpi-1.6.5/opal/event/event.c:838
        evsel = 0xa58818
        evbase = 0xa570a0
        tv = {tv_sec = 0, tv_usec = 0}
#3 0x00002b27ad087122 in opal_event_loop (flags=3) at ../../../../openmpi-1.6.5/opal/event/event.c:766
No locals.
#4 0x00002b27ad0cac14 in opal_progress () at ../../../openmpi-1.6.5/opal/runtime/opal_progress.c:189
No locals.
#5 0x00002b27b25f0c40 in mca_pml_ob1_recv (addr=0x3, count=10848336, datatype=0x20, src=-1, tag=128, comm=0x81, status=0x7fffef027478) at ../../../../../../openmpi-1.6.5/ompi/mca/pml/ob1/pml_ob1_irecv.c:105
        item = 0x20
        recvreq = 0xb05580
#6 0x00002b27acfcb0f7 in PMPI_Recv (buf=0x3, count=10848336, type=0x20, source=-1, tag=128, comm=0x81, status=0x4026e0) at precv.c:78
        size = 32
        true_lb = 3
#7 0x0000000000402b65 in main (argc=1, argv=0x7fffef0275d8) at mpi_test.c:72
        repeats = 3
        msgid = 0
        myid = 0
        procs = 2
        j = 5000
        k = 0
A = {1804289383, 846930886, 1681692777, 1714636915, 1957747793, 424238335, 719885386, 1649760492, 596516649, 1189641421, 1025202362, 1350490027, 783368690, 1102520059, 2044897763, 1967513926, 1365180540, 1540383426, 304089172, 1303455736, 35005211, 521595368, 294702567, 1726956429, 336465782, 861021530, 278722862, 233665123, 2145174067, 468703135, 1101513929, 1801979802, 1315634022, 635723058, 1369133069, 1125898167, 1059961393, 2089018456, 628175011, 1656478042, 1131176229, 1653377373, 859484421, 1914544919, 608413784, 756898537, 1734575198, 1973594324, 149798315, 2038664370, 1129566413, 184803526, 412776091, 1424268980, 1911759956, 749241873, 137806862, 42999170, 982906996, 135497281, 511702305, 2084420925, 1937477084, 1827336327, 572660336, 1159126505, 805750846, 1632621729, 1100661313, 1433925857, 1141616124, 84353895, 939819582, 2001100545, 1998898814, 1548233367, 610515434, 1585990364, 1374344043, 760313750, 1477171087, 356426808, 945117276, 1889947178, 1780695788, 709393584, 491705403, 1918502651, 752392754, 1474612399, 2053999932, 1264095060, 1411549676, 1843993368, 943947739, 1984210012, 855636226, 1749698586, 1469348094, 1956297539, 1036140795, 463480570, 2040651434, 1975960378, 317097467, 1892066601, 1376710097, 927612902, 1330573317, 603570492, 1687926652, 660260756, 959997301, 485560280, 402724286, 593209441, 1194953865, 894429689, 364228444, 1947346619, 221558440, 270744729, 1063958031, 1633108117, 2114738097, 2007905771, 1469834481, 822890675, 1610120709, 791698927, 631704567, 498777856, 1255179497, 524872353, 327254586, 1572276965, 269455306, 1703964683, 352406219, 1600028624, 160051528, 2040332871, 112805732, 1120048829, 378409503, 515530019, 1713258270, 1573363368, 1409959708, 2077486715, 1373226340, 1631518149, 200747796, 289700723, 1117142618, 168002245, 150122846, 439493451, 990892921, 1760243555, 1231192379, 1622597488, 111537764, 338888228, 2147469841, 438792350, 1911165193, 269441500, 2142757034, 116087764, 1869470124, 155324914, 8936987, 1982275856, 1275373743, 387346491, 350322227, 841148365, 1960709859, 1760281936, 771151432, 1186452551, 1244316437, 971899228, 1476153275, 213975407, 1139901474, 1626276121, 653468858, 2130794395, 1239036029, 1884661237, 1605908235, 1350573793, 76065818, 1605894428, 1789366143, 1987231011, 1875335928, 1784639529...}
        namelen = 6
        processor_name = "bro127", '\000' <repeats 249 times>
stat = {MPI_SOURCE = -1395538283, MPI_TAG = 11047, MPI_ERROR = 1, _cancelled = 0, _ucount = 63}
(gdb)

o The top output and gdb full backtraces of the orted (pid=922) and
a.out (pid=923) processes spinning on slave node bro128 are below:

[root@bro128:~] top -u roberpj
  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
  923 roberpj   20   0  115m 5208 3220 R 98.5  0.0  33:46.47 ./a.out
922 roberpj 20 0 79456 2400 1524 S 0.0 0.0 0:00.02 /opt/sharcnet/openmpi/1.6.5/intel-debug/bin/orted --daemonize -mca ess env -mca orte_ess_jobid 415891456 -mca orte_ess_vpid 1 -mca orte_ess_num_procs 2 --hnp-uri 415891456.0;tcp://10.27.2.127:47800;tcp://10.29.4.127:47800 --mca btl tcp,sm,self --mca btl_tcp_if_include eth2 --mca btl_base_verbose 100 -mca plm rsh

[root@bro128:~] gdb -p 923
(gdb) bt full
#0  0x00002b0c7e5c86f3 in __epoll_wait_nocancel () from /lib64/libc.so.6
No symbol table info available.
#1 0x00002b0c7cd5dedc in epoll_dispatch (base=0x3, arg=0x1117850, tv=0x20) at ../../../../openmpi-1.6.5/opal/event/epoll.c:215
        events = 0x1117850
        evep = 0x1117850
#2 0x00002b0c7cd5f276 in opal_event_base_loop (base=0x3, flags=17922128) at ../../../../openmpi-1.6.5/opal/event/event.c:838
        evsel = 0x1117818
        evbase = 0x11160a0
        tv = {tv_sec = 0, tv_usec = 0}
#3 0x00002b0c7cd5f122 in opal_event_loop (flags=3) at ../../../../openmpi-1.6.5/opal/event/event.c:766
No locals.
#4 0x00002b0c7cda2c14 in opal_progress () at ../../../openmpi-1.6.5/opal/runtime/opal_progress.c:189
No locals.
#5 0x00002b0c822c8c40 in mca_pml_ob1_recv (addr=0x3, count=17922128, datatype=0x20, src=-1, tag=0, comm=0x80000, status=0x7fffe046a028) at ../../../../../../openmpi-1.6.5/ompi/mca/pml/ob1/pml_ob1_irecv.c:105
        item = 0x20
        recvreq = 0x11c4580
#6 0x00002b0c7cca30f7 in PMPI_Recv (buf=0x3, count=17922128, type=0x20, source=-1, tag=0, comm=0x80000, status=0x4026e0) at precv.c:78
        size = 32
        true_lb = 3
#7 0x0000000000402c40 in main (argc=1, argv=0x7fffe046a188) at mpi_test.c:76
        repeats = 3
        msgid = 0
        myid = 1
        procs = 2
        j = 5000
        k = 0
A = {1804289383, 846930886, 1681692777, 1714636915, 1957747793, 424238335, 719885386, 1649760492, 596516649, 1189641421, 1025202362, 1350490027, 783368690, 1102520059, 2044897763, 1967513926, 1365180540, 1540383426, 304089172, 1303455736, 35005211, 521595368, 294702567, 1726956429, 336465782, 861021530, 278722862, 233665123, 2145174067, 468703135, 1101513929, 1801979802, 1315634022, 635723058, 1369133069, 1125898167, 1059961393, 2089018456, 628175011, 1656478042, 1131176229, 1653377373, 859484421, 1914544919, 608413784, 756898537, 1734575198, 1973594324, 149798315, 2038664370, 1129566413, 184803526, 412776091, 1424268980, 1911759956, 749241873, 137806862, 42999170, 982906996, 135497281, 511702305, 2084420925, 1937477084, 1827336327, 572660336, 1159126505, 805750846, 1632621729, 1100661313, 1433925857, 1141616124, 84353895, 939819582, 2001100545, 1998898814, 1548233367, 610515434, 1585990364, 1374344043, 760313750, 1477171087, 356426808, 945117276, 1889947178, 1780695788, 709393584, 491705403, 1918502651, 752392754, 1474612399, 2053999932, 1264095060, 1411549676, 1843993368, 943947739, 1984210012, 855636226, 1749698586, 1469348094, 1956297539, 1036140795, 463480570, 2040651434, 1975960378, 317097467, 1892066601, 1376710097, 927612902, 1330573317, 603570492, 1687926652, 660260756, 959997301, 485560280, 402724286, 593209441, 1194953865, 894429689, 364228444, 1947346619, 221558440, 270744729, 1063958031, 1633108117, 2114738097, 2007905771, 1469834481, 822890675, 1610120709, 791698927, 631704567, 498777856, 1255179497, 524872353, 327254586, 1572276965, 269455306, 1703964683, 352406219, 1600028624, 160051528, 2040332871, 112805732, 1120048829, 378409503, 515530019, 1713258270, 1573363368, 1409959708, 2077486715, 1373226340, 1631518149, 200747796, 289700723, 1117142618, 168002245, 150122846, 439493451, 990892921, 1760243555, 1231192379, 1622597488, 111537764, 338888228, 2147469841, 438792350, 1911165193, 269441500, 2142757034, 116087764, 1869470124, 155324914, 8936987, 1982275856, 1275373743, 387346491, 350322227, 841148365, 1960709859, 1760281936, 771151432, 1186452551, 1244316437, 971899228, 1476153275, 213975407, 1139901474, 1626276121, 653468858, 2130794395, 1239036029, 1884661237, 1605908235, 1350573793, 76065818, 1605894428, 1789366143, 1987231011, 1875335928, 1784639529...}
        namelen = 6
        processor_name = "bro128", '\000' <repeats 249 times>
stat = {MPI_SOURCE = 2090813077, MPI_TAG = 11020, MPI_ERROR = 1, _cancelled = 0, _ucount = 63}
(gdb)

[root@bro128:~] gdb -p 922
(gdb) bt full
#0  0x00002ae5a0deea28 in poll () from /lib64/libc.so.6
No symbol table info available.
#1 0x00002ae59f602fbf in poll_dispatch (base=0x9bb7a0, arg=0x9, tv=0x3e8) at ../../../../openmpi-1.6.5/opal/event/poll.c:167
        j = 10183248
        msec = 1000
        nfds = 9
#2 0x00002ae59f600ee6 in opal_event_base_loop (base=0x9bb7a0, flags=9) at ../../../../openmpi-1.6.5/opal/event/event.c:838
        evsel = 0x9b6848
        evbase = 0x9b6880
        tv = {tv_sec = 1, tv_usec = 0}
#3 0x00002ae59f600d92 in opal_event_loop (flags=10205088) at ../../../../openmpi-1.6.5/opal/event/event.c:766
No locals.
#4 0x00002ae59f600d78 in opal_event_dispatch () at ../../../../openmpi-1.6.5/opal/event/event.c:702
No locals.
#5 0x00002ae59f596fcc in orte_daemon (argc=28, argv=0x7fff5899f2a8) at ../../../openmpi-1.6.5/orte/orted/orted_main.c:846 launch = {parent = {obj_magic_id = 47165708974200, obj_class = 0x2ae59f338980, obj_reference_count = -1621913208, cls_init_file_name = 0x5d13aee <Address 0x5d13aee out of bounds>, cls_init_lineno = -1581336208}, type = 1 '\001', base_ptr = 0x614298 "", pack_ptr = 0x0, unpack_ptr = 0x0, bytes_allocated = 140734679872168, bytes_used = 0}
        flag = 0 '\000'
        command = 0 '\000'
        fd = -1618098584
        cmd_line = 0x15a
log_file = "@\360\231X\377\177\000\000O\360\231X\377\177\000\000\200B2\237\345*\000\000\020\360\231X\377\177", '\000' <repeats 26 times>, "\020\360B\241\345*\000\000\000\000\000\341\063\261\376\377\000\221\276\241\345*\000\000\200\360\231X\377\177\000\000~J2\237\345*\000\000\000\260\276\241\345*\000\000\000\320Р\345*\000\000\000\000\000\000\000\000\000\000\020\360\231X\377\177\000\000\200\360\231X\377\177\000\000\000\000\000\000\000\000\000\000P\341\231X\377\177\000\000\327\\Ҡ\345*\000\000\000\320Р\345*\000\000\262B2\237\345*\000\000\000\000\000\000\000\000\000\000\226a2\237\345*\000\000؏S\237\345*\000\000\070\360\231X\377\177\000\000@\360\231X\377\177\000\000O\360\231X\377\177\000\000\200B2\237\345*\000\000\020\360\231X\377\177", '\000' <repeats 26 times>, "0\361B\241\345*\000\000\200\360\231X\377\177\000\000~J2\237\345*\000\000\000\260\276\241\345*\000\000\000\320Р\345*", '\000' <repeats 12 times>, "@\300\063\261\376\377\000\000\366\302d>\313U", '\000' <repeats 16 times>"\214"...
        jobidstring = 0x2ae59f325fd0 "H\211\303d\213\004%\030"
        rml_uri = 0x0
        buffer = 0xa13d60
hostname = "\332\023@\000\000\000\000\000汎\223\000\000\000\000\070\365\330\003\000\000\000\000.\000\000\000\000\000\000\000P:Ѡ\345*\000\000\000\000\000\000\000\000\000\000\220\361\231X\377\177\000\000H<Ѡ\345*\000\000\210\373Ѡ\345*\000\000\001", '\000' <repeats 15 times>, "\001\000\000\000\000\000\000\000\250", <incomplete sequence \371\215\237>
        starttime = {tv_sec = 1391200366, tv_usec = 945156}
        setuptime = {tv_sec = 47165708932320, tv_usec = 140734679871488}
#6 0x0000000000402556 in main (argc=10205088, argv=0x9) at ../../../../../openmpi-1.6.5/orte/tools/orted/orted.c:62 umask_str = 0xfffffffffffffdfc <Address 0xfffffffffffffdfc out of bounds>
(gdb)

Reply via email to