Hi Ralph,
This is the result of adding -mca ras_base_verbose 50.

SCRIPT:
mpirun -machinefile pbs_hosts -np ${NPROCS} -report-bindings -bind-to core \
       -mca ras_base_verbose 50 -mca plm_base_verbose 5 ./mPre

OUTPUT:
[node08.cluster:26770] mca:base:select:( plm) Querying component [rsh]
[node08.cluster:26770] [[INVALID],INVALID] plm:rsh_lookup on agent /usr/bin/rsh path NULL
[node08.cluster:26770] mca:base:select:( plm) Query of component [rsh] set priority to 10
[node08.cluster:26770] mca:base:select:( plm) Querying component [slurm]
[node08.cluster:26770] mca:base:select:( plm) Skipping component [slurm]. Query failed to return a module
[node08.cluster:26770] mca:base:select:( plm) Querying component [tm]
[node08.cluster:26770] mca:base:select:( plm) Query of component [tm] set priority to 75
[node08.cluster:26770] mca:base:select:( plm) Selected component [tm]
[node08.cluster:26770] plm:base:set_hnp_name: initial bias 26770 nodename hash 85176670
[node08.cluster:26770] plm:base:set_hnp_name: final jobfam 56543
[node08.cluster:26770] [[56543,0],0] plm:base:receive start comm
[node08.cluster:26770] mca: base: components_register: registering ras components
[node08.cluster:26770] mca: base: components_register: found loaded component loadleveler
[node08.cluster:26770] mca: base: components_register: component loadleveler register function successful
[node08.cluster:26770] mca: base: components_register: found loaded component simulator
[node08.cluster:26770] mca: base: components_register: component simulator register function successful
[node08.cluster:26770] mca: base: components_register: found loaded component slurm
[node08.cluster:26770] mca: base: components_register: component slurm register function successful
[node08.cluster:26770] mca: base: components_register: found loaded component tm
[node08.cluster:26770] mca: base: components_register: component tm register function successful
[node08.cluster:26770] mca: base: components_open: opening ras components
[node08.cluster:26770] mca: base: components_open: found loaded component loadleveler
[node08.cluster:26770] mca: base: components_open: component loadleveler open function successful
[node08.cluster:26770] mca: base: components_open: found loaded component simulator
[node08.cluster:26770] mca: base: components_open: found loaded component slurm
[node08.cluster:26770] mca: base: components_open: component slurm open function successful
[node08.cluster:26770] mca: base: components_open: found loaded component tm
[node08.cluster:26770] mca: base: components_open: component tm open function successful
[node08.cluster:26770] mca:base:select: Auto-selecting ras components
[node08.cluster:26770] mca:base:select:( ras) Querying component [loadleveler]
[node08.cluster:26770] [[56543,0],0] ras:loadleveler: NOT available for selection
[node08.cluster:26770] mca:base:select:( ras) Skipping component [loadleveler]. Query failed to return a module
[node08.cluster:26770] mca:base:select:( ras) Querying component [simulator]
[node08.cluster:26770] mca:base:select:( ras) Skipping component [simulator]. Query failed to return a module
[node08.cluster:26770] mca:base:select:( ras) Querying component [slurm]
[node08.cluster:26770] mca:base:select:( ras) Skipping component [slurm]. Query failed to return a module
[node08.cluster:26770] mca:base:select:( ras) Querying component [tm]
[node08.cluster:26770] mca:base:select:( ras) Query of component [tm] set priority to 100
[node08.cluster:26770] mca:base:select:( ras) Selected component [tm]
[node08.cluster:26770] mca: base: close: unloading component loadleveler
[node08.cluster:26770] mca: base: close: unloading component simulator
[node08.cluster:26770] mca: base: close: component slurm closed
[node08.cluster:26770] mca: base: close: unloading component slurm
[node08.cluster:26770] [[56543,0],0] plm:base:setup_job
[node08.cluster:26770] [[56543,0],0] ras:base:allocate
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: got hostname node08
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: not found -- added to list
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: got hostname node08
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: found -- bumped slots to 2
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: got hostname node08
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: found -- bumped slots to 3
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: got hostname node08
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: found -- bumped slots to 4
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: got hostname node08
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: found -- bumped slots to 5
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: got hostname node08
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: found -- bumped slots to 6
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: got hostname node08
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: found -- bumped slots to 7
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: got hostname node08
[node08.cluster:26770] [[56543,0],0] ras:tm:allocate:discover: found -- bumped slots to 8
[node08.cluster:26770] [[56543,0],0] ras:base:node_insert inserting 1 nodes
[node08.cluster:26770] [[56543,0],0] ras:base:node_insert updating HNP [node08] info to 8 slots

======================   ALLOCATED NODES   ======================
 Data for node: node08  Num slots: 8  Max slots: 0
=================================================================

[node08.cluster:26770] [[56543,0],0] plm:base:setup_vm
[node08.cluster:26770] [[56543,0],0] plm:base:setup_vm creating map
[node08.cluster:26770] [[56543,0],0] plm:base:setup_vm only HNP in allocation
--------------------------------------------------------------------------
All nodes which are allocated for this job are already filled.
--------------------------------------------------------------------------
[node08.cluster:26770] [[56543,0],0] plm:base:orted_cmd sending orted_exit commands
[node08.cluster:26770] [[56543,0],0] ras:tm:finalize: success (nothing to do)
[node08.cluster:26770] mca: base: close: unloading component tm
[node08.cluster:26770] [[56543,0],0] plm:base:receive stop comm

tmishima

> Hmmm...looks like we aren't getting your allocation. Can you rerun and add -mca ras_base_verbose 50?
>
> On Nov 12, 2013, at 11:30 PM, tmish...@jcity.maeda.co.jp wrote:
>
> > Hi Ralph,
> >
> > Here is the output of "-mca plm_base_verbose 5".
> >
> > [node08.cluster:23573] mca:base:select:( plm) Querying component [rsh]
> > [node08.cluster:23573] [[INVALID],INVALID] plm:rsh_lookup on agent /usr/bin/rsh path NULL
> > [node08.cluster:23573] mca:base:select:( plm) Query of component [rsh] set priority to 10
> > [node08.cluster:23573] mca:base:select:( plm) Querying component [slurm]
> > [node08.cluster:23573] mca:base:select:( plm) Skipping component [slurm]. Query failed to return a module
> > [node08.cluster:23573] mca:base:select:( plm) Querying component [tm]
> > [node08.cluster:23573] mca:base:select:( plm) Query of component [tm] set priority to 75
> > [node08.cluster:23573] mca:base:select:( plm) Selected component [tm]
> > [node08.cluster:23573] plm:base:set_hnp_name: initial bias 23573 nodename hash 85176670
> > [node08.cluster:23573] plm:base:set_hnp_name: final jobfam 59480
> > [node08.cluster:23573] [[59480,0],0] plm:base:receive start comm
> > [node08.cluster:23573] [[59480,0],0] plm:base:setup_job
> > [node08.cluster:23573] [[59480,0],0] plm:base:setup_vm
> > [node08.cluster:23573] [[59480,0],0] plm:base:setup_vm creating map
> > [node08.cluster:23573] [[59480,0],0] plm:base:setup_vm only HNP in allocation
> > --------------------------------------------------------------------------
> > All nodes which are allocated for this job are already filled.
> > --------------------------------------------------------------------------
> >
> > Here, Open MPI's configuration is as follows:
> >
> > ./configure \
> >   --prefix=/home/mishima/opt/mpi/openmpi-1.7.4a1-pgi13.10 \
> >   --with-tm \
> >   --with-verbs \
> >   --disable-ipv6 \
> >   --disable-vt \
> >   --enable-debug \
> >   CC=pgcc CFLAGS="-tp k8-64e" \
> >   CXX=pgCC CXXFLAGS="-tp k8-64e" \
> >   F77=pgfortran FFLAGS="-tp k8-64e" \
> >   FC=pgfortran FCFLAGS="-tp k8-64e"
> >
> >> Hi Ralph,
> >>
> >> Okay, I can help you. Please give me some time to report the output.
> >>
> >> Tetsuya Mishima
> >>
> >>> I can try, but I have no way of testing Torque any more - so all I can do is a code review. If you can build --enable-debug and add -mca plm_base_verbose 5 to your cmd line, I'd appreciate seeing the output.
> >>>
> >>> On Nov 12, 2013, at 9:58 PM, tmish...@jcity.maeda.co.jp wrote:
> >>>
> >>>> Hi Ralph,
> >>>>
> >>>> Thank you for your quick response.
> >>>>
> >>>> I'd like to report one more regression in the Torque support of openmpi-1.7.4a1r29646, which might be related to "#3893: LAMA mapper has problems", which I reported a few days ago.
> >>>>
> >>>> The script below does not work with openmpi-1.7.4a1r29646, although it worked with openmpi-1.7.3 as I told you before.
> >>>>
> >>>> #!/bin/sh
> >>>> #PBS -l nodes=node08:ppn=8
> >>>> export OMP_NUM_THREADS=1
> >>>> cd $PBS_O_WORKDIR
> >>>> cp $PBS_NODEFILE pbs_hosts
> >>>> NPROCS=`wc -l < pbs_hosts`
> >>>> mpirun -machinefile pbs_hosts -np ${NPROCS} -report-bindings -bind-to core Myprog
> >>>>
> >>>> If I drop "-machinefile pbs_hosts -np ${NPROCS}", then it works fine. Since this happens without a lama request, I guess the problem is not in lama itself. Anyway, please look into this issue as well.
> >>>>
> >>>> Regards,
> >>>> Tetsuya Mishima
> >>>>
> >>>>> Done - thanks!
> >>>>>
> >>>>> On Nov 12, 2013, at 7:35 PM, tmish...@jcity.maeda.co.jp wrote:
> >>>>>
> >>>>>> Dear openmpi developers,
> >>>>>>
> >>>>>> I got a segmentation fault in a trial use of openmpi-1.7.4a1r29646 built with PGI 13.10, as shown below:
> >>>>>>
> >>>>>> [mishima@manage testbed-openmpi-1.7.3]$ mpirun -np 4 -cpus-per-proc 2 -report-bindings mPre
> >>>>>> [manage.cluster:23082] MCW rank 2 bound to socket 0[core 4[hwt 0]], socket 0[core 5[hwt 0]]: [././././B/B][./././././.]
> >>>>>> [manage.cluster:23082] MCW rank 3 bound to socket 1[core 6[hwt 0]], socket 1[core 7[hwt 0]]: [./././././.][B/B/./././.]
> >>>>>> [manage.cluster:23082] MCW rank 0 bound to socket 0[core 0[hwt 0]], socket 0[core 1[hwt 0]]: [B/B/./././.][./././././.]
> >>>>>> [manage.cluster:23082] MCW rank 1 bound to socket 0[core 2[hwt 0]], socket 0[core 3[hwt 0]]: [././B/B/./.][./././././.]
> >>>>>> [manage:23082] *** Process received signal ***
> >>>>>> [manage:23082] Signal: Segmentation fault (11)
> >>>>>> [manage:23082] Signal code: Address not mapped (1)
> >>>>>> [manage:23082] Failing at address: 0x34
> >>>>>> [manage:23082] *** End of error message ***
> >>>>>> Segmentation fault (core dumped)
> >>>>>>
> >>>>>> [mishima@manage testbed-openmpi-1.7.3]$ gdb mpirun core.23082
> >>>>>> GNU gdb (GDB) CentOS (7.0.1-42.el5.centos.1)
> >>>>>> Copyright (C) 2009 Free Software Foundation, Inc.
> >>>>>> ...
> >>>>>> Core was generated by `mpirun -np 4 -cpus-per-proc 2 -report-bindings mPre'.
> >>>>>> Program terminated with signal 11, Segmentation fault.
> >>>>>> #0  0x00002b5f861c9c4f in recv_connect (mod=0x5f861ca20b00007f, sd=32767, hdr=0x1ca20b00007fff25) at ./oob_tcp.c:631
> >>>>>> 631             peer = OBJ_NEW(mca_oob_tcp_peer_t);
> >>>>>> (gdb) where
> >>>>>> #0  0x00002b5f861c9c4f in recv_connect (mod=0x5f861ca20b00007f, sd=32767, hdr=0x1ca20b00007fff25) at ./oob_tcp.c:631
> >>>>>> #1  0x00002b5f861ca20b in recv_handler (sd=1778385023, flags=32767, cbdata=0x8eb06a00007fff25) at ./oob_tcp.c:760
> >>>>>> #2  0x00002b5f848eb06a in event_process_active_single_queue (base=0x5f848eb27000007f, activeq=0x848eb27000007fff) at ./event.c:1366
> >>>>>> #3  0x00002b5f848eb270 in event_process_active (base=0x5f848eb84900007f) at ./event.c:1435
> >>>>>> #4  0x00002b5f848eb849 in opal_libevent2021_event_base_loop (base=0x4077a000007f, flags=32767) at ./event.c:1645
> >>>>>> #5  0x00000000004077a0 in orterun (argc=7, argv=0x7fff25bbd4a8) at ./orterun.c:1030
> >>>>>> #6  0x00000000004067fb in main (argc=7, argv=0x7fff25bbd4a8) at ./main.c:13
> >>>>>> (gdb) quit
> >>>>>>
> >>>>>> Line 627 in orte/mca/oob/tcp/oob_tcp.c is apparently unnecessary (it dereferences peer, which is NULL in that branch) and causes the segfault.
> >>>>>>
> >>>>>>     624         /* lookup the corresponding process */
> >>>>>>     625         peer = mca_oob_tcp_peer_lookup(mod, &hdr->origin);
> >>>>>>     626         if (NULL == peer) {
> >>>>>>     627             ui64 = (uint64_t*)(&peer->name);
> >>>>>>     628             opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
> >>>>>>     629                                 "%s mca_oob_tcp_recv_connect: connection from new peer",
> >>>>>>     630                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
> >>>>>>     631             peer = OBJ_NEW(mca_oob_tcp_peer_t);
> >>>>>>     632             peer->mod = mod;
> >>>>>>     633             peer->name = hdr->origin;
> >>>>>>     634             peer->state = MCA_OOB_TCP_ACCEPTING;
> >>>>>>     635             ui64 = (uint64_t*)(&peer->name);
> >>>>>>     636             if (OPAL_SUCCESS != opal_hash_table_set_value_uint64(&mod->peers, (*ui64), peer)) {
> >>>>>>     637                 OBJ_RELEASE(peer);
> >>>>>>     638                 return;
> >>>>>>     639             }
> >>>>>>
> >>>>>> Please fix this mistake in the next release.
> >>>>>>
> >>>>>> Regards,
> >>>>>> Tetsuya Mishima
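
P.S. For anyone following along, here is a rough sketch of how that new-peer branch in recv_connect() could read once the stray dereference at the old line 627 is removed. This is just the block quoted above with that one line dropped and a couple of comments added, not an official Open MPI patch:

    /* lookup the corresponding process */
    peer = mca_oob_tcp_peer_lookup(mod, &hdr->origin);
    if (NULL == peer) {
        /* peer is NULL in this branch, so it must not be dereferenced yet */
        opal_output_verbose(OOB_TCP_DEBUG_CONNECT,
                            orte_oob_base_framework.framework_output,
                            "%s mca_oob_tcp_recv_connect: connection from new peer",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        peer = OBJ_NEW(mca_oob_tcp_peer_t);
        peer->mod = mod;
        peer->name = hdr->origin;
        peer->state = MCA_OOB_TCP_ACCEPTING;
        ui64 = (uint64_t*)(&peer->name);   /* safe now: peer was just allocated */
        if (OPAL_SUCCESS != opal_hash_table_set_value_uint64(&mod->peers, (*ui64), peer)) {
            OBJ_RELEASE(peer);
            return;
        }
    }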