Hello,

I am trying to experiment with MPI process binding and the LAMA package
looks very promising for my needs. However, I am running into what
appear to be internal errors in LAMA.

From what I can tell, my command line options are accepted gracefully,
but when they are actually acted upon, the application segfaults.

Has anyone seen similar issues? I have tried with Open-MPI 1.8.6 and
Open-MPI 1.8.8 with the same behaviour. I _think_ my command line
options are valid, and in any case I think invalid options should
generate an error rather than a crash if they were incorrect.

Command line to be used on host 0 (host_225):
mpirun -np 32 -report-bindings -display-map -mca orte_base_help_aggregate 0 \
 -x MXM_RDMA_PORTS=mlx5_0:1 -mca btl_openib_if_include mlx5_0:1 \
 --hostfile /home/lnsmeds/.lsbatch/1439148094.25601.hostfile \
 --prefix /hpc/base/mellanox/hpcx/exp/icc/ompi-mellanox-v1.8-local \
 -x LD_LIBRARY_PATH \
 --mca rmaps lama --mca rmaps_base_verbose 10 --mca rmaps_lama_map hcsnNb \
 --mca rmaps_lama_bind 1N --mca rmaps_lama_ordering s \
 --mca rmaps_lama_mppr 2:N,1:c  \
 -x custom_mxm_tl=rc -x MXM_TLS=self,shm,rc -x MXM_RC_QP_LIMIT=-1 \
 -x custom_fca=off -mca coll_fca_enable 0 \
 -x custom_hcol=on  -mca coll_hcoll_enable 1 -x HCOLL_IB_IF_INCLUDE=mlx5_0:1 \
 -x HCOLL_ENABLE_MCAST_ALL=1  \
 -x custom_mxm=on -mca pml yalla  ./myapplication.exe  2>&1

Sun Aug  9 21:20:56 CEST 2015
[host_225:09102] mca: base: components_register: registering rmaps components
[host_225:09102] mca: base: components_register: found loaded component lama
[host_225:09102] mca:rmaps:lama: Priority   0
[host_225:09102] mca:rmaps:lama: Map   : hcsnNb
[host_225:09102] mca:rmaps:lama: Bind  : 1N
[host_225:09102] mca:rmaps:lama: MPPR  : 2:N,1:c
[host_225:09102] mca:rmaps:lama: Order : s
[host_225:09102] mca: base: components_register: component lama register 
function successful
[host_225:09102] [[18162,0],0] rmaps:base set policy with NULL
[host_225:09102] mca: base: components_open: opening rmaps components
[host_225:09102] mca: base: components_open: found loaded component lama
[host_225:09102] mca:rmaps:select: checking available component lama
[host_225:09102] mca:rmaps:select: Querying component [lama]
[host_225:09102] [[18162,0],0]: Final mapper priorities
[host_225:09102]        Mapper: lama Priority: 0
[host_225:09102] mca:rmaps: mapping job [18162,1]
[host_225:09102] mca:rmaps: creating new map for job [18162,1]
[host_225:09102] mca:rmaps: nprocs 32
[host_225:09102] mca:rmaps[139] mapping not given - using bysocket
[host_225:09102] mca:rmaps:lama: Mapping job [18162,1]
[host_225:09102] mca:rmaps:lama: Revised Parameters -----
[host_225:09102] mca:rmaps:lama: Map   : hcsnNb
[host_225:09102] mca:rmaps:lama: Bind  : 1N
[host_225:09102] mca:rmaps:lama: MPPR  : 2:N,1:c
[host_225:09102] mca:rmaps:lama: Order : s
[host_225:09102] mca:rmaps:lama: ---------------------------------
[host_225:09102] mca:rmaps:lama: ----- Binding  : [1N]
[host_225:09102] mca:rmaps:lama: ----- Binding  :    1 x       NUMA
[host_225:09102] mca:rmaps:lama: ---------------------------------
[host_225:09102] mca:rmaps:lama: ----- Mapping  : [hcsnNb]
[host_225:09102] mca:rmaps:lama: ----- Mapping  : (0) Hw. Thread (8 vs 0)
[host_225:09102] mca:rmaps:lama: ----- Mapping  : (1)       Core (7 vs 1)
[host_225:09102] mca:rmaps:lama: ----- Mapping  : (2)     Socket (3 vs 2)
[host_225:09102] mca:rmaps:lama: ----- Mapping  : (3)    Machine (0 vs 3)
[host_225:09102] mca:rmaps:lama: ----- Mapping  : (4)       NUMA (2 vs 7)
[host_225:09102] mca:rmaps:lama: ----- Mapping  : (5)      Board (1 vs 8)
[host_225:09102] mca:rmaps:lama: ---------------------------------
[host_225:09102] mca:rmaps:lama: ----- MPPR     : [2:N,1:c]
[host_225:09102] mca:rmaps:lama: ----- MPPR     :    2 at       NUMA
[host_225:09102] mca:rmaps:lama: ----- MPPR     :    1 at       Core
[host_225:09102] mca:rmaps:lama: ---------------------------------
[host_225:09102] mca:rmaps:lama: ----- Ordering : [s]
[host_225:09102] mca:rmaps:lama: ----- Ordering : Sequential
[host_225:09102] mca:rmaps:lama: ---------------------------------
[host_225:09102] AVAILABLE NODES FOR MAPPING:
[host_225:09102]     node: host_225 daemon: 0
[host_225:09102]     node: host_229 daemon: 1
[host_225:09102]     node: host_140 daemon: 2
[host_225:09102]     node: host_342 daemon: 3
[host_225:09102] mca:rmaps:lama: ---------------------------------
[host_225:09102] mca:rmaps:lama: ----- Building the Max Tree...
[host_225:09102] mca:rmaps:lama: ---------------------------------
[host_225:09102] mca:rmaps:lama: ----- Converting Remote Tree: host_225
[host_225:09102] mca:rmaps:lama: ----- Converting Remote Tree: host_229
[host_225:09102] mca:rmaps:lama: ----- Converting Remote Tree: host_140
[host_225:09102] mca:rmaps:lama: ----- Converting Remote Tree: host_342
[host_225:09102] *** Process received signal ***
[host_225:09102] Signal: Segmentation fault (11)
[host_225:09102] Signal code: Address not mapped (1)
[host_225:09102] Failing at address: 0x60
[host_225:09102] [ 0] /lib64/libpthread.so.0(+0xf710)[0x2b0ad199f710]
[host_225:09102] [ 1] 
/hpc/base/mellanox/hpcx/exp/icc/ompi-mellanox-v1.8-local/lib/openmpi/mca_rmaps_lama.so(+0x7366)[0x2b0ad625b366]
[host_225:09102] [ 2] 
/hpc/base/mellanox/hpcx/exp/icc/ompi-mellanox-v1.8-local/lib/openmpi/mca_rmaps_lama.so(+0x73c9)[0x2b0ad625b3c9]
[host_225:09102] [ 3] 
/hpc/base/mellanox/hpcx/exp/icc/ompi-mellanox-v1.8-local/lib/openmpi/mca_rmaps_lama.so(rmaps_lama_build_max_tree+0x63f)[0x2b0ad625a7cf]
[host_225:09102] [ 4] 
/hpc/base/mellanox/hpcx/exp/icc/ompi-mellanox-v1.8-local/lib/openmpi/mca_rmaps_lama.so(+0x3573)[0x2b0ad6257573]
[host_225:09102] [ 5] 
/hpc/base/mellanox/hpcx/exp/icc/ompi-mellanox-v1.8-local/lib/openmpi/mca_rmaps_lama.so(+0x2d7b)[0x2b0ad6256d7b]
[host_225:09102] [ 6] 
/hpc/base/mellanox/hpcx/exp/icc/ompi-mellanox-v1.8-local/lib/libopen-rte.so.7(orte_rmaps_base_map_job+0x6ad)[0x2b0ad078db0d]
[host_225:09102] [ 7] 
/hpc/base/mellanox/hpcx/exp/icc/ompi-mellanox-v1.8-local/lib/libopen-pal.so.6(opal_libevent2021_event_base_loop+0xa19)[0x2b0ad0a3d489]
[host_225:09102] [ 8] mpirun[0x4064e9]
[host_225:09102] [ 9] mpirun[0x4040f4]
[host_225:09102] [10] /lib64/libc.so.6(__libc_start_main+0xfd)[0x2b0ad1bccd1d]
[host_225:09102] [11] mpirun[0x404019]
[host_225:09102] *** End of error message ***
Sun Aug  9 21:20:56 CEST 2015


Post mortem with gdb on core file gives:

(gdb) info threads
  2 Thread 0x2b0ad4963700 (LWP 9104)  0x00002b0ad1c8f5e3 in select () from 
/lib64/libc.so.6
* 1 Thread 0x2b0ad3734680 (LWP 9102)  opal_pointer_array_get_item 
(node=0x7987e0, obj=0x7579f0)
    at ../../../../../../openmpi-1.8.8/opal/class/opal_pointer_array.h:130
(gdb) where
#0  opal_pointer_array_get_item (node=0x7987e0, obj=0x7579f0)
    at ../../../../../../openmpi-1.8.8/opal/class/opal_pointer_array.h:130
#1  rmaps_lama_annotate_node_for_mppr (node=0x7987e0, obj=0x7579f0)
    at 
../../../../../../openmpi-1.8.8/orte/mca/rmaps/lama/rmaps_lama_max_tree.c:377
#2  0x00002b0ad625b3c9 in rmaps_lama_annotate_node_for_mppr (node=0x7987e0, 
obj=0x7579f0)
    at 
../../../../../../openmpi-1.8.8/orte/mca/rmaps/lama/rmaps_lama_max_tree.c:396
#3  0x00002b0ad625a7cf in rmaps_lama_build_max_tree (jdata=0x7987e0, 
node_list=0x7579f0, max_tree=0x1, is_homogeneous=0x0)
    at 
../../../../../../openmpi-1.8.8/orte/mca/rmaps/lama/rmaps_lama_max_tree.c:226
#4  0x00002b0ad6257573 in orte_rmaps_lama_map_core (jdata=0x7987e0)
    at 
../../../../../../openmpi-1.8.8/orte/mca/rmaps/lama/rmaps_lama_module.c:662
#5  0x00002b0ad6256d7b in orte_rmaps_lama_map (jdata=0x7987e0)
    at 
../../../../../../openmpi-1.8.8/orte/mca/rmaps/lama/rmaps_lama_module.c:308
#6  0x00002b0ad078db0d in orte_rmaps_base_map_job (fd=7964640, args=31216, 
cbdata=0x1)
    at ../../../../../openmpi-1.8.8/orte/mca/rmaps/base/rmaps_base_map_job.c:370
#7  0x00002b0ad0a3d489 in event_process_active_single_queue (base=0x7987e0, 
flags=7698928)
    at 
../../../../../../../openmpi-1.8.8/opal/mca/event/libevent2021/libevent/event.c:1367
#8  event_process_active (base=0x7987e0, flags=7698928)
    at 
../../../../../../../openmpi-1.8.8/opal/mca/event/libevent2021/libevent/event.c:1437
#9  opal_libevent2021_event_base_loop (base=0x7987e0, flags=7698928)
    at 
../../../../../../../openmpi-1.8.8/opal/mca/event/libevent2021/libevent/event.c:1647
#10 0x00000000004064e9 in orterun (argc=63, argv=0x7fffa5fc68b8)
    at ../../../../../openmpi-1.8.8/orte/tools/orterun/orterun.c:1133
#11 0x00000000004040f4 in main (argc=63, argv=0x7fffa5fc68b8) at 
../../../../../openmpi-1.8.8/orte/tools/orterun/main.c:13
(gdb) thread 2
[Switching to thread 2 (Thread 0x2b0ad4963700 (LWP 9104))]#0  
0x00002b0ad1c8f5e3 in select () from /lib64/libc.so.6
(gdb) where
#0  0x00002b0ad1c8f5e3 in select () from /lib64/libc.so.6
#1  0x00002b0ad4555b12 in listen_thread (obj=0xd)
    at ../../../../../../openmpi-1.8.8/orte/mca/oob/tcp/oob_tcp_listener.c:685
#2  0x00002b0ad19979d1 in start_thread () from /lib64/libpthread.so.0
#3  0x00002b0ad1c96b6d in clone () from /lib64/libc.so.6

(gdb) thread 1
(gdb) list 125,140
125     static inline void *opal_pointer_array_get_item(opal_pointer_array_t 
*table, 
126                                                     int element_index)
127     {
128         void *p;
129     
130         if( table->size <= element_index ) {
131             return NULL;
132         }
133         OPAL_THREAD_LOCK(&(table->lock));
134         p = table->addr[element_index];
135         OPAL_THREAD_UNLOCK(&(table->lock));
136         return p;
137     }
138     
139     

(gdb) up
#1  rmaps_lama_annotate_node_for_mppr (node=0x7987e0, obj=0x7579f0)
    at 
../../../../../../openmpi-1.8.8/orte/mca/rmaps/lama/rmaps_lama_max_tree.c:377
377         mppr_accounting = 
(rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, 
node->index);
(gdb) list 370,390
370             }
371         }
372     
373     
374         /*
375          * Add node information if it is not already there
376          */
377         mppr_accounting = 
(rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, 
node->index);
378         if( NULL == mppr_accounting ) {
379             /*
380              * Add MPPR accounting for this node associated with this object
381              */
382             rmaps_lama_convert_hwloc_key_to_lama_key(obj->type, 
obj->attr->cache.depth, &lama_key);
383     
384             mppr_accounting = 
(rmaps_lama_node_mppr_t*)malloc(sizeof(rmaps_lama_node_mppr_t));
385             mppr_accounting->max = rmaps_lama_get_mppr_for_key(node, 
lama_key);
386             mppr_accounting->cur =  0;
387     
388             opal_pointer_array_set_item(hwloc_userdata->node_mppr, 
node->index, mppr_accounting);
389         }
390     

(gdb) p hwloc_userdata->node_mppr
$1 = (opal_pointer_array_t *) 0x8
(gdb) p *hwloc_userdata->node_mppr
Cannot access memory at address 0x8
(gdb) p node->index
$2 = 0

Reply via email to