Hi Siegmar,

Thanks for the detailed report.

I think I found the alignment issue and fixed it in commit
8c556bbc66c06fb19c6e46c67624bac1d6719b12.

The patch is attached below: instead of dereferencing a pointer to
native_pname that may not be 64-bit aligned, it memcpy's the process name
into a properly aligned opal_identifier_t and prints that.

Cheers,

Gilles

On 2014/10/29 5:24, Siegmar Gross wrote:
> Hi Gilles,
>  
>> From the jvm logs, there is an alignment error in native_get_attr
>> but i could not find it by reading the source code.
>>
>> Could you please do
>> ulimit -c unlimited
>> mpiexec ...
>> and then
>> gdb <your path to java>/bin/java core
>> And run bt on all threads until you get a line number in native_get_attr
> I found pmix_native.c:1131 in native_get_attr, attached gdb to the
> Java process, and set a breakpoint at that line. From there I
> single-stepped until I got the SIGSEGV, so that you can see what happened.
>
>
> (gdb) b pmix_native.c:1131
> No source file named pmix_native.c.
> Make breakpoint pending on future shared library load? (y or [n]) y
>
> Breakpoint 1 (pmix_native.c:1131) pending.
> (gdb) thread 14
> [Switching to thread 14 (Thread 2 (LWP 2))]
> #0  0xffffffff7eadc6b0 in __pollsys () from /lib/sparcv9/libc.so.1
> (gdb) f 3
> #3  0xfffffffee5122230 in JNI_OnLoad (vm=0xffffffff7e57e9d8 <main_vm>, 
>     reserved=0x0)
>     at ../../../../../openmpi-dev-178-ga16c1e4/ompi/mpi/java/c/mpi_MPI.c:128
> 128             while (_dbg) poll(NULL, 0, 1);
> (gdb) set _dbg=0
> (gdb) c
> Continuing.
> [New LWP    13        ]
>
> Breakpoint 1, native_get_attr (attr=0xfffffffee2e05db0 "pmix.jobid", 
>     kv=0xffffffff7b4ff028)
>     at ../../../../../openmpi-dev-178-ga16c1e4/opal/mca/pmix/native/pmix_native.c:1131
> 1131            OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
> (gdb) s
> opal_proc_local_get () at ../../../openmpi-dev-178-ga16c1e4/opal/util/proc.c:80
> 80          return opal_proc_my_name;
> (gdb) 
> 81      }
> (gdb) 
> _process_name_print_for_opal (procname=14259803799433510912)
>     at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_init.c:64
> 64          orte_process_name_t* rte_name = (orte_process_name_t*)&procname;
> (gdb) 
> 65          return ORTE_NAME_PRINT(rte_name);
> (gdb) 
> orte_util_print_name_args (name=0xffffffff7b4feb90)
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:122
> 122         if (NULL == name) {
> (gdb) 
> 142         job = orte_util_print_jobids(name->jobid);
> (gdb) 
> orte_util_print_jobids (job=3320119297)
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:170
> 170         ptr = get_print_name_buffer();
> (gdb) 
> get_print_name_buffer ()
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:92
> 92          if (!fns_init) {
> (gdb) 
> 101         ret = opal_tsd_getspecific(print_args_tsd_key, (void**)&ptr);
> (gdb) 
> opal_tsd_getspecific (key=4, valuep=0xffffffff7b4fe8a0)
>     at ../../openmpi-dev-178-ga16c1e4/opal/threads/tsd.h:163
> 163         *valuep = pthread_getspecific(key);
> (gdb) 
> 164         return OPAL_SUCCESS;
> (gdb) 
> 165     }
> (gdb) 
> get_print_name_buffer ()
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:102
> 102         if (OPAL_SUCCESS != ret) return NULL;
> (gdb) 
> 104         if (NULL == ptr) {
> (gdb) 
> 113         return (orte_print_args_buffers_t*) ptr;
> (gdb) 
> 114     }
> (gdb) 
> orte_util_print_jobids (job=3320119297)
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:172
> 172         if (NULL == ptr) {
> (gdb) 
> 178         if (ORTE_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) {
> (gdb) 
> 179             ptr->cntr = 0;
> (gdb) 
> 182         if (ORTE_JOBID_INVALID == job) {
> (gdb) 
> 184         } else if (ORTE_JOBID_WILDCARD == job) {
> (gdb) 
> 187             tmp1 = ORTE_JOB_FAMILY((unsigned long)job);
> (gdb) 
> 188             tmp2 = ORTE_LOCAL_JOBID((unsigned long)job);
> (gdb) 
> 189             snprintf(ptr->buffers[ptr->cntr++], 
> (gdb) 
> 193         return ptr->buffers[ptr->cntr-1];
> (gdb) 
> 194     }
> (gdb) 
> orte_util_print_name_args (name=0xffffffff7b4feb90)
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:143
> 143         vpid = orte_util_print_vpids(name->vpid);
> (gdb) 
> orte_util_print_vpids (vpid=0)
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:260
> 260         ptr = get_print_name_buffer();
> (gdb) 
> get_print_name_buffer ()
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:92
> 92          if (!fns_init) {
> (gdb) 
> 101         ret = opal_tsd_getspecific(print_args_tsd_key, (void**)&ptr);
> (gdb) 
> opal_tsd_getspecific (key=4, valuep=0xffffffff7b4fe8b0)
>     at ../../openmpi-dev-178-ga16c1e4/opal/threads/tsd.h:163
> 163         *valuep = pthread_getspecific(key);
> (gdb) 
> 164         return OPAL_SUCCESS;
> (gdb) 
> 165     }
> (gdb) 
> get_print_name_buffer ()
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:102
> 102         if (OPAL_SUCCESS != ret) return NULL;
> (gdb) 
> 104         if (NULL == ptr) {
> (gdb) 
> 113         return (orte_print_args_buffers_t*) ptr;
> (gdb) 
> 114     }
> (gdb) 
> orte_util_print_vpids (vpid=0)
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:262
> 262         if (NULL == ptr) {
> (gdb) 
> 268         if (ORTE_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) {
> (gdb) 
> 272         if (ORTE_VPID_INVALID == vpid) {
> (gdb) 
> 274         } else if (ORTE_VPID_WILDCARD == vpid) {
> (gdb) 
> 277             snprintf(ptr->buffers[ptr->cntr++], 
> (gdb) 
> 281         return ptr->buffers[ptr->cntr-1];
> (gdb) 
> 282     }
> (gdb) 
> orte_util_print_name_args (name=0xffffffff7b4feb90)
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:146
> 146         ptr = get_print_name_buffer();
> (gdb) 
> get_print_name_buffer ()
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:92
> 92          if (!fns_init) {
> (gdb) 
> 101         ret = opal_tsd_getspecific(print_args_tsd_key, (void**)&ptr);
> (gdb) 
> opal_tsd_getspecific (key=4, valuep=0xffffffff7b4fe970)
>     at ../../openmpi-dev-178-ga16c1e4/opal/threads/tsd.h:163
> 163         *valuep = pthread_getspecific(key);
> (gdb) 
> 164         return OPAL_SUCCESS;
> (gdb) 
> 165     }
> (gdb) 
> get_print_name_buffer ()
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:102
> 102         if (OPAL_SUCCESS != ret) return NULL;
> (gdb) 
> 104         if (NULL == ptr) {
> (gdb) 
> 113         return (orte_print_args_buffers_t*) ptr;
> (gdb) 
> 114     }
> (gdb) 
> orte_util_print_name_args (name=0xffffffff7b4feb90)
>     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:148
> 148         if (NULL == ptr) {
> (gdb) 
> 154         if (ORTE_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) {
> (gdb) 
> 158         snprintf(ptr->buffers[ptr->cntr++], 
> (gdb) 
> 162         return ptr->buffers[ptr->cntr-1];
> (gdb) 
> 163     }
> (gdb) 
> _process_name_print_for_opal (procname=14259803799433510912)
>     at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_init.c:66
> 66      }
> (gdb) 
>
> Program received signal SIGSEGV, Segmentation fault.
> 0xfffffffee3210bfc in native_get_attr (attr=0xfffffffee2e05db0 "pmix.jobid", 
>     kv=0xffffffff7b4ff028)
>     at ../../../../../openmpi-dev-178-ga16c1e4/opal/mca/pmix/native/pmix_native.c:1131
> 1131            OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
> (gdb) bt
> #0  0xfffffffee3210bfc in native_get_attr (
>     attr=0xfffffffee2e05db0 "pmix.jobid", kv=0xffffffff7b4ff028)
>     at ../../../../../openmpi-dev-178-ga16c1e4/opal/mca/pmix/native/pmix_native.c:1131
> #1  0xfffffffee2e033e4 in rte_init ()
>     at ../../../../../openmpi-dev-178-ga16c1e4/orte/mca/ess/pmi/ess_pmi_module.c:170
> #2  0xfffffffee4a340c0 in orte_init (pargc=0x0, pargv=0x0, flags=32)
>     at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_init.c:239
> #3  0xfffffffee4d9a164 in ompi_mpi_init (argc=0, argv=0x1003f5850, 
>     requested=0, provided=0xffffffff7b4ff44c)
>     at ../../openmpi-dev-178-ga16c1e4/ompi/runtime/ompi_mpi_init.c:480
> #4  0xfffffffee4dfbb30 in PMPI_Init (argc=0xffffffff7b4ff554, 
>     argv=0xffffffff7b4ff548) at pinit.c:84
> #5  0xfffffffee5122f6c in Java_mpi_MPI_Init_1jni (env=0x10010e9e0, 
>     clazz=0xffffffff7b4ff760, argv=0xffffffff7b4ff858)
>     at ../../../../../openmpi-dev-178-ga16c1e4/ompi/mpi/java/c/mpi_MPI.c:271
> #6  0xffffffff6b810738 in ?? ()
> #7  0xffffffff6b810738 in ?? ()
> Backtrace stopped: previous frame identical to this frame (corrupt stack?)
> (gdb) 
>
>
>
> Hopefully the above output is helpful. Please let me know if you
> need something else.
>
> Kind regards
>
> Siegmar
>
>
>
>> Siegmar Gross <siegmar.gr...@informatik.hs-fulda.de> wrote:
>>> Hi,
>>>
>>> today I installed openmpi-dev-178-ga16c1e4 on Solaris 10 Sparc
>>> with gcc-4.9.1 and Java 8. Now a very simple Java program works
>>> as expected, but other Java programs still break. I removed the
>>> warnings about "shmem.jar" and used the following configure
>>> command.
>>>
>>> tyr openmpi-dev-178-ga16c1e4-SunOS.sparc.64_gcc 406 head config.log \
>>>  | grep openmpi
>>> $ ../openmpi-dev-178-ga16c1e4/configure
>>>  --prefix=/usr/local/openmpi-1.9.0_64_gcc
>>>  --libdir=/usr/local/openmpi-1.9.0_64_gcc/lib64
>>>  --with-jdk-bindir=/usr/local/jdk1.8.0/bin
>>>  --with-jdk-headers=/usr/local/jdk1.8.0/include
>>>  JAVA_HOME=/usr/local/jdk1.8.0
>>>  LDFLAGS=-m64 CC=gcc CXX=g++ FC=gfortran CFLAGS=-m64 -D_REENTRANT
>>>  CXXFLAGS=-m64 FCFLAGS=-m64 CPP=cpp CXXCPP=cpp
>>>  CPPFLAGS= -D_REENTRANT CXXCPPFLAGS=
>>>  --enable-mpi-cxx --enable-cxx-exceptions --enable-mpi-java
>>>  --enable-mpi-thread-multiple --with-threads=posix
>>>  --with-hwloc=internal
>>>  --without-verbs --with-wrapper-cflags=-std=c11 -m64
>>>  --with-wrapper-cxxflags=-m64 --enable-debug
>>>
>>>
>>> tyr java 290 ompi_info | grep -e "Open MPI repo revision:" -e "C compiler 
>>> version:"
>>>  Open MPI repo revision: dev-178-ga16c1e4
>>>      C compiler version: 4.9.1
>>>
>>>
>>>
>>>>> regarding the BUS error reported by Siegmar, i also commited
>>>>> 62bde1fcb554079143030bb305512c236672386f
>>>>> in order to fix it (this is based on code review only, i have no sparc64
>>>>> hardware to test it is enough)
>>>> I'll test it, when a new nightly snapshot is available for the trunk.
>>>
>>> tyr java 291 mpijavac InitFinalizeMain.java 
>>> tyr java 292 mpiexec -np 1 java InitFinalizeMain
>>> Hello!
>>>
>>> tyr java 293 mpijavac BcastIntMain.java 
>>> tyr java 294 mpiexec -np 2 java BcastIntMain
>>> #
>>> # A fatal error has been detected by the Java Runtime Environment:
>>> #
>>> #  SIGBUS (0xa) at pc=0xfffffffee3210bfc, pid=24792, tid=2
>>> ...
>>>
>>>
>>>
>>> tyr java 296 /usr/local/gdb-7.6.1_64_gcc/bin/gdb mpiexec
>>> ...
>>> (gdb) run -np 2 java BcastIntMain
>>> Starting program: /usr/local/openmpi-1.9.0_64_gcc/bin/mpiexec -np 2 java 
>>> BcastIntMain
>>> [Thread debugging using libthread_db enabled]
>>> [New Thread 1 (LWP 1)]
>>> [New LWP    2        ]
>>> #
>>> # A fatal error has been detected by the Java Runtime Environment:
>>> #
>>> #  SIGBUS (0xa) at pc=0xfffffffee3210bfc, pid=24814, tid=2
>>> #
>>> # JRE version: Java(TM) SE Runtime Environment (8.0-b132) (build 1.8.0-b132)
>>> # Java VM: Java HotSpot(TM) 64-Bit Server VM (25.0-b70 mixed mode 
>>> solaris-sparc compressed oops)
>>> # Problematic frame:
>>> # C  [mca_pmix_native.so+0x10bfc]  native_get_attr+0x3000
>>> #
>>> # Failed to write core dump. Core dumps have been disabled. To enable core dumping, try "ulimit -c unlimited" before starting Java again
>>> #
>>> # An error report file with more information is saved as:
>>> # 
>>> /home/fd1026/work/skripte/master/parallel/prog/mpi/java/hs_err_pid24814.log
>>> #
>>> # A fatal error has been detected by the Java Runtime Environment:
>>> #
>>> #  SIGBUS (0xa) at pc=0xfffffffee3210bfc, pid=24812, tid=2
>>> #
>>> # JRE version: Java(TM) SE Runtime Environment (8.0-b132) (build 1.8.0-b132)
>>> # Java VM: Java HotSpot(TM) 64-Bit Server VM (25.0-b70 mixed mode 
>>> solaris-sparc compressed oops)
>>> # Problematic frame:
>>> # C  [mca_pmix_native.so+0x10bfc]  native_get_attr+0x3000
>>> #
>>> # Failed to write core dump. Core dumps have been disabled. To enable core dumping, try "ulimit -c unlimited" before starting Java again
>>> #
>>> # An error report file with more information is saved as:
>>> # 
>>> /home/fd1026/work/skripte/master/parallel/prog/mpi/java/hs_err_pid24812.log
>>> #
>>> # If you would like to submit a bug report, please visit:
>>> #   http://bugreport.sun.com/bugreport/crash.jsp
>>> # The crash happened outside the Java Virtual Machine in native code.
>>> # See problematic frame for where to report the bug.
>>> #
>>> [tyr:24814] *** Process received signal ***
>>> [tyr:24814] Signal: Abort (6)
>>> [tyr:24814] Signal code:  (-1)
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:opal_backtrace_print+0x2c
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:0xdc2d4
>>> /lib/sparcv9/libc.so.1:0xd8b98
>>> /lib/sparcv9/libc.so.1:0xcc70c
>>> /lib/sparcv9/libc.so.1:0xcc918
>>> /lib/sparcv9/libc.so.1:0xdd2d0 [ Signal 6 (ABRT)]
>>> /lib/sparcv9/libc.so.1:_thr_sigsetmask+0x1c4
>>> /lib/sparcv9/libc.so.1:sigprocmask+0x28
>>> /lib/sparcv9/libc.so.1:_sigrelse+0x5c
>>> /lib/sparcv9/libc.so.1:abort+0xc0
>>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0xb3cb90
>>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0xd97a04
>>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:JVM_handle_solaris_signal+0xc0c
>>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0xb44e84
>>> /lib/sparcv9/libc.so.1:0xd8b98
>>> /lib/sparcv9/libc.so.1:0xcc70c
>>> /lib/sparcv9/libc.so.1:0xcc918
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_pmix_native.so:0x10bfc
>>>  [ Signal 10 (BUS)]
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_ess_pmi.so:0x33dc
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-rte.so.0.0.0:orte_init+0x67c
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:ompi_mpi_init+0x374
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:PMPI_Init+0x2a8
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi_java.so.0.0.0:Java_mpi_MPI_Init_1jni+0x1a0
>>> 0xffffffff6b810730
>>> 0xffffffff6b8106d4
>>> 0xffffffff6b8078a8
>>> 0xffffffff6b8078a8
>>> 0xffffffff6b80024c
>>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0x6fd4e8
>>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0x79331c
>>> /export2/prog/SunOS_sparc/jdk1.8.0/lib/sparcv9/jli/libjli.so:0x7290
>>> /lib/sparcv9/libc.so.1:0xd8a6c
>>> [tyr:24814] *** End of error message ***
>>> --------------------------------------------------------------------------
>>> mpiexec noticed that process rank 1 with PID 0 on node tyr exited on signal 
>>> 6 (Abort).
>>> --------------------------------------------------------------------------
>>> [LWP    2         exited]
>>> [New Thread 2        ]
>>> [Switching to Thread 1 (LWP 1)]
>>> sol_thread_fetch_registers: td_ta_map_id2thr: no thread can be found to 
>>> satisfy query
>>> (gdb) bt
>>> #0  0xffffffff7f6173d0 in rtld_db_dlactivity () from 
>>> /usr/lib/sparcv9/ld.so.1
>>> #1  0xffffffff7f6175a8 in rd_event () from /usr/lib/sparcv9/ld.so.1
>>> #2  0xffffffff7f618950 in lm_delete () from /usr/lib/sparcv9/ld.so.1
>>> #3  0xffffffff7f6226bc in remove_so () from /usr/lib/sparcv9/ld.so.1
>>> #4  0xffffffff7f624574 in remove_hdl () from /usr/lib/sparcv9/ld.so.1
>>> #5  0xffffffff7f61d97c in dlclose_core () from /usr/lib/sparcv9/ld.so.1
>>> #6  0xffffffff7f61d9d4 in dlclose_intn () from /usr/lib/sparcv9/ld.so.1
>>> #7  0xffffffff7f61db0c in dlclose () from /usr/lib/sparcv9/ld.so.1
>>> #8  0xffffffff7ec87ca0 in vm_close ()
>>>   from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0
>>> #9  0xffffffff7ec85274 in lt_dlclose ()
>>>   from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0
>>> #10 0xffffffff7ecaa5dc in ri_destructor (obj=0x100187b70)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:382
>>> #11 0xffffffff7eca8fd8 in opal_obj_run_destructors (object=0x100187b70)
>>>    at ../../../../openmpi-dev-178-ga16c1e4/opal/class/opal_object.h:446
>>> #12 0xffffffff7eca9eac in mca_base_component_repository_release (
>>>    component=0xffffffff7b1236f0 <mca_oob_tcp_component>)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:240
>>> #13 0xffffffff7ecac17c in mca_base_component_unload (
>>>    component=0xffffffff7b1236f0 <mca_oob_tcp_component>, output_id=-1)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:47
>>> #14 0xffffffff7ecac210 in mca_base_component_close (
>>>    component=0xffffffff7b1236f0 <mca_oob_tcp_component>, output_id=-1)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:60
>>> #15 0xffffffff7ecac2e4 in mca_base_components_close (output_id=-1, 
>>>    components=0xffffffff7f14bc58 <orte_oob_base_framework+80>, skip=0x0)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:86
>>> #16 0xffffffff7ecac24c in mca_base_framework_components_close (
>>>    framework=0xffffffff7f14bc08 <orte_oob_base_framework>, skip=0x0)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:66
>>> #17 0xffffffff7efcaf80 in orte_oob_base_close ()
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/orte/mca/oob/base/oob_base_frame.c:112
>>> #18 0xffffffff7ecc0d74 in mca_base_framework_close (
>>>    framework=0xffffffff7f14bc08 <orte_oob_base_framework>)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_framework.c:187
>>> #19 0xffffffff7be07858 in rte_finalize ()
>>>    at 
>>> ../../../../../openmpi-dev-178-ga16c1e4/orte/mca/ess/hnp/ess_hnp_module.c:857
>>> #20 0xffffffff7ef338bc in orte_finalize ()
>>>    at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_finalize.c:66
>>> #21 0x000000010000723c in orterun (argc=5, argv=0xffffffff7fffe0d8)
>>>    at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/orterun.c:1103
>>> #22 0x0000000100003e80 in main (argc=5, argv=0xffffffff7fffe0d8)
>>> ---Type <return> to continue, or q <return> to quit---
>>>    at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/main.c:13
>>> (gdb) 
>>>
>>>
>>>
>>>
>>> I get the same error for C programs, if they use more than
>>> MPI_Init and MPI_Finalize.
>>>
>>> tyr small_prog 301 mpicc init_finalize.c 
>>> tyr small_prog 302 mpiexec -np 1 a.out
>>> Hello!
>>> tyr small_prog 303 mpicc column_int.c 
>>> tyr small_prog 306 /usr/local/gdb-7.6.1_64_gcc/bin/gdb mpiexec
>>> ...
>>> (gdb) run -np 4 a.out
>>> Starting program: /usr/local/openmpi-1.9.0_64_gcc/bin/mpiexec -np 4 a.out
>>> [Thread debugging using libthread_db enabled]
>>> [New Thread 1 (LWP 1)]
>>> [New LWP    2        ]
>>> [tyr:24880] *** Process received signal ***
>>> [tyr:24880] Signal: Bus Error (10)
>>> [tyr:24880] Signal code: Invalid address alignment (1)
>>> [tyr:24880] Failing at address: ffffffff7bd1c10c
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:opal_backtrace_print+0x2c
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:0xdc2d4
>>> /lib/sparcv9/libc.so.1:0xd8b98
>>> /lib/sparcv9/libc.so.1:0xcc70c
>>> /lib/sparcv9/libc.so.1:0xcc918
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_pmix_native.so:0x10684
>>>  [ Signal 10 (BUS)]
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_ess_pmi.so:0x33dc
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-rte.so.0.0.0:orte_init+0x67c
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:ompi_mpi_init+0x374
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:PMPI_Init+0x2a8
>>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:main+0x20
>>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:_start+0x7c
>>> [tyr:24880] *** End of error message ***
>>> [tyr:24876] *** Process received signal ***
>>> [tyr:24876] Signal: Bus Error (10)
>>> [tyr:24876] Signal code: Invalid address alignment (1)
>>> [tyr:24876] Failing at address: ffffffff7bd1c10c
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:opal_backtrace_print+0x2c
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:0xdc2d4
>>> /lib/sparcv9/libc.so.1:0xd8b98
>>> /lib/sparcv9/libc.so.1:0xcc70c
>>> /lib/sparcv9/libc.so.1:0xcc918
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_pmix_native.so:0x10684
>>>  [ Signal 10 (BUS)]
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_ess_pmi.so:0x33dc
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-rte.so.0.0.0:orte_init+0x67c
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:ompi_mpi_init+0x374
>>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:PMPI_Init+0x2a8
>>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:main+0x20
>>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:_start+0x7c
>>> [tyr:24876] *** End of error message ***
>>> --------------------------------------------------------------------------
>>> mpiexec noticed that process rank 2 with PID 0 on node tyr exited on signal 
>>> 10 (Bus Error).
>>> --------------------------------------------------------------------------
>>> [LWP    2         exited]
>>> [New Thread 2        ]
>>> [Switching to Thread 1 (LWP 1)]
>>> sol_thread_fetch_registers: td_ta_map_id2thr: no thread can be found to 
>>> satisfy query
>>> (gdb) bt
>>> #0  0xffffffff7f6173d0 in rtld_db_dlactivity () from 
>>> /usr/lib/sparcv9/ld.so.1
>>> #1  0xffffffff7f6175a8 in rd_event () from /usr/lib/sparcv9/ld.so.1
>>> #2  0xffffffff7f618950 in lm_delete () from /usr/lib/sparcv9/ld.so.1
>>> #3  0xffffffff7f6226bc in remove_so () from /usr/lib/sparcv9/ld.so.1
>>> #4  0xffffffff7f624574 in remove_hdl () from /usr/lib/sparcv9/ld.so.1
>>> #5  0xffffffff7f61d97c in dlclose_core () from /usr/lib/sparcv9/ld.so.1
>>> #6  0xffffffff7f61d9d4 in dlclose_intn () from /usr/lib/sparcv9/ld.so.1
>>> #7  0xffffffff7f61db0c in dlclose () from /usr/lib/sparcv9/ld.so.1
>>> #8  0xffffffff7ec87ca0 in vm_close ()
>>>   from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0
>>> #9  0xffffffff7ec85274 in lt_dlclose ()
>>>   from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0
>>> #10 0xffffffff7ecaa5dc in ri_destructor (obj=0x100187ae0)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:382
>>> #11 0xffffffff7eca8fd8 in opal_obj_run_destructors (object=0x100187ae0)
>>>    at ../../../../openmpi-dev-178-ga16c1e4/opal/class/opal_object.h:446
>>> #12 0xffffffff7eca9eac in mca_base_component_repository_release (
>>>    component=0xffffffff7b0236f0 <mca_oob_tcp_component>)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:240
>>> #13 0xffffffff7ecac17c in mca_base_component_unload (
>>>    component=0xffffffff7b0236f0 <mca_oob_tcp_component>, output_id=-1)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:47
>>> #14 0xffffffff7ecac210 in mca_base_component_close (
>>>    component=0xffffffff7b0236f0 <mca_oob_tcp_component>, output_id=-1)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:60
>>> #15 0xffffffff7ecac2e4 in mca_base_components_close (output_id=-1, 
>>>    components=0xffffffff7f14bc58 <orte_oob_base_framework+80>, skip=0x0)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:86
>>> #16 0xffffffff7ecac24c in mca_base_framework_components_close (
>>>    framework=0xffffffff7f14bc08 <orte_oob_base_framework>, skip=0x0)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:66
>>> #17 0xffffffff7efcaf80 in orte_oob_base_close ()
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/orte/mca/oob/base/oob_base_frame.c:112
>>> #18 0xffffffff7ecc0d74 in mca_base_framework_close (
>>>    framework=0xffffffff7f14bc08 <orte_oob_base_framework>)
>>>    at 
>>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_framework.c:187
>>> #19 0xffffffff7bd07858 in rte_finalize ()
>>>    at 
>>> ../../../../../openmpi-dev-178-ga16c1e4/orte/mca/ess/hnp/ess_hnp_module.c:857
>>> #20 0xffffffff7ef338bc in orte_finalize ()
>>>    at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_finalize.c:66
>>> #21 0x000000010000723c in orterun (argc=4, argv=0xffffffff7fffe0e8)
>>>    at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/orterun.c:1103
>>> #22 0x0000000100003e80 in main (argc=4, argv=0xffffffff7fffe0e8)
>>>    at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/main.c:13
>>> (gdb) 
>>>
>>>
>>>
>>> Do you need any other information?
>>>
>>>
>>> Kind regards
>>>
>>> Siegmar

commit 8c556bbc66c06fb19c6e46c67624bac1d6719b12
Author: Gilles Gouaillardet <gilles.gouaillar...@iferc.org>
Date:   Wed Oct 29 13:19:23 2014 +0900

    pmix: fix alignment issue

diff --git a/opal/mca/pmix/native/pmix_native.c b/opal/mca/pmix/native/pmix_native.c
index 6e771ea..b3c03da 100644
--- a/opal/mca/pmix/native/pmix_native.c
+++ b/opal/mca/pmix/native/pmix_native.c
@@ -1097,6 +1097,7 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
             continue;
         }
         native_pname.vid = vid;
+        memcpy(&id, &native_pname, sizeof(opal_identifier_t));
 #if OPAL_HAVE_HWLOC
         OBJ_CONSTRUCT(&vals, opal_list_t);
         if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, (opal_identifier_t*)&native_pname,
@@ -1104,7 +1105,7 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
             opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                 "%s cpuset for local proc %s not found",
                                 OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
-                                OPAL_NAME_PRINT(*(opal_identifier_t*)&native_pname));
+                                OPAL_NAME_PRINT(id));
             OPAL_LIST_DESTRUCT(&vals);
             /* even though the cpuset wasn't found, we at least know it is
              * on the same node with us */
@@ -1131,7 +1132,7 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
         OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                              "%s pmix:native proc %s locality %s",
                              OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
-                             OPAL_NAME_PRINT(*(opal_identifier_t*)&native_pname),
+                             OPAL_NAME_PRINT(id),
                              opal_hwloc_base_print_locality(locality)));

         OBJ_CONSTRUCT(&kvn, opal_value_t);
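
For anyone who wants to see the failure mechanism in isolation, here is a
small standalone C program (not Open MPI code; the struct, field names and
values are made up for illustration) showing the trap the patch works
around: on SPARC, dereferencing a 64-bit pointer that is only 4-byte
aligned raises SIGBUS ("invalid address alignment", exactly what the crash
logs show), while memcpy'ing the same bytes into a properly aligned local
is fine.

#include <inttypes.h>
#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for the two 32-bit halves of the pmix process name. */
struct pname {
    uint32_t jobid;
    uint32_t vpid;
};

int main(void)
{
    struct pname name = { 0xc5e60001u, 0 };

    /* Put the name bytes at an address that is 4-byte but not 8-byte
     * aligned, the way a 32-bit member inside a larger struct can be. */
    alignas(8) unsigned char raw[16];
    memcpy(raw + 4, &name, sizeof(name));

    /* Unsafe on strict-alignment CPUs such as SPARC:
     *     uint64_t bad = *(uint64_t *)(raw + 4);
     * this is the same pattern as the old
     * OPAL_NAME_PRINT(*(opal_identifier_t*)&native_pname) and can fault
     * with SIGBUS. */

    /* What the patch does instead: copy the bytes into an object the
     * compiler guarantees is suitably aligned for a 64-bit load. */
    uint64_t id;
    memcpy(&id, raw + 4, sizeof(id));
    printf("id = 0x%016" PRIx64 "\n", id);
    return 0;
}

x86 happens to tolerate the misaligned load, which is why the cast only
blew up on sparc.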
