Hi Siegmar,

Thanks for the detailed report.
I think I found the alignment issue and fixed it (commit 8c556bbc66c06fb19c6e46c67624bac1d6719b12). The patch that fixes the issue is attached.

Cheers,

Gilles

On 2014/10/29 5:24, Siegmar Gross wrote:
> Hi Gilles, > >> From the jvm logs, there is an alignment error in native_get_attr >> but i could not find it by reading the source code. >> >> Could you please do >> ulimit -c unlimited >> mpiexec ... >> and then >> gdb <your path to java>/bin/java core >> And run bt on all threads until you get a line number in native_get_attr > I found pmix_native.c:1131 in native_get_attr, attached gdb to the > Java process and set a breakpoint to this line. From there I single > stepped until I got SIGSEGV, so that you can see what happened. > > > (gdb) b pmix_native.c:1131 > No source file named pmix_native.c. > Make breakpoint pending on future shared library load? (y or [n]) y > > Breakpoint 1 (pmix_native.c:1131) pending. > (gdb) thread 14 > [Switching to thread 14 (Thread 2 (LWP 2))] > #0 0xffffffff7eadc6b0 in __pollsys () from /lib/sparcv9/libc.so.1 > (gdb) f 3 > #3 0xfffffffee5122230 in JNI_OnLoad (vm=0xffffffff7e57e9d8 <main_vm>, > reserved=0x0) > at ../../../../../openmpi-dev-178-ga16c1e4/ompi/mpi/java/c/mpi_MPI.c:128 > 128 while (_dbg) poll(NULL, 0, 1); > (gdb) set _dbg=0 > (gdb) c > Continuing. > [New LWP 13 ] > > Breakpoint 1, native_get_attr (attr=0xfffffffee2e05db0 "pmix.jobid", > kv=0xffffffff7b4ff028) > at > ../../../../../openmpi-dev-178-ga16c1e4/opal/mca/pmix/native/pmix_native.c:1131 > 1131 OPAL_OUTPUT_VERBOSE((1, > opal_pmix_base_framework.framework_output, > (gdb) s > opal_proc_local_get () at > ../../../openmpi-dev-178-ga16c1e4/opal/util/proc.c:80 > 80 return opal_proc_my_name; > (gdb) > 81 } > (gdb) > _process_name_print_for_opal (procname=14259803799433510912) > at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_init.c:64 > 64 orte_process_name_t* rte_name = (orte_process_name_t*)&procname; > (gdb) > 65 return ORTE_NAME_PRINT(rte_name); > (gdb) > orte_util_print_name_args (name=0xffffffff7b4feb90) > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:122 > 122 if (NULL == name) { > (gdb) > 142 job = orte_util_print_jobids(name->jobid); > (gdb) > orte_util_print_jobids (job=3320119297) > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:170 > 170 ptr = get_print_name_buffer(); > (gdb) > get_print_name_buffer () > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:92 > 92 if (!fns_init) { > (gdb) > 101 ret = opal_tsd_getspecific(print_args_tsd_key, (void**)&ptr); > (gdb) > opal_tsd_getspecific (key=4, valuep=0xffffffff7b4fe8a0) > at ../../openmpi-dev-178-ga16c1e4/opal/threads/tsd.h:163 > 163 *valuep = pthread_getspecific(key); > (gdb) > 164 return OPAL_SUCCESS; > (gdb) > 165 } > (gdb) > get_print_name_buffer () > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:102 > 102 if (OPAL_SUCCESS != ret) return NULL; > (gdb) > 104 if (NULL == ptr) { > (gdb) > 113 return (orte_print_args_buffers_t*) ptr; > (gdb) > 114 } > (gdb) > orte_util_print_jobids (job=3320119297) > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:172 > 172 if (NULL == ptr) { > (gdb) > 178 if (ORTE_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) { > (gdb) > 179 ptr->cntr = 0; > (gdb) > 182 if (ORTE_JOBID_INVALID == job) { > (gdb) > 184 } else if (ORTE_JOBID_WILDCARD == job) { > (gdb) > 187 tmp1 = ORTE_JOB_FAMILY((unsigned long)job); > (gdb) > 188 tmp2 = ORTE_LOCAL_JOBID((unsigned long)job); > (gdb) > 189 snprintf(ptr->buffers[ptr->cntr++], > (gdb) > 193 return ptr->buffers[ptr->cntr-1]; > (gdb) > 194 }
> (gdb) > orte_util_print_name_args (name=0xffffffff7b4feb90) > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:143 > 143 vpid = orte_util_print_vpids(name->vpid); > (gdb) > orte_util_print_vpids (vpid=0) > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:260 > 260 ptr = get_print_name_buffer(); > (gdb) > get_print_name_buffer () > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:92 > 92 if (!fns_init) { > (gdb) > 101 ret = opal_tsd_getspecific(print_args_tsd_key, (void**)&ptr); > (gdb) > opal_tsd_getspecific (key=4, valuep=0xffffffff7b4fe8b0) > at ../../openmpi-dev-178-ga16c1e4/opal/threads/tsd.h:163 > 163 *valuep = pthread_getspecific(key); > (gdb) > 164 return OPAL_SUCCESS; > (gdb) > 165 } > (gdb) > get_print_name_buffer () > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:102 > 102 if (OPAL_SUCCESS != ret) return NULL; > (gdb) > 104 if (NULL == ptr) { > (gdb) > 113 return (orte_print_args_buffers_t*) ptr; > (gdb) > 114 } > (gdb) > orte_util_print_vpids (vpid=0) > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:262 > 262 if (NULL == ptr) { > (gdb) > 268 if (ORTE_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) { > (gdb) > 272 if (ORTE_VPID_INVALID == vpid) { > (gdb) > 274 } else if (ORTE_VPID_WILDCARD == vpid) { > (gdb) > 277 snprintf(ptr->buffers[ptr->cntr++], > (gdb) > 281 return ptr->buffers[ptr->cntr-1]; > (gdb) > 282 } > (gdb) > orte_util_print_name_args (name=0xffffffff7b4feb90) > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:146 > 146 ptr = get_print_name_buffer(); > (gdb) > get_print_name_buffer () > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:92 > 92 if (!fns_init) { > (gdb) > 101 ret = opal_tsd_getspecific(print_args_tsd_key, (void**)&ptr); > (gdb) > opal_tsd_getspecific (key=4, valuep=0xffffffff7b4fe970) > at ../../openmpi-dev-178-ga16c1e4/opal/threads/tsd.h:163 > 163 *valuep = pthread_getspecific(key); > (gdb) > 164 return OPAL_SUCCESS; > (gdb) > 165 } > (gdb) > get_print_name_buffer () > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:102 > 102 if (OPAL_SUCCESS != ret) return NULL; > (gdb) > 104 if (NULL == ptr) { > (gdb) > 113 return (orte_print_args_buffers_t*) ptr; > (gdb) > 114 } > (gdb) > orte_util_print_name_args (name=0xffffffff7b4feb90) > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:148 > 148 if (NULL == ptr) { > (gdb) > 154 if (ORTE_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) { > (gdb) > 158 snprintf(ptr->buffers[ptr->cntr++], > (gdb) > 162 return ptr->buffers[ptr->cntr-1]; > (gdb) > 163 } > (gdb) > _process_name_print_for_opal (procname=14259803799433510912) > at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_init.c:66 > 66 } > (gdb) > > Program received signal SIGSEGV, Segmentation fault. 
> 0xfffffffee3210bfc in native_get_attr (attr=0xfffffffee2e05db0 "pmix.jobid", > kv=0xffffffff7b4ff028) > at > ../../../../../openmpi-dev-178-ga16c1e4/opal/mca/pmix/native/pmix_native.c:1131 > 1131 OPAL_OUTPUT_VERBOSE((1, > opal_pmix_base_framework.framework_output, > (gdb) bt > #0 0xfffffffee3210bfc in native_get_attr ( > attr=0xfffffffee2e05db0 "pmix.jobid", kv=0xffffffff7b4ff028) > at > ../../../../../openmpi-dev-178-ga16c1e4/opal/mca/pmix/native/pmix_native.c:1131 > #1 0xfffffffee2e033e4 in rte_init () > at > ../../../../../openmpi-dev-178-ga16c1e4/orte/mca/ess/pmi/ess_pmi_module.c:170 > #2 0xfffffffee4a340c0 in orte_init (pargc=0x0, pargv=0x0, flags=32) > at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_init.c:239 > #3 0xfffffffee4d9a164 in ompi_mpi_init (argc=0, argv=0x1003f5850, > requested=0, provided=0xffffffff7b4ff44c) > at ../../openmpi-dev-178-ga16c1e4/ompi/runtime/ompi_mpi_init.c:480 > #4 0xfffffffee4dfbb30 in PMPI_Init (argc=0xffffffff7b4ff554, > argv=0xffffffff7b4ff548) at pinit.c:84 > #5 0xfffffffee5122f6c in Java_mpi_MPI_Init_1jni (env=0x10010e9e0, > clazz=0xffffffff7b4ff760, argv=0xffffffff7b4ff858) > at ../../../../../openmpi-dev-178-ga16c1e4/ompi/mpi/java/c/mpi_MPI.c:271 > #6 0xffffffff6b810738 in ?? () > #7 0xffffffff6b810738 in ?? () > Backtrace stopped: previous frame identical to this frame (corrupt stack?) > (gdb) > > > > Hopefully the above output is helpful. Please let me know if you > need something else. > > Kind regards > > Siegmar > > > >> Siegmar Gross <siegmar.gr...@informatik.hs-fulda.de> wrote: >>> Hi, >>> >>> today I installed openmpi-dev-178-ga16c1e4 on Solaris 10 Sparc >>> with gcc-4.9.1 and Java 8. Now a very simple Java program works >>> as expected, but other Java programs still break. I removed the >>> warnings about "shmem.jar" and used the following configure >>> command. >>> >>> tyr openmpi-dev-178-ga16c1e4-SunOS.sparc.64_gcc 406 head config.log \ >>> | grep openmpi >>> $ ../openmpi-dev-178-ga16c1e4/configure >>> --prefix=/usr/local/openmpi-1.9.0_64_gcc >>> --libdir=/usr/local/openmpi-1.9.0_64_gcc/lib64 >>> --with-jdk-bindir=/usr/local/jdk1.8.0/bin >>> --with-jdk-headers=/usr/local/jdk1.8.0/include >>> JAVA_HOME=/usr/local/jdk1.8.0 >>> LDFLAGS=-m64 CC=gcc CXX=g++ FC=gfortran CFLAGS=-m64 -D_REENTRANT >>> CXXFLAGS=-m64 FCFLAGS=-m64 CPP=cpp CXXCPP=cpp >>> CPPFLAGS= -D_REENTRANT CXXCPPFLAGS= >>> --enable-mpi-cxx --enable-cxx-exceptions --enable-mpi-java >>> --enable-mpi-thread-multiple --with-threads=posix >>> --with-hwloc=internal >>> --without-verbs --with-wrapper-cflags=-std=c11 -m64 >>> --with-wrapper-cxxflags=-m64 --enable-debug >>> >>> >>> tyr java 290 ompi_info | grep -e "Open MPI repo revision:" -e "C compiler >>> version:" >>> Open MPI repo revision: dev-178-ga16c1e4 >>> C compiler version: 4.9.1 >>> >>> >>> >>>>> regarding the BUS error reported by Siegmar, i also commited >>>>> 62bde1fcb554079143030bb305512c236672386f >>>>> in order to fix it (this is based on code review only, i have no sparc64 >>>>> hardware to test it is enough) >>>> I'll test it, when a new nightly snapshot is available for the trunk. >>> >>> tyr java 291 mpijavac InitFinalizeMain.java >>> tyr java 292 mpiexec -np 1 java InitFinalizeMain >>> Hello! >>> >>> tyr java 293 mpijavac BcastIntMain.java >>> tyr java 294 mpiexec -np 2 java BcastIntMain >>> # >>> # A fatal error has been detected by the Java Runtime Environment: >>> # >>> # SIGBUS (0xa) at pc=0xfffffffee3210bfc, pid=24792, tid=2 >>> ... 
>>> >>> >>> >>> tyr java 296 /usr/local/gdb-7.6.1_64_gcc/bin/gdb mpiexec >>> ... >>> (gdb) run -np 2 java BcastIntMain >>> Starting program: /usr/local/openmpi-1.9.0_64_gcc/bin/mpiexec -np 2 java >>> BcastIntMain >>> [Thread debugging using libthread_db enabled] >>> [New Thread 1 (LWP 1)] >>> [New LWP 2 ] >>> # >>> # A fatal error has been detected by the Java Runtime Environment: >>> # >>> # SIGBUS (0xa) at pc=0xfffffffee3210bfc, pid=24814, tid=2 >>> # >>> # JRE version: Java(TM) SE Runtime Environment (8.0-b132) (build 1.8.0-b132) >>> # Java VM: Java HotSpot(TM) 64-Bit Server VM (25.0-b70 mixed mode >>> solaris-sparc compressed oops) >>> # Problematic frame: >>> # C [mca_pmix_native.so+0x10bfc] native_get_attr+0x3000 >>> # >>> # Failed to write core dump. Core dumps have been disabled. To enable core >>> dumping, try "ulimit -c unlimited" > before starting Java again >>> # >>> # An error report file with more information is saved as: >>> # >>> /home/fd1026/work/skripte/master/parallel/prog/mpi/java/hs_err_pid24814.log >>> # >>> # A fatal error has been detected by the Java Runtime Environment: >>> # >>> # SIGBUS (0xa) at pc=0xfffffffee3210bfc, pid=24812, tid=2 >>> # >>> # JRE version: Java(TM) SE Runtime Environment (8.0-b132) (build 1.8.0-b132) >>> # Java VM: Java HotSpot(TM) 64-Bit Server VM (25.0-b70 mixed mode >>> solaris-sparc compressed oops) >>> # Problematic frame: >>> # C [mca_pmix_native.so+0x10bfc] native_get_attr+0x3000 >>> # >>> # Failed to write core dump. Core dumps have been disabled. To enable core >>> dumping, try "ulimit -c unlimited" > before starting Java again >>> # >>> # An error report file with more information is saved as: >>> # >>> /home/fd1026/work/skripte/master/parallel/prog/mpi/java/hs_err_pid24812.log >>> # >>> # If you would like to submit a bug report, please visit: >>> # http://bugreport.sun.com/bugreport/crash.jsp >>> # The crash happened outside the Java Virtual Machine in native code. >>> # See problematic frame for where to report the bug. 
>>> # >>> [tyr:24814] *** Process received signal *** >>> [tyr:24814] Signal: Abort (6) >>> [tyr:24814] Signal code: (-1) >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:opal_backtrace_print+0x2c >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:0xdc2d4 >>> /lib/sparcv9/libc.so.1:0xd8b98 >>> /lib/sparcv9/libc.so.1:0xcc70c >>> /lib/sparcv9/libc.so.1:0xcc918 >>> /lib/sparcv9/libc.so.1:0xdd2d0 [ Signal 6 (ABRT)] >>> /lib/sparcv9/libc.so.1:_thr_sigsetmask+0x1c4 >>> /lib/sparcv9/libc.so.1:sigprocmask+0x28 >>> /lib/sparcv9/libc.so.1:_sigrelse+0x5c >>> /lib/sparcv9/libc.so.1:abort+0xc0 >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0xb3cb90 >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0xd97a04 >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:JVM_handle_solaris_signal+0xc0c >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0xb44e84 >>> /lib/sparcv9/libc.so.1:0xd8b98 >>> /lib/sparcv9/libc.so.1:0xcc70c >>> /lib/sparcv9/libc.so.1:0xcc918 >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_pmix_native.so:0x10bfc >>> [ Signal 10 (BUS)] >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_ess_pmi.so:0x33dc >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-rte.so.0.0.0:orte_init+0x67c >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:ompi_mpi_init+0x374 >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:PMPI_Init+0x2a8 >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi_java.so.0.0.0:Java_mpi_MPI_Init_1jni+0x1a0 >>> 0xffffffff6b810730 >>> 0xffffffff6b8106d4 >>> 0xffffffff6b8078a8 >>> 0xffffffff6b8078a8 >>> 0xffffffff6b80024c >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0x6fd4e8 >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0x79331c >>> /export2/prog/SunOS_sparc/jdk1.8.0/lib/sparcv9/jli/libjli.so:0x7290 >>> /lib/sparcv9/libc.so.1:0xd8a6c >>> [tyr:24814] *** End of error message *** >>> -------------------------------------------------------------------------- >>> mpiexec noticed that process rank 1 with PID 0 on node tyr exited on signal >>> 6 (Abort). 
>>> -------------------------------------------------------------------------- >>> [LWP 2 exited] >>> [New Thread 2 ] >>> [Switching to Thread 1 (LWP 1)] >>> sol_thread_fetch_registers: td_ta_map_id2thr: no thread can be found to >>> satisfy query >>> (gdb) bt >>> #0 0xffffffff7f6173d0 in rtld_db_dlactivity () from >>> /usr/lib/sparcv9/ld.so.1 >>> #1 0xffffffff7f6175a8 in rd_event () from /usr/lib/sparcv9/ld.so.1 >>> #2 0xffffffff7f618950 in lm_delete () from /usr/lib/sparcv9/ld.so.1 >>> #3 0xffffffff7f6226bc in remove_so () from /usr/lib/sparcv9/ld.so.1 >>> #4 0xffffffff7f624574 in remove_hdl () from /usr/lib/sparcv9/ld.so.1 >>> #5 0xffffffff7f61d97c in dlclose_core () from /usr/lib/sparcv9/ld.so.1 >>> #6 0xffffffff7f61d9d4 in dlclose_intn () from /usr/lib/sparcv9/ld.so.1 >>> #7 0xffffffff7f61db0c in dlclose () from /usr/lib/sparcv9/ld.so.1 >>> #8 0xffffffff7ec87ca0 in vm_close () >>> from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0 >>> #9 0xffffffff7ec85274 in lt_dlclose () >>> from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0 >>> #10 0xffffffff7ecaa5dc in ri_destructor (obj=0x100187b70) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:382 >>> #11 0xffffffff7eca8fd8 in opal_obj_run_destructors (object=0x100187b70) >>> at ../../../../openmpi-dev-178-ga16c1e4/opal/class/opal_object.h:446 >>> #12 0xffffffff7eca9eac in mca_base_component_repository_release ( >>> component=0xffffffff7b1236f0 <mca_oob_tcp_component>) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:240 >>> #13 0xffffffff7ecac17c in mca_base_component_unload ( >>> component=0xffffffff7b1236f0 <mca_oob_tcp_component>, output_id=-1) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:47 >>> #14 0xffffffff7ecac210 in mca_base_component_close ( >>> component=0xffffffff7b1236f0 <mca_oob_tcp_component>, output_id=-1) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:60 >>> #15 0xffffffff7ecac2e4 in mca_base_components_close (output_id=-1, >>> components=0xffffffff7f14bc58 <orte_oob_base_framework+80>, skip=0x0) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:86 >>> #16 0xffffffff7ecac24c in mca_base_framework_components_close ( >>> framework=0xffffffff7f14bc08 <orte_oob_base_framework>, skip=0x0) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:66 >>> #17 0xffffffff7efcaf80 in orte_oob_base_close () >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/orte/mca/oob/base/oob_base_frame.c:112 >>> #18 0xffffffff7ecc0d74 in mca_base_framework_close ( >>> framework=0xffffffff7f14bc08 <orte_oob_base_framework>) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_framework.c:187 >>> #19 0xffffffff7be07858 in rte_finalize () >>> at >>> ../../../../../openmpi-dev-178-ga16c1e4/orte/mca/ess/hnp/ess_hnp_module.c:857 >>> #20 0xffffffff7ef338bc in orte_finalize () >>> at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_finalize.c:66 >>> #21 0x000000010000723c in orterun (argc=5, argv=0xffffffff7fffe0d8) >>> at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/orterun.c:1103 >>> #22 0x0000000100003e80 in main (argc=5, argv=0xffffffff7fffe0d8) >>> ---Type <return> to continue, or q <return> to quit--- >>> at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/main.c:13 >>> (gdb) >>> >>> >>> >>> >>> I get the same error for C programs, if they use more than >>> 
MPI_Init and MPI_Finalize. >>> >>> tyr small_prog 301 mpicc init_finalize.c >>> tyr small_prog 302 mpiexec -np 1 a.out >>> Hello! >>> tyr small_prog 303 mpicc column_int.c >>> tyr small_prog 306 /usr/local/gdb-7.6.1_64_gcc/bin/gdb mpiexec >>> ... >>> (gdb) run -np 4 a.out >>> Starting program: /usr/local/openmpi-1.9.0_64_gcc/bin/mpiexec -np 4 a.out >>> [Thread debugging using libthread_db enabled] >>> [New Thread 1 (LWP 1)] >>> [New LWP 2 ] >>> [tyr:24880] *** Process received signal *** >>> [tyr:24880] Signal: Bus Error (10) >>> [tyr:24880] Signal code: Invalid address alignment (1) >>> [tyr:24880] Failing at address: ffffffff7bd1c10c >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:opal_backtrace_print+0x2c >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:0xdc2d4 >>> /lib/sparcv9/libc.so.1:0xd8b98 >>> /lib/sparcv9/libc.so.1:0xcc70c >>> /lib/sparcv9/libc.so.1:0xcc918 >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_pmix_native.so:0x10684 >>> [ Signal 10 (BUS)] >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_ess_pmi.so:0x33dc >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-rte.so.0.0.0:orte_init+0x67c >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:ompi_mpi_init+0x374 >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:PMPI_Init+0x2a8 >>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:main+0x20 >>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:_start+0x7c >>> [tyr:24880] *** End of error message *** >>> [tyr:24876] *** Process received signal *** >>> [tyr:24876] Signal: Bus Error (10) >>> [tyr:24876] Signal code: Invalid address alignment (1) >>> [tyr:24876] Failing at address: ffffffff7bd1c10c >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:opal_backtrace_print+0x2c >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:0xdc2d4 >>> /lib/sparcv9/libc.so.1:0xd8b98 >>> /lib/sparcv9/libc.so.1:0xcc70c >>> /lib/sparcv9/libc.so.1:0xcc918 >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_pmix_native.so:0x10684 >>> [ Signal 10 (BUS)] >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_ess_pmi.so:0x33dc >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-rte.so.0.0.0:orte_init+0x67c >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:ompi_mpi_init+0x374 >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:PMPI_Init+0x2a8 >>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:main+0x20 >>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:_start+0x7c >>> [tyr:24876] *** End of error message *** >>> -------------------------------------------------------------------------- >>> mpiexec noticed that process rank 2 with PID 0 on node tyr exited on signal >>> 10 (Bus Error). 
>>> -------------------------------------------------------------------------- >>> [LWP 2 exited] >>> [New Thread 2 ] >>> [Switching to Thread 1 (LWP 1)] >>> sol_thread_fetch_registers: td_ta_map_id2thr: no thread can be found to >>> satisfy query >>> (gdb) bt >>> #0 0xffffffff7f6173d0 in rtld_db_dlactivity () from >>> /usr/lib/sparcv9/ld.so.1 >>> #1 0xffffffff7f6175a8 in rd_event () from /usr/lib/sparcv9/ld.so.1 >>> #2 0xffffffff7f618950 in lm_delete () from /usr/lib/sparcv9/ld.so.1 >>> #3 0xffffffff7f6226bc in remove_so () from /usr/lib/sparcv9/ld.so.1 >>> #4 0xffffffff7f624574 in remove_hdl () from /usr/lib/sparcv9/ld.so.1 >>> #5 0xffffffff7f61d97c in dlclose_core () from /usr/lib/sparcv9/ld.so.1 >>> #6 0xffffffff7f61d9d4 in dlclose_intn () from /usr/lib/sparcv9/ld.so.1 >>> #7 0xffffffff7f61db0c in dlclose () from /usr/lib/sparcv9/ld.so.1 >>> #8 0xffffffff7ec87ca0 in vm_close () >>> from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0 >>> #9 0xffffffff7ec85274 in lt_dlclose () >>> from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0 >>> #10 0xffffffff7ecaa5dc in ri_destructor (obj=0x100187ae0) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:382 >>> #11 0xffffffff7eca8fd8 in opal_obj_run_destructors (object=0x100187ae0) >>> at ../../../../openmpi-dev-178-ga16c1e4/opal/class/opal_object.h:446 >>> #12 0xffffffff7eca9eac in mca_base_component_repository_release ( >>> component=0xffffffff7b0236f0 <mca_oob_tcp_component>) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:240 >>> #13 0xffffffff7ecac17c in mca_base_component_unload ( >>> component=0xffffffff7b0236f0 <mca_oob_tcp_component>, output_id=-1) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:47 >>> #14 0xffffffff7ecac210 in mca_base_component_close ( >>> component=0xffffffff7b0236f0 <mca_oob_tcp_component>, output_id=-1) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:60 >>> #15 0xffffffff7ecac2e4 in mca_base_components_close (output_id=-1, >>> components=0xffffffff7f14bc58 <orte_oob_base_framework+80>, skip=0x0) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:86 >>> #16 0xffffffff7ecac24c in mca_base_framework_components_close ( >>> framework=0xffffffff7f14bc08 <orte_oob_base_framework>, skip=0x0) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:66 >>> #17 0xffffffff7efcaf80 in orte_oob_base_close () >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/orte/mca/oob/base/oob_base_frame.c:112 >>> #18 0xffffffff7ecc0d74 in mca_base_framework_close ( >>> framework=0xffffffff7f14bc08 <orte_oob_base_framework>) >>> at >>> ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_framework.c:187 >>> #19 0xffffffff7bd07858 in rte_finalize () >>> at >>> ../../../../../openmpi-dev-178-ga16c1e4/orte/mca/ess/hnp/ess_hnp_module.c:857 >>> #20 0xffffffff7ef338bc in orte_finalize () >>> at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_finalize.c:66 >>> #21 0x000000010000723c in orterun (argc=4, argv=0xffffffff7fffe0e8) >>> at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/orterun.c:1103 >>> #22 0x0000000100003e80 in main (argc=4, argv=0xffffffff7fffe0e8) >>> at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/main.c:13 >>> (gdb) >>> >>> >>> >>> Do you need any other information? 
>>> >>> >>> Kind regards >>> >>> Siegmar > _______________________________________________ > users mailing list > us...@open-mpi.org > Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/users > Link to this post: > http://www.open-mpi.org/community/lists/users/2014/10/25635.php
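A side note for anyone else reproducing this: the "set _dbg=0" step in the gdb session above works because mpi_MPI.c parks the process in "while (_dbg) poll(NULL, 0, 1);" inside JNI_OnLoad (mpi_MPI.c:128 in the backtrace), so a debugger can be attached to the JVM before MPI_Init proceeds. Roughly, the idiom looks like the sketch below; the WAIT_FOR_DEBUGGER environment-variable gate and the helper name are illustrative only, not the exact Open MPI code.

/* Sketch of the wait-for-debugger idiom seen at mpi_MPI.c:128 above.
 * The WAIT_FOR_DEBUGGER gate and function name are hypothetical. */
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static volatile int _dbg;            /* volatile so gdb's "set _dbg=0" takes effect */

static void wait_for_debugger(void)
{
    if (NULL == getenv("WAIT_FOR_DEBUGGER")) {   /* hypothetical opt-in gate */
        return;
    }
    _dbg = 1;
    fprintf(stderr, "pid %d waiting; attach gdb and run: set _dbg=0\n", (int)getpid());
    while (_dbg) {
        poll(NULL, 0, 1);            /* sleep ~1 ms per iteration instead of spinning hot */
    }
}

int main(void)
{
    wait_for_debugger();             /* in the real code this sits in JNI_OnLoad */
    puts("continuing");
    return 0;
}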
commit 8c556bbc66c06fb19c6e46c67624bac1d6719b12
Author: Gilles Gouaillardet <gilles.gouaillar...@iferc.org>
Date:   Wed Oct 29 13:19:23 2014 +0900

    pmix: fix alignment issue

diff --git a/opal/mca/pmix/native/pmix_native.c b/opal/mca/pmix/native/pmix_native.c
index 6e771ea..b3c03da 100644
--- a/opal/mca/pmix/native/pmix_native.c
+++ b/opal/mca/pmix/native/pmix_native.c
@@ -1097,6 +1097,7 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
                 continue;
             }
             native_pname.vid = vid;
+            memcpy(&id, &native_pname, sizeof(opal_identifier_t));
 #if OPAL_HAVE_HWLOC
             OBJ_CONSTRUCT(&vals, opal_list_t);
             if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, (opal_identifier_t*)&native_pname,
@@ -1104,7 +1105,7 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
                 opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                     "%s cpuset for local proc %s not found",
                                     OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
-                                    OPAL_NAME_PRINT(*(opal_identifier_t*)&native_pname));
+                                    OPAL_NAME_PRINT(id));
                 OPAL_LIST_DESTRUCT(&vals);
                 /* even though the cpuset wasn't found, we at least know it is
                  * on the same node with us */
@@ -1131,7 +1132,7 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
             OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                                 "%s pmix:native proc %s locality %s",
                                 OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
-                                OPAL_NAME_PRINT(*(opal_identifier_t*)&native_pname),
+                                OPAL_NAME_PRINT(id),
                                 opal_hwloc_base_print_locality(locality)));
             OBJ_CONSTRUCT(&kvn, opal_value_t);
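For the record, my understanding of why this only bites on sparc: native_pname is (as far as I can tell) a struct of two 32-bit fields, so the compiler only guarantees 4-byte alignment for it, while casting its address to opal_identifier_t * and dereferencing asks for an 8-byte load. sparc traps on such unaligned loads (hence SIGBUS "invalid address alignment"), whereas x86 silently tolerates them, which is why the code looked fine everywhere else. Copying the bytes into a properly aligned 64-bit local, as the patch does, sidesteps the problem. A minimal stand-alone illustration follows; the type, buffer, and values are made up for the example (the jobid value is taken from the gdb output above), not the real Open MPI definitions.

/* Illustration of the unaligned-access pattern fixed by the patch above.
 * name_t and buf are invented for this example; gcc is assumed for the
 * aligned attribute. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct {
    uint32_t jobid;
    uint32_t vid;
} name_t;                         /* 8 bytes of data, but only 4-byte alignment required */

int main(void)
{
    char buf[16] __attribute__((aligned(8)));
    name_t *name = (name_t *)(buf + 4);    /* 4-byte aligned, NOT 8-byte aligned */
    name->jobid = 3320119297u;             /* jobid seen in the gdb session */
    name->vid   = 0;

    /* Problematic pattern:
     *     uint64_t id = *(uint64_t *)name;
     * an 8-byte load from a 4-byte-aligned address -> SIGBUS on sparc,
     * silently fine on x86. */

    /* Safe pattern (what the patch does): copy into an aligned local. */
    uint64_t id;
    memcpy(&id, name, sizeof(id));
    printf("id = 0x%016llx\n", (unsigned long long)id);
    return 0;
}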