Hi,
I have built openmpi-dev-4221-gb707d13 on my machines (Solaris 10
Sparc, Solaris 10 x86_64, and openSUSE Linux 12.1 x86_64) with
gcc-5.1.0 and Sun C 5.13. Unfortunately I get an error when I run a
small program.
tyr hello_1 109 ompi_info | grep -e "OPAL repo revision:" -e "C compiler
absolute:"
OPAL repo revision: dev-4221-gb707d13
C compiler absolute: /usr/local/gcc-5.1.0/bin/gcc
tyr hello_1 110 mpiexec -np 4 --host tyr,sunpc1,linpc1,tyr hello_1_mpi
ld.so.1: orted: fatal: relocation error: file
/usr/local/openmpi-master_64_gcc/lib64/openmpi/mca_pmix_pmix114.so: symbol
strnlen: referenced symbol not found
--------------------------------------------------------------------------
ORTE has lost communication with its daemon located on node:
hostname: sunpc1
This is usually due to either a failure of the TCP network
connection to the node, or possibly an internal failure of
the daemon itself. We cannot recover from this failure, and
therefore will terminate the job.
--------------------------------------------------------------------------
I get the same error if I log in to a Solaris x86_64 machine and use
only that machine.
sunpc1 fd1026 101 mpiexec -np 2 --host sunpc1,sunpc1 hello_1_mpi
ld.so.1: orterun: fatal: relocation error: file
/usr/local/openmpi-master_64_gcc/lib64/openmpi/mca_pmix_pmix114.so: symbol
strnlen: referenced symbol not found
Killed
sunpc1 fd1026 102
tyr hello_1 111 /usr/local/gdb-7.6.1_64_gcc/bin/gdb mpiexec
GNU gdb (GDB) 7.6.1
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "sparc-sun-solaris2.10".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from
/export2/prog/SunOS_sparc/openmpi-master_64_gcc/bin/orterun...done.
(gdb) set args -np 4 --host tyr,sunpc1,linpc1,tyr hello_1_mpi
(gdb) r
Starting program: /usr/local/openmpi-master_64_gcc/bin/mpiexec -np 4 --host
tyr,sunpc1,linpc1,tyr hello_1_mpi
[Thread debugging using libthread_db enabled]
[New Thread 1 (LWP 1)]
[New LWP 2 ]
[New LWP 3 ]
[New LWP 4 ]
[New LWP 5 ]
ld.so.1: orted: fatal: relocation error: file
/usr/local/openmpi-master_64_gcc/lib64/openmpi/mca_pmix_pmix114.so: symbol
strnlen: referenced symbol not found
--------------------------------------------------------------------------
ORTE has lost communication with its daemon located on node:
hostname: sunpc1
This is usually due to either a failure of the TCP network
connection to the node, or possibly an internal failure of
the daemon itself. We cannot recover from this failure, and
therefore will terminate the job.
--------------------------------------------------------------------------
[LWP 5 exited]
[New Thread 5 ]
[LWP 4 exited]
[New Thread 4 ]
[LWP 3 exited]
[New Thread 3 ]
[Switching to Thread 1 (LWP 1)]
sol_thread_fetch_registers: td_ta_map_id2thr: no thread can be found to satisfy
query
(gdb) Killed
(gdb) bt
#0 0xffffffff7f6173d0 in rtld_db_dlactivity () from /usr/lib/sparcv9/ld.so.1
#1 0xffffffff7f6175a8 in rd_event () from /usr/lib/sparcv9/ld.so.1
#2 0xffffffff7f618950 in lm_delete () from /usr/lib/sparcv9/ld.so.1
#3 0xffffffff7f6226bc in remove_so () from /usr/lib/sparcv9/ld.so.1
#4 0xffffffff7f624574 in remove_hdl () from /usr/lib/sparcv9/ld.so.1
#5 0xffffffff7f61d97c in dlclose_core () from /usr/lib/sparcv9/ld.so.1
#6 0xffffffff7f61d9d4 in dlclose_intn () from /usr/lib/sparcv9/ld.so.1
#7 0xffffffff7f61db0c in dlclose () from /usr/lib/sparcv9/ld.so.1
#8 0xffffffff7ece8d30 in dlopen_close (handle=0x1001a8350)
at
../../../../../openmpi-dev-4221-gb707d13/opal/mca/dl/dlopen/dl_dlopen_module.c:148
#9 0xffffffff7ece8464 in opal_dl_close (handle=0x1001a8350)
at ../../../../openmpi-dev-4221-gb707d13/opal/mca/dl/base/dl_base_fns.c:53
#10 0xffffffff7ecab1c0 in mca_base_component_repository_release_internal
(ri=0x1001406d0)
at
../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_component_repository.c:280
#11 0xffffffff7ecab338 in mca_base_component_repository_release (
component=0xffffffff799a70c0 <mca_pmix_pmix114_component>)
at
../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_component_repository.c:317
#12 0xffffffff7ecad0d8 in mca_base_component_unload (
component=0xffffffff799a70c0 <mca_pmix_pmix114_component>, output_id=-1)
at
../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_components_close.c:46
#13 0xffffffff7ecad170 in mca_base_component_close (
component=0xffffffff799a70c0 <mca_pmix_pmix114_component>, output_id=-1)
at
../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_components_close.c:59
#14 0xffffffff7ecad240 in mca_base_components_close (output_id=-1,
components=0xffffffff7ee9f558 <opal_pmix_base_framework+80>, skip=0x0)
at
../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_components_close.c:85
#15 0xffffffff7ecad1b0 in mca_base_framework_components_close (
framework=0xffffffff7ee9f508 <opal_pmix_base_framework>, skip=0x0)
at
../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_components_close.c:65
#16 0xffffffff7ed4921c in opal_pmix_base_frame_close ()
at
../../../../openmpi-dev-4221-gb707d13/opal/mca/pmix/base/pmix_base_frame.c:57
#17 0xffffffff7ecc3418 in mca_base_framework_close (
framework=0xffffffff7ee9f508 <opal_pmix_base_framework>)
at
../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_framework.c:214
#18 0xffffffff7c20782c in rte_finalize ()
at
../../../../../openmpi-dev-4221-gb707d13/orte/mca/ess/hnp/ess_hnp_module.c:795
#19 0xffffffff7ef39e20 in orte_finalize ()
at ../../openmpi-dev-4221-gb707d13/orte/runtime/orte_finalize.c:73
#20 0x0000000100002d08 in orterun (argc=6, argv=0xffffffff7fffdf88)
at ../../../../openmpi-dev-4221-gb707d13/orte/tools/orterun/orterun.c:293
#21 0x0000000100001928 in main (argc=6, argv=0xffffffff7fffdf88)
at ../../../../openmpi-dev-4221-gb707d13/orte/tools/orterun/main.c:13
(gdb) q
A debugging session is active.
Inferior 1 [process 27925 ] will be killed.
Quit anyway? (y or n) y
Quitting: sol_thread_fetch_registers: td_ta_map_id2thr: no thread can be found
to satisfy query
tyr hello_1 112
tyr hello_1 112 mpiexec -np 4 --host tyr,linpc1,linpc1,tyr hello_1_mpi
ld.so.1: orterun: fatal: relocation error: file
/usr/local/openmpi-master_64_gcc/lib64/openmpi/mca_pmix_pmix114.so: symbol
strnlen: referenced symbol not found
Killed
tyr hello_1 113 Speicherschutzverletzung   [= segmentation fault]
[linpc1:25689] *** Process received signal ***
[linpc1:25689] Signal: Segmentation fault (11)
[linpc1:25689] Signal code: Address not mapped (1)
[linpc1:25689] Failing at address: 0x7f721f828aa1
tyr hello_1 113
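The relocation error indicates that mca_pmix_pmix114.so references
strnlen(), which, as far as I know, is not available in the Solaris 10
libc, so the symbol cannot be resolved at run time. I do not know how
the PMIx sources guard this call, but I imagine a fallback along the
following lines could be used when configure does not find strnlen().
The HAVE_STRNLEN macro and the function name are only my assumption and
not the actual Open MPI/PMIx code.

/* Hypothetical portability fallback for platforms without strnlen(),
 * e.g. Solaris 10. HAVE_STRNLEN and pmix_compat_strnlen are my own
 * guesses, not names from the Open MPI/PMIx sources.
 */
#include <stddef.h>
#include <string.h>

#ifndef HAVE_STRNLEN
static size_t pmix_compat_strnlen (const char *s, size_t maxlen)
{
  /* stop at the first '\0' or after at most maxlen characters */
  const char *p = memchr (s, '\0', maxlen);

  return (p == NULL) ? maxlen : (size_t) (p - s);
}
#define strnlen(s, n) pmix_compat_strnlen ((s), (n))
#endif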
I would be grateful if somebody could fix the problem. Please let me
know if you need more information. Thank you very much in advance for
any help.
Kind regards
Siegmar
/* An MPI-version of the "hello world" program, which delivers some
* information about its machine and operating system.
*
*
* Compiling:
* Store executable(s) into local directory.
* mpicc -o <program name> <source code file name>
*
* Store executable(s) into predefined directories.
* make
*
* Make program(s) automatically on all specified hosts. You must
* edit the file "make_compile" and specify your host names before
* you execute it.
* make_compile
*
* Running:
* LAM-MPI:
* mpiexec -boot -np <number of processes> <program name>
* or
* mpiexec -boot \
* -host <hostname> -np <number of processes> <program name> : \
* -host <hostname> -np <number of processes> <program name>
* or
* mpiexec -boot [-v] -configfile <application file>
* or
* lamboot [-v] [<host file>]
* mpiexec -np <number of processes> <program name>
* or
* mpiexec [-v] -configfile <application file>
* lamhalt
*
* OpenMPI:
* "host1", "host2", and so on can all have the same name,
* if you want to start a virtual computer with some virtual
* CPUs on the local host. The name "localhost" is allowed
* as well.
*
* mpiexec -np <number of processes> <program name>
* or
* mpiexec --host <host1,host2,...> \
* -np <number of processes> <program name>
* or
* mpiexec -hostfile <hostfile name> \
* -np <number of processes> <program name>
* or
* mpiexec -app <application file>
*
* Cleaning:
* local computer:
* rm <program name>
* or
* make clean_all
* on all specified computers (you must edit the file "make_clean_all"
* and specify your host names before you execute it).
* make_clean_all
*
*
* File: hello_1_mpi.c Author: S. Gross
* Date: 01.10.2012
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/utsname.h>
#include "mpi.h"
#define BUF_SIZE 255 /* message buffer size */
#define MAX_TASKS 12 /* max. number of tasks */
#define SENDTAG 1 /* send message command */
#define EXITTAG 2 /* termination command */
#define MSGTAG 3 /* normal message token */
#define ENTASKS -1 /* error: too many tasks */
static void master (void);
static void slave (void);
int main (int argc, char *argv[])
{
  int  mytid,                           /* my task id                   */
       ntasks,                          /* number of parallel tasks     */
       namelen;                         /* length of processor name     */
  char processor_name[MPI_MAX_PROCESSOR_NAME];

  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &mytid);
  MPI_Comm_size (MPI_COMM_WORLD, &ntasks);
  MPI_Get_processor_name (processor_name, &namelen);
  /* With the next statement every process executing this code will
   * print one line on the display. It may happen that the lines will
   * get mixed up because the display is a critical section. In general
   * only one process (mostly the process with rank 0) will print on
   * the display and all other processes will send their messages to
   * this process. Nevertheless for debugging purposes (or to
   * demonstrate that it is possible) it may be useful if every
   * process prints itself.
   */
  fprintf (stdout, "Process %d of %d running on %s\n",
           mytid, ntasks, processor_name);
  fflush (stdout);
  MPI_Barrier (MPI_COMM_WORLD);         /* wait for all other processes */
  if (mytid == 0)
  {
    master ();
  }
  else
  {
    slave ();
  }
  MPI_Finalize ();
  return EXIT_SUCCESS;
}
/* Function for the "master task". The master sends a request to all
* slaves asking for a message. After receiving and printing the
* messages, it sends all slaves a termination command.
*
* input parameters: not necessary
* output parameters: not available
* return value: nothing
* side effects: no side effects
*
*/
void master (void)
{
  int        ntasks,                    /* number of parallel tasks     */
             mytid,                     /* my task id                   */
             num,                       /* number of entries            */
             i;                         /* loop variable                */
  char       buf[BUF_SIZE + 1];         /* message buffer (+1 for '\0') */
  MPI_Status stat;                      /* message details              */

  MPI_Comm_rank (MPI_COMM_WORLD, &mytid);
  MPI_Comm_size (MPI_COMM_WORLD, &ntasks);
  if (ntasks > MAX_TASKS)
  {
    fprintf (stderr, "Error: Too many tasks. Try again with at most "
             "%d tasks.\n", MAX_TASKS);
    /* terminate all slave tasks */
    for (i = 1; i < ntasks; ++i)
    {
      MPI_Send ((char *) NULL, 0, MPI_CHAR, i, EXITTAG, MPI_COMM_WORLD);
    }
    MPI_Finalize ();
    exit (ENTASKS);
  }
  printf ("\n\nNow %d slave tasks are sending greetings.\n\n",
          ntasks - 1);
  /* request messages from slave tasks */
  for (i = 1; i < ntasks; ++i)
  {
    MPI_Send ((char *) NULL, 0, MPI_CHAR, i, SENDTAG, MPI_COMM_WORLD);
  }
  /* wait for messages and print greetings */
  for (i = 1; i < ntasks; ++i)
  {
    MPI_Recv (buf, BUF_SIZE, MPI_CHAR, MPI_ANY_SOURCE,
              MPI_ANY_TAG, MPI_COMM_WORLD, &stat);
    MPI_Get_count (&stat, MPI_CHAR, &num);
    buf[num] = '\0';                    /* add missing end-of-string */
    printf ("Greetings from task %d:\n"
            " message type: %d\n"
            " msg length: %d characters\n"
            " message: %s\n\n",
            stat.MPI_SOURCE, stat.MPI_TAG, num, buf);
  }
  /* terminate all slave tasks */
  for (i = 1; i < ntasks; ++i)
  {
    MPI_Send ((char *) NULL, 0, MPI_CHAR, i, EXITTAG, MPI_COMM_WORLD);
  }
}
/* Function for "slave tasks". The slave task sends its hostname,
* operating system name and release, and processor architecture
* as a message to the master.
*
* input parameters: not necessary
* output parameters: not available
* return value: nothing
* side effects: no side effects
*
*/
void slave (void)
{
  struct utsname sys_info;              /* system information */
  int            mytid,                 /* my task id         */
                 more_to_do;
  char           buf[BUF_SIZE];         /* message buffer     */
  MPI_Status     stat;                  /* message details    */

  MPI_Comm_rank (MPI_COMM_WORLD, &mytid);
  more_to_do = 1;
  while (more_to_do == 1)
  {
    /* wait for a message from the master task */
    MPI_Recv (buf, BUF_SIZE, MPI_CHAR, 0, MPI_ANY_TAG,
              MPI_COMM_WORLD, &stat);
    if (stat.MPI_TAG != EXITTAG)
    {
      uname (&sys_info);
      strcpy (buf, "\n hostname: ");
      strncpy (buf + strlen (buf), sys_info.nodename,
               BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), "\n operating system: ",
               BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), sys_info.sysname,
               BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), "\n release: ",
               BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), sys_info.release,
               BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), "\n processor: ",
               BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), sys_info.machine,
               BUF_SIZE - strlen (buf));
      MPI_Send (buf, strlen (buf), MPI_CHAR, stat.MPI_SOURCE,
                MSGTAG, MPI_COMM_WORLD);
    }
    else
    {
      more_to_do = 0;                   /* terminate */
    }
  }
}
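P.S.: The chained strncpy() calls in slave() are unrelated to the
problem above, but strncpy() does not guarantee a terminating '\0' when
the text is truncated. For what it is worth, the message could also be
assembled with snprintf(), which always NUL-terminates the buffer, for
example along the following lines (only a sketch, not the code that
produced the output above):

/* Sketch of an alternative message assembly for slave() that uses
 * snprintf() instead of chained strncpy() calls. Not part of the
 * program above; the helper name is my own.
 */
#include <stdio.h>
#include <sys/utsname.h>

static void build_sysinfo_message (char *buf, size_t size)
{
  struct utsname sys_info;              /* system information */

  uname (&sys_info);
  snprintf (buf, size,
            "\n hostname: %s"
            "\n operating system: %s"
            "\n release: %s"
            "\n processor: %s",
            sys_info.nodename, sys_info.sysname,
            sys_info.release, sys_info.machine);
}

slave() would then call build_sysinfo_message (buf, sizeof (buf)) and
send strlen (buf) characters to the master as before.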