Hi,

I have built openmpi-dev-4221-gb707d13 on my machines (Solaris 10
Sparc, Solaris 10 x86_64, and openSUSE Linux 12.1 x86_64) with
gcc-5.1.0 and Sun C 5.13. Unfortunately I get an error when I run a
small program.


tyr hello_1 109 ompi_info | grep -e "OPAL repo revision:" -e "C compiler absolute:"
      OPAL repo revision: dev-4221-gb707d13
     C compiler absolute: /usr/local/gcc-5.1.0/bin/gcc

tyr hello_1 110 mpiexec -np 4 --host tyr,sunpc1,linpc1,tyr hello_1_mpi
ld.so.1: orted: fatal: relocation error: file /usr/local/openmpi-master_64_gcc/lib64/openmpi/mca_pmix_pmix114.so: symbol strnlen: referenced symbol not found
--------------------------------------------------------------------------
ORTE has lost communication with its daemon located on node:

  hostname:  sunpc1

This is usually due to either a failure of the TCP network
connection to the node, or possibly an internal failure of
the daemon itself. We cannot recover from this failure, and
therefore will terminate the job.
--------------------------------------------------------------------------




I get the same error if I log in to a Solaris x86_64 machine and use
only that machine.

sunpc1 fd1026 101 mpiexec -np 2 --host sunpc1,sunpc1 hello_1_mpi
ld.so.1: orterun: fatal: relocation error: file /usr/local/openmpi-master_64_gcc/lib64/openmpi/mca_pmix_pmix114.so: symbol strnlen: referenced symbol not found
Killed
sunpc1 fd1026 102
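
The relocation error points at strnlen(). A tiny test program should show
whether the Solaris 10 libc provides that function at all; I would expect
it to fail to link there (this is only my assumption, and the extern
declaration is there because I do not expect <string.h> on Solaris 10 to
declare the function):

/* strnlen_test.c -- quick check whether libc provides strnlen().
 * Compile and link with e.g.:  gcc -o strnlen_test strnlen_test.c
 * A missing function should show up as an "undefined symbol" link error.
 */
#include <stdio.h>
#include <string.h>

extern size_t strnlen (const char *, size_t);	/* in case the header lacks it */

int main (void)
{
  const char *s = "hello";

  printf ("strnlen (\"%s\", 3) = %u\n", s, (unsigned int) strnlen (s, 3));
  return 0;
}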





tyr hello_1 111 /usr/local/gdb-7.6.1_64_gcc/bin/gdb mpiexec
GNU gdb (GDB) 7.6.1
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "sparc-sun-solaris2.10".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /export2/prog/SunOS_sparc/openmpi-master_64_gcc/bin/orterun...done.
(gdb) set args -np 4 --host tyr,sunpc1,linpc1,tyr hello_1_mpi
(gdb) r
Starting program: /usr/local/openmpi-master_64_gcc/bin/mpiexec -np 4 --host tyr,sunpc1,linpc1,tyr hello_1_mpi
[Thread debugging using libthread_db enabled]
[New Thread 1 (LWP 1)]
[New LWP    2        ]
[New LWP    3        ]
[New LWP    4        ]
[New LWP    5        ]
ld.so.1: orted: fatal: relocation error: file /usr/local/openmpi-master_64_gcc/lib64/openmpi/mca_pmix_pmix114.so: symbol strnlen: referenced symbol not found
--------------------------------------------------------------------------
ORTE has lost communication with its daemon located on node:

  hostname:  sunpc1

This is usually due to either a failure of the TCP network
connection to the node, or possibly an internal failure of
the daemon itself. We cannot recover from this failure, and
therefore will terminate the job.
--------------------------------------------------------------------------
[LWP    5         exited]
[New Thread 5        ]
[LWP    4         exited]
[New Thread 4        ]
[LWP    3         exited]
[New Thread 3        ]
[Switching to Thread 1 (LWP 1)]
sol_thread_fetch_registers: td_ta_map_id2thr: no thread can be found to satisfy query
(gdb) Killed

(gdb) bt
#0  0xffffffff7f6173d0 in rtld_db_dlactivity () from /usr/lib/sparcv9/ld.so.1
#1  0xffffffff7f6175a8 in rd_event () from /usr/lib/sparcv9/ld.so.1
#2  0xffffffff7f618950 in lm_delete () from /usr/lib/sparcv9/ld.so.1
#3  0xffffffff7f6226bc in remove_so () from /usr/lib/sparcv9/ld.so.1
#4  0xffffffff7f624574 in remove_hdl () from /usr/lib/sparcv9/ld.so.1
#5  0xffffffff7f61d97c in dlclose_core () from /usr/lib/sparcv9/ld.so.1
#6  0xffffffff7f61d9d4 in dlclose_intn () from /usr/lib/sparcv9/ld.so.1
#7  0xffffffff7f61db0c in dlclose () from /usr/lib/sparcv9/ld.so.1
#8  0xffffffff7ece8d30 in dlopen_close (handle=0x1001a8350)
at ../../../../../openmpi-dev-4221-gb707d13/opal/mca/dl/dlopen/dl_dlopen_module.c:148
#9  0xffffffff7ece8464 in opal_dl_close (handle=0x1001a8350)
    at ../../../../openmpi-dev-4221-gb707d13/opal/mca/dl/base/dl_base_fns.c:53
#10 0xffffffff7ecab1c0 in mca_base_component_repository_release_internal (ri=0x1001406d0) at ../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_component_repository.c:280
#11 0xffffffff7ecab338 in mca_base_component_repository_release (
    component=0xffffffff799a70c0 <mca_pmix_pmix114_component>)
at ../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_component_repository.c:317
#12 0xffffffff7ecad0d8 in mca_base_component_unload (
    component=0xffffffff799a70c0 <mca_pmix_pmix114_component>, output_id=-1)
at ../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_components_close.c:46
#13 0xffffffff7ecad170 in mca_base_component_close (
    component=0xffffffff799a70c0 <mca_pmix_pmix114_component>, output_id=-1)
at ../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_components_close.c:59
#14 0xffffffff7ecad240 in mca_base_components_close (output_id=-1,
    components=0xffffffff7ee9f558 <opal_pmix_base_framework+80>, skip=0x0)
at ../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_components_close.c:85
#15 0xffffffff7ecad1b0 in mca_base_framework_components_close (
    framework=0xffffffff7ee9f508 <opal_pmix_base_framework>, skip=0x0)
at ../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_components_close.c:65
#16 0xffffffff7ed4921c in opal_pmix_base_frame_close ()
at ../../../../openmpi-dev-4221-gb707d13/opal/mca/pmix/base/pmix_base_frame.c:57
#17 0xffffffff7ecc3418 in mca_base_framework_close (
    framework=0xffffffff7ee9f508 <opal_pmix_base_framework>)
    at ../../../../openmpi-dev-4221-gb707d13/opal/mca/base/mca_base_framework.c:214
#18 0xffffffff7c20782c in rte_finalize ()
at ../../../../../openmpi-dev-4221-gb707d13/orte/mca/ess/hnp/ess_hnp_module.c:795
#19 0xffffffff7ef39e20 in orte_finalize ()
    at ../../openmpi-dev-4221-gb707d13/orte/runtime/orte_finalize.c:73
#20 0x0000000100002d08 in orterun (argc=6, argv=0xffffffff7fffdf88)
    at ../../../../openmpi-dev-4221-gb707d13/orte/tools/orterun/orterun.c:293
#21 0x0000000100001928 in main (argc=6, argv=0xffffffff7fffdf88)
    at ../../../../openmpi-dev-4221-gb707d13/orte/tools/orterun/main.c:13
(gdb) q
A debugging session is active.

        Inferior 1 [process 27925    ] will be killed.

Quit anyway? (y or n) y
Quitting: sol_thread_fetch_registers: td_ta_map_id2thr: no thread can be found to satisfy query
tyr hello_1 112





tyr hello_1 112 mpiexec -np 4 --host tyr,linpc1,linpc1,tyr hello_1_mpi
ld.so.1: orterun: fatal: relocation error: file /usr/local/openmpi-master_64_gcc/lib64/openmpi/mca_pmix_pmix114.so: symbol strnlen: referenced symbol not found
Killed
tyr hello_1 113 Speicherschutzverletzung (segmentation fault)
[linpc1:25689] *** Process received signal ***
[linpc1:25689] Signal: Segmentation fault (11)
[linpc1:25689] Signal code: Address not mapped (1)
[linpc1:25689] Failing at address: 0x7f721f828aa1

tyr hello_1 113
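
Since Solaris 10 apparently has no strnlen() in its libc, while
mca_pmix_pmix114.so references that symbol, I assume that a fallback
along the following lines inside the pmix component would avoid the
relocation error (the guard macro and the function name are only my
guesses, not the spelling used in the PMIx sources):

/* Sketch of a strnlen() fallback for platforms whose libc lacks it,
 * e.g. Solaris 10. HAVE_STRNLEN is assumed to come from a configure
 * test; both that macro and my_strnlen() are my own names.
 */
#include <stddef.h>
#include <string.h>

#ifndef HAVE_STRNLEN
static size_t my_strnlen (const char *s, size_t maxlen)
{
  const char *p = memchr (s, '\0', maxlen);	/* stop at NUL or maxlen */

  return (p == NULL) ? maxlen : (size_t) (p - s);
}
#define strnlen(s, n) my_strnlen ((s), (n))
#endif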










I would be grateful if somebody could fix the problem. Please let me
know if you need more information. Thank you very much in advance for
any help.


Kind regards

Siegmar
/* An MPI-version of the "hello world" program, which delivers some
 * information about its machine and operating system.
 *
 *
 * Compiling:
 *   Store executable(s) into local directory.
 *     mpicc -o <program name> <source code file name>
 *
 *   Store executable(s) into predefined directories.
 *     make
 *
 *   Make program(s) automatically on all specified hosts. You must
 *   edit the file "make_compile" and specify your host names before
 *   you execute it.
 *     make_compile
 *
 * Running:
 *   LAM-MPI:
 *     mpiexec -boot -np <number of processes> <program name>
 *     or
 *     mpiexec -boot \
 *	 -host <hostname> -np <number of processes> <program name> : \
 *	 -host <hostname> -np <number of processes> <program name>
 *     or
 *     mpiexec -boot [-v] -configfile <application file>
 *     or
 *     lamboot [-v] [<host file>]
 *       mpiexec -np <number of processes> <program name>
 *	 or
 *	 mpiexec [-v] -configfile <application file>
 *     lamhalt
 *
 *   OpenMPI:
 *     "host1", "host2", and so on can all have the same name,
 *     if you want to start a virtual computer with some virtual
 *     cpu's on the local host. The name "localhost" is allowed
 *     as well.
 *
 *     mpiexec -np <number of processes> <program name>
 *     or
 *     mpiexec --host <host1,host2,...> \
 *	 -np <number of processes> <program name>
 *     or
 *     mpiexec -hostfile <hostfile name> \
 *	 -np <number of processes> <program name>
 *     or
 *     mpiexec -app <application file>
 *
 * Cleaning:
 *   local computer:
 *     rm <program name>
 *     or
 *     make clean_all
 *   on all specified computers (you must edit the file "make_clean_all"
 *   and specify your host names before you execute it).
 *     make_clean_all
 *
 *
 * File: hello_1_mpi.c		       	Author: S. Gross
 * Date: 01.10.2012
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/utsname.h>
#include "mpi.h"

#define	BUF_SIZE	255		/* message buffer size		*/
#define	MAX_TASKS	12		/* max. number of tasks		*/
#define	SENDTAG		1		/* send message command		*/
#define	EXITTAG		2		/* termination command		*/
#define	MSGTAG		3		/* normal message token		*/

#define ENTASKS		-1		/* error: too many tasks	*/

static void master (void);
static void slave (void);

int main (int argc, char *argv[])
{
  int  mytid,				/* my task id			*/
       ntasks,				/* number of parallel tasks	*/
       namelen;				/* length of processor name	*/
  char processor_name[MPI_MAX_PROCESSOR_NAME];

  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &mytid);
  MPI_Comm_size (MPI_COMM_WORLD, &ntasks);
  MPI_Get_processor_name (processor_name, &namelen);
  /* With the next statement every process executing this code will
   * print one line on the display. It may happen that the lines will
   * get mixed up because the display is a critical section. In general
   * only one process (mostly the process with rank 0) will print on
   * the display and all other processes will send their messages to
   * this process. Nevertheless for debugging purposes (or to
   * demonstrate that it is possible) it may be useful if every
   * process prints itself.
   */
  fprintf (stdout, "Process %d of %d running on %s\n",
	   mytid, ntasks, processor_name);
  fflush (stdout);
  MPI_Barrier (MPI_COMM_WORLD);		/* wait for all other processes	*/

  if (mytid == 0)
  {
    master ();
  }
  else
  {
    slave ();
  }
  MPI_Finalize ();
  return EXIT_SUCCESS;
}


/* Function for the "master task". The master sends a request to all
 * slaves asking for a message. After receiving and printing the
 * messages he sends all slaves a termination command.
 *
 * input parameters:	not necessary
 * output parameters:	not available
 * return value:	nothing
 * side effects:	no side effects
 *
 */
void master (void)
{
  int		ntasks,			/* number of parallel tasks	*/
		mytid,			/* my task id			*/
		num,			/* number of entries		*/
		i;			/* loop variable		*/
  char		buf[BUF_SIZE + 1];    	/* message buffer (+1 for '\0')	*/
  MPI_Status	stat;			/* message details		*/

  MPI_Comm_rank (MPI_COMM_WORLD, &mytid);
  MPI_Comm_size (MPI_COMM_WORLD, &ntasks);
  if (ntasks > MAX_TASKS)
  {
    fprintf (stderr, "Error: Too many tasks. Try again with at most "
	     "%d tasks.\n", MAX_TASKS);
    /* terminate all slave tasks					*/
    for (i = 1; i < ntasks; ++i)
    {
      MPI_Send ((char *) NULL, 0, MPI_CHAR, i, EXITTAG, MPI_COMM_WORLD);
    }
    MPI_Finalize ();
    exit (ENTASKS);
  }
  printf ("\n\nNow %d slave tasks are sending greetings.\n\n",
	  ntasks - 1);
  /* request messages from slave tasks					*/
  for (i = 1; i < ntasks; ++i)
  {
    MPI_Send ((char *) NULL, 0, MPI_CHAR, i, SENDTAG, MPI_COMM_WORLD);
  }
  /* wait for messages and print greetings     				*/
  for (i = 1; i < ntasks; ++i)
  {
    MPI_Recv (buf, BUF_SIZE, MPI_CHAR, MPI_ANY_SOURCE,
	      MPI_ANY_TAG, MPI_COMM_WORLD, &stat);
    MPI_Get_count (&stat, MPI_CHAR, &num);
    buf[num] = '\0';			/* add missing end-of-string	*/
    printf ("Greetings from task %d:\n"
	    "  message type:        %d\n"
	    "  msg length:          %d characters\n"
	    "  message:             %s\n\n",
	    stat.MPI_SOURCE, stat.MPI_TAG, num, buf);
  }
  /* terminate all slave tasks						*/
  for (i = 1; i < ntasks; ++i)
  {
    MPI_Send ((char *) NULL, 0, MPI_CHAR, i, EXITTAG, MPI_COMM_WORLD);
  }
}


/* Function for "slave tasks". The slave task sends its hostname,
 * operating system name and release, and processor architecture
 * as a message to the master.
 *
 * input parameters:	not necessary
 * output parameters:	not available
 * return value:	nothing
 * side effects:	no side effects
 *
 */
void slave (void)
{
  struct utsname sys_info;		/* system information		*/
  int		 mytid,		       	/* my task id			*/
		 more_to_do;
  char		 buf[BUF_SIZE];       	/* message buffer      		*/
  MPI_Status	 stat;			/* message details		*/

  MPI_Comm_rank (MPI_COMM_WORLD, &mytid);
  more_to_do = 1;
  while (more_to_do == 1)
  {
    /* wait for a message from the master task				*/
    MPI_Recv (buf, BUF_SIZE, MPI_CHAR, 0, MPI_ANY_TAG,
	      MPI_COMM_WORLD, &stat);
    if (stat.MPI_TAG != EXITTAG)
    {
      uname (&sys_info);
      strcpy (buf, "\n    hostname:          ");
      strncpy (buf + strlen (buf), sys_info.nodename,
	       BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), "\n    operating system:  ",
	       BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), sys_info.sysname,
	       BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), "\n    release:           ",
	       BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), sys_info.release,
	       BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), "\n    processor:         ",
	       BUF_SIZE - strlen (buf));
      strncpy (buf + strlen (buf), sys_info.machine,
	       BUF_SIZE - strlen (buf));
      MPI_Send (buf, strlen (buf), MPI_CHAR, stat.MPI_SOURCE,
		MSGTAG, MPI_COMM_WORLD);
    }
    else
    {
      more_to_do = 0;			/* terminate			*/
    }
  }
}
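
By the way, the strcpy()/strncpy() chain in slave() leaves buf without a
terminating '\0' if the assembled text ever reaches BUF_SIZE characters,
so the following strlen() calls could then read past the end of the
buffer. A bounded variant with snprintf() could look like this
(build_reply() is just my own helper name, with the same field layout
as above):

/* Sketch: build the slave's reply with snprintf() so that the buffer is
 * always NUL-terminated, even if the text would exceed its size.
 */
#include <stdio.h>
#include <sys/utsname.h>

static void build_reply (char *buf, size_t bufsize)
{
  struct utsname sys_info;		/* system information		*/

  uname (&sys_info);
  snprintf (buf, bufsize,
	    "\n    hostname:          %s"
	    "\n    operating system:  %s"
	    "\n    release:           %s"
	    "\n    processor:         %s",
	    sys_info.nodename, sys_info.sysname,
	    sys_info.release, sys_info.machine);
}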
