The attached example code (stripped down from a bigger application) demonstrates a way to trigger a severe crash in all recent Open MPI releases, but not in any of the recent MPICH2 releases I tried. The code is minimal and boils down to the call

MPI_Comm_create(MPI_COMM_WORLD, MPI_GROUP_EMPTY, &dummy_comm);

which, as far as I can tell, is perfectly legal. Please refer to the (well-documented) code for details on the cross product of Open MPI versions and compilers I tested (on Ubuntu 10.04 LTS), a potential workaround (which, to my understanding, should not be necessary), and an example stack trace.
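
An alternative workaround might be to sidestep MPI_GROUP_EMPTY entirely and split MPI_COMM_WORLD with MPI_Comm_split, letting the master rank pass MPI_UNDEFINED as the color so that it simply receives MPI_COMM_NULL. This is only a sketch, reusing the variable names from the attached code, and I have not verified it against the affected Open MPI builds:

  int color = (my_rank == rank_master) ? MPI_UNDEFINED : 0;
  MPI_Comm workers_comm;
  MPI_Comm_split(MPI_COMM_WORLD, color, my_rank, &workers_comm);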

Instructions: mpicc test.c -Wall -O0 && mpirun -np 2 ./a.out

Thanks!

dom


#include <stdio.h>
#include <mpi.h>
#include <assert.h>
#include <stdlib.h>

static void check(int code, const char *label, int line)
{
  if (code != MPI_SUCCESS)
  {
    fprintf(stderr, "MPI Error at %s (line %d): %d\n", label, line, code);
    exit(22);
  }
}

int main (int argc, char **argv)
{
  // init MPI
  int mpi_error_code = MPI_Init(&argc, &argv);
  check(mpi_error_code, "MPI_Init", __LINE__);

  // get total number of processes
  int num_processes;
  mpi_error_code = MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
  check(mpi_error_code, "MPI_Comm_Size", __LINE__);

  // get MPI_COMM_WORLD rank of this process
  int my_rank;
  mpi_error_code = MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  check(mpi_error_code, "MPI_Comm_Rank", __LINE__);

  // set rank of "master"
  int rank_master = num_processes - 1;

  // query world group
  MPI_Group world_group;
  mpi_error_code = MPI_Comm_group(MPI_COMM_WORLD, &world_group);
  check(mpi_error_code, "MPI_Comm_group", __LINE__);

  // split COMM_WORLD group into two:
  // first group contains my_rank==num_processes-1 and second group contains the
  // remaining ranks
  //
  // Note that *all* processes in the "parent" communicator (which is MPI_COMM_WORLD,
  // so really all processes) must collectively call MPI_Comm_create, even those not
  // part of the new communicator. 
  if(my_rank != rank_master)
  {
    // set up group
    MPI_Group gr_without_master;
    mpi_error_code = MPI_Group_excl(world_group, 1, &rank_master, &gr_without_master);
    check(mpi_error_code, " MPI_Group_excl", __LINE__);

    // create comm from group
    MPI_Comm gr_comm;
    mpi_error_code = MPI_Comm_create(MPI_COMM_WORLD, gr_without_master, &gr_comm);
    check(mpi_error_code, " MPI_Comm_create", __LINE__);
  }
  else
  {
    // This variant crashes Open MPI with the stack trace quoted at the end of
    // this file. This is not supposed to happen: passing MPI_GROUP_EMPTY to
    // MPI_Comm_create on MPI_COMM_WORLD is perfectly legal.
    // The code crashes with Open MPI 1.4.x and 1.5.x across the cross product
    // of compiler bindings for Intel 12, gcc 4.4.3, Sun Studio 12.2 (where
    // applicable) and Open64.
    // The code works fine with MPICH2 using the same cross product of compiler
    // bindings.
    MPI_Comm dummy_comm;
    mpi_error_code = MPI_Comm_create(MPI_COMM_WORLD, MPI_GROUP_EMPTY, &dummy_comm);
    check(mpi_error_code, " MPI_Comm_create", __LINE__);

/*    
    // This variant always works, but according to the MPI 2.2 standard, creating the
    // dummy group should not be necessary.
    MPI_Group dummy_group;
    mpi_error_code = MPI_Group_incl(world_group, 1, &rank_master, &dummy_group);
    check(mpi_error_code, " MPI_Group_incl", __LINE__);
    MPI_Comm dummy_comm;
    mpi_error_code = MPI_Comm_create(MPI_COMM_WORLD, dummy_group, &dummy_comm);
    check(mpi_error_code, " MPI_Comm_create", __LINE__);
*/
  }
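
  // Not needed to reproduce the crash, but as minimal cleanup the group handle
  // obtained from MPI_Comm_group above can be released before finalizing.
  mpi_error_code = MPI_Group_free(&world_group);
  check(mpi_error_code, "MPI_Group_free", __LINE__);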

  // dump a success message and shut things down
  printf("Groups are set up without error, time to go to sleep\n");
  mpi_error_code = MPI_Finalize();
  check(mpi_error_code, " MPI_Finalize", __LINE__);

  return 0;
}

/* Example stack trace of the crashing variant, here from Open MPI 1.4.3 built with the gcc 4.4.3 that ships with Ubuntu 10.04 LTS

[teslaspule:19150] *** Process received signal ***
[teslaspule:19150] Signal: Segmentation fault (11)
[teslaspule:19150] Signal code: Address not mapped (1)
[teslaspule:19150] Failing at address: (nil)
[teslaspule:19150] [ 0] /lib/libpthread.so.0(+0xf8f0) [0x7f1930cb28f0]
[teslaspule:19150] [ 1] /sfw/openmpi/gcc4.4.3/1.4.3/lib/libmpi.so.0(ompi_dpm_base_mark_dyncomm+0x2e) [0x7f1931c7c51e]
[teslaspule:19150] [ 2] /sfw/openmpi/gcc4.4.3/1.4.3/lib/libmpi.so.0(ompi_comm_set+0x16d) [0x7f1931c2e3dd]
[teslaspule:19150] [ 3] /sfw/openmpi/gcc4.4.3/1.4.3/lib/libmpi.so.0(+0x251f3) [0x7f1931c2f1f3]
[teslaspule:19150] [ 4] /sfw/openmpi/gcc4.4.3/1.4.3/lib/libmpi.so.0(MPI_Comm_create+0xc1) [0x7f1931c5b421]
[teslaspule:19150] [ 5] ./a.out(main+0x147) [0x400d5e]
[teslaspule:19150] [ 6] /lib/libc.so.6(__libc_start_main+0xfd) [0x7f193093ec4d]
[teslaspule:19150] [ 7] ./a.out() [0x400b09]
[teslaspule:19150] *** End of error message ***

*/
