Hi,

At work, I have some MPI codes that use custom datatypes to call
MPI_File_read with MPI_BOTTOM. This mostly works, except when the
underlying filesystem is NFS, where it crashes with SIGSEGV.
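
For reference, the access pattern boils down to the following (stripped
down to the relevant calls of the attached sample, error handling
omitted). The memory side is described with absolute addresses, which is
why the read buffer is MPI_BOTTOM:

  /* memory layout: absolute address of each 10-char block of rbuf,
     plus the matching file offsets for this rank */
  for (i = 0; i < count; i++) {
    lens[i] = 10;
    offs[i] = rank * 10 + size * i * 10;
    MPI_Get_address(rbuf + i * 10, &disp[i]);
  }
  MPI_Type_create_hindexed(count, lens, disp, MPI_CHAR, &data);
  MPI_Type_commit(&data);

  /* file layout: this rank's blocks, installed as the file view */
  MPI_Type_indexed(count, lens, offs, MPI_CHAR, &view);
  MPI_Type_commit(&view);
  MPI_File_set_view(fh, 0, MPI_CHAR, view, "native", MPI_INFO_NULL);

  /* works with the ufs backend, SIGSEGVs in ADIOI_NFS_ReadStrided with nfs */
  MPI_File_read(fh, MPI_BOTTOM, 1, data, &sts);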

The attached sample (code + data) works just fine with 1.10.1 on my
NetBSD/amd64 workstation using the UFS ROMIO backend, but crashes when
switched to NFS:

njoly@issan [~]> mpirun --version
mpirun (Open MPI) 1.10.1
njoly@issan [~]> mpicc -g -Wall -o sample sample.c
njoly@issan [~]> mpirun -n 2 ./sample ufs:data.txt
rank1 ... 111111111133333333335555555555
rank0 ... 000000000022222222224444444444
njoly@issan [~]> mpirun -n 2 ./sample nfs:data.txt
[issan:20563] *** Process received signal ***
[issan:08879] *** Process received signal ***
[issan:20563] Signal: Segmentation fault (11)
[issan:20563] Signal code: Address not mapped (1)
[issan:20563] Failing at address: 0xffffffffb1309240
[issan:08879] Signal: Segmentation fault (11)
[issan:08879] Signal code: Address not mapped (1)
[issan:08879] Failing at address: 0xffffffff881b0420
[issan:08879] [ 0] [issan:20563] [ 0] 0x7dafb14a52b0 <__sigtramp_siginfo_2> at 
/usr/lib/libc.so.12
[issan:20563] *** End of error message ***
0x78b9886a52b0 <__sigtramp_siginfo_2> at /usr/lib/libc.so.12
[issan:08879] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 20563 on node issan exited on 
signal 11 (Segmentation fault).
--------------------------------------------------------------------------
njoly@issan [~]> gdb sample sample.core
GNU gdb (GDB) 7.10.1
[...]
Core was generated by `sample'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x000078b98871971f in memcpy () from /usr/lib/libc.so.12
[Current thread is 1 (LWP 1)]
(gdb) bt
#0  0x000078b98871971f in memcpy () from /usr/lib/libc.so.12
#1  0x000078b974010edf in ADIOI_NFS_ReadStrided () from 
/usr/pkg/lib/openmpi/mca_io_romio.so
#2  0x000078b97400bacf in MPIOI_File_read () from 
/usr/pkg/lib/openmpi/mca_io_romio.so
#3  0x000078b97400bc72 in mca_io_romio_dist_MPI_File_read () from 
/usr/pkg/lib/openmpi/mca_io_romio.so
#4  0x000078b988e72b38 in PMPI_File_read () from /usr/pkg/lib/libmpi.so.12
#5  0x00000000004013a4 in main (argc=2, argv=0x7f7fff7b0f00) at sample.c:63

Thanks.

-- 
Nicolas Joly

Cluster & Computing Group
Biology IT Center
Institut Pasteur, Paris.
#include <assert.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv) {
  char *file, *rbuf;
  int i, res, rank, size, count;
  size_t len;

  MPI_Comm comm = MPI_COMM_WORLD;
  MPI_Datatype data, view;
  MPI_File fh;
  MPI_Offset fs;
  MPI_Status sts;

  int lens[6], offs[6]; /* up to 6 blocks of 10 chars, enough for the attached data.txt */
  MPI_Aint addr, disp[6];

  file = argv[1];

  res = MPI_Init(&argc, &argv);
  assert(res == MPI_SUCCESS);
  res = MPI_Comm_size(comm, &size);
  assert(res == MPI_SUCCESS);
  res = MPI_Comm_rank(comm, &rank);
  assert(res == MPI_SUCCESS);

  res = MPI_File_open(comm, file, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
  assert(res == MPI_SUCCESS);
  res = MPI_File_get_size(fh, &fs);
  assert(res == MPI_SUCCESS);

  count = fs / 10 / size;
  len = count * 10;
  rbuf = malloc(len+1);
  assert(rbuf != NULL);
  memset(rbuf, 0, len+1);

  for (i = 0; i < count; i++) {
    lens[i] = 10;
    offs[i] = rank * 10 + size * i * 10; /* this rank's i-th 10-char block in the file */
    res = MPI_Get_address(rbuf + i * 10, &addr);
    assert(res == MPI_SUCCESS);
    disp[i] = addr;
  }

  res = MPI_Type_create_hindexed(count, lens, disp, MPI_CHAR, &data); /* memory type, absolute addresses */
  assert(res == MPI_SUCCESS);
  res = MPI_Type_commit(&data);
  assert(res == MPI_SUCCESS);

  res = MPI_Type_indexed(count, lens, offs, MPI_CHAR, &view); /* file type, offsets within the view */
  assert(res == MPI_SUCCESS);
  res = MPI_Type_commit(&view);
  assert(res == MPI_SUCCESS);

  res = MPI_File_set_view(fh, 0, MPI_CHAR, view, "native", MPI_INFO_NULL);
  assert(res == MPI_SUCCESS);
  res = MPI_File_read(fh, MPI_BOTTOM, 1, data, &sts); /* crashes here on NFS */
  assert(res == MPI_SUCCESS);
  res = MPI_Get_count(&sts, data, &count);
  assert(res == MPI_SUCCESS);
  assert(count == 1);

  printf("rank%d ... %s\n", rank, rbuf);

  res = MPI_Type_free(&view);
  assert(res == MPI_SUCCESS);

  res = MPI_Type_free(&data);
  assert(res == MPI_SUCCESS);

  free(rbuf);

  res = MPI_File_close(&fh);
  assert(res == MPI_SUCCESS);

  res = MPI_Finalize();
  assert(res == MPI_SUCCESS);

  return 0;
}

data.txt:
000000000011111111112222222222333333333344444444445555555555
