Hi,

At work, I have some MPI codes that use custom datatypes to call MPI_File_read with MPI_BOTTOM ... This mostly works, except when the underlying filesystem is NFS, where it crashes with SIGSEGV.
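For context, the idiom these codes rely on is to describe the memory layout with a datatype built from absolute addresses (MPI_Get_address + MPI_Type_create_hindexed) and then pass MPI_BOTTOM as the buffer argument. Here is a stripped-down, self-contained illustration of just that part; it uses MPI_Sendrecv on MPI_COMM_SELF merely to exercise the datatype (our real codes do the same thing through MPI_File_read, as in the attached sample):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    char src[21] = "AAAAAAAAAABBBBBBBBBB";
    char dst[21] = { 0 };
    int lens[2] = { 10, 10 };
    MPI_Aint disp[2];
    MPI_Datatype t;

    MPI_Init(&argc, &argv);

    /* The datatype carries absolute addresses of the two
     * destination blocks ... */
    MPI_Get_address(&dst[0], &disp[0]);
    MPI_Get_address(&dst[10], &disp[1]);
    MPI_Type_create_hindexed(2, lens, disp, MPI_CHAR, &t);
    MPI_Type_commit(&t);

    /* ... so the receive buffer argument is MPI_BOTTOM, not dst. */
    MPI_Sendrecv(src, 20, MPI_CHAR, 0, 0,
                 MPI_BOTTOM, 1, t, 0, 0,
                 MPI_COMM_SELF, MPI_STATUS_IGNORE);

    printf("%s\n", dst);  /* prints AAAAAAAAAABBBBBBBBBB */

    MPI_Type_free(&t);
    MPI_Finalize();
    return 0;
}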
The attached sample (code + data) works just fine with 1.10.1 on my NetBSD/amd64 workstation using the UFS romio backend, but crashes when switched to NFS:

njoly@issan [~]> mpirun --version
mpirun (Open MPI) 1.10.1
njoly@issan [~]> mpicc -g -Wall -o sample sample.c
njoly@issan [~]> mpirun -n 2 ./sample ufs:data.txt
rank1 ... 111111111133333333335555555555
rank0 ... 000000000022222222224444444444
njoly@issan [~]> mpirun -n 2 ./sample nfs:data.txt
[issan:20563] *** Process received signal ***
[issan:08879] *** Process received signal ***
[issan:20563] Signal: Segmentation fault (11)
[issan:20563] Signal code: Address not mapped (1)
[issan:20563] Failing at address: 0xffffffffb1309240
[issan:08879] Signal: Segmentation fault (11)
[issan:08879] Signal code: Address not mapped (1)
[issan:08879] Failing at address: 0xffffffff881b0420
[issan:08879] [ 0] [issan:20563] [ 0] 0x7dafb14a52b0 <__sigtramp_siginfo_2> at /usr/lib/libc.so.12
[issan:20563] *** End of error message ***
0x78b9886a52b0 <__sigtramp_siginfo_2> at /usr/lib/libc.so.12
[issan:08879] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 20563 on node issan exited
on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
njoly@issan [~]> gdb sample sample.core
GNU gdb (GDB) 7.10.1
[...]
Core was generated by `sample'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x000078b98871971f in memcpy () from /usr/lib/libc.so.12
[Current thread is 1 (LWP 1)]
(gdb) bt
#0  0x000078b98871971f in memcpy () from /usr/lib/libc.so.12
#1  0x000078b974010edf in ADIOI_NFS_ReadStrided () from /usr/pkg/lib/openmpi/mca_io_romio.so
#2  0x000078b97400bacf in MPIOI_File_read () from /usr/pkg/lib/openmpi/mca_io_romio.so
#3  0x000078b97400bc72 in mca_io_romio_dist_MPI_File_read () from /usr/pkg/lib/openmpi/mca_io_romio.so
#4  0x000078b988e72b38 in PMPI_File_read () from /usr/pkg/lib/libmpi.so.12
#5  0x00000000004013a4 in main (argc=2, argv=0x7f7fff7b0f00) at sample.c:63

Thanks.

--
Nicolas Joly
Cluster & Computing Group
Biology IT Center
Institut Pasteur, Paris.
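PS: one detail that may help narrow things down: both failing addresses (0xffffffffb1309240 and 0xffffffff881b0420) have their upper 32 bits all set, which looks like a 64-bit user-space pointer that got truncated to a 32-bit int and sign-extended back somewhere in the NFS strided-read path. A tiny stand-alone illustration; the starting pointer value here is hypothetical, just picked in the same region as the libc mapping shown in the trace for PID 20563:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* hypothetical 64-bit buffer address in PID 20563 */
    int64_t addr = 0x7dafb1309240;
    int32_t lo32 = (int32_t)addr;  /* keeps only 0xb1309240 */
    int64_t back = lo32;           /* sign-extends, since the top bit
                                      of 0xb1309240 is set */
    printf("%llx -> %llx\n", (long long)addr, (long long)back);
    /* prints: 7dafb1309240 -> ffffffffb1309240, i.e. exactly the
       "Failing at address" value reported for PID 20563 */
    return 0;
}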
sample.c:

#include <assert.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    char *file, *rbuf;
    int i, res, rank, size, count;
    size_t len;
    MPI_Comm comm = MPI_COMM_WORLD;
    MPI_Datatype data, view;
    MPI_File fh;
    MPI_Offset fs;
    MPI_Status sts;
    int lens[6], offs[6];
    MPI_Aint addr, disp[6];

    assert(argc > 1);
    file = argv[1];

    res = MPI_Init(&argc, &argv);
    assert(res == MPI_SUCCESS);
    res = MPI_Comm_size(comm, &size);
    assert(res == MPI_SUCCESS);
    res = MPI_Comm_rank(comm, &rank);
    assert(res == MPI_SUCCESS);

    res = MPI_File_open(comm, file, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
    assert(res == MPI_SUCCESS);
    res = MPI_File_get_size(fh, &fs);
    assert(res == MPI_SUCCESS);

    /* Distribute 10-byte blocks round-robin across the ranks. */
    count = fs / 10 / size;
    assert(count <= 6); /* lens/offs/disp are statically sized */
    len = count * 10;
    rbuf = malloc(len + 1);
    assert(rbuf != NULL);
    memset(rbuf, 0, len + 1);

    for (i = 0; i < count; i++) {
        lens[i] = 10;
        /* file offset of this rank's i-th block */
        offs[i] = rank * 10 + size * i * 10;
        /* absolute memory address of the matching spot in rbuf */
        res = MPI_Get_address(rbuf + i * 10, &addr);
        assert(res == MPI_SUCCESS);
        disp[i] = addr;
    }

    /* memory type: absolute addresses, to be used with MPI_BOTTOM */
    res = MPI_Type_create_hindexed(count, lens, disp, MPI_CHAR, &data);
    assert(res == MPI_SUCCESS);
    res = MPI_Type_commit(&data);
    assert(res == MPI_SUCCESS);

    /* file type: this rank's blocks within the file */
    res = MPI_Type_indexed(count, lens, offs, MPI_CHAR, &view);
    assert(res == MPI_SUCCESS);
    res = MPI_Type_commit(&view);
    assert(res == MPI_SUCCESS);

    res = MPI_File_set_view(fh, 0, MPI_CHAR, view, "native", MPI_INFO_NULL);
    assert(res == MPI_SUCCESS);
    res = MPI_File_read(fh, MPI_BOTTOM, 1, data, &sts);
    assert(res == MPI_SUCCESS);
    res = MPI_Get_count(&sts, data, &count);
    assert(res == MPI_SUCCESS);
    assert(count == 1);

    printf("rank%d ... %s\n", rank, rbuf);

    res = MPI_Type_free(&view);
    assert(res == MPI_SUCCESS);
    res = MPI_Type_free(&data);
    assert(res == MPI_SUCCESS);
    free(rbuf);
    res = MPI_File_close(&fh);
    assert(res == MPI_SUCCESS);
    res = MPI_Finalize();
    assert(res == MPI_SUCCESS);

    return 0;
}
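FWIW, the same read can also be expressed without MPI_BOTTOM, using displacements relative to the start of rbuf and rbuf itself as the buffer argument; comparing the two might help pinpoint whether the absolute-address handling is what trips up the NFS path. A sketch of how the relevant part of sample.c would change (everything else unchanged; untested on my side):

    for (i = 0; i < count; i++) {
        lens[i] = 10;
        offs[i] = rank * 10 + size * i * 10;
        disp[i] = (MPI_Aint)(i * 10);  /* relative to rbuf, not absolute */
    }

    res = MPI_Type_create_hindexed(count, lens, disp, MPI_CHAR, &data);
    assert(res == MPI_SUCCESS);
    res = MPI_Type_commit(&data);
    assert(res == MPI_SUCCESS);

    /* ... view creation and MPI_File_set_view unchanged ... */

    /* pass rbuf instead of MPI_BOTTOM */
    res = MPI_File_read(fh, rbuf, 1, data, &sts);
    assert(res == MPI_SUCCESS);

In this sample the relative displacements happen to be contiguous, so a plain read of count * 10 MPI_CHARs would also do; the hindexed form is kept only to stay close to the original structure.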
data.txt:

000000000011111111112222222222333333333344444444445555555555