Hello, I am seeing inconsistent MPI-IO behavior when writing to a Lustre file system using Open MPI 1.3 with ROMIO. What follows is a simple reproducer and its output. Essentially, one or more of the running processes does not read or write the correct amount of data to its part of a file residing on a Lustre (parallel) file system.
Any help figuring out what is happening is greatly appreciated. Thanks, Nate

program gcrm_test_io
  ! Reproducer for short MPI-IO transfers on a parallel file system.
  !
  ! Stage 1: rank 0 alone creates "output.dat" and fills it with 9.0.
  ! Stage 2: every rank opens the file collectively and overwrites its own
  !          contiguous slice of x_size reals with 1.0.
  ! Stage 3: rank 0 alone reads the whole file back, reports the first
  !          element that is not 1.0 (if any), and deletes the file.
  !
  ! After every write/read the element count from the returned status is
  ! compared with the requested count and a mismatch is printed.
  implicit none
  include "mpif.h"

  ! Size in bytes of one real(kind=4) element; used to turn element
  ! indices into byte displacements for MPI_FILE_SET_VIEW.
  integer, parameter :: BYTES_PER_REAL = 4

  integer :: x_size                 ! elements written by each rank
  integer :: w_me, w_nprocs         ! rank and size in MPI_COMM_WORLD
  integer :: my_info                ! info object passed to MPI_FILE_OPEN
  integer :: i
  ! MPI handles and error codes are default INTEGERs in the Fortran binding.
  integer :: ierr
  integer :: fileID
  ! The count argument of MPI_FILE_WRITE / MPI_FILE_READ is a default
  ! INTEGER, not an MPI_OFFSET_KIND integer; only displacements use that kind.
  integer :: nelems
  integer (kind=MPI_OFFSET_KIND) :: offset
  integer :: status(MPI_STATUS_SIZE)
  integer :: count
  integer :: ncells                 ! total elements in the file
  real (kind=4), allocatable, dimension (:) :: array2
  logical :: sync
  logical :: all_ones

  call mpi_init(ierr)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, w_nprocs, ierr)
  call MPI_COMM_RANK(MPI_COMM_WORLD, w_me, ierr)
  call mpi_info_create(my_info, ierr)

  ! optional ways to set things in mpi-io
  ! call mpi_info_set (my_info, "romio_ds_read" , "enable" , ierr)
  ! call mpi_info_set (my_info, "romio_ds_write", "enable" , ierr)
  ! call mpi_info_set (my_info, "romio_cb_write", "enable" , ierr)

  x_size = 410011   ! A 'big' number, with bigger numbers it is more likely to fail
  sync = .true.     ! Extra file synchronization
  ncells = x_size * w_nprocs

  ! ---- Stage 1: use node zero to fill the file with nines ----
  if (w_me .eq. 0) then
     call MPI_FILE_OPEN(MPI_COMM_SELF, "output.dat", &
                        MPI_MODE_CREATE + MPI_MODE_WRONLY, my_info, fileID, ierr)
     call report(ierr, "MPI_FILE_OPEN (initial fill)")

     allocate (array2(ncells))
     array2(:) = 9.0
     nelems = ncells
     offset = 0

     call MPI_FILE_SET_VIEW(fileID, offset, MPI_REAL, MPI_REAL, &
                            "native", MPI_INFO_NULL, ierr)
     call report(ierr, "MPI_FILE_SET_VIEW (initial fill)")

     call MPI_FILE_WRITE(fileID, array2, nelems, MPI_REAL, status, ierr)
     call report(ierr, "MPI_FILE_WRITE (initial fill)")

     ! Query the status with the same datatype that was transferred.
     call MPI_GET_COUNT(status, MPI_REAL, count, ierr)
     if (count .ne. nelems) print*, "Wrong initial write count:", count, nelems

     deallocate (array2)
     if (sync) then
        call MPI_FILE_SYNC(fileID, ierr)
        call report(ierr, "MPI_FILE_SYNC (initial fill)")
     end if
     call MPI_FILE_CLOSE(fileID, ierr)
     call report(ierr, "MPI_FILE_CLOSE (initial fill)")
  end if

  ! ---- Stage 2: all nodes now fill their area with ones ----
  call MPI_BARRIER(MPI_COMM_WORLD, ierr)

  allocate (array2(x_size))
  array2(:) = 1.0
  nelems = x_size
  ! Promote to MPI_OFFSET_KIND before multiplying so the byte displacement
  ! cannot overflow a default integer for large x_size or many ranks.
  offset = int(w_me, MPI_OFFSET_KIND) * x_size * BYTES_PER_REAL

  call MPI_FILE_OPEN(MPI_COMM_WORLD, "output.dat", MPI_MODE_WRONLY, &
                     my_info, fileID, ierr)
  call report(ierr, "MPI_FILE_OPEN (parallel write)")

  print*, "node", w_me, "starting", (offset/BYTES_PER_REAL) + 1, &
          "ending", (offset/BYTES_PER_REAL) + nelems

  call MPI_FILE_SET_VIEW(fileID, offset, MPI_REAL, MPI_REAL, &
                         "native", MPI_INFO_NULL, ierr)
  call report(ierr, "MPI_FILE_SET_VIEW (parallel write)")

  call MPI_FILE_WRITE(fileID, array2, nelems, MPI_REAL, status, ierr)
  call report(ierr, "MPI_FILE_WRITE (parallel write)")

  call MPI_GET_COUNT(status, MPI_REAL, count, ierr)
  if (count .ne. nelems) print*, "Wrong write count:", count, nelems, w_me

  deallocate (array2)
  if (sync) then
     call MPI_FILE_SYNC(fileID, ierr)
     call report(ierr, "MPI_FILE_SYNC (parallel write)")
  end if
  call MPI_FILE_CLOSE(fileID, ierr)
  call report(ierr, "MPI_FILE_CLOSE (parallel write)")

  ! ---- Stage 3: read it back on node zero to see if it is ok data ----
  if (w_me .eq. 0) then
     call MPI_FILE_OPEN(MPI_COMM_SELF, "output.dat", MPI_MODE_RDONLY, &
                        my_info, fileID, ierr)
     call report(ierr, "MPI_FILE_OPEN (read back)")

     nelems = ncells
     allocate (array2(ncells))

     call MPI_FILE_READ(fileID, array2, nelems, MPI_REAL, status, ierr)
     call report(ierr, "MPI_FILE_READ (read back)")

     call MPI_GET_COUNT(status, MPI_REAL, count, ierr)
     if (count .ne. nelems) print*, "Wrong read count:", count, nelems

     ! Stop at the first element that was not overwritten with 1.0.
     all_ones = .true.
     do i = 1, ncells
        if (array2(i) .ne. 1.0) then
           ! Index, value, # of good bytes, MB
           print*, "ERROR", i, array2(i), ((i-1)*4), ((i-1)*4)/(1024d0*1024d0)
           all_ones = .false.
           exit
        end if
     end do
     if (all_ones) print*, "All done with nothing wrong"

     deallocate (array2)
     call MPI_FILE_CLOSE(fileID, ierr)
     call report(ierr, "MPI_FILE_CLOSE (read back)")
     call MPI_FILE_DELETE("output.dat", MPI_INFO_NULL, ierr)
     call report(ierr, "MPI_FILE_DELETE")
  end if

  ! Release the info object created at start-up.
  call mpi_info_free(my_info, ierr)
  call mpi_finalize(ierr)

contains

  ! Print a readable message when an MPI call returned a non-success code.
  ! Execution deliberately continues so the count diagnostics that follow
  ! are still produced. Per the MPI standard, file routines default to
  ! returning error codes rather than aborting, so an unchecked ierr would
  ! otherwise hide the failure.
  !   code - error code returned by the MPI call
  !   what - short label identifying the call, included in the message
  subroutine report(code, what)
    integer, intent(in) :: code
    character(len=*), intent(in) :: what
    character(len=MPI_MAX_ERROR_STRING) :: msg
    integer :: msglen, ierr2

    if (code .eq. MPI_SUCCESS) return
    call MPI_ERROR_STRING(code, msg, msglen, ierr2)
    print*, "MPI error on node", w_me, "in ", what, ": ", msg(1:msglen)
  end subroutine report

end program gcrm_test_io

1.3 Open MPI

node 0 starting 1 ending 410011
node 1 starting 410012 ending 820022
node 2 starting 820023 ending 1230033
node 3 starting 1230034 ending 1640044
node 4 starting 1640045 ending 2050055
node 5 starting 2050056 ending 2460066
All done with nothing wrong

node 0 starting 1 ending 410011
node 1 starting 410012 ending 820022
node 2 starting 820023 ending 1230033
node 5 starting 2050056 ending 2460066
node 4 starting 1640045 ending 2050055
node 3 starting 1230034 ending 1640044
Wrong write count: 228554 410011 2
Wrong read count: 1048576 2460066
ERROR 1048577 0.0000000E+00 4194304 4.00000000000000

node 1 starting 410012 ending 820022
node 0 starting 1 ending 410011
node 2 starting 820023 ending 1230033
node 3 starting 1230034 ending 1640044
node 4 starting 1640045 ending 2050055
node 5 starting 2050056 ending 2460066
Wrong read count: 1229824 2460066
ERROR 1229825 0.0000000E+00 4919296 4.69140625000000

-- Nathan Baca nathan.b...@gmail.com