Hello,
I am seeing inconsistent MPI-IO behavior when writing to a Lustre
file system using Open MPI 1.3 with ROMIO. What follows is a simple
reproducer and its output. Essentially, one or more of the running
processes does not read or write the correct amount of data to its
part of a file residing on a Lustre (parallel) file system.
Any help figuring out what is happening is greatly appreciated.
Thanks, Nate
!> MPI-IO reproducer: rank 0 creates "output.dat" filled with 9.0, every rank
!> then overwrites its own contiguous slice with 1.0, and rank 0 finally reads
!> the whole file back and reports the first element that is not 1.0.
!> A mismatch between requested and transferred element counts is reported
!> at each step.
program gcrm_test_io
  implicit none
  include "mpif.h"

  ! Size in bytes of one real(kind=4) element; the view displacement is in bytes.
  integer, parameter :: real_bytes = 4
  character(len=*), parameter :: file_name = "output.dat"

  integer :: x_size
  integer :: w_me, w_nprocs
  integer :: my_info
  integer :: i
  integer :: ierr
  integer :: fileID
  integer (kind=MPI_OFFSET_KIND) :: mylen
  integer (kind=MPI_OFFSET_KIND) :: offset
  integer :: status(MPI_STATUS_SIZE)
  integer :: count
  integer :: ncells
  real (kind=4), allocatable, dimension (:) :: array2
  logical :: sync
  logical :: data_ok

  call mpi_init(ierr)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, w_nprocs, ierr)
  call MPI_COMM_RANK(MPI_COMM_WORLD, w_me, ierr)
  call mpi_info_create(my_info, ierr)

  ! optional ways to set things in mpi-io
  ! call mpi_info_set (my_info, "romio_ds_read" , "enable" , ierr)
  ! call mpi_info_set (my_info, "romio_ds_write", "enable" , ierr)
  ! call mpi_info_set (my_info, "romio_cb_write", "enable" , ierr)

  x_size = 410011   ! A 'big' number; with bigger numbers it is more likely to fail
  sync = .true.     ! Extra file synchronization
  ncells = x_size * w_nprocs

  ! Use node zero to fill it with nines
  if (w_me .eq. 0) then
    call MPI_FILE_OPEN(MPI_COMM_SELF, file_name, &
                       MPI_MODE_CREATE+MPI_MODE_WRONLY, my_info, fileID, ierr)
    call check_mpi(ierr, "initial MPI_FILE_OPEN")
    allocate (array2(ncells))
    array2(:) = 9.0
    mylen = ncells
    offset = 0
    call MPI_FILE_SET_VIEW(fileID, offset, MPI_REAL, MPI_REAL, &
                           "native", MPI_INFO_NULL, ierr)
    call check_mpi(ierr, "initial MPI_FILE_SET_VIEW")
    ! The element count must be a default integer, not an MPI_OFFSET_KIND one.
    call MPI_File_write(fileID, array2, ncells, MPI_REAL, status, ierr)
    call check_mpi(ierr, "initial MPI_File_write")
    ! Count in units of the datatype that was actually written.
    call MPI_Get_count(status, MPI_REAL, count, ierr)
    if (count .ne. mylen) print*, "Wrong initial write count:", count, mylen
    deallocate(array2)
    if (sync) call MPI_FILE_SYNC(fileID, ierr)
    call MPI_FILE_CLOSE(fileID, ierr)
    call check_mpi(ierr, "initial MPI_FILE_CLOSE")
  endif

  ! All nodes now fill their area with ones
  call MPI_BARRIER(MPI_COMM_WORLD, ierr)
  allocate (array2(x_size))
  array2(:) = 1.0
  ! Byte displacement of this rank's slice; widen before multiplying so the
  ! product cannot overflow a default integer for large x_size or rank counts.
  offset = int(w_me, MPI_OFFSET_KIND) * int(x_size, MPI_OFFSET_KIND) * real_bytes
  mylen = x_size
  call MPI_FILE_OPEN(MPI_COMM_WORLD, file_name, MPI_MODE_WRONLY, &
                     my_info, fileID, ierr)
  call check_mpi(ierr, "collective MPI_FILE_OPEN")
  print*, "node", w_me, "starting", (offset/real_bytes) + 1, &
          "ending", (offset/real_bytes) + mylen
  call MPI_FILE_SET_VIEW(fileID, offset, MPI_REAL, MPI_REAL, &
                         "native", MPI_INFO_NULL, ierr)
  call check_mpi(ierr, "collective MPI_FILE_SET_VIEW")
  call MPI_File_write(fileID, array2, x_size, MPI_REAL, status, ierr)
  call check_mpi(ierr, "collective-phase MPI_File_write")
  call MPI_Get_count(status, MPI_REAL, count, ierr)
  if (count .ne. mylen) print*, "Wrong write count:", count, mylen, w_me
  deallocate(array2)
  if (sync) call MPI_FILE_SYNC(fileID, ierr)
  call MPI_FILE_CLOSE(fileID, ierr)
  call check_mpi(ierr, "collective MPI_FILE_CLOSE")

  ! Read it back on node zero to see if it is ok data
  if (w_me .eq. 0) then
    call MPI_FILE_OPEN(MPI_COMM_SELF, file_name, MPI_MODE_RDONLY, &
                       my_info, fileID, ierr)
    call check_mpi(ierr, "read-back MPI_FILE_OPEN")
    mylen = ncells
    allocate (array2(ncells))
    call MPI_File_read(fileID, array2, ncells, MPI_REAL, status, ierr)
    call check_mpi(ierr, "read-back MPI_File_read")
    call MPI_Get_count(status, MPI_REAL, count, ierr)
    if (count .ne. mylen) print*, "Wrong read count:", count, mylen

    ! Exact comparison is intended: every element was written as exactly 1.0.
    data_ok = .true.
    do i = 1, ncells
      if (array2(i) /= 1.0) then
        ! Index, value, # of good bytes, MB
        print*, "ERROR", i, array2(i), ((i-1)*4), ((i-1)*4)/(1024d0*1024d0)
        data_ok = .false.
        exit
      end if
    end do
    if (data_ok) print*, "All done with nothing wrong"

    deallocate(array2)
    call MPI_FILE_CLOSE(fileID, ierr)
    call MPI_file_delete(file_name, MPI_INFO_NULL, ierr)
  endif

  call mpi_info_free(my_info, ierr)
  call mpi_finalize(ierr)

contains

  !> Abort the whole job with a readable message when an MPI call returned
  !> something other than MPI_SUCCESS.
  !> code: the ierr value returned by the call; what: label printed in the message.
  subroutine check_mpi(code, what)
    integer, intent(in) :: code
    character(len=*), intent(in) :: what
    character(len=MPI_MAX_ERROR_STRING) :: msg
    integer :: msg_len, ignore

    if (code == MPI_SUCCESS) return
    call MPI_Error_string(code, msg, msg_len, ignore)
    print*, "MPI error in ", what, ": ", msg(1:msg_len)
    call MPI_Abort(MPI_COMM_WORLD, 1, ignore)
  end subroutine check_mpi

end program gcrm_test_io
Output from three runs with Open MPI 1.3 (6 processes):
node 0 starting 1
ending 410011
node 1 starting 410012
ending 820022
node 2 starting 820023
ending 1230033
node 3 starting 1230034
ending 1640044
node 4 starting 1640045
ending 2050055
node 5 starting 2050056
ending 2460066
All done with nothing wrong
node 0 starting 1
ending 410011
node 1 starting 410012
ending 820022
node 2 starting 820023
ending 1230033
node 5 starting 2050056
ending 2460066
node 4 starting 1640045
ending 2050055
node 3 starting 1230034
ending 1640044
Wrong write count: 228554 410011 2
Wrong read count: 1048576 2460066
ERROR 1048577 0.0000000E+00 4194304 4.00000000000000
node 1 starting 410012
ending 820022
node 0 starting 1
ending 410011
node 2 starting 820023
ending 1230033
node 3 starting 1230034
ending 1640044
node 4 starting 1640045
ending 2050055
node 5 starting 2050056
ending 2460066
Wrong read count: 1229824 2460066
ERROR 1229825 0.0000000E+00 4919296 4.69140625000000
--
Nathan Baca
nathan.b...@gmail.com
_______________________________________________
users mailing list
us...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/users