Hello,

I am seeing inconsistent MPI-IO behavior when writing to a Lustre file
system using Open MPI 1.3 with ROMIO. What follows is a simple reproducer
and the output of three six-process runs. Essentially, one or more of the
running processes does not read or write the correct amount of data to its
part of a file residing on a Lustre (parallel) file system.

Any help figuring out what is happening is greatly appreciated. Thanks, Nate

program gcrm_test_io
      implicit none
      include "mpif.h"

      integer X_SIZE

      integer w_me, w_nprocs
      integer my_info

      integer i
      integer (kind=4) :: ierr
      integer (kind=4) :: fileID

      integer (kind=MPI_OFFSET_KIND)        :: mylen
      integer (kind=MPI_OFFSET_KIND)        :: offset
      integer status(MPI_STATUS_SIZE)
      integer count
      integer ncells
      real (kind=4), allocatable, dimension (:)     :: array2
      logical sync

      call mpi_init(ierr)
      call MPI_COMM_SIZE(MPI_COMM_WORLD,w_nprocs,ierr)
      call MPI_COMM_RANK(MPI_COMM_WORLD,w_me,ierr)

      call mpi_info_create(my_info, ierr)
!     optional ways to set things in mpi-io
!     call mpi_info_set   (my_info, "romio_ds_read" , "enable"   , ierr)
!     call mpi_info_set   (my_info, "romio_ds_write", "enable"   , ierr)
!     call mpi_info_set   (my_info, "romio_cb_write", "enable"    , ierr)

      X_SIZE = 410011 ! A 'big' number; with bigger numbers it is more likely to fail
      sync = .true.   ! Extra file synchronization

      ncells = (X_SIZE * w_nprocs)

!  Use node zero to fill it with nines
      if (w_me .eq. 0) then
          call MPI_FILE_OPEN (MPI_COMM_SELF, "output.dat", &
               MPI_MODE_CREATE+MPI_MODE_WRONLY, my_info, fileID, ierr)
          allocate (array2(ncells))
          array2(:) = 9.0
          mylen = ncells
          offset = 0 * 4
          call MPI_FILE_SET_VIEW(fileID, offset, MPI_REAL, MPI_REAL, &
               "native", MPI_INFO_NULL, ierr)
          ! Count arguments are default integers, so convert from OFFSET_KIND
          call MPI_File_write(fileID, array2, int(mylen), MPI_REAL, status, ierr)

          call MPI_Get_count(status, MPI_REAL, count, ierr)
          if (count .ne. mylen) print*, "Wrong initial write count:", count, mylen
          deallocate(array2)
          if (sync) call MPI_FILE_SYNC (fileID,ierr)
          call MPI_FILE_CLOSE (fileID,ierr)
      endif

!  All nodes now fill their area with ones
      call MPI_BARRIER(MPI_COMM_WORLD,ierr)
      allocate (array2(X_SIZE))
      array2(:) = 1.0
      ! Byte offset: multiply by four, since it is real*4. Compute in
      ! OFFSET_KIND so a large X_SIZE cannot overflow a 32-bit integer.
      offset = int(w_me, MPI_OFFSET_KIND) * X_SIZE * 4
      mylen = X_SIZE
      call MPI_FILE_OPEN (MPI_COMM_WORLD, "output.dat", MPI_MODE_WRONLY, &
           my_info, fileID, ierr)
      print*,"node",w_me,"starting",(offset/4) + 1,"ending",(offset/4)+mylen

      call MPI_FILE_SET_VIEW(fileID, offset, MPI_REAL, MPI_REAL, &
           "native", MPI_INFO_NULL, ierr)
      call MPI_File_write(fileID, array2, int(mylen), MPI_REAL, status, ierr)
      call MPI_Get_count(status, MPI_REAL, count, ierr)
      if (count .ne. mylen) print*, "Wrong write count:", count, mylen, w_me
      deallocate(array2)
      if (sync) call MPI_FILE_SYNC (fileID,ierr)
      call MPI_FILE_CLOSE (fileID,ierr)

!  Read it back on node zero to see if it is ok data
      if (w_me .eq. 0) then
          call MPI_FILE_OPEN (MPI_COMM_SELF, "output.dat", MPI_MODE_RDONLY, &
               my_info, fileID, ierr)
          mylen = ncells
          allocate (array2(ncells))
          call MPI_File_read(fileID, array2, int(mylen), MPI_REAL, status, ierr)
          call MPI_Get_count(status, MPI_REAL, count, ierr)
          if (count .ne. mylen) print*, "Wrong read count:", count, mylen
          do i=1,ncells
               if (array2(i) .ne. 1) then
                  print*, "ERROR", i,array2(i), ((i-1)*4),
((i-1)*4)/(1024d0*1024d0) ! Index, value, # of good bytes,MB
                  goto 999
               end if
          end do
          print*, "All done with nothing wrong"
 999      deallocate(array2)
          call MPI_FILE_CLOSE (fileID,ierr)
          call MPI_file_delete ("output.dat",MPI_INFO_NULL,ierr)
      endif

      call mpi_finalize(ierr)

end program gcrm_test_io
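
As an aside, the reproducer checks transfer counts but ignores ierr, and
MPI-IO file handles default to MPI_ERRORS_RETURN, so a failing call will not
abort on its own. A minimal sketch of a checking helper (check_mpi is a
hypothetical name, assuming the same mpif.h environment as the program
above):

! Abort with the decoded error string when an MPI call fails, so that
! failures cannot pass silently. Hypothetical helper, not part of the
! original reproducer.
subroutine check_mpi(ierr, label)
    implicit none
    include "mpif.h"
    integer (kind=4), intent(in) :: ierr
    character (len=*), intent(in) :: label
    character (len=MPI_MAX_ERROR_STRING) :: msg
    integer :: msglen, jerr
    if (ierr .ne. MPI_SUCCESS) then
        call MPI_Error_string(ierr, msg, msglen, jerr)
        print*, "MPI error in ", label, ": ", msg(1:msglen)
        call MPI_Abort(MPI_COMM_WORLD, ierr, jerr)
    end if
end subroutine check_mpi

Calling, e.g., "call check_mpi(ierr, 'MPI_File_write')" after each MPI-IO
call would show whether the short transfers below are reported as errors or
as successful writes.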

Output of three runs with Open MPI 1.3:
 node           0 starting                     1 ending                410011
 node           1 starting                410012 ending                820022
 node           2 starting                820023 ending               1230033
 node           3 starting               1230034 ending               1640044
 node           4 starting               1640045 ending               2050055
 node           5 starting               2050056 ending               2460066
 All done with nothing wrong


 node           0 starting                     1 ending                410011
 node           1 starting                410012 ending                820022
 node           2 starting                820023 ending               1230033
 node           5 starting               2050056 ending               2460066
 node           4 starting               1640045 ending               2050055
 node           3 starting               1230034 ending               1640044
 Wrong write count:      228554                410011           2
 Wrong read count:     1048576               2460066
 ERROR     1048577  0.0000000E+00     4194304   4.00000000000000


 node           1 starting                410012 ending                820022
 node           0 starting                     1 ending                410011
 node           2 starting                820023 ending               1230033
 node           3 starting               1230034 ending               1640044
 node           4 starting               1640045 ending               2050055
 node           5 starting               2050056 ending               2460066
 Wrong read count:     1229824               2460066
 ERROR     1229825  0.0000000E+00     4919296   4.69140625000000

-- 
Nathan Baca
nathan.b...@gmail.com
