Unfortunately, we don't have a whole lot of insight into how the internals of the IO support work -- we mainly bundle the ROMIO package from MPICH2 into Open MPI. Our latest integration was the ROMIO from MPICH2 v1.0.7.

Do you see the same behavior if you run your application under MPICH2 compiled with Lustre ROMIO support?
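
(Sketch only, in case it helps: the Lustre ADIO driver in ROMIO is normally selected at configure time, e.g. with something along the lines of

    ./configure --with-file-system=ufs+nfs+lustre ...

for MPICH2, or by passing --with-io-romio-flags="--with-file-system=ufs+nfs+lustre" when configuring Open MPI's bundled ROMIO -- check the ROMIO README for the exact option syntax on your version.)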


On Mar 3, 2009, at 12:51 PM, Nathan Baca wrote:

Hello,

I am seeing inconsistent MPI-IO behavior when writing to a Lustre file system using Open MPI 1.3 with ROMIO. What follows is a simple reproducer and its output. Essentially, one or more of the running processes does not read or write the correct amount of data to its part of a file residing on a Lustre (parallel) file system.

Any help figuring out what is happening is greatly appreciated. Thanks, Nate

program gcrm_test_io
      implicit none
      include "mpif.h"

      integer X_SIZE

      integer w_me, w_nprocs
      integer my_info

      integer i
      integer (kind=4) :: ierr
      integer (kind=4) :: fileID

      integer (kind=MPI_OFFSET_KIND) :: mylen
      integer (kind=MPI_OFFSET_KIND) :: offset
      integer status(MPI_STATUS_SIZE)
      integer count
      integer ncells
      real (kind=4), allocatable, dimension (:) :: array2
      logical sync

      call mpi_init(ierr)
      call MPI_COMM_SIZE(MPI_COMM_WORLD, w_nprocs, ierr)
      call MPI_COMM_RANK(MPI_COMM_WORLD, w_me, ierr)

      call mpi_info_create(my_info, ierr)
!     Optional ways to set things in MPI-IO
!     call mpi_info_set(my_info, "romio_ds_read" , "enable", ierr)
!     call mpi_info_set(my_info, "romio_ds_write", "enable", ierr)
!     call mpi_info_set(my_info, "romio_cb_write", "enable", ierr)

      x_size = 410011   ! A 'big' number; with bigger numbers it is more likely to fail
      sync = .true.     ! Extra file synchronization

      ncells = (X_SIZE * w_nprocs)

!     Use node zero to fill it with nines
      if (w_me .eq. 0) then
          call MPI_FILE_OPEN(MPI_COMM_SELF, "output.dat", &
                             MPI_MODE_CREATE+MPI_MODE_WRONLY, my_info, fileID, ierr)
          allocate (array2(ncells))
          array2(:) = 9.0
          mylen = ncells
          offset = 0 * 4
          call MPI_FILE_SET_VIEW(fileID, offset, MPI_REAL, MPI_REAL, &
                                 "native", MPI_INFO_NULL, ierr)
          call MPI_File_write(fileID, array2, mylen, MPI_REAL, status, ierr)
          call MPI_Get_count(status, MPI_INTEGER, count, ierr)
          if (count .ne. mylen) print*, "Wrong initial write count:", count, mylen
          deallocate(array2)
          if (sync) call MPI_FILE_SYNC(fileID, ierr)
          call MPI_FILE_CLOSE(fileID, ierr)
      endif

!     All nodes now fill their area with ones
      call MPI_BARRIER(MPI_COMM_WORLD, ierr)
      allocate (array2(X_SIZE))
      array2(:) = 1.0
      offset = (w_me * X_SIZE) * 4   ! multiply by four, since it is real*4
      mylen = X_SIZE
      call MPI_FILE_OPEN(MPI_COMM_WORLD, "output.dat", MPI_MODE_WRONLY, &
                         my_info, fileID, ierr)
      print*, "node", w_me, "starting", (offset/4) + 1, "ending", (offset/4) + mylen
      call MPI_FILE_SET_VIEW(fileID, offset, MPI_REAL, MPI_REAL, &
                             "native", MPI_INFO_NULL, ierr)
      call MPI_File_write(fileID, array2, mylen, MPI_REAL, status, ierr)
      call MPI_Get_count(status, MPI_INTEGER, count, ierr)
      if (count .ne. mylen) print*, "Wrong write count:", count, mylen, w_me
      deallocate(array2)
      if (sync) call MPI_FILE_SYNC(fileID, ierr)
      call MPI_FILE_CLOSE(fileID, ierr)

!     Read it back on node zero to see if it is ok data
      if (w_me .eq. 0) then
          call MPI_FILE_OPEN(MPI_COMM_SELF, "output.dat", MPI_MODE_RDONLY, &
                             my_info, fileID, ierr)
          mylen = ncells
          allocate (array2(ncells))
          call MPI_File_read(fileID, array2, mylen, MPI_REAL, status, ierr)
          call MPI_Get_count(status, MPI_INTEGER, count, ierr)
          if (count .ne. mylen) print*, "Wrong read count:", count, mylen
          do i = 1, ncells
              if (array2(i) .ne. 1) then
                  print*, "ERROR", i, array2(i), ((i-1)*4), ((i-1)*4)/(1024d0*1024d0)   ! Index, value, # of good bytes, MB
                  goto 999
              end if
          end do
          print*, "All done with nothing wrong"
 999      deallocate(array2)
          call MPI_FILE_CLOSE(fileID, ierr)
          call MPI_file_delete("output.dat", MPI_INFO_NULL, ierr)
      endif

      call mpi_finalize(ierr)

end program gcrm_test_io
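
The reproducer is presumably built and run along these lines (the mpif90 wrapper, the source file name, and the six-rank run are assumptions based on the output below):

    mpif90 -o gcrm_test_io gcrm_test_io.f90
    mpirun -np 6 ./gcrm_test_io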

Open MPI 1.3 (three separate runs):

 node 0 starting 1 ending 410011
 node 1 starting 410012 ending 820022
 node 2 starting 820023 ending 1230033
 node 3 starting 1230034 ending 1640044
 node 4 starting 1640045 ending 2050055
 node 5 starting 2050056 ending 2460066
 All done with nothing wrong


 node 0 starting 1 ending 410011
 node 1 starting 410012 ending 820022
 node 2 starting 820023 ending 1230033
 node 5 starting 2050056 ending 2460066
 node 4 starting 1640045 ending 2050055
 node 3 starting 1230034 ending 1640044
 Wrong write count:      228554                410011           2
 Wrong read count:     1048576               2460066
 ERROR     1048577  0.0000000E+00     4194304   4.00000000000000


 node 1 starting 410012 ending 820022
 node 0 starting 1 ending 410011
 node 2 starting 820023 ending 1230033
 node 3 starting 1230034 ending 1640044
 node 4 starting 1640045 ending 2050055
 node 5 starting 2050056 ending 2460066
 Wrong read count:     1229824               2460066
 ERROR     1229825  0.0000000E+00     4919296   4.69140625000000

--
Nathan Baca
nathan.b...@gmail.com


--
Jeff Squyres
Cisco Systems
