Hi Nathan,

  I just ran your code here and it worked fine - CentOS 5 on dual Xeons w/
IB network, and the kernel is 2.6.18-53.1.14.el5_lustre.1.6.5smp.  I used an
OpenMPI 1.3.0 install compiled with Intel 11.0.081 and, independently, one
with GCC 4.1.2.  I tried a few different times with varying numbers of
processors.

  (Both executables were compiled with -O2)

  I'm sure the main OpenMPI guys will have better ideas, but in the meantime
what kernel, OS and compilers are you using?  And does it happen when you
write to a single OST?  Make a directory and try setting the stripe count to
1 (e.g., 'lfs setstripe <directory name> 1048576 0 1' will give you, I think,
a 1MB stripe size starting at OST 0 with a stripe count of 1.)  I'm just wondering
whether it's something with your hardware, maybe a particular OST, since it
seems to work for me.

  ... Sorry I can't be of more help, but I imagine the regular experts will
chime in shortly.

  Cheers,
  - Brian


On Tue, Mar 3, 2009 at 12:51 PM, Nathan Baca <nathan.b...@gmail.com> wrote:

> Hello,
>
> I am seeing inconsistent mpi-io behavior when writing to a Lustre file
> system using open mpi 1.3 with romio. What follows is a simple reproducer
> and output. Essentially one or more of the running processes does not read
> or write the correct amount of data to its part of a file residing on a
> Lustre (parallel) file system.
>
> Any help figuring out what is happening is greatly appreciated. Thanks,
> Nate
>
> program gcrm_test_io
>   ! Reproducer for inconsistent MPI-IO results on a Lustre file system.
>   !
>   ! Phase 1: rank 0 creates "output.dat" and fills it with 9.0.
>   ! Phase 2: every rank overwrites its own x_size-element slice with 1.0.
>   ! Phase 3: rank 0 reads the whole file back, reports the first element
>   !          that is not 1.0, and deletes the file.
>   !
>   ! Element counts handed to MPI are default integers and byte
>   ! displacements are MPI_OFFSET_KIND, as the Fortran bindings require.
>   ! Every MPI-IO return code is checked, because file handles default to
>   ! MPI_ERRORS_RETURN and a failed call would otherwise pass silently.
>   implicit none
>   include "mpif.h"
>
>   ! Elements written per rank.  A 'big' number; with bigger numbers the
>   ! failure is more likely.
>   integer, parameter :: x_size = 410011
>   ! Extra file synchronization before each close.
>   logical, parameter :: sync = .true.
>   character(len=*), parameter :: fname = "output.dat"
>
>   integer :: w_me, w_nprocs
>   integer :: my_info
>   integer :: i
>   integer :: ierr
>   integer :: fileID
>   integer :: status(MPI_STATUS_SIZE)
>   integer :: count
>   integer :: ncells
>   integer :: real_bytes
>   integer (kind=MPI_OFFSET_KIND) :: offset
>   integer (kind=MPI_OFFSET_KIND) :: first_cell
>   integer (kind=MPI_OFFSET_KIND) :: good_bytes
>   ! Default real, so the buffer always matches MPI_REAL.
>   real, allocatable :: array2(:)
>   logical :: all_ones
>
>   call mpi_init(ierr)
>   call mpi_comm_size(MPI_COMM_WORLD, w_nprocs, ierr)
>   call mpi_comm_rank(MPI_COMM_WORLD, w_me, ierr)
>
>   ! Size of one MPI_REAL in bytes, instead of a hard-coded 4.
>   call mpi_type_size(MPI_REAL, real_bytes, ierr)
>
>   call mpi_info_create(my_info, ierr)
>   ! optional ways to set things in mpi-io
>   ! call mpi_info_set(my_info, "romio_ds_read" , "enable", ierr)
>   ! call mpi_info_set(my_info, "romio_ds_write", "enable", ierr)
>   ! call mpi_info_set(my_info, "romio_cb_write", "enable", ierr)
>
>   ncells = x_size * w_nprocs
>
>   ! Use node zero to fill it with nines
>   if (w_me == 0) then
>     call mpi_file_open(MPI_COMM_SELF, fname, &
>                        ior(MPI_MODE_CREATE, MPI_MODE_WRONLY), &
>                        my_info, fileID, ierr)
>     call check(ierr, "initial open")
>     allocate (array2(ncells))
>     array2(:) = 9.0
>     offset = 0
>     call mpi_file_set_view(fileID, offset, MPI_REAL, MPI_REAL, &
>                            "native", MPI_INFO_NULL, ierr)
>     call check(ierr, "initial set_view")
>     call mpi_file_write(fileID, array2, ncells, MPI_REAL, status, ierr)
>     call check(ierr, "initial write")
>     ! Count in the datatype that was actually written.
>     call mpi_get_count(status, MPI_REAL, count, ierr)
>     if (count /= ncells) print *, "Wrong initial write count:", &
>                                   count, ncells
>     deallocate (array2)
>     if (sync) then
>       call mpi_file_sync(fileID, ierr)
>       call check(ierr, "initial sync")
>     end if
>     call mpi_file_close(fileID, ierr)
>     call check(ierr, "initial close")
>   end if
>
>   ! All nodes now fill their area with ones
>   call mpi_barrier(MPI_COMM_WORLD, ierr)
>   allocate (array2(x_size))
>   array2(:) = 1.0
>   ! Byte displacement of this rank's slice, evaluated in offset kind so
>   ! the product cannot overflow a default integer.
>   offset = int(w_me, MPI_OFFSET_KIND) * x_size * real_bytes
>   call mpi_file_open(MPI_COMM_WORLD, fname, MPI_MODE_WRONLY, &
>                      my_info, fileID, ierr)
>   call check(ierr, "shared open")
>   first_cell = offset / real_bytes + 1
>   print *, "node", w_me, "starting", first_cell, &
>            "ending", first_cell + x_size - 1
>   call mpi_file_set_view(fileID, offset, MPI_REAL, MPI_REAL, &
>                          "native", MPI_INFO_NULL, ierr)
>   call check(ierr, "shared set_view")
>   call mpi_file_write(fileID, array2, x_size, MPI_REAL, status, ierr)
>   call check(ierr, "shared write")
>   call mpi_get_count(status, MPI_REAL, count, ierr)
>   if (count /= x_size) print *, "Wrong write count:", count, x_size, w_me
>   deallocate (array2)
>   if (sync) then
>     call mpi_file_sync(fileID, ierr)
>     call check(ierr, "shared sync")
>   end if
>   call mpi_file_close(fileID, ierr)
>   call check(ierr, "shared close")
>
>   ! Read it back on node zero to see if it is ok data
>   if (w_me == 0) then
>     call mpi_file_open(MPI_COMM_SELF, fname, MPI_MODE_RDONLY, &
>                        my_info, fileID, ierr)
>     call check(ierr, "read-back open")
>     allocate (array2(ncells))
>     ! Pre-fill so that elements missed by a short read are well defined
>     ! and are reported by the check below.
>     array2(:) = 0.0
>     call mpi_file_read(fileID, array2, ncells, MPI_REAL, status, ierr)
>     call check(ierr, "read-back read")
>     call mpi_get_count(status, MPI_REAL, count, ierr)
>     if (count /= ncells) print *, "Wrong read count:", count, ncells
>
>     all_ones = .true.
>     do i = 1, ncells
>       if (array2(i) /= 1.0) then
>         ! Index, value, # of good bytes, MB
>         good_bytes = int(i - 1, MPI_OFFSET_KIND) * real_bytes
>         print *, "ERROR", i, array2(i), good_bytes, &
>                  good_bytes / (1024d0 * 1024d0)
>         all_ones = .false.
>         exit
>       end if
>     end do
>     if (all_ones) print *, "All done with nothing wrong"
>
>     deallocate (array2)
>     call mpi_file_close(fileID, ierr)
>     call check(ierr, "read-back close")
>     call mpi_file_delete(fname, MPI_INFO_NULL, ierr)
>     call check(ierr, "delete")
>   end if
>
>   call mpi_info_free(my_info, ierr)
>   call mpi_finalize(ierr)
>
> contains
>
>   ! Abort the whole job, printing the MPI error text, when an MPI call
>   ! did not return MPI_SUCCESS.
>   !   code : return code of the MPI call just made
>   !   what : short label naming that call in the message
>   subroutine check(code, what)
>     integer, intent(in) :: code
>     character(len=*), intent(in) :: what
>     character(len=MPI_MAX_ERROR_STRING) :: msg
>     integer :: msg_len, rc
>
>     if (code == MPI_SUCCESS) return
>     call mpi_error_string(code, msg, msg_len, rc)
>     print *, "MPI-IO failure in ", what, " on node", w_me, ": ", &
>              msg(1:msg_len)
>     call mpi_abort(MPI_COMM_WORLD, 1, rc)
>   end subroutine check
>
> end program gcrm_test_io
>
> 1.3 Open MPI
>  node           0 starting                     1 ending
> 410011
>  node           1 starting                410012 ending
> 820022
>  node           2 starting                820023 ending
> 1230033
>  node           3 starting               1230034 ending
> 1640044
>  node           4 starting               1640045 ending
> 2050055
>  node           5 starting               2050056 ending
> 2460066
>  All done with nothing wrong
>
>
>  node           0 starting                     1 ending
> 410011
>  node           1 starting                410012 ending
> 820022
>  node           2 starting                820023 ending
> 1230033
>  node           5 starting               2050056 ending
> 2460066
>  node           4 starting               1640045 ending
> 2050055
>  node           3 starting               1230034 ending
> 1640044
>  Wrong write count:      228554                410011           2
>  Wrong read count:     1048576               2460066
>  ERROR     1048577  0.0000000E+00     4194304   4.00000000000000
>
>
>  node           1 starting                410012 ending
> 820022
>  node           0 starting                     1 ending
> 410011
>  node           2 starting                820023 ending
> 1230033
>  node           3 starting               1230034 ending
> 1640044
>  node           4 starting               1640045 ending
> 2050055
>  node           5 starting               2050056 ending
> 2460066
>  Wrong read count:     1229824               2460066
>  ERROR     1229825  0.0000000E+00     4919296   4.69140625000000
>
> --
> Nathan Baca
> nathan.b...@gmail.com
>
> _______________________________________________
> users mailing list
> us...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/users
>

Reply via email to