I have attached an example. Compiler: ifort (IFORT) 11.1 20090630 Copyright (C) 1985-2009 Intel Corporation. All rights reserved.
flags: mpif90 -O0 -fp-model precise -traceback -r8 -i4 -fpp -check all -warn all -warn nounused -save-temps -g -check noarg_temp_created -o testbar ./mpibarriertest.f90 OpenMPI: 1.4.3 hangs with 15 processes randomly as described. Evgeniy Message: 10 List-Post: users@lists.open-mpi.org Date: Sat, 28 Jan 2012 08:24:39 -0500 From: Jeff Squyres <jsquy...@cisco.com> Subject: Re: [OMPI users] MPI_Barrier, again To: Open MPI Users <us...@open-mpi.org> Message-ID: <1859c141-813d-46ba-97bc-4b0290fb3...@cisco.com> Content-Type: text/plain; charset=us-ascii Is there any chance you can make a small-ish reproducer of the issue that we can run? On Jan 27, 2012, at 10:45 AM, Evgeniy Shapiro wrote: > Hi > > I have a strange problem with MPI_Barrier occurring when writing to a > file. The output subroutine (the code is in FORTRAN) is called from > the main program and there is an MPI_Barrier just before the call. > > In the subroutine > > 1. Process 0 checks whether the first file exists and, if not, - > creates the file 1, writes the file header and closes the file > > 2. there is a loop over the data sets with an embedded barrier > do i=0, iDatasets > call MPI_Barrier > if I do not own data - cycle and go to the next dataset (and barrier) > check if the file exists, if not - sleep and check again until it > does (needed to make sure the buffer has been flushed) > write my portion of the file > end do > in theory the above should result in a sequential write of datasets > to the file. > > 3. Process 0 checks whether the second file exists and, if not, - > creates the file 2, writes the file header and closes the file > > 2. there is a loop over the data sets with an embedded barrier > do i=0, iDatasets > call MPI_Barrier > if I do not own data - cycle and go to the next dataset (and barrier) > check if the file exists, if not - sleep and check again until it > does (needed to make sure the buffer has been flushed) > write my portion of the file including a link to the 1st file > end do > > The sub is called several times (different files/datasets) with a > barrier between calls, erratically the program hangs in one of the > calls. The likelihood of the program hanging increases with the > increase of the number of processes. DDT shows that when this happens > some of the processes including 0 are waiting at barrier inside the > first loop, some - at the second barrier and one whereas one process > is in the sleep/check file status cycle in the second loop. So somehow > a part of processes go through the 1st barrier before process 0. > This is a debug version, so no loop unrolling etc. > > Is there anything I can do to make sure that the first barrier is > observed by all processes? Any advice greatly appreciated. > > Evgeniy > > > OpenMPI: 1.4.3 > (I cannot use parallel mpi io in this situation for various reasons) > _______________________________________________ > users mailing list > us...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/users -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/
module data1 real, dimension(:,:,:), pointer :: rData integer :: iNx=10, iNy=20,iNz=2 end module data1 module maths ! in 1-10 integer :: iRandom contains integer function iSleep() integer :: iFlag=1 real :: rRand call random_seed() call random_number(rRand) iSleep=int(rRand/0.1) end function iSleep end module maths module ParallelMain include 'mpif.h' integer :: iMyRank integer :: iNumProcs integer :: iCommunicator end module ParallelMain program BarrierTest use ParallelMain use data1 implicit none ! integer :: iErrFlag character(30) :: sTestFile1="testfile1.out" character(30) :: sTestFile2="testfile2.out" character(30) :: sTestFile3="testfile3.out" ! In the first instance - initialise MPI call Start_MPI() call MPI_Barrier(iCommunicator, iErrFlag) allocate(rData(iNx,iNy,iNz)) call docalc call MPI_Barrier(iCommunicator, iErrFlag) if(iErrFlag.ne.MPI_SUCCESS ) print *, "1st barrier failed" call writedata(sTestFile1) call MPI_Barrier(iCommunicator, iErrFlag) if(iErrFlag.ne.MPI_SUCCESS ) print *, "2nd barrier failed" call writedata(sTestFile2) call MPI_Barrier(iCommunicator, iErrFlag) if(iErrFlag.ne.MPI_SUCCESS ) print *, "3rd barrier failed" call writedata(sTestFile3) call MPI_Finalize(iErrFlag) if(iErrFlag.ne.MPI_SUCCESS ) print *, "Finalize failed" stop end program !> Start timer and comm initialisation for MPI subroutine Start_MPI() use ParallelMain !not error flag always local otherwise it is a mess! integer :: iErrFlag(3) ! start initialising iErrFlag=0 call MPI_Init(iErrFlag(1)) iCommunicator=MPI_COMM_WORLD call MPI_Comm_Size(iCommunicator,iNumProcs,iErrFlag(2)) call MPI_Comm_Rank(iCommunicator,iMyRank,iErrFlag(3)) if(maxval(iErrFlag).ne.0) print *, "MPI Initialisation problem" end subroutine Start_MPI ! subroutine docalc use ParallelMain use data1 use maths ! integer :: i,j,k real :: rRand !! ! do k=1,iNz do j=1,iNy do i=1,iNx call Random_number(rRand) rData(i,j,k) = rRand end do end do end do ! do random "work" i=iSleep() print *, iMyRank, " is working for ",i," seconds " call sleep(i) end subroutine docalc subroutine writedata(sFileName) use ParallelMain use data1 implicit none ! logical :: lFileExists integer :: iErrMPI,i,j,k,n character*(*), intent(in) :: sFileName ! if(iMyRank.eq.0) print *, "Writing data" !> 1) if file does not exist - write file header inquire(file="init.out", exist=lFileExists) if(.not.lFileExists) then if(iMyRank.eq.0) then print *, "File: init.out not there, need to create it." print *, "File: init.out writing header." open(unit=10, file="init.out", status="new") write (10,*) "Header: ",iNx,iNy,iNz close(10) end if !> 2) now each process appends its own data do n=1,iNumProcs call MPI_Barrier(iCommunicator, iErrMPI) if(iErrMPI.ne.MPI_SUCCESS ) print *, "1st inner barrier error" if(iMyRank.ne.n) cycle !> test file inquire(file="init.out", exist=lFileExists) do while (.not.lFileExists) call sleep(1) inquire(file="init.out", exist=lFileExists) end do !> open(unit=1, file="init.out") write (1,*) "Processor portion: ",iMyRank !> write data do k=1,iNz do j=1,iNy do i=1,iNx write (1,"(3I,2E15.6)") k,j,i, rData(i,j,k) end do end do end do close(1) end do end if !> 1) if file does not exist - write file header inquire(file=sFileName, exist=lFileExists) if(.not.lFileExists) then if(iMyRank.eq.0) then print *, "File: ",trim(sFileName), " not there, need to create it." open(unit=1, file=sFileName) write (1,*) "Header: ",iNx,iNy,iNz close(1) end if !> 2) now each process appends its own data do n=1,iNumProcs call MPI_Barrier(iCommunicator, iErrMPI) if(iErrMPI.ne.MPI_SUCCESS ) print *, "1st inner barrier error" if(iMyRank.ne.n-1) cycle !> test file inquire(file=sFileName, exist=lFileExists) do while (.not.lFileExists) call sleep(1) inquire(file=sFileName, exist=lFileExists) end do !> open(unit=1, file=sFileName) write (1,*) "Processor portion: ",iMyRank !> write data do k=1,iNz do j=1,iNy do i=1,iNx write (1,"(3I,2E15.6)") k,j,i, rData(i,j,k) end do end do end do close(1) end do end if ! if(iMyRank.eq.0) print *, "Done writing data" ! end subroutine writedata