I have attached an example.

Compiler:
ifort (IFORT) 11.1 20090630
Copyright (C) 1985-2009 Intel Corporation.  All rights reserved.

flags:
mpif90  -O0  -fp-model precise  -traceback -r8 -i4 -fpp -check all
-warn all -warn nounused -save-temps -g -check noarg_temp_created -o
testbar ./mpibarriertest.f90

OpenMPI: 1.4.3


hangs with 15 processes randomly as described.


Evgeniy

Message: 10
List-Post: users@lists.open-mpi.org
Date: Sat, 28 Jan 2012 08:24:39 -0500
From: Jeff Squyres <jsquy...@cisco.com>
Subject: Re: [OMPI users] MPI_Barrier, again
To: Open MPI Users <us...@open-mpi.org>
Message-ID: <1859c141-813d-46ba-97bc-4b0290fb3...@cisco.com>
Content-Type: text/plain; charset=us-ascii

Is there any chance you can make a small-ish reproducer of the issue
that we can run?

On Jan 27, 2012, at 10:45 AM, Evgeniy Shapiro wrote:

> Hi
>
> I have a strange problem with MPI_Barrier occurring when writing to a
> file. The output subroutine (the code is in FORTRAN) is called from
> the main program and there is an MPI_Barrier just before the call.
>
> In the subroutine
>
> 1. Process 0 checks whether the first file exists and, if not, -
> creates the file 1, writes the file header and closes the file
>
> 2. there is a loop over the data sets with an embedded barrier
>  do i=0, iDatasets
>   call MPI_Barrier
>   if I do not own data - cycle and go to the next dataset (and barrier)
>   check if the file exists, if not - sleep and check again until it
> does (needed to make sure the buffer has been flushed)
>   write my portion of the file
>  end do
> in theory the above should result in a sequential write of datasets
> to the file.
>
> 3. Process 0 checks whether the second file exists and, if not, -
> creates the file 2, writes the file header and closes the file
>
> 2. there is a loop over the data sets with an embedded barrier
>  do i=0, iDatasets
>   call MPI_Barrier
>   if I do not own data - cycle and go to the next dataset (and barrier)
>   check if the file exists, if not - sleep and check again until it
> does (needed to make sure the buffer has been flushed)
>   write my portion of the file including a link to the 1st file
>  end do
>
> The sub is called several times (different files/datasets) with a
> barrier between calls, erratically the program hangs in one of the
> calls. The likelihood of the program hanging increases with the
> increase of the number of processes.  DDT shows that when this happens
> some of the processes including 0 are waiting at barrier inside the
> first loop, some - at the second barrier and one whereas one  process
> is in the sleep/check file status cycle in the second loop. So somehow
> a part of  processes go through the 1st barrier before process 0.
> This is a debug version, so no loop unrolling etc.
>
> Is there anything I can do to make sure that the first barrier is
> observed by all processes? Any advice greatly appreciated.
>
> Evgeniy
>
>
> OpenMPI: 1.4.3
> (I cannot use parallel mpi io in this situation for various reasons)
> _______________________________________________
> users mailing list
> us...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/users


--
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/
module data1
 real, dimension(:,:,:), pointer :: rData
 integer :: iNx=10, iNy=20,iNz=2
end module data1

module maths
   
! in 1-10
   integer :: iRandom 
 
contains



integer function iSleep()

 integer :: iFlag=1
 real :: rRand

 call random_seed()
 call random_number(rRand)
 iSleep=int(rRand/0.1)

end function iSleep


end module maths

module ParallelMain
  include 'mpif.h'

  integer :: iMyRank
  integer :: iNumProcs
  integer :: iCommunicator

end module ParallelMain



program BarrierTest
use ParallelMain
use data1


implicit none
!
  integer :: iErrFlag
  character(30) ::  sTestFile1="testfile1.out"
  character(30) ::  sTestFile2="testfile2.out"
  character(30) ::  sTestFile3="testfile3.out"

! In the first instance - initialise MPI
  call Start_MPI()
  call MPI_Barrier(iCommunicator, iErrFlag) 


  allocate(rData(iNx,iNy,iNz))

  call docalc

  call MPI_Barrier(iCommunicator, iErrFlag) 
  if(iErrFlag.ne.MPI_SUCCESS ) print *, "1st barrier  failed"
  call writedata(sTestFile1)

  call MPI_Barrier(iCommunicator, iErrFlag)
  if(iErrFlag.ne.MPI_SUCCESS ) print *, "2nd barrier  failed"
  call writedata(sTestFile2)

  call MPI_Barrier(iCommunicator, iErrFlag)
  if(iErrFlag.ne.MPI_SUCCESS ) print *, "3rd barrier  failed"
  call writedata(sTestFile3)

  call MPI_Finalize(iErrFlag)
  if(iErrFlag.ne.MPI_SUCCESS ) print *, "Finalize failed"
  stop
end program 



!> Start timer and comm initialisation for MPI
subroutine Start_MPI()
  use ParallelMain

!not error flag always local otherwise it is a mess!
  integer :: iErrFlag(3)
! start initialising
  iErrFlag=0
  call MPI_Init(iErrFlag(1))
  iCommunicator=MPI_COMM_WORLD
  call MPI_Comm_Size(iCommunicator,iNumProcs,iErrFlag(2))
  call MPI_Comm_Rank(iCommunicator,iMyRank,iErrFlag(3))
  if(maxval(iErrFlag).ne.0) print *, "MPI Initialisation problem"
end subroutine Start_MPI
!

subroutine docalc
 use ParallelMain
 use data1
 use maths
! 
 integer :: i,j,k

 real :: rRand
!!
!
 do k=1,iNz
  do j=1,iNy
   do i=1,iNx
     call Random_number(rRand)
    rData(i,j,k) = rRand
   end do
  end do
 end do
! do random "work"
 i=iSleep()
 print *, iMyRank, " is working for ",i," seconds "
 call sleep(i)
end subroutine docalc

subroutine writedata(sFileName)
 use ParallelMain
 use data1
 implicit none
!
 logical :: lFileExists
 integer :: iErrMPI,i,j,k,n
 character*(*), intent(in) :: sFileName
!

 if(iMyRank.eq.0) print *, "Writing data"
!> 1) if file does not exist - write file header
 inquire(file="init.out", exist=lFileExists)
 if(.not.lFileExists) then
  if(iMyRank.eq.0) then 
   print *, "File: init.out not there, need to create it."
   print *, "File: init.out writing header."
   open(unit=10, file="init.out", status="new")
   write (10,*) "Header: ",iNx,iNy,iNz
   close(10)
  end if
!> 2) now each process appends its own data
   do n=1,iNumProcs
    call MPI_Barrier(iCommunicator, iErrMPI)
    if(iErrMPI.ne.MPI_SUCCESS )  print *, "1st inner barrier error"
    if(iMyRank.ne.n) cycle
!> test file
    inquire(file="init.out", exist=lFileExists)
    do while (.not.lFileExists)
     call sleep(1)
     inquire(file="init.out", exist=lFileExists)
    end do 
!>
   open(unit=1, file="init.out")
   write (1,*) "Processor portion: ",iMyRank
!> write data
    do k=1,iNz
     do j=1,iNy   
      do i=1,iNx
       write (1,"(3I,2E15.6)") k,j,i, rData(i,j,k)
      end do
     end do
    end do
    close(1)
   end do
  end if
!> 1) if file does not exist - write file header
 inquire(file=sFileName, exist=lFileExists)
 if(.not.lFileExists) then
  if(iMyRank.eq.0) then 
   print *, "File: ",trim(sFileName), " not there, need to create it."
   open(unit=1, file=sFileName)
   write (1,*) "Header: ",iNx,iNy,iNz
   close(1)
  end if
!> 2) now each process appends its own data
   do n=1,iNumProcs
    call MPI_Barrier(iCommunicator, iErrMPI)
    if(iErrMPI.ne.MPI_SUCCESS )  print *, "1st inner barrier error"
    if(iMyRank.ne.n-1) cycle
!> test file
    inquire(file=sFileName, exist=lFileExists)
    do while (.not.lFileExists)
     call sleep(1)
     inquire(file=sFileName, exist=lFileExists)
    end do 
!>
   open(unit=1, file=sFileName)
   write (1,*) "Processor portion: ",iMyRank
!> write data
    do k=1,iNz
     do j=1,iNy   
      do i=1,iNx    
       write (1,"(3I,2E15.6)") k,j,i, rData(i,j,k)
      end do
     end do
    end do
    close(1)
   end do
  end if
!
  if(iMyRank.eq.0) print *, "Done writing data"
!
end subroutine writedata

Reply via email to