Hi,

one of our cluster users reported a problem with openmpi.
He created a short sample program (just a few lines) that starts and
then crashes after a short time.
We only see "Fatal error in PMPI_Gather: Other MPI error" - no further details. He is using the Intel Fortran compiler with a self-compiled Open MPI (just tested with 1.8.1).

I know next to nothing about MPI (Open MPI), so I'm asking on this forum.
Does anybody have an idea?

Thanks, Peer



-----------------------makefile----------
# Build recipe for the JeDi reproducer (globe.f90) using the mpif90 wrapper.
#
# OPTIONS: general compiler options passed to mpif90.
# DEBUG:   debugging / run-time checking options passed to mpif90.
# SOURCE:  optional command executed in the same shell right before the
#          compiler. It is not defined in this makefile, so it presumably
#          comes from the environment or the make command line
#          (NOTE(review): confirm what it is expected to contain).
OPTIONS=-assume byterecl -fpp -allow nofpp_comments -free
DEBUG=-g -d-lines -check -debug -debug-parameters -fpe0 -traceback

# 'all' never produces a file called 'all'; mark it phony so it always runs.
.PHONY: all
all:
	rm -f JeDi globe_mod.mod JeDi.out jedi_restart
	# Only emit "$(SOURCE) ;" when SOURCE is non-empty: an empty expansion
	# would leave a bare leading ';', which the shell rejects.
	$(if $(strip $(SOURCE)),$(SOURCE) ;) mpif90 $(OPTIONS) $(DEBUG) -o JeDi globe.f90

--------------------------

----------------globe.f90---------------------
      program globe
!     Reproducer for a failure reported as
!     "Fatal error in PMPI_Gather: Other MPI error".
!
!     Every rank repeatedly gathers the columns of its local field vv_p
!     into a freshly allocated global field vv_g on rank 0. After each
!     round of gathers, rank 0 appends the array "out" as one unformatted
!     record to the file 'jedi_restart'.
!
!     The program must be started with exactly 16 MPI processes.
      use mpi
      implicit none

!     Return code of the MPI calls (never inspected below).
      integer :: mpinfo  = 0
!     Communicator used for all MPI calls (set to MPI_COMM_WORLD).
      integer :: myworld = 0
!     Rank of this process in myworld.
      integer :: mypid   = 0
!     Number of processes in myworld.
      integer :: npro    = 1

!     * The comments give some conditions required to reproduce the problem.

!     * If the program runs on two hosts, the error message is shown twice.

!     First dimension of the gathered (global) field.
      integer, parameter :: vv_g_d1 = 2432
!     First dimension of the per-rank field; requires 16 CPUs.
      integer, parameter :: vv_p_d1 = vv_g_d1 / 16

!     First dimension of the array written to disk;
!     requires >=2416 (vv_g_d1 - 16).
      integer, parameter :: out_d1 = 2418

!     Second dimension of all fields;
!     requires >=4282 @ ii=30 / >=6682 @ ii=20 (depends on the number of
!     loops, but this limit can change for unknown reason).
      integer, parameter :: d2 = 5001

      integer :: ii, jj

!     NOTE: the declaration order vv_p, vv_g, out is intentional, see below.
      real    :: vv_p(vv_p_d1,d2)
      real,allocatable :: vv_g(:,:)
! * requires the definition of the variable for write to be defined below vv_g(:,:)
      real    :: out(out_d1,d2)

      vv_p(:,:) = 0.0
      out(:,:) = 0.0

      call mpi_init(mpinfo)
      myworld = MPI_COMM_WORLD
      call mpi_comm_size(myworld, npro, mpinfo)

!     * The problem requires 16 CPUs
      if (npro .ne. 16) then
        write(*,*) "Works only with 16 CPUs"
!       The communicator size is the same on every rank, so all ranks take
!       this branch together; shut MPI down cleanly instead of stopping
!       with MPI still initialised.
        call mpi_finalize(mpinfo)
        stop
      endif

      call mpi_comm_rank(myworld, mypid, mpinfo)

!     Only rank 0 owns the output file.
      if (mypid == 0) then
        open(11, FILE='jedi_restart', STATUS='replace', FORM='unformatted')
      endif

      write(6,*) "test1",mypid
      flush(6)

      do ii = 1, 25  ! number of loops depends on field size
        allocate(vv_g(vv_g_d1,d2))

!       Gather one column at a time: each rank contributes vv_p_d1 reals,
!       which rank 0 receives into column jj of vv_g.
        do jj = 1, d2
          call mpi_gather(vv_p(1,jj), vv_p_d1, MPI_REAL, &
                          vv_g(1,jj), vv_p_d1, MPI_REAL, &
                          0, myworld, mpinfo)
        enddo

!       Rank 0 writes "out" (not the gathered data) as one record.
        if (mypid == 0) then
          write(11) out
          flush(11)
        endif

        deallocate(vv_g)
      enddo

      write(6,*) "test2",mypid
      flush(6)

      if (mypid == 0) close(11)

      call mpi_barrier(myworld, mpinfo)
      call mpi_finalize(mpinfo)

      end program globe
---------------------------------------------end globe.f90----------------------

--
Mit freundlichem Gruß
    Peer-Joachim Koch
_________________________________________________________
Max-Planck-Institut für Biogeochemie
Dr. Peer-Joachim Koch
Hans-Knöll Str.10            Telefon: ++49 3641 57-6705
D-07745 Jena                 Telefax: ++49 3641 57-7705

<<attachment: pkoch.vcf>>

Attachment: smime.p7s
Description: S/MIME Cryptographic Signature

Reply via email to