I would suggest having him look at the core file with a debugger and see where 
it fails. Sounds like he has a memory corruption problem.


On Jun 24, 2014, at 3:31 AM, Dr.Peer-Joachim Koch <pk...@bgc-jena.mpg.de> wrote:

> Hi,
> 
> one of our cluster users reported a problem with openmpi.
> He created a short sample (just a few lines) which will start and
> crash after a short time.
> We only see "Fatal error in PMPI_Gather: Other MPI error" - no further 
> details.
> He is using an Intel Fortran compiler with a self-compiled Open MPI (just 
> tested 1.8.1).
> 
> I know nearly nothing about MPI (Open MPI), so I'm asking on this forum.
> Does anybody have any idea?
> 
> Thanks, Peer
> 
> 
> 
> -----------------------makefile----------
> OPTIONS=-assume byterecl -fpp -allow nofpp_comments -free
> DEBUG=-g -d-lines -check -debug -debug-parameters -fpe0 -traceback
> 
> all:
>        rm -f JeDi globe_mod.mod JeDi.out jedi_restart
>        $(SOURCE) ; mpif90 $(OPTIONS) $(DEBUG) -o JeDi globe.f90
> 
> --------------------------
> 
> ----------------globe.f90---------------------
>      program globe
>      use mpi
>      implicit none
> 
>      integer :: mpinfo  = 0
>      integer :: myworld = 0
>      integer :: mypid   = 0
>      integer :: npro    = 1
> 
> !     * The comments give some conditions required to reproduce the problem.
> 
> !     * If the program runs at two hosts, the error message is shown two times
> 
>      integer, parameter :: vv_g_d1 = 2432
>      integer, parameter :: vv_p_d1 = vv_g_d1 / 16  ! requires 16 CPUs
> 
>      integer, parameter :: out_d1  = 2418  ! requires >=2416 (vv_g_d1 - 16)
> 
>      integer, parameter :: d2 = 5001 !  requires >=4282 @ ii=30 / >=6682 @ 
> ii=20 (depends on number of loops, but this limit can change for unknown 
> reason)
> 
>      integer :: ii, jj
> 
>      real    :: vv_p(vv_p_d1,d2)
>      real,allocatable :: vv_g(:,:)
> !     * requires the definition of the variable for write to be defined below 
> vv_g(:,:)
>      real    :: out(out_d1,d2)
> 
>      vv_p(:,:) = 0.0
>      out(:,:) = 0.0
> 
>      call mpi_init(mpinfo)
>      myworld = MPI_COMM_WORLD
>      call mpi_comm_size(myworld, npro, mpinfo)
> !     * The problem requires 16 CPUs
>      if (npro .ne. 16) then; write(*,*) "Works only with 16 CPUs"; stop; endif
>      call mpi_comm_rank(myworld, mypid, mpinfo)
> 
>      if (mypid == 0) then
>        open(11, FILE='jedi_restart', STATUS='replace', FORM='unformatted')
>      endif
> 
>      write(6,*) "test1",mypid ; flush(6)
> 
>      do ii = 1, 25  ! number of loops depends on field size
>        allocate(vv_g(vv_g_d1,d2))
> 
>        do jj = 1, d2
>          call mpi_gather(vv_p(1,jj), vv_p_d1, MPI_REAL, vv_g(1,jj), vv_p_d1, 
> MPI_REAL, 0, myworld, mpinfo)
>        enddo
> 
>        if (mypid == 0) then; write(11) out; flush(11); endif
> 
>        deallocate(vv_g)
>      enddo
> 
>      write(6,*) "test2",mypid ; flush(6)
> 
>      if (mypid == 0) close(11)
> 
>      call mpi_barrier(myworld, mpinfo)
>      call mpi_finalize(mpinfo)
> 
>      end
> ---------------------------------------------end 
> globe.f90----------------------
> 
> -- 
> Mit freundlichem Gruß
>    Peer-Joachim Koch
> _________________________________________________________
> Max-Planck-Institut für Biogeochemie
> Dr. Peer-Joachim Koch
> Hans-Knöll Str.10            Telefon: ++49 3641 57-6705
> D-07745 Jena                 Telefax: ++49 3641 57-7705
> 
> <pkoch.vcf>_______________________________________________
> users mailing list
> us...@open-mpi.org
> Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/users
> Link to this post: 
> http://www.open-mpi.org/community/lists/users/2014/06/24695.php

Reply via email to