At first glance, I would say you are confusing the two variables that count
your requests, reqcount and nrequests.
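
For instance (a minimal sketch using the names from your listing; the final
assignment is my guess at what was intended, since exc_init only ever
updates nrequests while MPI_StartAll reads the saved reqcount):

    subroutine exc_init
    integer :: nrequests            ! local counter that actually gets incremented
    integer :: i0, ierrs(12), ktag
    nrequests = 0
    ! ... MPI_Send_Init / MPI_Recv_Init filling requests(nrequests+1),
    !     requests(nrequests+2), then nrequests = nrequests + 2 ...
    reqcount = nrequests            ! copy into the saved counter that
                                    ! MPI_StartAll/MPI_WaitAll actually use
    end subroutine exc_init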

  George.


On Fri, Nov 25, 2016 at 7:11 AM, Paolo Pezzutto <bao...@gmail.com> wrote:

> Dear all,
>
> I am struggling with an invalid memory reference when calling SUB EXC_MPI
> (MOD01), precisely at MPI_StartAll (see the comment below).
>
> @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
>     ! ********** file mod01.f90 ************ !
>     MODULE MOD01
>
>     implicit none
>     include 'mpif.h'
>     ! alternatively
>     ! use mpi
>     ! implicit none
>     PRIVATE
>     ! ...
>     INTERFACE exc_mpi
>        MODULE PROCEDURE exc_mpi
>     END INTERFACE
>     PUBLIC exc_mpi
>
>     CONTAINS
>
>     subroutine exc_mpi (X)
>     !! send and receive from procs PN0 <-> PN1 and PN0 <-> PN2
>     real, dimension (ni:ns, m, l), intent(inout) :: X
>
>     logical, save :: frstime=.true.
>     integer, save :: mpitype_sn, mpitype_sp, mpitype_rn, mpitype_rp
>     integer, save :: requests(4), reqcount
>     integer       :: istatus(MPI_STATUS_SIZE,4), ierr
>
>     if (frstime) then
>        call exc_init()
>        frstime = .false.
>     end if
>     call MPI_StartAll(reqcount,requests,ierr)        !!  <-- segfault here
>     call MPI_WaitAll(reqcount,requests,istatus,ierr)
>     return
>
>     contains
>
>     subroutine exc_init
>
>     integer :: i0, ierrs(12), ktag
>
>     nrequests = 0
>     ierrs=0
>     ktag = 1
>
>     ! find i0
>
>     if ( condition1 ) then
>     ! send to PN2
>        call MPI_Type_Vector(m*l, messlengthup(PN2), ns-ni+1, MPI_REAL, &
>                             mpitype_sn, ierrs(1))
>        call MPI_Type_Commit(mpitype_sn, ierrs(3))
>        call MPI_Send_Init(X(i0, 1, 1), 1, mpitype_sn, PN2-1, ktag, &
>                           MPI_COMM_WORLD, requests(reqcount+1), ierrs(5))
>     ! receive from PN2
>        call MPI_Type_Vector(m*l, messlengthdo(PN0), ns-ni+1, MPI_REAL, &
>                             mpitype_rn, ierrs(2))
>        call MPI_Type_Commit(mpitype_rn,ierrs(4))
>        call MPI_Recv_Init(X(nend(irank)+1, 1, 1), 1, mpitype_rn, PN2-1, &
>                           ktag+1, MPI_COMM_WORLD, requests(nrequests+2), ierrs(6))
>        nrequests = nrequests + 2
>     end if
>
>     if ( condition2 ) then
>     !   send and rec PN0 <-> PN1
>        nrequests = nrequests + 2
>     end if
>
>     return
>     end subroutine exc_init
>
>     end subroutine exc_mpi
>
>     ! ...
>
>     END MODULE MOD01
> @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
>
> The calls are coming from this other module in a separate file:
>
> @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
>
>     ! ********** file mod02.f90 ************ !
>     MODULE MOD02
>
>     use MOD01, only: exc_mpi
>
>     IMPLICIT NONE
>     include 'mpif.h'
>     ! alternatively
>     ! use mpi
>     ! implicit none
>     PRIVATE
>
>     ! ...
>
>     INTERFACE MYSUB
>        MODULE PROCEDURE MYSUB
>     END INTERFACE
>     PUBLIC MYSUB
>
>     CONTAINS
>
>     SUBROUTINE MYSUB (Y)
>
>     IMPLICIT NONE
>     REAL,    INTENT(INOUT)   :: Y(nl:nr, m, l) ! ni<=nl, nr>=ns
>     REAL, ALLOCATABLE, DIMENSION(:,:,:) :: Y0
>     !...
>     allocate ( Y0(ni-1:ns, 1:m, 1:l) )
>
>     DO i = 1, icount
>
>        Y0(nl:nr,:,:) = F3(:,:,:)
>        call exc_mpi ( Y0(ni:ns, :, :) )           !  <-- segfault here
>        call mpi_barrier (mpi_comm_world, ierr)
>        Y0(ni-1,:,:) = 0.
>        CALL SUB01
>
>     END DO
>     deallocate (Y0)
>     RETURN
>
>     CONTAINS
>
>     SUBROUTINE SUB01
>     !...
>        FRE: DO iterm = 1, m
>           DIR: DO iterl = 1, l
>              DO itern = nl, nr
>     !            Y(itern, iterm, iterl) = some_lin_combination(Y0)
>              END DO
>           END DO DIR
>        END DO FRE
>
>     END SUBROUTINE SUB01
>
>     ! ...
>     END SUBROUTINE MYSUB
>
>     END MODULE MOD02
> @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
>
> A segmentation fault is raised at runtime when MAIN (actually a sub in a
> module) calls MYSUB (in MOD02) for the second time, i.e. when MPI_StartAll
> runs without re-initialisation. The segfault is an invalid memory
> reference, but I swear that the bounds aren't changing.
>
> The error is not systematic, in the sense that the program works when the
> job is split over up to a certain number of processes, say NPMAX, which
> depends on the size of the decomposed array (the bigger the size, the
> higher NPMAX). With more processes than NPMAX, the program segfaults.
>
> The same issue arises with [gfortran+ompi] and [gfortran+mpich], while
> [ifort+mpich] does not always segfault, but one process might hang
> indefinitely. So I bet it is not strictly an OMPI issue; apologies for
> posting here. It is not a single-version issue either: the same happens on
> deb-jessie, Ubuntu 14 and a personal 2.0.1 build (I can share config.log
> if necessary).
>
> The only thing the systems have in common is glibc (2.19, distro stable).
> Actually, the backtrace of ifort+mpich lists libpthread.so. I have not
> tried alternative C libraries, nor the newest glibc.
>
> Intel virtual threading is enabled on all three architectures (one mini
> HPC cluster and two PCs).
>
> This error is not reported on "serious" architectures like NEC, Sun
> (ifort+ompi) and IBM.
>
> I am trying to find a possible MPI workaround for deb-based systems,
> while maintaining efficiency as much as possible.
>
> As can be seen, MOD02 passes a sliced, non-contiguous array Y0 to the
> exchange procedure (MOD01). But I should not have to worry about that,
> because MPI_Type_Vector is expected to do the remapping for me.
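>
> (For reference, the remapping I mean is roughly the sketch below, with
> blocklen standing in for messlengthup/messlengthdo: one vector type
> describing m*l blocks of blocklen reals, with block starts spaced ns-ni+1
> reals apart, i.e. one strip out of each column X(:, j, k).)
>
>     integer :: strip_type, ierr
>     ! count = m*l blocks, each blocklen REALs long, successive blocks
>     ! start (ns-ni+1) REALs apart -- the layout of X(i0:i0+blocklen-1,:,:)
>     call MPI_Type_Vector(m*l, blocklen, ns-ni+1, MPI_REAL, strip_type, ierr)
>     call MPI_Type_Commit(strip_type, ierr)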
>
> One way I could almost overcome the fault (NPMAX grows by one order of
> magnitude) is to exchange the dimensions back and forth, but this slows
> execution down by approximately a factor of 2.
>
> Initialising at each call (calling exc_init() outside the if statement in
> sub exc_mpi) does solve the problem, but it is totally inefficient, since
> MAIN (not listed) loops a lot.
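>
> (Concretely, the "initialise at each call" variant is the sketch below;
> the MPI_Request_free loop is my addition, so the previously built
> persistent requests are not leaked when the types and requests are
> rebuilt. It assumes reqcount is initialised to 0 before the first call.)
>
>     integer :: i
>     do i = 1, reqcount                 ! nothing to free on the first call
>        call MPI_Request_free(requests(i), ierr)
>     end do
>     call exc_init()                    ! rebuild types and persistent requests
>     call MPI_StartAll(reqcount, requests, ierr)
>     call MPI_WaitAll(reqcount, requests, istatus, ierr)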
>
> I bet that permanently exchanging the X/Y/Y0 dimensions would solve it,
> but I do not want to lose the efficiency of nested loops like SUB01 (the
> first dimension is much bigger than the others).
>
> Indeed, MOD02 creates a temporary array. Making that copy explicitly
> doesn't solve the issue.
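>
> (By "explicitly" I mean copying into a contiguous work array of my own,
> something like the sketch below; Ytmp is a name used only here.)
>
>     real, allocatable :: Ytmp(:,:,:)
>     allocate ( Ytmp(ni:ns, 1:m, 1:l) )
>     Ytmp = Y0(ni:ns, :, :)            ! explicit contiguous copy-in
>     call exc_mpi ( Ytmp )             ! exchange on the contiguous buffer
>     Y0(ni:ns, :, :) = Ytmp            ! copy the exchanged halos back out
>     deallocate (Ytmp)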
>
> Forcing allocation on the heap or on the stack doesn't solve it either.
>
> Any hint out there?
>
> Thanks for reading
>
> P
>
>
>
_______________________________________________
users mailing list
users@lists.open-mpi.org
https://rfd.newmexicoconsortium.org/mailman/listinfo/users
