Hi,

I built Open MPI 1.8.3 with PGI 14.7 and enabled CUDA support against CUDA 6.0.  I
have a Fortran test program that exercises GPUDirect, included below.  When
I run it across 2 nodes with 4 MPI processes, it sometimes fails with incorrect
results: specifically, rank 1 sometimes does not receive the correct value
from one of its neighbors.
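In case the build configuration matters, Open MPI was configured along these
lines (the compiler names and CUDA path here are from memory, so treat them
as approximate):

./configure CC=pgcc FC=pgfortran --with-cuda=/usr/local/cuda-6.0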

The code was compiled using PGI 14.7:
mpif90 -o direct.x -acc acc_direct.f90

and executed with:
mpirun -np 4 -npernode 2 -mca btl_openib_want_cuda_gdr 1 ./direct.x
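
One way to double-check that the library itself was built CUDA-aware (this is
the ompi_info check from the Open MPI FAQ, as far as I know) is:

ompi_info --parsable --all | grep mpi_built_with_cuda_support:value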

Does anyone know if I'm missing something when using GPUDirect?

Thanks,
Rob Aulwes


program acc_direct

 implicit none
 include 'mpif.h'

 integer :: ierr, rank, nranks
 integer, dimension(:), allocatable :: i_ra

   call mpi_init(ierr)

   call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
   ! Switch to 1-based ranks so rank r owns i_ra(r).
   rank = rank + 1
   write(*,*) 'hello from rank ', rank

   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks, ierr)

   allocate( i_ra(nranks) )

   call nb_exchange

   call mpi_finalize(ierr)

 contains

 ! Every rank posts a nonblocking send of its own value to each other
 ! rank and a nonblocking receive for each other rank's value, using
 ! device buffers directly (GPUDirect), then polls for completion.
 subroutine nb_exchange

   integer :: i, j, cnt
   integer, dimension(nranks - 1) :: sendreq, recvreq
   logical :: done
   integer :: stat(MPI_STATUS_SIZE)

   i_ra = -1
   i_ra(rank) = rank

   !$acc data copy(i_ra(1:nranks))

   ! host_data hands the device addresses of i_ra to the MPI calls.
   !$acc host_data use_device(i_ra)

   cnt = 0
   do i = 1, nranks
      if ( i .ne. rank ) then
         cnt = cnt + 1

         call MPI_ISEND(i_ra(rank), 1, MPI_INTEGER, i - 1, rank, &
                MPI_COMM_WORLD, sendreq(cnt), ierr)
         if ( ierr .ne. MPI_SUCCESS ) write(*,*) 'isend call failed.'

         call MPI_IRECV(i_ra(i), 1, MPI_INTEGER, i - 1, i, &
                MPI_COMM_WORLD, recvreq(cnt), ierr)
         if ( ierr .ne. MPI_SUCCESS ) write(*,*) 'irecv call failed.'
      endif
   enddo

   !$acc end host_data

   ! Poll until all 2*cnt requests have completed.  MPI_TEST sets a
   ! completed request to MPI_REQUEST_NULL, so each one is counted once.
   i = 0
   do while ( i .lt. 2*cnt )
     do j = 1, cnt
        if ( recvreq(j) .ne. MPI_REQUEST_NULL ) then
            call MPI_TEST(recvreq(j), done, stat, ierr)
            if ( ierr .ne. MPI_SUCCESS ) &
               write(*,*) 'test for irecv call failed.'
            if ( done ) i = i + 1
        endif

        if ( sendreq(j) .ne. MPI_REQUEST_NULL ) then
            call MPI_TEST(sendreq(j), done, MPI_STATUS_IGNORE, ierr)
            if ( ierr .ne. MPI_SUCCESS ) &
               write(*,*) 'test for isend call failed.'
            if ( done ) i = i + 1
        endif
     enddo
   enddo

   write(*,*) rank, ': nb_exchange: Updating host...'
   !$acc update host(i_ra(1:nranks))

   ! After the exchange, slot j should hold rank j's value on every rank.
   do j = 1, nranks
     if ( i_ra(j) .ne. j ) then
       write(*,*) 'isend/irecv failed.'
       write(*,*) 'rank', rank, ': i_ra(', j, ') = ', i_ra(j)
     endif
   enddo

   !$acc end data

 end subroutine

end program
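
If it helps narrow this down, here is a host-staged variant of the exchange I
could compare against (a sketch only, untested; it would sit alongside
nb_exchange as another contained subroutine). It drops host_data so MPI only
ever sees host buffers, and uses MPI_WAITALL instead of the polling loop:

 ! Host-staged variant: run the exchange in host memory, then move the
 ! result to the device with a plain data region (no GPUDirect involved).
 subroutine nb_exchange_host
   integer :: i, cnt
   integer, dimension(nranks - 1) :: sendreq, recvreq

   i_ra = -1
   i_ra(rank) = rank

   cnt = 0
   do i = 1, nranks
      if ( i .ne. rank ) then
         cnt = cnt + 1
         call MPI_ISEND(i_ra(rank), 1, MPI_INTEGER, i - 1, rank, &
                MPI_COMM_WORLD, sendreq(cnt), ierr)
         call MPI_IRECV(i_ra(i), 1, MPI_INTEGER, i - 1, i, &
                MPI_COMM_WORLD, recvreq(cnt), ierr)
      endif
   enddo

   ! MPI_WAITALL replaces the MPI_TEST polling loop used above.
   call MPI_WAITALL(cnt, sendreq, MPI_STATUSES_IGNORE, ierr)
   call MPI_WAITALL(cnt, recvreq, MPI_STATUSES_IGNORE, ierr)

   !$acc data copyin(i_ra(1:nranks))
   ! ... device work on i_ra would go here ...
   !$acc end data
 end subroutine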
