Hi, I built OpenMPI 1.8.3 using PGI 14.7 and enabled CUDA support for CUDA 6.0. I have a Fortran test code that tests GPUDirect and have included it here. When I run it across 2 nodes using 4 MPI procs, sometimes it fails with incorrect results. Specifically, sometimes rank 1 does not receive the correct value from one of the neighbors.
The code was compiled using PGI 14.7: mpif90 -o direct.x -acc acc_direct.f90 and executed with: mpirun -np 4 -npernode 2 -mca btl_openib_want_cudagdr 1 ./direct.x Does anyone know if I'm missing something when using GPUDirect? Thanks,Rob Aulwes program acc_direct include 'mpif.h' integer :: ierr, rank, nranks integer, dimension(:), allocatable :: i_ra call mpi_init(ierr) call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr) rank = rank + 1 write(*,*) 'hello from rank ',rank call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks, ierr) allocate( i_ra(nranks) ) call nb_exchange call mpi_finalize(ierr) contains subroutine nb_exchange integer :: i, j integer, dimension(nranks - 1) :: sendreq, recvreq logical :: done integer :: stat(MPI_STATUS_SIZE) i_ra = -1 i_ra(rank) = rank !$acc data copy(i_ra(1:nranks)) !$acc host_data use_device(i_ra) cnt = 0 do i = 1,nranks if ( i .ne. rank ) then cnt = cnt + 1 call MPI_ISEND(i_ra(rank), 1, MPI_INTEGER, i - 1, rank, & MPI_COMM_WORLD, sendreq(cnt), ierr) if ( ierr .ne. MPI_SUCCESS ) write(*,*) 'isend call failed.' call MPI_IRECV(i_ra(i), 1, MPI_INTEGER, i - 1, i, & MPI_COMM_WORLD, recvreq(cnt), ierr) if ( ierr .ne. MPI_SUCCESS ) write(*,*) 'irecv call failed.' endif enddo !$acc end host_data i = 0 do while ( i .lt. 2*cnt ) do j = 1, cnt if ( recvreq(j) .ne. MPI_REQUEST_NULL ) then call MPI_TEST(recvreq(j), done, stat, ierr) if ( ierr .ne. MPI_SUCCESS ) & write(*,*) 'test for irecv call failed.' if ( done ) then i = i + 1 endif endif if ( sendreq(j) .ne. MPI_REQUEST_NULL ) then call MPI_TEST(sendreq(j), done, MPI_STATUS_IGNORE, ierr) if ( ierr .ne. MPI_SUCCESS ) & write(*,*) 'test for irecv call failed.' if ( done ) then i = i + 1 endif endif enddo enddo write(*,*) rank,': nb_exchange: Updating host...' !$acc update host(i_ra(1:nranks)) do j = 1, nranks if ( i_ra(j) .ne. j ) then write(*,*) 'isend/irecv failed.' write(*,*) 'rank', rank,': i_ra(',j,') = ',i_ra(j) endif enddo !$acc end data end subroutine end program