Hi,

I wrote a simple program to check whether Open MPI can really handle CUDA device pointers as promised in the FAQ, and how efficiently. The program (see below) breaks when MPI communication is performed between two devices that are on the same node but under different IOHs of a dual-processor Intel machine. Note that cudaMemcpy works for such device pairs, although not as efficiently as for devices on the same IOH with GPUDirect enabled.
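To illustrate what I mean by "cudaMemcpy works": here is a minimal sketch (no MPI involved) of a copy between two GPUs that have no peer access, where UVA lets the runtime stage the transfer through host memory. The pairing of devices 0 and 4 is only an assumed example for this machine.

#include <stdio.h>
#include <cuda_runtime.h>

#define NBYTES 100000000

/* Copy between two GPUs without peer access (e.g. under different IOHs):
 * with UVA the runtime stages the transfer through host memory.
 * Devices 0 and 4 are only an assumed example pairing for this machine. */
int main(void)
{
        void *src, *dst;

        cudaSetDevice(0);
        cudaMalloc(&src, NBYTES);
        cudaSetDevice(4);
        cudaMalloc(&dst, NBYTES);

        cudaMemcpy(dst, src, NBYTES, cudaMemcpyDefault);  /* direction inferred via UVA */
        cudaDeviceSynchronize();
        printf("copy finished: %s\n", cudaGetErrorString(cudaGetLastError()));
        return 0;
}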

Here's the output from my program:

===============================

> mpirun -n 6 ./a.out
Process 1 of 6 is on typhoon1
Process 2 of 6 is on typhoon1
Process 0 of 6 is on typhoon1
Process 4 of 6 is on typhoon1
Process 5 of 6 is on typhoon1
Process 3 of 6 is on typhoon1
device 2 is set
device 1 is set
device 0 is set
Using regular memory
device 5 is set
device 3 is set
device 4 is set
Host->device bandwidth for processor 1: 1587.993499 MB/sec
Host->device bandwidth for processor 2: 1570.275316 MB/sec
Host->device bandwidth for processor 3: 1569.890751 MB/sec
Host->device bandwidth for processor 5: 1483.637702 MB/sec
Host->device bandwidth for processor 0: 1480.888029 MB/sec
Host->device bandwidth for processor 4: 1476.241371 MB/sec
MPI_Send/MPI_Receive,  Host  [0] -> Host  [1] bandwidth: 3338.57 MB/sec
MPI_Send/MPI_Receive,  Device[0] -> Host  [1] bandwidth: 420.85 MB/sec
MPI_Send/MPI_Receive,  Host  [0] -> Device[1] bandwidth: 362.13 MB/sec
MPI_Send/MPI_Receive,  Device[0] -> Device[1] bandwidth: 6552.35 MB/sec
MPI_Send/MPI_Receive,  Host  [0] -> Host  [2] bandwidth: 3238.88 MB/sec
MPI_Send/MPI_Receive,  Device[0] -> Host  [2] bandwidth: 418.18 MB/sec
MPI_Send/MPI_Receive,  Host  [0] -> Device[2] bandwidth: 362.06 MB/sec
MPI_Send/MPI_Receive,  Device[0] -> Device[2] bandwidth: 5022.82 MB/sec
MPI_Send/MPI_Receive,  Host  [0] -> Host  [3] bandwidth: 3295.32 MB/sec
MPI_Send/MPI_Receive,  Device[0] -> Host  [3] bandwidth: 418.90 MB/sec
MPI_Send/MPI_Receive,  Host  [0] -> Device[3] bandwidth: 359.16 MB/sec
MPI_Send/MPI_Receive,  Device[0] -> Device[3] bandwidth: 5019.89 MB/sec
MPI_Send/MPI_Receive,  Host  [0] -> Host  [4] bandwidth: 4619.55 MB/sec
MPI_Send/MPI_Receive,  Device[0] -> Host  [4] bandwidth: 419.24 MB/sec
MPI_Send/MPI_Receive,  Host  [0] -> Device[4] bandwidth: 364.52 MB/sec
--------------------------------------------------------------------------
The call to cuIpcOpenMemHandle failed. This is an unrecoverable error
and will cause the program to abort.
  cuIpcOpenMemHandle return value:   205
  address: 0x200200000
Check the cuda.h file for what the return value means. Perhaps a reboot
of the node will clear the problem.
--------------------------------------------------------------------------
[typhoon1:06098] Failed to register remote memory, rc=-1
[typhoon1:06098] [[33788,1],4] ORTE_ERROR_LOG: Error in file pml_ob1_recvreq.c at line 465

========================================================
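The failure is in the CUDA IPC path (cuIpcOpenMemHandle) of the smcuda BTL. If this trunk build exposes an MCA switch for that path (I assume something like btl_smcuda_use_cuda_ipc, but I have not verified the parameter name in this revision), a possible cross-check would be to rerun with IPC disabled:

> mpirun --mca btl_smcuda_use_cuda_ipc 0 -n 6 ./a.out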



Comment:
My machine has two six-core Intel processors with HT on, yielding 24 virtual processors, and 6 Tesla C2070s. The devices fall into two groups, one with 4 and the other with 2 devices. Devices within the same group can talk to each other via GPUDirect at approx. 6 GB/s; devices in different groups can use cudaMemcpy with UVA at somewhat lower transfer rates.
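To document that grouping, here is a minimal sketch (runtime API only, no MPI) that prints the peer-access matrix; on this machine a pair should report 1 only when both devices sit under the same IOH:

#include <stdio.h>
#include <cuda_runtime.h>

/* Print the peer-access matrix: entry (i,j) is 1 when device i can map
 * device j's memory directly (GPUDirect P2P), 0 otherwise. */
int main(void)
{
        int n, i, j;
        cudaGetDeviceCount(&n);
        for (i = 0; i < n; i++)
        {
                for (j = 0; j < n; j++)
                {
                        int ok = 0;
                        if (i != j)
                                cudaDeviceCanAccessPeer(&ok, i, j);
                        printf("%d ", ok);
                }
                printf("\n");
        }
        return 0;
}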


My Open MPI is openmpi-1.9a1r26904, compiled from source with:

./configure -prefix=/home/zkoza/openmpi.1.9.cuda --with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/lib

> nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2012 NVIDIA Corporation
Built on Thu_Apr__5_00:24:31_PDT_2012
Cuda compilation tools, release 4.2, V0.2.1221

gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5)

Ubuntu 12.04 64-bit

NVIDIA driver version: 295.41

The program was compiled with:
> mpicc prog.c -lcuda -lcudart -L/usr/local/cuda/lib64 -I/usr/local/cuda/include
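
The pinned-memory variant (the #ifdef PINNED branch in the source below) would be built by adding -DPINNED to the same command:
> mpicc -DPINNED prog.c -lcuda -lcudart -L/usr/local/cuda/lib64 -I/usr/local/cuda/include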



================================================
SOURCE CODE:
================================================

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include <mpi.h>

#define NREPEAT 20
#define NBYTES 100000000


#define CALL(x)\
{\
  cudaError_t err = x;\
  if (cudaSuccess != err)\
  {\
    printf("CUDA ERROR %s at %d\n", cudaGetErrorString(err),  __LINE__ ); \
    cudaGetLastError();\
  }\
}

int main (int argc, char *argv[])
{
        int rank, size, n, len;
        void *a_h, *a_d;
        struct timeval time[2];
        double bandwidth;
        char name[MPI_MAX_PROCESSOR_NAME];
        MPI_Status status;

        MPI_Init (&argc, &argv);
        MPI_Comm_rank (MPI_COMM_WORLD, &rank);
        MPI_Comm_size (MPI_COMM_WORLD, &size);
        MPI_Get_processor_name(name, &len);

        printf("Process %d of %d is on %s\n", rank, size, name);
        fflush(stdout);

        CALL( cudaSetDevice(rank) );
        printf("device %d is set\n", rank);
        fflush(stdout);

#ifdef PINNED
        if (rank == 0)
                printf("Using pinned memory \n");
        CALL( cudaMallocHost( (void **) &a_h, NBYTES) );
#else
        if (rank == 0)
                printf("Using regular memory \n");
        a_h = malloc(NBYTES);
#endif
        CALL( cudaMalloc( (void **) &a_d, NBYTES) );

        MPI_Barrier(MPI_COMM_WORLD);

        gettimeofday(&time[0], NULL);
        for (n=0; n<NREPEAT; n++ )
        {
                CALL( cudaMemcpy(a_d, a_h, NBYTES, cudaMemcpyHostToDevice) );
        }
        gettimeofday(&time[1], NULL);

        bandwidth = time[1].tv_sec - time[0].tv_sec;
        bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);
        bandwidth = (double)NBYTES*NREPEAT/1.e6/bandwidth;

printf("Host->device bandwidth for processor %d: %f MB/sec\n", rank, bandwidth);

        /* Test MPI send/recv bandwidth. */

        MPI_Barrier(MPI_COMM_WORLD);

        int i, proc;
        for (proc = 1; proc < size; proc++)
        {
                for (i = 0; i < 4; i++)
                {
                        /* i encodes the four source/destination combinations:
                           host->host, device->host, host->device, device->device */
                        const int from_host = (i & 1) == 0;
                        const int to_host   = (i & 2) == 0;
                        const char* tab[2] = {"Device", "Host  "};
                        void* ptr[2] = {a_d, a_h};

                        MPI_Barrier(MPI_COMM_WORLD);
                        gettimeofday(&time[0], NULL);
                        for (n = 0; n < NREPEAT; n++)
                        {
                                if (rank == 0)
                                        MPI_Send(ptr[from_host], NBYTES/sizeof(int), MPI_INT, proc, 0, MPI_COMM_WORLD);
                                else if (rank == proc)
                                        MPI_Recv(ptr[to_host], NBYTES/sizeof(int), MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
                        }
                        gettimeofday(&time[1], NULL);

                        bandwidth = time[1].tv_sec - time[0].tv_sec;
                        bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);
                        bandwidth = (double)NBYTES*NREPEAT/1.e6/bandwidth;
                        if (rank == 0)
                        {
                                printf("MPI_Send/MPI_Receive, %s[%d] -> %s[%d] bandwidth: %4.2f MB/sec\n",
                                       tab[from_host], 0, tab[to_host], proc, bandwidth);
                                fflush(stdout);
                        }
                }
        }
#ifdef PINNED
        CALL( cudaFreeHost(a_h) );
#else
        free(a_h);
#endif
        CALL( cudaFree(a_d) );

        MPI_Finalize();
        return 0;
}


