-----Original Message-----
From: users-boun...@open-mpi.org [mailto:users-boun...@open-mpi.org]
On Behalf Of Zbigniew Koza
Sent: Tuesday, July 31, 2012 12:38 PM
To: us...@open-mpi.org
Subject: [OMPI users] bug in CUDA support for dual-processor systems?
Hi,
I wrote a simple program to see whether Open MPI can really handle CUDA device pointers,
as promised in the FAQ, and how efficiently it does so.
The program (see below) breaks when MPI communication is performed
between two devices that are on the same node but under different IOHs in a
dual-processor Intel machine.
Note that cudaMemcpy works for such device pairs, although not as efficiently as
for devices on the same IOH with GPUDirect enabled.
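As a side note, which device pairs can use GPUDirect peer access (i.e. which sit under
the same IOH on this machine) can be queried with cudaDeviceCanAccessPeer(); here is a
minimal sketch, independent of the MPI test below:

#include <stdio.h>
#include <cuda_runtime.h>

/* Print which device pairs report peer-to-peer (GPUDirect) access;
   pairs that sit under different IOHs should report "no". */
int main(void)
{
    int i, j, n, ok;
    cudaGetDeviceCount(&n);
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++) {
            if (i == j) continue;
            ok = 0;
            cudaDeviceCanAccessPeer(&ok, i, j);
            printf("peer access %d -> %d: %s\n", i, j, ok ? "yes" : "no");
        }
    return 0;
}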
Here's the output from my program:
===============================
mpirun -n 6 ./a.out
Init
Init
Init
Init
Init
Init
rank: 1, size: 6
rank: 2, size: 6
rank: 3, size: 6
rank: 4, size: 6
rank: 5, size: 6
rank: 0, size: 6
device 3 is set
Process 3 is on typhoon1
Using regular memory
device 0 is set
Process 0 is on typhoon1
Using regular memory
device 4 is set
Process 4 is on typhoon1
Using regular memory
device 1 is set
Process 1 is on typhoon1
Using regular memory
device 5 is set
Process 5 is on typhoon1
Using regular memory
device 2 is set
Process 2 is on typhoon1
Using regular memory
^C
zkoza@typhoon1:~/multigpu$ vim cudamussings.c
zkoza@typhoon1:~/multigpu$ mpicc cudamussings.c -lcuda -lcudart -L/usr/local/cuda/lib64 -I/usr/local/cuda/include
zkoza@typhoon1:~/multigpu$ mpirun -n 6 ./a.out
Process 1 of 6 is on typhoon1
Process 2 of 6 is on typhoon1
Process 0 of 6 is on typhoon1
Process 4 of 6 is on typhoon1
Process 5 of 6 is on typhoon1
Process 3 of 6 is on typhoon1
device 2 is set
device 1 is set
device 0 is set
Using regular memory
device 5 is set
device 3 is set
device 4 is set
Host->device bandwidth for processor 1: 1587.993499 MB/sec
Host->device bandwidth for processor 2: 1570.275316 MB/sec
Host->device bandwidth for processor 3: 1569.890751 MB/sec
Host->device bandwidth for processor 5: 1483.637702 MB/sec
Host->device bandwidth for processor 0: 1480.888029 MB/sec
Host->device bandwidth for processor 4: 1476.241371 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Host [1] bandwidth: 3338.57 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Host [1] bandwidth: 420.85 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Device[1] bandwidth: 362.13 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Device[1] bandwidth: 6552.35 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Host [2] bandwidth: 3238.88 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Host [2] bandwidth: 418.18 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Device[2] bandwidth: 362.06 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Device[2] bandwidth: 5022.82 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Host [3] bandwidth: 3295.32 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Host [3] bandwidth: 418.90 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Device[3] bandwidth: 359.16 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Device[3] bandwidth: 5019.89 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Host [4] bandwidth: 4619.55 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Host [4] bandwidth: 419.24 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Device[4] bandwidth: 364.52 MB/sec
--------------------------------------------------------------------------
The call to cuIpcOpenMemHandle failed. This is an unrecoverable error and
will cause the program to abort.
cuIpcOpenMemHandle return value: 205
address: 0x200200000
Check the cuda.h file for what the return value means. Perhaps a reboot of
the node will clear the problem.
--------------------------------------------------------------------------
[typhoon1:06098] Failed to register remote memory, rc=-1
[typhoon1:06098] [[33788,1],4] ORTE_ERROR_LOG: Error in file pml_ob1_recvreq.c at line 465
========================================================
Comment:
My machine has two six-core Intel processors with HT on, yielding
24 virtual processors, and 6 Tesla C2070s.
The devices are grouped in two groups, one with 4 devices and the other with 2.
Devices in the same group can talk to each other via GPUDirect at approx. 6 GB/s;
devices in different groups can use cudaMemcpy and UVA at somewhat lower
transfer rates.
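For completeness, a direct copy between devices from different groups can still be
requested with cudaMemcpyPeer(), which falls back to staging through host memory when
peer access is unavailable; a rough sketch (devices 0 and 4 are just an example of a
cross-group pair here, and the buffer size matches NBYTES from my program):

#include <stdio.h>
#include <cuda_runtime.h>

#define N (100*1000*1000)

/* Copy a buffer from device 0 (one group) to device 4 (the other group)
   with cudaMemcpyPeer(); this works even when cudaDeviceCanAccessPeer()
   says no, because the copy is then staged through host memory. */
int main(void)
{
    void *src, *dst;

    cudaSetDevice(0);
    cudaMalloc(&src, N);
    cudaSetDevice(4);
    cudaMalloc(&dst, N);

    cudaMemcpyPeer(dst, 4, src, 0, N);   /* dst on device 4, src on device 0 */
    cudaDeviceSynchronize();
    printf("cudaMemcpyPeer: %s\n", cudaGetErrorString(cudaGetLastError()));

    cudaFree(dst);
    cudaSetDevice(0);
    cudaFree(src);
    return 0;
}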
My Open MPI is openmpi-1.9a1r26904, compiled from sources with:
./configure -prefix=/home/zkoza/openmpi.1.9.cuda
--with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/lib
nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2012 NVIDIA Corporation
Built on Thu_Apr__5_00:24:31_PDT_2012
Cuda compilation tools, release 4.2, V0.2.1221
gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5)
Ubuntu 12.04 64-bit
Nvidia Driver Version: 295.41
The program was compiled with:
mpicc prog.c -lcuda -lcudart -L/usr/local/cuda/lib64 -I/usr/local/cuda/include
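Regarding the cuIpcOpenMemHandle return value: cuGetErrorString() does not exist in this
CUDA release, but the number can be matched against the CUresult enum in cuda.h; if I
read cuda.h correctly, 205 is CUDA_ERROR_MAP_FAILED. A tiny helper along these lines
(only a few of the codes are listed):

#include <stdio.h>
#include <cuda.h>

/* Map a raw CUresult (like the 205 above) to its name from cuda.h;
   only a few codes relevant to cuIpcOpenMemHandle are listed. */
static const char *cu_err_name(CUresult r)
{
    switch (r) {
    case CUDA_ERROR_INVALID_VALUE:   return "CUDA_ERROR_INVALID_VALUE";
    case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT";
    case CUDA_ERROR_MAP_FAILED:      return "CUDA_ERROR_MAP_FAILED";
    case CUDA_ERROR_ALREADY_MAPPED:  return "CUDA_ERROR_ALREADY_MAPPED";
    default:                         return "other (see cuda.h)";
    }
}

int main(void)
{
    printf("205 -> %s\n", cu_err_name((CUresult)205));
    return 0;
}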
================================================
SOURCE CODE:
================================================
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include <mpi.h>
#define NREPEAT 20
#define NBYTES 100000000
#define CALL(x)\
{\
    cudaError_t err = x;\
    if (cudaSuccess != err)\
    {\
        printf("CUDA ERROR %s at %d\n", cudaGetErrorString(err), __LINE__);\
        cudaGetLastError(); /* clear the error state */ \
    }\
}

int main(int argc, char *argv[])
{
    int rank, size, n, len;
    void *a_h, *a_d;
    struct timeval time[2];
    double bandwidth;
    char name[MPI_MAX_PROCESSOR_NAME];
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Get_processor_name(name, &len);

    printf("Process %d of %d is on %s\n", rank, size, name);
    fflush(stdout);

    CALL( cudaSetDevice(rank) );
    printf("device %d is set\n", rank);
    fflush(stdout);

#ifdef PINNED
    if (rank == 0)
        printf("Using pinned memory \n");
    CALL( cudaMallocHost((void **) &a_h, NBYTES) );
#else
    if (rank == 0)
        printf("Using regular memory \n");
    a_h = malloc(NBYTES);
#endif
    CALL( cudaMalloc((void **) &a_d, NBYTES) );

    /* Host->device bandwidth, measured independently by each process. */
    MPI_Barrier(MPI_COMM_WORLD);
    gettimeofday(&time[0], NULL);
    for (n = 0; n < NREPEAT; n++)
    {
        CALL( cudaMemcpy(a_d, a_h, NBYTES, cudaMemcpyHostToDevice) );
    }
    gettimeofday(&time[1], NULL);
    bandwidth  = time[1].tv_sec - time[0].tv_sec;
    bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);
    bandwidth  = (double)NBYTES*NREPEAT/1.e6/bandwidth;
    printf("Host->device bandwidth for processor %d: %f MB/sec\n", rank, bandwidth);

    /* Test MPI send/recv bandwidth: rank 0 sends to every other rank,
       using host and device buffers in all four combinations. */
    MPI_Barrier(MPI_COMM_WORLD);
    int i, proc;
    for (proc = 1; proc < size; proc++)
    {
        for (i = 0; i < 4; i++)
        {
            const int from_host = (i & 1) == 0;  /* even i: send from the host buffer */
            const int to_host   = (i & 2) == 0;  /* i < 2: receive into the host buffer */
            const char *tab[2] = {"Device", "Host "};
            void *ptr[2] = {a_d, a_h};
            MPI_Barrier(MPI_COMM_WORLD);
            gettimeofday(&time[0], NULL);
            for (n = 0; n < NREPEAT; n++)
            {
                if (rank == 0)
                    MPI_Send(ptr[from_host], NBYTES/sizeof(int), MPI_INT,
                             proc, 0, MPI_COMM_WORLD);
                else if (rank == proc)
                    MPI_Recv(ptr[to_host], NBYTES/sizeof(int), MPI_INT,
                             0, 0, MPI_COMM_WORLD, &status);
            }
            gettimeofday(&time[1], NULL);
            bandwidth  = time[1].tv_sec - time[0].tv_sec;
            bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);
            bandwidth  = NBYTES*NREPEAT/1.e6/bandwidth;
            if (rank == 0)
            {
                printf("MPI_Send/MPI_Receive, %s[%d] -> %s[%d] bandwidth: %4.2f MB/sec\n",
                       tab[from_host], 0, tab[to_host], proc, bandwidth);
                fflush(stdout);
            }
        }
    }

#ifdef PINNED
    CALL( cudaFreeHost(a_h) );
#else
    free(a_h);
#endif
    CALL( cudaFree(a_d) );
    MPI_Finalize();
    return 0;
}
_______________________________________________
users mailing list
us...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/users