Yep, you are correct. I did the same and it worked. When I have more than 3 MPI tasks there is a lot of overhead on the GPU, but for the CPU there is no overhead. All three machines have 4 quad-core processors with 3.8 GB RAM. Just wondering why there is no degradation of performance on the CPU?
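My guess is that the CPU loops each get a core of their own (there are plenty of cores per node), while every extra rank on a node has to share the single 9500 GT, so their contexts, copies and kernels end up serialized on one device -- does that sound right?

In case it is useful, this is the kind of thing I was planning to try so that each rank explicitly picks a device before calling the kernel. It is only a rough, untested sketch; it assumes Open MPI exports OMPI_COMM_WORLD_LOCAL_RANK (adjust for your version), and select_device_for_rank is just a name I made up:

    /* Rough sketch (untested): choose a CUDA device per rank using Open MPI's
     * local-rank environment variable, falling back to the world rank.
     * With one 9500 GT per node this always picks device 0, so ranks that
     * share a node still share the GPU -- but at least it is explicit. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    void select_device_for_rank(int world_rank)
    {
        int ndev = 0;
        const char *lr = getenv("OMPI_COMM_WORLD_LOCAL_RANK");
        int local_rank = (lr != NULL) ? atoi(lr) : world_rank;

        if (cudaGetDeviceCount(&ndev) != cudaSuccess || ndev == 0) {
            fprintf(stderr, "Task %d: no usable CUDA device\n", world_rank);
            return;
        }
        cudaSetDevice(local_rank % ndev);
    }

(I would call it right after MPI_Comm_rank, before run_kernel0.)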
On Tue, May 8, 2012 at 8:21 PM, Rolf vandeVaart <rvandeva...@nvidia.com> wrote:

> You should be running with one GPU per MPI process. If I understand
> correctly, you have a 3 node cluster and each node has a GPU, so you should
> run with np=3.
>
> Maybe you can try that and see if your numbers come out better.
>
> From: users-boun...@open-mpi.org [mailto:users-boun...@open-mpi.org] On
> Behalf Of Rohan Deshpande
> Sent: Monday, May 07, 2012 9:38 PM
> To: Open MPI Users
> Subject: [OMPI users] GPU and CPU timing - OpenMPI and Thrust
>
> I am running MPI and Thrust code on a cluster and measuring the time for
> the calculations.
>
> My MPI code -
>
> #include "mpi.h"
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <time.h>
> #include <sys/time.h>
> #include <sys/resource.h>
>
> #define MASTER    0
> #define ARRAYSIZE 20000000
>
> int *masterarray, *onearray, *twoarray, *threearray, *fourarray,
>     *fivearray, *sixarray, *sevenarray, *eightarray, *ninearray;
>
> int main(int argc, char* argv[])
> {
>     int numtasks, taskid, chunksize, namelen;
>     int mysum, one, two, three, four, five, six, seven, eight, nine;
>
>     char myname[MPI_MAX_PROCESSOR_NAME];
>     MPI_Status status;
>     int a, b, c, d, e, f, g, h, i, j;
>
>     /***** Initializations *****/
>     MPI_Init(&argc, &argv);
>     MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
>     MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
>     MPI_Get_processor_name(myname, &namelen);
>     printf("MPI task %d has started on host %s...\n", taskid, myname);
>
>     masterarray = malloc(ARRAYSIZE * sizeof(int));
>     onearray    = malloc(ARRAYSIZE * sizeof(int));
>     twoarray    = malloc(ARRAYSIZE * sizeof(int));
>     threearray  = malloc(ARRAYSIZE * sizeof(int));
>     fourarray   = malloc(ARRAYSIZE * sizeof(int));
>     fivearray   = malloc(ARRAYSIZE * sizeof(int));
>     sixarray    = malloc(ARRAYSIZE * sizeof(int));
>     sevenarray  = malloc(ARRAYSIZE * sizeof(int));
>     eightarray  = malloc(ARRAYSIZE * sizeof(int));
>     ninearray   = malloc(ARRAYSIZE * sizeof(int));
>
>     /***** Master task only ******/
>     if (taskid == MASTER){
>         for(a = 0; a < ARRAYSIZE; a++){
>             masterarray[a] = 1;
>         }
>         mysum = run_kernel0(masterarray, ARRAYSIZE, taskid, myname);
>     } /* end of master section */
>
>     if (taskid > MASTER) {
>
>         if(taskid == 1){
>             for(b = 0; b < ARRAYSIZE; b++){
>                 onearray[b] = 1;
>             }
>             one = run_kernel0(onearray, ARRAYSIZE, taskid, myname);
>         }
>         if(taskid == 2){
>             for(c = 0; c < ARRAYSIZE; c++){
>                 twoarray[c] = 1;
>             }
>             two = run_kernel0(twoarray, ARRAYSIZE, taskid, myname);
>         }
>         if(taskid == 3){
>             for(d = 0; d < ARRAYSIZE; d++){
>                 threearray[d] = 1;
>             }
>             three = run_kernel0(threearray, ARRAYSIZE, taskid, myname);
>         }
>         if(taskid == 4){
>             for(e = 0; e < ARRAYSIZE; e++){
>                 fourarray[e] = 1;
>             }
>             four = run_kernel0(fourarray, ARRAYSIZE, taskid, myname);
>         }
>         if(taskid == 5){
>             for(f = 0; f < ARRAYSIZE; f++){
>                 fivearray[f] = 1;
>             }
>             five = run_kernel0(fivearray, ARRAYSIZE, taskid, myname);
>         }
>         if(taskid == 6){
>             for(g = 0; g < ARRAYSIZE; g++){
>                 sixarray[g] = 1;
>             }
>             six = run_kernel0(sixarray, ARRAYSIZE, taskid, myname);
>         }
>         if(taskid == 7){
>             for(h = 0; h < ARRAYSIZE; h++){
>                 sevenarray[h] = 1;
>             }
>             seven = run_kernel0(sevenarray, ARRAYSIZE, taskid, myname);
>         }
>         if(taskid == 8){
>             for(i = 0; i < ARRAYSIZE; i++){
>                 eightarray[i] = 1;
>             }
>             eight = run_kernel0(eightarray, ARRAYSIZE, taskid, myname);
>         }
>         if(taskid == 9){
>             for(j = 0; j < ARRAYSIZE; j++){
>                 ninearray[j] = 1;
>             }
>             nine = run_kernel0(ninearray, ARRAYSIZE, taskid, myname);
>         }
>     }
>     MPI_Finalize();
> }
>
> All the tasks just initialize their own array and then calculate the sum
> using CUDA Thrust.
>
> My CUDA Thrust code -
>
> #include <stdio.h>
> #include <cutil_inline.h>
> #include <cutil.h>
> #include <thrust/version.h>
> #include <thrust/generate.h>
> #include <thrust/host_vector.h>
> #include <thrust/device_vector.h>
> #include <thrust/functional.h>
> #include <thrust/transform_reduce.h>
> #include <time.h>
> #include <sys/time.h>
> #include <sys/resource.h>
>
> extern "C"
> int run_kernel0(int array[], int nelements, int taskid, char hostname[])
> {
>     float elapsedTime;
>     int result = 0;
>     int threshold = 25000000;
>     cudaEvent_t start, stop;
>     cudaEventCreate(&start);
>     cudaEventCreate(&stop);
>     cudaEventRecord(start, 0);
>
>     thrust::device_vector<int> gpuarray;
>     int *begin = array;
>     int *end = array + nelements;
>     while(begin != end)
>     {
>         int chunk_size = thrust::min(threshold, end - begin);
>         gpuarray.assign(begin, begin + chunk_size);
>         result += thrust::reduce(gpuarray.begin(), gpuarray.end());
>         begin += chunk_size;
>     }
>
>     cudaEventRecord(stop, 0);
>     cudaEventSynchronize(stop);
>     cudaEventElapsedTime(&elapsedTime, start, stop);
>     cudaEventDestroy(start);
>     cudaEventDestroy(stop);
>
>     printf(" Task %d has sum (on GPU): %d    Time for the kernel: %f ms\n",
>            taskid, result, elapsedTime);
>
>     return result;
> }
>
> I also calculate the sum using the CPU; that code is below -
>
>     struct timespec time1, time2, temp_time;
>
>     clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
>     int i;
>     int cpu_sum = 0;
>     long diff = 0;
>
>     for (i = 0; i < nelements; i++) {
>         cpu_sum += array[i];
>     }
>     clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
>     temp_time.tv_sec = time2.tv_sec - time1.tv_sec;
>     temp_time.tv_nsec = time2.tv_nsec - time1.tv_nsec;
>     diff = temp_time.tv_sec * 1000000000 + temp_time.tv_nsec;
>     printf("Task %d calculated sum: %d using CPU in %lf ms\n", taskid,
>            cpu_sum, (double) diff / 1000000);
>     return cpu_sum;
>
> Now when I run the job on the cluster with 10 MPI tasks and compare the
> timings of the CPU and the GPU, I get weird results where the GPU time is
> much, much higher than the CPU time.
>
> But shouldn't it be the opposite?
>
> The CPU time is almost the same for all the tasks, but the GPU time keeps
> increasing.
>
> Just wondering what might be the cause of this, or are these results
> correct? Is anything wrong with the MPI code?
>
> My cluster has 3 machines. 4 MPI tasks run on each of 2 machines and
> 2 tasks run on 1 machine.
>
> Each machine has 1 GPU - a GeForce 9500 GT with 512 MB of memory.
>
> Can anyone please help me with this?
>
> Thanks

--
Best Regards,
ROHAN DESHPANDE
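P.S. Looking at run_kernel0 again: the cudaEvent timing wraps the whole chunk loop, so it also measures the device_vector allocation and the host-to-device copy done by gpuarray.assign() (and, on the first call in each process, the CUDA context setup), not just the reduction. Here is a rough, untested sketch of how I might time the copy and the reduce separately; run_kernel0_timed is just a made-up name and the variables mirror the ones in the original code:

    /* Rough sketch (untested): same chunked reduction as run_kernel0, but the
     * host-to-device copy and the reduce are timed separately so the
     * transfer cost shows up on its own. */
    #include <stdio.h>
    #include <cuda_runtime.h>
    #include <thrust/device_vector.h>
    #include <thrust/reduce.h>

    extern "C"
    int run_kernel0_timed(int array[], int nelements, int taskid)
    {
        float copy_ms = 0.0f, reduce_ms = 0.0f, t = 0.0f;
        int result = 0;
        const int threshold = 25000000;
        cudaEvent_t ev_start, ev_copied, ev_reduced;
        cudaEventCreate(&ev_start);
        cudaEventCreate(&ev_copied);
        cudaEventCreate(&ev_reduced);

        thrust::device_vector<int> gpuarray;
        int *begin = array;
        int *end   = array + nelements;
        while (begin != end) {
            long remaining  = end - begin;
            int  chunk_size = (remaining < threshold) ? (int)remaining : threshold;

            cudaEventRecord(ev_start, 0);
            gpuarray.assign(begin, begin + chunk_size);   /* allocation + H2D copy */
            cudaEventRecord(ev_copied, 0);
            result += thrust::reduce(gpuarray.begin(), gpuarray.end());
            cudaEventRecord(ev_reduced, 0);
            cudaEventSynchronize(ev_reduced);

            cudaEventElapsedTime(&t, ev_start, ev_copied);   copy_ms   += t;
            cudaEventElapsedTime(&t, ev_copied, ev_reduced); reduce_ms += t;
            begin += chunk_size;
        }

        printf("Task %d sum (on GPU): %d  copy: %f ms  reduce: %f ms\n",
               taskid, result, copy_ms, reduce_ms);

        cudaEventDestroy(ev_start);
        cudaEventDestroy(ev_copied);
        cudaEventDestroy(ev_reduced);
        return result;
    }

That would at least show whether the extra GPU time with more ranks is going into the transfers or into the reduction itself.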