I am running MPI and Thrust code on a cluster and measuring the time the calculations take.
My MPI code -

```c
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#define MASTER 0
#define ARRAYSIZE 20000000

/* prototype for the CUDA/Thrust reduction defined in the .cu file */
extern int run_kernel0(int array[], int nelements, int taskid, char hostname[]);

int *masterarray, *onearray, *twoarray, *threearray, *fourarray, *fivearray,
    *sixarray, *sevenarray, *eightarray, *ninearray;

int main(int argc, char* argv[])
{
    int numtasks, taskid, chunksize, namelen;
    int mysum, one, two, three, four, five, six, seven, eight, nine;
    char myname[MPI_MAX_PROCESSOR_NAME];
    MPI_Status status;
    int a, b, c, d, e, f, g, h, i, j;

    /***** Initializations *****/
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(myname, &namelen);
    printf("MPI task %d has started on host %s...\n", taskid, myname);

    masterarray = malloc(ARRAYSIZE * sizeof(int));
    onearray    = malloc(ARRAYSIZE * sizeof(int));
    twoarray    = malloc(ARRAYSIZE * sizeof(int));
    threearray  = malloc(ARRAYSIZE * sizeof(int));
    fourarray   = malloc(ARRAYSIZE * sizeof(int));
    fivearray   = malloc(ARRAYSIZE * sizeof(int));
    sixarray    = malloc(ARRAYSIZE * sizeof(int));
    sevenarray  = malloc(ARRAYSIZE * sizeof(int));
    eightarray  = malloc(ARRAYSIZE * sizeof(int));
    ninearray   = malloc(ARRAYSIZE * sizeof(int));

    /***** Master task only ******/
    if (taskid == MASTER) {
        for (a = 0; a < ARRAYSIZE; a++) {
            masterarray[a] = 1;
        }
        mysum = run_kernel0(masterarray, ARRAYSIZE, taskid, myname);
    } /* end of master section */

    if (taskid > MASTER) {
        if (taskid == 1) {
            for (b = 0; b < ARRAYSIZE; b++) {
                onearray[b] = 1;
            }
            one = run_kernel0(onearray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 2) {
            for (c = 0; c < ARRAYSIZE; c++) {
                twoarray[c] = 1;
            }
            two = run_kernel0(twoarray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 3) {
            for (d = 0; d < ARRAYSIZE; d++) {
                threearray[d] = 1;
            }
            three = run_kernel0(threearray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 4) {
            for (e = 0; e < ARRAYSIZE; e++) {
                fourarray[e] = 1;
            }
            four = run_kernel0(fourarray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 5) {
            for (f = 0; f < ARRAYSIZE; f++) {
                fivearray[f] = 1;
            }
            five = run_kernel0(fivearray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 6) {
            for (g = 0; g < ARRAYSIZE; g++) {
                sixarray[g] = 1;
            }
            six = run_kernel0(sixarray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 7) {
            for (h = 0; h < ARRAYSIZE; h++) {
                sevenarray[h] = 1;
            }
            seven = run_kernel0(sevenarray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 8) {
            for (i = 0; i < ARRAYSIZE; i++) {
                eightarray[i] = 1;
            }
            eight = run_kernel0(eightarray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 9) {
            for (j = 0; j < ARRAYSIZE; j++) {
                ninearray[j] = 1;
            }
            nine = run_kernel0(ninearray, ARRAYSIZE, taskid, myname);
        }
    }

    MPI_Finalize();
    return 0;
}
```

All the tasks just initialize their own array and then calculate the sum using CUDA Thrust.
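Since every rank just fills its own array with ones and reduces it on the GPU, the whole program boils down to something like the sketch below. This is only a condensed illustration of the listing above (it is not what I actually run), assuming `run_kernel0` is linked in from the CUDA file as before:

```c
/* Condensed sketch: every rank allocates one array of ones and reduces it
   on the GPU via run_kernel0. Behaviour-wise equivalent to the branches above. */
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>

#define ARRAYSIZE 20000000

extern int run_kernel0(int array[], int nelements, int taskid, char hostname[]);

int main(int argc, char *argv[])
{
    int numtasks, taskid, namelen, i, sum;
    int *array;
    char myname[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(myname, &namelen);
    printf("MPI task %d has started on host %s...\n", taskid, myname);

    array = malloc(ARRAYSIZE * sizeof(int));
    for (i = 0; i < ARRAYSIZE; i++)
        array[i] = 1;

    /* run_kernel0 prints the sum and the GPU timing itself */
    sum = run_kernel0(array, ARRAYSIZE, taskid, myname);

    free(array);
    MPI_Finalize();
    return 0;
}
```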
My CUDA Thrust code -

```cpp
#include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/extrema.h>   /* for thrust::min */
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

extern "C" int run_kernel0(int array[], int nelements, int taskid, char hostname[])
{
    float elapsedTime;
    int result = 0;
    int threshold = 25000000;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    thrust::device_vector<int> gpuarray;
    int *begin = array;
    int *end   = array + nelements;
    while (begin != end) {
        int chunk_size = thrust::min<long>(threshold, end - begin);
        gpuarray.assign(begin, begin + chunk_size);           /* host -> device copy */
        result += thrust::reduce(gpuarray.begin(), gpuarray.end());
        begin += chunk_size;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    printf("Task %d on %s has sum (on GPU): %d Time for the kernel: %f ms\n",
           taskid, hostname, result, elapsedTime);
    return result;
}
```

I also calculate the sum on the CPU; that code is below -

```c
struct timespec time1, time2, temp_time;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);

int i;
int cpu_sum = 0;
long diff = 0;
for (i = 0; i < nelements; i++) {
    cpu_sum += array[i];
}

clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
temp_time.tv_sec  = time2.tv_sec  - time1.tv_sec;
temp_time.tv_nsec = time2.tv_nsec - time1.tv_nsec;
diff = temp_time.tv_sec * 1000000000 + temp_time.tv_nsec;
printf("Task %d calculated sum: %d using CPU in %lf ms\n",
       taskid, cpu_sum, (double) diff / 1000000);
return cpu_sum;
```

Now when I run the job on the cluster with 10 MPI tasks and compare the CPU and GPU timings, I get strange results: the GPU time is much higher than the CPU time, when I would expect the opposite. The CPU time is almost the same for every task, but the GPU time increases from task to task. What might be the cause of this, or are these results correct? Is anything wrong with my MPI code?

My cluster has 3 machines: 4 MPI tasks run on each of two machines and 2 tasks run on the third. Each machine has one GPU, a GeForce 9500 GT with 512 MB of memory.

Can anyone please help me with this? Thanks.
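P.S. For reference, here is a sketch of how the timing inside `run_kernel0` could be split so that the one-time CUDA context/device initialization is excluded from the measured time. `run_kernel0_timed` is a hypothetical variant I have not actually run; `cudaFree(0)` is only there to force context creation before the events are recorded, so the events then measure just the copies and the reductions:

```cpp
// Sketch only: same reduction as run_kernel0, but with CUDA context creation
// forced before the timed region, so the events cover only copy + reduce.
#include <cstdio>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/extrema.h>

extern "C" int run_kernel0_timed(int array[], int nelements, int taskid, char hostname[])
{
    cudaFree(0);                 // force one-time context creation up front (not timed)

    float elapsedTime;
    int result = 0;
    int threshold = 25000000;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    thrust::device_vector<int> gpuarray;
    int *begin = array;
    int *end   = array + nelements;
    while (begin != end) {
        int chunk_size = thrust::min<long>(threshold, end - begin);
        gpuarray.assign(begin, begin + chunk_size);            // host -> device copy
        result += thrust::reduce(gpuarray.begin(), gpuarray.end());
        begin += chunk_size;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    printf("Task %d on %s: sum (GPU) = %d, copy+reduce time = %f ms\n",
           taskid, hostname, result, elapsedTime);
    return result;
}
```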