Yes, you are correct. I did the same and it worked. When I have more than 3
MPI tasks there is a lot of overhead on the GPU.

But for the CPU there is no overhead. All three machines have four quad-core
processors and 3.8 GB of RAM.

Just wondering why there is no degradation of performance on the CPU?
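To check what each rank actually has to share, I could print the number of CPU
cores versus CUDA devices per node. A quick sketch (program and variable names
are just placeholders, not from my real code):

#include <mpi.h>
#include <unistd.h>
#include <stdio.h>
#include <cuda_runtime.h>

/* Sketch: report how many CPU cores and GPUs each rank sees on its node.
 * If ranks-per-node stays below the core count but above the GPU count,
 * only the GPU is being shared. */
int main(int argc, char *argv[])
{
    int rank, len, ngpus = 0;
    char host[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(host, &len);

    long ncores = sysconf(_SC_NPROCESSORS_ONLN);  /* online CPU cores on this node */
    cudaGetDeviceCount(&ngpus);                   /* CUDA-capable devices on this node */

    printf("rank %d on %s: %ld cores, %d GPU(s)\n", rank, host, ncores, ngpus);

    MPI_Finalize();
    return 0;
}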

On Tue, May 8, 2012 at 8:21 PM, Rolf vandeVaart <rvandeva...@nvidia.com> wrote:

> You should be running with one GPU per MPI process.  If I understand
> correctly, you have a 3-node cluster and each node has a GPU, so you should
> run with np=3.
>
> Maybe you can try that and see if your numbers come out better.
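>
> For example, a launch along these lines (the hostnames are placeholders for
> your three nodes) should put one rank on each node:
>
>   mpirun -np 3 --host node1,node2,node3 ./your_program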
>
> *From:* users-boun...@open-mpi.org [mailto:users-boun...@open-mpi.org] *On Behalf Of* Rohan Deshpande
> *Sent:* Monday, May 07, 2012 9:38 PM
> *To:* Open MPI Users
> *Subject:* [OMPI users] GPU and CPU timing - OpenMPI and Thrust
>
>
> I am running MPI and Thrust code on a cluster and measuring time for
> calculations.
>
> My MPI code:
>
> #include "mpi.h"
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <time.h>
> #include <sys/time.h>
> #include <sys/resource.h>
>
> #define MASTER    0
> #define ARRAYSIZE 20000000
>
> /* Implemented in the CUDA/Thrust file below. */
> extern int run_kernel0(int array[], int nelements, int taskid, char hostname[]);
>
> int *masterarray, *onearray, *twoarray, *threearray, *fourarray, *fivearray,
>     *sixarray, *sevenarray, *eightarray, *ninearray;
>
> int main(int argc, char* argv[])
> {
>   int numtasks, taskid, chunksize, namelen;
>   int mysum, one, two, three, four, five, six, seven, eight, nine;
>
>   char myname[MPI_MAX_PROCESSOR_NAME];
>   MPI_Status status;
>   int a, b, c, d, e, f, g, h, i, j;
>
>   /***** Initializations *****/
>   MPI_Init(&argc, &argv);
>   MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
>   MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
>   MPI_Get_processor_name(myname, &namelen);
>   printf("MPI task %d has started on host %s...\n", taskid, myname);
>
>   masterarray = malloc(ARRAYSIZE * sizeof(int));
>   onearray    = malloc(ARRAYSIZE * sizeof(int));
>   twoarray    = malloc(ARRAYSIZE * sizeof(int));
>   threearray  = malloc(ARRAYSIZE * sizeof(int));
>   fourarray   = malloc(ARRAYSIZE * sizeof(int));
>   fivearray   = malloc(ARRAYSIZE * sizeof(int));
>   sixarray    = malloc(ARRAYSIZE * sizeof(int));
>   sevenarray  = malloc(ARRAYSIZE * sizeof(int));
>   eightarray  = malloc(ARRAYSIZE * sizeof(int));
>   ninearray   = malloc(ARRAYSIZE * sizeof(int));
>
>   /***** Master task only *****/
>   if (taskid == MASTER) {
>     for (a = 0; a < ARRAYSIZE; a++) {
>       masterarray[a] = 1;
>     }
>     mysum = run_kernel0(masterarray, ARRAYSIZE, taskid, myname);
>   }  /* end of master section */
>
>   /* Each non-master rank fills its own array and reduces it on the GPU. */
>   if (taskid > MASTER) {
>     if (taskid == 1) {
>       for (b = 0; b < ARRAYSIZE; b++) {
>         onearray[b] = 1;
>       }
>       one = run_kernel0(onearray, ARRAYSIZE, taskid, myname);
>     }
>     if (taskid == 2) {
>       for (c = 0; c < ARRAYSIZE; c++) {
>         twoarray[c] = 1;
>       }
>       two = run_kernel0(twoarray, ARRAYSIZE, taskid, myname);
>     }
>     if (taskid == 3) {
>       for (d = 0; d < ARRAYSIZE; d++) {
>         threearray[d] = 1;
>       }
>       three = run_kernel0(threearray, ARRAYSIZE, taskid, myname);
>     }
>     if (taskid == 4) {
>       for (e = 0; e < ARRAYSIZE; e++) {
>         fourarray[e] = 1;
>       }
>       four = run_kernel0(fourarray, ARRAYSIZE, taskid, myname);
>     }
>     if (taskid == 5) {
>       for (f = 0; f < ARRAYSIZE; f++) {
>         fivearray[f] = 1;
>       }
>       five = run_kernel0(fivearray, ARRAYSIZE, taskid, myname);
>     }
>     if (taskid == 6) {
>       for (g = 0; g < ARRAYSIZE; g++) {
>         sixarray[g] = 1;
>       }
>       six = run_kernel0(sixarray, ARRAYSIZE, taskid, myname);
>     }
>     if (taskid == 7) {
>       for (h = 0; h < ARRAYSIZE; h++) {
>         sevenarray[h] = 1;
>       }
>       seven = run_kernel0(sevenarray, ARRAYSIZE, taskid, myname);
>     }
>     if (taskid == 8) {
>       for (i = 0; i < ARRAYSIZE; i++) {
>         eightarray[i] = 1;
>       }
>       eight = run_kernel0(eightarray, ARRAYSIZE, taskid, myname);
>     }
>     if (taskid == 9) {
>       for (j = 0; j < ARRAYSIZE; j++) {
>         ninearray[j] = 1;
>       }
>       nine = run_kernel0(ninearray, ARRAYSIZE, taskid, myname);
>     }
>   }
>
>   MPI_Finalize();
>   return 0;
> }
>
> All the tasks just initialize their own array and then calculate the sum
> using CUDA Thrust.
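>
> Since every rank does exactly the same work, the ten global arrays could in
> principle collapse into one buffer per rank. A rough sketch of that idea
> (not the code I am actually running):
>
>   /* Sketch: each rank fills one private buffer and reduces it on the GPU. */
>   int *data = malloc(ARRAYSIZE * sizeof(int));
>   int k;
>   for (k = 0; k < ARRAYSIZE; k++)
>       data[k] = 1;
>   int sum = run_kernel0(data, ARRAYSIZE, taskid, myname);
>   free(data);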
>
> My CUDA Thrust code:
>
> #include <stdio.h>
> #include <cutil_inline.h>
> #include <cutil.h>
> #include <thrust/version.h>
> #include <thrust/generate.h>
> #include <thrust/host_vector.h>
> #include <thrust/device_vector.h>
> #include <thrust/functional.h>
> #include <thrust/transform_reduce.h>
> #include <time.h>
> #include <sys/time.h>
> #include <sys/resource.h>
>
> extern "C"
> int run_kernel0(int array[], int nelements, int taskid, char hostname[])
> {
>   float elapsedTime;
>   int result = 0;
>   int threshold = 25000000;
>   cudaEvent_t start, stop;
>
>   cudaEventCreate(&start);
>   cudaEventCreate(&stop);
>   cudaEventRecord(start, 0);
>
>   thrust::device_vector<int> gpuarray;
>   int *begin = array;
>   int *end   = array + nelements;
>   while (begin != end)
>   {
>     /* Copy at most 'threshold' elements to the device, then reduce them. */
>     int chunk_size = thrust::min(threshold, (int)(end - begin));
>     gpuarray.assign(begin, begin + chunk_size);
>     result += thrust::reduce(gpuarray.begin(), gpuarray.end());
>     begin += chunk_size;
>   }
>
>   cudaEventRecord(stop, 0);
>   cudaEventSynchronize(stop);
>   cudaEventElapsedTime(&elapsedTime, start, stop);
>   cudaEventDestroy(start);
>   cudaEventDestroy(stop);
>
>   printf(" Task %d has sum (on GPU): %d  Time for the kernel: %f ms\n",
>          taskid, result, elapsedTime);
>
>   return result;
> }
>
> I also calculate the sum using the CPU and the code is as below:
>
>   struct timespec time1, time2, temp_time;
>
>   clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
>   int i;
>   int cpu_sum = 0;
>   long diff = 0;
>
>   for (i = 0; i < nelements; i++) {
>     cpu_sum += array[i];
>   }
>
>   clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
>   temp_time.tv_sec  = time2.tv_sec  - time1.tv_sec;
>   temp_time.tv_nsec = time2.tv_nsec - time1.tv_nsec;
>   diff = temp_time.tv_sec * 1000000000 + temp_time.tv_nsec;
>   printf("Task %d calculated sum: %d using CPU in %lf ms\n",
>          taskid, cpu_sum, (double) diff / 1000000);
>
>   return cpu_sum;
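>
> One thing I am not sure about: CLOCK_PROCESS_CPUTIME_ID measures CPU time
> consumed by the process, not wall-clock time, whereas the cudaEvent timing
> is wall-clock. A wall-clock version of the CPU loop would look roughly like
> this (just a sketch, reusing array and nelements from above):
>
>   double t_start = MPI_Wtime();                /* wall-clock seconds */
>   int k, cpu_sum = 0;
>   for (k = 0; k < nelements; k++)
>       cpu_sum += array[k];
>   double t_end = MPI_Wtime();
>   printf("Task %d CPU sum %d in %f ms (wall clock)\n",
>          taskid, cpu_sum, (t_end - t_start) * 1000.0);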
>
> Now when I run the job on the cluster with 10 MPI tasks and compare the
> timings of the CPU and GPU, I get weird results where the GPU time is much,
> much higher than the CPU time.
>
> But shouldn't it be the opposite?
>
> The CPU time is almost the same for every task, but the GPU time increases.
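>
> To see where the GPU time actually goes, the loop in run_kernel0 could be
> timed in two parts: the host-to-device copy done by gpuarray.assign() and
> the reduction itself. A rough sketch, reusing the variables from run_kernel0
> above:
>
>   float copy_ms = 0.0f, reduce_ms = 0.0f, ms;
>   cudaEvent_t t0, t1, t2;
>   cudaEventCreate(&t0); cudaEventCreate(&t1); cudaEventCreate(&t2);
>   while (begin != end) {
>     int chunk_size = thrust::min(threshold, (int)(end - begin));
>     cudaEventRecord(t0, 0);
>     gpuarray.assign(begin, begin + chunk_size);     /* host-to-device copy */
>     cudaEventRecord(t1, 0);
>     result += thrust::reduce(gpuarray.begin(), gpuarray.end());  /* reduction */
>     cudaEventRecord(t2, 0);
>     cudaEventSynchronize(t2);
>     cudaEventElapsedTime(&ms, t0, t1);  copy_ms   += ms;
>     cudaEventElapsedTime(&ms, t1, t2);  reduce_ms += ms;
>     begin += chunk_size;
>   }
>   cudaEventDestroy(t0); cudaEventDestroy(t1); cudaEventDestroy(t2);
>   printf("Task %d: copy %.3f ms, reduce %.3f ms\n", taskid, copy_ms, reduce_ms);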
>
> Just wondering what might be the cause of this, or are these results
> actually correct? Is anything wrong with the MPI code?
>
> My cluster has 3 machines: 4 MPI tasks run on each of 2 machines and 2
> tasks run on the third.
>
> Each machine has 1 GPU - a GeForce 9500 GT with 512 MB of memory.
>
> Can anyone please help me with this?
>
> Thanks
>
> --



-- 

Best Regards,

ROHAN DESHPANDE
