I am combining MPI and CUDA, trying to compute the sum of an array's
elements: CUDA does the summation and MPI distributes the array across tasks.

My CUDA code:

#include <stdio.h>

__global__ void add(int *devarray, int *devsum)
{
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        *devsum = *devsum + devarray[index];
}

extern "C"
int * run_kernel(int array[],int nelements)
{
        int  *devarray, *sum, *devsum;
        sum =(int *) malloc(1 * sizeof(int));

        printf("\nrun_kernel called..............");

        cudaMalloc((void**) &devarray, sizeof(int)*nelements);
        cudaMalloc((void**) &devsum, sizeof(int));
        cudaMemcpy(devarray, array, sizeof(int)*nelements, cudaMemcpyHostToDevice);
        //cudaMemcpy(devsum, sum, sizeof(int), cudaMemcpyHostToDevice);
        add<<<2, 3>>>(devarray, devsum);
      //  printf("\ndevsum is %d", devsum);             
        cudaMemcpy(sum, devsum, sizeof(int), cudaMemcpyDeviceToHost);

        printf(" \nthe sum is %d\n", *sum);
        cudaFree(devarray);
        cudaFree(devsum);
        return sum;

}
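
For reference: every thread in add above does a plain read-modify-write on the single location *devsum, so concurrent updates can overwrite each other, and the fixed <<<2, 3>>> launch covers only 6 elements no matter what nelements is. A minimal race-free sketch of the same idea, using atomicAdd, a bounds check, and a launch sized to the input (the name add_atomic and the 256-thread block size are my choices, not taken from the code above):

__global__ void add_atomic(int *devarray, int *devsum, int nelements)
{
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < nelements)                      /* ignore padding threads in the last block */
                atomicAdd(devsum, devarray[index]); /* serialize concurrent updates to the accumulator */
}

/* launch with enough threads to cover every element */
int threads = 256;
int blocks  = (nelements + threads - 1) / threads;
add_atomic<<<blocks, threads>>>(devarray, devsum, nelements);

devsum would also need to be zeroed before the launch; the commented-out cudaMemcpy above would do that if *sum were first set to 0.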



#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define  ARRAYSIZE      2000
#define  MASTER         0
int  data[ARRAYSIZE];

int main(int argc, char* argv[])
{

int numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, namelen;
int mysum;
long sum;
int update(int myoffset, int chunk, int myid);
char myname[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;
double start = 0.0, stop = 0.0, time = 0.0;
double totaltime;
FILE *fp;
char line[128];
char element;
int n;
int k=0;

/***** Initializations *****/

MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Get_processor_name(myname, &namelen);
printf ("MPI task %d has started on host %s...\n", taskid, myname);
chunksize = (ARRAYSIZE / numtasks);
tag2 = 1;
tag1 = 2;

/***** Master task only ******/

if (taskid == MASTER){

  fp=fopen("integers.txt", "r");
  if(fp != NULL){
   sum = 0;
   while(fgets(line, sizeof line, fp)!= NULL){
    fscanf(fp,"%d",&data[k]);
    sum = sum + data[k]; // calculate sum to verify later on
    k++;
   }
  }


printf("Initialized array sum %d\n", sum);

  /* Send each task its portion of the array - master keeps 1st part */
  offset = chunksize;
  for (dest=1; dest<numtasks; dest++) {
    MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
    MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD);
    printf("Sent %d elements to task %d offset= %d\n",chunksize,dest,offset);
    offset = offset + chunksize;
   }



  /* Master does its part of the work */

  offset = 0;
  mysum = run_kernel(&data[offset], chunksize);
  printf("Kernel returns sum %d", mysum);
  //mysum = update(offset, chunksize, taskid);

  /* Wait to receive results from each task */

  for (i=1; i<numtasks; i++) {
    source = i;
    MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);
    MPI_Recv(&data[offset], chunksize, MPI_INT, source,
tag2,MPI_COMM_WORLD, &status);
   }

  /* Get final sum and print sample results */

  MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);
  printf("\n*** Final sum= %d ***\n",sum);
 }  /* end of master section */

/***** Non-master tasks only *****/


if (taskid > MASTER) {

  /* Receive my portion of array from the master task */
  start= MPI_Wtime();
  source = MASTER;
  MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);
  MPI_Recv(&data[offset], chunksize, MPI_INT, source,
tag2,MPI_COMM_WORLD, &status);
  mysum = run_kernel(&data[offset], chunksize);
  printf("\nKernel returns sum %d ", mysum);

// mysum = update(offset, chunksize, taskid);
  stop = MPI_Wtime();
  time = stop -start;
  printf("time taken by process %d to recieve elements and caluclate
own sum is = %lf seconds \n", taskid, time);
 // totaltime = totaltime + time;



  /* Send my results back to the master task */
  dest = MASTER;
  MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
  MPI_Send(&data[offset], chunksize, MPI_INT, MASTER, tag2, MPI_COMM_WORLD);
  MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);

  } /* end of non-master */

 MPI_Finalize();
}
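
One more detail in the listing above that a sketch may make clearer: mysum is an int and the reduction uses MPI_INT, but the receive buffer sum is declared long, so the send and receive types do not line up. Two type-consistent pairings (the names total, mylong, and total_l are mine):

/* either reduce int into int ... */
int total = 0;
MPI_Reduce(&mysum, &total, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);

/* ... or widen both sides to long and reduce with MPI_LONG */
long mylong = mysum, total_l = 0;
MPI_Reduce(&mylong, &total_l, 1, MPI_LONG, MPI_SUM, MASTER, MPI_COMM_WORLD);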


Here is the output of the above code:

MPI task 2 has started on host 4
MPI task 3 has started on host 4
MPI task 0 has started on host 4
MPI task 1 has started on host 4

Initialized array sum 9061
Sent 500 elements to task 1 offset= 500
Sent 500 elements to task 2 offset= 1000
Sent 500 elements to task 3 offset= 1500



run_kernel called..............
the sum is 10

Kernel returns sum 159300360 time taken by process 2 to receive elements and calculate own sum is = 0.290016 seconds
run_kernel called..............
the sum is 268452367
run_kernel called..............
the sum is 10

Kernel returns sum 145185544 time taken by process 3 to receive elements and calculate own sum is = 0.293579 seconds
run_kernel called..............
the sum is 1048

Kernel returns sum 156969736 time taken by process 1 to receive elements and calculate own sum is = 0.297599 seconds
Kernel returns sum 152148496
*** Final sum= 613604136 ***

The final sum and the initialized sum do not match. I am guessing it is a
pointer issue. Should mysum be a pointer? But when I make it one,
MPI_Reduce does not execute properly and a segmentation fault occurs.

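If the pointer guess is right, that is likely it: run_kernel returns int * while mysum is a plain int, and the MPI file has no prototype for it, so the call is implicitly declared and mysum ends up holding a (truncated) pointer value rather than the sum. One way to avoid both the bogus values and the MPI_Reduce segfault is to return the sum by value. A minimal sketch, assuming the add_atomic kernel from earlier:

extern "C"
int run_kernel(int array[], int nelements)
{
        int sum = 0;                   /* host accumulator, returned by value */
        int *devarray, *devsum;

        cudaMalloc((void**) &devarray, sizeof(int) * nelements);
        cudaMalloc((void**) &devsum, sizeof(int));
        cudaMemcpy(devarray, array, sizeof(int) * nelements, cudaMemcpyHostToDevice);
        cudaMemcpy(devsum, &sum, sizeof(int), cudaMemcpyHostToDevice); /* zero the device accumulator */

        add_atomic<<<(nelements + 255) / 256, 256>>>(devarray, devsum, nelements);

        cudaMemcpy(&sum, devsum, sizeof(int), cudaMemcpyDeviceToHost);
        cudaFree(devarray);
        cudaFree(devsum);
        return sum;                    /* an int, so "mysum = run_kernel(...)" is well defined */
}

/* and in the MPI file, a prototype so the compiler can check the call: */
extern int run_kernel(int array[], int nelements);
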
Any idea what is going wrong?
Thanks
