I am combining MPI and CUDA: I am trying to find the sum of the elements of an array, using CUDA to do the summation on each node and MPI to distribute the array across processes.
My CUDA code:

#include <stdio.h>

__global__ void add(int *devarray, int *devsum)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    *devsum = *devsum + devarray[index];
}

extern "C" int * run_kernel(int array[], int nelements)
{
    int *devarray, *sum, *devsum;
    sum = (int *) malloc(1 * sizeof(int));
    printf("\nrun_kernel called..............");

    cudaMalloc((void**) &devarray, sizeof(int)*nelements);
    cudaMalloc((void**) &devsum, sizeof(int));
    cudaMemcpy(devarray, array, sizeof(int)*nelements, cudaMemcpyHostToDevice);
    //cudaMemcpy(devsum, sum, sizeof(int), cudaMemcpyHostToDevice);

    add<<<2, 3>>>(devarray, devsum);
    // printf("\ndevsum is %d", devsum);

    cudaMemcpy(sum, devsum, sizeof(int), cudaMemcpyDeviceToHost);
    printf(" \nthe sum is %d\n", *sum);

    cudaFree(devarray);
    cudaFree(devsum);
    return sum;
}

My MPI code:

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ARRAYSIZE 2000
#define MASTER 0

int data[ARRAYSIZE];

int main(int argc, char* argv[])
{
    int numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, namelen;
    int mysum;
    long sum;
    int update(int myoffset, int chunk, int myid);
    char myname[MPI_MAX_PROCESSOR_NAME];
    MPI_Status status;
    double start = 0.0, stop = 0.0, time = 0.0;
    double totaltime;
    FILE *fp;
    char line[128];
    char element;
    int n;
    int k = 0;

    /***** Initializations *****/
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(myname, &namelen);
    printf("MPI task %d has started on host %s...\n", taskid, myname);

    chunksize = (ARRAYSIZE / numtasks);
    tag2 = 1;
    tag1 = 2;

    /***** Master task only ******/
    if (taskid == MASTER) {
        fp = fopen("integers.txt", "r");
        if (fp != NULL) {
            sum = 0;
            while (fgets(line, sizeof line, fp) != NULL) {
                fscanf(fp, "%d", &data[k]);
                sum = sum + data[k];   // calculate sum to verify later on
                k++;
            }
        }
        printf("Initialized array sum %d\n", sum);

        /* Send each task its portion of the array - master keeps 1st part */
        offset = chunksize;
        for (dest = 1; dest < numtasks; dest++) {
            MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
            MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD);
            printf("Sent %d elements to task %d offset= %d\n", chunksize, dest, offset);
            offset = offset + chunksize;
        }

        /* Master does its part of the work */
        offset = 0;
        mysum = run_kernel(&data[offset], chunksize);
        printf("Kernel returns sum %d", mysum);
        //mysum = update(offset, chunksize, taskid);

        /* Wait to receive results from each task */
        for (i = 1; i < numtasks; i++) {
            source = i;
            MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);
            MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2, MPI_COMM_WORLD, &status);
        }

        /* Get final sum and print sample results */
        MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);
        printf("\n*** Final sum= %d ***\n", sum);
    } /* end of master section */

    /***** Non-master tasks only *****/
    if (taskid > MASTER) {
        /* Receive my portion of array from the master task */
        start = MPI_Wtime();
        source = MASTER;
        MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);
        MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2, MPI_COMM_WORLD, &status);

        mysum = run_kernel(&data[offset], chunksize);
        printf("\nKernel returns sum %d ", mysum);
        // mysum = update(offset, chunksize, taskid);

        stop = MPI_Wtime();
        time = stop - start;
        printf("time taken by process %d to recieve elements and caluclate own sum is = %lf seconds \n", taskid, time);
        // totaltime = totaltime + time;

        /* Send my results back to the master task */
        dest = MASTER;
        MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
        MPI_Send(&data[offset], chunksize, MPI_INT, MASTER, tag2, MPI_COMM_WORLD);
        MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);
    } /* end of non-master */

    MPI_Finalize();
}

Here is the output of the above code:

MPI task 2 has started on host 4
MPI task 3 has started on host 4
MPI task 0 has started on host 4
MPI task 1 has started on host 4
Initialized array sum 9061
Sent 500 elements to task 1 offset= 500
Sent 500 elements to task 2 offset= 1000
Sent 500 elements to task 3 offset= 1500

run_kernel called..............
the sum is 10
Kernel returns sum 159300360
time taken by process 2 to recieve elements and caluclate own sum is = 0.290016 seconds

run_kernel called..............
the sum is 268452367

run_kernel called..............
the sum is 10
Kernel returns sum 145185544
time taken by process 3 to recieve elements and caluclate own sum is = 0.293579 seconds

run_kernel called..............
the sum is 1048
Kernel returns sum 156969736
time taken by process 1 to recieve elements and caluclate own sum is = 0.297599 seconds

Kernel returns sum 152148496

*** Final sum= 613604136 ***

The final sum does not match the initialized sum. I am guessing it is a pointer issue. Should mysum be a pointer? When I make it one, MPI_Reduce does not execute properly and a segmentation fault occurs. Any idea what is going wrong?
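In case it helps to show what I mean, here is my guess at what is wrong on the CUDA side, with an untested sketch below. As far as I can tell: (a) every thread does an unsynchronized read-modify-write on *devsum, (b) devsum is never zeroed because the cudaMemcpy that would initialize it is commented out, (c) <<<2, 3>>> launches only 6 threads for a 500-element chunk, and (d) run_kernel returns an int * that I then assign to the int mysum, so the reduction ends up summing pointer values. A version using atomicAdd and returning the sum by value might look like this (the atomicAdd approach and the grid-size calculation are my assumptions, not something I have verified):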
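#include <stdio.h>

// Hypothetical rework: each thread adds its element atomically, with a
// bounds check so the grid may be larger than the array.
__global__ void add(int *devarray, int *devsum, int nelements)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < nelements)
        atomicAdd(devsum, devarray[index]);
}

extern "C" int run_kernel(int array[], int nelements)
{
    int sum = 0;
    int *devarray, *devsum;

    cudaMalloc((void**) &devarray, sizeof(int) * nelements);
    cudaMalloc((void**) &devsum, sizeof(int));
    cudaMemcpy(devarray, array, sizeof(int) * nelements, cudaMemcpyHostToDevice);
    cudaMemcpy(devsum, &sum, sizeof(int), cudaMemcpyHostToDevice);  // zero the accumulator

    int threads = 256;
    int blocks = (nelements + threads - 1) / threads;  // enough threads to cover all elements
    add<<<blocks, threads>>>(devarray, devsum, nelements);

    cudaMemcpy(&sum, devsum, sizeof(int), cudaMemcpyDeviceToHost);
    printf("\nthe sum is %d\n", sum);

    cudaFree(devarray);
    cudaFree(devsum);
    return sum;  // return by value instead of a malloc'd pointer
}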
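Separately, even with the kernel fixed, I think the types in the reduction do not line up: mysum is an int reduced with MPI_INT, but the receive buffer sum is a long, and I print it with %d. My guess is the reduction should target an int, something like this on the master (finalsum is a name I made up for illustration):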
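int finalsum = 0;  /* hypothetical separate int target for the reduction */
MPI_Reduce(&mysum, &finalsum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);
printf("\n*** Final sum= %d (file-read verification sum= %ld) ***\n", finalsum, sum);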
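With run_kernel returning by value, the call sites would stay as mysum = run_kernel(&data[offset], chunksize); with no pointer involved. Does that look like the right direction, or is something else going wrong? Thanks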