I am not sure about everything that is going wrong, but there are at least two 
issues I found.
First, you are skipping the first line that you read from integers.txt.  Maybe 
something like this instead.

  while(fgets(line, sizeof line, fp)!= NULL){
    sscanf(line,"%d",&data[k]);
    sum = sum + data[k]; // calculate sum to verify later on
    k++;
}

Secondly, your function run_kernel is returning a pointer to an integer, but 
you are treating it as an integer.
A quick hack fix is:

mysumptr = run_kernel(...)
mysum = *mysumptr;

I would suggest adding lots of printfs or walking through a debugger to find 
out other places there might be problems.

Rolf
________________________________________
From: users-boun...@open-mpi.org [users-boun...@open-mpi.org] On Behalf Of 
Rohan Deshpande [rohan...@gmail.com]
Sent: Tuesday, April 24, 2012 3:35 AM
To: Open MPI Users
Subject: [OMPI users] MPI and CUDA

I am combining mpi and cuda. Trying to find out sum of array elements using 
cuda and using mpi to distribute the array.

my cuda code

#include <stdio.h>

__global__ void add(int *devarray, int *devsum)
{
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        *devsum = *devsum + devarray[index];
}

extern "C"

int * run_kernel(int array[],int nelements)
{
        int  *devarray, *sum, *devsum;
        sum =(int *) malloc(1 * sizeof(int));

        printf("\nrun_kernel called..............");


        cudaMalloc((void**) &devarray, sizeof(int)*nelements);
        cudaMalloc((void**) &devsum, sizeof(int));
        cudaMemcpy(devarray, array, sizeof(int)*nelements, 
cudaMemcpyHostToDevice);

        //cudaMemcpy(devsum, sum, sizeof(int), cudaMemcpyHostToDevice);
        add<<<2, 3>>>(devarray, devsum);
      //  printf("\ndevsum is %d", devsum);

        cudaMemcpy(sum, devsum, sizeof(int), cudaMemcpyDeviceToHost);

        printf(" \nthe sum is %d\n", *sum);
        cudaFree(devarray);

        cudaFree(devsum);
        return sum;

}



#include "mpi.h"

#include <stdio.h>
#include <stdlib.h>

#include <string.h>

#define  ARRAYSIZE      2000

#define  MASTER         0
int  data[ARRAYSIZE];


int main(int argc, char* argv[])
{


int   numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, 
namelen;

int mysum;
long sum;
int update(int myoffset, int chunk, int myid);

char myname[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;
double start = 0.0, stop = 0.0, time = 0.0;

double totaltime;
FILE *fp;
char line[128];

char element;
int n;
int k=0;


/***** Initializations *****/

MPI_Init(&argc, &argv);

MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);

MPI_Get_processor_name(myname, &namelen);
printf ("MPI task %d has started on host %s...\n", taskid, myname);

chunksize = (ARRAYSIZE / numtasks);
tag2 = 1;

tag1 = 2;

/***** Master task only ******/


if (taskid == MASTER){

  fp=fopen("integers.txt", "r");

  if(fp != NULL){
   sum = 0;

   while(fgets(line, sizeof line, fp)!= NULL){

    fscanf(fp,"%d",&data[k]);
    sum = sum + data[k]; // calculate sum to verify later on

    k++;
   }
  }


printf("Initialized array sum %d\n", sum);


  /* Send each task its portion of the array - master keeps 1st part */

  offset = chunksize;
  for (dest=1; dest<numtasks; dest++) {

    MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);

    MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD);

    printf("Sent %d elements to task %d offset= %d\n",chunksize,dest,offset);

    offset = offset + chunksize;
   }




  /* Master does its part of the work */


  offset = 0;
  mysum = run_kernel(&data[offset], chunksize);

  printf("Kernel returns sum %d", mysum);

  //mysum = update(offset, chunksize, taskid);


  /* Wait to receive results from each task */


  for (i=1; i<numtasks; i++) {

    source = i;
    MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);

    MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2,MPI_COMM_WORLD, 
&status);

   }

  /* Get final sum and print sample results */


  MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);

  printf("\n*** Final sum= %d ***\n",sum);

 }  /* end of master section */


/***** Non-master tasks only *****/



if (taskid > MASTER) {


  /* Receive my portion of array from the master task */

  start= MPI_Wtime();
  source = MASTER;

  MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);

  MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2,MPI_COMM_WORLD, 
&status);

  mysum = run_kernel(&data[offset], chunksize);
  printf("\nKernel returns sum %d ", mysum);


// mysum = update(offset, chunksize, taskid);

  stop = MPI_Wtime();
  time = stop -start;

  printf("time taken by process %d to recieve elements and caluclate own sum is 
= %lf seconds \n", taskid, time);

 // totaltime = totaltime + time;




  /* Send my results back to the master task */

  dest = MASTER;
  MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);

  MPI_Send(&data[offset], chunksize, MPI_INT, MASTER, tag2, MPI_COMM_WORLD);

  MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);


  } /* end of non-master */


 MPI_Finalize();
}

here is the output of above code -

MPI task 2 has started on host 4
MPI task 3 has started on host 4
MPI task 0 has started on host 4
MPI task 1 has started on host 4

Initialized array sum 9061
Sent 500 elements to task 1 offset= 500
Sent 500 elements to task 2 offset= 1000
Sent 500 elements to task 3 offset= 1500



run_kernel called..............
the sum is 10

Kernel returns sum 159300360 time taken by process 2 to recieve elements and 
caluclate own sum is = 0.290016 seconds
run_kernel called..............
the sum is 268452367
run_kernel called..............
the sum is 10

Kernel returns sum 145185544 time taken by process 3 to recieve elements and 
caluclate own sum is = 0.293579 seconds
run_kernel called..............
the sum is 1048

Kernel returns sum 156969736 time taken by process 1 to recieve elements and 
caluclate own sum is = 0.297599 seconds
Kernel returns sum 152148496
*** Final sum= 613604136 ***

The final sum and initialized sum is not matching. I am guessing its a pointer 
issue. mysum should be pointer? but then MPI_REDUCE does not execute properly 
and segmentation fault occurs.

Any idea what is going wrong?
Thanks





-----------------------------------------------------------------------------------
This email message is for the sole use of the intended recipient(s) and may 
contain
confidential information.  Any unauthorized review, use, disclosure or 
distribution
is prohibited.  If you are not the intended recipient, please contact the 
sender by
reply email and destroy all copies of the original message.
-----------------------------------------------------------------------------------

Reply via email to