Hi,

I am trying to execute the following code on a cluster.

run_kernel is a CUDA call with the signature int run_kernel(int array[], int nelements, int taskid, char hostname[]);
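
I have not pasted run_kernel itself. Roughly, its shape is like the sketch below (simplified by me for this mail; the real version uses a proper reduction and also times the kernel, so do not read anything into the details):

/* Not the actual run_kernel -- just a simplified sketch of its shape. */
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void sum_kernel(const int *in, int n, int *out)
{
    int i;
    /* grid-stride loop; atomicAdd keeps the sketch short (not fast) */
    for (i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x)
        atomicAdd(out, in[i]);
}

extern "C" int run_kernel(int array[], int nelements, int taskid, char hostname[])
{
    int *d_in = NULL, *d_sum = NULL, sum = 0;
    (void)hostname;   /* unused in this sketch */

    printf(" MPI Task %d is executing Kernel function........\n", taskid);

    cudaMalloc((void **)&d_in,  nelements * sizeof(int));
    cudaMalloc((void **)&d_sum, sizeof(int));
    cudaMemset(d_sum, 0, sizeof(int));

    /* host-to-device copy: this is where the cuMemcpyHtoD frame in the
       stack trace below comes from */
    cudaMemcpy(d_in, array, nelements * sizeof(int), cudaMemcpyHostToDevice);

    sum_kernel<<<256, 256>>>(d_in, nelements, d_sum);

    cudaMemcpy(&sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost);

    printf(" Task %d has sum (on GPU): %d\n", taskid, sum);

    cudaFree(d_in);
    cudaFree(d_sum);
    return sum;
}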

My MPI code is


#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define  MASTER        0

/* CUDA host function, defined in a separate .cu file */
int run_kernel(int array[], int nelements, int taskid, char hostname[]);


/* one array per MPI rank; each rank only ever touches its own array */
int *masterarray;
int *onearray;
int *twoarray;
int *threearray;
int *fourarray;
int *fivearray;
int *sixarray;
int *sevenarray;
int *eightarray;
int *ninearray;




int main(int argc, char* argv[])
{
int numtasks, taskid, chunksize, namelen;
int mysum;
char myname[MPI_MAX_PROCESSOR_NAME];



/***** Initializations *****/
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Get_processor_name(myname, &namelen);
printf ("MPI task %d has started on host %s...\n", taskid, myname);
chunksize = 20000000;   /* 20,000,000 ints = 80 MB per array (4-byte int) */

masterarray= malloc(chunksize * sizeof(int));
onearray= malloc(chunksize * sizeof(int));
twoarray= malloc(chunksize * sizeof(int));
threearray= malloc(chunksize * sizeof(int));
fourarray= malloc(chunksize * sizeof(int));
fivearray= malloc(chunksize * sizeof(int));
sixarray= malloc(chunksize * sizeof(int));
sevenarray= malloc(chunksize * sizeof(int));
eightarray= malloc(chunksize * sizeof(int));
ninearray= malloc(chunksize * sizeof(int));

int a;

/*initialize all the arrays*/
for(a=0;a<chunksize;a++){
   masterarray[a] = 1;
   onearray[a] = 1;
   twoarray[a] = 1;
   threearray[a] = 1;
   fourarray[a] = 1;
   fivearray[a] = 1;
   sixarray[a] = 1;
   sevenarray[a] = 1;
   eightarray[a] = 1;
   ninearray[a] = 1;

 }

/***** Master task only ******/
if (taskid == MASTER){

    mysum = run_kernel(&masterarray[20000000], chunksize, taskid, myname);

}  /* end of master section */


/***** Worker tasks only ******/
if (taskid > MASTER) {

    if (taskid == 1){

        mysum = run_kernel(&onearray[20000000], chunksize, taskid, myname);
    }

    if (taskid == 2){

        mysum = run_kernel(&twoarray[20000000], chunksize, taskid, myname);
    }

    if (taskid == 3){

        mysum = run_kernel(&threearray[20000000], chunksize, taskid, myname);
    }

    if (taskid == 4){

        mysum = run_kernel(&fourarray[20000000], chunksize, taskid, myname);
    }

    if (taskid == 5){

        mysum = run_kernel(&fivearray[20000000], chunksize, taskid, myname);
    }

    if (taskid == 6){

        mysum = run_kernel(&sixarray[20000000], chunksize, taskid, myname);
    }

    if (taskid == 7){

        mysum = run_kernel(&sevenarray[20000000], chunksize, taskid, myname);
    }

    if (taskid == 8){

        mysum = run_kernel(&eightarray[20000000], chunksize, taskid, myname);
    }

    if (taskid == 9){

        mysum = run_kernel(&ninearray[20000000], chunksize, taskid, myname);
    }

}  /* end of worker section */

MPI_Finalize();

return 0;
}


I am simply trying to calculate the sum of the array elements using a kernel
function. Each task has its own data and calculates its own sum.
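
To be clear about the intent: each rank is doing the equivalent of the following (a simplified sketch with one local buffer per rank; it assumes run_kernel expects a pointer to the first element of the chunk it should sum):

/* simplified per-rank sketch, assuming run_kernel reads chunksize ints
   starting at the pointer it is given */
int *localarray = malloc(chunksize * sizeof(int));
int a;

for (a = 0; a < chunksize; a++)
    localarray[a] = 1;

mysum = run_kernel(localarray, chunksize, taskid, myname); /* same as &localarray[0] */

free(localarray);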

I am getting a segmentation fault on the master task, but all the other tasks
calculate the sum successfully.

Here is the output


MPI task 0 has started on host node4
MPI task 1 has started on host node4
MPI task 2 has started on host node5
MPI task 6 has started on host node6
MPI task 5 has started on host node5
MPI task 9 has started on host node6
MPI task 8 has started on host node6
MPI task 3 has started on host node5
MPI task 4 has started on host node5
MPI task 7 has started on host node6
[node4] *** Process received signal ***
[node4] Signal: Segmentation fault (11)
[node4] Signal code: Address not mapped (1)
[node4] Failing at address: 0xb7866000
[node4] [ 0] [0xbc040c]
[node4] [ 1] /usr/lib/libcuda.so(+0x13a0f6) [0x10640f6]
[node4] [ 2] /usr/lib/libcuda.so(+0x146912) [0x1070912]
[node4] [ 3] /usr/lib/libcuda.so(+0x147231) [0x1071231]
[node4] [ 4] /usr/lib/libcuda.so(+0x13cb64) [0x1066b64]
[node4] [ 5] /usr/lib/libcuda.so(+0x11863c) [0x104263c]
[node4] [ 6] /usr/lib/libcuda.so(+0x11d93b) [0x104793b]
[node4] [ 7] /usr/lib/libcuda.so(cuMemcpyHtoD_v2+0x64) [0x1037264]
[node4] [ 8] /usr/local/cuda/lib/libcudart.so.4(+0x20336) [0x224336]
[node4] [ 9] /usr/local/cuda/lib/libcudart.so.4(cudaMemcpy+0x230) [0x257360]
[node4] [10] mpi_array_distributed(run_kernel+0x9a) [0x804a482]
[node4] [11] mpi_array_distributed(main+0x325) [0x804a139]
[node4] [12] /lib/libc.so.6(__libc_start_main+0xe6) [0x4dece6]
[node4] [13] mpi_array_distributed() [0x8049d81]
[node4] *** End of error message ***

 MPI Task 1 is executing Kernel function........
 Task 1 has sum (on GPU): 100002306 Time for the kernel: 39.462273 ms

 MPI Task 7 is executing Kernel function........
 Task 7 has sum (on GPU): 100002306 Time for the kernel: 64.105377 ms

 MPI Task 9 is executing Kernel function.....
 Task 9 has sum (on GPU): 100002306 Time for the kernel: 45.486912 ms

 MPI Task 8 is executing Kernel function........Size of shared memory: 2048

 MPI Task 4 is executing Kernel function.......
 Task 8 has sum (on GPU): 100002306 Time for the kernel: 70.883362 ms

 MPI Task 2 is executing Kernel function......
 Task 4 has sum (on GPU): 100002306 Time for the kernel: 129.759079 ms
 Task 2 has sum (on GPU): 100002306 Time for the kernel: 139.709473 ms

 MPI Task 6 is executing Kernel function......

 MPI Task 3 is executing Kernel function.......
 Task 6 has sum (on GPU): 100002306 Time for the kernel: 47.691456 ms
 Task 3 has sum (on GPU): 100002306 Time for the kernel: 110.210335 ms

 MPI Task 5 is executing Kernel function......
 Task 5 has sum (on GPU): 100002306 Time for the kernel: 110.706787 ms

 --------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 3054 on node
ecm-c-l-207-004.uniwa.uwa.edu.au exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------

Sadly, I can't install a memory checker such as valgrind on my machine due to
some restrictions, and I could not spot any error by looking at the code.
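
The only workaround I can think of is to probe, on the host, the exact range each run_kernel call will read, so that a bad pointer faults at a known source line instead of deep inside cudaMemcpy. Something like this (probe_range is a hypothetical helper, not in the code above):

/* hypothetical helper, not in the code above: touch the first and last
   int that run_kernel / cudaMemcpy would read from buf */
static void probe_range(const int *buf, int nelements, int taskid)
{
    volatile int x;
    x = buf[0];
    x = buf[nelements - 1];
    (void)x;
    printf("task %d: host range %p .. %p is readable\n",
           taskid, (void *)buf, (void *)(buf + nelements));
}

/* usage, e.g. right before the master call:
   probe_range(&masterarray[20000000], chunksize, taskid);  */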

Can anyone help me? What is wrong in the above code?

Thanks