Hi,
I am trying to execute following code on cluster.
run_kernel is a cuda call with signature int run_kernel(int array[],int
nelements, int taskid, char hostname[]);
My MPI code is
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MASTER 0
int *masterarray;
int *onearray;
int *twoarray;
int *threearray;
int *fourarray;
int *fivearray;
int *sixarray;
int *sevenarray;
int *eightarray;
int *ninearray;
int main(int argc, char* argv[])
{
int numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source,
chunksize, namelen;
int mysum;
int update(int myoffset, int chunk, int myid);
char myname[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;
int n;
int k=0;
/***** Initializations *****/
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Get_processor_name(myname, &namelen);
printf ("MPI task %d has started on host %s...\n", taskid, myname);
chunksize = 20000000;
tag2 = 1;
tag1 = 2;
masterarray= malloc(chunksize * sizeof(int));
onearray= malloc(chunksize * sizeof(int));
twoarray= malloc(chunksize * sizeof(int));
threearray= malloc(chunksize * sizeof(int));
fourarray= malloc(chunksize * sizeof(int));
fivearray= malloc(chunksize * sizeof(int));
sixarray= malloc(chunksize * sizeof(int));
sevenarray= malloc(chunksize * sizeof(int));
eightarray= malloc(chunksize * sizeof(int));
ninearray= malloc(chunksize * sizeof(int));
int a;
/*initialize all the arrays*/
for(a=0;a<chunksize;a++){
masterarray[a] = 1;
onearray[a] = 1;
twoarray[a] = 1;
threearray[a] = 1;
fourarray[a] = 1;
fivearray[a] = 1;
sixarray[a] = 1;
sevenarray[a] = 1;
eightarray[a] = 1;
ninearray[a] = 1;
}
/***** Master task only ******/
if (taskid == MASTER){
mysum = run_kernel(&masterarray[20000000], chunksize, taskid, myname);
} /* end of master section */
if (taskid > MASTER) {
mysum = run_kernel(&onearray[20000000], chunksize, taskid, myname);
}
if(taskid == 2){
mysum = run_kernel(&twoarray[20000000], chunksize, taskid, myname);
}
if(taskid == 3){
mysum = run_kernel(&threearray[20000000], chunksize, taskid, myname);
}
if(taskid == 4){
mysum = run_kernel(&fourarray[20000000], chunksize, taskid, myname);
}
if(taskid == 5){
mysum = run_kernel(&fivearray[20000000], chunksize, taskid, myname);
}
if(taskid == 6){
mysum = run_kernel(&sixarray[20000000], chunksize, taskid, myname);
}
if(taskid == 7){
mysum = run_kernel(&sevenarray[20000000], chunksize, taskid, myname);
}
if(taskid == 8){
mysum = run_kernel(&eightarray[20000000], chunksize, taskid, myname);
}
if(taskid == 9){
mysum = run_kernel(&ninearray[20000000], chunksize, taskid, myname);
}
}
MPI_Finalize();
}
I am simply trying to calculate sum of array elements using kernel
function. Each task has its own data and it calculates its own sum.
I am getting segmentation fault on master task but all other task calculate
the sum successfully.
Here is the output
MPI task 0 has started on host node4
MPI task 1 has started on host node4
MPI task 2 has started on host node5
MPI task 6 has started on host node6
MPI task 5 has started on node5
MPI task 9 has started on host node6
MPI task 8 has started on host node6
MPI task 3 has started on node5
MPI task 4 has started on hnode5
MPI task 7 has started on node6
[node4] *** Process received signal ***
[node4] Signal: Segmentation fault (11)
[node4] Signal code: Address not mapped (1)
[node4] Failing at address: 0xb7866000
[node4] [ 0] [0xbc040c]
[node4] [ 1] /usr/lib/libcuda.so(+0x13a0f6) [0x10640f6]
[node4] [ 2] /usr/lib/libcuda.so(+0x146912) [0x1070912]
[node4] [ 3] /usr/lib/libcuda.so(+0x147231) [0x1071231]
[node4] [ 4] /usr/lib/libcuda.so(+0x13cb64) [0x1066b64]
[node4] [ 5] /usr/lib/libcuda.so(+0x11863c) [0x104263c]
[node4] [ 6] /usr/lib/libcuda.so(+0x11d93b) [0x104793b]
[node4] [ 7] /usr/lib/libcuda.so(cuMemcpyHtoD_v2+0x64) [0x1037264]
[node4] [ 8] /usr/local/cuda/lib/libcudart.so.4(+0x20336) [0x224336]
[node4] [ 9] /usr/local/cuda/lib/libcudart.so.4(cudaMemcpy+0x230) [0x257360]
[node4] [10] mpi_array_distributed(run_kernel+0x9a) [0x804a482]
[node4] [11] mpi_array_distributed(main+0x325) [0x804a139]
[node4] [12] /lib/libc.so.6(__libc_start_main+0xe6) [0x4dece6]
[node4] [13] mpi_array_distributed() [0x8049d81]
[node4] *** End of error message ***
MPI Task 1 is executing Kernel function........
Task 1 has sum (on GPU): 100002306 Time for the kernel: 39.462273 ms
MPI Task 7 is executing Kernel function........
Task 7 has sum (on GPU): 100002306 Time for the kernel: 64.105377 ms
MPI Task 9 is executing Kernel function.....
Task 9 has sum (on GPU): 100002306 Time for the kernel: 45.486912 ms
MPI Task 8 is executing Kernel function........Size of shared memory: 2048
MPI Task 4 is executing Kernel function.......
Task 8 has sum (on GPU): 100002306 Time for the kernel: 70.883362 ms
MPI Task 2 is executing Kernel function......
Task 4 has sum (on GPU): 100002306 Time for the kernel: 129.759079 ms
Task 2 has sum (on GPU): 100002306 Time for the kernel: 139.709473 ms
MPI Task 6 is executing Kernel function......
MPI Task 3 is executing Kernel function.......
Task 6 has sum (on GPU): 100002306 Time for the kernel: 47.691456 ms
Task 3 has sum (on GPU): 100002306 Time for the kernel: 110.210335 ms
MPI Task 5 is executing Kernel function......
Task 5 has sum (on GPU): 100002306 Time for the kernel: 110.706787 ms
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 3054 on node
ecm-c-l-207-004.uniwa.uwa.edu.au exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
Sadly i cant install memory checker such as valgrind on my machine due to
some restrictions. I could not spot any error by looking in code.
Can anyone help me ?what is wrong in above code.
Thanks
--