Hi, I am trying to execute the following code on a cluster.
run_kernel is a CUDA host call with the signature `int run_kernel(int array[], int nelements, int taskid, char hostname[]);`. My MPI code is:

```c
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MASTER 0

/* CUDA wrapper, implemented in a separate .cu file */
int run_kernel(int array[], int nelements, int taskid, char hostname[]);

int *masterarray;
int *onearray;
int *twoarray;
int *threearray;
int *fourarray;
int *fivearray;
int *sixarray;
int *sevenarray;
int *eightarray;
int *ninearray;

int main(int argc, char *argv[])
{
    int numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, namelen;
    int mysum;
    int update(int myoffset, int chunk, int myid);
    char myname[MPI_MAX_PROCESSOR_NAME];
    MPI_Status status;
    int n;
    int k = 0;

    /***** Initializations *****/
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(myname, &namelen);
    printf("MPI task %d has started on host %s...\n", taskid, myname);

    chunksize = 20000000;
    tag2 = 1;
    tag1 = 2;

    masterarray = malloc(chunksize * sizeof(int));
    onearray    = malloc(chunksize * sizeof(int));
    twoarray    = malloc(chunksize * sizeof(int));
    threearray  = malloc(chunksize * sizeof(int));
    fourarray   = malloc(chunksize * sizeof(int));
    fivearray   = malloc(chunksize * sizeof(int));
    sixarray    = malloc(chunksize * sizeof(int));
    sevenarray  = malloc(chunksize * sizeof(int));
    eightarray  = malloc(chunksize * sizeof(int));
    ninearray   = malloc(chunksize * sizeof(int));

    int a;
    /* initialize all the arrays */
    for (a = 0; a < chunksize; a++) {
        masterarray[a] = 1;
        onearray[a]    = 1;
        twoarray[a]    = 1;
        threearray[a]  = 1;
        fourarray[a]   = 1;
        fivearray[a]   = 1;
        sixarray[a]    = 1;
        sevenarray[a]  = 1;
        eightarray[a]  = 1;
        ninearray[a]   = 1;
    }

    /***** Master task only *****/
    if (taskid == MASTER) {
        mysum = run_kernel(&masterarray[20000000], chunksize, taskid, myname);
    } /* end of master section */

    if (taskid > MASTER) {
        mysum = run_kernel(&onearray[20000000], chunksize, taskid, myname);
    }
    if (taskid == 2) {
        mysum = run_kernel(&twoarray[20000000], chunksize, taskid, myname);
    }
    if (taskid == 3) {
        mysum = run_kernel(&threearray[20000000], chunksize, taskid, myname);
    }
    if (taskid == 4) {
        mysum = run_kernel(&fourarray[20000000], chunksize, taskid, myname);
    }
    if (taskid == 5) {
        mysum = run_kernel(&fivearray[20000000], chunksize, taskid, myname);
    }
    if (taskid == 6) {
        mysum = run_kernel(&sixarray[20000000], chunksize, taskid, myname);
    }
    if (taskid == 7) {
        mysum = run_kernel(&sevenarray[20000000], chunksize, taskid, myname);
    }
    if (taskid == 8) {
        mysum = run_kernel(&eightarray[20000000], chunksize, taskid, myname);
    }
    if (taskid == 9) {
        mysum = run_kernel(&ninearray[20000000], chunksize, taskid, myname);
    }

    MPI_Finalize();
    return 0;
}
```

I am simply trying to calculate the sum of the array elements with the kernel function. Each task has its own data and calculates its own sum. I am getting a segmentation fault on the master task, but all the other tasks calculate their sums successfully.
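For completeness, run_kernel is structured roughly as follows. This is only a simplified sketch rather than the exact file (the timing code is omitted and the reduction kernel shown here is just illustrative), but the host-to-device cudaMemcpy of nelements ints is the same, and that is the call that appears in the backtrace below.

```cuda
/* Simplified sketch of run_kernel (timing code omitted, reduction kernel
 * illustrative only).  The cudaMemcpy that copies nelements ints from the
 * host pointer passed in by main() is the call seen in the backtrace. */
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void sum_kernel(const int *in, int n, int *out)
{
    extern __shared__ int sdata[];
    int tid = threadIdx.x;
    int i   = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < n) ? in[i] : 0;
    __syncthreads();

    /* Tree reduction within the block. */
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }
    if (tid == 0)
        atomicAdd(out, sdata[0]);
}

int run_kernel(int array[], int nelements, int taskid, char hostname[])
{
    printf("MPI Task %d is executing Kernel function........\n", taskid);

    int *d_in = NULL, *d_sum = NULL, sum = 0;
    cudaMalloc((void **)&d_in, nelements * sizeof(int));
    cudaMalloc((void **)&d_sum, sizeof(int));
    cudaMemset(d_sum, 0, sizeof(int));

    /* Copies nelements ints starting at 'array' from host to device. */
    cudaMemcpy(d_in, array, nelements * sizeof(int), cudaMemcpyHostToDevice);

    int threads = 512;
    int blocks  = (nelements + threads - 1) / threads;
    printf("Size of shared memory: %d\n", (int)(threads * sizeof(int)));
    sum_kernel<<<blocks, threads, threads * sizeof(int)>>>(d_in, nelements, d_sum);

    cudaMemcpy(&sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost);
    printf("Task %d has sum (on GPU): %d\n", taskid, sum);

    cudaFree(d_in);
    cudaFree(d_sum);
    return sum;
}
```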
Here is the output:

```
MPI task 0 has started on host node4
MPI task 1 has started on host node4
MPI task 2 has started on host node5
MPI task 6 has started on host node6
MPI task 5 has started on host node5
MPI task 9 has started on host node6
MPI task 8 has started on host node6
MPI task 3 has started on host node5
MPI task 4 has started on host node5
MPI task 7 has started on host node6
[node4] *** Process received signal ***
[node4] Signal: Segmentation fault (11)
[node4] Signal code: Address not mapped (1)
[node4] Failing at address: 0xb7866000
[node4] [ 0] [0xbc040c]
[node4] [ 1] /usr/lib/libcuda.so(+0x13a0f6) [0x10640f6]
[node4] [ 2] /usr/lib/libcuda.so(+0x146912) [0x1070912]
[node4] [ 3] /usr/lib/libcuda.so(+0x147231) [0x1071231]
[node4] [ 4] /usr/lib/libcuda.so(+0x13cb64) [0x1066b64]
[node4] [ 5] /usr/lib/libcuda.so(+0x11863c) [0x104263c]
[node4] [ 6] /usr/lib/libcuda.so(+0x11d93b) [0x104793b]
[node4] [ 7] /usr/lib/libcuda.so(cuMemcpyHtoD_v2+0x64) [0x1037264]
[node4] [ 8] /usr/local/cuda/lib/libcudart.so.4(+0x20336) [0x224336]
[node4] [ 9] /usr/local/cuda/lib/libcudart.so.4(cudaMemcpy+0x230) [0x257360]
[node4] [10] mpi_array_distributed(run_kernel+0x9a) [0x804a482]
[node4] [11] mpi_array_distributed(main+0x325) [0x804a139]
[node4] [12] /lib/libc.so.6(__libc_start_main+0xe6) [0x4dece6]
[node4] [13] mpi_array_distributed() [0x8049d81]
[node4] *** End of error message ***
MPI Task 1 is executing Kernel function........
Task 1 has sum (on GPU): 100002306
Time for the kernel: 39.462273 ms
MPI Task 7 is executing Kernel function........
Task 7 has sum (on GPU): 100002306
Time for the kernel: 64.105377 ms
MPI Task 9 is executing Kernel function.....
Task 9 has sum (on GPU): 100002306
Time for the kernel: 45.486912 ms
MPI Task 8 is executing Kernel function........Size of shared memory: 2048
MPI Task 4 is executing Kernel function.......
Task 8 has sum (on GPU): 100002306
Time for the kernel: 70.883362 ms
MPI Task 2 is executing Kernel function......
Task 4 has sum (on GPU): 100002306
Time for the kernel: 129.759079 ms
Task 2 has sum (on GPU): 100002306
Time for the kernel: 139.709473 ms
MPI Task 6 is executing Kernel function......
MPI Task 3 is executing Kernel function.......
Task 6 has sum (on GPU): 100002306
Time for the kernel: 47.691456 ms
Task 3 has sum (on GPU): 100002306
Time for the kernel: 110.210335 ms
MPI Task 5 is executing Kernel function......
Task 5 has sum (on GPU): 100002306
Time for the kernel: 110.706787 ms
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 3054 on node ecm-c-l-207-004.uniwa.uwa.edu.au
exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
```

Sadly, I can't install a memory checker such as valgrind on my machine due to some restrictions, and I could not spot any error by reading the code. Can anyone help me? What is wrong in the above code? Thanks.
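Since I cannot use a memory checker, the only idea I have so far is to add some manual sanity checks around the run_kernel call on the master task, something like the sketch below (this is not in the program yet, just what I had in mind):

```c
/* Sketch only -- not in the program yet.  Confirm the allocation succeeded
 * and print exactly which pointer and length get handed to run_kernel. */
if (masterarray == NULL) {
    fprintf(stderr, "Task %d: malloc of %d ints failed\n", taskid, chunksize);
    MPI_Abort(MPI_COMM_WORLD, 1);
}
printf("Task %d: array starts at %p and holds indices 0..%d; passing %p\n",
       taskid, (void *)masterarray, chunksize - 1, (void *)&masterarray[20000000]);
mysum = run_kernel(&masterarray[20000000], chunksize, taskid, myname);
```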