Thanks for the reply. When I modify the code it still fails with a segmentation fault.
My latest code looks like this:

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#define MASTER    0
#define ARRAYSIZE 40000000

int *masterarray, *onearray, *twoarray, *threearray, *fourarray,
    *fivearray, *sixarray, *sevenarray, *eightarray, *ninearray;

int main(int argc, char *argv[])
{
    int numtasks, taskid, chunksize, namelen;
    int mysum, one, two, three, four, five, six, seven, eight, nine;
    char myname[MPI_MAX_PROCESSOR_NAME];
    MPI_Status status;
    int a, b, c, d, e, f, g, h, i, j;

    /***** Initializations *****/
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(myname, &namelen);
    printf("MPI task %d has started on host %s...\n", taskid, myname);

    masterarray = malloc(ARRAYSIZE * sizeof(int));
    onearray    = malloc(ARRAYSIZE * sizeof(int));
    twoarray    = malloc(ARRAYSIZE * sizeof(int));
    threearray  = malloc(ARRAYSIZE * sizeof(int));
    fourarray   = malloc(ARRAYSIZE * sizeof(int));
    fivearray   = malloc(ARRAYSIZE * sizeof(int));
    sixarray    = malloc(ARRAYSIZE * sizeof(int));
    sevenarray  = malloc(ARRAYSIZE * sizeof(int));
    eightarray  = malloc(ARRAYSIZE * sizeof(int));
    ninearray   = malloc(ARRAYSIZE * sizeof(int));

    /***** Master task only ******/
    if (taskid == MASTER) {
        for (a = 0; a < ARRAYSIZE; a++) masterarray[a] = 1;
        mysum = run_kernel0(masterarray, ARRAYSIZE, taskid, myname);
    } /* end of master section */

    if (taskid > MASTER) {
        if (taskid == 1) {
            for (b = 0; b < ARRAYSIZE; b++) onearray[b] = 1;
            one = run_kernel0(onearray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 2) {
            for (c = 0; c < ARRAYSIZE; c++) twoarray[c] = 1;
            two = run_kernel0(twoarray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 3) {
            for (d = 0; d < ARRAYSIZE; d++) threearray[d] = 1;
            three = run_kernel0(threearray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 4) {
            for (e = 0; e < ARRAYSIZE; e++) fourarray[e] = 1;
            four = run_kernel0(fourarray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 5) {
            for (f = 0; f < ARRAYSIZE; f++) fivearray[f] = 1;
            five = run_kernel0(fivearray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 6) {
            for (g = 0; g < ARRAYSIZE; g++) sixarray[g] = 1;
            six = run_kernel0(sixarray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 7) {
            for (h = 0; h < ARRAYSIZE; h++) sevenarray[h] = 1;
            seven = run_kernel0(sevenarray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 8) {
            for (i = 0; i < ARRAYSIZE; i++) eightarray[i] = 1;
            eight = run_kernel0(eightarray, ARRAYSIZE, taskid, myname);
        }
        if (taskid == 9) {
            for (j = 0; j < ARRAYSIZE; j++) ninearray[j] = 1;
            nine = run_kernel0(ninearray, ARRAYSIZE, taskid, myname);
        }
    }

    MPI_Finalize();
}
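As an aside, I have also been sketching a slimmed-down version of the host side where every rank allocates only the one buffer it actually touches and checks the malloc result before calling the kernel. This is just a sketch of the idea, not the code that produced the trace below; the localarray name and the run_kernel0 prototype line are mine for illustration:

/* Sketch only: one buffer per rank, with the allocation checked. */
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>

#define ARRAYSIZE 40000000

extern int run_kernel0(int array[], int nelements, int taskid, char hostname[]);

int main(int argc, char *argv[])
{
    int numtasks, taskid, namelen, i, mysum;
    char myname[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(myname, &namelen);

    /* Each rank owns exactly one array instead of ten globals. */
    int *localarray = malloc(ARRAYSIZE * sizeof(int));
    if (localarray == NULL) {
        fprintf(stderr, "Task %d: malloc of %d ints failed\n", taskid, ARRAYSIZE);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    for (i = 0; i < ARRAYSIZE; i++)
        localarray[i] = 1;

    mysum = run_kernel0(localarray, ARRAYSIZE, taskid, myname);
    printf("Task %d of %d computed sum %d\n", taskid, numtasks, mysum);

    free(localarray);
    MPI_Finalize();
    return 0;
}

That way each rank would hold a single ARRAYSIZE buffer instead of ten of them.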
and my kernel code is:

#include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#define BLOCK_NUM  8
#define THREAD_NUM 256

__global__ static void sumOfSquares(int *num, int *result, int DATA_SIZE)
{
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    shared[tid] = 0;
    for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        shared[tid] += num[i];
    }
    __syncthreads();

    int offset = THREAD_NUM / 2;
    while (offset > 0) {
        if (tid < offset) {
            shared[tid] += shared[tid + offset];
        }
        offset >>= 1;
        __syncthreads();
    }

    if (tid == 0) {
        result[bid] = shared[0];
    }
}

extern "C" int run_kernel0(int array[], int nelements, int taskid, char hostname[])
{
    int *gpudata, i;
    int *result;
    clock_t *time;
    cudaEvent_t start, stop;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cudaMalloc((void **) &gpudata, sizeof(int) * nelements);
    cudaMalloc((void **) &result, sizeof(int) * THREAD_NUM * BLOCK_NUM);
    cudaMemcpy(gpudata, array, sizeof(int) * nelements, cudaMemcpyHostToDevice);

    printf("\n MPI Task %d is executing Kernel function........", taskid);

    int sum[BLOCK_NUM];
    sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, nelements);
    cudaMemcpy(&sum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);

    /* calculate sum of each block */
    int final_sum = 0;
    for (i = 0; i < BLOCK_NUM; i++) {
        final_sum += sum[i];
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaFree(gpudata);
    cudaFree(result);

    printf(" Task %d has sum (on GPU): %ld Time for the kernel: %f ms \n",
           taskid, final_sum, elapsedTime);
    return final_sum;
}

Error trace:

MPI task 0 has started on host
MPI task 1 has started on host
MPI task 2 has started on host
MPI task 3 has started on host
MPI task 4 has started on host
MPI task 6 has started on host
MPI task 7 has started on host
MPI task 8 has started on host
MPI task 9 has started on host
MPI task 5 has started on host
 MPI Task 1 is executing Kernel function........ Task 1 has sum (on GPU): 40000000 Time for the kernel: 120.534050 ms
 MPI Task 0 is executing Kernel function........ Task 0 has sum (on GPU): 40000000 Time for the kernel: 137.301315 ms
 MPI Task 4 is executing Kernel function........ Task 4 has sum (on GPU): 348456223 Time for the kernel: 0.000000 ms
 MPI Task 7 is executing Kernel function........ Task 7 has sum (on GPU): 353682719 Time for the kernel: 0.000000 ms
 MPI Task 3 is executing Kernel function........ Task 3 has sum (on GPU): 40000000 Time for the kernel: 4172.341309 ms
 MPI Task 2 is executing Kernel function........ Task 2 has sum (on GPU): 40000000 Time for the kernel: 4204.969727 ms
*** Process received signal ***
Signal: Segmentation fault (11)
Signal code: Address not mapped (1)
Failing at address: (nil)
[ 0] [0xd1340c]
[ 1] /usr/lib/libcuda.so(+0x163e12) [0x1092e12]
[ 2] /usr/lib/libcuda.so(+0x115749) [0x1044749]
[ 3] /usr/lib/libcuda.so(cuEventRecord+0x5c) [0x103578c]
[ 4] /usr/local/cuda/lib/libcudart.so.4(+0x2480f) [0x7fd80f]
[ 5] /usr/local/cuda/lib/libcudart.so.4(cudaEventRecord+0x22f) [0x838b8f]
[ 6] mpi_array_distributed(run_kernel0+0x32) [0x804a2b2]
[ 7] mpi_array_distributed(main+0x3ee) [0x804a0a2]
[ 8] /lib/libc.so.6(__libc_start_main+0xe6) [0x2fece6]
[ 9] mpi_array_distributed() [0x8049c21]
*** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 5 with PID 6559 on node exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------

I am not sure why it is failing. Each task initializes its own data and then calls the kernel. Any help would be appreciated.
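One thing I am considering before re-running is to put explicit return-code checks around the CUDA calls, so that a failing cudaMalloc/cudaMemcpy or kernel launch prints a message instead of silently producing a garbage sum like tasks 4 and 7 do above. This is only a sketch of what I have in mind, not the code I am currently running; CHECK_CUDA and run_kernel0_checked are names I made up, and the sketch assumes it sits in the same .cu file as sumOfSquares, BLOCK_NUM and THREAD_NUM above:

/* Sketch only: run_kernel0 with every CUDA call checked. */
#define CHECK_CUDA(call)                                                  \
    do {                                                                  \
        cudaError_t err = (call);                                         \
        if (err != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",                  \
                    cudaGetErrorString(err), __FILE__, __LINE__);         \
            return -1;                                                    \
        }                                                                 \
    } while (0)

extern "C" int run_kernel0_checked(int array[], int nelements, int taskid, char hostname[])
{
    int *gpudata = NULL, *result = NULL;
    int sum[BLOCK_NUM];

    CHECK_CUDA(cudaMalloc((void **) &gpudata, sizeof(int) * nelements));
    CHECK_CUDA(cudaMalloc((void **) &result, sizeof(int) * BLOCK_NUM));
    CHECK_CUDA(cudaMemcpy(gpudata, array, sizeof(int) * nelements,
                          cudaMemcpyHostToDevice));

    sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, nelements);
    CHECK_CUDA(cudaGetLastError());      /* catch launch failures */
    CHECK_CUDA(cudaDeviceSynchronize()); /* catch errors raised by the kernel itself */

    CHECK_CUDA(cudaMemcpy(sum, result, sizeof(int) * BLOCK_NUM,
                          cudaMemcpyDeviceToHost));

    int final_sum = 0;
    for (int i = 0; i < BLOCK_NUM; i++)
        final_sum += sum[i];

    cudaFree(gpudata);
    cudaFree(result);
    printf("Task %d on %s has sum (on GPU): %d\n", taskid, hostname, final_sum);
    return final_sum;
}

If one of the allocations or copies is failing on the busier nodes, I would expect this to report it right away instead of returning a wrong sum.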
Thanks

On Wed, May 2, 2012 at 6:00 PM, Eduardo Morras <nec...@retena.com> wrote:

> At 08:51 02/05/2012, you wrote:
>
>> Hi,
>>
>> I am trying to execute the following code on a cluster.
>>
>> run_kernel is a CUDA call with the signature
>> int run_kernel(int array[], int nelements, int taskid, char hostname[]);
>
> ... deleted code
>
>> mysum = run_kernel(&onearray[20000000], chunksize, taskid, myname);
>
> ... more deleted code
>
>> I am simply trying to calculate the sum of the array elements using the
>> kernel function. Each task has its own data and calculates its own sum.
>>
>> I am getting a segmentation fault on the master task, but all the other
>> tasks calculate the sum successfully.
>>
>> Here is the output:
>>
>> MPI task 0 has started on host node4
>> MPI task 1 has started on host node4
>> MPI task 2 has started on host node5
>> MPI task 6 has started on host node6
>> MPI task 5 has started on host node5
>> MPI task 9 has started on host node6
>> MPI task 8 has started on host node6
>> MPI task 3 has started on host node5
>> MPI task 4 has started on host node5
>> MPI task 7 has started on host node6
>> [node4] *** Process received signal ***
>> [node4] Signal: Segmentation fault (11)
>> [node4] Signal code: Address not mapped (1)
>> [node4] Failing at address: 0xb7866000
>> [node4] [ 0] [0xbc040c]
>> [node4] [ 1] /usr/lib/libcuda.so(+0x13a0f6) [0x10640f6]
>> [node4] [ 2] /usr/lib/libcuda.so(+0x146912) [0x1070912]
>> [node4] [ 3] /usr/lib/libcuda.so(+0x147231) [0x1071231]
>> [node4] [ 4] /usr/lib/libcuda.so(+0x13cb64) [0x1066b64]
>> [node4] [ 5] /usr/lib/libcuda.so(+0x11863c) [0x104263c]
>> [node4] [ 6] /usr/lib/libcuda.so(+0x11d93b) [0x104793b]
>> [node4] [ 7] /usr/lib/libcuda.so(cuMemcpyHtoD_v2+0x64) [0x1037264]
>> [node4] [ 8] /usr/local/cuda/lib/libcudart.so.4(+0x20336) [0x224336]
>> [node4] [ 9] /usr/local/cuda/lib/libcudart.so.4(cudaMemcpy+0x230) [0x257360]
>> [node4] [10] mpi_array_distributed(run_kernel+0x9a) [0x804a482]
>> [node4] [11] mpi_array_distributed(main+0x325) [0x804a139]
>> [node4] [12] /lib/libc.so.6(__libc_start_main+0xe6) [0x4dece6]
>> [node4] [13] mpi_array_distributed() [0x8049d81]
>> [node4] *** End of error message ***
>
> It fails doing the cuMemcpyHtoD inside the CUDA code. Perhaps one of these
> changes can fix your problem:
>
> a) mysum = run_kernel(onearray, chunksize, taskid, myname);
>
> b) mysum = run_kernel(&onearray[20000000-1], chunksize, taskid, myname);
>
>> --------------------------------------------------------------------------
>> mpirun noticed that process rank 0 with PID 3054 on node
>> ecm-c-l-207-004.uniwa.uwa.edu.au exited on signal 11 (Segmentation fault).
>> --------------------------------------------------------------------------
>>
>> Sadly I can't install a memory checker such as valgrind on my machine due
>> to some restrictions. I could not spot any error by looking at the code.
>>
>> Can anyone help me? What is wrong in the above code?
>>
>> Thanks

--
Best Regards,
ROHAN DESHPANDE