That is strange; I am not sure why that is happening. I will try to reproduce the problem with your program on my system. In the meantime, perhaps you could rerun with --mca mpi_common_cuda_verbose 100 and send me that output.
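For example, assuming the same two-rank invocation you used below:

    mpirun -np 2 --mca mpi_common_cuda_verbose 100 ./s1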
Thanks

From: users [mailto:users-boun...@open-mpi.org] On Behalf Of Xun Gong
Sent: Sunday, January 11, 2015 11:41 PM
To: us...@open-mpi.org
Subject: [OMPI users] Segmentation fault when using CUDA Aware feature

Hi,

The Open MPI version I am using is 1.8.4. I tried to run a test program to see whether the CUDA-aware feature works, but I got the following errors:

ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpirun -np 2 s1
[ss-Inspiron-5439:32514] *** Process received signal ***
[ss-Inspiron-5439:32514] Signal: Segmentation fault (11)
[ss-Inspiron-5439:32514] Signal code: Address not mapped (1)
[ss-Inspiron-5439:32514] Failing at address: 0x3
[ss-Inspiron-5439:32514] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x36c30)[0x7f74d7048c30]
[ss-Inspiron-5439:32514] [ 1] /lib/x86_64-linux-gnu/libc.so.6(+0x98a70)[0x7f74d70aaa70]
[ss-Inspiron-5439:32514] [ 2] /usr/local/openmpi-1.8.4/lib/libopen-pal.so.6(opal_convertor_pack+0x187)[0x7f74d673f097]
[ss-Inspiron-5439:32514] [ 3] /usr/local/openmpi-1.8.4/lib/openmpi/mca_btl_self.so(mca_btl_self_prepare_src+0xb8)[0x7f74ce196888]
[ss-Inspiron-5439:32514] [ 4] /usr/local/openmpi-1.8.4/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send_request_start_prepare+0x4c)[0x7f74cd2c183c]
[ss-Inspiron-5439:32514] [ 5] /usr/local/openmpi-1.8.4/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send+0x5ba)[0x7f74cd2b78aa]
[ss-Inspiron-5439:32514] [ 6] /usr/local/openmpi-1.8.4/lib/libmpi.so.1(PMPI_Send+0xf2)[0x7f74d79602a2]
[ss-Inspiron-5439:32514] [ 7] s1[0x408b1e]
[ss-Inspiron-5439:32514] [ 8] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf5)[0x7f74d7033ec5]
[ss-Inspiron-5439:32514] [ 9] s1[0x4088e9]
[ss-Inspiron-5439:32514] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 32514 on node ss-Inspiron-5439
exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------

It looks like MPI_Send cannot send a CUDA buffer, but I already configured Open MPI with ./configure --with-cuda.
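As a sanity check on the build, I believe ompi_info can confirm whether CUDA support was actually compiled in; if I understand the FAQ correctly, this should print a line ending in value:true for a --with-cuda build:

    ompi_info --parsable --all | grep mpi_built_with_cuda_support:value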
The commands I used are:

ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ nvcc -c k1.cu
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpic++ -c main.cc
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpic++ -o s1 main.o k1.o -L/usr/local/cuda/lib64/ -lcudart
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpirun -np 2 ./s1

The code I am running:

main.cc:

#include <iostream>
#include <mpi.h>
#include "k1.h"

using namespace std;

#define vect_len 16
const int blocksize = 16;

int main(int argv, char *argc[])
{
    int numprocs, myid;
    MPI_Status status;
    const int vect_size = vect_len * sizeof(int);
    int *vect1 = new int[vect_size];
    int *vect2 = new int[vect_size];
    int *result = new int[vect_size];
    bool flag;
    int *ad;
    int *bd;

    MPI_Init(&argv, &argc);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

    if (myid == 0) {
        for (int i = 0; i < vect_len; i++) {
            vect1[i] = i;
            vect2[i] = 2 * i;
        }
    } else {
        for (int i = 0; i < vect_len; i++) {
            vect1[i] = 2 * i;
            vect2[i] = i;
        }
    }

    initializeGPU(vect1, vect2, ad, bd, vect_size);

    if (myid == 0) {
        for (int i = 0; i < numprocs; i++) {
            MPI_Send(ad, vect_len, MPI_INT, i, 99, MPI_COMM_WORLD);
            MPI_Send(bd, vect_len, MPI_INT, i, 99, MPI_COMM_WORLD);
        }
    } else {
        MPI_Recv(ad, vect_len, MPI_INT, 0, 99, MPI_COMM_WORLD, &status);
        MPI_Recv(bd, vect_len, MPI_INT, 0, 99, MPI_COMM_WORLD, &status);
    }

    computeGPU(blocksize, vect_len, ad, bd, result, vect_size);

    // Verify
    flag = true;
    for (int i = 0; i < vect_len; i++) {
        if (i < 8)
            vect1[i] += vect2[i];
        else
            vect1[i] -= vect2[i];
    }
    for (int i = 0; i < vect_len; i++) {
        if (result[i] != vect1[i]) {
            cout << "the result [" << i << "] by m2s is " << result[i] << endl;
            cout << "the result [" << i << "] of vector is " << vect1[i] << endl;
            cout << "Verification fail at " << i << endl;
            flag = false;
            break;
        }
    }
    if (flag)
        cout << "Verification passes." << endl;

    // free device memory
    /*
    cudaFree(ad);
    cudaFree(bd);
    free(vect1);
    free(vect2);
    free(result);
    */
    MPI_Finalize();
}

k1.h:

void initializeGPU(int *hostptr1, int *hostptr2, int *ad, int *bd,
                   int vect_size);
void computeGPU(int block_size, int vect_len, int *ptr1, int *ptr2,
                int *result, int vect_size);

k1.cu:

#include "k1.h"

__global__ void vect_add(int *a, int *b, int n)
{
    int id = threadIdx.x;
    if (id < n)
        a[id] = a[id] + b[id];
    else
        a[id] = a[id] - b[id];
}

void initializeGPU(int *hostptr1, int *hostptr2, int *ad, int *bd,
                   int vect_size)
{
    // initialize device memory
    cudaMalloc((void**)&ad, vect_size);
    cudaMalloc((void**)&bd, vect_size);
    // copy data to device
    cudaMemcpy(ad, hostptr1, vect_size, cudaMemcpyHostToDevice);
    cudaMemcpy(bd, hostptr2, vect_size, cudaMemcpyHostToDevice);
}

void computeGPU(int block_size, int vect_len, int *ptr1, int *ptr2,
                int *result, int vect_size)
{
    // setup block and grid size
    dim3 dimBlock(block_size, 1, 1);
    dim3 dimGrid(vect_len / block_size, 1, 1);
    vect_add<<<dimGrid, dimBlock>>>(ptr1, ptr2, 8);
    cudaMemcpy(result, ptr1, vect_size, cudaMemcpyDeviceToHost);
}

Many thanks for help,
Xun
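One detail in the code above that would be consistent with the trace, independent of whether the build is CUDA-aware: initializeGPU receives ad and bd by value, so the addresses written by cudaMalloc are lost when the function returns, and main passes an uninitialized pointer to MPI_Send (which matches "Failing at address: 0x3" inside opal_convertor_pack). A minimal sketch of the change, passing the device pointers by reference and leaving everything else unchanged:

    /* k1.h: take the device pointers by reference so the caller
       sees the addresses that cudaMalloc returns */
    void initializeGPU(int *hostptr1, int *hostptr2,
                       int *&ad, int *&bd, int vect_size);

    /* k1.cu */
    void initializeGPU(int *hostptr1, int *hostptr2,
                       int *&ad, int *&bd, int vect_size)
    {
        // these now update the ad/bd declared in main()
        cudaMalloc((void**)&ad, vect_size);
        cudaMalloc((void**)&bd, vect_size);
        cudaMemcpy(ad, hostptr1, vect_size, cudaMemcpyHostToDevice);
        cudaMemcpy(bd, hostptr2, vect_size, cudaMemcpyHostToDevice);
    }

An int** out-parameter would work equally well; the point is only that main must see the allocated device addresses before calling MPI_Send.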