Hi, I tried to augment the command line argument list by allocating my own list of strings and passing them to MPI_Init, yet I got a segmentation fault for both OpenMPI 1.6.3 and 1.7.2, while the code works fine with MPICH2. The code is:
#include "mpi.h" #include "cuda_runtime.h" #include <cstdlib> #include <cstring> #include <cmath> int main(int argc, char **argv) { int device = 0; int skip = 0; bool skipmode = false; bool specified = false; for( int i = 0 ; i < argc ; i++ ) { if ( strcmp( argv[i], "-device" ) == 0 ) { i++; if ( argv[i][0] == '-' ) { skipmode = true; skip = fabs( atoi( argv[i] ) ); } else { skipmode = false; device = atoi( argv[i] ); } specified = true; } } if ( !specified || skipmode ) { char* var; int dev_count, local_rank = 0; if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank = atoi(var); else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var); else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var); cudaGetDeviceCount( &dev_count ); if ( skipmode ) { device = 0; if ( device == skip ) local_rank++; while( local_rank-- > 0 ) { device = (++device) % dev_count; if ( device == skip ) local_rank++; } } else device = local_rank % dev_count; } // override command line arguments to make sure cudaengine get the correct one char **argv_new = new char*[ argc + 2 ]; for( int i = 0 ; i < argc ; i++ ) { argv_new[i] = new char[ strlen( argv[i] ) + 1 ]; strcpy( argv_new[i], argv[i] ); } argv_new[ argc ] = new char[ 32 ]; argv_new[ argc+1 ] = new char[ 32 ]; strcpy( argv_new[argc], "-device" ); sprintf( argv_new[argc+1], "%d", device ); argc += 2; argv = argv_new; cudaSetDevice( device ); MPI_Init(&argc,&argv); // do something... MPI_Finalize(); cudaDeviceReset(); for( int i = 0 ; i < argc ; i++ ) delete [] argv[i]; delete [] argv; } When compiled using *nvcc -ccbin mpic++*, The error I got was: [jueying:16317] *** Process received signal *** [jueying:16317] Signal: Segmentation fault (11) [jueying:16317] Signal code: Address not mapped (1) [jueying:16317] Failing at address: 0x21 [jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000] [jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551] [jueying:16317] [ 2] /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39) [0x7f460b993079] [jueying:16317] [ 3] /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347) [0x7f460c106a57] [jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b) [0x7f460c12523b] [jueying:16317] [ 5] ./lmp_jueying() [0x40c035] [jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5) [0x39e5621a05] [jueying:16317] [ 7] ./lmp_jueying() [0x40dd21] [jueying:16317] *** End of error message *** Thanks for the help. Best regards, Yu-Hang Tang