[OMPI users] Segmentation fault in MPI_Init when passing pointers allocated in main()
Hi,

I tried to augment the command line argument list by allocating my own list of strings and passing them to MPI_Init, yet I got a segmentation fault for both OpenMPI 1.6.3 and 1.7.2, while the code works fine with MPICH2. The code is:

#include "mpi.h"
#include "cuda_runtime.h"
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <cmath>

int main(int argc, char **argv)
{
    int device = 0;
    int skip = 0;
    bool skipmode = false;
    bool specified = false;
    for( int i = 0 ; i < argc ; i++ )
    {
        if ( strcmp( argv[i], "-device" ) == 0 )
        {
            i++;
            if ( argv[i][0] == '-' )
            {
                skipmode = true;
                skip = fabs( atoi( argv[i] ) );
            }
            else
            {
                skipmode = false;
                device = atoi( argv[i] );
            }
            specified = true;
        }
    }

    if ( !specified || skipmode )
    {
        char* var;
        int dev_count, local_rank = 0;
        if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank = atoi(var);
        else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
        else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
        cudaGetDeviceCount( &dev_count );
        if ( skipmode )
        {
            device = 0;
            if ( device == skip ) local_rank++;
            while( local_rank-- > 0 )
            {
                device = (device + 1) % dev_count;
                if ( device == skip ) local_rank++;
            }
        }
        else device = local_rank % dev_count;
    }

    // override command line arguments to make sure cudaengine gets the correct one
    char **argv_new = new char*[ argc + 2 ];
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc ]   = new char[ 32 ];
    argv_new[ argc+1 ] = new char[ 32 ];
    strcpy( argv_new[argc], "-device" );
    sprintf( argv_new[argc+1], "%d", device );
    argc += 2;
    argv = argv_new;

    cudaSetDevice( device );

    MPI_Init(&argc,&argv);

    // do something...

    MPI_Finalize();

    cudaDeviceReset();
    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
    delete [] argv;
}

When compiled using *nvcc -ccbin mpic++*, the error I got was:

[jueying:16317] *** Process received signal ***
[jueying:16317] Signal: Segmentation fault (11)
[jueying:16317] Signal code: Address not mapped (1)
[jueying:16317] Failing at address: 0x21
[jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
[jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
[jueying:16317] [ 2] /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39) [0x7f460b993079]
[jueying:16317] [ 3] /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347) [0x7f460c106a57]
[jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b) [0x7f460c12523b]
[jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
[jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5) [0x39e5621a05]
[jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
[jueying:16317] *** End of error message ***

Thanks for the help.

Best regards,
Yu-Hang Tang
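The failing frame in the trace is opal_argv_join, which, as discussed later in the thread, is handed the argument vector without a length and therefore has to walk it until a terminating NULL pointer. The toy walker below is an illustration only, not Open MPI's actual code; it shows how a hand-built argv that lacks the terminator reads past the end of the array:

#include <cstdio>
#include <string>

// Toy stand-in for a join over a NULL-terminated string vector
// (illustration only; not Open MPI's implementation).
static std::string join_until_null( char **vec, char delim )
{
    std::string out;
    for( char **p = vec ; *p != NULL ; ++p )   // relies on vec[n] == NULL
    {
        if( p != vec ) out += delim;
        out += *p;   // dereferences garbage if the terminator is missing
    }
    return out;
}

int main()
{
    // The argv handed to main() is guaranteed to end with a NULL slot:
    char *ok[] = { (char*)"a.out", (char*)"-device", (char*)"0", NULL };
    printf( "%s\n", join_until_null( ok, ' ' ).c_str() );

    // A hand-built vector with no NULL slot would make the loop above
    // read past the end of the array, the same failure mode as the trace.
    return 0;
}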
Re: [OMPI users] Segmentation fault in MPI_Init when passing pointers allocated in main()
I tried the following code without CUDA, the error is still there:

#include "mpi.h"
#include <cstring>
#include <cstdio>

int main(int argc, char **argv)
{
    // override command line arguments to make sure cudaengine gets the correct one
    char **argv_new = new char*[ argc + 2 ];
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc ]   = new char[ 32 ];
    argv_new[ argc+1 ] = new char[ 32 ];
    strcpy( argv_new[argc], "-device" );
    sprintf( argv_new[argc+1], "%d", 0 );

    argc += 2;
    argv = argv_new;

    MPI_Init(&argc,&argv);

    // do something...

    MPI_Finalize();

    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
    delete [] argv;
}

At the end of the program the pointer stored in argv is exactly that of argv_new, so this should not be a problem. Manually inserting printf tells me that the fault occurred at MPI_Init. The code works fine if I use MPI_Init(NULL,NULL) instead. The same code also compiles and runs without a problem on my laptop with mpich2-1.4.

Best,
Yu-Hang


On Tue, Nov 12, 2013 at 11:18 AM, Matthieu Brucher <matthieu.bruc...@gmail.com> wrote:
> Hi,
>
> Are you sure this is the correct code? This seems strange and not a good
> idea:
>
>     MPI_Init(&argc,&argv);
>
>     // do something...
>
>     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>     delete [] argv;
>
> Did you mean argc_new and argv_new instead?
> Do you have the same error without CUDA?
>
> Cheers,
>
> Matthieu
>
>
> 2013/11/12 Tang, Yu-Hang :
> > Hi,
> >
> > I tried to augment the command line argument list by allocating my own
> > list of strings and passing them to MPI_Init, yet I got a segmentation
> > fault for both OpenMPI 1.6.3 and 1.7.2, while the code works fine with
> > MPICH2. The code is:
> >
> > [...]
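The distinction that surfaces in the next message can be checked directly: on a hosted implementation the argv that main() receives always carries one extra NULL slot, while a replacement sized argc + 2 has no room for one. A minimal sketch (the new_argv name and the sizing mirror the reproducer above):

#include <cassert>
#include <cstddef>

int main( int argc, char **argv )
{
    // Guaranteed by the C and C++ standards for the argv main() receives.
    assert( argv[argc] == NULL );

    // A replacement vector sized argc + 2 has no slot for a terminator,
    // so anything that walks it until NULL runs off the end.
    char **new_argv = new char*[ argc + 2 ];
    // ... fill slots 0 .. argc+1 as in the reproducer ...
    delete [] new_argv;
    return 0;
}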
Re: [OMPI users] Segmentation fault in MPI_Init when passing pointers allocated in main()
After appending an additional NULL the code works now. I admit such use of argv/argc could be confusing... thanks for pointing that out. And thank you all for figuring out my problem!

Best,
Yu-Hang


On Tue, Nov 12, 2013 at 12:18 PM, Ralph Castain wrote:
> Kernighan and Ritchie's C programming language manual - it goes all the
> way back to the original C definition.
>
>
> On Nov 12, 2013, at 9:15 AM, Alex A. Granovsky wrote:
>
> > Hello,
> >
> >> It seems that argv[argc] should always be NULL according to the
> >> standard. So OMPI failure is not actually a bug!
> >
> > Could you please point to the exact document where this is explicitly
> > stated? Otherwise, I'd assume this is a bug.
> >
> > Kind regards,
> > Alex Granovsky
> >
> >
> > -----Original Message----- From: Matthieu Brucher
> > Sent: Tuesday, November 12, 2013 8:56 PM
> > To: Open MPI Users
> > Subject: Re: [OMPI users] Segmentation fault in MPI_Init when passing
> > pointers allocated in main()
> >
> > It seems that argv[argc] should always be NULL according to the
> > standard. So OMPI failure is not actually a bug!
> >
> > Cheers,
> >
> > 2013/11/12 Matthieu Brucher :
> >> Interestingly enough, in ompi_mpi_init, opal_argv_join is called
> >> without the array length, so I suppose that in the usual argc/argv
> >> couple, you have an additional value to argv which may be NULL. So try
> >> allocating 3 additional values, the last being NULL, and it may work.
> >>
> >> Cheers,
> >>
> >> Matthieu
> >>
> >> 2013/11/12 Tang, Yu-Hang :
> >>> I tried the following code without CUDA, the error is still there:
> >>>
> >>> [...]
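For completeness, a minimal corrected version of the argument-override block, following the suggestion above to allocate three additional slots with the last one NULL. This is a sketch, not the poster's final code; the device value is hard-wired to 0 as in the non-CUDA reproducer:

#include "mpi.h"
#include <cstring>
#include <cstdio>

int main( int argc, char **argv )
{
    int device = 0;  // assume the device id was chosen as in the original post

    // Allocate argc + 3 slots: argc originals, two new arguments, and a
    // terminating NULL, so that argv[argc] == NULL still holds after the
    // override, matching the guarantee for the argv that main() receives.
    char **argv_new = new char*[ argc + 3 ];
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc ]     = new char[ 32 ];
    argv_new[ argc + 1 ] = new char[ 32 ];
    strcpy( argv_new[ argc ], "-device" );
    sprintf( argv_new[ argc + 1 ], "%d", device );
    argv_new[ argc + 2 ] = NULL;   // the fix: keep the vector NULL-terminated

    argc += 2;
    argv = argv_new;

    MPI_Init( &argc, &argv );

    // do something...

    MPI_Finalize();

    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
    delete [] argv;
    return 0;
}

The only functional change from the failing version is the extra slot holding NULL, which restores the invariant that a length-free walker such as opal_argv_join relies on.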