After appending an additional NULL, the code works now. I admit such use of argv/argc could be confusing... thanks for pointing that out. And thank you all for figuring out my problem!
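For the archives, here is roughly what the working version of the reduced test now looks like: the same code as quoted below, but with one extra slot reserved in the array and a terminating NULL, as Matthieu suggested (the device number is hard-coded to 0 here, as in the reduced test):

#include "mpi.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>

int main( int argc, char **argv )
{
    // reserve room for the original arguments, the two extra ones, and a
    // terminating NULL; ompi_mpi_init passes argv to opal_argv_join without
    // a length, so it apparently relies on argv[argc] being NULL
    char **argv_new = new char*[ argc + 3 ];
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc ]     = new char[ 32 ];
    argv_new[ argc + 1 ] = new char[ 32 ];
    strcpy( argv_new[ argc ], "-device" );
    sprintf( argv_new[ argc + 1 ], "%d", 0 );
    argv_new[ argc + 2 ] = NULL;   // the additional NULL that fixes the crash
    argc += 2;                     // argc still counts only the real arguments
    argv  = argv_new;

    MPI_Init( &argc, &argv );

    // do something...

    MPI_Finalize();

    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
    delete [] argv;
}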
Best,
Yu-Hang

On Tue, Nov 12, 2013 at 12:18 PM, Ralph Castain <r...@open-mpi.org> wrote:
> Kernighan and Ritchie's C programming language manual - it goes all the way
> back to the original C definition.
>
>
> On Nov 12, 2013, at 9:15 AM, Alex A. Granovsky <g...@classic.chem.msu.su> wrote:
>
> > Hello,
> >
> >> It seems that argv[argc] should always be NULL according to the
> >> standard. So OMPI failure is not actually a bug!
> >
> > Could you please point to the exact document where this is explicitly stated?
> > Otherwise, I'd assume this is a bug.
> >
> > Kind regards,
> > Alex Granovsky
> >
> >
> > -----Original Message-----
> > From: Matthieu Brucher
> > Sent: Tuesday, November 12, 2013 8:56 PM
> > To: Open MPI Users
> > Subject: Re: [OMPI users] Segmentation fault in MPI_Init when passing pointers allocated in main()
> >
> > It seems that argv[argc] should always be NULL according to the
> > standard. So OMPI failure is not actually a bug!
> >
> > Cheers,
> >
> > 2013/11/12 Matthieu Brucher <matthieu.bruc...@gmail.com>:
> >> Interestingly enough, in ompi_mpi_init, opal_argv_join is called
> >> without the array length, so I suppose that in the usual argc/argv
> >> couple, you have an additional value to argv which may be NULL. So try
> >> allocating 3 additional values, the last being NULL, and it may work.
> >>
> >> Cheers,
> >>
> >> Matthieu
> >>
> >> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
> >>> I tried the following code without CUDA, the error is still there:
> >>>
> >>> #include "mpi.h"
> >>>
> >>> #include <cstdlib>
> >>> #include <cstring>
> >>> #include <cmath>
> >>>
> >>> int main(int argc, char **argv)
> >>> {
> >>>     // override command line arguments to make sure cudaengine get the correct one
> >>>     char **argv_new = new char*[ argc + 2 ];
> >>>     for( int i = 0 ; i < argc ; i++ )
> >>>     {
> >>>         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
> >>>         strcpy( argv_new[i], argv[i] );
> >>>     }
> >>>     argv_new[ argc ] = new char[ 32 ];
> >>>     argv_new[ argc+1 ] = new char[ 32 ];
> >>>     strcpy( argv_new[argc], "-device" );
> >>>     sprintf( argv_new[argc+1], "%d", 0 );
> >>>
> >>>     argc += 2;
> >>>     argv = argv_new;
> >>>
> >>>     MPI_Init(&argc,&argv);
> >>>
> >>>     // do something...
> >>>
> >>>     MPI_Finalize();
> >>>
> >>>     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
> >>>     delete [] argv;
> >>> }
> >>>
> >>> At the end of the program the pointer stored in argv is exactly that of
> >>> argv_new, so this should not be a problem. Manually inserting printf tells me
> >>> that the fault occurred at MPI_Init. The code works fine if I use
> >>> MPI_Init(NULL,NULL) instead. The same code also compiles and runs without a
> >>> problem on my laptop with mpich2-1.4.
> >>>
> >>> Best,
> >>> Yu-Hang
> >>>
> >>>
> >>> On Tue, Nov 12, 2013 at 11:18 AM, Matthieu Brucher
> >>> <matthieu.bruc...@gmail.com> wrote:
> >>>>
> >>>> Hi,
> >>>>
> >>>> Are you sure this is the correct code? This seems strange and not a good idea:
> >>>>
> >>>> MPI_Init(&argc,&argv);
> >>>>
> >>>> // do something...
> >>>>
> >>>> for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
> >>>> delete [] argv;
> >>>>
> >>>> Did you mean argc_new and argv_new instead?
> >>>> Do you have the same error without CUDA?
> >>>>
> >>>> Cheers,
> >>>>
> >>>> Matthieu
> >>>>
> >>>>
> >>>> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
> >>>> > Hi,
> >>>> >
> >>>> > I tried to augment the command line argument list by allocating my own
> >>>> > list of strings and passing them to MPI_Init, yet I got a segmentation
> >>>> > fault for both OpenMPI 1.6.3 and 1.7.2, while the code works fine with
> >>>> > MPICH2. The code is:
> >>>> >
> >>>> > #include "mpi.h"
> >>>> > #include "cuda_runtime.h"
> >>>> > #include <cstdlib>
> >>>> > #include <cstring>
> >>>> > #include <cmath>
> >>>> >
> >>>> > int main(int argc, char **argv)
> >>>> > {
> >>>> >     int device = 0;
> >>>> >     int skip = 0;
> >>>> >     bool skipmode = false;
> >>>> >     bool specified = false;
> >>>> >     for( int i = 0 ; i < argc ; i++ )
> >>>> >     {
> >>>> >         if ( strcmp( argv[i], "-device" ) == 0 )
> >>>> >         {
> >>>> >             i++;
> >>>> >             if ( argv[i][0] == '-' )
> >>>> >             {
> >>>> >                 skipmode = true;
> >>>> >                 skip = fabs( atoi( argv[i] ) );
> >>>> >             }
> >>>> >             else
> >>>> >             {
> >>>> >                 skipmode = false;
> >>>> >                 device = atoi( argv[i] );
> >>>> >             }
> >>>> >             specified = true;
> >>>> >         }
> >>>> >     }
> >>>> >
> >>>> >     if ( !specified || skipmode )
> >>>> >     {
> >>>> >         char* var;
> >>>> >         int dev_count, local_rank = 0;
> >>>> >         if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank = atoi(var);
> >>>> >         else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
> >>>> >         else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
> >>>> >         cudaGetDeviceCount( &dev_count );
> >>>> >         if ( skipmode )
> >>>> >         {
> >>>> >             device = 0;
> >>>> >             if ( device == skip ) local_rank++;
> >>>> >             while( local_rank-- > 0 )
> >>>> >             {
> >>>> >                 device = (++device) % dev_count;
> >>>> >                 if ( device == skip ) local_rank++;
> >>>> >             }
> >>>> >         }
> >>>> >         else device = local_rank % dev_count;
> >>>> >     }
> >>>> >
> >>>> >     // override command line arguments to make sure cudaengine get the correct one
> >>>> >     char **argv_new = new char*[ argc + 2 ];
> >>>> >     for( int i = 0 ; i < argc ; i++ )
> >>>> >     {
> >>>> >         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
> >>>> >         strcpy( argv_new[i], argv[i] );
> >>>> >     }
> >>>> >     argv_new[ argc ] = new char[ 32 ];
> >>>> >     argv_new[ argc+1 ] = new char[ 32 ];
> >>>> >     strcpy( argv_new[argc], "-device" );
> >>>> >     sprintf( argv_new[argc+1], "%d", device );
> >>>> >     argc += 2;
> >>>> >     argv = argv_new;
> >>>> >
> >>>> >     cudaSetDevice( device );
> >>>> >
> >>>> >     MPI_Init(&argc,&argv);
> >>>> >
> >>>> >     // do something...
> >>>> >
> >>>> >     MPI_Finalize();
> >>>> >
> >>>> >     cudaDeviceReset();
> >>>> >     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
> >>>> >     delete [] argv;
> >>>> > }
> >>>> >
> >>>> > When compiled using nvcc -ccbin mpic++, the error I got was:
> >>>> >
> >>>> > [jueying:16317] *** Process received signal ***
> >>>> > [jueying:16317] Signal: Segmentation fault (11)
> >>>> > [jueying:16317] Signal code: Address not mapped (1)
> >>>> > [jueying:16317] Failing at address: 0x21
> >>>> > [jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
> >>>> > [jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
> >>>> > [jueying:16317] [ 2] /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39) [0x7f460b993079]
> >>>> > [jueying:16317] [ 3] /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347) [0x7f460c106a57]
> >>>> > [jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b) [0x7f460c12523b]
> >>>> > [jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
> >>>> > [jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5) [0x39e5621a05]
> >>>> > [jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
> >>>> > [jueying:16317] *** End of error message ***
> >>>> >
> >>>> > Thanks for the help.
> >>>> >
> >>>> > Best regards,
> >>>> > Yu-Hang Tang

--
Yu-Hang Tang
Room 105, 37 Manning St
Division of Applied Mathematics, Brown University
Providence, RI 02912