Kernighan and Ritchie's "The C Programming Language" - it goes all the way back to the original C definition, and the ISO C standard states it explicitly: "argv[argc] shall be a null pointer" (C99, 5.1.2.2.1 Program startup).
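
A hand-built argument vector therefore needs the same NULL sentinel before it is handed to MPI_Init. A minimal sketch of the fix, along the lines Matthieu suggests below - allocate one extra slot and NULL-terminate it (the "-device" handling is kept from Yu-Hang's test code, with the device id hard-wired to 0 here):

#include "mpi.h"
#include <cstring>
#include <cstdio>

int main( int argc, char **argv )
{
    // argc original entries + 2 new arguments + 1 slot for the sentinel
    char **argv_new = new char*[ argc + 3 ];
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc ]     = new char[ 32 ];
    argv_new[ argc + 1 ] = new char[ 32 ];
    strcpy( argv_new[ argc ], "-device" );
    sprintf( argv_new[ argc + 1 ], "%d", 0 );
    argv_new[ argc + 2 ] = NULL;   // argv[argc] == NULL, as required

    argc += 2;
    argv = argv_new;

    MPI_Init( &argc, &argv );
    // do something...
    MPI_Finalize();

    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
    delete [] argv;
    return 0;
}

Matthieu's opal_argv_join observation below is the mechanism behind the crash; a small illustration of the sentinel walk is at the very end of this message.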
On Nov 12, 2013, at 9:15 AM, Alex A. Granovsky <g...@classic.chem.msu.su> wrote:

> Hello,
>
>> It seems that argv[argc] should always be NULL according to the
>> standard. So the OMPI failure is not actually a bug!
>
> Could you please point to the exact document where this is explicitly stated?
> Otherwise, I'd assume this is a bug.
>
> Kind regards,
> Alex Granovsky
>
>
> -----Original Message-----
> From: Matthieu Brucher
> Sent: Tuesday, November 12, 2013 8:56 PM
> To: Open MPI Users
> Subject: Re: [OMPI users] Segmentation fault in MPI_Init when passing
> pointers allocated in main()
>
> It seems that argv[argc] should always be NULL according to the
> standard. So the OMPI failure is not actually a bug!
>
> Cheers,
>
> 2013/11/12 Matthieu Brucher <matthieu.bruc...@gmail.com>:
>> Interestingly enough, in ompi_mpi_init, opal_argv_join is called
>> without the array length, so I suppose that in the usual argc/argv
>> couple there is an additional entry at the end of argv which may be
>> NULL. So try allocating 3 additional values, the last being NULL, and
>> it may work.
>>
>> Cheers,
>>
>> Matthieu
>>
>> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
>>> I tried the following code without CUDA, and the error is still there:
>>>
>>> #include "mpi.h"
>>>
>>> #include <cstdlib>
>>> #include <cstring>
>>> #include <cmath>
>>>
>>> int main(int argc, char **argv)
>>> {
>>>     // override command line arguments to make sure cudaengine gets
>>>     // the correct one
>>>     char **argv_new = new char*[ argc + 2 ];
>>>     for( int i = 0 ; i < argc ; i++ )
>>>     {
>>>         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>>>         strcpy( argv_new[i], argv[i] );
>>>     }
>>>     argv_new[ argc ] = new char[ 32 ];
>>>     argv_new[ argc+1 ] = new char[ 32 ];
>>>     strcpy( argv_new[argc], "-device" );
>>>     sprintf( argv_new[argc+1], "%d", 0 );
>>>
>>>     argc += 2;
>>>     argv = argv_new;
>>>
>>>     MPI_Init(&argc,&argv);
>>>
>>>     // do something...
>>>
>>>     MPI_Finalize();
>>>
>>>     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>>     delete [] argv;
>>> }
>>>
>>> At the end of the program the pointer stored in argv is exactly that of
>>> argv_new, so this should not be a problem. Manually inserting printf
>>> tells me that the fault occurred at MPI_Init. The code works fine if I
>>> use MPI_Init(NULL,NULL) instead. The same code also compiles and runs
>>> without a problem on my laptop with mpich2-1.4.
>>>
>>> Best,
>>> Yu-Hang
>>>
>>>
>>>
>>> On Tue, Nov 12, 2013 at 11:18 AM, Matthieu Brucher
>>> <matthieu.bruc...@gmail.com> wrote:
>>>>
>>>> Hi,
>>>>
>>>> Are you sure this is the correct code? This seems strange and not a
>>>> good idea:
>>>>
>>>> MPI_Init(&argc,&argv);
>>>>
>>>> // do something...
>>>>
>>>> for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>>> delete [] argv;
>>>>
>>>> Did you mean argc_new and argv_new instead?
>>>> Do you have the same error without CUDA?
>>>>
>>>> Cheers,
>>>>
>>>> Matthieu
>>>>
>>>>
>>>> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
>>>> > Hi,
>>>> >
>>>> > I tried to augment the command line argument list by allocating my
>>>> > own list of strings and passing them to MPI_Init, yet I got a
>>>> > segmentation fault for both OpenMPI 1.6.3 and 1.7.2, while the code
>>>> > works fine with MPICH2.
>>>> > The code is:
>>>> >
>>>> > #include "mpi.h"
>>>> > #include "cuda_runtime.h"
>>>> > #include <cstdlib>
>>>> > #include <cstring>
>>>> > #include <cmath>
>>>> >
>>>> > int main(int argc, char **argv)
>>>> > {
>>>> >     int device = 0;
>>>> >     int skip = 0;
>>>> >     bool skipmode = false;
>>>> >     bool specified = false;
>>>> >     for( int i = 0 ; i < argc ; i++ )
>>>> >     {
>>>> >         if ( strcmp( argv[i], "-device" ) == 0 )
>>>> >         {
>>>> >             i++;
>>>> >             if ( argv[i][0] == '-' )
>>>> >             {
>>>> >                 skipmode = true;
>>>> >                 skip = fabs( atoi( argv[i] ) );
>>>> >             }
>>>> >             else
>>>> >             {
>>>> >                 skipmode = false;
>>>> >                 device = atoi( argv[i] );
>>>> >             }
>>>> >             specified = true;
>>>> >         }
>>>> >     }
>>>> >
>>>> >     if ( !specified || skipmode )
>>>> >     {
>>>> >         char* var;
>>>> >         int dev_count, local_rank = 0;
>>>> >         if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank = atoi(var);
>>>> >         else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
>>>> >         else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
>>>> >         cudaGetDeviceCount( &dev_count );
>>>> >         if ( skipmode )
>>>> >         {
>>>> >             device = 0;
>>>> >             if ( device == skip ) local_rank++;
>>>> >             while( local_rank-- > 0 )
>>>> >             {
>>>> >                 device = (++device) % dev_count;
>>>> >                 if ( device == skip ) local_rank++;
>>>> >             }
>>>> >         }
>>>> >         else device = local_rank % dev_count;
>>>> >     }
>>>> >
>>>> >     // override command line arguments to make sure cudaengine gets
>>>> >     // the correct one
>>>> >     char **argv_new = new char*[ argc + 2 ];
>>>> >     for( int i = 0 ; i < argc ; i++ )
>>>> >     {
>>>> >         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>>>> >         strcpy( argv_new[i], argv[i] );
>>>> >     }
>>>> >     argv_new[ argc ] = new char[ 32 ];
>>>> >     argv_new[ argc+1 ] = new char[ 32 ];
>>>> >     strcpy( argv_new[argc], "-device" );
>>>> >     sprintf( argv_new[argc+1], "%d", device );
>>>> >     argc += 2;
>>>> >     argv = argv_new;
>>>> >
>>>> >     cudaSetDevice( device );
>>>> >
>>>> >     MPI_Init(&argc,&argv);
>>>> >
>>>> >     // do something...
>>>> >
>>>> >     MPI_Finalize();
>>>> >
>>>> >     cudaDeviceReset();
>>>> >     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>>> >     delete [] argv;
>>>> > }
>>>> >
>>>> > When compiled using nvcc -ccbin mpic++, the error I got was:
>>>> >
>>>> > [jueying:16317] *** Process received signal ***
>>>> > [jueying:16317] Signal: Segmentation fault (11)
>>>> > [jueying:16317] Signal code: Address not mapped (1)
>>>> > [jueying:16317] Failing at address: 0x21
>>>> > [jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
>>>> > [jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
>>>> > [jueying:16317] [ 2] /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39) [0x7f460b993079]
>>>> > [jueying:16317] [ 3] /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347) [0x7f460c106a57]
>>>> > [jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b) [0x7f460c12523b]
>>>> > [jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
>>>> > [jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5) [0x39e5621a05]
>>>> > [jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
>>>> > [jueying:16317] *** End of error message ***
>>>> >
>>>> > Thanks for the help.
>>>> >
>>>> > Best regards,
>>>> > Yu-Hang Tang
>>>
>>> --
>>> Yu-Hang Tang
>>> Room 105, 37 Manning St
>>> Division of Applied Mathematics, Brown University
>>> Providence, RI 02912
>>
>> --
>> Information System Engineer, Ph.D.
>> Blog: http://matt.eifelle.com
>> LinkedIn: http://www.linkedin.com/in/matthieubrucher
>> Music band: http://liliejay.com/

_______________________________________________
users mailing list
us...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/users
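
A footnote on Matthieu's opal_argv_join remark above: a join that takes no array length has nothing to stop on except the NULL sentinel, so a vector without one is read past its end - the "Failing at address: 0x21" in the backtrace is consistent with such a stray read. A hypothetical sketch of this kind of join (illustrative only, not OMPI's actual implementation):

#include <cstdio>
#include <string>

// Joins a NULL-terminated vector of strings. There is no length
// parameter, so the NULL sentinel is the only end marker.
static std::string argv_join( char **argv, char delimiter )
{
    std::string joined;
    for( char **p = argv ; *p != NULL ; p++ )   // walks until the sentinel
    {
        if ( p != argv ) joined += delimiter;
        joined += *p;
    }
    return joined;
}

int main()
{
    char a0[] = "app", a1[] = "-device", a2[] = "0";
    char *good[] = { a0, a1, a2, NULL };    // good[3] is the sentinel
    printf( "%s\n", argv_join( good, ' ' ).c_str() );   // app -device 0

    // char *bad[] = { a0, a1, a2 };        // no sentinel: the loop above
    // argv_join( bad, ' ' );               // would read past the array
    return 0;
}

This would also explain why MPI_Init(NULL, NULL) works: in that case the join is never fed the user's vector.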