Hi,

I tried to augment the command line argument list by allocating my own list
of strings and passing them to MPI_Init, yet I got a segmentation fault for
both OpenMPI 1.6.3 and 1.7.2, while the code works fine with MPICH2. The
code is:

#include "mpi.h"
#include "cuda_runtime.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main(int argc, char **argv)
{
    int device = 0;
    int skip = 0;
    bool skipmode = false;
    bool specified = false;
    for( int i = 0 ; i < argc ; i++ )
    {
        if ( strcmp( argv[i], "-device" ) == 0 )
        {
            i++;
            if ( argv[i][0] == '-' )
            {
                skipmode = true;
                skip = fabs( atoi( argv[i] ) );
            }
            else
            {
                skipmode = false;
                device = atoi( argv[i] );
            }
            specified = true;
        }
    }

    if ( !specified || skipmode )
    {
        char* var;
        int dev_count, local_rank = 0;
        if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank =
atoi(var);
        else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK"))  != NULL)
local_rank = atoi(var);
        else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL)
local_rank = atoi(var);
        cudaGetDeviceCount( &dev_count );
        if ( skipmode )
        {
            device = 0;
            if ( device == skip ) local_rank++;
            while( local_rank-- > 0 )
            {
                device = (++device) % dev_count;
                if ( device == skip ) local_rank++;
            }
        }
        else device = local_rank % dev_count;
    }

    // override command line arguments to make sure cudaengine get the
correct one
    char **argv_new = new char*[ argc + 2 ];
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc   ] = new char[ 32 ];
    argv_new[ argc+1 ] = new char[ 32 ];
    strcpy( argv_new[argc],   "-device" );
    sprintf( argv_new[argc+1], "%d", device );
    argc += 2;
    argv = argv_new;

    cudaSetDevice( device );

    MPI_Init(&argc,&argv);

    // do something...

    MPI_Finalize();

    cudaDeviceReset();
    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
    delete [] argv;
}

When compiled using *nvcc -ccbin mpic++*, the error I got was:

[jueying:16317] *** Process received signal ***
[jueying:16317] Signal: Segmentation fault (11)
[jueying:16317] Signal code: Address not mapped (1)
[jueying:16317] Failing at address: 0x21
[jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
[jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
[jueying:16317] [ 2]
/opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39)
[0x7f460b993079]
[jueying:16317] [ 3]
/opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347) [0x7f460c106a57]
[jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b)
[0x7f460c12523b]
[jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
[jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5)
[0x39e5621a05]
[jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
[jueying:16317] *** End of error message ***

Thanks for the help.

Best regards,
Yu-Hang Tang

Reply via email to