I notice the following:

- You're declaring an *enormous* static array; you might be better off allocating it on the heap.
- The value of "exchanged" will quickly grow beyond 2^31 (i.e., INT_MAX), which is the max that the MPI API can handle. Bad Things can/will happen beyond that value (i.e., you're keeping the value of "exchanged" in a long unsigned int, but MPI_Send and MPI_Recv only take an int count).
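
For example, here is a rough sketch of what I mean (illustrative only, not a drop-in replacement for your program; at these dimensions the buffer is still many GB, so you may also want to shrink M_ROW/M_COL while testing). It allocates the matrix on the heap and checks that the count handed to MPI_Send/MPI_Recv fits in an int:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

#define M_COL 4096
#define M_ROW 524288

int main (int argc, char *argv[]) {
    int rank, tag = 1;
    MPI_Status stat;

    MPI_Init (&argc, &argv);
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);

    /* heap allocation instead of a huge fixed-size array */
    unsigned long *matrix = malloc ((size_t) M_ROW * M_COL * sizeof *matrix);
    if (matrix == NULL) {
        fprintf (stderr, "rank %d: malloc failed\n", rank);
        MPI_Abort (MPI_COMM_WORLD, 1);
    }

    /* largest element count used in your loop: 64 * 2^24 */
    unsigned long exchanged = 64UL << 24;

    /* MPI_Send/MPI_Recv take an int count, so refuse (or split)
       anything that does not fit */
    if (exchanged > (unsigned long) INT_MAX) {
        fprintf (stderr, "count %lu does not fit in an int\n", exchanged);
        MPI_Abort (MPI_COMM_WORLD, 1);
    }

    if (rank == 0)
        MPI_Send (matrix, (int) exchanged, MPI_UNSIGNED_LONG, 1, tag, MPI_COMM_WORLD);
    else if (rank == 1)
        MPI_Recv (matrix, (int) exchanged, MPI_UNSIGNED_LONG, 0, tag, MPI_COMM_WORLD, &stat);

    free (matrix);
    MPI_Finalize ();
    return 0;
}

If you really do need to move more than INT_MAX elements in one shot, you'd have to split the transfer into several messages (or build a larger derived datatype) rather than passing the raw count.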


On Feb 27, 2009, at 10:00 AM, Vittorio Giovara wrote:

Hello, I'm posting here another problem with my installation.
I wanted to benchmark the differences between the tcp and openib transports.

If I run a simple non-MPI application I get
randori ~ # mpirun --mca btl tcp,self -np 2 -host randori -host tatami hostname
randori
tatami

but as soon as I switch to my benchmark program I get
mpirun  --mca btl tcp,self  -np 2 -host randori -host tatami graph
Master thread reporting
matrix size 33554432 kB, time is in [us]

and instead of starting the send/receive functions it just hangs there. I also checked the transmitted packets with Wireshark, but after the handshake no more packets are exchanged.

I read in the archives that there were some problems in this area, so I tried what was suggested in previous emails.

mpirun --mca btl ^openib  -np 2 -host randori -host tatami graph
mpirun --mca pml ob1 --mca btl tcp,self -np 2 -host randori -host tatami graph

These give exactly the same output as before (no MPI send/receive),
while the next command gives something more interesting:

mpirun --mca pml cm --mca btl tcp,self -np 2 -host randori -host tatami graph
--------------------------------------------------------------------------
No available pml components were found!

This means that there are no components of this type installed on your
system or all the components reported that they could not be used.

This is a fatal error; your MPI process is likely to abort.  Check the
output of the "ompi_info" command and ensure that components of this
type are available on your system.  You may also wish to check the
value of the "component_path" MCA parameter and ensure that it has at
least one directory that contains valid MCA components.

--------------------------------------------------------------------------
[tatami:06619] PML cm cannot be selected
mpirun noticed that job rank 0 with PID 6710 on node randori exited on signal 15 (Terminated).

which should not be possible, since if I run ompi_info --param all the cm PML component is listed:

                 MCA pml: cm (MCA v1.0, API v1.0, Component v1.2.8)
                 MCA pml: ob1 (MCA v1.0, API v1.0, Component v1.2.8)


My test program is quite simple, just a couple of MPI_Send and MPI_Recv calls (included after the signature).
Do you have any ideas that might help me?
Thanks a lot,
Vittorio

========================
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>   /* gettimeofday */

#define M_COL 4096
#define M_ROW 524288
#define NUM_MSG 25

unsigned long int  gigamatrix[M_ROW][M_COL];

int main (int argc, char *argv[]) {
    int numtasks, rank, dest, source, rc, tmp, count, tag=1;
    unsigned long int  exp, exchanged;
    unsigned long int i, j, e;
    unsigned long matsize;
    MPI_Status Stat;
    struct timeval timing_start, timing_end;
    double inittime = 0;
    long int totaltime = 0;

    MPI_Init (&argc, &argv);
    MPI_Comm_size (MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);


    if (rank == 0) {
        fprintf (stderr, "Master thread reporting\n", numtasks - 1);
        matsize = (long) M_COL * M_ROW / 64;
fprintf (stderr, "matrix size %d kB, time is in [us]\n", matsize);

        source = 1;
        dest = 1;

        /*warm up phase*/
        rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
        rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD, &Stat);
        rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
        rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
        rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD, &Stat);
        rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);

        for (e = 0; e < NUM_MSG; e++) {
            exp = pow (2, e);
            exchanged = 64 * exp;

            /*timing of ops*/
            gettimeofday (&timing_start, NULL);
            rc = MPI_Send (&gigamatrix[0], exchanged, MPI_UNSIGNED_LONG, dest, tag, MPI_COMM_WORLD);
            rc = MPI_Recv (&gigamatrix[0], exchanged, MPI_UNSIGNED_LONG, source, tag, MPI_COMM_WORLD, &Stat);
            gettimeofday (&timing_end, NULL);

            totaltime = (timing_end.tv_sec - timing_start.tv_sec) * 1000000 + (timing_end.tv_usec - timing_start.tv_usec);
            memset (&timing_start, 0, sizeof(struct timeval));
            memset (&timing_end, 0, sizeof(struct timeval));
            fprintf (stdout, "%lu kB\t%ld\n", exp, totaltime);
        }

        fprintf(stderr, "task complete\n");

    } else {
        if (rank >= 1) {
            dest = 0;
            source = 0;

            rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD, &Stat);
            rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
            rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD, &Stat);
            rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD, &Stat);
            rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
            rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD, &Stat);

            for (e = 0; e < NUM_MSG; e++) {
                exp = pow (2, e);
                exchanged = 64 * exp;

                rc = MPI_Recv (&gigamatrix[0], (unsigned) exchanged, MPI_UNSIGNED_LONG, source, tag, MPI_COMM_WORLD, &Stat);
                rc = MPI_Send (&gigamatrix[0], (unsigned) exchanged, MPI_UNSIGNED_LONG, dest, tag, MPI_COMM_WORLD);

            }
        }
    }

    MPI_Finalize ();

    return 0;
}




--
Jeff Squyres
Cisco Systems
