The following routine gives a problem after some (not reproducible) time on 
Fedora Core 12. The routine is a CPU usage friendly version of MPI_Barrier.
The verbose output shows that if the problem occurs one of the (not 
reproducible) nodes does not receive a message from one of the other (not 
reproducible) nodes, so it looks like the message is lost or is never received. 
This routine worked fine on Fedora Core 10 with OpenMPI 1.3.x and works fine on 
Centos 5.3 with OpenMPI 1.3.x. The problem occurs with OpenMPI 1.3.x, OpenMPI 
1.4, gcc and icc.
My question is: is there a problem with this routine that I overlooked that 
somehow did not show up until now, and if not, how can I debug what causes this 
problem. Is there a way to see which messages have been sent/received/are 
pending?

Regards,
Gijsbert

local void my_barrier(char * info, MPI_Comm comm, int verbose)
{
        int ncomm;
        int comm_id;
        int send[MPI_NPROCS_MAX];
        MPI_Request request[MPI_NPROCS_MAX];
        int icomm;
        int done[MPI_NPROCS_MAX];
        time_t t0, t1;
        double wall[MPI_NPROCS_MAX];
        double wall_max;

        BUG(mpi_nprocs == 1)

        MPI_Comm_size(comm, &ncomm);
        BUG(ncomm < 1)
        MPI_Comm_rank(comm, &comm_id);

        my_printf("entering barrier %s %d %d\n", info, ncomm, comm_id);
        for (icomm = 0; icomm < ncomm; icomm++) send[icomm] = comm_id;
        for (icomm = 0; icomm < ncomm; icomm++)
        {
                if (icomm != comm_id)
                {
                        if (verbose) my_printf("sending from %d to %d\n", 
comm_id, icomm);
                        MPI_Isend(send + icomm, 1, MPI_INT, icomm, 
MPI_BARRIER_TAG,
                                comm, request + icomm);
                        done[icomm] = FALSE;
                }
                else
                {
                        done[icomm] = TRUE;
                }
                wall[icomm] = 0.0;
        }
        t0 = time(NULL);
        while(TRUE)
        {
                int receive;
                int flag;
                MPI_Status status;

                MPI_Iprobe(MPI_ANY_SOURCE, MPI_BARRIER_TAG,
                        comm, &flag, &status);
                if (!flag)
                {
                        my_sleep(0, BARRIER_POLL);
                        continue;
                }
                BUG(status.MPI_SOURCE < 0)
                BUG(status.MPI_SOURCE >= ncomm)
                MPI_Recv(&receive, 1, MPI_INT, status.MPI_SOURCE, 
MPI_BARRIER_TAG,
                        comm, &status);
                BUG(receive != status.MPI_SOURCE)
                BUG(done[status.MPI_SOURCE])
                if (verbose) my_printf("receiving from %d\n", 
status.MPI_SOURCE);

                t1 = time(NULL);
                done[status.MPI_SOURCE] = TRUE;
                wall[status.MPI_SOURCE] = difftime(t1, t0);

                for (icomm = 0; icomm < ncomm; icomm++)
                        if (!done[icomm]) break;
                if (icomm == ncomm) break;
        }
        my_printf("leaving barrier %s\n", info);

        wall_max = 0;
        for (icomm = 0; icomm < ncomm; icomm++)
        {       
                if (verbose)
                        my_printf("icomm=%d time=%.0f%s\n",
                                icomm, wall[icomm], icomm == comm_id ? " *" : 
"");
                if (wall[icomm] > wall_max) wall_max = wall[icomm];
        }
        //to be sure
        MPI_Barrier(comm);
        MPI_Allreduce(MPI_IN_PLACE, &wall_max, 1,
                MPI_DOUBLE, MPI_MAX, comm);
        my_printf("mpi wall_max=%.0f\n", wall_max);
}


Reply via email to