Hi All,

My MPI program's basic task consists of regularly establishing point-to-point communication with other procs via MPI_Alltoall, and then communicating the data. I tested it on two HPC clusters with 32-256 MPI tasks. On one of the systems (HPC1) this custom collective runs flawlessly, while on the other (HPC2) it causes non-reproducible deadlocks (after a day of running, or after a few hours or so). So, I want to figure out whether this is a system (HPC2) bug that I can report to the HPC admins, or a subtle bug in my own code that needs to be fixed. One possibly important point: I communicate a huge amount of data between tasks (up to ~2 GB) in several all2all calls.
I would like expert eyes to look at the code to confirm or disprove that it is deadlock-safe. I have implemented several methods (METHOD1 - METHOD4) that, if I am not mistaken, should in principle be deadlock-safe. However, as a beginner MPI user I can easily miss something subtle, so I seek your help with this! I mostly used METHOD4, which has caused the periodic deadlocks, after having had deadlocks with METHOD1 and METHOD2. On HPC1 none of these methods deadlock in my runs. METHOD3 I am currently testing, so I cannot comment on it yet, but I will later; still, I am happy to hear your comments on it as well. Both systems use openmpi-1.4.3. Your answers will be of great help! Thanks!

Cheers,
Evghenii

Here is the code snippet:

template<class T>
void all2all(std::vector<T> sbuf[], std::vector<T> rbuf[],
             const int myid, const int nproc)
{
  // NMAXPROC, scale and datatype<T>() are defined elsewhere in the program
  static int nsend[NMAXPROC], nrecv[NMAXPROC];
  for (int p = 0; p < nproc; p++)
    nsend[p] = sbuf[p].size();

  // let the other tasks know how much data they will receive from this one
  MPI_Alltoall(nsend, 1, MPI_INT, nrecv, 1, MPI_INT, MPI_COMM_WORLD);

#ifdef _METHOD1_
  // post non-blocking sends/receives to all other ranks, then wait on all of them
  static MPI_Status  stat[NMAXPROC  ];
  static MPI_Request req [NMAXPROC*2];
  int nreq = 0;
  for (int p = 0; p < nproc; p++)
    if (p != myid)
    {
      const int scount = nsend[p];
      const int rcount = nrecv[p];
      rbuf[p].resize(rcount);
      if (scount > 0) MPI_Isend(&sbuf[p][0], scount, datatype<T>(), p, 1, MPI_COMM_WORLD, &req[nreq++]);
      if (rcount > 0) MPI_Irecv(&rbuf[p][0], rcount, datatype<T>(), p, 1, MPI_COMM_WORLD, &req[nreq++]);
    }
  rbuf[myid] = sbuf[myid];
  MPI_Waitall(nreq, req, stat);

#elif defined _METHOD2_
  // pairwise blocking exchange via MPI_Sendrecv
  static MPI_Status stat;
  for (int p = 0; p < nproc; p++)
    if (p != myid)
    {
      const int scount = nsend[p]*scale;
      const int rcount = nrecv[p]*scale;
      rbuf[p].resize(rcount);
      if (scount + rcount > 0)
        MPI_Sendrecv(&sbuf[p][0], scount, datatype<T>(), p, 1,
                     &rbuf[p][0], rcount, datatype<T>(), p, 1,
                     MPI_COMM_WORLD, &stat);
    }
  rbuf[myid] = sbuf[myid];

#elif defined _METHOD3_
  // ring-like schedule: at each distance post one Isend/Irecv pair and wait on it
  static MPI_Status  stat[NMAXPROC  ];
  static MPI_Request req [NMAXPROC*2];
  for (int dist = 1; dist < nproc; dist++)
  {
    const int src = (nproc + myid - dist) % nproc;
    const int dst = (nproc + myid + dist) % nproc;
    const int scount = nsend[dst]*scale;
    const int rcount = nrecv[src]*scale;
    rbuf[src].resize(rcount);
    int nreq = 0;
    if (scount > 0) MPI_Isend(&sbuf[dst][0], scount, datatype<T>(), dst, 1, MPI_COMM_WORLD, &req[nreq++]);
    if (rcount > 0) MPI_Irecv(&rbuf[src][0], rcount, datatype<T>(), src, 1, MPI_COMM_WORLD, &req[nreq++]);
    MPI_Waitall(nreq, req, stat);
  }
  rbuf[myid] = sbuf[myid];

#elif defined _METHOD4_
  // ring-like schedule with blocking Send/Recv, ordered by the parity of myid/dist
  static MPI_Status stat;
  for (int dist = 1; dist < nproc; dist++)
  {
    const int src = (nproc + myid - dist) % nproc;
    const int dst = (nproc + myid + dist) % nproc;
    const int scount = nsend[dst]*scale;
    const int rcount = nrecv[src]*scale;
    rbuf[src].resize(rcount);
    if ((myid/dist) & 1)
    {
      if (scount > 0) MPI_Send(&sbuf[dst][0], scount, datatype<T>(), dst, 1, MPI_COMM_WORLD);
      if (rcount > 0) MPI_Recv(&rbuf[src][0], rcount, datatype<T>(), src, 1, MPI_COMM_WORLD, &stat);
    }
    else
    {
      if (rcount > 0) MPI_Recv(&rbuf[src][0], rcount, datatype<T>(), src, 1, MPI_COMM_WORLD, &stat);
      if (scount > 0) MPI_Send(&sbuf[dst][0], scount, datatype<T>(), dst, 1, MPI_COMM_WORLD);
    }
  }
  rbuf[myid] = sbuf[myid];
#endif
}
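For completeness, here is roughly how the routine is invoked. This is a stripped-down, hypothetical driver (the dummy payload of 1024 ints per rank is made up here just to keep the sketch self-contained; NMAXPROC, datatype<T>() and the real buffer contents come from the rest of my application, which I cannot post in full):

// Simplified sketch of how all2all<T> is called; not the actual application code.
#include <mpi.h>
#include <vector>

int main(int argc, char *argv[])
{
  MPI_Init(&argc, &argv);
  int myid, nproc;
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  // one send vector and one receive vector per peer rank
  std::vector<int> sbuf[NMAXPROC], rbuf[NMAXPROC];
  for (int p = 0; p < nproc; p++)
    sbuf[p].assign(1024, myid);   // dummy payload; the real sizes vary per pair

  all2all(sbuf, rbuf, myid, nproc);   // the routine under discussion

  MPI_Finalize();
  return 0;
}

In the real code this exchange is executed repeatedly over many timesteps, with per-pair message sizes that vary from call to call and can add up to the ~2 GB per task mentioned above.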