Hi,

I am running into a problem with MPI_Allreduce when large buffers are used. It does not appear to be unique to MPI_Allreduce; it occurs with MPI_Send/MPI_Recv as well. The test program is attached.

1) Running the MPI_Allreduce version:
# mpiexec -machinefile mfile -n 2 ./allreduce
choose algorithm: enter 1 for MPI_Allreduce
                  enter 2 for MPI_Send/Recv and MPI_Bcast
1
enter array size (integer; negative to stop):
40000000
allreduce completed 0.661867
enter array size (integer; negative to stop):
80000000
allreduce completed 1.356263
enter array size (integer; negative to stop):
160000000
allreduce completed 2.700941
enter array size (integer; negative to stop):
320000000

At this point the program just hangs forever.

2) Running the MPI_Send/MPI_Recv/MPI_Bcast version:

# mpiexec -machinefile mfile -n 2 ./allreduce
choose algorithm: enter 1 for MPI_Allreduce
                  enter 2 for MPI_Send/Recv and MPI_Bcast
2
enter array size (integer; negative to stop):
40000000
id=0 received data from id=1 in 0.263818
bcast completed in 0.652631
allreduce completed in 1.102356
enter array size (integer; negative to stop):
80000000
id=0 received data from id=1 in 0.671201
bcast completed in 1.298208
allreduce completed in 2.341906
enter array size (integer; negative to stop):
160000000
[[43618,1],0][btl_openib_component.c:2951:handle_wc] from b2 to: b1 error
polling LP CQ with status LOCAL LENGTH ERROR status number 1 for wr_id
102347120 opcode 1 vendor error 105 qp_idx 3
--------------------------------------------------------------------------
mpiexec has exited due to process rank 0 with PID 26254 on node b2 exiting
without calling "finalize". This may have caused other processes in the
application to be terminated by signals sent by mpiexec (as reported here).
--------------------------------------------------------------------------

All programs/libraries are 64-bit and the interconnect is InfiniBand. I would
expect problems for array sizes larger than 2^31-1 elements, but these array
sizes are still much smaller than that. What is the problem here?

Cheers,
Martin

--
Martin Siegert
Head, Research Computing
WestGrid Site Lead
IT Services                              phone: 778 782-4691
Simon Fraser University                  fax:   778 782-4242
Burnaby, British Columbia                email: sieg...@sfu.ca
Canada  V5A 1S6
/* Test program: compares MPI_Allreduce with a hand-rolled
 * MPI_Send/MPI_Recv + MPI_Bcast reduction for large double arrays. */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>
#include <limits.h>
#include <mpi.h>

void randv(double *, long);
int seeds(int);
int pseeds(int, int, int);
unsigned int lcg1664525(unsigned int *);

int main(int argc, char *argv[])
{
    long l, i;
    int myid, numprocs, id, ialg;
    MPI_Status status;
    double wt0, wt1, wt;
    double *sarr, *rarr;

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    pseeds(myid, numprocs, -1);

    if (myid == 0) {
        fprintf(stderr, "choose algorithm: enter 1 for MPI_Allreduce\n");
        fprintf(stderr, "                  enter 2 for MPI_Send/Recv and MPI_Bcast\n");
        scanf("%i", &ialg);
        if (ialg != 1 && ialg != 2) {
            fprintf(stderr, "must enter 1 or 2\n");
            MPI_Abort(MPI_COMM_WORLD, -1);
            exit(-1);
        }
        if (ialg == 2) ialg = 0;
    }
    MPI_Bcast(&ialg, 1, MPI_INT, 0, MPI_COMM_WORLD);

    for (;;) {
        if (myid == 0) {
            fprintf(stderr, "enter array size (integer; negative to stop): \n");
            scanf("%li", &l);
        }
        MPI_Bcast(&l, 1, MPI_LONG, 0, MPI_COMM_WORLD);
        if (l < 0) break;

        sarr = (double *)malloc(l*sizeof(double));
        rarr = (double *)malloc(l*sizeof(double));
        if (sarr == NULL || rarr == NULL) {
            fprintf(stderr, "id=%i: malloc of %li doubles failed\n", myid, l);
            MPI_Abort(MPI_COMM_WORLD, -1);
        }
        randv(sarr, l);

        if (ialg) {
            /* variant 1: plain MPI_Allreduce */
            if (myid == 0) wt0 = MPI_Wtime();
            MPI_Allreduce(sarr, rarr, (int)l, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
            if (myid == 0) {
                wt = MPI_Wtime();
                fprintf(stderr, "allreduce completed %f\n", wt - wt0);
            }
        } else {
            /* variant 2: receive on rank 0, accumulate, then broadcast */
            if (myid == 0) {
                for (i = 0; i < l; i++) {
                    rarr[i] = sarr[i];
                }
                wt0 = MPI_Wtime();
                for (id = 1; id <= numprocs - 1; id++) {
                    wt1 = MPI_Wtime();
                    MPI_Recv(sarr, (int)l, MPI_DOUBLE, id, id, MPI_COMM_WORLD, &status);
                    wt = MPI_Wtime();
                    fprintf(stderr, "id=0 received data from id=%i in %f\n",
                            id, wt - wt1);
                    for (i = 0; i < l; i++) {
                        rarr[i] += sarr[i];
                    }
                }
                wt1 = MPI_Wtime();
            } else {
                MPI_Send(sarr, (int)l, MPI_DOUBLE, 0, myid, MPI_COMM_WORLD);
            }
            MPI_Bcast(rarr, (int)l, MPI_DOUBLE, 0, MPI_COMM_WORLD);
            if (myid == 0) {
                wt = MPI_Wtime();
                fprintf(stderr, "bcast completed in %f\n", wt - wt1);
                fprintf(stderr, "allreduce completed in %f\n", wt - wt0);
            }
        }
        free(sarr);
        free(rarr);
    }
    MPI_Finalize();
    return 0;
}

/* ------------------------------------------------------------------ */
/* random number generator and seeding helpers                         */

unsigned int i__RNG;

unsigned int lcg1664525(unsigned int *iseed)
{
    /* linear congruential random number generator
     *    i(n+1) = [a * i(n) + c] mod m
     * with a=1664525, c=1013904223, and m=2^32
     * (see Numerical Recipes, ch. 7.1) */
    static unsigned int ia = 1664525, ic = 1013904223;
    *iseed = (*iseed)*ia + ic;
    return *iseed;
}

int seeds(int iseed)
{
    /* seed the RNG; a non-positive iseed selects a time-based seed */
    int init;
    struct timeval tv;
    struct timezone tz;

    if (iseed <= 0) {
        gettimeofday(&tv, &tz);
        iseed = tv.tv_usec * (INT_MAX/1000000);
    }
    init = iseed;
    i__RNG = iseed;
    return init;
}

int pseeds(int id, int numprocs, int iseed)
{
    /* This routine can be used in MPI programs to initialize the random
     * number generator randv so that different seeds are generated
     * depending on the id of the processor. Otherwise the functionality
     * is the same as explained above for the seeds routine, except that
     * the id (as obtained from MPI_Comm_rank) and the number of
     * processors (as returned from MPI_Comm_size) must be provided
     * additionally. */
    int init;
    struct timeval tv;
    struct timezone tz;

    if (iseed <= 0) {
        gettimeofday(&tv, &tz);
        iseed = (id + 1) * tv.tv_usec * (INT_MAX/(numprocs*1000000));
    }
    init = iseed;
    i__RNG = iseed;
    return init;
}

void randv(double *r, long n)
{
    /* fill r[0..n-1] with uniform random numbers in [0,1) */
    long i;
    const double rmax = 0.5/((double)INT_MAX + 1.);   /* 2^(-32) */

    for (i = 0; i < n; i++) {
        r[i] = lcg1664525(&i__RNG)*rmax;
    }
}
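P.S. In case the total message size in bytes is what matters (320,000,000 doubles is 2.56 GB, i.e. more than 2^31 bytes, even though the count argument itself is well below INT_MAX), a chunked variant of the reduction that keeps each individual MPI call below a fixed element count would look roughly like the sketch below. The chunk limit is an arbitrary choice for illustration, not anything prescribed by the MPI standard.

#include <mpi.h>

/* Sketch: split a large element-wise double sum into several
 * MPI_Allreduce calls of at most CHUNK elements each.
 * CHUNK is an arbitrary illustrative value
 * (16M doubles = 128 MB per call). */
#define CHUNK 16777216L

int allreduce_sum_chunked(double *sendbuf, double *recvbuf,
                          long n, MPI_Comm comm)
{
    long off;
    for (off = 0; off < n; off += CHUNK) {
        int cnt = (int)((n - off < CHUNK) ? (n - off) : CHUNK);
        int err = MPI_Allreduce(sendbuf + off, recvbuf + off, cnt,
                                MPI_DOUBLE, MPI_SUM, comm);
        if (err != MPI_SUCCESS) return err;
    }
    return MPI_SUCCESS;
}

In the test program above this would be called as
allreduce_sum_chunked(sarr, rarr, l, MPI_COMM_WORLD) in place of the single
MPI_Allreduce call.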