Hi,

I am running into a problem with MPI_Allreduce when large buffers are used. It does not appear to be unique to MPI_Allreduce; it occurs with MPI_Send/MPI_Recv as well. The test program is attached.

1) Running the MPI_Allreduce version:
# mpiexec -machinefile mfile -n 2 ./allreduce
choose algorithm: enter 1 for MPI_Allreduce
                  enter 2 for MPI_Send/Recv and MPI_Bcast
1
enter array size (integer; negative to stop):
40000000
allreduce completed 0.661867
enter array size (integer; negative to stop):
80000000
allreduce completed 1.356263
enter array size (integer; negative to stop):
160000000
allreduce completed 2.700941
enter array size (integer; negative to stop):
320000000

At this point the program just hangs forever.

2) Running the MPI_Send/MPI_Recv/MPI_Bcast version:

# mpiexec -machinefile mfile -n 2 ./allreduce
choose algorithm: enter 1 for MPI_Allreduce
                  enter 2 for MPI_Send/Recv and MPI_Bcast
2
enter array size (integer; negative to stop):
40000000
id=0 received data from id=1 in 0.263818
bcast completed in 0.652631
allreduce completed in 1.102356
enter array size (integer; negative to stop):
80000000
id=0 received data from id=1 in 0.671201
bcast completed in 1.298208
allreduce completed in 2.341906
enter array size (integer; negative to stop):
160000000
[[43618,1],0][btl_openib_component.c:2951:handle_wc] from b2 to: b1 error
polling LP CQ with status LOCAL LENGTH ERROR status number 1 for wr_id
102347120 opcode 1 vendor error 105 qp_idx 3
--------------------------------------------------------------------------
mpiexec has exited due to process rank 0 with PID 26254 on node b2 exiting
without calling "finalize". This may have caused other processes in the
application to be terminated by signals sent by mpiexec (as reported here).
--------------------------------------------------------------------------

All programs/libraries are 64-bit and the interconnect is InfiniBand. I would
expect problems for array sizes larger than 2^31-1 elements, but these array
sizes are still much smaller than that. What is the problem here?

Cheers,
Martin

--
Martin Siegert
Head, Research Computing
WestGrid Site Lead
IT Services                              phone: 778 782-4691
Simon Fraser University                  fax:   778 782-4242
Burnaby, British Columbia                email: sieg...@sfu.ca
Canada  V5A 1S6
/* Test program: compares MPI_Allreduce with a hand-rolled
 * MPI_Send/MPI_Recv + MPI_Bcast reduction for large double arrays. */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>
#include <limits.h>
#include <mpi.h>

void randv(double *, long);
int seeds(int);
int pseeds(int, int, int);
unsigned int lcg1664525(unsigned int *);

int main(int argc, char *argv[])
{
    long l, i;
    int myid, numprocs, id, ialg;
    MPI_Status status;
    double wt0, wt1, wt;
    double *sarr, *rarr;

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    pseeds(myid, numprocs, -1);

    if (myid == 0) {
        fprintf(stderr, "choose algorithm: enter 1 for MPI_Allreduce\n");
        fprintf(stderr, "                  enter 2 for MPI_Send/Recv and MPI_Bcast\n");
        scanf("%i", &ialg);
        if (ialg != 1 && ialg != 2) {
            fprintf(stderr, "must enter 1 or 2\n");
            MPI_Abort(MPI_COMM_WORLD, -1);
            exit(-1);
        }
        if (ialg == 2) ialg = 0;
    }
    MPI_Bcast(&ialg, 1, MPI_INT, 0, MPI_COMM_WORLD);

    for (;;) {
        if (myid == 0) {
            fprintf(stderr, "enter array size (integer; negative to stop): \n");
            scanf("%li", &l);
        }
        MPI_Bcast(&l, 1, MPI_LONG, 0, MPI_COMM_WORLD);
        if (l < 0) break;

        sarr = (double *)malloc(l*sizeof(double));
        rarr = (double *)malloc(l*sizeof(double));
        if (sarr == NULL || rarr == NULL) {
            fprintf(stderr, "id=%i: malloc of %li doubles failed\n", myid, l);
            MPI_Abort(MPI_COMM_WORLD, -1);
        }
        randv(sarr, l);

        if (ialg) {
            /* variant 1: plain MPI_Allreduce */
            if (myid == 0) wt0 = MPI_Wtime();
            MPI_Allreduce(sarr, rarr, (int)l, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
            if (myid == 0) {
                wt = MPI_Wtime();
                fprintf(stderr, "allreduce completed %f\n", wt - wt0);
            }
        } else {
            /* variant 2: receive on rank 0, accumulate, then broadcast */
            if (myid == 0) {
                for (i = 0; i < l; i++) {
                    rarr[i] = sarr[i];
                }
                wt0 = MPI_Wtime();
                for (id = 1; id <= numprocs - 1; id++) {
                    wt1 = MPI_Wtime();
                    MPI_Recv(sarr, (int)l, MPI_DOUBLE, id, id, MPI_COMM_WORLD, &status);
                    wt = MPI_Wtime();
                    fprintf(stderr, "id=0 received data from id=%i in %f\n",
                            id, wt - wt1);
                    for (i = 0; i < l; i++) {
                        rarr[i] += sarr[i];
                    }
                }
                wt1 = MPI_Wtime();
            } else {
                MPI_Send(sarr, (int)l, MPI_DOUBLE, 0, myid, MPI_COMM_WORLD);
            }
            MPI_Bcast(rarr, (int)l, MPI_DOUBLE, 0, MPI_COMM_WORLD);
            if (myid == 0) {
                wt = MPI_Wtime();
                fprintf(stderr, "bcast completed in %f\n", wt - wt1);
                fprintf(stderr, "allreduce completed in %f\n", wt - wt0);
            }
        }
        free(sarr);
        free(rarr);
    }
    MPI_Finalize();
    return 0;
}

/* ------------------------------------------------------------------ */
/* random number generator and seeding helpers                         */

unsigned int i__RNG;

unsigned int lcg1664525(unsigned int *iseed)
{
    /* linear congruential random number generator
     *    i(n+1) = [a * i(n) + c] mod m
     * with a=1664525, c=1013904223, and m=2^32
     * (see Numerical Recipes, ch. 7.1) */
    static unsigned int ia = 1664525, ic = 1013904223;
    *iseed = (*iseed)*ia + ic;
    return *iseed;
}

int seeds(int iseed)
{
    /* seed the RNG; a non-positive iseed selects a time-based seed */
    int init;
    struct timeval tv;
    struct timezone tz;

    if (iseed <= 0) {
        gettimeofday(&tv, &tz);
        iseed = tv.tv_usec * (INT_MAX/1000000);
    }
    init = iseed;
    i__RNG = iseed;
    return init;
}

int pseeds(int id, int numprocs, int iseed)
{
    /* This routine can be used in MPI programs to initialize the random
     * number generator randv so that different seeds are generated
     * depending on the id of the processor. Otherwise the functionality
     * is the same as explained above for the seeds routine, except that
     * the id (as obtained from MPI_Comm_rank) and the number of
     * processors (as returned from MPI_Comm_size) must be provided
     * additionally. */
    int init;
    struct timeval tv;
    struct timezone tz;

    if (iseed <= 0) {
        gettimeofday(&tv, &tz);
        iseed = (id + 1) * tv.tv_usec * (INT_MAX/(numprocs*1000000));
    }
    init = iseed;
    i__RNG = iseed;
    return init;
}

void randv(double *r, long n)
{
    /* fill r[0..n-1] with uniform random numbers in [0,1) */
    long i;
    const double rmax = 0.5/((double)INT_MAX + 1.);   /* 2^(-32) */

    for (i = 0; i < n; i++) {
        r[i] = lcg1664525(&i__RNG)*rmax;
    }
}
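P.S. In case the total message size in bytes is what matters (320,000,000 doubles is 2.56 GB, i.e. more than 2^31 bytes, even though the count argument itself is well below INT_MAX), a chunked variant of the reduction that keeps each individual MPI call below a fixed element count would look roughly like the sketch below. The chunk limit is an arbitrary choice for illustration, not anything prescribed by the MPI standard.

#include <mpi.h>

/* Sketch: split a large element-wise double sum into several
 * MPI_Allreduce calls of at most CHUNK elements each.
 * CHUNK is an arbitrary illustrative value
 * (16M doubles = 128 MB per call). */
#define CHUNK 16777216L

int allreduce_sum_chunked(double *sendbuf, double *recvbuf,
                          long n, MPI_Comm comm)
{
    long off;
    for (off = 0; off < n; off += CHUNK) {
        int cnt = (int)((n - off < CHUNK) ? (n - off) : CHUNK);
        int err = MPI_Allreduce(sendbuf + off, recvbuf + off, cnt,
                                MPI_DOUBLE, MPI_SUM, comm);
        if (err != MPI_SUCCESS) return err;
    }
    return MPI_SUCCESS;
}

In the test program above this would be called as
allreduce_sum_chunked(sarr, rarr, l, MPI_COMM_WORLD) in place of the single
MPI_Allreduce call.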