Dear All,
I wrote a simple test program that uses the MPI_Allgatherv function. The problem
appears when
the send buffer size becomes relatively large.
When Bufsize = 2^28 - 1, running on 4 processors: OK
When Bufsize = 2^28, running on 4 processors: Error
[btl_tcp_frag.c:209:mca_btl_tcp_frag_recv] mca_btl_tcp_frag_recv: readv
error (0xffffffff85f526f8, 2147483592) Bad address(1)
When Bufsize = 2^29 - 1, running on 2 processors: OK
When Bufsize = 2^29, running on 2 processors: Error
[btl_tcp_frag.c:209:mca_btl_tcp_frag_recv] mca_btl_tcp_frag_recv: readv
error (0xffffffff964605d0, 2147483632) Bad address(1)
Bufsize is not that close to the int limit, but the readv call in mca_btl_tcp_frag_recv
is given a size close to 2147483647. Does anyone have an idea why this error occurs?
Any suggestions for solving or avoiding this problem?
The simple test code is attached below:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
#include "mpi.h"
/* Print a diagnostic and terminate every rank of the job.
 *
 * MPI_Abort is used instead of a bare exit() so that the remaining ranks do
 * not hang inside a collective waiting for the rank that failed. */
static void abort_job(const char *msg)
{
    fprintf(stderr, "%s\n", msg);
    MPI_Abort(MPI_COMM_WORLD, 1);
    exit(1); /* only reached if MPI_Abort returns */
}

/*
 * Test driver for MPI_Allgatherv with large per-rank buffers.
 *
 * Every rank fills the first `bufsize` ints of its send buffer and gathers
 * the contributions of all ranks into `rbuf`, laid out back to back in rank
 * order (rank i lands at element offset bufsize * i).
 *
 * Returns 0 on success. Allocation failures, or a rank count for which the
 * gathered data would not fit in the receive buffer, abort the whole job.
 */
int main(int argc, char **argv)
{
    int myid, nproc;
    long i;
    long size;
    long bufsize;
    int *rbuf;
    int *sbuf;
    int *recvCount;
    int *displ;
    char hostname[MPI_MAX_PROCESSOR_NAME];
    int len;

    /* Capacity, in elements, of both the send and the receive buffer. */
    size = (long) 2*1024*1024*1024-1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Get_processor_name(hostname, &len);

    printf("I am process %d with pid: %d at %s\n", myid, getpid(), hostname);
    sleep(2);
    if (myid == 0) {
        printf("size : %ld\n", size);
    }

    /* Size the buffers by the element type. sizeof(MPI_INT) would be the size
     * of the MPI datatype handle, which is unrelated to sizeof(int). */
    sbuf = calloc(size, sizeof *sbuf);
    if (sbuf == NULL) {
        abort_job("fail to allocate memory of sbuf");
    }
    rbuf = calloc(size, sizeof *rbuf);
    if (rbuf == NULL) {
        abort_job("fail to allocate memory of rbuf");
    }

    recvCount = calloc(nproc, sizeof *recvCount);
    displ = calloc(nproc, sizeof *displ);
    if (recvCount == NULL || displ == NULL) {
        abort_job("fail to allocate memory of recvCount/displ");
    }

    bufsize = 268435456; //which is 2^28

    /* The gathered data occupies nproc * bufsize elements of rbuf. Refusing
     * anything larger than the buffer capacity prevents writing past rbuf and
     * also keeps every displacement within `size` (2^31 - 1), so the int
     * conversions below cannot overflow on platforms with 32-bit int. */
    if ((long) nproc * bufsize > size) {
        abort_job("nproc * bufsize does not fit in the receive buffer");
    }

    for (i = 0; i < nproc; ++i) {
        recvCount[i] = (int) bufsize;
        displ[i] = (int) (bufsize * i);
    }
    for (i = 0; i < bufsize; ++i) {
        sbuf[i] = (int) (myid + i);
    }

    printf("buffer size: %ld recvCount[0]:%d last displ index:%d\n",
           bufsize, recvCount[0], displ[nproc-1]);
    fflush(stdout);

    MPI_Allgatherv(sbuf, recvCount[0], MPI_INT, rbuf, recvCount, displ, MPI_INT,
                   MPI_COMM_WORLD);
    printf("OK\n");
    fflush(stdout);

    free(displ);
    free(recvCount);
    free(rbuf);
    free(sbuf);

    MPI_Finalize();
    return 0;
}