Hello,

We are hitting a serious performance issue, caused by missing pipelining
behavior in Open MPI when running over TCP. I have attached a test case.
Basically, what it does is

if (myrank == 0) {
        for (i = 0; i < N; i++)
                MPI_Isend(...);
} else {
        for (i = 0; i < N; i++)
                MPI_Irecv(...);
}
for (i = 0; i < N; i++)
        MPI_Wait(...);

with a printf before and after each call. With Open MPI over TCP, the result is:

0.182620: Isend 0 begin
0.182761: Isend 0 end
0.182766: Isend 1 begin
0.182782: Isend 1 end
...
0.183911: Isend 49 begin
0.183915: Isend 49 end
0.199028: Irecv 0 begin
0.199068: Irecv 0 end
0.199070: Irecv 1 begin
0.199072: Irecv 1 end
...
0.199187: Irecv 49 begin
0.199188: Irecv 49 end
0.233948: Isend 0 done!
0.269895: Isend 1 done!
...
1.982475: Isend 49 done!
1.984065: Irecv 0 done!
1.984078: Irecv 1 done!
...
1.984131: Irecv 49 done!

i.e. almost two seconds elapse between the start of the application and
the completion of the first Irecv, after which all the remaining Irecvs
complete immediately as well; it looks as if the communications had all
been grouped together.

This is really bad, because in our real use case we trigger a
computation after each MPI_Wait call, and we split the data into several
messages precisely so as to pipeline things: the first computation can
start as soon as the first message has been received, and thus overlaps
with the remaining receptions.
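
For reference, the pattern we rely on looks roughly like the sketch
below (compute() is just a placeholder for our per-message processing,
it is not part of the attached test case):

for (i = 0; i < N; i++) {
        /* as soon as message i has arrived ... */
        MPI_Wait(&request[i], MPI_STATUS_IGNORE);
        /* ... start working on it, overlapping with the remaining
           receptions; compute() is a placeholder */
        compute(c[i]);
}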

This problem only shows up with Open MPI over TCP: I am not getting this
behavior with Open MPI over InfiniBand, and I am not getting it either
with MPICH or MadMPI:

0.182168: Isend 0 begin
0.182235: Isend 0 end
0.182237: Isend 1 begin
0.182242: Isend 1 end
...
0.182842: Isend 49 begin
0.182844: Isend 49 end
0.200505: Irecv 0 begin
0.200564: Irecv 0 end
0.200567: Irecv 1 begin
0.200569: Irecv 1 end
...
0.201233: Irecv 49 begin
0.201234: Irecv 49 end
0.269511: Isend 0 done!
0.273154: Irecv 0 done!
0.341054: Isend 1 done!
0.344507: Irecv 1 done!
...
3.767726: Isend 49 done!
3.770637: Irecv 49 done!

There we do get pipelined reception: each Irecv completes shortly after
the corresponding Isend.

Is there a way to get the second, pipelined behavior with Open MPI over
TCP?

Samuel
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <sys/utsname.h>

/* run with mpirun --map-by node */
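/* rank 0 sends N messages of SIZE bytes (8 MiB) each to rank 1 */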

#define SIZE (8*1024*1024)
#define N 50

//#define DEBUG

int main(int argc, char *argv[]) {
	char *c[N];
	int rank;
	int i, repeat, flag;
	MPI_Request request[N];
	MPI_Status status;
	int done[N] = { 0 };
	char *actions[2] = { "Isend", "Irecv" };
	int ret;
	double start;
	struct utsname name;

	uname(&name);
	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
	fprintf(stderr,"I'm %d on %s\n", rank, name.nodename);
	MPI_Barrier(MPI_COMM_WORLD);
	start = MPI_Wtime();

	for (i = 0; i < N; i++)
	{
		c[i] = calloc(1,SIZE);
		c[i][0] = i;
		c[i][SIZE-1] = i;
	}

	if (rank == 0) {
		for (i = 0; i < N; i++)
		{
			fprintf(stderr,"%f: Isend %d begin\n", MPI_Wtime() - start, i);
			ret = MPI_Isend(c[i], SIZE, MPI_CHAR, 1, 0, MPI_COMM_WORLD, &request[i]);
			//ret = MPI_Issend(c[i], SIZE, MPI_CHAR, 1, 0, MPI_COMM_WORLD, &request[i]);
			//ret = MPI_Send(c[i], SIZE, MPI_CHAR, 1, 0, MPI_COMM_WORLD);
			assert(ret == MPI_SUCCESS);
			fprintf(stderr,"%f: Isend %d end\n", MPI_Wtime() - start, i);
		}
	} else {
		for (i = 0; i < N; i++)
		{
			fprintf(stderr,"%f: Irecv %d begin\n", MPI_Wtime() - start, i);
			ret = MPI_Irecv(c[i], SIZE, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &request[i]);
			assert(ret == MPI_SUCCESS);
			fprintf(stderr,"%f: Irecv %d end\n", MPI_Wtime() - start, i);
		}
	}

//if (rank)
{
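	/* four alternative completion schemes follow; only the last one (a
	   plain MPI_Wait loop) is compiled in, the disabled variants poll
	   with MPI_Test / MPI_Testany instead */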
#if 0
	do {
		repeat = 0;
		for (i = 0; i < N; i++)
		{
			if (!done[i])
			{
				repeat = 1;
#ifdef DEBUG
				fprintf(stderr,"%f: %s Test %d begin\n", MPI_Wtime() - start, actions[rank], i);
#endif
				ret = MPI_Test(&request[i], &done[i], &status);
				assert(ret == MPI_SUCCESS);
#ifdef DEBUG
				fprintf(stderr,"%f: %s Test %d end\n", MPI_Wtime() - start, actions[rank], i);
#endif
				if (done[i])
				{
					fprintf(stderr,"%f: %s %d done!\n", MPI_Wtime() - start, actions[rank], i);
					if (rank)
					{
						assert(c[i][0] == i);
						assert(c[i][SIZE-1] == i);
					}
				}
			}
		}
	} while(repeat);
#elif 0
	repeat = N;
	do {
		ret = MPI_Testany(N, request, &i, &flag, &status);
		assert(ret == MPI_SUCCESS);
		if (flag)
		{
			fprintf(stderr,"%f: %s %d done!\n", MPI_Wtime() - start, actions[rank], i);
			if (rank)
			{
				assert(c[i][0] == i);
				assert(c[i][SIZE-1] == i);
			}
			repeat--;
		}
	} while (repeat);
#elif 0
	for (i = 0; i < N; i++)
	{
		do
		{
#ifdef DEBUG
			fprintf(stderr,"%f: %s Test %d begin\n", MPI_Wtime() - start, actions[rank], i);
#endif
			ret = MPI_Test(&request[i], &flag, &status);
			assert(ret == MPI_SUCCESS);
#ifdef DEBUG
			fprintf(stderr,"%f: %s Test %d end\n", MPI_Wtime() - start, actions[rank], i);
#endif
		} while(!flag);
		fprintf(stderr,"%f: %s %d done!\n", MPI_Wtime() - start, actions[rank], i);
		if (rank)
		{
			assert(c[i][0] == i);
			assert(c[i][SIZE-1] == i);
		}
	}
#else
	for (i = 0; i < N; i++)
	{
		ret = MPI_Wait(&request[i], &status);
		assert(ret == MPI_SUCCESS);
		fprintf(stderr,"%f: %s %d done!\n", MPI_Wtime() - start, actions[rank], i);
		if (rank)
		{
			assert(c[i][0] == i);
			assert(c[i][SIZE-1] == i);
		}
	}
#endif
}

	MPI_Finalize();
	return 0;
}