Hi Edgar.

Thanks for the response. The simplified code is attached: server, client
and a .h containing some constants. I put some "prints" to show the
behavior.
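To reproduce it: compile both programs with mpic++, name the client binary
"client" (that is the command the server spawns), adjust the host name and
the work directory in the server source, and launch the server alone with
mpirun -np 1.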

Regards

Rodrigo


On Tue, Mar 20, 2012 at 11:47 AM, Edgar Gabriel <gabr...@cs.uh.edu> wrote:

> do you have by any chance the actual code or a small reproducer? It might
> make it much easier to hunt the problem down...
>
> Thanks
> Edgar
>
> On 3/19/2012 8:12 PM, Rodrigo Oliveira wrote:
> > Hi there.
> >
> > I am facing a very strange problem when using MPI_Barrier over an
> > inter-communicator after some operations I describe below:
> >
> > 1) I start a server by calling mpirun.
> > 2) The server spawns 2 copies of a client using MPI_Comm_spawn, creating
> > an inter-communicator between the two groups: the server group with 1
> > process (let's call it group A) and the client group with 2 processes
> > (group B).
> > 3) After that, I need to detach one of the processes (rank 0) in group B
> > from the inter-communicator AB. To do that I take the following steps:
> >
> > Server side:
> >         .....
> >         tmp_inter_comm = client_comm.Create ( client_comm.Get_group ( ) );
> >         client_comm.Free ( );
> >         client_comm = tmp_inter_comm;
> >         .....
> >         client_comm.Barrier();
> >         .....
> >
> > Client side:
> >         ....
> >         rank = 0;
> >         tmp_inter_comm = server_comm.Create ( server_comm.Get_group ( ).Excl ( 1, &rank ) );
> >         server_comm.Free ( );
> >         server_comm = tmp_inter_comm;
> >         .....
> >         if (server_comm != MPI::COMM_NULL)
> >             server_comm.Barrier();
> >
> >
> > The problem: everything works fine until the call to Barrier. At that
> > point, the server exits the barrier, but the client in group B does not.
> > Note that there is only one process left in B, because I used Excl to
> > remove one process from the original group.
> >
> > P.S.: This happens with Open MPI 1.5.4, using the C++ API.
> >
> > I am very concerned about this problem because this solution plays a
> > very important role in my master's thesis.
> >
> > Is this an ompi problem or am I doing something wrong?
> >
> > Thanks in advance
> >
> > Rodrigo Oliveira
> >
> >
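
A note on what the Barrier is waiting for: on an inter-communicator,
MPI_Barrier returns in one group only after all processes of the other
group have entered it, so after the Excl step the server should be waiting
for exactly one client, and that client for the single server process.
A minimal diagnostic sketch (not part of the attached reproducer; the
helper name is made up) that prints what each side sees right before the
Barrier:

#include <mpi.h>
#include <iostream>

/** Hypothetical helper: call it with client_comm on the server and with
 *  server_comm on the surviving client, immediately before the Barrier. */
void dump_intercomm ( const char* who, MPI::Intercomm& comm ) {
	if (comm == MPI::COMM_NULL) {
		std::cout << who << ": communicator is MPI::COMM_NULL" << std::endl;
		return;
	}
	std::cout << who
	          << ": local rank "  << comm.Get_rank ( )
	          << ", local size "  << comm.Get_size ( )
	          << ", remote size " << comm.Get_remote_size ( )
	          << std::endl;
}

/* ------------------------------ constants.h ------------------------------ */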
#define OP_SHUT 0
#define OP_REM 1
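
/* -------------------------------- server ---------------------------------- */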
#include <mpi.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>	/** For strcpy. */
#include <iostream>
#include <vector>
#include <string>

#include "constants.h"

using namespace std;

MPI::Intercomm spawn ( MPI::Intracomm& self_comm, vector<vector<string> > & argv, vector<string>& commands, vector<string>& hosts, int* number_process, string& work_directory );
void removeClient ( MPI::Intercomm& client_comm, int rank );

int main ( int argc, char *argv[] ) {

	MPI::Init ( argc, argv );
	
	/** Initializations */
	MPI::Intracomm self_comm = MPI::COMM_SELF;
	MPI::Intercomm client_comm;

	vector < string > commands;
	vector < vector<string> > arguments;
	vector < string > hosts;
	int number_process[1];
	vector < string > arg;

	commands.push_back ( "client" );
	arg.push_back ( "none" );
	arguments.push_back ( arg );
	hosts.push_back ( "hera" );	/** This is the host where the server will spawn the clients. */
	number_process[0] = 2;
	string server_work_directory = "/home/speed/rsilva/Desktop";	/** This has to be changed to a valid directory. */
	
	cout << "++++++++++++++ Server spawning 2 copies of client" << endl;
	client_comm = spawn ( self_comm, arguments, commands, hosts, number_process, server_work_directory );

	cout << "++++++++++++++ Server removing the client rank 0" << endl;
	removeClient ( client_comm, 0 );
	
	/** Tells every client that is still attached to shut down: a zero-byte message tagged OP_SHUT. */
	for (int i = 0; i < client_comm.Get_remote_size ( ); ++i) {
		client_comm.Send ( NULL, 0, MPI::BYTE, i, OP_SHUT );
	}

	cout << "++++++++++++++ Server before barrier " << endl;
	client_comm.Barrier();
	cout << "++++++++++++++ Server after barrier " << endl;

	MPI::Finalize ( );
}

void removeClient ( MPI::Intercomm& client_comm, int rank ) {

	MPI::Intercomm tmp_inter_comm;
	int message;
	message = rank;

	/** Tells every client which rank is being detached: the rank travels in a message tagged OP_REM. */
	for (int i = 0; i < client_comm.Get_remote_size ( ); ++i) {
		client_comm.Send ( (void*) &message, sizeof(int), MPI::BYTE, i, OP_REM );
	}

	/** Re-creates the inter-communicator. The server keeps its complete local group; the clients exclude the detached rank on their side. */
	tmp_inter_comm = client_comm.Create ( client_comm.Get_group ( ) );
	client_comm.Free ( );
	client_comm = tmp_inter_comm;
}

MPI::Intercomm spawn ( MPI::Intracomm& self_comm, vector<vector<string> > & argv, vector<string>& commands, vector<string>& hosts, int* number_process, string& work_directory ) {

	char* cmd[commands.size ( )];
	char **arguments[argv.size ( )];
	MPI::Info info[hosts.size ( )];
	MPI::Intercomm children_intercomm;
	string err_message;

	/** Creates the data structures required by the mpi spawn_multiple command. */
	for (int i = 0; i < (int) hosts.size ( ); ++i) {
		/** Commands */
		cmd[i] = (char*) malloc ( commands[i].length ( ) + 1 );	/** +1 for the terminating '\0'. */
		strcpy ( cmd[i], commands[i].c_str ( ) );

		/** Commands' arguments */
		arguments[i] = (char**) malloc ( sizeof(char*) * (argv[i].size ( ) + 1) );
		for (int j = 0; j < (int) argv[i].size ( ); ++j) {
			arguments[i][j] = (char*) malloc ( argv[i][j].length ( ) + 1 );	/** +1 for the terminating '\0'. */
			strcpy ( arguments[i][j], argv[i][j].c_str ( ) );
		}
		arguments[i][argv[i].size ( )] = NULL;

		/** Infos */
		info[i] = MPI::Info::Create ( );
		info[i].Set ( "host", hosts[i].c_str ( ) );
		info[i].Set ( "wdir", work_directory.c_str ( ) );
	}

	/** Tries to spawn the processes. */
	children_intercomm = self_comm.Spawn_multiple ( hosts.size ( ), (const char**) cmd, (const char***) arguments, number_process, info, 0 );

	/** Releases the used memory space. */
	for (int i = 0; i < (int) hosts.size ( ); ++i) {
		info[i].Free ( );
		free ( cmd[i] );
		for (int j = 0; j < (int) argv[i].size ( ); ++j) {
			free ( arguments[i][j] );
		}
		free ( arguments[i] );
	}

	return children_intercomm;
}
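
/* -------------------------------- client ---------------------------------- */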
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <mpi.h>

#include "constants.h"

using namespace std;

void removeRank ( MPI::Intracomm& local_comm, MPI::Intercomm& server_comm, int rank );

int main ( int argc, char *argv[] ) {

	/** Initializes the communicators */
	MPI::Init ( argc, argv );
	MPI::Intracomm local_comm = MPI::COMM_WORLD;
	MPI::Intercomm server_comm = MPI::Comm::Get_parent ( );
	
	MPI::Status status;
	bool shut_notification = false;
	
	/** Waits messages until receiving the shutdown notification message.  */
	do {
		if (server_comm == MPI::COMM_NULL || local_comm == MPI::COMM_NULL) { /** Process was detached from the communicators. */
			shut_notification = true;
		}
		else if (server_comm.Iprobe ( MPI_ANY_SOURCE, MPI_ANY_TAG, status )) {
			int message;
			server_comm.Recv ( &message, sizeof(int), MPI::BYTE, MPI_ANY_SOURCE, MPI_ANY_TAG, status );
			if (status.Get_tag ( ) == OP_SHUT) {
				shut_notification = true;
			}
			else if (status.Get_tag ( ) == OP_REM) {	/** >>>>>>>>>> Calls the function to detach a process. */
				removeRank ( local_comm, server_comm, message );
			}
		}
	} while (!shut_notification);

	/** Only the client that was not removed calls the barrier within the server communicator */
	if (server_comm != MPI::COMM_NULL) {
		cout << "++++++++++++++ Client - before barrier " << endl;
		server_comm.Barrier();
		cout << "++++++++++++++ Client - after barrier " << endl;
	}

	cout << "++++++++++++++ Client terminating" << endl;
	MPI::Finalize ( );
	return 0;
}

void removeRank ( MPI::Intracomm& local_comm, MPI::Intercomm& server_comm, int rank ) {
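	/** For the process whose rank is excluded, both Create calls below return MPI::COMM_NULL; the main loop treats that as the signal to stop. */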
	/** Removes the process from the server inter-communicator. */
	MPI::Intercomm tmp_inter_comm = server_comm.Create ( server_comm.Get_group ( ).Excl ( 1, &rank ) );
	server_comm.Free ( );
	server_comm = tmp_inter_comm;

	/** Removes the process from the local intra-communicator. */
	MPI::Intracomm tmp_intra_comm = local_comm.Create ( local_comm.Get_group ( ).Excl ( 1, &rank ) );
	local_comm = tmp_intra_comm;
}
