Hi Edgar.
Thanks for the response. The simplified code is attached: server, client
and a .h containing some constants. I put some "prints" to show the
behavior.
Regards
Rodrigo
On Tue, Mar 20, 2012 at 11:47 AM, Edgar Gabriel <gabr...@cs.uh.edu> wrote:
> do you have by any chance the actual or a small reproducer? It might be
> much easier to hunt the problem down...
>
> Thanks
> Edgar
>
> On 3/19/2012 8:12 PM, Rodrigo Oliveira wrote:
> > Hi there.
> >
> > I am facing a very strange problem when using MPI_Barrier over an
> > inter-communicator after some operations I describe below:
> >
> > 1) I start a server calling mpirun.
> > 2) The server spawns 2 copies of a client using MPI_Comm_spawn, creating
> > an inter-communicator between the two groups. The server group with 1
> > process (let's name it A) and the client group with 2 processes (group
> B).
> > 3) After that, I need to detach one of the processes (rank 0) in group B
> > from the inter-communicator AB. To do that I do the following steps:
> >
> > Server side:
> > .....
> > tmp_inter_comm = client_comm.Create ( client_comm.Get_group ( )
> );
> > client_comm.Free ( );
> > client_comm = tmp_inter_comm;
> > .....
> > client_comm.Barrier();
> > .....
> >
> > Client side:
> > ....
> > rank = 0;
> > tmp_inter_comm = server_comm.Create ( server_comm.Get_group (
> > ).Excl ( 1, &rank ) );
> > server_comm.Free ( );
> > server_comm = tmp_inter_comm;
> > .....
> > if (server_comm != MPI::COMM_NULL)
> > server_comm.Barrier();
> >
> >
> > The problem: everything works fine until the call to Barrier. In that
> > point, the server exits the barrier, but the client at the group B does
> > not. Observe that we have only one process inside B, because I used Excl
> > to remove one process from the original group.
> >
> > p.s.: This occurs in Open MPI version 1.5.4, using the C++ API.
> >
> > I am very concerned about this problem because this solution plays a
> > very important role in my master thesis.
> >
> > Is this an ompi problem or am I doing something wrong?
> >
> > Thanks in advance
> >
> > Rodrigo Oliveira
> >
> >
> > _______________________________________________
> > users mailing list
> > us...@open-mpi.org
> > http://www.open-mpi.org/mailman/listinfo.cgi/users
>
>
> _______________________________________________
> users mailing list
> us...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/users
>
#define OP_SHUT 0
#define OP_REM 1
#include <mpi.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <iostream>
#include <iostream>
#include <vector>
#include <string>
#include "constants.h"
using namespace std;
MPI::Intercomm spawn ( MPI::Intracomm& self_comm, vector<vector<string> > & argv, vector<string>& commands, vector<string>& hosts, int* number_process, string& work_directory );
void removeClient ( MPI::Intercomm& client_comm, int rank );
int main ( int argc, char *argv[] ) {
MPI::Init ( argc, argv );
/** Initializations */
MPI::Intracomm self_comm = MPI::COMM_SELF;
MPI::Intercomm client_comm;
vector < string > commands;
vector < vector<string> > arguments;
vector < string > hosts;
int number_process[1];
vector < string > arg;
commands.push_back ( "client" );
arg.push_back ( "none" );
arguments.push_back ( arg );
hosts.push_back ( "hera" ); /** This is the host where the server will spawn the clients. */
number_process[0] = 2;
string server_work_directory = "/home/speed/rsilva/Desktop"; /** This have to be changed. */
cout << "++++++++++++++ Server spawning 2 copies of client" << endl;
client_comm = spawn ( self_comm, arguments, commands, hosts, number_process, server_work_directory );
cout << "++++++++++++++ Server removing the client rank 0" << endl;
removeClient ( client_comm, 0 );
for (int i = 0; i < client_comm.Get_remote_size ( ); ++i) {
client_comm.Send ( NULL, 0, MPI::BYTE, i, OP_SHUT );
}
cout << "++++++++++++++ Server before barrier " << endl;
client_comm.Barrier();
cout << "++++++++++++++ Server after barrier " << endl;
MPI::Finalize ( );
}
void removeClient ( MPI::Intercomm& client_comm, int rank ) {
MPI::Intercomm tmp_inter_comm;
int message;
message = rank;
for (int i = 0; i < client_comm.Get_remote_size ( ); ++i) {
client_comm.Send ( (void*) &message, sizeof(int), MPI::BYTE, i, OP_REM );
}
tmp_inter_comm = client_comm.Create ( client_comm.Get_group ( ) );
client_comm.Free ( );
client_comm = tmp_inter_comm;
}
MPI::Intercomm spawn ( MPI::Intracomm& self_comm, vector<vector<string> > & argv, vector<string>& commands, vector<string>& hosts, int* number_process, string& work_directory ) {
char* cmd[commands.size ( )];
char **arguments[argv.size ( )];
MPI::Info info[hosts.size ( )];
MPI::Intercomm children_intercomm;
string err_message;
/** Creates the data structures required by the mpi spawn_multiple command. */
for (int i = 0; i < (int) hosts.size ( ); ++i) {
/** Commands */
cmd[i] = (char*) malloc ( sizeof(char) * commands[i].length ( ) );
strcpy ( cmd[i], commands[i].c_str ( ) );
/** Commands' arguments */
arguments[i] = (char**) malloc ( sizeof(char*) * (argv[i].size ( ) + 1) );
for (int j = 0; j < (int) argv[i].size ( ); ++j) {
arguments[i][j] = (char*) malloc ( sizeof(char) * argv[i][j].length ( ) );
strcpy ( arguments[i][j], argv[i][j].c_str ( ) );
}
arguments[i][argv[i].size ( )] = NULL;
/** Infos */
info[i] = MPI::Info::Create ( );
info[i].Set ( "host", hosts[i].c_str ( ) );
info[i].Set ( "wdir", work_directory.c_str ( ) );
}
/** Tries to spawn the processes. */
children_intercomm = self_comm.Spawn_multiple ( hosts.size ( ), (const char**) cmd, (const char***) arguments, number_process, info, 0 );
/** Releases the used memory space. */
for (int i = 0; i < (int) hosts.size ( ); ++i) {
info[i].Free ( );
free ( cmd[i] );
for (int j = 0; j < (int) argv[i].size ( ); ++j) {
free ( arguments[i][j] );
}
free ( arguments[i] );
}
return children_intercomm;
}
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <mpi.h>
#include "constants.h"
using namespace std;
void removeRank ( MPI::Intracomm& local_comm, MPI::Intercomm& server_comm, int rank );
int main ( int argc, char *argv[] ) {
/** Initializes the communicators */
MPI::Init ( argc, argv );
MPI::Intracomm local_comm = MPI::COMM_WORLD;
MPI::Intercomm server_comm = MPI::Comm::Get_parent ( );
MPI::Status status;
bool shut_notification = false;
/** Waits messages until receiving the shutdown notification message. */
do {
if (server_comm == MPI::COMM_NULL || local_comm == MPI::COMM_NULL) { /** Process was detached from the communicators. */
shut_notification = true;
}
else if (server_comm.Iprobe ( MPI_ANY_SOURCE, MPI_ANY_TAG, status )) {
int message;
server_comm.Recv ( &message, sizeof(int), MPI::BYTE, MPI_ANY_SOURCE, MPI_ANY_TAG, status );
if (status.Get_tag ( ) == OP_SHUT) {
shut_notification = true;
}
else if (status.Get_tag ( ) == OP_REM) { /** >>>>>>>>>> Calls the function to detach a process. */
removeRank ( local_comm, server_comm, message );
}
}
} while (!shut_notification);
/** Only the client that was not removed calls the barrier within the server communicator */
if (server_comm != MPI::COMM_NULL) {
cout << "++++++++++++++ Client - before barrier " << endl;
server_comm.Barrier();
cout << "++++++++++++++ Client - after barrier " << endl;
}
cout << "++++++++++++++ Client terminating" << endl;
MPI::Finalize ( );
return 0;
}
void removeRank ( MPI::Intracomm& local_comm, MPI::Intercomm& server_comm, int rank ) {
/** Removes the process from the server inter-communicator. */
MPI::Intercomm tmp_inter_comm = server_comm.Create ( server_comm.Get_group ( ).Excl ( 1, &rank ) );
server_comm.Free ( );
server_comm = tmp_inter_comm;
/** Removes the process from the local intra-communicator. */
MPI::Intracomm tmp_intra_comm = local_comm.Create ( local_comm.Get_group ( ).Excl ( 1, &rank ) );
local_comm = tmp_intra_comm;
}