Hi all,

I am seeing very strange behaviour in a program: it seems that messages sent from one process to another are getting lost.

The problem is isolated in the attached source code, which works as follows. Two processes send each other 100k requests. Each request is answered, and each answer in turn triggers a number of new requests to the other process (an answer of N bytes triggers up to ceil(N/4) of them). As you might already suspect, the communication is asynchronous.

I have already debugged the application and found that, at some point during the communication, at least one of the processes no longer receives any messages and hangs in the while loop beginning at line 45.

The program is started with two processes on a single machine and no further options: "mpirun -np 2 ./mpi_test2".
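
It can be built with the usual Open MPI wrapper compiler, e.g. "mpicxx -o mpi_test2 mpi_test2.cpp".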

I would appreciate any help.

Best wishes,
Dennis

--
Dennis Luxen
Universität Karlsruhe (TH)           | Fon  : +49 (721) 608-6781
Institut für Theoretische Informatik | Fax  : +49 (721) 608-3088
Am Fasanengarten 5, Zimmer 220       | WWW  : algo2.ira.uka.de/luxen
D-76131 Karlsruhe, Germany           | Email: lu...@kit.edu
--------------------------------------------------------------------

#include <iostream>
#include <fstream>
#include <sstream>
#include <cassert>
#include <queue>
#include <list>
#include <cstdlib>
#include <mpi.h>

std::ofstream output_file;

//Message tags for requests, answers, and the termination notice
enum {REQUEST_TAG=4321, ANSWER_TAG, FINISHED_TAG};

typedef int Answer_type;


int main(int argc, char *argv[])
{
	MPI_Init (&argc, &argv);	// starts MPI
	int number_of_PEs, my_PE_ID;
	MPI_Comm_size(MPI_COMM_WORLD, &number_of_PEs);
	assert(number_of_PEs == 2);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_PE_ID);

	std::srand(123456);	//fixed seed; both ranks draw the same random sequence

	//each rank sends 100k requests, receives 100k from the peer,
	//and expects one answer per request it sent
	int number_of_requests_to_send = 100000;
	int number_of_requests_to_recv = number_of_requests_to_send;
	int number_of_answers_to_recv  = number_of_requests_to_send;

	std::stringstream filename;
	filename<<"output"<<my_PE_ID<<".txt";
	output_file.open(filename.str().c_str());

	int buffer[100];	//single buffer, reused for every send and receive
	MPI_Request dummy_request;	//all Isend handles end up here and are never completed

	//Send the first request
	MPI_Isend(buffer, 1, MPI_INT, 1-my_PE_ID, REQUEST_TAG, MPI_COMM_WORLD, &dummy_request);
	number_of_requests_to_send--;

	int working_PEs = number_of_PEs;
	bool lack_of_work_sent = false;
	bool there_was_change = true;
	while(working_PEs > 0)
	{
		if(there_was_change)
		{
			there_was_change = false;
			std::cout<<my_PE_ID<<": req_to_recv = "<<number_of_requests_to_recv
							<<", req_to_send = "<<number_of_requests_to_send
							<<", answers_to_recv = "<<number_of_answers_to_recv
							<<std::endl;
			output_file<<my_PE_ID<<": req_to_recv = "<<number_of_requests_to_recv
							<<", req_to_send = "<<number_of_requests_to_send
							<<", answers_to_recv = "<<number_of_answers_to_recv
							<<std::endl;
		}

		MPI_Status status;
		int flag = 1;
		int number_of_answer;
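		//nonblocking probe for any incoming message (the blocking MPI_Probe variant is left commented out)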
//		MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
		MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, &status);
		if(flag)
		{
			there_was_change = true;
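			//dispatch on the tag of the probed message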
			switch(status.MPI_TAG){
				case(REQUEST_TAG):
					MPI_Recv(buffer, 1, MPI_INT, status.MPI_SOURCE, REQUEST_TAG, MPI_COMM_WORLD, &status);
					//answer with a payload of 4, 8, 16, 32 or 64 bytes
					MPI_Isend(buffer, (1<<(std::rand()%5))*sizeof(int), MPI_BYTE, 1-my_PE_ID, ANSWER_TAG, MPI_COMM_WORLD, &dummy_request);
					number_of_requests_to_recv--;
				break;
				case(ANSWER_TAG):
					number_of_answers_to_recv--;
					MPI_Get_count( &status, MPI_BYTE, &number_of_answer);

					MPI_Recv(buffer, number_of_answer, MPI_BYTE, status.MPI_SOURCE, ANSWER_TAG, MPI_COMM_WORLD, &status);

					//an answer of N bytes triggers up to ceil(N/4) follow-up requests
					for(int i = (number_of_answer+3)/4; (i>0)&&(number_of_requests_to_send>0); i--)
					{
						MPI_Isend(buffer, 1, MPI_INT, 1-my_PE_ID, REQUEST_TAG, MPI_COMM_WORLD, &dummy_request);
						number_of_requests_to_send--;
					}
				break;
				case(FINISHED_TAG):
					MPI_Recv(buffer, 1, MPI_INT, status.MPI_SOURCE, FINISHED_TAG, MPI_COMM_WORLD, &status);
					working_PEs--;
				break;
			}
		}
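		//termination: once all expected answers are in, tell the peer once via
		//FINISHED_TAG and count this rank as done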
		if((number_of_answers_to_recv == 0) && (!lack_of_work_sent))
		{
			there_was_change = true;
			MPI_Isend(buffer, 1, MPI_INT, 1-my_PE_ID, FINISHED_TAG, MPI_COMM_WORLD, &dummy_request);
			working_PEs--;
			lack_of_work_sent = true;
		}
	}
	MPI_Barrier(MPI_COMM_WORLD);
	std::cout<<my_PE_ID<<": Finished normally"<<std::endl;
	MPI_Finalize();

	return 0;
}
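
One detail I am not sure matters: every MPI_Isend above parks its handle in dummy_request, and the requests are never tested, waited on or freed; buffer is also reused for the next message while sends may still be in flight. Below is a minimal sketch, an assumption on my part rather than a verified fix, of how the send requests could be tracked and completed (the helper names isend_tracked and retire_completed_sends are made up):

#include <cstddef>
#include <vector>
#include <mpi.h>

// Sketch only: keep the outstanding send requests instead of one dummy_request.
std::vector<MPI_Request> pending_sends;

// Start a nonblocking send and remember its handle.
void isend_tracked(void *buf, int count, MPI_Datatype type, int dest, int tag)
{
	MPI_Request req;
	MPI_Isend(buf, count, type, dest, tag, MPI_COMM_WORLD, &req);
	pending_sends.push_back(req);
}

// Call once per loop iteration: test each pending send, drop the finished ones.
void retire_completed_sends()
{
	for (std::size_t i = 0; i < pending_sends.size(); )
	{
		int done = 0;
		MPI_Test(&pending_sends[i], &done, MPI_STATUS_IGNORE);
		if (done)
			pending_sends.erase(pending_sends.begin() + i);
		else
			++i;
	}
}

Whether completing the requests changes the hang I have not verified; the question about the lost messages stands either way.

For completeness, the output of ompi_info on this machine: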
                 Package: Open MPI abuild@build26 Distribution
                Open MPI: 1.3.2
   Open MPI SVN revision: r21054
   Open MPI release date: Apr 21, 2009
                Open RTE: 1.3.2
   Open RTE SVN revision: r21054
   Open RTE release date: Apr 21, 2009
                    OPAL: 1.3.2
       OPAL SVN revision: r21054
       OPAL release date: Apr 21, 2009
            Ident string: 1.3.2
                  Prefix: /usr/lib64/mpi/gcc/openmpi
 Configured architecture: x86_64-suse-linux-gnu
          Configure host: build26
           Configured by: abuild
           Configured on: Tue May  5 16:03:55 UTC 2009
          Configure host: build26
                Built by: abuild
                Built on: Tue May  5 16:18:52 UTC 2009
              Built host: build26
              C bindings: yes
            C++ bindings: yes
      Fortran77 bindings: yes (all)
      Fortran90 bindings: yes
 Fortran90 bindings size: small
              C compiler: gcc
     C compiler absolute: /usr/bin/gcc
            C++ compiler: g++
   C++ compiler absolute: /usr/bin/g++
      Fortran77 compiler: gfortran
  Fortran77 compiler abs: /usr/bin/gfortran
      Fortran90 compiler: gfortran
  Fortran90 compiler abs: /usr/bin/gfortran
             C profiling: yes
           C++ profiling: yes
     Fortran77 profiling: yes
     Fortran90 profiling: yes
          C++ exceptions: no
          Thread support: posix (mpi: no, progress: no)
           Sparse Groups: no
  Internal debug support: no
     MPI parameter check: runtime
Memory profiling support: no
Memory debugging support: no
         libltdl support: yes
   Heterogeneous support: no
 mpirun default --prefix: no
         MPI I/O support: yes
       MPI_WTIME support: gettimeofday
Symbol visibility support: yes
   FT Checkpoint support: no  (checkpoint thread: no)
           MCA backtrace: execinfo (MCA v2.0, API v2.0, Component v1.3.2)
              MCA memory: ptmalloc2 (MCA v2.0, API v2.0, Component v1.3.2)
           MCA paffinity: linux (MCA v2.0, API v2.0, Component v1.3.2)
               MCA carto: auto_detect (MCA v2.0, API v2.0, Component v1.3.2)
               MCA carto: file (MCA v2.0, API v2.0, Component v1.3.2)
           MCA maffinity: first_use (MCA v2.0, API v2.0, Component v1.3.2)
               MCA timer: linux (MCA v2.0, API v2.0, Component v1.3.2)
         MCA installdirs: env (MCA v2.0, API v2.0, Component v1.3.2)
         MCA installdirs: config (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA dpm: orte (MCA v2.0, API v2.0, Component v1.3.2)
              MCA pubsub: orte (MCA v2.0, API v2.0, Component v1.3.2)
           MCA allocator: basic (MCA v2.0, API v2.0, Component v1.3.2)
           MCA allocator: bucket (MCA v2.0, API v2.0, Component v1.3.2)
                MCA coll: basic (MCA v2.0, API v2.0, Component v1.3.2)
                MCA coll: hierarch (MCA v2.0, API v2.0, Component v1.3.2)
                MCA coll: inter (MCA v2.0, API v2.0, Component v1.3.2)
                MCA coll: self (MCA v2.0, API v2.0, Component v1.3.2)
                MCA coll: sm (MCA v2.0, API v2.0, Component v1.3.2)
                MCA coll: sync (MCA v2.0, API v2.0, Component v1.3.2)
                MCA coll: tuned (MCA v2.0, API v2.0, Component v1.3.2)
                  MCA io: romio (MCA v2.0, API v2.0, Component v1.3.2)
               MCA mpool: fake (MCA v2.0, API v2.0, Component v1.3.2)
               MCA mpool: rdma (MCA v2.0, API v2.0, Component v1.3.2)
               MCA mpool: sm (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA pml: cm (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA pml: csum (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA pml: ob1 (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA pml: v (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA bml: r2 (MCA v2.0, API v2.0, Component v1.3.2)
              MCA rcache: vma (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA btl: self (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA btl: sm (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA btl: tcp (MCA v2.0, API v2.0, Component v1.3.2)
                MCA topo: unity (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA osc: pt2pt (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA osc: rdma (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA iof: hnp (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA iof: orted (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA iof: tool (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA oob: tcp (MCA v2.0, API v2.0, Component v1.3.2)
                MCA odls: default (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA ras: slurm (MCA v2.0, API v2.0, Component v1.3.2)
               MCA rmaps: rank_file (MCA v2.0, API v2.0, Component v1.3.2)
               MCA rmaps: round_robin (MCA v2.0, API v2.0, Component v1.3.2)
               MCA rmaps: seq (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA rml: oob (MCA v2.0, API v2.0, Component v1.3.2)
              MCA routed: binomial (MCA v2.0, API v2.0, Component v1.3.2)
              MCA routed: direct (MCA v2.0, API v2.0, Component v1.3.2)
              MCA routed: linear (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA plm: rsh (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA plm: slurm (MCA v2.0, API v2.0, Component v1.3.2)
               MCA filem: rsh (MCA v2.0, API v2.0, Component v1.3.2)
              MCA errmgr: default (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA ess: env (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA ess: hnp (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA ess: singleton (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA ess: slurm (MCA v2.0, API v2.0, Component v1.3.2)
                 MCA ess: tool (MCA v2.0, API v2.0, Component v1.3.2)
             MCA grpcomm: bad (MCA v2.0, API v2.0, Component v1.3.2)
             MCA grpcomm: basic (MCA v2.0, API v2.0, Component v1.3.2)
