Hi,

I have a small matrix multiplication program that computes wrong
results in a heterogeneous environment that mixes little-endian
and big-endian architectures. Every process computes one row (block)
of the result matrix.


Solaris 10 x86_64 and Linux x86_64:

tyr matrix 162 mpiexec -np 4 -host sunpc0,sunpc1,linpc0,linpc1 mat_mult_1
Process 0 of 4 running on sunpc0
Process 1 of 4 running on sunpc1
Process 2 of 4 running on linpc0
Process 3 of 4 running on linpc1
...
(4,8)-result-matrix c = a * b :

   448   427   406   385   364   343   322   301
  1456  1399  1342  1285  1228  1171  1114  1057
  2464  2371  2278  2185  2092  1999  1906  1813
  3472  3343  3214  3085  2956  2827  2698  2569


Solaris Sparc:

tyr matrix 167 mpiexec -np 4 -host tyr,rs0,rs1 mat_mult_1
Process 0 of 4 running on tyr.informatik.hs-fulda.de
Process 3 of 4 running on tyr.informatik.hs-fulda.de
Process 2 of 4 running on rs1.informatik.hs-fulda.de
Process 1 of 4 running on rs0.informatik.hs-fulda.de
...
(4,8)-result-matrix c = a * b :

   448   427   406   385   364   343   322   301
  1456  1399  1342  1285  1228  1171  1114  1057
  2464  2371  2278  2185  2092  1999  1906  1813
  3472  3343  3214  3085  2956  2827  2698  2569


Solaris Sparc and x86_64: Rows 1 and 3 (counting from 0) are wrong; they
come from sunpc0 (adding the option "-hetero" doesn't change anything)

tyr matrix 168 mpiexec -np 4 -host tyr,sunpc0 mat_mult_1
Process 1 of 4 running on sunpc0
Process 3 of 4 running on sunpc0
Process 0 of 4 running on tyr.informatik.hs-fulda.de
Process 2 of 4 running on tyr.informatik.hs-fulda.de
...
(4,8)-result-matrix c = a * b :

   448   427   406   385   364   343   322   301
    48-3.01737e+304-3.1678e+296  -NaN     0-7.40627e+304-3.16839e+296  -NaN
  2464  2371  2278  2185  2092  1999  1906  1813
    48-3.01737e+304-3.18057e+296  -NaN2.122e-314-7.68057e+304-3.26998e+296  -NaN


Solaris Sparc and Linux x86_64: Rows 1 and 3 (counting from 0) are wrong; they come from linpc0

tyr matrix 169 mpiexec -np 4 -host tyr,linpc0 mat_mult_1
Process 0 of 4 running on tyr.informatik.hs-fulda.de
Process 2 of 4 running on tyr.informatik.hs-fulda.de
Process 1 of 4 running on linpc0
Process 3 of 4 running on linpc0
...
(4,8)-result-matrix c = a * b :

   448   427   406   385   364   343   322   301
     0     0     0     0     0     08.10602e-3124.27085e-319
  2464  2371  2278  2185  2092  1999  1906  1813
6.66666e-3152.86948e-3161.73834e-3101.39066e-3092.122e-3141.39066e-3091.39066e-3099.88131e-324

In the past the program worked in a heterogeneous environment. This
is the main part of the program.

...
  double a[P][Q], b[Q][R],              /* matrices to multiply         */
         c[P][R],                       /* matrix for result            */
         row_a[Q],                      /* one row of matrix "a"        */
         row_c[R];                      /* one row of matrix "c"        */
...
  /* send matrix "b" to all processes                                   */
  MPI_Bcast (b, Q * R, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  /* send row i of "a" to process i                                     */
  MPI_Scatter (a, Q, MPI_DOUBLE, row_a, Q, MPI_DOUBLE, 0,
               MPI_COMM_WORLD);
  for (j = 0; j < R; ++j)               /* compute i-th row of "c"      */
  {
    row_c[j] = 0.0;
    for (k = 0; k < Q; ++k)
    {
      row_c[j] = row_c[j] + row_a[k] * b[k][j];
    }
  }
  /* receive row i of "c" from process i                                */
  MPI_Gather (row_c, R, MPI_DOUBLE, c, R, MPI_DOUBLE, 0,
              MPI_COMM_WORLD);
...


Does anybody know why my program doesn't work? It hangs with
openmpi-1.7a1r27379 and openmpi-1.9a1r27380 (I had to add one
more machine because these versions do not use my local machine).
The program works as long as all machines have the same
endianness.

tyr matrix 110 mpiexec -np 4 -host tyr,linpc0,rs0 mat_mult_1
Process 0 of 4 running on linpc0
Process 1 of 4 running on linpc0
Process 3 of 4 running on rs0.informatik.hs-fulda.de
Process 2 of 4 running on rs0.informatik.hs-fulda.de
...
(6,8)-matrix b:

    48    47    46    45    44    43    42    41
    40    39    38    37    36    35    34    33
    32    31    30    29    28    27    26    25
    24    23    22    21    20    19    18    17
    16    15    14    13    12    11    10     9
     8     7     6     5     4     3     2     1

^CKilled by signal 2.
Killed by signal 2.


Thank you very much in advance for any help.


Kind regards

Siegmar
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

#define	P		4		/* # of rows			*/
#define Q		6		/* # of columns / rows		*/
#define R		8		/* # of columns			*/

static void print_matrix (int p, int q, double **mat);

/*
 * Multiply the (P,Q)-matrix "a" by the (Q,R)-matrix "b" using exactly P
 * MPI processes; every process computes one row of the (P,R)-result "c".
 *
 * Process 0 fills and prints "a" and "b", broadcasts "b", scatters the
 * rows of "a", gathers the computed rows of "c" and prints the result.
 * If the program was not started with P processes, process 0 prints a
 * usage message and every process terminates with EXIT_FAILURE.
 */
int main (int argc, char *argv[])
{
  const int root = 0;                   /* rank that owns a, b and c    */
  int       num_procs;                  /* size of MPI_COMM_WORLD       */
  int       rank;                       /* rank of this process         */
  int       name_len;                   /* length of host_name          */
  int       row, col, k;                /* loop indices                 */
  double    a[P][Q];                    /* left operand  (root only)    */
  double    b[Q][R];                    /* right operand (everywhere)   */
  double    c[P][R];                    /* product       (root only)    */
  double    row_a[Q];                   /* this process's row of "a"    */
  double    row_c[R];                   /* this process's row of "c"    */
  char      host_name[MPI_MAX_PROCESSOR_NAME];

  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &num_procs);
  MPI_Get_processor_name (host_name, &name_len);
  printf ("Process %d of %d running on %s\n", rank, num_procs, host_name);
  fflush (stdout);
  /* let every process announce itself before the matrices are printed  */
  MPI_Barrier (MPI_COMM_WORLD);

  /* one row per process: the process count must match the row count    */
  if (num_procs != P) {
    if (rank == root) {
      fprintf (stderr, "\n\nI need %d processes.\n"
               "Usage:\n"
               "  mpiexec -np %d %s.\n\n",
               P, P, argv[0]);
    }
    MPI_Finalize ();
    exit (EXIT_FAILURE);
  }

  if (rank == root) {
    /* "a" holds 1, 2, ..., P*Q in row-major order                      */
    for (row = 0; row < P; ++row) {
      for (col = 0; col < Q; ++col) {
        a[row][col] = row * Q + col + 1;
      }
    }
    printf ("\n\n(%d,%d)-matrix a:\n\n", P, Q);
    print_matrix (P, Q, (double **) a);

    /* "b" holds Q*R, Q*R-1, ..., 1 in row-major order                  */
    for (row = 0; row < Q; ++row) {
      for (col = 0; col < R; ++col) {
        b[row][col] = Q * R - (row * R + col);
      }
    }
    printf ("(%d,%d)-matrix b:\n\n", Q, R);
    print_matrix (Q, R, (double **) b);
  }

  /* every process needs all of "b" but only its own row of "a"         */
  MPI_Bcast (b, Q * R, MPI_DOUBLE, root, MPI_COMM_WORLD);
  MPI_Scatter (a, Q, MPI_DOUBLE, row_a, Q, MPI_DOUBLE, root,
               MPI_COMM_WORLD);

  /* row_c = row_a * b                                                  */
  for (col = 0; col < R; ++col) {
    row_c[col] = 0.0;
    for (k = 0; k < Q; ++k) {
      row_c[col] += row_a[k] * b[k][col];
    }
  }

  /* collect the rows of "c" on the root, ordered by rank               */
  MPI_Gather (row_c, R, MPI_DOUBLE, c, R, MPI_DOUBLE, root,
              MPI_COMM_WORLD);
  if (rank == root) {
    printf ("(%d,%d)-result-matrix c = a * b :\n\n", P, R);
    print_matrix (P, R, (double **) c);
  }

  MPI_Finalize ();
  return EXIT_SUCCESS;
}


/*
 * Print a (p,q)-matrix to stdout, one row per line, followed by an
 * empty line. Every element is written with the format "%6g".
 *
 * Although "mat" is declared as "double **", it is not dereferenced as
 * an array of row pointers: it is cast to "double *" and read as p * q
 * doubles stored contiguously in row-major order. Callers therefore
 * pass a two-dimensional array cast to "double **".
 */
void print_matrix (int p, int q, double **mat)
{
  const double *cell = (const double *) mat;   /* next element to print */
  int row, col;

  for (row = 0; row < p; ++row) {
    for (col = 0; col < q; ++col) {
      printf ("%6g", *cell++);
    }
    putchar ('\n');
  }
  putchar ('\n');
}

Reply via email to