Hi, I have a small matrix-multiplication program that computes wrong results in a heterogeneous environment mixing little-endian and big-endian architectures. Each process computes one row (block) of the result matrix.
Solaris 10 x86_64 and Linux x86_64: tyr matrix 162 mpiexec -np 4 -host sunpc0,sunpc1,linpc0,linpc1 mat_mult_1 Process 0 of 4 running on sunpc0 Process 1 of 4 running on sunpc1 Process 2 of 4 running on linpc0 Process 3 of 4 running on linpc1 ... (4,8)-result-matrix c = a * b : 448 427 406 385 364 343 322 301 1456 1399 1342 1285 1228 1171 1114 1057 2464 2371 2278 2185 2092 1999 1906 1813 3472 3343 3214 3085 2956 2827 2698 2569 Solaris Sparc: tyr matrix 167 mpiexec -np 4 -host tyr,rs0,rs1 mat_mult_1 Process 0 of 4 running on tyr.informatik.hs-fulda.de Process 3 of 4 running on tyr.informatik.hs-fulda.de Process 2 of 4 running on rs1.informatik.hs-fulda.de Process 1 of 4 running on rs0.informatik.hs-fulda.de ... (4,8)-result-matrix c = a * b : 448 427 406 385 364 343 322 301 1456 1399 1342 1285 1228 1171 1114 1057 2464 2371 2278 2185 2092 1999 1906 1813 3472 3343 3214 3085 2956 2827 2698 2569 Solaris Sparc and x86_64: Rows 1 and 3 are from sunpc0 (adding the option "-hetero" doesn't change anything) tyr matrix 168 mpiexec -np 4 -host tyr,sunpc0 mat_mult_1 Process 1 of 4 running on sunpc0 Process 3 of 4 running on sunpc0 Process 0 of 4 running on tyr.informatik.hs-fulda.de Process 2 of 4 running on tyr.informatik.hs-fulda.de ... (4,8)-result-matrix c = a * b : 448 427 406 385 364 343 322 301 48-3.01737e+304-3.1678e+296 -NaN 0-7.40627e+304-3.16839e+296 -NaN 2464 2371 2278 2185 2092 1999 1906 1813 48-3.01737e+304-3.18057e+296 -NaN2.122e-314-7.68057e+304-3.26998e+296 -NaN Solaris Sparc and Linux x86_64: Rows 1 and 3 are from linpc0 tyr matrix 169 mpiexec -np 4 -host tyr,linpc0 mat_mult_1 Process 0 of 4 running on tyr.informatik.hs-fulda.de Process 2 of 4 running on tyr.informatik.hs-fulda.de Process 1 of 4 running on linpc0 Process 3 of 4 running on linpc0 ... 
(4,8)-result-matrix c = a * b : 448 427 406 385 364 343 322 301 0 0 0 0 0 08.10602e-3124.27085e-319 2464 2371 2278 2185 2092 1999 1906 1813 6.66666e-3152.86948e-3161.73834e-3101.39066e-3092.122e-3141.39066e-3091.39066e-3 099.88131e-324 In the past the program worked in a heterogeneous environment. This is the main part of the program. ... double a[P][Q], b[Q][R], /* matrices to multiply */ c[P][R], /* matrix for result */ row_a[Q], /* one row of matrix "a" */ row_c[R]; /* one row of matrix "c" */ ... /* send matrix "b" to all processes */ MPI_Bcast (b, Q * R, MPI_DOUBLE, 0, MPI_COMM_WORLD); /* send row i of "a" to process i */ MPI_Scatter (a, Q, MPI_DOUBLE, row_a, Q, MPI_DOUBLE, 0, MPI_COMM_WORLD); for (j = 0; j < R; ++j) /* compute i-th row of "c" */ { row_c[j] = 0.0; for (k = 0; k < Q; ++k) { row_c[j] = row_c[j] + row_a[k] * b[k][j]; } } /* receive row i of "c" from process i */ MPI_Gather (row_c, R, MPI_DOUBLE, c, R, MPI_DOUBLE, 0, MPI_COMM_WORLD); ... Does anybody know why my program doesn't work? It blocks with openmpi-1.7a1r27379 and openmpi-1.9a1r27380 (I had to add one more machine because my local machine will not be used in these versions) and it works as long as the machines have the same endian. tyr matrix 110 mpiexec -np 4 -host tyr,linpc0,rs0 mat_mult_1 Process 0 of 4 running on linpc0 Process 1 of 4 running on linpc0 Process 3 of 4 running on rs0.informatik.hs-fulda.de Process 2 of 4 running on rs0.informatik.hs-fulda.de ... (6,8)-matrix b: 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 ^CKilled by signal 2. Killed by signal 2. Thank you very much for any help in advance. Kind regards Siegmar
#include <stdio.h> #include <stdlib.h> #include "mpi.h" #define P 4 /* # of rows */ #define Q 6 /* # of columns / rows */ #define R 8 /* # of columns */ static void print_matrix (int p, int q, double **mat); int main (int argc, char *argv[]) { int ntasks, /* number of parallel tasks */ mytid, /* my task id */ namelen, /* length of processor name */ i, j, k, /* loop variables */ tmp; /* temporary value */ double a[P][Q], b[Q][R], /* matrices to multiply */ c[P][R], /* matrix for result */ row_a[Q], /* one row of matrix "a" */ row_c[R]; /* one row of matrix "c" */ char processor_name[MPI_MAX_PROCESSOR_NAME]; MPI_Init (&argc, &argv); MPI_Comm_rank (MPI_COMM_WORLD, &mytid); MPI_Comm_size (MPI_COMM_WORLD, &ntasks); MPI_Get_processor_name (processor_name, &namelen); fprintf (stdout, "Process %d of %d running on %s\n", mytid, ntasks, processor_name); fflush (stdout); MPI_Barrier (MPI_COMM_WORLD); /* wait for all other processes */ if ((ntasks != P) && (mytid == 0)) { fprintf (stderr, "\n\nI need %d processes.\n" "Usage:\n" " mpiexec -np %d %s.\n\n", P, P, argv[0]); } if (ntasks != P) { MPI_Finalize (); exit (EXIT_FAILURE); } if (mytid == 0) { tmp = 1; for (i = 0; i < P; ++i) /* initialize matrix a */ { for (j = 0; j < Q; ++j) { a[i][j] = tmp++; } } printf ("\n\n(%d,%d)-matrix a:\n\n", P, Q); print_matrix (P, Q, (double **) a); tmp = Q * R; for (i = 0; i < Q; ++i) /* initialize matrix b */ { for (j = 0; j < R; ++j) { b[i][j] = tmp--; } } printf ("(%d,%d)-matrix b:\n\n", Q, R); print_matrix (Q, R, (double **) b); } /* send matrix "b" to all processes */ MPI_Bcast (b, Q * R, MPI_DOUBLE, 0, MPI_COMM_WORLD); /* send row i of "a" to process i */ MPI_Scatter (a, Q, MPI_DOUBLE, row_a, Q, MPI_DOUBLE, 0, MPI_COMM_WORLD); for (j = 0; j < R; ++j) /* compute i-th row of "c" */ { row_c[j] = 0.0; for (k = 0; k < Q; ++k) { row_c[j] = row_c[j] + row_a[k] * b[k][j]; } } /* receive row i of "c" from process i */ MPI_Gather (row_c, R, MPI_DOUBLE, c, R, MPI_DOUBLE, 0, MPI_COMM_WORLD); if 
(mytid == 0) { printf ("(%d,%d)-result-matrix c = a * b :\n\n", P, R); print_matrix (P, R, (double **) c); } MPI_Finalize (); return EXIT_SUCCESS; } void print_matrix (int p, int q, double **mat) { int i, j; /* loop variables */ for (i = 0; i < p; ++i) { for (j = 0; j < q; ++j) { printf ("%6g", *((double *) mat + i * q + j)); } printf ("\n"); } printf ("\n"); }