All,

I came across what I consider another issue regarding progress in Open MPI: consider one process (P1) polling locally on a regular window (W1) for a local value to change (using MPI_Win_lock+MPI_Get+MPI_Win_unlock), while a second process (P2) tries to read from a memory location in a dynamic window (W2) on process P1 (using MPI_Rget+MPI_Wait; other combinations are affected as well). P2 will later update the memory location P1 is waiting on. However, the read on the dynamic window stalls because the (local) read on W1 on P1 does not trigger progress on the dynamic window W2, so the application deadlocks.

It is my understanding that process P1 should guarantee progress on any communication it is involved in, regardless of the window or window type, and thus the communication should succeed. Is this assumption correct? Or is P1 required to access W2 as well to ensure progress? I can trigger progress on W2 on P1 by adding a call to MPI_Iprobe, but that seems like a hack to me. Also, if both W1 and W2 are regular (allocated) windows, the communication succeeds.
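
For reference, the workaround I have in mind is simply adding something like the following at the top of P1's polling loop (it is included, commented out, in the attached reproducer):

  int flag;
  MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);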

I am attaching a small reproducer, tested with Open MPI release 3.0.0 on a single GNU/Linux node (Linux Mint 18.2, gcc 5.4.1, Linux 4.10.0-38-generic).

Many thanks in advance!

Joseph
--
Dipl.-Inf. Joseph Schuchart
High Performance Computing Center Stuttgart (HLRS)
Nobelstr. 19
D-70569 Stuttgart

Tel.: +49(0)711-68565890
Fax: +49(0)711-6856832
E-Mail: schuch...@hlrs.de
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>


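/*
 * Create a dynamic window on MPI_COMM_WORLD, attach a locally allocated
 * buffer of count elements of elsize bytes each, gather the buffer
 * addresses (displacements) of all ranks into disp_set, and start a
 * lock_all access epoch on the window.
 */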
static void
allocate_dynamic(size_t elsize, size_t count, MPI_Win *win, MPI_Aint *disp_set, char **b)
{
    char *base;
    MPI_Aint disp;
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    
    if (MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, win) != MPI_SUCCESS) {
        printf("Failed to create dynamic window!\n");
        exit(1);
    }

    if (MPI_Alloc_mem(elsize*count, MPI_INFO_NULL, &base) != MPI_SUCCESS) {
        printf("Failed to allocate memory!\n");
        exit(1);
    }


    if (MPI_Win_attach(*win, base, elsize*count) != MPI_SUCCESS) {
        printf("Failed to attach memory to dynamic window!\n");
        exit(1);
    }


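    /* Exchange the absolute addresses of the attached buffers so that every
       rank can address the others' memory in the dynamic window. */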
    MPI_Get_address(base, &disp);
    printf("Offset at process %i: %p (%lu)\n", rank, base, disp);
    MPI_Allgather(&disp, 1, MPI_AINT, disp_set, 1, MPI_AINT, MPI_COMM_WORLD);

    MPI_Win_lock_all(0, *win);

    *b = base;
}

int main(int argc, char **argv) 
{
  int *baseptr1;
  int *baseptr2;
  MPI_Win win1, win2;
  int rank, size;

  MPI_Init(&argc, &argv);

  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  MPI_Win_allocate(sizeof(int), 1, MPI_INFO_NULL, MPI_COMM_WORLD, &baseptr1, &win1);
  MPI_Aint *disp_set = malloc(sizeof(MPI_Aint)*size);
  allocate_dynamic(sizeof(int), 1, &win2, disp_set, &baseptr2);

  *baseptr1 = rank;
  *baseptr2 = rank;
  MPI_Barrier(MPI_COMM_WORLD); 

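  /* Rank 0 polls on its own element in the regular window win1 until
     rank 1 overwrites it. */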
  if (rank == 0) {
    int local_val;
    do {
      // Uncomment the following MPI_Iprobe to trigger progress on win2 and
      // avoid stalling the MPI_Rget on rank 1:
      // int flag;
      // MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win1);
      MPI_Get(&local_val, 1, MPI_INT, rank, 0, 1, MPI_INT, win1);
      MPI_Win_flush(rank, win1);
      if (local_val != rank) { printf("Done!\n"); }
      MPI_Win_unlock(rank, win1);
    } while (local_val == rank);
  } else if (rank == 1) {
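    /* Rank 1 reads rank 0's element through the dynamic window win2 (this is
       the MPI_Rget that stalls) and then updates rank 0's element in win1 to
       end the polling loop above. */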
    MPI_Request req;
    int val;
    MPI_Rget(&val, 1, MPI_INT, 0, disp_set[0], 1, MPI_INT, win2, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win1);
    MPI_Put(&rank, 1, MPI_INT, 0, 0, 1, MPI_INT, win1);
    MPI_Win_unlock(0, win1);
  }

  MPI_Win_free(&win1);
  MPI_Win_free(&win2);

  MPI_Finalize();

  return 0;
}
