All,

We are trying to combine MPI_Put and MPI_Win_flush on locked (using MPI_Win_lock_all) dynamic windows to mimic a blocking put. The application is (potentially) multi-threaded, so we rely on MPI_THREAD_MULTIPLE support being available.
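
To make the intended pattern concrete, here is a minimal sketch (the names win, target, and target_disp are illustrative; the attached reproducer does the same thing through put_blocking()):

```c
MPI_Win_lock_all(0, win);      /* open one passive-target epoch on all ranks */

/* ... potentially issued concurrently from multiple threads ... */
uint64_t value = 42;
MPI_Put(&value, 1, MPI_UINT64_T, target, target_disp, 1, MPI_UINT64_T, win);
MPI_Win_flush(target, win);    /* wait until this put has completed at origin and target */

MPI_Win_unlock_all(win);       /* close the epoch at shutdown */
```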

When I use this combination (MPI_Put + MPI_Win_flush) in our application, I occasionally see threads hang in MPI_Win_flush, presumably waiting for progress to happen. However, when I try to reproduce this in a small example (attached; the original application has multiple layers of abstraction), I instead see fatal errors in MPI_Win_flush when using more than one thread:

```
[beryl:18037] *** An error occurred in MPI_Win_flush
[beryl:18037] *** reported by process [4020043777,2]
[beryl:18037] *** on win pt2pt window 3
[beryl:18037] *** MPI_ERR_RMA_SYNC: error executing rma sync
[beryl:18037] *** MPI_ERRORS_ARE_FATAL (processes in this win will now abort,
[beryl:18037] ***    and potentially your MPI job)
```

I could only trigger this on dynamic windows, and only with multiple threads running concurrently.

So: is this a valid MPI program (apart from the missing clean-up at the end ;))? It seems to run fine with MPICH, but maybe they are just more tolerant of some programming errors...

If it is a valid MPI program, I assume there is a race condition in MPI_Win_flush that leads to the fatal error (or to the hang I observe otherwise)?

I tested this with OpenMPI 1.10.5 on a single-node Linux Mint 18.1 system with the stock 4.8.0-36 kernel (a.k.a. my laptop). OpenMPI and the test were both compiled with GCC 5.3.0. I could not run the test with OpenMPI 2.0.2 due to the fatal error in MPI_Win_create (which also applies to MPI_Win_create_dynamic, see my other thread; I am not sure whether the two issues are related).

Please let me know whether this is a valid use case and whether I can provide any additional information.

Many thanks in advance!

Cheers
Joseph

--
Dipl.-Inf. Joseph Schuchart
High Performance Computing Center Stuttgart (HLRS)
Nobelstr. 19
D-70569 Stuttgart

Tel.: +49(0)711-68565890
Fax: +49(0)711-6856832
E-Mail: schuch...@hlrs.de

```c
#include <mpi.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

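/*
 * Create a dynamic window, attach a freshly allocated buffer of elsize*count
 * bytes to it, gather the base addresses of all processes into disp_set, and
 * open a passive-target access epoch with MPI_Win_lock_all.
 */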
static void
allocate_dynamic(size_t elsize, size_t count, MPI_Win *win, MPI_Aint *disp_set, char **b)
{
    char *base;
    MPI_Aint disp;
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    
    if (MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, win) != MPI_SUCCESS) {
        printf("Failed to create dynamic window!\n");
        exit(1);
    }

    if (MPI_Alloc_mem(elsize*count, MPI_INFO_NULL, &base) != MPI_SUCCESS) {
        printf("Failed to allocate memory!\n");
        exit(1);
    }


    if (MPI_Win_attach(*win, base, elsize*count) != MPI_SUCCESS) {
        printf("Failed to attach memory to dynamic window!\n");
        exit(1);
    }


    MPI_Get_address(base, &disp);
    printf("Offset at process %i: %p (%lu)\n", rank, base, disp);
    MPI_Allgather(&disp, 1, MPI_AINT, disp_set, 1, MPI_AINT, MPI_COMM_WORLD);

    MPI_Win_lock_all(0, *win);

    *b = base;
}

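/*
 * Emulate a blocking put: issue the MPI_Put and immediately flush the target
 * so that the transfer is complete (at origin and target) when we return.
 */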
static void
put_blocking(uint64_t value, int target, MPI_Aint offset, MPI_Win win)
{
    MPI_Put(&value, 1, MPI_UINT64_T, target, offset, 1, MPI_UINT64_T, win);
    MPI_Win_flush(target, win);
}

int main(int argc, char **argv)
{
    int provided;
    int rank, size;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    printf("MPI_THREAD_MULTIPLE supported: %s\n", (provided == MPI_THREAD_MULTIPLE) ? "yes" : "no" );

    MPI_Win win;
    char *base;
    // every thread writes so many values to our neighbor
    // the offset is controlled by the thread ID
    int elem_per_thread = 10;
    // number of threads the parallel regions below will use
    int num_threads = omp_get_max_threads();

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // one base address per process, filled in by allocate_dynamic()
    MPI_Aint *disp_set = calloc(size, sizeof(MPI_Aint));

    allocate_dynamic(sizeof(uint64_t), elem_per_thread*num_threads, &win, disp_set, &base);

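    // each rank targets the window of its right neighbor (wrapping around)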
    int target = (rank + 1) % size;
#pragma omp parallel 
{
    int thread_num = omp_get_thread_num();
    // target displacement: remote base address plus this thread's block
    MPI_Aint offset = disp_set[target] + (elem_per_thread * thread_num)*sizeof(uint64_t);
    for (int i = 0; i < elem_per_thread; i++) {
        printf("[%i:%i] win[%lu => %lu + (%i * %i)*%zu] <= %i\n", rank, thread_num,
            (unsigned long)(offset + sizeof(uint64_t) * i), (unsigned long)disp_set[target],
            elem_per_thread, thread_num, sizeof(uint64_t), thread_num);
        put_blocking(thread_num, target, offset + sizeof(uint64_t) * i, win);
    }
    }
}

    MPI_Barrier(MPI_COMM_WORLD);

// check for the result locally (not very sophisticated)
#pragma omp parallel
{
    int thread_num = omp_get_thread_num();
    assert(((uint64_t*)base)[(elem_per_thread * thread_num)] == thread_num);
}
    MPI_Win_unlock_all(win);

//    MPI_Win_free(&win);
    
    free(disp_set);

    MPI_Finalize();

    return 0;
}
```
