In the context of FRMR (ib_frmr.c): Memory regions make it onto the "clean_list" via "rds_ib_flush_mr_pool", after the memory region has been posted for invalidation via "rds_ib_post_inv".
At that point in time, "fr_state" may still be in state "FRMR_IS_INUSE", since the only place where "fr_state" transitions to "FRMR_IS_FREE" is in "rds_ib_mr_cqe_handler", which is triggered by a tasklet. So if we notice that "fr_state != FRMR_IS_FREE" (see below), we wait up to 10 msec for "fr_inv_done" to trigger. Then we check again, and only put the memory region onto the drop_list (via "rds_ib_free_frmr") if the state is still not "FRMR_IS_FREE". This avoids the problem of memory regions bouncing between "clean_list" and "drop_list" before they even have a chance to be properly invalidated. Signed-off-by: Gerd Rausch <gerd.rau...@oracle.com> --- net/rds/ib_frmr.c | 32 +++++++++++++++++++++++++++++++- net/rds/ib_mr.h | 1 + 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 32ae26ed58a0..9f8aa310c27a 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -75,6 +75,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, pool->max_items_soft = pool->max_items; frmr->fr_state = FRMR_IS_FREE; + init_waitqueue_head(&frmr->fr_inv_done); return ibmr; out_no_cigar: @@ -285,6 +286,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) if (frmr->fr_inv) { frmr->fr_state = FRMR_IS_FREE; frmr->fr_inv = false; + wake_up(&frmr->fr_inv_done); } atomic_inc(&ic->i_fastreg_wrs); @@ -345,8 +347,36 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, } do { - if (ibmr) + if (ibmr) { + /* Memory regions make it onto the "clean_list" via + * "rds_ib_flush_mr_pool", after the memory region has + * been posted for invalidation via "rds_ib_post_inv". + * + * At that point in time, "fr_state" may still be + * in state "FRMR_IS_INUSE", since the only place where + * "fr_state" transitions to "FRMR_IS_FREE" is in + * "rds_ib_mr_cqe_handler", which is + * triggered by a tasklet.
+ * + * So in case we notice that + * "fr_state != FRMR_IS_FREE" (see below), + * we wait for + * "fr_inv_done" to trigger with a maximum of 10msec. + * Then we check again, and only put the memory region + * onto the drop_list (via "rds_ib_free_frmr") + * in case the situation remains unchanged. + * + * This avoids the problem of memory-regions bouncing + * between "clean_list" and "drop_list" before they + * even have a chance to be properly invalidated. + */ + frmr = &ibmr->u.frmr; + wait_event_timeout(frmr->fr_inv_done, + frmr->fr_state == FRMR_IS_FREE, + msecs_to_jiffies(10)); + if (frmr->fr_state == FRMR_IS_FREE) + break; rds_ib_free_frmr(ibmr, true); + } ibmr = rds_ib_alloc_frmr(rds_ibdev, nents); if (IS_ERR(ibmr)) return ibmr; diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 5da12c248431..42daccb7b5eb 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -57,6 +57,7 @@ struct rds_ib_frmr { struct ib_mr *mr; enum rds_ib_fr_state fr_state; bool fr_inv; + wait_queue_head_t fr_inv_done; struct ib_send_wr fr_wr; unsigned int dma_npages; unsigned int sg_byte_len; -- 2.18.0