In the context of FRMR (ib_frmr.c):

Memory regions make it onto the "clean_list" via "rds_ib_flush_mr_pool",
after the memory region has been posted for invalidation via
"rds_ib_post_inv".

At that point in time, "fr_state" may still be in state "FRMR_IS_INUSE",
since the only place where "fr_state" transitions to "FRMR_IS_FREE"
is in "rds_ib_mr_cqe_handler", which is triggered by a tasklet.

So in case we notice that "fr_state != FRMR_IS_FREE" (see below),
we wait up to 10 msec for "fr_inv_done" to be signalled.
Then we check again, and only put the memory region onto the "drop_list"
(via "rds_ib_free_frmr") if it is still not in state "FRMR_IS_FREE".

This avoids the problem of memory-regions bouncing between "clean_list"
and "drop_list" before they even have a chance to be properly invalidated.

Signed-off-by: Gerd Rausch <gerd.rau...@oracle.com>
---
 net/rds/ib_frmr.c | 32 +++++++++++++++++++++++++++++++-
 net/rds/ib_mr.h   |  1 +
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 32ae26ed58a0..9f8aa310c27a 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -75,6 +75,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
                pool->max_items_soft = pool->max_items;
 
        frmr->fr_state = FRMR_IS_FREE;
+       init_waitqueue_head(&frmr->fr_inv_done);
        return ibmr;
 
 out_no_cigar:
@@ -285,6 +286,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
        if (frmr->fr_inv) {
                frmr->fr_state = FRMR_IS_FREE;
                frmr->fr_inv = false;
+               wake_up(&frmr->fr_inv_done);
        }
 
        atomic_inc(&ic->i_fastreg_wrs);
@@ -345,8 +347,36 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
        }
 
        do {
-               if (ibmr)
+               if (ibmr) {
+                       /* Memory regions make it onto the "clean_list" via
+                        * "rds_ib_flush_mr_pool", after the memory region has
+                        * been posted for invalidation via "rds_ib_post_inv".
+                        *
+                        * At that point in time, "fr_state" may still be
+                        * in state "FRMR_IS_INUSE", since the only place where
+                        * "fr_state" transitions to "FRMR_IS_FREE" is in
+                        * "rds_ib_mr_cqe_handler", which is
+                        * triggered by a tasklet.
+                        *
+                        * So in case we notice that
+                        * "fr_state != FRMR_IS_FREE" (see below), we wait for
+                        * "fr_inv_done" to trigger with a maximum of 10msec.
+                        * Then we check again, and only put the memory region
+                        * onto the drop_list (via "rds_ib_free_frmr")
+                        * in case the situation remains unchanged.
+                        *
+                        * This avoids the problem of memory-regions bouncing
+                        * between "clean_list" and "drop_list" before they
+                        * even have a chance to be properly invalidated.
+                        */
+                       frmr = &ibmr->u.frmr;
+                       wait_event_timeout(frmr->fr_inv_done,
+                                          frmr->fr_state == FRMR_IS_FREE,
+                                          msecs_to_jiffies(10));
+                       if (frmr->fr_state == FRMR_IS_FREE)
+                               break;
                        rds_ib_free_frmr(ibmr, true);
+               }
                ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
                if (IS_ERR(ibmr))
                        return ibmr;
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 5da12c248431..42daccb7b5eb 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -57,6 +57,7 @@ struct rds_ib_frmr {
        struct ib_mr            *mr;
        enum rds_ib_fr_state    fr_state;
        bool                    fr_inv;
+       wait_queue_head_t       fr_inv_done;
        struct ib_send_wr       fr_wr;
        unsigned int            dma_npages;
        unsigned int            sg_byte_len;
-- 
2.18.0


Reply via email to