The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will 
appear at g...@bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.14
------>
commit 4b8249e2da1a7d28882bd70844fc4d64db952f08
Author: Alexander Atanasov <alexander.atana...@virtuozzo.com>
Date:   Tue Feb 11 16:25:57 2025 +0200

    dm-ploop: fix and rework md updates
    
    With the locking reduced it opened windows for races between
    running updates, pending and new updates. Current logic to
    deal with them is not correct.
    
    Current flags are:
    MD_DIRTY - means md page is dirty and is waiting for writeback
    MD_WRITEBACK - write back is in progress
    But we drop the lock after we make page dirty so it races with
    writeback.
    
    To fix this introduce a new flag:
    MD_UPDATING - which means we are updating the pending writeback
    or we are creating a new one. Remove the check if piwb is present
    on the page and clear piwb early on submit, check was using a type
    alloc vs discard but in case of concurrent alloc and discard the piwb
    is overwritten by the second operation. Using a flag solves this
    and makes it more clear in what state page is.
    
    Nice side effect of closing that race is that we have a minimalistic
    delay for writeback updates. This happens when we try to submit a
    pending writeback and we see the MD_UPDATING flags, update is not
    submitted but it is retried on next cycle. So if we have a lot of
    update to the same page in a very short periot they are accumulated
    as one.
    
    On error only clear MD_DIRTY if it is set by us before dropping
    the lock and when breaking the prepared bat update only clear
    md->piwb  if it is the same as the piwb allocated in the function.
    
    Add some clarification error messages around WARN_ONs.
    
    While at it fix:
     - mapping bat_bage two times in same function, do it only once
     - missed unmapping on error path
     - update proper bat levels in delayed update case
     - locking around prepare_bat_update from ploop_prepare_reloc_index_wb
    
    https://virtuozzo.atlassian.net/browse/VSTOR-98807
    Signed-off-by: Alexander Atanasov <alexander.atana...@virtuozzo.com>
    
    Feature: vStorage
---
 drivers/md/dm-ploop-cmd.c |   4 +-
 drivers/md/dm-ploop-map.c | 153 +++++++++++++++++++++++++++++++++-------------
 drivers/md/dm-ploop.h     |   8 ++-
 3 files changed, 117 insertions(+), 48 deletions(-)

diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
index e032af52b64e..daec9827b42c 100644
--- a/drivers/md/dm-ploop-cmd.c
+++ b/drivers/md/dm-ploop-cmd.c
@@ -315,10 +315,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
                goto out;
        }
 
-       spin_lock_irq(&ploop->bat_lock);
        ret = ploop_prepare_reloc_index_wb(ploop, &md, clu, &new_dst,
                                           ploop_top_delta(ploop)->file);
-       spin_unlock_irq(&ploop->bat_lock);
        if (ret < 0) {
                PL_ERR("reloc: can't prepare it: %d", ret);
                goto out;
@@ -329,7 +327,7 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
        ret = ploop_write_cluster_sync(ploop, pio, new_dst);
        if (ret) {
                PL_ERR("reloc: failed write: %d", ret);
-               ploop_break_bat_update(ploop, md);
+               ploop_break_bat_update(ploop, md, piwb);
                goto out;
        }
 
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index f6fc453f7464..0db3a0ff5bb3 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -421,7 +421,6 @@ void ploop_dispatch_pios(struct ploop *ploop, struct pio 
*pio,
 static bool ploop_delay_if_md_busy(struct ploop *ploop, struct md_page *md,
                                   enum piwb_type type, struct pio *pio)
 {
-       struct ploop_index_wb *piwb;
        unsigned long flags;
        bool busy = false;
 
@@ -429,8 +428,7 @@ static bool ploop_delay_if_md_busy(struct ploop *ploop, 
struct md_page *md,
 
        /* lock protects piwb */
        spin_lock_irqsave(&md->md_lock, flags); /* read */
-       piwb = md->piwb;
-       if (piwb && (piwb->type != type || test_bit(MD_WRITEBACK, 
&md->status))) {
+       if (test_bit(MD_WRITEBACK, &md->status) || test_bit(MD_UPDATING, 
&md->status)) {
                llist_add((struct llist_node *)(&pio->list), &md->wait_llist);
                busy = true;
        }
@@ -616,6 +614,11 @@ static void ploop_unlink_completed_pio(struct ploop 
*ploop, struct pio *pio)
                ploop_dispatch_pios(ploop, NULL, &pio_list);
 }
 
+static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md)
+{
+       llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
+}
+
 static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
 {
        bool new = false;
@@ -628,10 +631,8 @@ static bool ploop_md_make_dirty(struct ploop *ploop, 
struct md_page *md)
        lockdep_assert_held(&ploop->bat_lock);
 
        WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
-       if (!test_and_set_bit(MD_DIRTY, &md->status)) {
-               llist_add((struct llist_node *)&md->wb_link, 
&ploop->wb_batch_llist);
+       if (!test_and_set_bit(MD_DIRTY, &md->status))
                new = true;
-       }
        md->dirty_timeout = jiffies + ploop->md_submit_delay_ms*HZ/1000;
 
        return new;
@@ -889,9 +890,6 @@ static void ploop_advance_local_after_bat_wb(struct ploop 
*ploop,
 #endif
 
        WARN_ON_ONCE(!test_bit(MD_WRITEBACK, &md->status));
-       /* protect piwb */
-       clear_bit(MD_WRITEBACK, &md->status);
-       md->piwb = NULL;
        spin_unlock(&md->md_lock);
        spin_unlock_irqrestore(&ploop->bat_lock, flags);
        kunmap_local(dst_clu);
@@ -907,6 +905,11 @@ static void ploop_advance_local_after_bat_wb(struct ploop 
*ploop,
 
        if (!list_empty(&list))
                ploop_dispatch_pios(ploop, NULL, &list);
+
+       spin_lock_irqsave(&ploop->bat_lock, flags);
+       /* Clear flag at the end when all processing related is done */
+       clear_bit(MD_WRITEBACK, &md->status);
+       spin_unlock_irqrestore(&ploop->bat_lock, flags);
 }
 
 static void ploop_free_piwb(struct ploop_index_wb *piwb)
@@ -1045,7 +1048,10 @@ static int ploop_prepare_bat_update(struct ploop *ploop, 
struct md_page *md,
        bat_entries = md->kmpage;
 
        spin_lock(&md->md_lock); /* write */
-       WARN_ON_ONCE(md->piwb);
+       if (WARN_ON(md->piwb)) {
+               PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n",
+                       md, md->piwb, md->piwb->type, type);
+       }
        md->piwb = piwb;
        piwb->md = md;
 
@@ -1092,15 +1098,18 @@ static int ploop_prepare_bat_update(struct ploop 
*ploop, struct md_page *md,
        return -ENOMEM;
 }
 
-void ploop_break_bat_update(struct ploop *ploop, struct md_page *md)
+void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
+                           struct ploop_index_wb *piwb)
 {
-       struct ploop_index_wb *piwb;
        unsigned long flags;
 
-       spin_lock_irqsave(&md->md_lock, flags); /* write */
-       piwb = md->piwb;
-       md->piwb = NULL;
-       spin_unlock_irqrestore(&md->md_lock, flags);
+       spin_lock_irqsave(&ploop->bat_lock, flags);
+       spin_lock(&md->md_lock); /* write */
+       /* Only break if piwb is the same that failed to prepare */
+       if (md->piwb == piwb)
+               md->piwb = NULL;
+       spin_unlock(&md->md_lock);
+       spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
        ploop_free_piwb(piwb);
 }
@@ -1350,7 +1359,9 @@ static int ploop_alloc_cluster(struct ploop *ploop, 
struct ploop_index_wb *piwb,
        map_index_t *to;
        int ret = 0;
        u32 tmp_clu;
+#ifdef PLOOP_DELAYWB
        unsigned long flags;
+#endif
 
        /* Cluster index related to the page[page_id] start */
        clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - 
PLOOP_MAP_OFFSET;
@@ -1362,7 +1373,6 @@ static int ploop_alloc_cluster(struct ploop *ploop, 
struct ploop_index_wb *piwb,
                *dst_clu = tmp_clu;
                already_alloced = true;
        }
-       kunmap_local(to);
 
        if (already_alloced)
                goto out;
@@ -1371,17 +1381,16 @@ static int ploop_alloc_cluster(struct ploop *ploop, 
struct ploop_index_wb *piwb,
        if (unlikely(ret < 0))
                goto out;
 
-       to = kmap_local_page(piwb->bat_page);
-       spin_lock_irqsave(&piwb->md->md_lock, flags);
        WRITE_ONCE(to[clu], *dst_clu);
-       WRITE_ONCE(piwb->md->bat_levels[clu], ploop_top_level(ploop));
-       spin_unlock_irqrestore(&piwb->md->md_lock, flags);
-       kunmap_local(to);
 #ifdef PLOOP_DELAYWB
+       spin_lock_irqsave(&piwb->md->md_lock, flags);
        to = piwb->md->kmpage;
        WRITE_ONCE(to[clu], *dst_clu);
+       WRITE_ONCE(piwb->md->bat_levels[clu], ploop_top_level(ploop));
+       spin_unlock_irqrestore(&piwb->md->md_lock, flags);
 #endif
 out:
+       kunmap_local(to);
        return ret;
 }
 
@@ -1650,6 +1659,7 @@ static void ploop_submit_cow_index_wb(struct ploop_cow 
*cow)
        struct md_page *md;
        unsigned long flags;
        map_index_t *to;
+       bool add_to_wblist = false;
 
        WARN_ON_ONCE(cow->aux_pio->queue_list_id != PLOOP_LIST_COW);
        page_id = ploop_bat_clu_to_page_nr(clu);
@@ -1667,11 +1677,14 @@ static void ploop_submit_cow_index_wb(struct ploop_cow 
*cow)
                        spin_unlock_irqrestore(&ploop->bat_lock, flags);
                        goto err_resource;
                }
-               ploop_md_make_dirty(ploop, md);
+               add_to_wblist = ploop_md_make_dirty(ploop, md);
        }
-       spin_unlock_irqrestore(&ploop->bat_lock, flags);
-
        piwb = md->piwb;
+       if (WARN_ON(!piwb))
+               PL_ERR("cow no piwb:%d\n", add_to_wblist);
+
+       set_bit(MD_UPDATING, &md->status);
+       spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
        clu -= page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET;
 
@@ -1693,6 +1706,10 @@ static void ploop_submit_cow_index_wb(struct ploop_cow 
*cow)
        cow->dst_clu = BAT_ENTRY_NONE;
 
        llist_add((struct llist_node *)(&cow->aux_pio->list), &piwb->cow_llist);
+
+       if (add_to_wblist)
+               ploop_add_dirty_for_wb(ploop, md);
+       clear_bit(MD_UPDATING, &md->status);
 out:
        return;
 err_resource:
@@ -1755,6 +1772,7 @@ static bool 
ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
        int err;
        unsigned long flags;
        struct file *file;
+       bool add_to_wblist = false;
 
        WARN_ON_ONCE(pio->queue_list_id != PLOOP_LIST_DEFERRED);
        spin_lock_irqsave(&ploop->bat_lock, flags);
@@ -1764,6 +1782,12 @@ static bool 
ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
        }
 
 
+       /*
+        * Flag page as updating, so if there is already a pending update it
+        * waits for the new update to be ready before processing it
+        */
+       set_bit(MD_UPDATING, &md->status);
+
        if (!test_bit(MD_DIRTY, &md->status)) {
                 /* Locked since MD_DIRTY is set and cleared concurrently  */
                page_id = ploop_bat_clu_to_page_nr(clu);
@@ -1772,19 +1796,18 @@ static bool 
ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
                        spin_unlock_irqrestore(&ploop->bat_lock, flags);
                        goto error;
                }
+               add_to_wblist = ploop_md_make_dirty(ploop, md);
                bat_update_prepared = true;
-               ploop_md_make_dirty(ploop, md);
        }
-       spin_unlock_irqrestore(&ploop->bat_lock, flags);
-
        piwb = md->piwb;
+       WARN_ON_ONCE(!piwb);
+       spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
        file = ploop_top_delta(ploop)->mtfile[pio->runner_id];
 
        err = ploop_alloc_cluster(ploop, piwb, clu, dst_clu, file);
        if (err) {
                pio->bi_status = errno_to_blk_status(err);
-               clear_bit(MD_DIRTY, &md->status);
                goto error;
        }
 
@@ -1802,13 +1825,22 @@ static bool 
ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop,
        ploop_attach_end_action(pio, piwb);
 #endif
        attached = true;
+       if (add_to_wblist)
+               ploop_add_dirty_for_wb(ploop, md);
+       clear_bit(MD_UPDATING, &md->status);
 out:
        return attached;
 error:
        /* Uninit piwb */
        if (bat_update_prepared)
-               ploop_break_bat_update(ploop, md);
+               ploop_break_bat_update(ploop, md, piwb);
        ploop_pio_endio(pio);
+
+       spin_lock_irqsave(&ploop->bat_lock, flags);
+       if (add_to_wblist)
+               clear_bit(MD_DIRTY, &md->status);
+       clear_bit(MD_UPDATING, &md->status);
+       spin_unlock_irqrestore(&ploop->bat_lock, flags);
        return false;
 }
 
@@ -2007,6 +2039,7 @@ static void ploop_process_one_discard_pio(struct ploop 
*ploop, struct pio *pio)
        struct md_page *md;
        map_index_t *to;
        unsigned long flags;
+       bool add_to_wblist = false;
 
        WARN_ON(ploop->nr_deltas != 1 ||
                pio->queue_list_id != PLOOP_LIST_DISCARD);
@@ -2019,17 +2052,18 @@ static void ploop_process_one_discard_pio(struct ploop 
*ploop, struct pio *pio)
                return;
        }
 
+       set_bit(MD_UPDATING, &md->status);
        if (!test_bit(MD_DIRTY, &md->status)) {
                if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_DISCARD) < 0) 
{
                        pio->bi_status = BLK_STS_RESOURCE;
                        goto err_unlck;
                }
+               add_to_wblist = ploop_md_make_dirty(ploop, md);
                bat_update_prepared = true;
-               ploop_md_make_dirty(ploop, md);
        }
-       spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
        piwb = md->piwb;
+       spin_unlock_irqrestore(&ploop->bat_lock, flags);
 
        /* Cluster index related to the page[page_id] start */
        clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - 
PLOOP_MAP_OFFSET;
@@ -2037,7 +2071,7 @@ static void ploop_process_one_discard_pio(struct ploop 
*ploop, struct pio *pio)
        to = kmap_local_page(piwb->bat_page);
        if (WARN_ON_ONCE(!to[clu])) {
                pio->bi_status = BLK_STS_IOERR;
-               clear_bit(MD_DIRTY, &md->status);
+               kunmap_local(to);
                goto err;
        } else {
                WRITE_ONCE(to[clu], 0);
@@ -2046,14 +2080,22 @@ static void ploop_process_one_discard_pio(struct ploop 
*ploop, struct pio *pio)
        kunmap_local(to);
 
        ploop_md_up_prio(ploop, md);
+       if (add_to_wblist)
+               ploop_add_dirty_for_wb(ploop, md);
+       clear_bit(MD_UPDATING, &md->status);
 
        return;
 err_unlck:
        spin_unlock_irqrestore(&ploop->bat_lock, flags);
 err:
        if (bat_update_prepared)
-               ploop_break_bat_update(ploop, md);
+               ploop_break_bat_update(ploop, md, piwb);
        ploop_pio_endio(pio);
+       spin_lock_irqsave(&ploop->bat_lock, flags);
+       if (add_to_wblist)
+               clear_bit(MD_DIRTY, &md->status);
+       clear_bit(MD_UPDATING, &md->status);
+       spin_unlock_irqrestore(&ploop->bat_lock, flags);
 }
 
 static inline int ploop_runners_have_pending(struct ploop *ploop)
@@ -2082,21 +2124,46 @@ static inline int 
ploop_submit_metadata_writeback(struct ploop *ploop, int force
         */
        llist_for_each_safe(pos, t, ll_wb_batch) {
                md = list_entry((struct list_head *)pos, typeof(*md), wb_link);
+               INIT_LIST_HEAD(&md->wb_link);
                /* XXX: fixme delay results in a hang - TBD */
                if (1 || !llist_empty(&md->wait_llist) || force ||
                    test_bit(MD_HIGHPRIO, &md->status) ||
                    time_before(md->dirty_timeout, timeout) ||
                    ploop->force_md_writeback) {
+                       spin_lock_irqsave(&ploop->bat_lock, flags);
+                       if (test_bit(MD_UPDATING, &md->status) ||
+                           test_bit(MD_WRITEBACK, &md->status)) {
+                               /* if updating or there is a writeback then 
delay */
+                               spin_unlock_irqrestore(&ploop->bat_lock, flags);
+                               llist_add((struct llist_node *)&md->wb_link,
+                                         &ploop->wb_batch_llist);
+                               continue;
+                       }
+
                        /* L1L2 mustn't be redirtyed, when wb in-flight! */
-                       WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status));
-                       WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status));
-                       set_bit(MD_WRITEBACK, &md->status);
-                       clear_bit(MD_DIRTY, &md->status);
+                       if (WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status)))
+                               PL_ERR("wb but dirty not set\n");
+
+                       if (WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status)))
+                               PL_ERR("wb but writeback already set\n");
+
                        clear_bit(MD_HIGHPRIO, &md->status);
-                       ploop_index_wb_submit(ploop, md->piwb);
-                       ret++;
+                       if (md->piwb) {
+                               struct ploop_index_wb *piwb;
+                               /* New updates will go in wait list */
+                               set_bit(MD_WRITEBACK, &md->status);
+                               clear_bit(MD_DIRTY, &md->status);
+                               /* at this point we can clear md->piwb */
+                               piwb = md->piwb;
+                               md->piwb = NULL;
+                               /* hand off to threads */
+                               ploop_index_wb_submit(ploop, piwb);
+                               ret++;
+                       } else {
+                               PL_ERR("Unexpected md piwb is null");
+                       }
+                       spin_unlock_irqrestore(&ploop->bat_lock, flags);
                } else {
-                       INIT_LIST_HEAD(&md->wb_link);
                        llist_add((struct llist_node *)&md->wb_link, 
&ploop->wb_batch_llist);
                }
        }
@@ -2637,6 +2704,7 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop,
                type = PIWB_TYPE_RELOC;
 
        err = -EIO;
+       spin_lock_irq(&ploop->bat_lock);
        if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, 
&md->status)) {
                PL_ERR("Unexpected md status: %lx", md->status);
                goto out_error;
@@ -2659,12 +2727,13 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop,
                if (err)
                        goto out_reset;
        }
+       spin_unlock_irq(&ploop->bat_lock);
 
        *ret_md = md;
        return 0;
 
 out_reset:
-       ploop_break_bat_update(ploop, md);
+       ploop_break_bat_update(ploop, md, piwb);
 out_error:
        return err;
 }
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 06d95c525c90..46450cac8c7d 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -116,9 +116,10 @@ struct ploop_index_wb {
 struct md_page {
        struct rb_node node;
        u32 id; /* Number of this page starting from hdr */
-#define MD_DIRTY       (1U << 1) /* Page contains changes and wants writeback 
*/
-#define MD_WRITEBACK   (1U << 2) /* Writeback was submitted */
+#define MD_DIRTY       (1U << 1) /* Page contains changes and awaits writeback 
*/
+#define MD_WRITEBACK   (1U << 2) /* Writeback in progress */
 #define MD_HIGHPRIO    (1U << 3) /* High priority writeback  */
+#define MD_UPDATING    (1U << 4) /* Preparing an update */
        unsigned long status;
        struct page *page;
        void *kmpage;
@@ -612,7 +613,8 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop, 
u32 dst_clu,
 extern int ploop_prepare_reloc_index_wb(struct ploop *ploop,
                                        struct md_page **ret_md, u32 clu, u32 
*dst_clu,
                                        struct file *file);
-extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md);
+extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
+                                  struct ploop_index_wb *piwb);
 extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *);
 extern int ploop_message(struct dm_target *ti, unsigned int argc, char **argv,
                         char *result, unsigned int maxlen);
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to