The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at g...@bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.14 ------> commit 4b8249e2da1a7d28882bd70844fc4d64db952f08 Author: Alexander Atanasov <alexander.atana...@virtuozzo.com> Date: Tue Feb 11 16:25:57 2025 +0200
dm-ploop: fix and rework md updates With the locking reduced it opened windows for races between running updates, pending and new updates. Current logic to deal with them is not correct. Current flags are: MD_DIRTY - means md page is dirty and is waiting for writeback MD_WRITEBACK - write back is in progress But we drop the lock after we make page dirty so it races with writeback. To fix this introduce a new flag: MD_UPDATING - which means we are updating the pending writeback or we are creating a new one. Remove the check if piwb is present on the page and clear piwb early on submit, check was using a type alloc vs discard but in case of concurrent alloc and discard the piwb is overwritten by the second operation. Using a flag solves this and makes it more clear in what state page is. Nice side effect of closing that race is that we have a minimalistic delay for writeback updates. This happens when we try to submit a pending writeback and we see the MD_UPDATING flags, update is not submitted but it is retried on next cycle. So if we have a lot of update to the same page in a very short periot they are accumulated as one. On error only clear MD_DIRTY if it is set by us before dropping the lock and when breaking the prepared bat update only clear md->piwb if it is the same as the piwb allocated in the function. Add some clarification error messages around WARN_ONs. While at it fix: - mapping bat_bage two times in same function, do it only once - missed unmapping on error path - update proper bat levels in delayed update case - locking around prepare_bat_update from ploop_prepare_reloc_index_wb https://virtuozzo.atlassian.net/browse/VSTOR-98807 Signed-off-by: Alexander Atanasov <alexander.atana...@virtuozzo.com> Feature: vStorage --- drivers/md/dm-ploop-cmd.c | 4 +- drivers/md/dm-ploop-map.c | 153 +++++++++++++++++++++++++++++++++------------- drivers/md/dm-ploop.h | 8 ++- 3 files changed, 117 insertions(+), 48 deletions(-) diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c index e032af52b64e..daec9827b42c 100644 --- a/drivers/md/dm-ploop-cmd.c +++ b/drivers/md/dm-ploop-cmd.c @@ -315,10 +315,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop, goto out; } - spin_lock_irq(&ploop->bat_lock); ret = ploop_prepare_reloc_index_wb(ploop, &md, clu, &new_dst, ploop_top_delta(ploop)->file); - spin_unlock_irq(&ploop->bat_lock); if (ret < 0) { PL_ERR("reloc: can't prepare it: %d", ret); goto out; @@ -329,7 +327,7 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop, ret = ploop_write_cluster_sync(ploop, pio, new_dst); if (ret) { PL_ERR("reloc: failed write: %d", ret); - ploop_break_bat_update(ploop, md); + ploop_break_bat_update(ploop, md, piwb); goto out; } diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index f6fc453f7464..0db3a0ff5bb3 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -421,7 +421,6 @@ void ploop_dispatch_pios(struct ploop *ploop, struct pio *pio, static bool ploop_delay_if_md_busy(struct ploop *ploop, struct md_page *md, enum piwb_type type, struct pio *pio) { - struct ploop_index_wb *piwb; unsigned long flags; bool busy = false; @@ -429,8 +428,7 @@ static bool ploop_delay_if_md_busy(struct ploop *ploop, struct md_page *md, /* lock protects piwb */ spin_lock_irqsave(&md->md_lock, flags); /* read */ - piwb = md->piwb; - if (piwb && (piwb->type != type || test_bit(MD_WRITEBACK, &md->status))) { + if (test_bit(MD_WRITEBACK, &md->status) || test_bit(MD_UPDATING, &md->status)) { llist_add((struct llist_node *)(&pio->list), &md->wait_llist); busy = true; } @@ -616,6 +614,11 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio) ploop_dispatch_pios(ploop, NULL, &pio_list); } +static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md) +{ + llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); +} + static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md) { bool new = false; @@ -628,10 +631,8 @@ static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md) lockdep_assert_held(&ploop->bat_lock); WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status)); - if (!test_and_set_bit(MD_DIRTY, &md->status)) { - llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); + if (!test_and_set_bit(MD_DIRTY, &md->status)) new = true; - } md->dirty_timeout = jiffies + ploop->md_submit_delay_ms*HZ/1000; return new; @@ -889,9 +890,6 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop, #endif WARN_ON_ONCE(!test_bit(MD_WRITEBACK, &md->status)); - /* protect piwb */ - clear_bit(MD_WRITEBACK, &md->status); - md->piwb = NULL; spin_unlock(&md->md_lock); spin_unlock_irqrestore(&ploop->bat_lock, flags); kunmap_local(dst_clu); @@ -907,6 +905,11 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop, if (!list_empty(&list)) ploop_dispatch_pios(ploop, NULL, &list); + + spin_lock_irqsave(&ploop->bat_lock, flags); + /* Clear flag at the end when all processing related is done */ + clear_bit(MD_WRITEBACK, &md->status); + spin_unlock_irqrestore(&ploop->bat_lock, flags); } static void ploop_free_piwb(struct ploop_index_wb *piwb) @@ -1045,7 +1048,10 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, bat_entries = md->kmpage; spin_lock(&md->md_lock); /* write */ - WARN_ON_ONCE(md->piwb); + if (WARN_ON(md->piwb)) { + PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n", + md, md->piwb, md->piwb->type, type); + } md->piwb = piwb; piwb->md = md; @@ -1092,15 +1098,18 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, return -ENOMEM; } -void ploop_break_bat_update(struct ploop *ploop, struct md_page *md) +void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, + struct ploop_index_wb *piwb) { - struct ploop_index_wb *piwb; unsigned long flags; - spin_lock_irqsave(&md->md_lock, flags); /* write */ - piwb = md->piwb; - md->piwb = NULL; - spin_unlock_irqrestore(&md->md_lock, flags); + spin_lock_irqsave(&ploop->bat_lock, flags); + spin_lock(&md->md_lock); /* write */ + /* Only break if piwb is the same that failed to prepare */ + if (md->piwb == piwb) + md->piwb = NULL; + spin_unlock(&md->md_lock); + spin_unlock_irqrestore(&ploop->bat_lock, flags); ploop_free_piwb(piwb); } @@ -1350,7 +1359,9 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb, map_index_t *to; int ret = 0; u32 tmp_clu; +#ifdef PLOOP_DELAYWB unsigned long flags; +#endif /* Cluster index related to the page[page_id] start */ clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET; @@ -1362,7 +1373,6 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb, *dst_clu = tmp_clu; already_alloced = true; } - kunmap_local(to); if (already_alloced) goto out; @@ -1371,17 +1381,16 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb, if (unlikely(ret < 0)) goto out; - to = kmap_local_page(piwb->bat_page); - spin_lock_irqsave(&piwb->md->md_lock, flags); WRITE_ONCE(to[clu], *dst_clu); - WRITE_ONCE(piwb->md->bat_levels[clu], ploop_top_level(ploop)); - spin_unlock_irqrestore(&piwb->md->md_lock, flags); - kunmap_local(to); #ifdef PLOOP_DELAYWB + spin_lock_irqsave(&piwb->md->md_lock, flags); to = piwb->md->kmpage; WRITE_ONCE(to[clu], *dst_clu); + WRITE_ONCE(piwb->md->bat_levels[clu], ploop_top_level(ploop)); + spin_unlock_irqrestore(&piwb->md->md_lock, flags); #endif out: + kunmap_local(to); return ret; } @@ -1650,6 +1659,7 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow) struct md_page *md; unsigned long flags; map_index_t *to; + bool add_to_wblist = false; WARN_ON_ONCE(cow->aux_pio->queue_list_id != PLOOP_LIST_COW); page_id = ploop_bat_clu_to_page_nr(clu); @@ -1667,11 +1677,14 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow) spin_unlock_irqrestore(&ploop->bat_lock, flags); goto err_resource; } - ploop_md_make_dirty(ploop, md); + add_to_wblist = ploop_md_make_dirty(ploop, md); } - spin_unlock_irqrestore(&ploop->bat_lock, flags); - piwb = md->piwb; + if (WARN_ON(!piwb)) + PL_ERR("cow no piwb:%d\n", add_to_wblist); + + set_bit(MD_UPDATING, &md->status); + spin_unlock_irqrestore(&ploop->bat_lock, flags); clu -= page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET; @@ -1693,6 +1706,10 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow) cow->dst_clu = BAT_ENTRY_NONE; llist_add((struct llist_node *)(&cow->aux_pio->list), &piwb->cow_llist); + + if (add_to_wblist) + ploop_add_dirty_for_wb(ploop, md); + clear_bit(MD_UPDATING, &md->status); out: return; err_resource: @@ -1755,6 +1772,7 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop, int err; unsigned long flags; struct file *file; + bool add_to_wblist = false; WARN_ON_ONCE(pio->queue_list_id != PLOOP_LIST_DEFERRED); spin_lock_irqsave(&ploop->bat_lock, flags); @@ -1764,6 +1782,12 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop, } + /* + * Flag page as updating, so if there is already a pending update it + * waits for the new update to be ready before processing it + */ + set_bit(MD_UPDATING, &md->status); + if (!test_bit(MD_DIRTY, &md->status)) { /* Locked since MD_DIRTY is set and cleared concurrently */ page_id = ploop_bat_clu_to_page_nr(clu); @@ -1772,19 +1796,18 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop, spin_unlock_irqrestore(&ploop->bat_lock, flags); goto error; } + add_to_wblist = ploop_md_make_dirty(ploop, md); bat_update_prepared = true; - ploop_md_make_dirty(ploop, md); } - spin_unlock_irqrestore(&ploop->bat_lock, flags); - piwb = md->piwb; + WARN_ON_ONCE(!piwb); + spin_unlock_irqrestore(&ploop->bat_lock, flags); file = ploop_top_delta(ploop)->mtfile[pio->runner_id]; err = ploop_alloc_cluster(ploop, piwb, clu, dst_clu, file); if (err) { pio->bi_status = errno_to_blk_status(err); - clear_bit(MD_DIRTY, &md->status); goto error; } @@ -1802,13 +1825,22 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop, ploop_attach_end_action(pio, piwb); #endif attached = true; + if (add_to_wblist) + ploop_add_dirty_for_wb(ploop, md); + clear_bit(MD_UPDATING, &md->status); out: return attached; error: /* Uninit piwb */ if (bat_update_prepared) - ploop_break_bat_update(ploop, md); + ploop_break_bat_update(ploop, md, piwb); ploop_pio_endio(pio); + + spin_lock_irqsave(&ploop->bat_lock, flags); + if (add_to_wblist) + clear_bit(MD_DIRTY, &md->status); + clear_bit(MD_UPDATING, &md->status); + spin_unlock_irqrestore(&ploop->bat_lock, flags); return false; } @@ -2007,6 +2039,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio) struct md_page *md; map_index_t *to; unsigned long flags; + bool add_to_wblist = false; WARN_ON(ploop->nr_deltas != 1 || pio->queue_list_id != PLOOP_LIST_DISCARD); @@ -2019,17 +2052,18 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio) return; } + set_bit(MD_UPDATING, &md->status); if (!test_bit(MD_DIRTY, &md->status)) { if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_DISCARD) < 0) { pio->bi_status = BLK_STS_RESOURCE; goto err_unlck; } + add_to_wblist = ploop_md_make_dirty(ploop, md); bat_update_prepared = true; - ploop_md_make_dirty(ploop, md); } - spin_unlock_irqrestore(&ploop->bat_lock, flags); piwb = md->piwb; + spin_unlock_irqrestore(&ploop->bat_lock, flags); /* Cluster index related to the page[page_id] start */ clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET; @@ -2037,7 +2071,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio) to = kmap_local_page(piwb->bat_page); if (WARN_ON_ONCE(!to[clu])) { pio->bi_status = BLK_STS_IOERR; - clear_bit(MD_DIRTY, &md->status); + kunmap_local(to); goto err; } else { WRITE_ONCE(to[clu], 0); @@ -2046,14 +2080,22 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio) kunmap_local(to); ploop_md_up_prio(ploop, md); + if (add_to_wblist) + ploop_add_dirty_for_wb(ploop, md); + clear_bit(MD_UPDATING, &md->status); return; err_unlck: spin_unlock_irqrestore(&ploop->bat_lock, flags); err: if (bat_update_prepared) - ploop_break_bat_update(ploop, md); + ploop_break_bat_update(ploop, md, piwb); ploop_pio_endio(pio); + spin_lock_irqsave(&ploop->bat_lock, flags); + if (add_to_wblist) + clear_bit(MD_DIRTY, &md->status); + clear_bit(MD_UPDATING, &md->status); + spin_unlock_irqrestore(&ploop->bat_lock, flags); } static inline int ploop_runners_have_pending(struct ploop *ploop) @@ -2082,21 +2124,46 @@ static inline int ploop_submit_metadata_writeback(struct ploop *ploop, int force */ llist_for_each_safe(pos, t, ll_wb_batch) { md = list_entry((struct list_head *)pos, typeof(*md), wb_link); + INIT_LIST_HEAD(&md->wb_link); /* XXX: fixme delay results in a hang - TBD */ if (1 || !llist_empty(&md->wait_llist) || force || test_bit(MD_HIGHPRIO, &md->status) || time_before(md->dirty_timeout, timeout) || ploop->force_md_writeback) { + spin_lock_irqsave(&ploop->bat_lock, flags); + if (test_bit(MD_UPDATING, &md->status) || + test_bit(MD_WRITEBACK, &md->status)) { + /* if updating or there is a writeback then delay */ + spin_unlock_irqrestore(&ploop->bat_lock, flags); + llist_add((struct llist_node *)&md->wb_link, + &ploop->wb_batch_llist); + continue; + } + /* L1L2 mustn't be redirtyed, when wb in-flight! */ - WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status)); - WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status)); - set_bit(MD_WRITEBACK, &md->status); - clear_bit(MD_DIRTY, &md->status); + if (WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status))) + PL_ERR("wb but dirty not set\n"); + + if (WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status))) + PL_ERR("wb but writeback already set\n"); + clear_bit(MD_HIGHPRIO, &md->status); - ploop_index_wb_submit(ploop, md->piwb); - ret++; + if (md->piwb) { + struct ploop_index_wb *piwb; + /* New updates will go in wait list */ + set_bit(MD_WRITEBACK, &md->status); + clear_bit(MD_DIRTY, &md->status); + /* at this point we can clear md->piwb */ + piwb = md->piwb; + md->piwb = NULL; + /* hand off to threads */ + ploop_index_wb_submit(ploop, piwb); + ret++; + } else { + PL_ERR("Unexpected md piwb is null"); + } + spin_unlock_irqrestore(&ploop->bat_lock, flags); } else { - INIT_LIST_HEAD(&md->wb_link); llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); } } @@ -2637,6 +2704,7 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop, type = PIWB_TYPE_RELOC; err = -EIO; + spin_lock_irq(&ploop->bat_lock); if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status)) { PL_ERR("Unexpected md status: %lx", md->status); goto out_error; @@ -2659,12 +2727,13 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop, if (err) goto out_reset; } + spin_unlock_irq(&ploop->bat_lock); *ret_md = md; return 0; out_reset: - ploop_break_bat_update(ploop, md); + ploop_break_bat_update(ploop, md, piwb); out_error: return err; } diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h index 06d95c525c90..46450cac8c7d 100644 --- a/drivers/md/dm-ploop.h +++ b/drivers/md/dm-ploop.h @@ -116,9 +116,10 @@ struct ploop_index_wb { struct md_page { struct rb_node node; u32 id; /* Number of this page starting from hdr */ -#define MD_DIRTY (1U << 1) /* Page contains changes and wants writeback */ -#define MD_WRITEBACK (1U << 2) /* Writeback was submitted */ +#define MD_DIRTY (1U << 1) /* Page contains changes and awaits writeback */ +#define MD_WRITEBACK (1U << 2) /* Writeback in progress */ #define MD_HIGHPRIO (1U << 3) /* High priority writeback */ +#define MD_UPDATING (1U << 4) /* Preparing an update */ unsigned long status; struct page *page; void *kmpage; @@ -612,7 +613,8 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop, u32 dst_clu, extern int ploop_prepare_reloc_index_wb(struct ploop *ploop, struct md_page **ret_md, u32 clu, u32 *dst_clu, struct file *file); -extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md); +extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, + struct ploop_index_wb *piwb); extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *); extern int ploop_message(struct dm_target *ti, unsigned int argc, char **argv, char *result, unsigned int maxlen); _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel