With the locking reduced it opened windows for races between running updates, pending and new updates. Current logic to deal with them is not correct.
Current flags are: MD_DIRTY - means md page is dirty and is waiting for writeback MD_WRITEBACK - write back is in progress But we drop the lock after we make page dirty so it races with writeback. To fix this introduce a new flag: MD_UPDATING - which means we are updating the pending writeback or we are creating a new one. Remove the check if piwb is present on the page and clear piwb early on submit, check was using a type alloc vs discard but in case of concurrent alloc and discard the piwb is overwritten by the second operation. Using a flag solves this and makes it more clear in what state page is. Nice side effect of closing that race is that we have a minimalistic delay for writeback updates. This happens when we try to submit a pending writeback and we see the MD_UPDATING flags, update is not submitted but it is retried on next cycle. So if we have a lot of update to the same page in a very short periot they are accumulated as one. On error only clear MD_DIRTY if it is set by us before dropping the lock and when breaking the prepared bat update only clear md->piwb if it is the same as the piwb allocated in the function. Add some clarification error messages around WARN_ONs. While at it fix: - mapping bat_bage two times in same function, do it only once - missed unmapping on error path - update proper bat levels in delayed update case - locking around prepare_bat_update from ploop_prepare_reloc_index_wb https://virtuozzo.atlassian.net/browse/VSTOR-98807 Signed-off-by: Alexander Atanasov <alexander.atana...@virtuozzo.com> --- drivers/md/dm-ploop-cmd.c | 5 +- drivers/md/dm-ploop-map.c | 170 +++++++++++++++++++++++++++----------- drivers/md/dm-ploop.h | 8 +- 3 files changed, 128 insertions(+), 55 deletions(-) v1->v2: - remove obsolete comment - add comment on single md page user - add missed unlock in error path - use ploop_add_dirty_for_wb - fix unmapping wrong mapping diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c index e032af52b64e..d2eb4797ab6e 100644 --- a/drivers/md/dm-ploop-cmd.c +++ b/drivers/md/dm-ploop-cmd.c @@ -315,10 +315,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop, goto out; } - spin_lock_irq(&ploop->bat_lock); ret = ploop_prepare_reloc_index_wb(ploop, &md, clu, &new_dst, ploop_top_delta(ploop)->file); - spin_unlock_irq(&ploop->bat_lock); if (ret < 0) { PL_ERR("reloc: can't prepare it: %d", ret); goto out; @@ -329,7 +327,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop, ret = ploop_write_cluster_sync(ploop, pio, new_dst); if (ret) { PL_ERR("reloc: failed write: %d", ret); - ploop_break_bat_update(ploop, md); + /* piwb can not change here we are the only user of md page */ + ploop_break_bat_update(ploop, md, piwb); goto out; } diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index 7efbaedd69e0..9fc72d5ad376 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -404,18 +404,15 @@ void ploop_dispatch_pios(struct ploop *ploop, struct pio *pio, } static bool ploop_delay_if_md_busy(struct ploop *ploop, struct md_page *md, - enum piwb_type type, struct pio *pio) + struct pio *pio) { - struct ploop_index_wb *piwb; unsigned long flags; bool busy = false; WARN_ON_ONCE(!list_empty(&pio->list)); - /* lock protects piwb */ spin_lock_irqsave(&md->md_lock, flags); /* read */ - piwb = md->piwb; - if (piwb && (piwb->type != type || test_bit(MD_WRITEBACK, &md->status))) { + if (test_bit(MD_WRITEBACK, &md->status) || test_bit(MD_UPDATING, &md->status)) { llist_add((struct llist_node *)(&pio->list), &md->wait_llist); busy = true; } @@ -601,6 +598,11 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio) ploop_dispatch_pios(ploop, NULL, &pio_list); } +static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md) +{ + llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); +} + static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md) { bool new = false; @@ -613,10 +615,8 @@ static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md) lockdep_assert_held(&ploop->bat_lock); WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status)); - if (!test_and_set_bit(MD_DIRTY, &md->status)) { - llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); + if (!test_and_set_bit(MD_DIRTY, &md->status)) new = true; - } md->dirty_timeout = jiffies + ploop->md_submit_delay_ms*HZ/1000; return new; @@ -874,9 +874,6 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop, #endif WARN_ON_ONCE(!test_bit(MD_WRITEBACK, &md->status)); - /* protect piwb */ - clear_bit(MD_WRITEBACK, &md->status); - md->piwb = NULL; spin_unlock(&md->md_lock); spin_unlock_irqrestore(&ploop->bat_lock, flags); kunmap_local(dst_clu); @@ -892,6 +889,11 @@ static void ploop_advance_local_after_bat_wb(struct ploop *ploop, if (!list_empty(&list)) ploop_dispatch_pios(ploop, NULL, &list); + + spin_lock_irqsave(&ploop->bat_lock, flags); + /* Clear flag at the end when all processing related is done */ + clear_bit(MD_WRITEBACK, &md->status); + spin_unlock_irqrestore(&ploop->bat_lock, flags); } static void ploop_free_piwb(struct ploop_index_wb *piwb) @@ -1030,7 +1032,10 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, bat_entries = md->kmpage; spin_lock(&md->md_lock); /* write */ - WARN_ON_ONCE(md->piwb); + if (WARN_ON(md->piwb)) { + PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n", + md, md->piwb, md->piwb->type, type); + } md->piwb = piwb; piwb->md = md; @@ -1077,15 +1082,18 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, return -ENOMEM; } -void ploop_break_bat_update(struct ploop *ploop, struct md_page *md) +void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, + struct ploop_index_wb *piwb) { - struct ploop_index_wb *piwb; unsigned long flags; - spin_lock_irqsave(&md->md_lock, flags); /* write */ - piwb = md->piwb; - md->piwb = NULL; - spin_unlock_irqrestore(&md->md_lock, flags); + spin_lock_irqsave(&ploop->bat_lock, flags); + spin_lock(&md->md_lock); /* write */ + /* Only break if piwb is the same that failed to prepare */ + if (md->piwb == piwb) + md->piwb = NULL; + spin_unlock(&md->md_lock); + spin_unlock_irqrestore(&ploop->bat_lock, flags); ploop_free_piwb(piwb); } @@ -1335,7 +1343,10 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb, map_index_t *to; int ret = 0; u32 tmp_clu; +#ifdef PLOOP_DELAYWB unsigned long flags; + map_index_t *tomd; +#endif /* Cluster index related to the page[page_id] start */ clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET; @@ -1347,7 +1358,6 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb, *dst_clu = tmp_clu; already_alloced = true; } - kunmap_local(to); if (already_alloced) goto out; @@ -1356,17 +1366,16 @@ static int ploop_alloc_cluster(struct ploop *ploop, struct ploop_index_wb *piwb, if (unlikely(ret < 0)) goto out; - to = kmap_local_page(piwb->bat_page); - spin_lock_irqsave(&piwb->md->md_lock, flags); WRITE_ONCE(to[clu], *dst_clu); +#ifdef PLOOP_DELAYWB + spin_lock_irqsave(&piwb->md->md_lock, flags); + tomd = piwb->md->kmpage; + WRITE_ONCE(tomd[clu], *dst_clu); WRITE_ONCE(piwb->md->bat_levels[clu], ploop_top_level(ploop)); spin_unlock_irqrestore(&piwb->md->md_lock, flags); - kunmap_local(to); -#ifdef PLOOP_DELAYWB - to = piwb->md->kmpage; - WRITE_ONCE(to[clu], *dst_clu); #endif out: + kunmap_local(to); return ret; } @@ -1635,6 +1644,7 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow) struct md_page *md; unsigned long flags; map_index_t *to; + bool add_to_wblist = false; WARN_ON_ONCE(cow->aux_pio->queue_list_id != PLOOP_LIST_COW); page_id = ploop_bat_clu_to_page_nr(clu); @@ -1642,7 +1652,7 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow) spin_lock_irqsave(&ploop->bat_lock, flags); /* MD_DIRTY is set and cleared concurrently, do it under bat_lock */ - if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, cow->aux_pio)) { + if (ploop_delay_if_md_busy(ploop, md, cow->aux_pio)) { spin_unlock_irqrestore(&ploop->bat_lock, flags); goto out; } @@ -1652,11 +1662,14 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow) spin_unlock_irqrestore(&ploop->bat_lock, flags); goto err_resource; } - ploop_md_make_dirty(ploop, md); + add_to_wblist = ploop_md_make_dirty(ploop, md); } - spin_unlock_irqrestore(&ploop->bat_lock, flags); - piwb = md->piwb; + if (WARN_ON(!piwb)) + PL_ERR("cow no piwb:%d\n", add_to_wblist); + + set_bit(MD_UPDATING, &md->status); + spin_unlock_irqrestore(&ploop->bat_lock, flags); clu -= page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET; @@ -1678,6 +1691,10 @@ static void ploop_submit_cow_index_wb(struct ploop_cow *cow) cow->dst_clu = BAT_ENTRY_NONE; llist_add((struct llist_node *)(&cow->aux_pio->list), &piwb->cow_llist); + + if (add_to_wblist) + ploop_add_dirty_for_wb(ploop, md); + clear_bit(MD_UPDATING, &md->status); out: return; err_resource: @@ -1740,15 +1757,22 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop, int err; unsigned long flags; struct file *file; + bool add_to_wblist = false; WARN_ON_ONCE(pio->queue_list_id != PLOOP_LIST_DEFERRED); spin_lock_irqsave(&ploop->bat_lock, flags); - if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_ALLOC, pio)) { + if (ploop_delay_if_md_busy(ploop, md, pio)) { spin_unlock_irqrestore(&ploop->bat_lock, flags); goto out; } + /* + * Flag page as updating, so if there is already a pending update it + * waits for the new update to be ready before processing it + */ + set_bit(MD_UPDATING, &md->status); + if (!test_bit(MD_DIRTY, &md->status)) { /* Locked since MD_DIRTY is set and cleared concurrently */ page_id = ploop_bat_clu_to_page_nr(clu); @@ -1757,19 +1781,18 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop, spin_unlock_irqrestore(&ploop->bat_lock, flags); goto error; } + add_to_wblist = ploop_md_make_dirty(ploop, md); bat_update_prepared = true; - ploop_md_make_dirty(ploop, md); } - spin_unlock_irqrestore(&ploop->bat_lock, flags); - piwb = md->piwb; + WARN_ON_ONCE(!piwb); + spin_unlock_irqrestore(&ploop->bat_lock, flags); file = ploop_top_delta(ploop)->mtfile[pio->runner_id]; err = ploop_alloc_cluster(ploop, piwb, clu, dst_clu, file); if (err) { pio->bi_status = errno_to_blk_status(err); - clear_bit(MD_DIRTY, &md->status); goto error; } @@ -1787,13 +1810,22 @@ static bool ploop_locate_new_cluster_and_attach_pio(struct ploop *ploop, ploop_attach_end_action(pio, piwb); #endif attached = true; + if (add_to_wblist) + ploop_add_dirty_for_wb(ploop, md); + clear_bit(MD_UPDATING, &md->status); out: return attached; error: /* Uninit piwb */ if (bat_update_prepared) - ploop_break_bat_update(ploop, md); + ploop_break_bat_update(ploop, md, piwb); ploop_pio_endio(pio); + + spin_lock_irqsave(&ploop->bat_lock, flags); + if (add_to_wblist) + clear_bit(MD_DIRTY, &md->status); + clear_bit(MD_UPDATING, &md->status); + spin_unlock_irqrestore(&ploop->bat_lock, flags); return false; } @@ -1992,6 +2024,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio) struct md_page *md; map_index_t *to; unsigned long flags; + bool add_to_wblist = false; WARN_ON(ploop->nr_deltas != 1 || pio->queue_list_id != PLOOP_LIST_DISCARD); @@ -1999,22 +2032,23 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio) page_id = ploop_bat_clu_to_page_nr(clu); md = ploop_md_page_find(ploop, page_id); spin_lock_irqsave(&ploop->bat_lock, flags); - if (ploop_delay_if_md_busy(ploop, md, PIWB_TYPE_DISCARD, pio)) { + if (ploop_delay_if_md_busy(ploop, md, pio)) { spin_unlock_irqrestore(&ploop->bat_lock, flags); return; } + set_bit(MD_UPDATING, &md->status); if (!test_bit(MD_DIRTY, &md->status)) { if (ploop_prepare_bat_update(ploop, md, PIWB_TYPE_DISCARD) < 0) { pio->bi_status = BLK_STS_RESOURCE; goto err_unlck; } + add_to_wblist = ploop_md_make_dirty(ploop, md); bat_update_prepared = true; - ploop_md_make_dirty(ploop, md); } - spin_unlock_irqrestore(&ploop->bat_lock, flags); piwb = md->piwb; + spin_unlock_irqrestore(&ploop->bat_lock, flags); /* Cluster index related to the page[page_id] start */ clu -= piwb->page_id * PAGE_SIZE / sizeof(map_index_t) - PLOOP_MAP_OFFSET; @@ -2022,7 +2056,7 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio) to = kmap_local_page(piwb->bat_page); if (WARN_ON_ONCE(!to[clu])) { pio->bi_status = BLK_STS_IOERR; - clear_bit(MD_DIRTY, &md->status); + kunmap_local(to); goto err; } else { WRITE_ONCE(to[clu], 0); @@ -2031,14 +2065,22 @@ static void ploop_process_one_discard_pio(struct ploop *ploop, struct pio *pio) kunmap_local(to); ploop_md_up_prio(ploop, md); + if (add_to_wblist) + ploop_add_dirty_for_wb(ploop, md); + clear_bit(MD_UPDATING, &md->status); return; err_unlck: spin_unlock_irqrestore(&ploop->bat_lock, flags); err: if (bat_update_prepared) - ploop_break_bat_update(ploop, md); + ploop_break_bat_update(ploop, md, piwb); ploop_pio_endio(pio); + spin_lock_irqsave(&ploop->bat_lock, flags); + if (add_to_wblist) + clear_bit(MD_DIRTY, &md->status); + clear_bit(MD_UPDATING, &md->status); + spin_unlock_irqrestore(&ploop->bat_lock, flags); } static inline int ploop_runners_have_pending(struct ploop *ploop) @@ -2067,22 +2109,49 @@ static inline int ploop_submit_metadata_writeback(struct ploop *ploop, int force */ llist_for_each_safe(pos, t, ll_wb_batch) { md = list_entry((struct list_head *)pos, typeof(*md), wb_link); + INIT_LIST_HEAD(&md->wb_link); /* XXX: fixme delay results in a hang - TBD */ if (1 || !llist_empty(&md->wait_llist) || force || test_bit(MD_HIGHPRIO, &md->status) || time_before(md->dirty_timeout, timeout) || ploop->force_md_writeback) { + spin_lock_irqsave(&ploop->bat_lock, flags); + if (test_bit(MD_UPDATING, &md->status) || + test_bit(MD_WRITEBACK, &md->status)) { + /* if updating or there is a writeback then delay */ + spin_unlock_irqrestore(&ploop->bat_lock, flags); + llist_add((struct llist_node *)&md->wb_link, + &ploop->wb_batch_llist); + continue; + } + /* L1L2 mustn't be redirtyed, when wb in-flight! */ - WARN_ON_ONCE(!test_bit(MD_DIRTY, &md->status)); - WARN_ON_ONCE(test_bit(MD_WRITEBACK, &md->status)); - set_bit(MD_WRITEBACK, &md->status); - clear_bit(MD_DIRTY, &md->status); + WARN_ONCE(!test_bit(MD_DIRTY, &md->status), + PL_FMT("wb but dirty not set"), + ploop_device_name(ploop)); + + WARN_ONCE(test_bit(MD_WRITEBACK, &md->status), + PL_FMT("wb but writeback already set"), + ploop_device_name(ploop)); + clear_bit(MD_HIGHPRIO, &md->status); - ploop_index_wb_submit(ploop, md->piwb); - ret++; + if (md->piwb) { + struct ploop_index_wb *piwb; + /* New updates will go in wait list */ + set_bit(MD_WRITEBACK, &md->status); + clear_bit(MD_DIRTY, &md->status); + /* at this point we can clear md->piwb */ + piwb = md->piwb; + md->piwb = NULL; + /* hand off to threads */ + ploop_index_wb_submit(ploop, piwb); + ret++; + } else { + PL_ERR("Unexpected md piwb is null"); + } + spin_unlock_irqrestore(&ploop->bat_lock, flags); } else { - INIT_LIST_HEAD(&md->wb_link); - llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); + ploop_add_dirty_for_wb(ploop, md); } } return ret; @@ -2607,6 +2676,7 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop, type = PIWB_TYPE_RELOC; err = -EIO; + spin_lock_irq(&ploop->bat_lock); if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status)) { PL_ERR("Unexpected md status: %lx", md->status); goto out_error; @@ -2629,13 +2699,15 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop, if (err) goto out_reset; } + spin_unlock_irq(&ploop->bat_lock); *ret_md = md; return 0; out_reset: - ploop_break_bat_update(ploop, md); + ploop_break_bat_update(ploop, md, piwb); out_error: + spin_unlock_irq(&ploop->bat_lock); return err; } ALLOW_ERROR_INJECTION(ploop_prepare_reloc_index_wb, ERRNO); diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h index 43e65e841e4a..90cf08472660 100644 --- a/drivers/md/dm-ploop.h +++ b/drivers/md/dm-ploop.h @@ -116,9 +116,10 @@ struct ploop_index_wb { struct md_page { struct rb_node node; u32 id; /* Number of this page starting from hdr */ -#define MD_DIRTY (1U << 1) /* Page contains changes and wants writeback */ -#define MD_WRITEBACK (1U << 2) /* Writeback was submitted */ +#define MD_DIRTY (1U << 1) /* Page contains changes and awaits writeback */ +#define MD_WRITEBACK (1U << 2) /* Writeback in progress */ #define MD_HIGHPRIO (1U << 3) /* High priority writeback */ +#define MD_UPDATING (1U << 4) /* Preparing an update */ unsigned long status; struct page *page; void *kmpage; @@ -611,7 +612,8 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop, u32 dst_clu, extern int ploop_prepare_reloc_index_wb(struct ploop *ploop, struct md_page **ret_md, u32 clu, u32 *dst_clu, struct file *file); -extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md); +extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, + struct ploop_index_wb *piwb); extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *); extern int ploop_message(struct dm_target *ti, unsigned int argc, char **argv, char *result, unsigned int maxlen); -- 2.43.0 _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel