On 3/27/25 13:18, Alexander Atanasov wrote:
We missed the fact that resize/grow touch md0 page to update ploop
parameters, other pages are not linked so there is no issue.
But in case there is a concurrent update to md0 piwb is not handled
correctly, also using sync updates in parallel is not okay.
To fix this update code to use the new mechanism with MD_UPDATING
flag and instead of using sync operations pass the updates to
runner threads.
https://virtuozzo.atlassian.net/browse/VSTOR-101871
Signed-off-by: Alexander Atanasov <alexander.atana...@virtuozzo.com>
---
drivers/md/dm-ploop-cmd.c | 39 ++++++++++++++++++++++++++++++++-------
drivers/md/dm-ploop-map.c | 37 ++++++++++++++++++++++++++++---------
drivers/md/dm-ploop.h | 7 ++++++-
3 files changed, 66 insertions(+), 17 deletions(-)
diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
index d2eb4797ab6e..7d79b900eb1a 100644
--- a/drivers/md/dm-ploop-cmd.c
+++ b/drivers/md/dm-ploop-cmd.c
@@ -286,6 +286,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
struct md_page *md;
bool is_locked;
int ret = 0;
+ int tries = 5;
+ int add_for_wb = 0;
dst_clu = cmd->resize.dst_clu;
@@ -308,6 +310,7 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
}
spin_unlock_irq(&ploop->bat_lock);
+reread:
/* Read full clu sync */
ret = ploop_read_cluster_sync(ploop, pio, dst_clu);
if (ret < 0) {
@@ -316,8 +319,13 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop,
}
ret = ploop_prepare_reloc_index_wb(ploop, &md, clu, &new_dst,
- ploop_top_delta(ploop)->file);
+ ploop_top_delta(ploop)->file,
+ &add_for_wb);
if (ret < 0) {
+ if (ret == -EBUSY && tries-- > 0) {
+ PL_ERR("md0 busy, retry:%d\n", tries);
+ goto reread;
+ }
PL_ERR("reloc: can't prepare it: %d", ret);
goto out;
}
@@ -332,13 +340,16 @@ static int ploop_grow_relocate_cluster(struct ploop
*ploop,
goto out;
}
- set_bit(MD_WRITEBACK, &md->status);
init_completion(&comp);
piwb->comp = ∁
piwb->comp_bi_status = &bi_status;
/* Write new index on disk */
- ploop_index_wb_submit(ploop, piwb);
+ ploop_disable_writeback_delay(ploop);
+ if (add_for_wb)
+ ploop_add_dirty_for_wb(ploop, md);
+ clear_bit(MD_UPDATING, &md->status);
wait_for_completion(&comp);
+ ploop_enable_writeback_delay(ploop);
ret = blk_status_to_errno(bi_status);
if (ret) {
@@ -378,12 +389,22 @@ static int ploop_grow_update_header(struct ploop *ploop,
struct md_page *md;
u64 sectors;
int ret;
+ int tries = 5;
+ int add_for_wb = false;
+retry:
/* hdr is in the same page as bat_entries[0] index */
ret = ploop_prepare_reloc_index_wb(ploop, &md, 0, NULL,
- ploop_top_delta(ploop)->file);
- if (ret)
+ ploop_top_delta(ploop)->file,
+ &add_for_wb);
+ if (ret) {
+ if (ret == -EBUSY && tries-- > 0) {
+ PL_ERR("md0 busy, retry:%d\n", tries);
+ schedule();
+ goto retry;
+ }
return ret;
+ }
piwb = md->piwb;
size = (PLOOP_MAP_OFFSET + cmd->resize.nr_bat_entries);
@@ -398,12 +419,16 @@ static int ploop_grow_update_header(struct ploop *ploop,
offset = hdr->m_FirstBlockOffset = cpu_to_le32(first_block_off);
kunmap_local(hdr);
- set_bit(MD_WRITEBACK, &md->status);
init_completion(&comp);
piwb->comp = ∁
piwb->comp_bi_status = &bi_status;
- ploop_index_wb_submit(ploop, piwb);
+
+ ploop_disable_writeback_delay(ploop);
+ if (add_for_wb)
I think this can be very dangerous. If add_for_wb is 0, then we have no
control over metadata writeback.
In this function we rely on piwb->compl, which we set before firing the
writeback. But if we do not fire the writeback ourselves, we have no
guarantee that piwb is valid at the time we set compl to it, as someone
may already added it to the list and it was already completed and freed.
Also I think it is not valid to work with piwb->bat_page for the same
reason: it should be added to the writeback list ONLY after we did our
manipulations, and we have no control over it if add_for_wb == 0
+ ploop_add_dirty_for_wb(ploop, md);
+ clear_bit(MD_UPDATING, &md->status);
wait_for_completion(&comp);
+ ploop_enable_writeback_delay(ploop);
ret = blk_status_to_errno(bi_status);
if (!ret) {
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index ef280a8b0f90..04e71c851b64 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -613,11 +613,6 @@ static void ploop_unlink_completed_pio(struct ploop
*ploop, struct pio *pio)
ploop_dispatch_pios(ploop, NULL, &pio_list);
}
-static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md)
-{
- llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
-}
-
static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md)
{
bool new = false;
@@ -1031,6 +1026,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop,
struct md_page *md,
struct pio *pio;
map_index_t *to;
u8 level;
+ int ret = -ENOMEM;
lockdep_assert_held(&ploop->bat_lock);
@@ -1050,6 +1046,9 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md,
if (WARN_ON(md->piwb)) {
PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n",
md, md->piwb, md->piwb->type, type);
+ spin_unlock(&md->md_lock);
+ ret = -EBUSY;
+ goto err;
}
md->piwb = piwb;
piwb->md = md;
@@ -1094,7 +1093,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop,
struct md_page *md,
return 0;
err:
ploop_free_piwb(piwb);
- return -ENOMEM;
+ return ret;
}
void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
@@ -2692,31 +2691,45 @@ static void ploop_handle_cleanup(struct ploop *ploop,
struct pio *pio)
* another index instead of existing. If so, management of
* old bat_entries[@clu] and of related holes_bitmap bit
* is caller duty.
+ * Caller must clear MD_UPDATING and comply to add_for_wb
*/
int ploop_prepare_reloc_index_wb(struct ploop *ploop,
struct md_page **ret_md,
u32 clu, u32 *dst_clu,
- struct file *file)
+ struct file *file,
+ int *add_for_wb)
{
enum piwb_type type = PIWB_TYPE_ALLOC;
u32 page_id = ploop_bat_clu_to_page_nr(clu);
struct md_page *md = ploop_md_page_find(ploop, page_id);
struct ploop_index_wb *piwb;
int err;
+ bool add_to_wblist;
if (dst_clu)
type = PIWB_TYPE_RELOC;
err = -EIO;
+
+
extra space
spin_lock_irq(&ploop->bat_lock);
- if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK,
&md->status)) {
- PL_ERR("Unexpected md status: %lx", md->status);
+ spin_lock(&md->md_lock);
+ if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK,
&md->status)
+ || test_bit(MD_UPDATING, &md->status)) {
+ err = -EBUSY;
+ spin_unlock(&md->md_lock);
goto out_error;
+ } else {
+ set_bit(MD_UPDATING, &md->status);
}
+ spin_unlock(&md->md_lock);
+
err = ploop_prepare_bat_update(ploop, md, type);
if (err)
goto out_error;
+ add_to_wblist = ploop_md_make_dirty(ploop, md);
I think add_to_wblist will always be true here. We take bat_lock before
test_bit(MD_DIRTY) and do not release it until ploop_md_make_dirty().
And the only way another thread can set MD_DIRTY is also via
ploop_md_make_dirty(), which requires bat_lock to be taken, which we are
still holding
+
piwb = md->piwb;
if (dst_clu) {
@@ -2734,12 +2747,18 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop,
spin_unlock_irq(&ploop->bat_lock);
*ret_md = md;
+ *add_for_wb = add_to_wblist ? 1 : 0;
+
return 0;
out_reset:
ploop_break_bat_update(ploop, md, piwb);
out_error:
+ if (add_to_wblist)
+ clear_bit(MD_DIRTY, &md->status);
+ clear_bit(MD_UPDATING, &md->status);
spin_unlock_irq(&ploop->bat_lock);
+
return err;
}
ALLOW_ERROR_INJECTION(ploop_prepare_reloc_index_wb, ERRNO);
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 46450cac8c7d..db4d92c9679a 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -578,6 +578,11 @@ static inline const char *ploop_device_name(struct ploop
*ploop)
return ploop->ti->table->md->disk->disk_name;
}
+static inline void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md)
+{
+ llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist);
+}
+
#define PL_FMT(fmt) "ploop: %s: " fmt "\n"
#define PL_ERR(fmt, ...) pr_err(PL_FMT(fmt), ploop_device_name(ploop),
##__VA_ARGS__)
#define PL_ERR_ONCE(fmt, ...) pr_err_once(PL_FMT(fmt),
ploop_device_name(ploop), ##__VA_ARGS__)
@@ -612,7 +617,7 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop,
u32 dst_clu,
struct pio *pio, u8 level);
extern int ploop_prepare_reloc_index_wb(struct ploop *ploop,
struct md_page **ret_md, u32 clu, u32
*dst_clu,
- struct file *file);
+ struct file *file, int *add_for_wb);
extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md,
struct ploop_index_wb *piwb);
extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *);
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel