The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at g...@bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.6 ------> commit 03c4d103e5213f72087c7ce295b0dfcb66ef47de Author: Alexander Atanasov <alexander.atana...@virtuozzo.com> Date: Fri Jan 24 17:36:32 2025 +0200
dm-ploop: make filespace preallocations async Move file space allocation into a separate thread and try to preallocate space in advance. On each cluster allocation, check whether the next allocation may require file space allocation and, if so, trigger the allocator thread to perform it in the background. If we try to allocate a cluster and no space is available, request file space allocation and wait for it to complete. https://virtuozzo.atlassian.net/browse/VSTOR-97740 Signed-off-by: Alexander Atanasov <alexander.atana...@virtuozzo.com> ====== Patchset description: ploop: optimistations and scalling Ploop processes requests in different threads in parallel where possible, which results in a significant improvement in performance and makes further optimizations possible. Known bugs: - delayed metadata writeback is not working and is missing error handling - patch to disable it until fixed - fast path is not working - causes rcu lockups - patch to disable it Further improvements: - optimize md pages lookups Alexander Atanasov (50): dm-ploop: md_pages map all pages at creation time dm-ploop: Use READ_ONCE/WRITE_ONCE to access md page data dm-ploop: fsync after all pios are sent dm-ploop: move md status to use proper bitops dm-ploop: convert wait_list and wb_batch_llist to use lockless lists dm-ploop: convert enospc handling to use lockless lists dm-ploop: convert suspended_pios list to use lockless list dm-ploop: convert the rest of the lists to use llist variant dm-ploop: combine processing of pios thru prepare list and remove fsync worker dm-ploop: move from wq to kthread dm-ploop: move preparations of pios into the caller from worker dm-ploop: fast path execution for reads dm-ploop: do not use a wrapper for set_bit to make a page writeback dm-ploop: BAT use only one list for writeback dm-ploop: make md writeback timeout to be per page dm-ploop: add interface to disable bat writeback delay dm-ploop: convert wb_batch_list to lockless variant dm-ploop: convert high_prio to status 
dm-ploop: split cow processing into two functions dm-ploop: convert md page rw lock to spin lock dm-ploop: convert bat_rwlock to bat_lock spinlock dm-ploop: prepare bat updates under bat_lock dm-ploop: make ploop_bat_write_complete ready for parallel pio completion dm-ploop: make ploop_submit_metadata_writeback return number of requests sent dm-ploop: introduce pio runner threads dm-ploop: add pio list ids to be used when passing pios to runners dm-ploop: process pios via runners dm-ploop: disable metadata writeback delay dm-ploop: disable fast path dm-ploop: use lockless lists for chained cow updates list dm-ploop: use lockless lists for data ready pios dm-ploop: give runner threads better name dm-ploop: resize operation - add holes bitmap locking dm-ploop: remove unnecessary operations dm-ploop: use filp per thread dm-ploop: catch if we try to advance pio past bio end dm-ploop: support REQ_FUA for data pios dm-ploop: proplerly access nr_bat_entries dm-ploop: fix locking and improve error handling when submitting pios dm-ploop: fix how ENOTBLK is handled dm-ploop: sync when suspended or stopping dm-ploop: rework bat completion logic dm-ploop: rework logic in pio processing dm-ploop: end fsync pios in parallel dm-ploop: make filespace preallocations async dm-ploop: resubmit enospc pios from dispatcher thread dm-ploop: dm-ploop: simplify discard completion dm-ploop: use GFP_ATOMIC instead of GFP_NOIO dm-ploop: fix locks used in mixed context dm-ploop: fix how current flags are managed inside threads Andrey Zhadchenko (13): dm-ploop: do not flush after metadata writes dm-ploop: set IOCB_DSYNC on all FUA requests dm-ploop: remove extra ploop_cluster_is_in_top_delta() dm-ploop: introduce per-md page locking dm-ploop: reduce BAT accesses on discard completion dm-ploop: simplify llseek dm-ploop: speed up ploop_prepare_bat_update() dm-ploop: make new allocations immediately visible in BAT dm-ploop: drop ploop_cluster_is_in_top_delta() dm-ploop: do not wait for BAT update 
for non-FUA requests dm-ploop: add delay for metadata writeback dm-ploop: submit all postponed metadata on REQ_OP_FLUSH dm-ploop: handle REQ_PREFLUSH Feature: dm-ploop: ploop target driver --- drivers/md/dm-ploop-map.c | 194 +++++++++++++++++++++++++++++++++++++++---- drivers/md/dm-ploop-target.c | 13 +++ drivers/md/dm-ploop.h | 7 ++ 3 files changed, 198 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index cb25255e5bf4..00929455fcf5 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -378,6 +378,14 @@ static void ploop_schedule_work(struct ploop *ploop) wake_up_process(ploop->kt_worker->task); } +void ploop_req_prealloc(struct ploop *ploop, loff_t newlen) +{ + lockdep_assert_held(&ploop->bat_lock); + + ploop->prealloc_size += newlen; + wake_up_process(ploop->kt_allocator->task); +} + static void ploop_dispatch_pio(struct ploop *ploop, struct pio *pio) { struct llist_head *list = &ploop->pios[pio->queue_list_id]; @@ -1136,16 +1144,13 @@ ALLOW_ERROR_INJECTION(ploop_find_dst_clu_bit, ERRNO); static int ploop_truncate_prealloc_safe(struct ploop *ploop, struct ploop_delta *delta, - loff_t len, struct file *file, + loff_t old_len, loff_t new_len, struct file *file, const char *func) { - loff_t old_len = delta->file_size; - loff_t new_len = len; int ret; if (new_len <= old_len) return 0; - new_len = ALIGN(new_len, PREALLOC_SIZE); if (!ploop->falloc_new_clu) ret = vfs_truncate2(&file->f_path, new_len, file); @@ -1156,12 +1161,104 @@ static int ploop_truncate_prealloc_safe(struct ploop *ploop, return ret; } - delta->file_size = new_len; - delta->file_preallocated_area_start = len; return 0; } ALLOW_ERROR_INJECTION(ploop_truncate_prealloc_safe, ERRNO); +/* + * Always update prealloc_size and prealloc_in_progress under lock + * + * every allocation checks if next will need space and requests + * preallocation + */ +static int ploop_preallocate_cluster(struct ploop *ploop, struct file *file) +{ + struct 
ploop_delta *top = ploop_top_delta(ploop); + loff_t end; + unsigned long flags; + int ret, more = 0; + +prealloc_more: + spin_lock_irqsave(&ploop->bat_lock, flags); + ploop->prealloc_in_progress = ploop->prealloc_size; + end = top->file_size + ploop->prealloc_in_progress; + loff_t new_len = ALIGN(end, ploop->prealloc_in_progress); + ploop->prealloc_size = 0; + if (!ploop->prealloc_in_progress) + new_len = 0; + spin_unlock_irqrestore(&ploop->bat_lock, flags); + if (!new_len) + return 0; + + ret = ploop_truncate_prealloc_safe(ploop, top, top->file_size, + new_len, file, __func__); + if (ret) { + PL_ERR("Failed to preallocate space: %d\n", ret); + goto out; + } + + /* here must be the only place to change file_size */ + spin_lock_irqsave(&ploop->bat_lock, flags); + if (top->file_size < new_len) { + top->file_size = new_len; + } else { + PL_ERR("unexpected file size change\n"); + } + if (ploop->prealloc_size) + more = 1; + spin_unlock_irqrestore(&ploop->bat_lock, flags); + if (more) { + more = 0; + goto prealloc_more; + } + +out: + spin_lock_irqsave(&ploop->bat_lock, flags); + ploop->prealloc_in_progress = 0; + ploop->prealloc_size = 0; + spin_unlock_irqrestore(&ploop->bat_lock, flags); + + /* notify if someone is waiting */ + wake_up_interruptible(&ploop->dispatcher_wq_prealloc); + + return ret; +} +ALLOW_ERROR_INJECTION(ploop_preallocate_cluster, ERRNO); + +void ploop_should_prealloc(struct ploop *ploop, struct file *file) +{ + struct ploop_delta *top = ploop_top_delta(ploop); + u32 dst_clu; + u32 clu_size = CLU_SIZE(ploop); + loff_t pos, end; + unsigned long flags; + + spin_lock_irqsave(&ploop->bat_lock, flags); + if (ploop_find_dst_clu_bit(ploop, &dst_clu) < 0) { + spin_unlock_irqrestore(&ploop->bat_lock, flags); + return; + } + + pos = CLU_TO_POS(ploop, dst_clu); + end = pos + clu_size; + if (end > top->file_preallocated_area_start - (PREALLOC_SIZE/2)) { + ploop_req_prealloc(ploop, PREALLOC_SIZE); + } + spin_unlock_irqrestore(&ploop->bat_lock, flags); +} + +static 
int ploop_pending_prealloc(struct ploop *ploop) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&ploop->bat_lock, flags); + ret = !ploop->prealloc_size && !ploop->prealloc_in_progress; + spin_unlock_irqrestore(&ploop->bat_lock, flags); + + return ret; +} + static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file *file) { struct ploop_delta *top = ploop_top_delta(ploop); @@ -1169,6 +1266,8 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file loff_t off, pos, end, old_size; unsigned long flags; int ret; + int retry_cnt = 0; + loff_t prealloc_start; spin_lock_irqsave(&ploop->bat_lock, flags); if (ploop_find_dst_clu_bit(ploop, dst_clu) < 0) { @@ -1182,15 +1281,23 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file * We only care to clear what find got. parallel set is ok. */ ploop_hole_clear_bit(*dst_clu, ploop); - spin_unlock_irqrestore(&ploop->bat_lock, flags); + old_size = top->file_size; + prealloc_start = top->file_preallocated_area_start; pos = CLU_TO_POS(ploop, *dst_clu); end = pos + clu_size; - old_size = top->file_size; + off = min_t(loff_t, old_size, end); + spin_unlock_irqrestore(&ploop->bat_lock, flags); + + if (pos < prealloc_start) { + if (end + clu_size > + top->file_preallocated_area_start - (PREALLOC_SIZE/2)) { + spin_lock_irqsave(&ploop->bat_lock, flags); + ploop_req_prealloc(ploop, PREALLOC_SIZE); + spin_unlock_irqrestore(&ploop->bat_lock, flags); + } - if (pos < top->file_preallocated_area_start) { /* Clu at @pos may contain dirty data */ - off = min_t(loff_t, old_size, end); if (!ploop->falloc_new_clu) ret = ploop_punch_hole(file, pos, off - pos); else @@ -1210,16 +1317,33 @@ static int ploop_allocate_cluster(struct ploop *ploop, u32 *dst_clu, struct file } } +retry_alloc: + spin_lock_irqsave(&ploop->bat_lock, flags); + /* size can change from parallel alloc */ + old_size = top->file_size; if (end > old_size) { - ret = 
ploop_truncate_prealloc_safe(ploop, top, end, file, __func__); - if (ret) { + ploop_req_prealloc(ploop, PREALLOC_SIZE); + spin_unlock_irqrestore(&ploop->bat_lock, flags); + + wait_event_interruptible(ploop->dispatcher_wq_prealloc, + ploop_pending_prealloc(ploop)); + + spin_lock_irqsave(&ploop->bat_lock, flags); + if (end > top->file_size) { + PL_ERR("Prealloc failed expected: %lld got: %lld\n", + end, top->file_size); + /* If allocation fails for some reason retry 3 times */ + if (retry_cnt++ < 3) { + spin_unlock_irqrestore(&ploop->bat_lock, flags); + goto retry_alloc; + } ploop_hole_set_bit(*dst_clu, ploop); - return ret; + spin_unlock_irqrestore(&ploop->bat_lock, flags); + return -EIO; } } - if (end > top->file_preallocated_area_start) - top->file_preallocated_area_start = end; + spin_unlock_irqrestore(&ploop->bat_lock, flags); return 0; } @@ -2126,6 +2250,45 @@ void do_ploop_work(struct work_struct *ws) do_ploop_run_work(ploop); } +int ploop_allocator(void *data) +{ + struct ploop_worker *worker = data; + struct ploop *ploop = worker->ploop; + unsigned int old_flags = current->flags; + int ret; + struct file *file; + + for (;;) { + /* we do not care about exact number here - risk one extra schedule wrt taking the lock */ + if (ploop->prealloc_size) { + __set_current_state(TASK_RUNNING); + file = ploop_top_delta(ploop)->file; + current->flags |= PF_IO_THREAD|PF_LOCAL_THROTTLE|PF_MEMALLOC_NOIO; + ret = ploop_preallocate_cluster(ploop, file); + if (ret) { + /* + * threads requesting prealloc must check + * if size actually changed + */ + /* ENOSPC is only for fallocate */ + pr_err_ratelimited( + PL_FMT("failed to allocate space: %d\n"), + ploop_device_name(ploop), ret); + } + wake_up_interruptible(&ploop->dispatcher_wq_prealloc); + current->flags = old_flags; + } + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + + return 0; +} + int ploop_pio_runner(void *data) { struct 
ploop_worker *worker = data; @@ -2189,7 +2352,6 @@ int ploop_pio_runner(void *data) case PLOOP_LIST_DISCARD: ploop_process_one_discard_pio(ploop, pio); break; - // XXX: make it list MDWB case PLOOP_LIST_INVALID: /* resubmit sets the list id to invalid */ case PLOOP_LIST_WRITEBACK: /* Write back pio */ ploop_submit_rw_mapped(ploop, pio); diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c index 61190f2f7eae..641950031fe3 100644 --- a/drivers/md/dm-ploop-target.c +++ b/drivers/md/dm-ploop-target.c @@ -189,12 +189,14 @@ static void ploop_destroy(struct ploop *ploop) /* waits for the thread to stop */ kthread_stop(ploop->kt_worker->task); + kthread_stop(ploop->kt_allocator->task); WARN_ON(!llist_empty(&ploop->pios[PLOOP_LIST_PREPARE])); WARN_ON(!llist_empty(&ploop->llresubmit_pios)); WARN_ON(!llist_empty(&ploop->enospc_pios)); kfree(ploop->kt_runners); kfree(ploop->kt_worker); + kfree(ploop->kt_allocator); } for (i = 0; i < 2; i++) @@ -559,11 +561,18 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv) init_waitqueue_head(&ploop->dispatcher_wq_data); init_waitqueue_head(&ploop->dispatcher_wq_fsync); + init_waitqueue_head(&ploop->dispatcher_wq_prealloc); + ploop->prealloc_size = 0; + ploop->prealloc_in_progress = 0; ploop->kt_worker = ploop_worker_create(ploop, ploop_worker, "d", 0); if (!ploop->kt_worker) goto err; + ploop->kt_allocator = ploop_worker_create(ploop, ploop_allocator, "a", 0); + if (!ploop->kt_allocator) + goto err; + /* make it a param = either module or cpu based or dev req queue */ #define PLOOP_PIO_RUNNERS nr_cpu_ids ploop->kt_runners = kcalloc(PLOOP_PIO_RUNNERS, sizeof(struct kt_worker *), GFP_KERNEL); @@ -590,6 +599,10 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->flush_supported = true; ti->num_discard_bios = 1; ti->discards_supported = true; + + if (ploop->nr_deltas > 0) + ploop_should_prealloc(ploop, ploop_top_delta(ploop)->file); + return 0; err: diff --git 
a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h index 7dbe8819acf7..edf9b1448887 100644 --- a/drivers/md/dm-ploop.h +++ b/drivers/md/dm-ploop.h @@ -212,6 +212,7 @@ struct ploop { struct ploop_worker **kt_runners; /* pio runners */ unsigned int nkt_runners; struct ploop_worker *last_used_runner; + struct ploop_worker *kt_allocator; /* allocator thread */ struct completion inflight_bios_ref_comp; struct percpu_ref inflight_bios_ref[2]; bool inflight_ref_comp_pending; @@ -259,6 +260,10 @@ struct ploop { struct timer_list enospc_timer; bool event_enospc; + + loff_t prealloc_size; + loff_t prealloc_in_progress; + struct wait_queue_head dispatcher_wq_prealloc; }; #define ploop_blk_queue(p) ((p)->ti->table->md->queue) @@ -626,9 +631,11 @@ extern loff_t ploop_llseek_hole(struct dm_target *ti, loff_t offset, int whence) extern int ploop_worker(void *data); extern int ploop_pio_runner(void *data); +extern int ploop_allocator(void *data); extern void ploop_disable_writeback_delay(struct ploop *ploop); extern void ploop_enable_writeback_delay(struct ploop *ploop); +extern void ploop_should_prealloc(struct ploop *ploop, struct file *file); #endif /* __DM_PLOOP_H */ _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel