On Fri, Apr 11, 2025 at 04:56:56PM -0400, Benjamin Marzinski wrote:
> When using a kthread to delay the IOs, dm-delay would continuously loop,
> checking if IOs were ready to submit. It had a cond_resched() call in
> the loop, but might still loop hundreds of millions of times waiting for
> an IO that was scheduled to be submitted 10s of ms in the future. With
> the change to make dm-delay over zoned devices always use kthreads
> regardless of the length of the delay, this wasted work only gets worse.
> 
> To solve this and still keep roughly the same precision for very short
> delays, dm-delay now calls fsleep() for 1/8th of the smallest non-zero
> delay it will place on IOs, or 1 ms, whichever is smaller. The reason
> that dm-delay doesn't just use the actual expiration time of the next
> delayed IO to calculated the sleep time is that delay_dtr() must wait
> for the kthread to finish before deleting the table. If a zoned device
> with a long delay queued an IO shortly before being suspended and
> removed, the IO would be flushed in delay_presuspend(), but the removing
> the device would still have to wait for the remainder of the long delay.
> This time is now capped at 1 ms.

I just realized that this same issue can occur during suspending. If new
IOs come in while or after the IOs are flushed in delay_presuspend(),
they will wait until the kthread runs again, which is again capped at
1 ms.

> 
> Fixes: 70bbeb29fab09 ("dm delay: for short delays, use kthread instead of 
> timers and wq")
> Signed-off-by: Benjamin Marzinski <bmarz...@redhat.com>
> ---
> This patch is meant to apply on top of Damien Le Moal's "dm-delay:
> Prevent zoned write reordering on suspend" patch. If people think it's
> important to avoid either this much smaller amount of looping or the
> possible 1 ms delay on deleting a table, I can send a patch that uses
> usleep_range_state() and msleep_interruptible() to do an interruptible
> sleep with a duration based on the expiration time of the next delayed
> IO.
> 
>  drivers/md/dm-delay.c | 17 ++++++++++++++---
>  1 file changed, 14 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
> index c665b2ab1115..23027aa3fdca 100644
> --- a/drivers/md/dm-delay.c
> +++ b/drivers/md/dm-delay.c
> @@ -14,11 +14,14 @@
>  #include <linux/bio.h>
>  #include <linux/slab.h>
>  #include <linux/kthread.h>
> +#include <linux/delay.h>
>  
>  #include <linux/device-mapper.h>
>  
>  #define DM_MSG_PREFIX "delay"
>  
> +#define SLEEP_SHIFT 3
> +
>  struct delay_class {
>       struct dm_dev *dev;
>       sector_t start;
> @@ -34,6 +37,7 @@ struct delay_c {
>       struct work_struct flush_expired_bios;
>       struct list_head delayed_bios;
>       struct task_struct *worker;
> +     unsigned int worker_sleep_ns;
>       bool may_delay;
>  
>       struct delay_class read;
> @@ -136,7 +140,8 @@ static int flush_worker_fn(void *data)
>                       schedule();
>               } else {
>                       spin_unlock(&dc->delayed_bios_lock);
> -                     cond_resched();
> +                     if (dc->may_delay)
> +                             fsleep(dc->worker_sleep_ns);
>               }
>       }
>  
> @@ -212,7 +217,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int 
> argc, char **argv)
>  {
>       struct delay_c *dc;
>       int ret;
> -     unsigned int max_delay;
> +     unsigned int max_delay, min_delay;
>       bool is_zoned = false;
>  
>       if (argc != 3 && argc != 6 && argc != 9) {
> @@ -236,7 +241,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int 
> argc, char **argv)
>       ret = delay_class_ctr(ti, &dc->read, argv);
>       if (ret)
>               goto bad;
> -     max_delay = dc->read.delay;
> +     min_delay = max_delay = dc->read.delay;
>       is_zoned = bdev_is_zoned(dc->read.dev->bdev);
>  
>       if (argc == 3) {
> @@ -253,6 +258,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int 
> argc, char **argv)
>       if (ret)
>               goto bad;
>       max_delay = max(max_delay, dc->write.delay);
> +     min_delay = min_not_zero(min_delay, dc->write.delay);
>       is_zoned = is_zoned || bdev_is_zoned(dc->write.dev->bdev);
>  
>       if (argc == 6) {
> @@ -266,6 +272,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int 
> argc, char **argv)
>       if (ret)
>               goto bad;
>       max_delay = max(max_delay, dc->flush.delay);
> +     min_delay = min_not_zero(min_delay, dc->flush.delay);
>       is_zoned = is_zoned || bdev_is_zoned(dc->flush.dev->bdev);
>  
>  out:
> @@ -276,6 +283,10 @@ static int delay_ctr(struct dm_target *ti, unsigned int 
> argc, char **argv)
>        * suspend.
>        */
>       if (max_delay < 50 || is_zoned) {
> +             if (min_delay >> SLEEP_SHIFT)
> +                     dc->worker_sleep_ns = 1000;
> +             else
> +                     dc->worker_sleep_ns = (min_delay * 1000) >> SLEEP_SHIFT;
>               dc->worker = kthread_run(&flush_worker_fn, dc, 
> "dm-delay-flush-worker");
>               if (IS_ERR(dc->worker)) {
>                       ret = PTR_ERR(dc->worker);
> -- 
> 2.48.1
> 


Reply via email to