On Fri, Nov 29, 2024 at 03:27:19PM -0500, Kent Overstreet wrote:
> When invoked from aio, mm_struct is guaranteed to outlive the request
> since its lifetime is tied to the io_context - but that's not the case
> for io_uring, it's possible that a process could be killed and mm_struct
> goes away while a request is in flight.
> 
> So if we're submitting the rest of the io asynchronously, we may need a
> ref on mm_struct.
> 
> Per Jens, this is not actually a bug because we're not yet flipping on
> FMODE_NOWAIT, meaning io_uring will do the submission from an io_worker
> kthread - but this patch is necessary for safely flipping on
> FMODE_NOWAIT for more efficient submissions in the future.
> 
> Reported-by: Jann Horn <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Signed-off-by: Kent Overstreet <[email protected]>

It turns out this introduces a rare deadlock in exit_aio().

> ---
>  fs/bcachefs/fs-io-direct.c | 42 ++++++++++++++++++++++++++++++++------
>  1 file changed, 36 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
> index 2089c36b5866..b0367b9d9e07 100644
> --- a/fs/bcachefs/fs-io-direct.c
> +++ b/fs/bcachefs/fs-io-direct.c
> @@ -226,6 +226,7 @@ struct dio_write {
>       struct mm_struct                *mm;
>       const struct iovec              *iov;
>       unsigned                        loop:1,
> +                                     have_mm_ref:1,
>                                       extending:1,
>                                       sync:1,
>                                       flush:1;
> @@ -390,6 +391,9 @@ static __always_inline long bch2_dio_write_done(struct 
> dio_write *dio)
>  
>       kfree(dio->iov);
>  
> +     if (dio->have_mm_ref)
> +             mmdrop(dio->mm);
> +
>       ret = dio->op.error ?: ((long) dio->written << 9);
>       bio_put(&dio->op.wbio.bio);
>  
> @@ -529,9 +533,24 @@ static __always_inline long bch2_dio_write_loop(struct 
> dio_write *dio)
>  
>               if (unlikely(dio->iter.count) &&
>                   !dio->sync &&
> -                 !dio->loop &&
> -                 bch2_dio_write_copy_iov(dio))
> -                     dio->sync = sync = true;
> +                 !dio->loop) {
> +                     /*
> +                      * Rest of write will be submitted asynchronously -
> +                      * unless copying the iov fails:
> +                      */
> +                     if (likely(!bch2_dio_write_copy_iov(dio))) {
> +                             /*
> +                              * aio guarantees that mm_struct outlives the
> +                              * request, but io_uring does not
> +                              */
> +                             if (dio->mm) {
> +                                     mmgrab(dio->mm);
> +                                     dio->have_mm_ref = true;
> +                             }
> +                     } else {
> +                             dio->sync = sync = true;
> +                     }
> +             }
>  
>               dio->loop = true;
>               closure_call(&dio->op.cl, bch2_write, NULL, NULL);
> @@ -559,15 +578,25 @@ static __always_inline long bch2_dio_write_loop(struct 
> dio_write *dio)
>  
>  static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
>  {
> -     struct mm_struct *mm = dio->mm;
> +     struct mm_struct *mm = dio->have_mm_ref ? dio->mm: NULL;
>  
>       bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
>  
> -     if (mm)
> +     if (mm) {
> +             if (unlikely(!mmget_not_zero(mm))) {
> +                     /* process exited */
> +                     dio->op.error = -ESRCH;
> +                     bch2_dio_write_done(dio);
> +                     return;
> +             }
> +
>               kthread_use_mm(mm);
> +     }
>       bch2_dio_write_loop(dio);
> -     if (mm)
> +     if (mm) {
>               kthread_unuse_mm(mm);
> +             mmput(mm);
> +     }
>  }
>  
>  static void bch2_dio_write_loop_async(struct bch_write_op *op)
> @@ -641,6 +670,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct 
> iov_iter *iter)
>       dio->mm                 = current->mm;
>       dio->iov                = NULL;
>       dio->loop               = false;
> +     dio->have_mm_ref        = false;
>       dio->extending          = extending;
>       dio->sync               = is_sync_kiocb(req) || extending;
>       dio->flush              = iocb_is_dsync(req) && 
> !c->opts.journal_flush_disabled;
> -- 
> 2.45.2
> 

Reply via email to