On Mon, Mar 29, 2021 at 01:06:59PM -0700, Axel Rasmussen wrote:
> Previously, the CONTINUE path shared too much of its code with COPY and
> ZEROPAGE, and as a result manipulated state in several invalid ways:
> 
> - Previously, we unconditionally called shmem_inode_acct_block. In the
>   continue case, we're looking up an existing page, which was already
>   accounted for when it was allocated, so accounting it again here
>   double-counts and eventually leaks blocks.
> 
> - Previously, we made the pte writable whenever the VMA was writable.
>   However, for continue, consider this case:
> 
>   1. A tmpfs file was created
>   2. The non-UFFD-registered side mmap()-s with MAP_SHARED
>   3. The UFFD-registered side mmap()-s with MAP_PRIVATE
> 
>   In this case, even though the UFFD-registered VMA may be writable, we
>   still want CoW behavior. So, check for this case and don't make the
>   pte writable. (A sketch of this setup follows this list.)
> 
> - The offset / max_off checking doesn't necessarily hurt anything, but
>   it's not needed in the CONTINUE case, so skip it.
> 
> - Previously, we unconditionally called ClearPageDirty() in the error
>   path. In the continue case though, since this is an existing page, it
>   might have already been dirty before we started touching it. So,
>   remember whether or not it was dirty before we set_page_dirty(), and
>   only clear the bit if it wasn't dirty before.
> 
> - Previously, we unconditionally removed the page from the page cache in
>   the error path. But in the continue case, we didn't add it - it was
>   already there because the page is present in some second
>   (non-UFFD-registered) mapping. So, removing it is invalid.
> 
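> To make the MAP_SHARED/MAP_PRIVATE scenario above concrete, here is a
> minimal userspace sketch (the path, size, and missing error handling
> are illustrative only):
> 
>     #include <fcntl.h>
>     #include <sys/mman.h>
>     #include <unistd.h>
> 
>     int main(void)
>     {
>             /* 1. Create a tmpfs-backed file. */
>             int fd = open("/dev/shm/minor-fault-test",
>                           O_RDWR | O_CREAT, 0600);
>             ftruncate(fd, 4096);
> 
>             /* 2. Non-UFFD-registered side: MAP_SHARED. Faulting here
>              * puts the page into the page cache. */
>             char *shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
>                                 MAP_SHARED, fd, 0);
>             shared[0] = 1;
> 
>             /* 3. UFFD-registered side: MAP_PRIVATE. Even though this
>              * VMA is writable, writes through it must CoW instead of
>              * modifying the shared page, so UFFDIO_CONTINUE must not
>              * install a writable pte here. */
>             char *priv = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
>                               MAP_PRIVATE, fd, 0);
>             (void)priv;
>             return 0;
>     }
> 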
> Because the error handling issues are easy to exercise in the selftest,
> make a small modification there to do so.
> 
> Finally, refactor shmem_mcopy_atomic_pte a bit. By this point, we've
> added a lot of "if (!is_continue)"-s everywhere. It's cleaner to just
> check for that mode first thing, and then "goto" down to where the parts
> we actually want are. This leaves the code in between cleaner.
> 
> Changes since v1:
> - Refactor to skip ahead with goto, instead of adding several more
>   "if (!is_continue)".
> - Fix unconditional ClearPageDirty().
> - Don't pte_mkwrite() when is_continue && !VM_SHARED.
> 
> Fixes: 00da60b9d0a0 ("userfaultfd: support minor fault handling for shmem")
> Signed-off-by: Axel Rasmussen <axelrasmus...@google.com>
> ---
>  mm/shmem.c                               | 67 ++++++++++++++----------
>  tools/testing/selftests/vm/userfaultfd.c | 12 +++++
>  2 files changed, 51 insertions(+), 28 deletions(-)
> 
> diff --git a/mm/shmem.c b/mm/shmem.c
> index d2e0e81b7d2e..8ab1f1f29987 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2378,17 +2378,22 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>       pte_t _dst_pte, *dst_pte;
>       int ret;
>       pgoff_t offset, max_off;
> -
> -     ret = -ENOMEM;
> -     if (!shmem_inode_acct_block(inode, 1))
> -             goto out;
> +     int writable;
> +     bool was_dirty;
>  
>       if (is_continue) {
>               ret = -EFAULT;
>               page = find_lock_page(mapping, pgoff);
>               if (!page)
> -                     goto out_unacct_blocks;
> -     } else if (!*pagep) {
> +                     goto out;
> +             goto install_ptes;
> +     }
> +
> +     ret = -ENOMEM;
> +     if (!shmem_inode_acct_block(inode, 1))
> +             goto out;
> +
> +     if (!*pagep) {
>               page = shmem_alloc_page(gfp, info, pgoff);
>               if (!page)
>                       goto out_unacct_blocks;
> @@ -2415,13 +2420,11 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>               *pagep = NULL;
>       }
>  
> -     if (!is_continue) {
> -             VM_BUG_ON(PageSwapBacked(page));
> -             VM_BUG_ON(PageLocked(page));
> -             __SetPageLocked(page);
> -             __SetPageSwapBacked(page);
> -             __SetPageUptodate(page);
> -     }
> +     VM_BUG_ON(PageSwapBacked(page));
> +     VM_BUG_ON(PageLocked(page));
> +     __SetPageLocked(page);
> +     __SetPageSwapBacked(page);
> +     __SetPageUptodate(page);
>  
>       ret = -EFAULT;
>       offset = linear_page_index(dst_vma, dst_addr);
> @@ -2429,16 +2432,18 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>       if (unlikely(offset >= max_off))
>               goto out_release;
>  
> -     /* If page wasn't already in the page cache, add it. */
> -     if (!is_continue) {
> -             ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
> -                                           gfp & GFP_RECLAIM_MASK, dst_mm);
> -             if (ret)
> -                     goto out_release;
> -     }
> +     ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
> +                                   gfp & GFP_RECLAIM_MASK, dst_mm);
> +     if (ret)
> +             goto out_release;
>  
> +install_ptes:
>       _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> -     if (dst_vma->vm_flags & VM_WRITE)
> +     /* For CONTINUE on a non-shared VMA, don't pte_mkwrite for CoW. */
> +     writable = is_continue && !(dst_vma->vm_flags & VM_SHARED)
> +             ? 0
> +             : dst_vma->vm_flags & VM_WRITE;
> +     if (writable)
>               _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
>       else {
>               /*
> @@ -2448,15 +2453,18 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>                * unconditionally before unlock_page(), but doing it
>                * only if VM_WRITE is not set is faster.
>                */
> +             was_dirty = PageDirty(page);
>               set_page_dirty(page);
>       }
>  
>       dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
>  
> -     ret = -EFAULT;
> -     max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> -     if (unlikely(offset >= max_off))
> -             goto out_release_unlock;
> +     if (!is_continue) {
> +             ret = -EFAULT;
> +             max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> +             if (unlikely(offset >= max_off))
> +                     goto out_release_unlock;
> +     }

I think you're right, but I wouldn't touch this code explicitly: the new
conditional slightly hurts readability, while skipping the check won't
save many CPU cycles.  No strong opinion, though.

>  
>       ret = -EEXIST;
>       if (!pte_none(*dst_pte))
> @@ -2485,13 +2493,16 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>       return ret;
>  out_release_unlock:
>       pte_unmap_unlock(dst_pte, ptl);
> -     ClearPageDirty(page);
> -     delete_from_page_cache(page);
> +     if (!was_dirty)
> +             ClearPageDirty(page);

was_dirty is only initialized in the !writable branch above, so in the
writable case this could be reading a random was_dirty off the stack;
that does not seem right.

Maybe simply drop this ClearPageDirty()?  I'm not sure whether the page
free code will complain, but if not, I think an extra dirty bit is
better than a lost one: the latter corrupts data, while the former
happens anyway, not to mention this is an error path.  Maybe acceptable?
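
For concreteness, a sketch of what the error path could look like with
that suggestion applied (everything else from v2 kept as-is; this is
just my reading, not tested):

out_release_unlock:
        pte_unmap_unlock(dst_pte, ptl);
        /* Leave the dirty bit alone: a spurious dirty bit is harmless,
         * while clearing a pre-existing one could lose data. */
        if (!is_continue)
                delete_from_page_cache(page);
out_release:
        unlock_page(page);
        put_page(page);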

> +     if (!is_continue)
> +             delete_from_page_cache(page);
>  out_release:
>       unlock_page(page);
>       put_page(page);
>  out_unacct_blocks:
> -     shmem_inode_unacct_blocks(inode, 1);
> +     if (!is_continue)
> +             shmem_inode_unacct_blocks(inode, 1);
>       goto out;
>  }
>  #endif /* CONFIG_USERFAULTFD */
> diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
> index f6c86b036d0f..d8541a59dae5 100644
> --- a/tools/testing/selftests/vm/userfaultfd.c
> +++ b/tools/testing/selftests/vm/userfaultfd.c
> @@ -485,6 +485,7 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
>  static void continue_range(int ufd, __u64 start, __u64 len)
>  {
>       struct uffdio_continue req;
> +     int ret;
>  
>       req.range.start = start;
>       req.range.len = len;
> @@ -493,6 +494,17 @@ static void continue_range(int ufd, __u64 start, __u64 len)
>       if (ioctl(ufd, UFFDIO_CONTINUE, &req))
>               err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
>                   (uint64_t)start);
> +
> +     /*
> +      * Error handling within the kernel for continue is subtly different
> +      * from copy or zeropage, so it may be a source of bugs. Trigger an
> +      * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
> +      */
> +     req.mapped = 0;
> +     ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
> +     if (ret >= 0 || req.mapped != -EEXIST)
> +             err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
> +                 ret, req.mapped);
>  }
>  
>  static void *locking_thread(void *arg)
> -- 
> 2.31.0.291.g576ba9dcdaf-goog
> 

-- 
Peter Xu
