Re: uvm_swap: introduce uvm_swap_data_lock

Martin Pieuchot Sun, 16 Jan 2022 10:36:24 -0800

Nice!

On 30/12/21(Thu) 23:38, Theo Buehler wrote:
> The diff below does two things: it adds a uvm_swap_data_lock mutex and
> trades it for the KERNEL_LOCK in uvm_swapisfull() and uvm_swap_markbad()


Why is it enough?  Which fields is the lock protecting in these
function?  Is it `uvmexp.swpages', could that be documented?  

What about `nswapdev'?  Why is the rwlock grabbed before reading it in
sys_swapctl()?i

What about `swpginuse'?

If the mutex/rwlock are used to protect the global `swap_priority' could
that be also documented?  Once this is documented it should be trivial to
see that some places are missing some locking.  Is it intentional?

> The uvm_swap_data_lock protects all swap data structures, so needs to be
> grabbed a few times, many of them already documented in the comments.
> 
> For review, I suggest comparing to what NetBSD did and also going
> through the consumers (swaplist_insert, swaplist_find, swaplist_trim)
> and check that they are properly locked when called, or that there is
> the KERNEL_LOCK() in place when swap data structures are manipulated.

I'd suggest using the KASSERT(rw_write_held()) idiom to further reduce
the differences with NetBSD.

> In swapmount() I introduced locking since that's needed to be able to
> assert that the proper locks are held in swaplist_{insert,find,trim}.

Could the KERNEL_LOCK() in uvm_swap_get() be pushed a bit further down?
What about `uvmexp.nswget' and `uvmexp.swpgonly' in there?

> Index: uvm/uvm_swap.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
> retrieving revision 1.152
> diff -u -p -r1.152 uvm_swap.c
> --- uvm/uvm_swap.c    12 Dec 2021 09:14:59 -0000      1.152
> +++ uvm/uvm_swap.c    30 Dec 2021 15:47:20 -0000
> @@ -44,6 +44,7 @@
>  #include <sys/fcntl.h>
>  #include <sys/extent.h>
>  #include <sys/mount.h>
> +#include <sys/mutex.h>
>  #include <sys/pool.h>
>  #include <sys/syscallargs.h>
>  #include <sys/swap.h>
> @@ -91,6 +92,9 @@
>   *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
>   *    system call and prevents the swap priority list from changing
>   *    while we are in the middle of a system call (e.g. SWAP_STATS).
> + *  - uvm_swap_data_lock (mutex): this lock protects all swap data
> + *    structures including the priority list, the swapdev structures,
> + *    and the swapmap arena.
>   *
>   * each swap device has the following info:
>   *  - swap device in use (could be disabled, preventing future use)
> @@ -212,6 +216,7 @@ LIST_HEAD(swap_priority, swappri);
>  struct swap_priority swap_priority;
>  
>  /* locks */
> +struct mutex uvm_swap_data_lock = MUTEX_INITIALIZER(IPL_NONE);
>  struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
>  
>  /*
> @@ -442,7 +447,7 @@ uvm_swap_finicrypt_all(void)
>  /*
>   * swaplist_insert: insert swap device "sdp" into the global list
>   *
> - * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
> + * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
>   * => caller must provide a newly malloc'd swappri structure (we will
>   *   FREE it if we don't need it... this it to prevent malloc blocking
>   *   here while adding swap)
> @@ -452,6 +457,9 @@ swaplist_insert(struct swapdev *sdp, str
>  {
>       struct swappri *spp, *pspp;
>  
> +     rw_assert_wrlock(&swap_syscall_lock);
> +     MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
> +
>       /*
>        * find entry at or after which to insert the new device.
>        */
> @@ -493,7 +501,7 @@ swaplist_insert(struct swapdev *sdp, str
>   * swaplist_find: find and optionally remove a swap device from the
>   *   global list.
>   *
> - * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
> + * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
>   * => we return the swapdev we found (and removed)
>   */
>  struct swapdev *
> @@ -502,6 +510,9 @@ swaplist_find(struct vnode *vp, boolean_
>       struct swapdev *sdp;
>       struct swappri *spp;
>  
> +     rw_assert_wrlock(&swap_syscall_lock);
> +     MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
> +
>       /*
>        * search the lists for the requested vp
>        */
> @@ -524,13 +535,16 @@ swaplist_find(struct vnode *vp, boolean_
>   * swaplist_trim: scan priority list for empty priority entries and kill
>   *   them.
>   *
> - * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
> + * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
>   */
>  void
>  swaplist_trim(void)
>  {
>       struct swappri *spp, *nextspp;
>  
> +     rw_assert_wrlock(&swap_syscall_lock);
> +     MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
> +
>       LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
>               if (!TAILQ_EMPTY(&spp->spi_swapdev))
>                       continue;
> @@ -543,7 +557,7 @@ swaplist_trim(void)
>   * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
>   *
>   * => caller must hold swap_syscall_lock
> - * => uvm.swap_data_lock should be unlocked (we may sleep)
> + * => uvm_swap_data_lock should be unlocked (we may sleep)
>   */
>  void
>  swapdrum_add(struct swapdev *sdp, int npages)
> @@ -563,7 +577,7 @@ swapdrum_add(struct swapdev *sdp, int np
>   *   to the "swapdev" that maps that section of the drum.
>   *
>   * => each swapdev takes one big contig chunk of the drum
> - * => caller must hold uvm.swap_data_lock
> + * => caller must hold uvm_swap_data_lock
>   */
>  struct swapdev *
>  swapdrum_getsdp(int pgno)
> @@ -571,6 +585,8 @@ swapdrum_getsdp(int pgno)
>       struct swapdev *sdp;
>       struct swappri *spp;
>  
> +     MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
> +
>       LIST_FOREACH(spp, &swap_priority, spi_swappri) {
>               TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
>                       if (pgno >= sdp->swd_drumoffset &&
> @@ -629,7 +645,7 @@ sys_swapctl(struct proc *p, void *v, reg
>        *
>        * note that the swap_priority list can't change as long
>        * as we are holding the swap_syscall_lock.  we don't want
> -      * to grab the uvm.swap_data_lock because we may fault&sleep during
> +      * to grab the uvm_swap_data_lock because we may fault&sleep during
>        * copyout() and we don't want to be holding that lock then!
>        */
>       if (SCARG(uap, cmd) == SWAP_STATS) {
> @@ -701,12 +717,14 @@ sys_swapctl(struct proc *p, void *v, reg
>                */
>               priority = SCARG(uap, misc);
>               spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
> +             mtx_enter(&uvm_swap_data_lock);
>               if ((sdp = swaplist_find(vp, 1)) == NULL) {
>                       error = ENOENT;
>               } else {
>                       swaplist_insert(sdp, spp, priority);
>                       swaplist_trim();
>               }
> +             mtx_leave(&uvm_swap_data_lock);
>               if (error)
>                       free(spp, M_VMSWAP, sizeof(*spp));
>               break;
> @@ -727,11 +745,8 @@ sys_swapctl(struct proc *p, void *v, reg
>                * trying to enable this device while we are working on
>                * it.
>                */
> +
>               priority = SCARG(uap, misc);
> -             if ((sdp = swaplist_find(vp, 0)) != NULL) {
> -                     error = EBUSY;
> -                     break;
> -             }
>               sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO);
>               spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
>               sdp->swd_flags = SWF_FAKE;      /* placeholder only */
> @@ -745,7 +760,19 @@ sys_swapctl(struct proc *p, void *v, reg
>                       sdp->swd_cred = crdup(p->p_ucred);
>               }
>  
> +             mtx_enter(&uvm_swap_data_lock);
> +             if (swaplist_find(vp, 0) != NULL) {
> +                     error = EBUSY;
> +                     mtx_leave(&uvm_swap_data_lock);
> +                     if (vp->v_type == VREG) {
> +                             crfree(sdp->swd_cred);
> +                     }
> +                     free(sdp, M_VMSWAP, sizeof *sdp);
> +                     free(spp, M_VMSWAP, sizeof *spp);
> +                     break;
> +             }
>               swaplist_insert(sdp, spp, priority);
> +             mtx_leave(&uvm_swap_data_lock);
>  
>               sdp->swd_pathlen = len;
>               sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
> @@ -759,8 +786,10 @@ sys_swapctl(struct proc *p, void *v, reg
>                */
>  
>               if ((error = swap_on(p, sdp)) != 0) {
> +                     mtx_enter(&uvm_swap_data_lock);
>                       (void) swaplist_find(vp, 1);  /* kill fake entry */
>                       swaplist_trim();
> +                     mtx_leave(&uvm_swap_data_lock);
>                       if (vp->v_type == VREG) {
>                               crfree(sdp->swd_cred);
>                       }
> @@ -770,7 +799,9 @@ sys_swapctl(struct proc *p, void *v, reg
>               }
>               break;
>       case SWAP_OFF:
> +             mtx_enter(&uvm_swap_data_lock);
>               if ((sdp = swaplist_find(vp, 0)) == NULL) {
> +                     mtx_leave(&uvm_swap_data_lock);
>                       error = ENXIO;
>                       break;
>               }
> @@ -780,6 +811,7 @@ sys_swapctl(struct proc *p, void *v, reg
>                * can't stop swapping from it (again).
>                */
>               if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
> +                     mtx_leave(&uvm_swap_data_lock);
>                       error = EBUSY;
>                       break;
>               }
> @@ -808,7 +840,7 @@ out:
>   *   SWF_FAKE).
>   *
>   * => we avoid the start of the disk (to protect disk labels)
> - * => caller should leave uvm.swap_data_lock unlocked, we may lock it
> + * => caller should leave uvm_swap_data_lock unlocked, we may lock it
>   *   if needed.
>   */
>  int
> @@ -969,13 +1001,18 @@ swap_on(struct proc *p, struct swapdev *
>       /* now add the new swapdev to the drum and enable. */
>       swapdrum_add(sdp, npages);
>       sdp->swd_npages = size;
> +     mtx_enter(&uvm_swap_data_lock);
>       sdp->swd_flags &= ~SWF_FAKE;    /* going live */
>       sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
>       uvmexp.swpages += size;
> +     mtx_leave(&uvm_swap_data_lock);
>       return (0);
>  
> +     /*
> +      * failure: clean up and return error.
> +      */
> +
>  bad:
> -     /* failure: close device if necessary and return error. */
>       if (vp != rootvp)
>               (void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
>       return (error);
> @@ -989,10 +1026,15 @@ bad:
>  int
>  swap_off(struct proc *p, struct swapdev *sdp)
>  {
> +     int npages = sdp->swd_npages;
>       int error = 0;
>  
> +     rw_assert_wrlock(&swap_syscall_lock);
> +     MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
> +
>       /* disable the swap area being removed */
>       sdp->swd_flags &= ~SWF_ENABLE;
> +     mtx_leave(&uvm_swap_data_lock);
>  
>       /*
>        * the idea is to find all the pages that are paged out to this
> @@ -1005,15 +1047,16 @@ swap_off(struct proc *p, struct swapdev 
>                        sdp->swd_drumoffset + sdp->swd_drumsize) ||
>           amap_swap_off(sdp->swd_drumoffset,
>                         sdp->swd_drumoffset + sdp->swd_drumsize)) {
> -
>               error = ENOMEM;
>       } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
>               error = EBUSY;
>       }
>  
>       if (error) {
> +             mtx_enter(&uvm_swap_data_lock);
>               sdp->swd_flags |= SWF_ENABLE;
> -             return (error);
> +             mtx_leave(&uvm_swap_data_lock);
> +             return error;
>       }
>  
>       /*
> @@ -1029,11 +1072,13 @@ swap_off(struct proc *p, struct swapdev 
>               (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
>       }
>  
> -     uvmexp.swpages -= sdp->swd_npages;
> +     mtx_enter(&uvm_swap_data_lock);
> +     uvmexp.swpages -= npages;
>  
>       if (swaplist_find(sdp->swd_vp, 1) == NULL)
>               panic("swap_off: swapdev not in list");
>       swaplist_trim();
> +     mtx_leave(&uvm_swap_data_lock);
>  
>       /*
>        * free all resources!
> @@ -1067,7 +1112,9 @@ swstrategy(struct buf *bp)
>        * in it (i.e. the blocks we are doing I/O on).
>        */
>       pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT;
> +     mtx_enter(&uvm_swap_data_lock);
>       sdp = swapdrum_getsdp(pageno);
> +     mtx_leave(&uvm_swap_data_lock);
>       if (sdp == NULL) {
>               bp->b_error = EINVAL;
>               bp->b_flags |= B_ERROR;
> @@ -1384,7 +1431,7 @@ sw_reg_iodone_internal(void *xvbp)
>   *   allocate in a priority we "rotate" the tail queue.
>   * => space can be freed with uvm_swap_free
>   * => we return the page slot number in /dev/drum (0 == invalid slot)
> - * => we lock uvm.swap_data_lock
> + * => we lock uvm_swap_data_lock
>   * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
>   */
>  int
> @@ -1404,6 +1451,8 @@ uvm_swap_alloc(int *nslots, boolean_t le
>        * lock data lock, convert slots into blocks, and enter loop
>        */
>       KERNEL_ASSERT_LOCKED();
> +     mtx_enter(&uvm_swap_data_lock);
> +
>  ReTry:       /* XXXMRG */
>       LIST_FOREACH(spp, &swap_priority, spi_swappri) {
>               TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
> @@ -1425,6 +1474,7 @@ ReTry:  /* XXXMRG */
>                       TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
>                       sdp->swd_npginuse += *nslots;
>                       uvmexp.swpginuse += *nslots;
> +                     mtx_leave(&uvm_swap_data_lock);
>                       /* done!  return drum slot number */
>                       return result + sdp->swd_drumoffset;
>               }
> @@ -1437,6 +1487,7 @@ ReTry:  /* XXXMRG */
>       }
>       /* XXXMRG: END HACK */
>  
> +     mtx_leave(&uvm_swap_data_lock);
>       return 0;               /* failed */
>  }
>  
> @@ -1449,10 +1500,10 @@ uvm_swapisfull(void)
>  {
>       int result;
>  
> -     KERNEL_LOCK();
> +     mtx_enter(&uvm_swap_data_lock);
>       KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
>       result = (uvmexp.swpgonly == uvmexp.swpages);
> -     KERNEL_UNLOCK();
> +     mtx_leave(&uvm_swap_data_lock);
>  
>       return result;
>  }
> @@ -1460,14 +1511,14 @@ uvm_swapisfull(void)
>  /*
>   * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
>   *
> - * => we lock uvm.swap_data_lock
> + * => we lock uvm_swap_data_lock
>   */
>  void
>  uvm_swap_markbad(int startslot, int nslots)
>  {
>       struct swapdev *sdp;
>  
> -     KERNEL_LOCK();
> +     mtx_enter(&uvm_swap_data_lock);
>       sdp = swapdrum_getsdp(startslot);
>       if (sdp != NULL) {
>               /*
> @@ -1478,14 +1529,14 @@ uvm_swap_markbad(int startslot, int nslo
>                */
>               sdp->swd_npgbad += nslots;
>       }
> -     KERNEL_UNLOCK();
> +     mtx_leave(&uvm_swap_data_lock);
>  }
>  
>  /*
>   * uvm_swap_free: free swap slots
>   *
>   * => this can be all or part of an allocation made by uvm_swap_alloc
> - * => we lock uvm.swap_data_lock
> + * => we lock uvm_swap_data_lock
>   */
>  void
>  uvm_swap_free(int startslot, int nslots)
> @@ -1506,6 +1557,7 @@ uvm_swap_free(int startslot, int nslots)
>        * lookup and access the extent.
>        */
>       KERNEL_LOCK();
> +     mtx_enter(&uvm_swap_data_lock);
>       sdp = swapdrum_getsdp(startslot);
>       KASSERT(uvmexp.nswapdev >= 1);
>       KASSERT(sdp != NULL);
> @@ -1518,6 +1570,8 @@ uvm_swap_free(int startslot, int nslots)
>  
>       sdp->swd_npginuse -= nslots;
>       uvmexp.swpginuse -= nslots;
> +     mtx_leave(&uvm_swap_data_lock);
> +
>  #ifdef UVM_SWAP_ENCRYPT
>       {
>               int i;
> @@ -1647,7 +1701,9 @@ uvm_swap_io(struct vm_page **pps, int st
>                * XXX - does this information stay the same over the whole
>                * execution of this function?
>                */
> +             mtx_enter(&uvm_swap_data_lock);
>               sdp = swapdrum_getsdp(startslot);
> +             mtx_leave(&uvm_swap_data_lock);
>       }
>  
>       /*
> @@ -1892,13 +1948,11 @@ swapmount(void)
>       char *nam;
>       char path[MNAMELEN + 1];
>  
> -     /*
> -      * No locking here since we happen to know that we will just be called
> -      * once before any other process has forked.
> -      */
>       if (swap_dev == NODEV)
>               return;
>  
> +     rw_enter_write(&swap_syscall_lock);
> +
>  #if defined(NFSCLIENT)
>       if (swap_dev == NETDEV) {
>               extern struct nfs_diskless nfs_diskless;
> @@ -1935,16 +1989,22 @@ gotit:
>  
>       sdp->swd_vp = vp;
>  
> +     mtx_enter(&uvm_swap_data_lock);
>       swaplist_insert(sdp, spp, 0);
> +     mtx_leave(&uvm_swap_data_lock);
>  
>       if (swap_on(curproc, sdp)) {
> +             mtx_enter(&uvm_swap_data_lock);
>               swaplist_find(vp, 1);
>               swaplist_trim();
>               vput(sdp->swd_vp);
> +             mtx_leave(&uvm_swap_data_lock);
> +             rw_exit_write(&swap_syscall_lock);
>               free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
>               free(sdp, M_VMSWAP, sizeof(*sdp));
>               return;
>       }
> +     rw_exit_write(&swap_syscall_lock);
>  }
>  
>  #ifdef HIBERNATE
>

Re: uvm_swap: introduce uvm_swap_data_lock

Reply via email to