Diff below brings in & adapts most of the changes from NetBSD's r1.37 of
uvm_pdaemon.c. My motivation for doing this is to untangle the inner
loop of uvmpd_scan_inactive(), which will allow us to split the global
`pageqlock' mutex in a subsequent step.
The idea behind this change is to get rid of the too-complex uvm_pager*
abstraction by checking early if a page is going to be flushed or
swapped to disk. The loop is then clearly divided into two cases which
makes it more readable.
This also opens the door to a better integration between UVM's vnode
layer and the buffer cache.
The main loop of uvmpd_scan_inactive() can be understood as below:
. If a page can be flushed, we can call "uvn_flush()" directly and pass the
PGO_ALLPAGES flag instead of building a cluster beforehand. Note that,
in its current form, uvn_flush() is synchronous.
. If the page needs to be swapped, mark it as PG_PAGEOUT, build a cluster
and once it is full call uvm_swap_put().
Please test this diff, and do not hesitate to play with the `vm.swapencrypt.enable'
sysctl(2).
Index: uvm/uvm_aobj.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
retrieving revision 1.103
diff -u -p -r1.103 uvm_aobj.c
--- uvm/uvm_aobj.c 29 Dec 2021 20:22:06 -0000 1.103
+++ uvm/uvm_aobj.c 24 May 2022 12:31:34 -0000
@@ -143,7 +143,7 @@ struct pool uvm_aobj_pool;
static struct uao_swhash_elt *uao_find_swhash_elt(struct uvm_aobj *, int,
boolean_t);
-static int uao_find_swslot(struct uvm_object *, int);
+int uao_find_swslot(struct uvm_object *, int);
static boolean_t uao_flush(struct uvm_object *, voff_t,
voff_t, int);
static void uao_free(struct uvm_aobj *);
@@ -241,7 +241,7 @@ uao_find_swhash_elt(struct uvm_aobj *aob
/*
* uao_find_swslot: find the swap slot number for an aobj/pageidx
*/
-inline static int
+int
uao_find_swslot(struct uvm_object *uobj, int pageidx)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
Index: uvm/uvm_aobj.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_aobj.h,v
retrieving revision 1.17
diff -u -p -r1.17 uvm_aobj.h
--- uvm/uvm_aobj.h 21 Oct 2020 09:08:14 -0000 1.17
+++ uvm/uvm_aobj.h 24 May 2022 12:31:34 -0000
@@ -60,6 +60,7 @@
void uao_init(void);
int uao_set_swslot(struct uvm_object *, int, int);
+int uao_find_swslot (struct uvm_object *, int);
int uao_dropswap(struct uvm_object *, int);
int uao_swap_off(int, int);
int uao_shrink(struct uvm_object *, int);
Index: uvm/uvm_map.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.291
diff -u -p -r1.291 uvm_map.c
--- uvm/uvm_map.c 4 May 2022 14:58:26 -0000 1.291
+++ uvm/uvm_map.c 24 May 2022 12:31:34 -0000
@@ -3215,8 +3215,9 @@ uvm_object_printit(struct uvm_object *uo
* uvm_page_printit: actually print the page
*/
static const char page_flagbits[] =
- "\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY"
- "\11ZERO\12DEV\15PAGER1\21FREE\22INACTIVE\23ACTIVE\25ANON\26AOBJ"
+ "\20\1BUSY\2WANTED\3TABLED\4CLEAN\5PAGEOUT\6RELEASED\7FAKE\10RDONLY"
+ "\11ZERO\12DEV\13CLEANCHK"
+ "\15PAGER1\21FREE\22INACTIVE\23ACTIVE\25ANON\26AOBJ"
"\27ENCRYPT\31PMAP0\32PMAP1\33PMAP2\34PMAP3\35PMAP4\36PMAP5";
void
Index: uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
retrieving revision 1.166
diff -u -p -r1.166 uvm_page.c
--- uvm/uvm_page.c 12 May 2022 12:48:36 -0000 1.166
+++ uvm/uvm_page.c 24 May 2022 12:32:54 -0000
@@ -960,6 +960,7 @@ uvm_pageclean(struct vm_page *pg)
{
u_int flags_to_clear = 0;
+ KASSERT((pg->pg_flags & PG_PAGEOUT) == 0);
if ((pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) &&
(pg->uobject == NULL || !UVM_OBJ_IS_PMAP(pg->uobject)))
MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
@@ -978,11 +979,14 @@ uvm_pageclean(struct vm_page *pg)
rw_write_held(pg->uanon->an_lock));
/*
- * if the page was an object page (and thus "TABLED"), remove it
- * from the object.
+ * remove page from its object or anon.
*/
- if (pg->pg_flags & PG_TABLED)
+ if (pg->pg_flags & PG_TABLED) {
uvm_pageremove(pg);
+ } else if (pg->uanon != NULL) {
+ pg->uanon->an_page = NULL;
+ pg->uanon = NULL;
+ }
/*
* now remove the page from the queues
@@ -996,10 +1000,6 @@ uvm_pageclean(struct vm_page *pg)
pg->wire_count = 0;
uvmexp.wired--;
}
- if (pg->uanon) {
- pg->uanon->an_page = NULL;
- pg->uanon = NULL;
- }
/* Clean page state bits. */
flags_to_clear |= PQ_ANON|PQ_AOBJ|PQ_ENCRYPT|PG_ZERO|PG_FAKE|PG_BUSY|
@@ -1042,7 +1042,6 @@ void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
struct vm_page *pg;
- struct uvm_object *uobj;
int i;
for (i = 0; i < npgs; i++) {
@@ -1052,35 +1051,17 @@ uvm_page_unbusy(struct vm_page **pgs, in
continue;
}
-#if notyet
- /*
- * XXX swap case in uvm_aio_aiodone() is not holding the lock.
- *
- * This isn't compatible with the PG_RELEASED anon case below.
- */
KASSERT(uvm_page_owner_locked_p(pg));
-#endif
KASSERT(pg->pg_flags & PG_BUSY);
if (pg->pg_flags & PG_WANTED) {
wakeup(pg);
}
if (pg->pg_flags & PG_RELEASED) {
- uobj = pg->uobject;
- if (uobj != NULL) {
- uvm_lock_pageq();
- pmap_page_protect(pg, PROT_NONE);
- /* XXX won't happen right now */
- if (pg->pg_flags & PQ_AOBJ)
- uao_dropswap(uobj,
- pg->offset >> PAGE_SHIFT);
- uvm_pagefree(pg);
- uvm_unlock_pageq();
- } else {
- rw_enter(pg->uanon->an_lock, RW_WRITE);
- uvm_anon_release(pg->uanon);
- }
+ atomic_clearbits_int(&pg->pg_flags, PG_RELEASED);
+ uvm_pagefree(pg);
} else {
+ KASSERT((pg->pg_flags & PG_FAKE) == 0);
atomic_clearbits_int(&pg->pg_flags, PG_WANTED|PG_BUSY);
UVM_PAGE_OWN(pg, NULL);
}
@@ -1099,6 +1080,8 @@ uvm_page_unbusy(struct vm_page **pgs, in
void
uvm_page_own(struct vm_page *pg, char *tag)
{
+ KASSERT((pg->pg_flags & PG_PAGEOUT) == 0);
+
/* gain ownership? */
if (tag) {
if (pg->owner_tag) {
@@ -1216,10 +1199,15 @@ struct vm_page *
uvm_pagelookup(struct uvm_object *obj, voff_t off)
{
/* XXX if stack is too much, handroll */
- struct vm_page pg;
+ struct vm_page p, *pg;
+
+ p.offset = off;
+ pg = RBT_FIND(uvm_objtree, &obj->memt, &p);
- pg.offset = off;
- return RBT_FIND(uvm_objtree, &obj->memt, &pg);
+ KASSERT(pg == NULL || obj->uo_npages != 0);
+ KASSERT(pg == NULL || (pg->pg_flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
+ (pg->pg_flags & PG_BUSY) != 0);
+ return (pg);
}
/*
@@ -1303,7 +1291,9 @@ uvm_pagedeactivate(struct vm_page *pg)
void
uvm_pageactivate(struct vm_page *pg)
{
+#ifdef notyet
KASSERT(uvm_page_owner_locked_p(pg));
+#endif
MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
uvm_pagedequeue(pg);
Index: uvm/uvm_page.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.h,v
retrieving revision 1.68
diff -u -p -r1.68 uvm_page.h
--- uvm/uvm_page.h 12 May 2022 12:48:36 -0000 1.68
+++ uvm/uvm_page.h 24 May 2022 12:31:34 -0000
@@ -138,12 +138,13 @@ struct vm_page {
#define PG_WANTED 0x00000002 /* someone is waiting for page
*/
#define PG_TABLED 0x00000004 /* page is in VP table */
#define PG_CLEAN 0x00000008 /* page has not been modified */
-#define PG_CLEANCHK 0x00000010 /* clean bit has been checked */
+#define PG_PAGEOUT 0x00000010 /* page to be freed for pagedaemon */
#define PG_RELEASED 0x00000020 /* page released while paging */
#define PG_FAKE 0x00000040 /* page is not yet initialized
*/
#define PG_RDONLY 0x00000080 /* page must be mapped read-only */
#define PG_ZERO 0x00000100 /* page is pre-zero'd */
#define PG_DEV 0x00000200 /* page is in device space, lay off */
+#define PG_CLEANCHK 0x00000400 /* clean bit has been checked */
#define PG_PAGER1 0x00001000 /* pager-specific flag */
#define PG_MASK 0x0000ffff
Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.78
diff -u -p -r1.78 uvm_pager.c
--- uvm/uvm_pager.c 18 Feb 2022 09:04:38 -0000 1.78
+++ uvm/uvm_pager.c 24 May 2022 12:33:25 -0000
@@ -745,8 +745,9 @@ uvm_aio_aiodone(struct buf *bp)
int npages = bp->b_bufsize >> PAGE_SHIFT;
struct vm_page *pg, *pgs[MAXPHYS >> PAGE_SHIFT];
struct uvm_object *uobj;
- int i, error;
- boolean_t write, swap;
+ struct rwlock *slock;
+ int i, error, swslot, wanted = 0;
+ boolean_t write, swap, pageout;
KASSERT(npages <= MAXPHYS >> PAGE_SHIFT);
splassert(IPL_BIO);
@@ -768,45 +769,128 @@ uvm_aio_aiodone(struct buf *bp)
goto freed;
}
#endif /* UVM_SWAP_ENCRYPT */
+
+ swslot = 0;
+ slock = NULL;
+ pg = pgs[0];
+ swap = (pg->uanon != NULL && pg->uobject == NULL) ||
+ (pg->pg_flags & PQ_AOBJ) != 0;
+ pageout = (pg->pg_flags & PG_PAGEOUT) != 0;
+ if (!swap) {
+ uobj = pg->uobject;
+ slock = uobj->vmobjlock;
+ rw_enter(slock, RW_WRITE);
+ uvm_lock_pageq();
+ } else if (error) {
+ if (pg->uobject != NULL) {
+ swslot = uao_find_swslot(pg->uobject,
+ pg->offset >> PAGE_SHIFT);
+ } else {
+ swslot = pg->uanon->an_swslot;
+ }
+ KASSERT(swslot);
+ }
+
for (i = 0; i < npages; i++) {
pg = pgs[i];
+ KASSERT(swap || pg->uobject == uobj);
+ KASSERT(pageout ^ ((pg->pg_flags & PG_PAGEOUT) == 0));
- if (i == 0) {
- swap = (pg->pg_flags & PQ_SWAPBACKED) != 0;
- if (!swap) {
- uobj = pg->uobject;
- rw_enter(uobj->vmobjlock, RW_WRITE);
+ /*
+ * for swap i/os, lock each page's object (or anon)
+ * individually since each page may need a different lock.
+ */
+ if (swap) {
+ if (pg->uobject != NULL) {
+ slock = pg->uobject->vmobjlock;
+ } else {
+ slock = pg->uanon->an_lock;
}
+ rw_enter(slock, RW_WRITE);
+ uvm_lock_pageq();
}
- KASSERT(swap || pg->uobject == uobj);
/*
- * if this is a read and we got an error, mark the pages
- * PG_RELEASED so that uvm_page_unbusy() will free them.
+ * process errors. for reads, just mark the page to be freed.
+ * for writes, if the error was ENOMEM, we assume this was
+ * a transient failure so we mark the page dirty so that
+ * we'll try to write it again later. for all other write
+ * errors, we assume the error is permanent, thus the data
+ * in the page is lost. bummer.
*/
- if (!write && error) {
- atomic_setbits_int(&pg->pg_flags, PG_RELEASED);
- continue;
+ if (error) {
+ if (!write) {
+ atomic_setbits_int(&pg->pg_flags, PG_RELEASED);
+ continue;
+ } else if (error == ENOMEM) {
+ if (pg->pg_flags & PG_PAGEOUT) {
+ atomic_clearbits_int(&pg->pg_flags,
+ PG_PAGEOUT);
+ uvmexp.paging--;
+ }
+ atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
+ uvm_pageactivate(pg);
+ }
}
- KASSERT(!write || (pgs[i]->pg_flags & PG_FAKE) == 0);
/*
- * if this is a read and the page is PG_FAKE,
- * or this was a successful write,
- * mark the page PG_CLEAN and not PG_FAKE.
+ * if the page is PG_FAKE, this must have been a read to
+ * initialize the page. clear PG_FAKE and activate the page.
+ * we must also clear the pmap "modified" flag since it may
+ * still be set from the page's previous identity.
*/
- if ((pgs[i]->pg_flags & PG_FAKE) || (write && error != ENOMEM))
{
- pmap_clear_reference(pgs[i]);
- pmap_clear_modify(pgs[i]);
- atomic_setbits_int(&pgs[i]->pg_flags, PG_CLEAN);
+ if (pg->pg_flags & PG_FAKE) {
+ KASSERT(!write);
atomic_clearbits_int(&pgs[i]->pg_flags, PG_FAKE);
+ uvm_pageactivate(pg);
+ pmap_clear_modify(pg);
+ }
+
+ /*
+ * do accounting for pagedaemon i/o and arrange to free
+ * the pages instead of just unbusying them.
+ */
+ if (pg->pg_flags & PG_PAGEOUT) {
+ atomic_clearbits_int(&pg->pg_flags, PG_PAGEOUT);
+ uvmexp.paging--;
+
+ /*
+ * If a process faulted on a page of the anon being
+ * swapped out it is waiting and we cannot release
+ * it.
+ */
+ if (pg->pg_flags & PG_WANTED) {
+ KASSERT(swap);
+ wanted++;
+ } else {
+ atomic_setbits_int(&pg->pg_flags, PG_RELEASED);
+ }
+ }
+
+ /*
+ * for swap pages, unlock everything for this page now.
+ */
+ if (swap) {
+ uvm_page_unbusy(&pg, 1);
+ uvm_unlock_pageq();
+ rw_exit(slock);
}
}
- uvm_page_unbusy(pgs, npages);
if (!swap) {
- rw_exit(uobj->vmobjlock);
+ uvm_page_unbusy(pgs, npages);
+ uvm_unlock_pageq();
+ rw_exit(slock);
+ } else {
+ KASSERT(write);
+ KASSERT(pageout);
+
+ /* these pages are now only in swap. */
+ KASSERT(uvmexp.swpgonly + (npages-wanted) <= uvmexp.swpginuse);
+ atomic_add_int(&uvmexp.swpgonly, (npages-wanted));
+ if (error) {
+ uvm_swap_markbad(swslot, npages);
+ }
}
-
#ifdef UVM_SWAP_ENCRYPT
freed:
#endif
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.99
diff -u -p -r1.99 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c 12 May 2022 12:49:31 -0000 1.99
+++ uvm/uvm_pdaemon.c 24 May 2022 12:35:54 -0000
@@ -1,5 +1,5 @@
/* $OpenBSD: uvm_pdaemon.c,v 1.99 2022/05/12 12:49:31 mpi Exp $ */
-/* $NetBSD: uvm_pdaemon.c,v 1.23 2000/08/20 10:24:14 bjh21 Exp $ */
+/* $NetBSD: uvm_pdaemon.c,v 1.37 2001/09/15 20:36:47 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -101,10 +101,14 @@ extern void drmbackoff(long);
* local prototypes
*/
+struct rwlock *uvmpd_trylockowner(struct vm_page *);
void uvmpd_scan(void);
-boolean_t uvmpd_scan_inactive(struct pglist *);
+void uvmpd_balancequeue(int);
+void uvmpd_scan_inactive(struct pglist *);
void uvmpd_tune(void);
void uvmpd_drop(struct pglist *);
+void uvmpd_dropswap(struct vm_page *);
+
/*
* uvm_wait: wait (sleep) for the page daemon to free some pages
@@ -208,7 +212,7 @@ uvm_pageout(void *arg)
{
struct uvm_constraint_range constraint;
struct uvm_pmalloc *pma;
- int npages = 0;
+ int npages = 0, free;
/* ensure correct priority and set paging parameters... */
uvm.pagedaemon_proc = curproc;
@@ -263,9 +267,10 @@ uvm_pageout(void *arg)
size = 0;
if (pma != NULL)
size += pma->pm_size >> PAGE_SHIFT;
- if (uvmexp.free - BUFPAGES_DEFICIT < uvmexp.freetarg)
- size += uvmexp.freetarg - (uvmexp.free -
- BUFPAGES_DEFICIT);
+
+ free = uvmexp.free - BUFPAGES_DEFICIT;
+ if (free < uvmexp.freetarg)
+ size += uvmexp.freetarg - free;
if (size == 0)
size = 16; /* XXX */
uvm_unlock_pageq();
@@ -278,8 +283,9 @@ uvm_pageout(void *arg)
/*
* scan if needed
*/
+ free = uvmexp.free - BUFPAGES_DEFICIT;
if (pma != NULL ||
- ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) ||
+ (free < uvmexp.freetarg) ||
((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) {
uvmpd_scan();
}
@@ -348,9 +354,6 @@ uvm_aiodone_daemon(void *arg)
/* process each i/o that's done. */
free = uvmexp.free;
while (bp != NULL) {
- if (bp->b_flags & B_PDAEMON) {
- uvmexp.paging -= bp->b_bufsize >> PAGE_SHIFT;
- }
nbp = TAILQ_NEXT(bp, b_freelist);
s = splbio(); /* b_iodone must by called at splbio */
(*bp->b_iodone)(bp);
@@ -366,7 +369,198 @@ uvm_aiodone_daemon(void *arg)
}
}
+/*
+ * uvmpd_trylockowner: trylock the page's owner.
+ *
+ * => return the locked rwlock on success. otherwise, return NULL.
+ */
+struct rwlock *
+uvmpd_trylockowner(struct vm_page *pg)
+{
+
+ struct uvm_object *uobj = pg->uobject;
+ struct rwlock *slock;
+
+ if (uobj != NULL) {
+ slock = uobj->vmobjlock;
+ } else {
+ struct vm_anon *anon = pg->uanon;
+
+ KASSERT(anon != NULL);
+ slock = anon->an_lock;
+ }
+
+ if (rw_enter(slock, RW_WRITE|RW_NOSLEEP)) {
+ return NULL;
+ }
+
+ return slock;
+}
+
+struct swapcluster {
+ int swc_slot;
+ int swc_nallocated;
+ int swc_nused;
+ struct vm_page *swc_pages[round_page(MAXPHYS) >> PAGE_SHIFT];
+};
+
+void
+swapcluster_init(struct swapcluster *swc)
+{
+
+ swc->swc_slot = 0;
+ swc->swc_nused = 0;
+}
+
+int
+swapcluster_allocslots(struct swapcluster *swc)
+{
+ int slot;
+ int npages;
+
+ if (swc->swc_slot != 0) {
+ return 0;
+ }
+
+ /* Even with strange MAXPHYS, the shift
+ implicitly rounds down to a page. */
+ npages = MAXPHYS >> PAGE_SHIFT;
+ slot = uvm_swap_alloc(&npages, TRUE);
+ if (slot == 0) {
+ return ENOMEM;
+ }
+ swc->swc_slot = slot;
+ swc->swc_nallocated = npages;
+ swc->swc_nused = 0;
+
+ return 0;
+}
+
+int
+swapcluster_add(struct swapcluster *swc, struct vm_page *pg)
+{
+ int slot;
+ struct uvm_object *uobj;
+
+ KASSERT(swc->swc_slot != 0);
+ KASSERT(swc->swc_nused < swc->swc_nallocated);
+ KASSERT((pg->pg_flags & PQ_SWAPBACKED) != 0);
+
+ slot = swc->swc_slot + swc->swc_nused;
+ uobj = pg->uobject;
+ if (uobj == NULL) {
+ KASSERT(rw_write_held(pg->uanon->an_lock));
+ pg->uanon->an_swslot = slot;
+ } else {
+ int result;
+
+ KASSERT(rw_write_held(uobj->vmobjlock));
+ result = uao_set_swslot(uobj, pg->offset >> PAGE_SHIFT, slot);
+ if (result == -1) {
+ return ENOMEM;
+ }
+ }
+ swc->swc_pages[swc->swc_nused] = pg;
+ swc->swc_nused++;
+
+ return 0;
+}
+
+void
+swapcluster_flush(struct swapcluster *swc, boolean_t now)
+{
+ int slot;
+ int nused;
+ int nallocated;
+ int error = 0;
+
+ if (swc->swc_slot == 0) {
+ return;
+ }
+ KASSERT(swc->swc_nused <= swc->swc_nallocated);
+
+ slot = swc->swc_slot;
+ nused = swc->swc_nused;
+ nallocated = swc->swc_nallocated;
+
+ /*
+ * if this is the final pageout we could have a few
+ * unused swap blocks. if so, free them now.
+ */
+ if (nused < nallocated) {
+ if (!now) {
+ return;
+ }
+ uvm_swap_free(slot + nused, nallocated - nused);
+ }
+
+ /*
+ * now start the pageout.
+ */
+ if (nused > 0) {
+ uvmexp.pdpageouts++;
+ error = uvm_swap_put(slot, swc->swc_pages, nused, 0);
+ if (error != VM_PAGER_OK && error != VM_PAGER_PEND) {
+ int i;
+
+ KASSERT(error == VM_PAGER_AGAIN);
+
+ for (i = 0; i < nused; i++) {
+ struct rwlock *slock;
+ struct vm_page *pg = swc->swc_pages[i];
+
+ KASSERT(pg->pg_flags & PG_PAGEOUT);
+ KASSERT((pg->pg_flags & PG_RELEASED) == 0);
+
+ if (pg->pg_flags & PQ_ANON) {
+ slock = pg->uanon->an_lock;
+ } else {
+ slock = pg->uobject->vmobjlock;
+ }
+ rw_enter(slock, RW_WRITE);
+ uvm_lock_pageq();
+ atomic_clearbits_int(&pg->pg_flags, PG_PAGEOUT);
+ uvmexp.paging--;
+ uvmpd_dropswap(pg);
+ uvm_page_unbusy(&pg, 1);
+ uvm_unlock_pageq();
+ rw_exit(slock);
+ }
+ }
+ }
+
+ /*
+ * zero swslot to indicate that we are
+ * no longer building a swap-backed cluster.
+ */
+ swc->swc_slot = 0;
+ swc->swc_nused = 0;
+}
+
+int
+swapcluster_nused(struct swapcluster *swc)
+{
+ return swc->swc_nused;
+}
+
+/*
+ * uvmpd_dropswap: free any swap allocated to this page.
+ *
+ * => called with owner locked.
+ */
+void
+uvmpd_dropswap(struct vm_page *pg)
+{
+ struct vm_anon *anon = pg->uanon;
+
+ if ((pg->pg_flags & PQ_ANON) && anon->an_swslot) {
+ uvm_swap_free(anon->an_swslot, 1);
+ anon->an_swslot = 0;
+ } else if (pg->pg_flags & PQ_AOBJ) {
+ uao_dropswap(pg->uobject, pg->offset >> PAGE_SHIFT);
+ }
+}
/*
* uvmpd_scan_inactive: scan an inactive list for pages to clean or free.
@@ -377,487 +571,223 @@ uvm_aiodone_daemon(void *arg)
* => we handle the building of swap-backed clusters
* => we return TRUE if we are exiting because we met our target
*/
-
-boolean_t
+void
uvmpd_scan_inactive(struct pglist *pglst)
{
- boolean_t retval = FALSE; /* assume we haven't hit target */
- int free, result;
+ struct swapcluster swc;
struct vm_page *p, *nextpg;
struct uvm_object *uobj;
- struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
- int npages;
- struct vm_page *swpps[MAXBSIZE >> PAGE_SHIFT]; /* XXX: see below */
- int swnpages, swcpages; /* XXX: see below */
- int swslot;
struct vm_anon *anon;
- boolean_t swap_backed;
- vaddr_t start;
- int dirtyreacts;
+ struct rwlock *slock;
+ int dirtyreacts, free, error;
/*
* swslot is non-zero if we are building a swap cluster. we want
* to stay in the loop while we have a page to scan or we have
* a swap-cluster to build.
*/
- swslot = 0;
- swnpages = swcpages = 0;
- free = 0;
+ swapcluster_init(&swc);
dirtyreacts = 0;
-
- for (p = TAILQ_FIRST(pglst); p != NULL || swslot != 0; p = nextpg) {
- /*
- * note that p can be NULL iff we have traversed the whole
- * list and need to do one final swap-backed clustered pageout.
- */
+ for (p = TAILQ_FIRST(pglst); p != NULL; p = nextpg) {
uobj = NULL;
anon = NULL;
- if (p) {
- /*
- * update our copy of "free" and see if we've met
- * our target
- */
- free = uvmexp.free - BUFPAGES_DEFICIT;
+ /*
+ * see if we've met the free target
+ */
+ free = uvmexp.free - BUFPAGES_DEFICIT;
+ if (free + uvmexp.paging
+ + swapcluster_nused(&swc)
+ >= uvmexp.freetarg << 2 ||
+ dirtyreacts == UVMPD_NUMDIRTYREACTS) {
+ break;
+ }
- if (free + uvmexp.paging >= uvmexp.freetarg << 2 ||
- dirtyreacts == UVMPD_NUMDIRTYREACTS) {
- retval = TRUE;
-
- if (swslot == 0) {
- /* exit now if no swap-i/o pending */
- break;
- }
+ /*
+ * we are below target and have a new page to consider.
+ */
+ uvmexp.pdscans++;
+ nextpg = TAILQ_NEXT(p, pageq);
- /* set p to null to signal final swap i/o */
- p = NULL;
- }
+ /*
+ * move referenced pages back to active queue
+ * and skip to next page.
+ */
+ if (pmap_is_referenced(p)) {
+ uvm_pageactivate(p);
+ uvmexp.pdreact++;
+ continue;
}
- if (p) { /* if (we have a new page to consider) */
- /*
- * we are below target and have a new page to consider.
- */
- uvmexp.pdscans++;
- nextpg = TAILQ_NEXT(p, pageq);
-
- if (p->pg_flags & PQ_ANON) {
- anon = p->uanon;
- KASSERT(anon != NULL);
- if (rw_enter(anon->an_lock,
- RW_WRITE|RW_NOSLEEP)) {
- /* lock failed, skip this page */
- continue;
- }
- /*
- * move referenced pages back to active queue
- * and skip to next page.
- */
- if (pmap_is_referenced(p)) {
- uvm_pageactivate(p);
- rw_exit(anon->an_lock);
- uvmexp.pdreact++;
- continue;
- }
- if (p->pg_flags & PG_BUSY) {
- rw_exit(anon->an_lock);
- uvmexp.pdbusy++;
- /* someone else owns page, skip it */
- continue;
- }
- uvmexp.pdanscan++;
- } else {
- uobj = p->uobject;
- KASSERT(uobj != NULL);
- if (rw_enter(uobj->vmobjlock,
- RW_WRITE|RW_NOSLEEP)) {
- /* lock failed, skip this page */
- continue;
- }
- /*
- * move referenced pages back to active queue
- * and skip to next page.
- */
- if (pmap_is_referenced(p)) {
- uvm_pageactivate(p);
- rw_exit(uobj->vmobjlock);
- uvmexp.pdreact++;
- continue;
- }
- if (p->pg_flags & PG_BUSY) {
- rw_exit(uobj->vmobjlock);
- uvmexp.pdbusy++;
- /* someone else owns page, skip it */
- continue;
- }
- uvmexp.pdobscan++;
- }
+ anon = p->uanon;
+ uobj = p->uobject;
- /*
- * we now have the page queues locked.
- * the page is not busy. if the page is clean we
- * can free it now and continue.
- */
- if (p->pg_flags & PG_CLEAN) {
- if (p->pg_flags & PQ_SWAPBACKED) {
- /* this page now lives only in swap */
- atomic_inc_int(&uvmexp.swpgonly);
- }
+ /*
+ * first we attempt to lock the object that this page
+ * belongs to. if our attempt fails we skip on to
+ * the next page (no harm done). it is important to
+ * "try" locking the object as we are locking in the
+ * wrong order (pageq -> object) and we don't want to
+ * deadlock.
+ */
+ slock = uvmpd_trylockowner(p);
+ if (slock == NULL) {
+ continue;
+ }
- /* zap all mappings with pmap_page_protect... */
- pmap_page_protect(p, PROT_NONE);
- uvm_pagefree(p);
- uvmexp.pdfreed++;
-
- if (anon) {
-
- /*
- * an anonymous page can only be clean
- * if it has backing store assigned.
- */
-
- KASSERT(anon->an_swslot != 0);
-
- /* remove from object */
- anon->an_page = NULL;
- rw_exit(anon->an_lock);
- } else {
- rw_exit(uobj->vmobjlock);
- }
- continue;
- }
+ if (p->pg_flags & PG_BUSY) {
+ rw_exit(slock);
+ uvmexp.pdbusy++;
+ continue;
+ }
- /*
- * this page is dirty, skip it if we'll have met our
- * free target when all the current pageouts complete.
- */
- if (free + uvmexp.paging > uvmexp.freetarg << 2) {
- if (anon) {
- rw_exit(anon->an_lock);
- } else {
- rw_exit(uobj->vmobjlock);
- }
- continue;
- }
+ /* does the page belong to an object? */
+ if (uobj != NULL) {
+ uvmexp.pdobscan++;
+ } else {
+ KASSERT(anon != NULL);
+ uvmexp.pdanscan++;
+ }
- /*
- * this page is dirty, but we can't page it out
- * since all pages in swap are only in swap.
- * reactivate it so that we eventually cycle
- * all pages thru the inactive queue.
- */
- if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
- dirtyreacts++;
- uvm_pageactivate(p);
- if (anon) {
- rw_exit(anon->an_lock);
- } else {
- rw_exit(uobj->vmobjlock);
- }
- continue;
+ /*
+ * we now have the object and the page queues locked.
+ * if the page is not swap-backed, call the object's
+ * pager to flush and free the page.
+ */
+ if ((p->pg_flags & PQ_SWAPBACKED) == 0) {
+ uvm_unlock_pageq();
+ error = (uobj->pgops->pgo_flush)(uobj,
+ p->offset, p->offset + PAGE_SIZE,
+ PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES);
+ rw_exit(uobj->vmobjlock);
+ uvm_lock_pageq();
+ if (nextpg &&
+ (nextpg->pg_flags & PQ_INACTIVE) == 0) {
+ nextpg = TAILQ_FIRST(pglst);
}
+ continue;
+ }
- /*
- * if the page is swap-backed and dirty and swap space
- * is full, free any swap allocated to the page
- * so that other pages can be paged out.
+ /*
+ * the page is swap-backed. remove all the permissions
+ * from the page so we can sync the modified info
+ * without any race conditions. if the page is clean
+ * we can free it now and continue.
+ */
+ pmap_page_protect(p, PROT_NONE);
+ if ((p->pg_flags & PG_CLEAN) && pmap_clear_modify(p)) {
+ atomic_clearbits_int(&p->pg_flags, PG_CLEAN);
+ }
+ if (p->pg_flags & PG_CLEAN) {
+ int slot;
+ int pageidx;
+
+ pageidx = p->offset >> PAGE_SHIFT;
+ uvm_pagefree(p);
+ uvmexp.pdfreed++;
+
+ /*
+ * for anons, we need to remove the page
+ * from the anon ourselves. for aobjs,
+ * pagefree did that for us.
*/
- KASSERT(uvmexp.swpginuse <= uvmexp.swpages);
- if ((p->pg_flags & PQ_SWAPBACKED) &&
- uvmexp.swpginuse == uvmexp.swpages) {
-
- if ((p->pg_flags & PQ_ANON) &&
- p->uanon->an_swslot) {
- uvm_swap_free(p->uanon->an_swslot, 1);
- p->uanon->an_swslot = 0;
- }
- if (p->pg_flags & PQ_AOBJ) {
- uao_dropswap(p->uobject,
- p->offset >> PAGE_SHIFT);
- }
+ if (anon) {
+ KASSERT(anon->an_swslot != 0);
+ anon->an_page = NULL;
+ slot = anon->an_swslot;
+ } else {
+ slot = uao_find_swslot(uobj, pageidx);
}
+ rw_exit(slock);
- /*
- * the page we are looking at is dirty. we must
- * clean it before it can be freed. to do this we
- * first mark the page busy so that no one else will
- * touch the page. we write protect all the mappings
- * of the page so that no one touches it while it is
- * in I/O.
- */
-
- swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
- atomic_setbits_int(&p->pg_flags, PG_BUSY);
- UVM_PAGE_OWN(p, "scan_inactive");
- pmap_page_protect(p, PROT_READ);
- uvmexp.pgswapout++;
-
- /*
- * for swap-backed pages we need to (re)allocate
- * swap space.
- */
- if (swap_backed) {
- /* free old swap slot (if any) */
- if (anon) {
- if (anon->an_swslot) {
- uvm_swap_free(anon->an_swslot,
- 1);
- anon->an_swslot = 0;
- }
- } else {
- uao_dropswap(uobj,
- p->offset >> PAGE_SHIFT);
- }
-
- /* start new cluster (if necessary) */
- if (swslot == 0) {
- swnpages = MAXBSIZE >> PAGE_SHIFT;
- swslot = uvm_swap_alloc(&swnpages,
- TRUE);
- if (swslot == 0) {
- /* no swap? give up! */
- atomic_clearbits_int(
- &p->pg_flags,
- PG_BUSY);
- UVM_PAGE_OWN(p, NULL);
- if (anon)
- rw_exit(anon->an_lock);
- else
- rw_exit(
- uobj->vmobjlock);
- continue;
- }
- swcpages = 0; /* cluster is empty */
- }
-
- /* add block to cluster */
- swpps[swcpages] = p;
- if (anon)
- anon->an_swslot = swslot + swcpages;
- else
- uao_set_swslot(uobj,
- p->offset >> PAGE_SHIFT,
- swslot + swcpages);
- swcpages++;
+ if (slot > 0) {
+ /* this page is now only in swap */
+ KASSERT(uvmexp.swpgonly <
+ uvmexp.swpginuse);
+ atomic_inc_int(&uvmexp.swpgonly);
}
- } else {
- /* if p == NULL we must be doing a last swap i/o */
- swap_backed = TRUE;
+ continue;
}
/*
- * now consider doing the pageout.
- *
- * for swap-backed pages, we do the pageout if we have either
- * filled the cluster (in which case (swnpages == swcpages) or
- * run out of pages (p == NULL).
- *
- * for object pages, we always do the pageout.
+ * this page is dirty, skip it if we'll have met our
+ * free target when all the current pageouts complete.
*/
- if (swap_backed) {
- if (p) { /* if we just added a page to cluster */
- if (anon)
- rw_exit(anon->an_lock);
- else
- rw_exit(uobj->vmobjlock);
-
- /* cluster not full yet? */
- if (swcpages < swnpages)
- continue;
- }
-
- /* starting I/O now... set up for it */
- npages = swcpages;
- ppsp = swpps;
- /* for swap-backed pages only */
- start = (vaddr_t) swslot;
-
- /* if this is final pageout we could have a few
- * extra swap blocks */
- if (swcpages < swnpages) {
- uvm_swap_free(swslot + swcpages,
- (swnpages - swcpages));
- }
- } else {
- /* normal object pageout */
- ppsp = pps;
- npages = sizeof(pps) / sizeof(struct vm_page *);
- /* not looked at because PGO_ALLPAGES is set */
- start = 0;
+ if (free + uvmexp.paging > uvmexp.freetarg << 2) {
+ rw_exit(slock);
+ continue;
}
/*
- * now do the pageout.
- *
- * for swap_backed pages we have already built the cluster.
- * for !swap_backed pages, uvm_pager_put will call the object's
- * "make put cluster" function to build a cluster on our behalf.
- *
- * we pass the PGO_PDFREECLUST flag to uvm_pager_put to instruct
- * it to free the cluster pages for us on a successful I/O (it
- * always does this for un-successful I/O requests). this
- * allows us to do clustered pageout without having to deal
- * with cluster pages at this level.
- *
- * note locking semantics of uvm_pager_put with PGO_PDFREECLUST:
- * IN: locked: page queues
- * OUT: locked:
- * !locked: pageqs
+ * free any swap allocated to the page since
+ * we'll have to write it again with its new data.
*/
-
- uvmexp.pdpageouts++;
- result = uvm_pager_put(swap_backed ? NULL : uobj, p,
- &ppsp, &npages, PGO_ALLPAGES|PGO_PDFREECLUST, start, 0);
+ uvmpd_dropswap(p);
/*
- * if we did i/o to swap, zero swslot to indicate that we are
- * no longer building a swap-backed cluster.
+ * if all pages in swap are only in swap,
+ * the swap space is full and we can't page out
+ * any more swap-backed pages. reactivate this page
+ * so that we eventually cycle all pages through
+ * the inactive queue.
*/
+ if (uvm_swapisfull()) {
+ dirtyreacts++;
+ uvm_pageactivate(p);
+ rw_exit(slock);
+ continue;
+ }
- if (swap_backed)
- swslot = 0; /* done with this cluster */
/*
- * first, we check for VM_PAGER_PEND which means that the
- * async I/O is in progress and the async I/O done routine
- * will clean up after us. in this case we move on to the
- * next page.
- *
- * there is a very remote chance that the pending async i/o can
- * finish _before_ we get here. if that happens, our page "p"
- * may no longer be on the inactive queue. so we verify this
- * when determining the next page (starting over at the head if
- * we've lost our inactive page).
+ * start new swap pageout cluster (if necessary).
*/
-
- if (result == VM_PAGER_PEND) {
- uvmexp.paging += npages;
- uvm_lock_pageq();
- uvmexp.pdpending++;
- if (p) {
- if (p->pg_flags & PQ_INACTIVE)
- nextpg = TAILQ_NEXT(p, pageq);
- else
- nextpg = TAILQ_FIRST(pglst);
- } else {
- nextpg = NULL;
- }
+ if (swapcluster_allocslots(&swc)) {
+ rw_exit(slock);
continue;
}
- /* clean up "p" if we have one */
- if (p) {
- /*
- * the I/O request to "p" is done and uvm_pager_put
- * has freed any cluster pages it may have allocated
- * during I/O. all that is left for us to do is
- * clean up page "p" (which is still PG_BUSY).
- *
- * our result could be one of the following:
- * VM_PAGER_OK: successful pageout
- *
- * VM_PAGER_AGAIN: tmp resource shortage, we skip
- * to next page
- * VM_PAGER_{FAIL,ERROR,BAD}: an error. we
- * "reactivate" page to get it out of the way (it
- * will eventually drift back into the inactive
- * queue for a retry).
- * VM_PAGER_UNLOCK: should never see this as it is
- * only valid for "get" operations
- */
-
- /* relock p's object: page queues not lock yet, so
- * no need for "try" */
-
- /* !swap_backed case: already locked... */
- if (swap_backed) {
- if (anon)
- rw_enter(anon->an_lock, RW_WRITE);
- else
- rw_enter(uobj->vmobjlock, RW_WRITE);
- }
-
-#ifdef DIAGNOSTIC
- if (result == VM_PAGER_UNLOCK)
- panic("pagedaemon: pageout returned "
- "invalid 'unlock' code");
-#endif
-
- /* handle PG_WANTED now */
- if (p->pg_flags & PG_WANTED)
- wakeup(p);
-
- atomic_clearbits_int(&p->pg_flags, PG_BUSY|PG_WANTED);
- UVM_PAGE_OWN(p, NULL);
-
- /* released during I/O? Can only happen for anons */
- if (p->pg_flags & PG_RELEASED) {
- KASSERT(anon != NULL);
- /*
- * remove page so we can get nextpg,
- * also zero out anon so we don't use
- * it after the free.
- */
- anon->an_page = NULL;
- p->uanon = NULL;
+ /*
+ * at this point, we're definitely going reuse this
+ * page. mark the page busy and delayed-free.
+ * we should remove the page from the page queues
+ * so we don't ever look at it again.
+ * adjust counters and such.
+ */
+ atomic_setbits_int(&p->pg_flags, PG_BUSY);
+ UVM_PAGE_OWN(p, "scan_inactive");
+
+ atomic_setbits_int(&p->pg_flags, PG_PAGEOUT);
+ uvmexp.paging++;
+ uvm_pagedequeue(p);
- rw_exit(anon->an_lock);
- uvm_anfree(anon); /* kills anon */
- pmap_page_protect(p, PROT_NONE);
- anon = NULL;
- uvm_lock_pageq();
- nextpg = TAILQ_NEXT(p, pageq);
- /* free released page */
- uvm_pagefree(p);
- } else { /* page was not released during I/O */
- uvm_lock_pageq();
- nextpg = TAILQ_NEXT(p, pageq);
- if (result != VM_PAGER_OK) {
- /* pageout was a failure... */
- if (result != VM_PAGER_AGAIN)
- uvm_pageactivate(p);
- pmap_clear_reference(p);
- /* XXXCDC: if (swap_backed) FREE p's
- * swap block? */
- } else {
- /* pageout was a success... */
- pmap_clear_reference(p);
- pmap_clear_modify(p);
- atomic_setbits_int(&p->pg_flags,
- PG_CLEAN);
- }
- }
+ uvmexp.pgswapout++;
+ uvm_unlock_pageq();
- /*
- * drop object lock (if there is an object left). do
- * a safety check of nextpg to make sure it is on the
- * inactive queue (it should be since PG_BUSY pages on
- * the inactive queue can't be re-queued [note: not
- * true for active queue]).
- */
- if (anon)
- rw_exit(anon->an_lock);
- else if (uobj)
- rw_exit(uobj->vmobjlock);
+ /*
+ * add the new page to the cluster.
+ */
+ error = swapcluster_add(&swc, p);
+ KASSERT(error == 0);
+ rw_exit(slock);
- if (nextpg && (nextpg->pg_flags & PQ_INACTIVE) == 0) {
- nextpg = TAILQ_FIRST(pglst); /* reload! */
- }
- } else {
- /*
- * if p is null in this loop, make sure it stays null
- * in the next loop.
- */
- nextpg = NULL;
+ swapcluster_flush(&swc, FALSE);
+ uvm_lock_pageq();
- /*
- * lock page queues here just so they're always locked
- * at the end of the loop.
- */
- uvm_lock_pageq();
+ /*
+ * the pageout is in progress. bump counters and set up
+ * for the next loop.
+ */
+ uvmexp.pdpending++;
+ if (nextpg && (nextpg->pg_flags & PQ_INACTIVE) == 0) {
+ nextpg = TAILQ_FIRST(pglst);
}
+
}
- return (retval);
+
+ uvm_unlock_pageq();
+ swapcluster_flush(&swc, TRUE);
+ uvm_lock_pageq();
}
/*
@@ -869,16 +799,11 @@ uvmpd_scan_inactive(struct pglist *pglst
void
uvmpd_scan(void)
{
- int free, inactive_shortage, swap_shortage, pages_freed;
- struct vm_page *p, *nextpg;
- struct uvm_object *uobj;
- struct vm_anon *anon;
- struct rwlock *slock;
+ int free, swap_shortage, pages_freed;
MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
uvmexp.pdrevs++; /* counter */
- uobj = NULL;
/*
* get current "free" page count
@@ -905,26 +830,16 @@ uvmpd_scan(void)
* to inactive ones.
*/
- /*
- * alternate starting queue between swap and object based on the
- * low bit of uvmexp.pdrevs (which we bump by one each call).
- */
pages_freed = uvmexp.pdfreed;
(void) uvmpd_scan_inactive(&uvm.page_inactive);
pages_freed = uvmexp.pdfreed - pages_freed;
/*
- * we have done the scan to get free pages. now we work on meeting
- * our inactive target.
- */
- inactive_shortage = uvmexp.inactarg - uvmexp.inactive - BUFPAGES_INACT;
-
- /*
* detect if we're not going to be able to page anything out
* until we free some swap resources from active pages.
*/
- free = uvmexp.free - BUFPAGES_DEFICIT;
swap_shortage = 0;
+ free = uvmexp.free - BUFPAGES_DEFICIT;
if (free < uvmexp.freetarg &&
uvmexp.swpginuse == uvmexp.swpages &&
!uvm_swapisfull() &&
@@ -932,6 +847,23 @@ uvmpd_scan(void)
swap_shortage = uvmexp.freetarg - free;
}
+ uvmpd_balancequeue(swap_shortage);
+}
+
+void
+uvmpd_balancequeue(int swap_shortage)
+{
+ int inactive_shortage;
+ struct rwlock *slock;
+ struct vm_page *p, *nextpg;
+
+
+ /*
+ * we have done the scan to get free pages. now we work on meeting
+ * our inactive target.
+ */
+ inactive_shortage = uvmexp.inactarg - uvmexp.inactive - BUFPAGES_INACT;
+
for (p = TAILQ_FIRST(&uvm.page_active);
p != NULL && (inactive_shortage > 0 || swap_shortage > 0);
p = nextpg) {
@@ -943,19 +875,9 @@ uvmpd_scan(void)
/*
* lock the page's owner.
*/
- if (p->uobject != NULL) {
- uobj = p->uobject;
- slock = uobj->vmobjlock;
- if (rw_enter(slock, RW_WRITE|RW_NOSLEEP)) {
- continue;
- }
- } else {
- anon = p->uanon;
- KASSERT(p->uanon != NULL);
- slock = anon->an_lock;
- if (rw_enter(slock, RW_WRITE|RW_NOSLEEP)) {
- continue;
- }
+ slock = uvmpd_trylockowner(p);
+ if (slock == NULL) {
+ continue;
}
/*
@@ -976,8 +898,7 @@ uvmpd_scan(void)
p->uanon->an_swslot = 0;
atomic_clearbits_int(&p->pg_flags, PG_CLEAN);
swap_shortage--;
- }
- if (p->pg_flags & PQ_AOBJ) {
+ } else if (p->pg_flags & PQ_AOBJ) {
int slot = uao_set_swslot(p->uobject,
p->offset >> PAGE_SHIFT, 0);
if (slot) {
@@ -990,8 +911,7 @@ uvmpd_scan(void)
}
/*
- * deactivate this page if there's a shortage of
- * inactive pages.
+ * if there's a shortage of inactive pages, deactivate.
*/
if (inactive_shortage > 0) {
pmap_page_protect(p, PROT_NONE);
Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.155
diff -u -p -r1.155 uvm_swap.c
--- uvm/uvm_swap.c 28 Apr 2022 09:58:11 -0000 1.155
+++ uvm/uvm_swap.c 24 May 2022 12:31:34 -0000
@@ -1711,7 +1711,7 @@ uvm_swap_io(struct vm_page **pps, int st
/* encrypt to swap */
if (write && bounce) {
- int i, opages;
+ int i, wanted = 0;
caddr_t src, dst;
u_int64_t block;
@@ -1741,13 +1741,43 @@ uvm_swap_io(struct vm_page **pps, int st
}
uvm_pagermapout(kva, npages);
+ kva = bouncekva;
- /* dispose of pages we dont use anymore */
- opages = npages;
- uvm_pager_dropcluster(NULL, NULL, pps, &opages,
- PGO_PDFREECLUST);
+ /* dispose of the pages we don't use anymore */
+ for (i = 0; i < npages; i++) {
+ struct rwlock *slock;
+ struct vm_page *pg = pps[i];
- kva = bouncekva;
+ KASSERT(pg->pg_flags & PG_PAGEOUT);
+
+ if (pg->pg_flags & PQ_ANON) {
+ slock = pg->uanon->an_lock;
+ } else {
+ slock = pg->uobject->vmobjlock;
+ }
+ rw_enter(slock, RW_WRITE);
+ uvm_lock_pageq();
+ atomic_clearbits_int(&pg->pg_flags, PG_PAGEOUT);
+ uvmexp.paging--;
+
+ /*
+ * If a process faulted on a page of the anon being
+ * swapped out, it is waiting and we cannot release
+ * it.
+ */
+ if (pg->pg_flags & PG_WANTED) {
+ wanted++;
+ } else {
+ atomic_setbits_int(&pg->pg_flags, PG_RELEASED);
+ }
+ uvm_page_unbusy(&pg, 1);
+ uvm_unlock_pageq();
+ rw_exit(slock);
+ }
+
+ /* these pages are now only in swap. */
+ KASSERT(uvmexp.swpgonly + (npages-wanted) <= uvmexp.swpginuse);
+ atomic_add_int(&uvmexp.swpgonly, (npages-wanted));
}
/*