Here's a new version of the buffer flipper that fixes
a problem found by krw@. All comments from before still apply:
> You too can have a GIANT buffer cache.... etc. etc...
>
> After much bug fighting over the last 6 months in a number of
> places in the midlayer and now uvm, I think it's about time to
> shop this around again.
>
> This will only make a difference on amd64 - if you have 4 GB or more
> of RAM. What it does is allow the high (non-DMA-reachable) memory to
> be used for buffer cache pages. It will use your configured buffer
> cache percentage of both dma'able and above-dma'able pages for the
> cache, migrating the oldest cache pages into high memory. Pages
> are flipped back into dma'able memory if they are needed for IO.
>
> Notwithstanding that it only "matters" on amd64, it does change how
> the world works a bit, and therefore requires testing everywhere. It
> has survived multiple make build/make release test cycles on my
> machines (amd64, i386, zaurus, sparc, sparc64, hppa) with various
> settings of bufcachepercent, and is running on my NFS server
> (bufcachepercent=90) without any complaints - it's also been running
> on my laptop for a long time now.
>
> If you try it and have trouble (i.e. any new regressions), please
> make sure your machine's console is accessible (check that you have
> ddb.console=1 in /etc/sysctl.conf), and try to get
>
>
> trace
> ps
> show bcstats
> show uvm
>
> from ddb if at all possible.
>
> Please let me know how you do with it, and most importantly what
> you try it on/with.
>
-Bob
(diff also in ~beck/viagra.diff14 on cvs)
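For anyone who wants the shape of the idea without wading through the
diff: here is a minimal userland sketch of the policy (illustration
only - the queue names and helpers below are invented for the example;
the real code is in vfs_bio.c and vfs_biomem.c):

/*
 * Two clean LRU queues: one for DMA-reachable buffers, one for
 * buffers living above the DMA range. The oldest low buffers get
 * migrated high; a buffer is flipped back low before it does IO.
 */
#include <stdio.h>
#include <sys/queue.h>

struct cbuf {
	TAILQ_ENTRY(cbuf) link;
	int dma;		/* 1 if in DMA-reachable memory */
};
TAILQ_HEAD(cq, cbuf);
struct cq cleanl = TAILQ_HEAD_INITIALIZER(cleanl);	/* low (DMA) */
struct cq cleanh = TAILQ_HEAD_INITIALIZER(cleanh);	/* high */

/* DMA memory is tight: migrate the oldest low buffer high. */
void
flip_one_high(void)
{
	struct cbuf *b = TAILQ_FIRST(&cleanl);

	if (b == NULL)
		return;
	TAILQ_REMOVE(&cleanl, b, link);
	b->dma = 0;		/* the kernel reallocates the pages */
	TAILQ_INSERT_TAIL(&cleanh, b, link);
}

/* Before IO, the buffer must be made DMA reachable again. */
void
make_dma_reachable(struct cbuf *b)
{
	if (b->dma)
		return;
	TAILQ_REMOVE(&cleanh, b, link);
	b->dma = 1;		/* kernel: buf_realloc_pages() */
	TAILQ_INSERT_TAIL(&cleanl, b, link);
}

int
main(void)
{
	struct cbuf b = { .dma = 1 };

	TAILQ_INSERT_TAIL(&cleanl, &b, link);
	flip_one_high();
	make_dma_reachable(&b);
	printf("dma=%d\n", b.dma);
	return 0;
}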
Index: sys/kern/kern_sysctl.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.234
diff -u -p -r1.234 kern_sysctl.c
--- sys/kern/kern_sysctl.c 6 Apr 2013 03:44:34 -0000 1.234
+++ sys/kern/kern_sysctl.c 3 Jun 2013 14:51:14 -0000
@@ -110,6 +110,7 @@ extern struct disklist_head disklist;
extern fixpt_t ccpu;
extern long numvnodes;
extern u_int mcllivelocks;
+extern psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
extern void nmbclust_update(void);
@@ -564,8 +565,8 @@ kern_sysctl(int *name, u_int namelen, vo
return (sysctl_cptime2(name + 1, namelen -1, oldp, oldlenp,
newp, newlen));
case KERN_CACHEPCT: {
- u_int64_t dmapages;
- int opct, pgs;
+ psize_t pgs;
+ int opct;
opct = bufcachepercent;
error = sysctl_int(oldp, oldlenp, newp, newlen,
&bufcachepercent);
@@ -575,9 +576,11 @@ kern_sysctl(int *name, u_int namelen, vo
bufcachepercent = opct;
return (EINVAL);
}
- dmapages = uvm_pagecount(&dma_constraint);
if (bufcachepercent != opct) {
- pgs = bufcachepercent * dmapages / 100;
+ pgs = (b_highpages_total + b_dmapages_total)
+ * bufcachepercent / 100;
+ b_dmamaxpages = b_dmapages_total * bufcachepercent
+ / 100;
bufadjust(pgs); /* adjust bufpages */
bufhighpages = bufpages; /* set high water mark */
}
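A worked example of the sizing math in this hunk, with made-up numbers
(a machine with 4 GB of DMA-reachable and 4 GB of high memory, 4 KB
pages, bufcachepercent=20):

#include <stdio.h>

int
main(void)
{
	long b_dmapages_total = 1048576;	/* 4 GB of 4 KB pages */
	long b_highpages_total = 1048576;	/* 4 GB above the DMA range */
	int bufcachepercent = 20;

	/* The cache may use 20% of all pages... */
	long pgs = (b_highpages_total + b_dmapages_total)
	    * bufcachepercent / 100;
	/* ...but only 20% of the DMA range may be DMA-resident at once. */
	long b_dmamaxpages = b_dmapages_total * bufcachepercent / 100;

	printf("bufpages %ld, dmamaxpages %ld\n", pgs, b_dmamaxpages);
	/* prints: bufpages 419430, dmamaxpages 209715 */
	return 0;
}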
Index: sys/kern/spec_vnops.c
===================================================================
RCS file: /cvs/src/sys/kern/spec_vnops.c,v
retrieving revision 1.71
diff -u -p -r1.71 spec_vnops.c
--- sys/kern/spec_vnops.c 28 Mar 2013 03:29:44 -0000 1.71
+++ sys/kern/spec_vnops.c 3 Jun 2013 14:51:14 -0000
@@ -457,7 +457,9 @@ spec_strategy(void *v)
struct vop_strategy_args *ap = v;
struct buf *bp = ap->a_bp;
int maj = major(bp->b_dev);
-
+
+ if (!ISSET(bp->b_flags, B_DMA) && ISSET(bp->b_flags, B_BC))
+ panic("bogus buf %p passed to spec_strategy", bp);
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_start(bp);
Index: sys/kern/vfs_bio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.146
diff -u -p -r1.146 vfs_bio.c
--- sys/kern/vfs_bio.c 17 Feb 2013 17:39:29 -0000 1.146
+++ sys/kern/vfs_bio.c 3 Jun 2013 14:59:18 -0000
@@ -63,12 +63,17 @@
/*
* Definitions for the buffer free lists.
*/
-#define BQUEUES 2 /* number of free buffer queues */
+#define BQUEUES 3 /* number of free buffer queues */
#define BQ_DIRTY 0 /* LRU queue with dirty buffers */
-#define BQ_CLEAN 1 /* LRU queue with clean buffers */
+#define BQ_CLEANL 1 /* LRU queue with clean low buffers */
+#define BQ_CLEANH 2 /* LRU queue with clean high buffers */
TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
+int bfreeclean(int, struct bqueues *);
+struct uvm_constraint_range high_constraint;
+psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
+int needda;
int nobuffers;
int needbuffer;
struct bio_ops bioops;
@@ -110,30 +115,49 @@ bremfree(struct buf *bp)
struct bqueues *dp = NULL;
splassert(IPL_BIO);
+ KASSERT(ISSET(bp->b_flags, B_BC));
+ KASSERT(!ISSET(bp->b_flags, B_BUSY));
+ if (bp->b_freelist.tqe_next == NOLIST ||
+ bp->b_freelist.tqe_next == (void *)-1)
+ panic("bremfree: - buf %p not on a free list!", bp);
- /*
- * We only calculate the head of the freelist when removing
- * the last element of the list as that is the only time that
- * it is needed (e.g. to reset the tail pointer).
- *
- * NB: This makes an assumption about how tailq's are implemented.
- */
- if (TAILQ_NEXT(bp, b_freelist) == NULL) {
- for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
- if (dp->tqh_last == &TAILQ_NEXT(bp, b_freelist))
- break;
- if (dp == &bufqueues[BQUEUES])
- panic("bremfree: lost tail");
- }
if (!ISSET(bp->b_flags, B_DELWRI)) {
+ if (ISSET(bp->b_flags, B_DMA))
+ dp = &bufqueues[BQ_CLEANL];
+ else
+ dp = &bufqueues[BQ_CLEANH];
bcstats.numcleanpages -= atop(bp->b_bufsize);
} else {
+ dp = &bufqueues[BQ_DIRTY];
bcstats.numdirtypages -= atop(bp->b_bufsize);
bcstats.delwribufs--;
}
TAILQ_REMOVE(dp, bp, b_freelist);
}
+int
+bfreeclean(int npages, struct bqueues *dp)
+{
+ struct buf *bp;
+ int i = 0;
+
+ splassert(IPL_BIO);
+ while (i < npages) {
+ bp = TAILQ_FIRST(dp);
+ if (bp == NULL)
+ return(-1);
+ i += atop(bp->b_bufsize);
+ bremfree(bp);
+ if (bp->b_vp) {
+ RB_REMOVE(buf_rb_bufs,
+ &bp->b_vp->v_bufs_tree, bp);
+ brelvp(bp);
+ }
+ buf_put(bp);
+ }
+ return(0);
+}
+
void
buf_put(struct buf *bp)
{
@@ -158,7 +182,7 @@ buf_put(struct buf *bp)
bcstats.numbufs--;
if (buf_dealloc_mem(bp) != 0)
- return;
+ return;
pool_put(&bufpool, bp);
}
@@ -168,12 +192,21 @@ buf_put(struct buf *bp)
void
bufinit(void)
{
- u_int64_t dmapages;
struct bqueues *dp;
- dmapages = uvm_pagecount(&dma_constraint);
- /* take away a guess at how much of this the kernel will consume */
- dmapages -= (atop(physmem) - atop(uvmexp.free));
+ /* How much DMA accessible memory will we consider? */
+ b_dmapages_total = uvm_pagecount(&dma_constraint);
+ /* Take away a guess at how much of this the kernel will consume. */
+ b_dmapages_total -= (atop(physmem) - atop(uvmexp.free));
+
+ /* See if we have memory above the dma accessible region. */
+ high_constraint.ucr_low = dma_constraint.ucr_high;
+ high_constraint.ucr_high = no_constraint.ucr_high;
+ if (high_constraint.ucr_low != high_constraint.ucr_high) {
+ high_constraint.ucr_low++;
+ b_highpages_total = uvm_pagecount(&high_constraint);
+ } else
+ b_highpages_total = 0;
/*
* If MD code doesn't say otherwise, use up to 10% of DMA'able
@@ -189,18 +222,18 @@ bufinit(void)
KASSERT(bufcachepercent <= 90);
KASSERT(bufcachepercent >= 5);
if (bufpages == 0)
- bufpages = dmapages * bufcachepercent / 100;
+ bufpages = (b_dmapages_total + b_highpages_total)
+ * bufcachepercent / 100;
if (bufpages < BCACHE_MIN)
bufpages = BCACHE_MIN;
- KASSERT(bufpages < dmapages);
bufhighpages = bufpages;
-
+ b_dmamaxpages = b_dmapages_total * bufcachepercent / 100;
/*
* Set the base backoff level for the buffer cache. We will
* not allow uvm to steal back more than this number of pages.
*/
- buflowpages = dmapages * 5 / 100;
+ buflowpages = b_dmapages_total * 5 / 100;
if (buflowpages < BCACHE_MIN)
buflowpages = BCACHE_MIN;
@@ -267,7 +300,6 @@ bufinit(void)
void
bufadjust(int newbufpages)
{
- struct buf *bp;
int s, growing = 0;
if (newbufpages < buflowpages)
@@ -290,15 +322,11 @@ bufadjust(int newbufpages)
* If we have more buffers allocated than our new low water mark,
* immediately free them.
*/
- while (!growing && (bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) &&
- (bcstats.numbufpages > lopages)) {
- bremfree(bp);
- if (bp->b_vp) {
- RB_REMOVE(buf_rb_bufs,
- &bp->b_vp->v_bufs_tree, bp);
- brelvp(bp);
- }
- buf_put(bp);
+ if (!growing && (bcstats.numbufpages > lopages)) {
+ if (bfreeclean(bcstats.numbufpages - lopages,
+ &bufqueues[BQ_CLEANH]) != 0)
+ (void) bfreeclean(bcstats.numbufpages - lopages,
+ &bufqueues[BQ_CLEANL]);
}
/*
@@ -321,8 +349,10 @@ bufbackoff(struct uvm_constraint_range *
/*
* Back off "size" buffer cache pages. Called by the page
* daemon to consume buffer cache pages rather than scanning.
+ * Also called by the buffer cache to back off if memory
+ * allocation in a particular range fails.
*
- * It returns 0 to the pagedaemon to indicate that it has
+ * It returns 0 to the caller to indicate that it has
* succeeded in freeing enough pages. It returns -1 to
* indicate that it could not and the pagedaemon should take
* other measures.
@@ -340,8 +370,23 @@ bufbackoff(struct uvm_constraint_range *
return(-1);
if (bufpages - pdelta < buflowpages)
pdelta = bufpages - buflowpages;
+
oldbufpages = bufpages;
- bufadjust(bufpages - pdelta);
+ if (b_highpages_total
+ && (range->ucr_high <= dma_constraint.ucr_high)) {
+ /*
+ * Free up DMA accessible memory by moving pages to
+ * the high range.
+ */
+ if (bufhigh(pdelta) == 0)
+ return(0); /* we moved enough pages up high */
+ else {
+ bufadjust(bufpages - pdelta); /* shrink the cache. */
+ }
+ } else {
+ /* Free memory by shrinking the cache. */
+ bufadjust(bufpages - pdelta);
+ }
if (oldbufpages - bufpages < size)
return (-1); /* we did not free what we were asked */
else
@@ -526,12 +571,18 @@ bread_cluster(struct vnode *vp, daddr64_
for (i = 1; i < howmany; i++) {
bcstats.pendingreads++;
bcstats.numreads++;
- SET(xbpp[i]->b_flags, B_READ | B_ASYNC);
+ /*
+ * We set B_DMA here because bp above will be B_DMA,
+ * and we are playing buffer slice-n-dice games from
+ * the memory allocated in bp.
+ */
+ SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
xbpp[i]->b_blkno = sblkno + (i * inc);
xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
xbpp[i]->b_data = NULL;
xbpp[i]->b_pobj = bp->b_pobj;
xbpp[i]->b_poffs = bp->b_poffs + (i * size);
+ buf_dma(xbpp[i]);
}
KASSERT(bp->b_lblkno == blkno + 1);
@@ -793,6 +844,8 @@ brelse(struct buf *bp)
CLR(bp->b_flags, B_WANTED);
wakeup(bp);
}
+ if (ISSET(bp->b_flags, B_DMA) && needda)
+ wakeup(&needda);
if (bp->b_vp != NULL)
RB_REMOVE(buf_rb_bufs,
&bp->b_vp->v_bufs_tree, bp);
@@ -802,20 +855,26 @@ brelse(struct buf *bp)
}
bcstats.numcleanpages += atop(bp->b_bufsize);
- binsheadfree(bp, &bufqueues[BQ_CLEAN]);
+ if (ISSET(bp->b_flags, B_DMA))
+ binsheadfree(bp, &bufqueues[BQ_CLEANL]);
+ else
+ binsheadfree(bp, &bufqueues[BQ_CLEANH]);
} else {
/*
* It has valid data. Put it on the end of the appropriate
* queue, so that it'll stick around for as long as possible.
*/
- if (!ISSET(bp->b_flags, B_DELWRI)) {
- bcstats.numcleanpages += atop(bp->b_bufsize);
- bufq = &bufqueues[BQ_CLEAN];
- } else {
+ if (ISSET(bp->b_flags, B_DELWRI)) {
bcstats.numdirtypages += atop(bp->b_bufsize);
bcstats.delwribufs++;
bufq = &bufqueues[BQ_DIRTY];
+ } else {
+ bcstats.numcleanpages += atop(bp->b_bufsize);
+ if (ISSET(bp->b_flags, B_DMA))
+ bufq = &bufqueues[BQ_CLEANL];
+ else
+ bufq = &bufqueues[BQ_CLEANH];
}
if (ISSET(bp->b_flags, B_AGE)) {
binsheadfree(bp, bufq);
@@ -836,6 +895,10 @@ brelse(struct buf *bp)
wakeup(&nobuffers);
}
+ if (ISSET(bp->b_flags, B_DMA) && needda) {
+ wakeup(&needda);
+ }
+
/* Wake up any processes waiting for any buffer to become free. */
if (needbuffer && bcstats.numbufpages < hipages &&
bcstats.kvaslots_avail > RESERVE_SLOTS) {
@@ -987,18 +1050,17 @@ buf_get(struct vnode *vp, daddr64_t blkn
* free down to the low water mark.
*/
if (bcstats.numbufpages + npages > hipages) {
- while ((bcstats.numbufpages > lopages) &&
- (bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]))) {
- bremfree(bp);
- if (bp->b_vp) {
- RB_REMOVE(buf_rb_bufs,
- &bp->b_vp->v_bufs_tree, bp);
- brelvp(bp);
- }
- buf_put(bp);
- }
+ if (bfreeclean(bcstats.numbufpages - lopages,
+ &bufqueues[BQ_CLEANH]) != 0)
+ (void) bfreeclean(bcstats.numbufpages
+ - lopages, &bufqueues[BQ_CLEANL]);
}
+
+ if (b_highpages_total && bcstats.dmapages + npages >
+ b_dmamaxpages)
+ bufhigh(bcstats.dmapages + npages - b_dmamaxpages);
+
/*
* If we get here, we tried to free the world down
* above, and couldn't get down - Wake the cleaner
@@ -1029,6 +1091,8 @@ buf_get(struct vnode *vp, daddr64_t blkn
return (NULL);
}
+ /* Mark buffer as the cache's */
+ SET(bp->b_flags, B_BC);
bp->b_freelist.tqe_next = NOLIST;
bp->b_synctime = time_uptime + 300;
bp->b_dev = NODEV;
@@ -1068,6 +1132,7 @@ buf_get(struct vnode *vp, daddr64_t blkn
if (size) {
buf_alloc_pages(bp, round_page(size));
buf_map(bp);
+ buf_dma(bp);
}
splx(s);
@@ -1238,6 +1303,128 @@ biodone(struct buf *bp)
}
}
+/*
+ * Ensure buffer is DMA reachable
+ */
+void
+buf_dma(struct buf *buf)
+{
+ struct buf *b;
+ int s;
+
+start:
+ KASSERT(ISSET(buf->b_flags, B_BC));
+ KASSERT(ISSET(buf->b_flags, B_BUSY));
+ KASSERT(buf->b_pobj != NULL);
+ s = splbio();
+ /*
+ * If we are adding to the queue, and we are not the cleaner or
+ * the syncer, ensure we free down below the max
+ */
+ while (b_highpages_total &&
+ curproc != syncerproc && curproc != cleanerproc &&
+ (!ISSET(buf->b_flags, B_DMA)) &&
+ (bcstats.dmapages > (b_dmamaxpages - atop(buf->b_bufsize)))) {
+ b = TAILQ_FIRST(&bufqueues[BQ_CLEANL]);
+ if (b == NULL) {
+ /* no non-busy buffers. */
+ needda++;
+ tsleep(&needda, PRIBIO, "needda", 0);
+ needda--;
+ splx(s);
+ goto start;
+ } else {
+ KASSERT(!ISSET(b->b_flags, B_BUSY));
+ bremfree(b);
+ buf_acquire_nomap(b);
+ if (buf_realloc_pages(b, &high_constraint,
+ UVM_PLA_NOWAIT) == 0) {
+ /* move the buffer to high memory if we can */
+ if (ISSET(b->b_flags, B_DMA))
+ panic("B_DMA after high flip %p", b);
+ binstailfree(b, &bufqueues[BQ_CLEANH]);
+ buf_release(b);
+ } else {
+ /* otherwise just free the buffer */
+ buf_release(b);
+ if (b->b_vp) {
+ RB_REMOVE(buf_rb_bufs,
+ &b->b_vp->v_bufs_tree, b);
+ brelvp(b);
+ }
+ buf_put(b);
+ }
+ }
+ }
+ if (!ISSET(buf->b_flags, B_DMA)) {
+ /* move buf to dma reachable memory */
+ (void) buf_realloc_pages(buf, &dma_constraint, UVM_PLA_WAITOK);
+ if (!ISSET(buf->b_flags, B_DMA))
+ panic("non-dma buffer after dma move %p\n", buf);
+ }
+ splx(s);
+ return;
+}
+
+/*
+ * Attempt to flip "delta" dma reachable cache pages high. Return 0 if we
+ * can, -1 otherwise.
+ */
+int
+bufhigh(int delta)
+{
+ psize_t newdmapages;
+ struct buf *b, *bn;
+ int s;
+ if (!b_highpages_total)
+ return(-1);
+ s = splbio();
+ newdmapages = bcstats.dmapages - delta;
+ b = TAILQ_FIRST(&bufqueues[BQ_CLEANL]);
+ while ((bcstats.dmapages > newdmapages) && (b != NULL)) {
+ while (b != NULL && ISSET(b->b_flags, B_BUSY)) {
+ b = TAILQ_NEXT(b, b_freelist);
+ }
+ if (b != NULL) {
+ bn = TAILQ_NEXT(b, b_freelist);
+ bremfree(b);
+ buf_acquire_nomap(b);
+ moveit:
+ if (buf_realloc_pages(b, &high_constraint,
+ UVM_PLA_NOWAIT) == 0) {
+ /* move the buffer to high memory if we can */
+ if (ISSET(b->b_flags, B_DMA))
+ panic("B_DMA after high flip %p", b);
+ binstailfree(b, &bufqueues[BQ_CLEANH]);
+ buf_release(b);
+ } else {
+ /* free up some high memory and try again. */
+ if (bfreeclean(delta, &bufqueues[BQ_CLEANH])
+ == 0)
+ goto moveit;
+ else {
+ /* otherwise just free the buffer */
+ buf_release(b);
+ if (b->b_vp) {
+ RB_REMOVE(buf_rb_bufs,
+ &b->b_vp->v_bufs_tree, b);
+ brelvp(b);
+ }
+ buf_put(b);
+ }
+ }
+ b = bn;
+ }
+ }
+ wakeup(&needda);
+ splx(s);
+ if (bcstats.dmapages > newdmapages)
+ return(-1);
+ else
+ return(0);
+}
+
+
#ifdef DDB
void bcstats_print(int (*)(const char *, ...) /*
__attribute__((__format__(__kprintf__,1,2))) */);
/*
@@ -1252,8 +1439,8 @@ bcstats_print(
bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs);
(*pr)("kvaslots %lld avail kva slots %lld\n",
bcstats.kvaslots, bcstats.kvaslots_avail);
- (*pr)("bufpages %lld, dirtypages %lld\n",
- bcstats.numbufpages, bcstats.numdirtypages);
+ (*pr)("total bufpages %lld, dmapages %lld, dirtypages %lld\n",
+ bcstats.numbufpages, bcstats.dmapages, bcstats.numdirtypages);
(*pr)("pendingreads %lld, pendingwrites %lld\n",
bcstats.pendingreads, bcstats.pendingwrites);
}
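The bufbackoff() change above is the interesting policy knob: when the
caller asks for DMA-range memory and high pages exist, we try to flip
pages high before shrinking the cache. A compilable sketch of just
that decision, with stand-in helpers (flip_high/shrink_cache stand in
for bufhigh()/bufadjust(); they are not real kernel functions):

#include <stdio.h>

int
flip_high(long pages)
{
	printf("move %ld oldest clean DMA pages high\n", pages);
	return 0;	/* 0: moved enough, like bufhigh() */
}

void
shrink_cache(long pages)
{
	printf("give %ld pages back to uvm\n", pages);
}

/* Returns 0 if memory was freed in the requested range. */
int
backoff(int range_is_dma, int have_high, long pdelta)
{
	if (have_high && range_is_dma && flip_high(pdelta) == 0)
		return 0;	/* freed DMA memory without shrinking */
	shrink_cache(pdelta);
	return 0;
}

int
main(void)
{
	backoff(1, 1, 100);	/* pagedaemon wants DMA pages back */
	backoff(0, 1, 100);	/* wants pages from any range */
	return 0;
}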
Index: sys/kern/vfs_biomem.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_biomem.c,v
retrieving revision 1.23
diff -u -p -r1.23 vfs_biomem.c
--- sys/kern/vfs_biomem.c 18 Jan 2013 10:07:37 -0000 1.23
+++ sys/kern/vfs_biomem.c 3 Jun 2013 14:51:14 -0000
@@ -1,6 +1,7 @@
/* $OpenBSD: vfs_biomem.c,v 1.23 2013/01/18 10:07:37 beck Exp $ */
/*
* Copyright (c) 2007 Artur Grabowski <[email protected]>
+ * Copyright (c) 2012,2013 Bob Beck <[email protected]>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -267,6 +268,7 @@ void
buf_alloc_pages(struct buf *bp, vsize_t size)
{
voff_t offs;
+ int i;
KASSERT(size == round_page(size));
KASSERT(bp->b_pobj == NULL);
@@ -278,8 +280,18 @@ buf_alloc_pages(struct buf *bp, vsize_t
KASSERT(buf_page_offset > 0);
- uvm_pagealloc_multi(buf_object, offs, size, UVM_PLA_WAITOK);
+ do {
+ i = uvm_pagealloc_multi(buf_object, offs, size,
+ UVM_PLA_NOWAIT);
+ if (i == 0)
+ break;
+ } while (bufbackoff(&dma_constraint, 100) == 0);
+ if (i != 0)
+ i = uvm_pagealloc_multi(buf_object, offs, size,
+ UVM_PLA_WAITOK);
bcstats.numbufpages += atop(size);
+ bcstats.dmapages += atop(size);
+ SET(bp->b_flags, B_DMA);
bp->b_pobj = buf_object;
bp->b_poffs = offs;
bp->b_bufsize = size;
@@ -307,10 +319,68 @@ buf_free_pages(struct buf *bp)
pg->wire_count = 0;
uvm_pagefree(pg);
bcstats.numbufpages--;
+ if (ISSET(bp->b_flags, B_DMA))
+ bcstats.dmapages--;
}
+ CLR(bp->b_flags, B_DMA);
}
-/*
- * XXX - it might make sense to make a buf_realloc_pages to avoid
- * bouncing through the free list all the time.
- */
+/* Reallocate a buf into a particular pmem range specified by "where". */
+int
+buf_realloc_pages(struct buf *bp, struct uvm_constraint_range *where,
+ int flags)
+{
+ vaddr_t va;
+ int dma;
+ int i, r;
+ KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
+
+ splassert(IPL_BIO);
+ KASSERT(ISSET(bp->b_flags, B_BUSY));
+ dma = ISSET(bp->b_flags, B_DMA);
+
+ /* if the original buf is mapped, unmap it */
+ if (bp->b_data != NULL) {
+ va = (vaddr_t)bp->b_data;
+ pmap_kremove(va, bp->b_bufsize);
+ pmap_update(pmap_kernel());
+ }
+
+ r = 0;
+ do {
+ r = uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs,
+ bp->b_bufsize, UVM_PLA_NOWAIT, where);
+ if (r == 0)
+ break;
+ } while ((bufbackoff(where, 100) == 0) && (flags & UVM_PLA_WAITOK));
+ if (r != 0 && !(flags & UVM_PLA_NOWAIT))
+ r = uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs,
+ bp->b_bufsize, flags, where);
+
+ /*
+ * do this now, and put it back later when we know where we are
+ */
+ if (dma)
+ bcstats.dmapages -= atop(bp->b_bufsize);
+
+ dma = 1;
+ /* if the original buf was mapped, re-map it */
+ for (i = 0; i < atop(bp->b_bufsize); i++) {
+ struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
+ bp->b_poffs + ptoa(i));
+ KASSERT(pg != NULL);
+ if (!PADDR_IS_DMA_REACHABLE(VM_PAGE_TO_PHYS(pg)))
+ dma = 0;
+ if (bp->b_data != NULL) {
+ pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
+ VM_PROT_READ|VM_PROT_WRITE);
+ pmap_update(pmap_kernel());
+ }
+ }
+ if (dma) {
+ SET(bp->b_flags, B_DMA);
+ bcstats.dmapages += atop(bp->b_bufsize);
+ } else
+ CLR(bp->b_flags, B_DMA);
+ return(r);
+}
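buf_alloc_pages() and buf_realloc_pages() above both use the same
allocation pattern: try a non-sleeping allocation, let the cache back
off (free or flip pages), retry, and only sleep as a last resort. A
compilable sketch of the pattern with stand-in functions (try_alloc
and backoff here simulate uvm_pagealloc_multi()/bufbackoff()):

#include <stdio.h>

#define PLA_NOWAIT	1
#define PLA_WAITOK	2

int attempts;

int
try_alloc(int flags)
{
	/* Pretend the first two non-sleeping attempts fail. */
	if (flags == PLA_NOWAIT && attempts++ < 2)
		return -1;
	return 0;
}

int
backoff(void)
{
	return attempts <= 2 ? 0 : -1;	/* 0: freed some pages */
}

int
main(void)
{
	int r;

	do {
		r = try_alloc(PLA_NOWAIT);
		if (r == 0)
			break;
	} while (backoff() == 0);
	if (r != 0)
		r = try_alloc(PLA_WAITOK);	/* may sleep for memory */
	printf("r=%d after %d fast tries\n", r, attempts);
	return 0;
}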
Index: sys/kern/vfs_vops.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_vops.c,v
retrieving revision 1.5
diff -u -p -r1.5 vfs_vops.c
--- sys/kern/vfs_vops.c 28 Mar 2013 02:08:39 -0000 1.5
+++ sys/kern/vfs_vops.c 3 Jun 2013 14:51:14 -0000
@@ -633,6 +633,11 @@ VOP_STRATEGY(struct buf *bp)
if (bp->b_vp->v_op->vop_strategy == NULL)
return (EOPNOTSUPP);
+ /*
+ * Flip buffer to dma reachable memory if necessary.
+ */
+ if (ISSET(bp->b_flags, B_BC))
+ buf_dma(bp);
return ((bp->b_vp->v_op->vop_strategy)(&a));
}
Index: sys/sys/buf.h
===================================================================
RCS file: /cvs/src/sys/sys/buf.h,v
retrieving revision 1.84
diff -u -p -r1.84 buf.h
--- sys/sys/buf.h 24 Mar 2013 17:42:43 -0000 1.84
+++ sys/sys/buf.h 3 Jun 2013 14:51:14 -0000
@@ -234,12 +234,14 @@ struct buf {
#define B_SCANNED 0x00100000 /* Block already pushed during sync */
#define B_PDAEMON 0x00200000 /* I/O started by pagedaemon */
#define B_RELEASED 0x00400000 /* free this buffer after its kvm */
+#define B_BC 0x00800000 /* Managed by the Buffer Cache. */
+#define B_DMA 0x01000000 /* DMA reachable. */
#define B_BITS "\20\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY" \
"\006CACHE\007CALL\010DELWRI\011DONE\012EINTR\013ERROR" \
"\014INVAL\015NOCACHE\016PHYS\017RAW\020READ" \
"\021WANTED\022WRITEINPROG\023XXX(FORMAT)\024DEFERRED" \
- "\025SCANNED\026DAEMON\027RELEASED"
+ "\025SCANNED\026DAEMON\027RELEASED\030BC\031DMA"
/*
* This structure describes a clustered I/O. It is stored in the b_saveaddr
@@ -305,6 +307,7 @@ void bremfree(struct buf *);
void bufinit(void);
void buf_dirty(struct buf *);
void buf_undirty(struct buf *);
+void buf_dma(struct buf *);
int bwrite(struct buf *);
struct buf *getblk(struct vnode *, daddr64_t, int, int, int);
struct buf *geteblk(int);
@@ -328,7 +331,8 @@ int buf_dealloc_mem(struct buf *);
void buf_fix_mapping(struct buf *, vsize_t);
void buf_alloc_pages(struct buf *, vsize_t);
void buf_free_pages(struct buf *);
-
+struct uvm_constraint_range;
+int buf_realloc_pages(struct buf *, struct uvm_constraint_range *, int);
void minphys(struct buf *bp);
int physio(void (*strategy)(struct buf *), dev_t dev, int flags,
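A small compilable illustration of how the two new flag bits interact
(flag values copied from the header above; SET/CLR/ISSET as in
sys/param.h; the scenario itself is invented):

#include <stdio.h>

#define B_BC	0x00800000	/* managed by the buffer cache */
#define B_DMA	0x01000000	/* pages are DMA reachable */

#define SET(t, f)	((t) |= (f))
#define CLR(t, f)	((t) &= ~(f))
#define ISSET(t, f)	((t) & (f))

int
main(void)
{
	long b_flags = 0;

	SET(b_flags, B_BC | B_DMA);	/* fresh cache buffer, low memory */
	CLR(b_flags, B_DMA);		/* flipped above the DMA range */

	/* The spec_strategy() check: a cache buffer must be flipped
	 * back to DMA-reachable memory before it reaches a driver. */
	if (!ISSET(b_flags, B_DMA) && ISSET(b_flags, B_BC))
		printf("bogus buf: spec_strategy() would panic\n");
	return 0;
}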
Index: sys/sys/mount.h
===================================================================
RCS file: /cvs/src/sys/sys/mount.h,v
retrieving revision 1.109
diff -u -p -r1.109 mount.h
--- sys/sys/mount.h 15 Apr 2013 15:32:19 -0000 1.109
+++ sys/sys/mount.h 3 Jun 2013 14:51:14 -0000
@@ -518,6 +518,7 @@ extern long buflowpages, bufhighpages, b
#define BUFPAGES_INACT (((bcstats.numcleanpages - buflowpages) < 0) ? 0 \
: bcstats.numcleanpages - buflowpages)
extern int bufcachepercent;
+extern int bufhigh(int);
extern void bufadjust(int);
struct uvm_constraint_range;
extern int bufbackoff(struct uvm_constraint_range*, long);
Index: sys/uvm/uvm_extern.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.104
diff -u -p -r1.104 uvm_extern.h
--- sys/uvm/uvm_extern.h 9 Mar 2012 13:01:29 -0000 1.104
+++ sys/uvm/uvm_extern.h 3 Jun 2013 14:51:14 -0000
@@ -681,11 +681,11 @@ struct vm_page *uvm_pagealloc(struct uv
voff_t, struct vm_anon *, int);
vaddr_t uvm_pagealloc_contig(vaddr_t, vaddr_t,
vaddr_t, vaddr_t);
-void uvm_pagealloc_multi(struct uvm_object *, voff_t,
+int uvm_pagealloc_multi(struct uvm_object *, voff_t,
vsize_t, int);
void uvm_pagerealloc(struct vm_page *,
struct uvm_object *, voff_t);
-void uvm_pagerealloc_multi(struct uvm_object *, voff_t,
+int uvm_pagerealloc_multi(struct uvm_object *, voff_t,
vsize_t, int, struct uvm_constraint_range *);
/* Actually, uvm_page_physload takes PF#s which need their own type */
void uvm_page_physload(paddr_t, paddr_t, paddr_t,
Index: sys/uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
retrieving revision 1.123
diff -u -p -r1.123 uvm_page.c
--- sys/uvm/uvm_page.c 27 Mar 2013 02:02:23 -0000 1.123
+++ sys/uvm/uvm_page.c 3 Jun 2013 14:51:14 -0000
@@ -879,19 +879,21 @@ uvm_pglistfree(struct pglist *list)
* interface used by the buffer cache to allocate a buffer at a time.
* The pages are allocated wired in DMA accessible memory
*/
-void
+int
uvm_pagealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
int flags)
{
struct pglist plist;
struct vm_page *pg;
- int i;
+ int i, r;
TAILQ_INIT(&plist);
- (void) uvm_pglistalloc(size, dma_constraint.ucr_low,
+ r = uvm_pglistalloc(size, dma_constraint.ucr_low,
dma_constraint.ucr_high, 0, 0, &plist, atop(round_page(size)),
- UVM_PLA_WAITOK);
+ flags);
+ if (r != 0)
+ return(r);
i = 0;
while ((pg = TAILQ_FIRST(&plist)) != NULL) {
pg->wire_count = 1;
@@ -900,6 +902,7 @@ uvm_pagealloc_multi(struct uvm_object *o
TAILQ_REMOVE(&plist, pg, pageq);
uvm_pagealloc_pg(pg, obj, off + ptoa(i++), NULL);
}
+ return(0);
}
/*
@@ -907,21 +910,23 @@ uvm_pagealloc_multi(struct uvm_object *o
* The pages are reallocated wired outside the DMA accessible region.
*
*/
-void
+int
uvm_pagerealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
int flags, struct uvm_constraint_range *where)
{
struct pglist plist;
struct vm_page *pg, *tpg;
- int i;
+ int i,r;
voff_t offset;
TAILQ_INIT(&plist);
if (size == 0)
panic("size 0 uvm_pagerealloc");
- (void) uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0,
- 0, &plist, atop(round_page(size)), UVM_PLA_WAITOK);
+ r = uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0,
+ 0, &plist, atop(round_page(size)), flags);
+ if (r != 0)
+ return(r);
i = 0;
while((pg = TAILQ_FIRST(&plist)) != NULL) {
offset = off + ptoa(i++);
@@ -934,6 +939,7 @@ uvm_pagerealloc_multi(struct uvm_object
uvm_pagefree(tpg);
uvm_pagealloc_pg(pg, obj, offset, NULL);
}
+ return(0);
}
/*
Index: usr.bin/systat/iostat.c
===================================================================
RCS file: /cvs/src/usr.bin/systat/iostat.c,v
retrieving revision 1.40
diff -u -p -r1.40 iostat.c
--- usr.bin/systat/iostat.c 19 Sep 2011 14:48:04 -0000 1.40
+++ usr.bin/systat/iostat.c 18 Mar 2013 22:29:29 -0000
@@ -222,6 +222,10 @@ showbcache(void)
print_fld_ssize(FLD_IO_SVAL, bccur.numbufpages);
end_line();
+ print_fld_str(FLD_IO_SSTR, "dma pages");
+ print_fld_ssize(FLD_IO_SVAL, bccur.dmapages);
+ end_line();
+
print_fld_str(FLD_IO_SSTR, "dirty pages");
print_fld_ssize(FLD_IO_SVAL, bccur.numdirtypages);
end_line();