I'm resurrecting this thread because I finally got around to
finishing up the patches to implement swapoff.  I would appreciate
some review of them, particularly to verify that I have done the
right thing WRT synchronization.  I have not optimized it to do
read clustering, but I have ensured that such an optimization
could be made.  Other than that, I don't know of any deficiencies.


Index: include/unistd.h
===================================================================
RCS file: /home/ncvs/src/include/unistd.h,v
retrieving revision 1.60
diff -u -r1.60 unistd.h
--- include/unistd.h    2002/09/25 01:58:56     1.60
+++ include/unistd.h    2002/10/07 13:55:19
@@ -512,6 +512,7 @@
 void    setusershell(void);
 int     strtofflags(char **, u_long *, u_long *);
 int     swapon(const char *);
+int     swapoff(const char *);
 int     syscall(int, ...);
 off_t   __syscall(quad_t, ...);
 int     ttyslot(void);
Index: lib/libc/sys/Makefile.inc
===================================================================
RCS file: /home/ncvs/src/lib/libc/sys/Makefile.inc,v
retrieving revision 1.96
diff -u -r1.96 Makefile.inc
--- lib/libc/sys/Makefile.inc   2002/10/02 18:01:51     1.96
+++ lib/libc/sys/Makefile.inc   2002/10/07 13:55:24
@@ -130,6 +130,7 @@
 MLINKS+=stat.2 fstat.2 stat.2 lstat.2
 MLINKS+=statfs.2 fstatfs.2
 MLINKS+=syscall.2 __syscall.2
+MLINKS+=swapon.2 swapoff.2
 MLINKS+=truncate.2 ftruncate.2
 MLINKS+=utimes.2 futimes.2 utimes.2 lutimes.2
 MLINKS+=wait.2 wait3.2 wait.2 wait4.2 wait.2 waitpid.2
Index: lib/libc/sys/swapon.2
===================================================================
RCS file: /home/ncvs/src/lib/libc/sys/swapon.2,v
retrieving revision 1.12
diff -u -r1.12 swapon.2
--- lib/libc/sys/swapon.2       2001/10/01 16:09:03     1.12
+++ lib/libc/sys/swapon.2       2002/10/07 13:55:24
@@ -36,14 +36,16 @@
 .Dt SWAPON 2
 .Os
 .Sh NAME
-.Nm swapon
-.Nd add a swap device for interleaved paging/swapping
+.Nm swapon , swapoff
+.Nd control devices for interleaved paging/swapping
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In unistd.h
 .Ft int
 .Fn swapon "const char *special"
+.Ft int
+.Fn swapoff "const char *special"
 .Sh DESCRIPTION
 .Fn Swapon
 makes the block device
@@ -55,13 +57,22 @@
 .Fa special
 is calculated at the time the device is first made available
 for swapping.
+.Pp
+The
+.Fn swapoff
+system call disables paging and swapping on the given device.
+All associated swap metadata are deallocated, and the device
+is made available for other purposes.
 .Sh RETURN VALUES
 If an error has occurred, a value of -1 is returned and
 .Va errno
 is set to indicate the error.
 .Sh ERRORS
-.Fn Swapon
-succeeds unless:
+Both
+.Fn swapon
+and
+.Fn swapoff
+can fail if:
 .Bl -tag -width Er
 .It Bq Er ENOTDIR
 A component of the path prefix is not a directory.
@@ -76,6 +87,19 @@
 Too many symbolic links were encountered in translating the pathname.
 .It Bq Er EPERM
 The caller is not the super-user.
+.It Bq Er EFAULT
+.Fa Special
+points outside the process's allocated address space.
+.El
+.Pp
+Additionally,
+.Fn swapon
+can fail for the following reasons:
+.Bl -tag -width Er
+.It Bq Er EINVAL
+The system has reached the boot-time limit on the number of
+swap devices,
+.Va vm.nswapdev .
 .It Bq Er ENOTBLK
 .Fa Special
 is not a block device.
@@ -84,11 +108,6 @@
 .Fa special
 has already
 been made available for swapping
-.It Bq Er EINVAL
-The device configured by
-.Fa special
-was not
-configured into the system as a swap device.
 .It Bq Er ENXIO
 The major device number of
 .Fa special
@@ -96,20 +115,28 @@
 for the associated hardware).
 .It Bq Er EIO
 An I/O error occurred while opening the swap device.
-.It Bq Er EFAULT
-.Fa Special
-points outside the process's allocated address space.
 .El
+.Pp
+Lastly,
+.Fn swapoff
+can fail if:
+.Bl -tag -width Er
+.It Bq Er EINVAL
+The system is not currently swapping to
+.Fa special .
+.It Bq Er ENOMEM
+Not enough virtual memory is available to safely disable
+paging and swapping to the given device.
+.El
 .Sh SEE ALSO
 .Xr config 8 ,
-.Xr swapon 8
-.Sh BUGS
-There is no way to stop swapping on a disk so that the pack may be
-dismounted.
-.Pp
-This call will be upgraded in future versions of the system.
+.Xr swapon 8 ,
+.Xr sysctl 8
 .Sh HISTORY
 The
 .Fn swapon
 function call appeared in
 .Bx 4.0 .
+.Fn Swapoff
+appeared in
+.Fx 5.0 .
Index: sbin/swapon/Makefile
===================================================================
RCS file: /home/ncvs/src/sbin/swapon/Makefile,v
retrieving revision 1.6
diff -u -r1.6 Makefile
--- sbin/swapon/Makefile        2001/12/04 02:19:58     1.6
+++ sbin/swapon/Makefile        2002/10/07 13:55:33
@@ -3,5 +3,7 @@
 
 PROG=  swapon
 MAN=   swapon.8
+LINKS= ${BINDIR}/swapon ${BINDIR}/swapoff
+MLINKS=        swapon.8 swapoff.8
 
 .include <bsd.prog.mk>
Index: sbin/swapon/swapon.8
===================================================================
RCS file: /home/ncvs/src/sbin/swapon/swapon.8,v
retrieving revision 1.20
diff -u -r1.20 swapon.8
--- sbin/swapon/swapon.8        2002/08/21 18:11:44     1.20
+++ sbin/swapon/swapon.8        2002/10/07 13:55:33
@@ -36,39 +36,46 @@
 .Dt SWAPON 8
 .Os
 .Sh NAME
-.Nm swapon
-.Nd "specify additional device for paging and swapping"
+.Nm swapon , swapoff
+.Nd "specify devices for paging and swapping"
 .Sh SYNOPSIS
-.Nm
+.Nm swap[on|off]
 .Fl a
-.Nm
+.Nm swap[on|off]
 .Ar special_file ...
 .Sh DESCRIPTION
 The
-.Nm
+.Nm swapon
 utility is used to specify additional devices on which paging and swapping
 are to take place.
 The system begins by swapping and paging on only a single device
 so that only one disk is required at bootstrap time.
 Calls to
-.Nm
+.Nm swapon
 normally occur in the system multi-user initialization file
 .Pa /etc/rc
 making all swap devices available, so that the paging and swapping
 activity is interleaved across several devices.
 .Pp
+The
+.Nm swapoff
+utility disables paging and swapping on a device.
+Calls to
+.Nm swapoff
+succeed only if disabling the device would leave enough
+remaining virtual memory to accommodate all running programs.
+.Pp
 Normally, the first form is used:
 .Bl -tag -width indent
 .It Fl a
 All devices marked as ``sw''
 swap devices in
 .Pa /etc/fstab
-are made available unless their ``noauto'' option is also set.
+are added to or removed from the pool of available swap
+unless their ``noauto'' option is also set.
 .El
 .Pp
-The second form gives individual block devices as given
-in the system swap configuration table.  The call makes only this space
-available to the system for swap allocation.
+The second form is used to configure or disable individual devices.
 .Sh SEE ALSO
 .Xr swapon 2 ,
 .Xr fstab 5 ,
@@ -85,12 +92,12 @@
 .It Pa /etc/fstab
 ASCII file system description table
 .El
-.Sh BUGS
-There is no way to stop paging and swapping on a device.
-It is therefore not possible to dismount swap devices which are
-mounted during system operation.
 .Sh HISTORY
 The
-.Nm
+.Nm swapon
 utility appeared in
 .Bx 4.0 .
+The
+.Nm swapoff
+utility appeared in
+.Fx 5.0 .
Index: sbin/swapon/swapon.c
===================================================================
RCS file: /home/ncvs/src/sbin/swapon/swapon.c,v
retrieving revision 1.12
diff -u -r1.12 swapon.c
--- sbin/swapon/swapon.c        2002/03/21 13:20:48     1.12
+++ sbin/swapon/swapon.c        2002/10/07 13:55:33
@@ -53,15 +53,21 @@
 #include <string.h>
 #include <unistd.h>
 
-static void usage(void);
-int    add(char *name, int ignoreebusy);
+int do_swapoff;
 
+static void usage(const char *);
+static int this_is_swapoff(const char *);
+int    twiddle(char *name, int ignoreebusy);
+
 int
 main(int argc, char **argv)
 {
        struct fstab *fsp;
        int stat;
        int ch, doall;
+       char *pname = argv[0];
+
+       do_swapoff = this_is_swapoff(pname);
 
        doall = 0;
        while ((ch = getopt(argc, argv, "a")) != -1)
@@ -71,7 +77,7 @@
                        break;
                case '?':
                default:
-                       usage();
+                       usage(pname);
                }
        argv += optind;
 
@@ -82,23 +88,24 @@
                                continue;
                        if (strstr(fsp->fs_mntops, "noauto"))
                                continue;
-                       if (add(fsp->fs_spec, 1))
+                       if (twiddle(fsp->fs_spec, 1))
                                stat = 1;
                        else
-                               printf("swapon: adding %s as swap device\n",
+                               printf("%s: %sing %s as swap device\n",
+                                   pname, do_swapoff ? "remov" : "add",
                                    fsp->fs_spec);
                }
        else if (!*argv)
-               usage();
+               usage(pname);
        for (; *argv; ++argv)
-               stat |= add(*argv, 0);
+               stat |= twiddle(*argv, 0);
        exit(stat);
 }
 
 int
-add(char *name, int ignoreebusy)
+twiddle(char *name, int ignoreebusy)
 {
-       if (swapon(name) == -1) {
+       if ((do_swapoff ? swapoff(name) : swapon(name)) == -1) {
                switch (errno) {
                case EBUSY:
                        if (!ignoreebusy)
@@ -114,8 +121,23 @@
 }
 
 static void
-usage()
+usage(const char *pname)
 {
-       fprintf(stderr, "usage: swapon [-a] [special_file ...]\n");
+       fprintf(stderr, "usage: %s [-a] [special_file ...]\n", pname);
        exit(1);
+}
+
+static int
+this_is_swapoff(const char *s)
+{
+       const char *u;
+
+       if ((u = strrchr(s, '/')) != NULL)
+               ++u;
+       else
+               u = s;
+       if (strcmp(u, "swapoff") == 0)
+               return 1;
+       else
+               return 0;
 }
Index: sys/kern/init_sysent.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/init_sysent.c,v
retrieving revision 1.135
diff -u -r1.135 init_sysent.c
--- sys/kern/init_sysent.c      2002/10/02 16:48:15     1.135
+++ sys/kern/init_sysent.c      2002/10/07 13:55:50
@@ -2,8 +2,8 @@
  * System call switch table.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/init_sysent.c,v 1.135 2002/10/02 16:48:15 archie Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.124 2002/10/01 02:35:59 rwatson Exp 
+ * $FreeBSD$
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.125 2002/10/02 16:48:15 archie Exp 
  */
 
 #include "opt_compat.h"
@@ -443,4 +443,5 @@
        { 0, (sy_call_t *)nosys },                      /* 412 = extattr_set_link */
        { 0, (sy_call_t *)nosys },                      /* 413 = extattr_get_link */
        { 0, (sy_call_t *)nosys },                      /* 414 = extattr_delete_link */
+       { SYF_MPSAFE | AS(swapoff_args), (sy_call_t *)swapoff },        /* 415 = swapoff */
 };
Index: sys/kern/subr_blist.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/subr_blist.c,v
retrieving revision 1.10
diff -u -r1.10 subr_blist.c
--- sys/kern/subr_blist.c       2002/05/18 23:46:04     1.10
+++ sys/kern/subr_blist.c       2002/10/07 13:55:51
@@ -93,7 +93,7 @@
 #include <stdlib.h>
 #include <stdarg.h>
 
-#define malloc(a,b,c)  malloc(a)
+#define malloc(a,b,c)  calloc(a, 1)
 #define free(a,b)      free(a)
 
 typedef unsigned int u_daddr_t;
@@ -116,6 +116,9 @@
                                        daddr_t radix, int skip, daddr_t blk);
 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 
                                daddr_t skip, blist_t dest, daddr_t count);
+static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
+static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
+                               daddr_t radix, int skip, daddr_t blk);
 static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, 
                                                int skip, daddr_t count);
 #ifndef _KERNEL
@@ -165,13 +168,14 @@
 
 #if defined(BLIST_DEBUG)
        printf(
-               "BLIST representing %d blocks (%d MB of swap)"
-               ", requiring %dK of ram\n",
-               bl->bl_blocks,
-               bl->bl_blocks * 4 / 1024,
-               (bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
+               "BLIST representing %lld blocks (%lld MB of swap)"
+               ", requiring %lldK of ram\n",
+               (long long)bl->bl_blocks,
+               (long long)bl->bl_blocks * 4 / 1024,
+               (long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
        );
-       printf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks);
+       printf("BLIST raw radix tree contains %lld records\n",
+           (long long)bl->bl_rootblks);
 #endif
        blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
 
@@ -226,6 +230,30 @@
 }
 
 /*
+ * blist_fill() -      mark a region in the block bitmap as off-limits
+ *                     to the allocator (i.e. allocate it), ignoring any
+ *                     existing allocations.  Return the number of blocks
+ *                     actually filled that were free before the call.
+ */
+
+int
+blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
+{
+       int filled;
+
+       if (bl) {
+               if (bl->bl_radix == BLIST_BMAP_RADIX)
+                       filled = blst_leaf_fill(bl->bl_root, blkno, count);
+               else
+                       filled = blst_meta_fill(bl->bl_root, blkno, count,
+                           bl->bl_radix, bl->bl_skip, 0);
+               bl->bl_free -= filled;
+               return filled;
+       } else
+               return 0;
+}
+
+/*
  * blist_resize() -    resize an existing radix tree to handle the
  *                     specified number of blocks.  This will reallocate
  *                     the tree and transfer the previous bitmap to the new
@@ -507,9 +535,9 @@
        int next_skip = (skip >> BLIST_META_RADIX_SHIFT);
 
 #if 0
-       printf("FREE (%x,%d) FROM (%x,%d)\n",
-           freeBlk, count,
-           blk, radix
+       printf("FREE (%llx,%lld) FROM (%llx,%lld)\n",
+           (long long)freeBlk, (long long)count,
+           (long long)blk, (long long)radix
        );
 #endif
 
@@ -679,6 +707,117 @@
 }
 
 /*
+ * BLST_LEAF_FILL() -  allocate specific blocks in leaf bitmap
+ *
+ *     This routine allocates all blocks in the specified range
+ *     regardless of any existing allocations in that range.  Returns
+ *     the number of blocks allocated by the call.
+ */
+
+static int
+blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
+{
+       int n = blk & (BLIST_BMAP_RADIX - 1);
+       int nblks;
+       u_daddr_t mask, bitmap;
+
+       mask = ((u_daddr_t)-1 << n) &
+           ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
+
+       /* Count the number of blocks we're about to allocate */
+       bitmap = scan->u.bmu_bitmap & mask;
+       for (nblks = 0; bitmap != 0; nblks++)
+               bitmap &= bitmap - 1;
+
+       scan->u.bmu_bitmap &= ~mask;
+       return nblks;
+}
+
+/*
+ * BLST_META_FILL() -  allocate specific blocks at a meta node
+ *
+ *     This routine allocates the specified range of blocks,
+ *     regardless of any existing allocations in the range.  The
+ *     range must be within the extent of this node.  Returns the
+ *     number of blocks allocated by the call.
+ */
+static int
+blst_meta_fill(
+       blmeta_t *scan,
+       daddr_t allocBlk,
+       daddr_t count,
+       daddr_t radix, 
+       int skip,
+       daddr_t blk
+) {
+       int i;
+       int next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+       int nblks = 0;
+
+       if (count == radix || scan->u.bmu_avail == 0)  {
+               /*
+                * ALL-ALLOCATED special case
+                */
+               nblks = scan->u.bmu_avail;
+               scan->u.bmu_avail = 0;
+               scan->bm_bighint = count;
+               return nblks;
+       }
+
+       if (scan->u.bmu_avail == radix) {
+               radix >>= BLIST_META_RADIX_SHIFT;
+
+               /*
+                * ALL-FREE special case, initialize sublevel
+                */
+               for (i = 1; i <= skip; i += next_skip) {
+                       if (scan[i].bm_bighint == (daddr_t)-1)
+                               break;
+                       if (next_skip == 1) {
+                               scan[i].u.bmu_bitmap = (u_daddr_t)-1;
+                               scan[i].bm_bighint = BLIST_BMAP_RADIX;
+                       } else {
+                               scan[i].bm_bighint = radix;
+                               scan[i].u.bmu_avail = radix;
+                       }
+               }
+       } else {
+               radix >>= BLIST_META_RADIX_SHIFT;
+       }
+
+       if (count > radix)
+               panic("blist_meta_fill: allocation too large");
+
+       i = (allocBlk - blk) / radix;
+       blk += i * radix;
+       i = i * next_skip + 1;
+
+       while (i <= skip && blk < allocBlk + count) {
+               daddr_t v;
+
+               v = blk + radix - allocBlk;
+               if (v > count)
+                       v = count;
+
+               if (scan->bm_bighint == (daddr_t)-1)
+                       panic("blst_meta_fill: filling unexpected range");
+
+               if (next_skip == 1) {
+                       nblks += blst_leaf_fill(&scan[i], allocBlk, v);
+               } else {
+                       nblks += blst_meta_fill(&scan[i], allocBlk, v,
+                           radix, next_skip - 1, blk);
+               }
+               count -= v;
+               allocBlk += v;
+               blk += radix;
+               i += next_skip;
+       }
+       scan->u.bmu_avail -= nblks;
+       return nblks;
+}
+
+/*
  * BLST_RADIX_INIT() - initialize radix tree
  *
  *     Initialize our meta structures and bitmaps and calculate the exact
@@ -768,41 +907,41 @@
 
        if (radix == BLIST_BMAP_RADIX) {
                printf(
-                   "%*.*s(%04x,%d): bitmap %08x big=%d\n", 
+                   "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n", 
                    tab, tab, "",
-                   blk, radix,
-                   scan->u.bmu_bitmap,
-                   scan->bm_bighint
+                   (long long)blk, (long long)radix,
+                   (long long)scan->u.bmu_bitmap,
+                   (long long)scan->bm_bighint
                );
                return;
        }
 
        if (scan->u.bmu_avail == 0) {
                printf(
-                   "%*.*s(%04x,%d) ALL ALLOCATED\n",
+                   "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
                    tab, tab, "",
-                   blk,
-                   radix
+                   (long long)blk,
+                   (long long)radix
                );
                return;
        }
        if (scan->u.bmu_avail == radix) {
                printf(
-                   "%*.*s(%04x,%d) ALL FREE\n",
+                   "%*.*s(%08llx,%lld) ALL FREE\n",
                    tab, tab, "",
-                   blk,
-                   radix
+                   (long long)blk,
+                   (long long)radix
                );
                return;
        }
 
        printf(
-           "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n",
+           "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
            tab, tab, "",
-           blk, radix,
-           scan->u.bmu_avail,
-           radix,
-           scan->bm_bighint
+           (long long)blk, (long long)radix,
+           (long long)scan->u.bmu_avail,
+           (long long)radix,
+           (long long)scan->bm_bighint
        );
 
        radix >>= BLIST_META_RADIX_SHIFT;
@@ -812,9 +951,9 @@
        for (i = 1; i <= skip; i += next_skip) {
                if (scan[i].bm_bighint == (daddr_t)-1) {
                        printf(
-                           "%*.*s(%04x,%d): Terminator\n",
+                           "%*.*s(%08llx,%lld): Terminator\n",
                            tab, tab, "",
-                           blk, radix
+                           (long long)blk, (long long)radix
                        );
                        lastState = 0;
                        break;
@@ -866,13 +1005,14 @@
                daddr_t count = 0;
 
 
-               printf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix);
+               printf("%lld/%lld/%lld> ", (long long)bl->bl_free,
+                   (long long)size, (long long)bl->bl_radix);
                fflush(stdout);
                if (fgets(buf, sizeof(buf), stdin) == NULL)
                        break;
                switch(buf[0]) {
                case 'r':
-                       if (sscanf(buf + 1, "%d", &count) == 1) {
+                       if (sscanf(buf + 1, "%lld", &count) == 1) {
                                blist_resize(&bl, count, 1);
                        } else {
                                printf("?\n");
@@ -881,26 +1021,37 @@
                        blist_print(bl);
                        break;
                case 'a':
-                       if (sscanf(buf + 1, "%d", &count) == 1) {
+                       if (sscanf(buf + 1, "%lld", &count) == 1) {
                                daddr_t blk = blist_alloc(bl, count);
-                               printf("    R=%04x\n", blk);
+                               printf("    R=%08llx\n", (long long)blk);
                        } else {
                                printf("?\n");
                        }
                        break;
                case 'f':
-                       if (sscanf(buf + 1, "%x %d", &da, &count) == 2) {
+                       if (sscanf(buf + 1, "%llx %lld",
+                           (long long *)&da, (long long *)&count) == 2) {
                                blist_free(bl, da, count);
                        } else {
                                printf("?\n");
                        }
                        break;
+               case 'l':
+                       if (sscanf(buf + 1, "%llx %lld",
+                           (long long *)&da, (long long *)&count) == 2) {
+                               printf("    n=%d\n",
+                                   blist_fill(bl, da, count));
+                       } else {
+                               printf("?\n");
+                       }
+                       break;
                case '?':
                case 'h':
                        puts(
                            "p          -print\n"
                            "a %d       -allocate\n"
                            "f %x %d    -free\n"
+                           "l %x %d    -fill\n"
                            "r %d       -resize\n"
                            "h/?        -help"
                        );
Index: sys/kern/syscalls.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/syscalls.c,v
retrieving revision 1.120
diff -u -r1.120 syscalls.c
--- sys/kern/syscalls.c 2002/10/01 02:37:35     1.120
+++ sys/kern/syscalls.c 2002/10/07 13:55:51
@@ -2,8 +2,8 @@
  * System call names.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/syscalls.c,v 1.120 2002/10/01 02:37:35 rwatson Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.124 2002/10/01 02:35:59 rwatson Exp 
+ * $FreeBSD$
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.125 2002/10/02 16:48:15 archie Exp 
  */
 
 char *syscallnames[] = {
@@ -422,4 +422,5 @@
        "#412",                 /* 412 = extattr_set_link */
        "#413",                 /* 413 = extattr_get_link */
        "#414",                 /* 414 = extattr_delete_link */
+       "swapoff",                      /* 415 = swapoff */
 };
Index: sys/kern/syscalls.master
===================================================================
RCS file: /home/ncvs/src/sys/kern/syscalls.master,v
retrieving revision 1.125
diff -u -r1.125 syscalls.master
--- sys/kern/syscalls.master    2002/10/02 16:48:15     1.125
+++ sys/kern/syscalls.master    2002/10/07 13:55:52
@@ -590,3 +590,4 @@
 412    UNIMPL  BSD     extattr_set_link
 413    UNIMPL  BSD     extattr_get_link
 414    UNIMPL  BSD     extattr_delete_link
+415     MSTD    BSD     { int swapoff(const char *name); }
Index: sys/sys/blist.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/blist.h,v
retrieving revision 1.4
diff -u -r1.4 blist.h
--- sys/sys/blist.h     2002/05/18 09:38:20     1.4
+++ sys/sys/blist.h     2002/10/07 13:55:58
@@ -9,6 +9,7 @@
  *             (void)  blist_destroy(blist)
  *             blkno = blist_alloc(blist, count)
  *             (void)  blist_free(blist, blkno, count)
+ *             nblks = blist_fill(blist, blkno, count)
  *             (void)  blist_resize(&blist, count, freeextra)
  *             
  *
@@ -78,6 +79,7 @@
 extern void blist_destroy(blist_t blist);
 extern daddr_t blist_alloc(blist_t blist, daddr_t count);
 extern void blist_free(blist_t blist, daddr_t blkno, daddr_t count);
+extern int blist_fill(blist_t bl, daddr_t blkno, daddr_t count);
 extern void blist_print(blist_t blist);
 extern void blist_resize(blist_t *pblist, daddr_t count, int freenew);
 
Index: sys/sys/conf.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/conf.h,v
retrieving revision 1.149
diff -u -r1.149 conf.h
--- sys/sys/conf.h      2002/09/27 19:47:56     1.149
+++ sys/sys/conf.h      2002/10/07 13:55:59
@@ -267,6 +267,7 @@
 };
 #define        SW_FREED        0x01
 #define        SW_SEQUENTIAL   0x02
+#define SW_CLOSING     0x04
 #define        sw_freed        sw_flags        /* XXX compat */
 
 #ifdef _KERNEL
Index: sys/sys/syscall.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/syscall.h,v
retrieving revision 1.119
diff -u -r1.119 syscall.h
--- sys/sys/syscall.h   2002/10/01 02:37:35     1.119
+++ sys/sys/syscall.h   2002/10/07 13:56:00
@@ -2,8 +2,8 @@
  * System call numbers.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/sys/syscall.h,v 1.119 2002/10/01 02:37:35 rwatson Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.124 2002/10/01 02:35:59 rwatson Exp 
+ * $FreeBSD$
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.125 2002/10/02 16:48:15 archie Exp 
  */
 
 #define        SYS_syscall     0
@@ -317,4 +317,5 @@
 #define        SYS_ksem_unlink 406
 #define        SYS_ksem_getvalue       407
 #define        SYS_ksem_destroy        408
-#define        SYS_MAXSYSCALL  415
+#define        SYS_swapoff     415
+#define        SYS_MAXSYSCALL  416
Index: sys/sys/syscall.mk
===================================================================
RCS file: /home/ncvs/src/sys/sys/syscall.mk,v
retrieving revision 1.74
diff -u -r1.74 syscall.mk
--- sys/sys/syscall.mk  2002/10/01 02:37:35     1.74
+++ sys/sys/syscall.mk  2002/10/07 13:56:00
@@ -1,7 +1,7 @@
 # FreeBSD system call names.
 # DO NOT EDIT-- this file is automatically generated.
-# $FreeBSD: src/sys/sys/syscall.mk,v 1.74 2002/10/01 02:37:35 rwatson Exp $
-# created from FreeBSD: src/sys/kern/syscalls.master,v 1.124 2002/10/01 02:35:59 rwatson Exp 
+# $FreeBSD$
+# created from FreeBSD: src/sys/kern/syscalls.master,v 1.125 2002/10/02 16:48:15 archie Exp 
 MIASM =  \
        syscall.o \
        exit.o \
@@ -265,4 +265,5 @@
        ksem_open.o \
        ksem_unlink.o \
        ksem_getvalue.o \
-       ksem_destroy.o
+       ksem_destroy.o \
+       swapoff.o
Index: sys/sys/sysproto.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/sysproto.h,v
retrieving revision 1.112
diff -u -r1.112 sysproto.h
--- sys/sys/sysproto.h  2002/10/02 16:48:16     1.112
+++ sys/sys/sysproto.h  2002/10/07 13:56:03
@@ -2,8 +2,8 @@
  * System call prototypes.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/sys/sysproto.h,v 1.112 2002/10/02 16:48:16 archie Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.124 2002/10/01 02:35:59 rwatson Exp 
+ * $FreeBSD$
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.125 2002/10/02 16:48:15 archie Exp 
  */
 
 #ifndef _SYS_SYSPROTO_H_
@@ -1157,6 +1157,9 @@
 struct ksem_destroy_args {
        char id_l_[PADL_(semid_t)]; semid_t id; char id_r_[PADR_(semid_t)];
 };
+struct swapoff_args {
+       char name_l_[PADL_(const char *)]; const char * name; char name_r_[PADR_(const char *)];
+};
 int    nosys(struct thread *, struct nosys_args *);
 void   sys_exit(struct thread *, struct sys_exit_args *);
 int    fork(struct thread *, struct fork_args *);
@@ -1419,6 +1422,7 @@
 int    ksem_unlink(struct thread *, struct ksem_unlink_args *);
 int    ksem_getvalue(struct thread *, struct ksem_getvalue_args *);
 int    ksem_destroy(struct thread *, struct ksem_destroy_args *);
+int    swapoff(struct thread *, struct swapoff_args *);
 
 #ifdef COMPAT_43
 
Index: sys/vm/swap_pager.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/swap_pager.c,v
retrieving revision 1.180
diff -u -r1.180 swap_pager.c
--- sys/vm/swap_pager.c 2002/09/28 17:15:32     1.180
+++ sys/vm/swap_pager.c 2002/10/07 13:56:05
@@ -197,6 +197,7 @@
 static __inline void   swp_sizecheck(void);
 static void    swp_pager_sync_iodone(struct buf *bp);
 static void    swp_pager_async_iodone(struct buf *bp);
+static __inline int    swp_pager_force_pagein(struct swblock *, int);
 
 /*
  * Swap bitmap functions
@@ -207,6 +208,8 @@
 /*
  * Metadata functions
  */
+static __inline struct swblock **
+    swp_pager_hash(vm_object_t object, vm_pindex_t index);
 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free_all(vm_object_t);
@@ -513,12 +516,22 @@
        daddr_t blk;
        int npages;
 {
+       struct swdevt *sp = &swdevt[BLK2DEVIDX(blk)];
+
        GIANT_REQUIRED;
 
+       /* per-swap area stats */
+       sp->sw_used -= npages;
+
+       /*
+        * If we are attempting to stop swapping on this device, we
+        * don't want to mark any blocks free lest they be reused.
+        */
+       if (sp->sw_flags & SW_CLOSING)
+               return;
+
        blist_free(swapblist, blk, npages);
        vm_swap_size += npages;
-       /* per-swap area stats */
-       swdevt[BLK2DEVIDX(blk)].sw_used -= npages;
        swp_sizecheck();
 }
 
@@ -1623,6 +1636,147 @@
            )
        );
        splx(s);
+}
+
+/*
+ *     swap_pager_isswapped:
+ *
+ *     Return 1 if the object has a swap block on device devidx whose
+ *     page is not present in the object (per vm_page_lookup()), else 0.
+ *     Probes swp_bcount consecutive SWAP_META_PAGES ranges from index 0.
+ */
+int
+swap_pager_isswapped(vm_object_t object, int devidx)
+{
+       daddr_t index = 0;
+       int bcount, i;
+
+       for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
+               struct swblock *swap;
+
+               swap = *swp_pager_hash(object, index);
+               if (swap != NULL) {
+                       for (i = 0; i < SWAP_META_PAGES; ++i) {
+                               daddr_t v = swap->swb_pages[i];
+                               if (v != SWAPBLK_NONE &&
+                                   BLK2DEVIDX(v) == devidx &&
+                                   !vm_page_lookup(object, swap->swb_index+i))
+                                       return 1;
+                       }
+               }
+               index += SWAP_META_PAGES;
+               if (index > 0x20000000)
+                       panic("swap_pager_isswapped: "
+                           "failed to locate all swap meta blocks");
+       }
+       return 0;
+}
+
+/*
+ *     swap_pager_swapoff:
+ *
+ *     Page in every page that is paged out to device devidx and
+ *     release its swap blocks.  The caller must already have marked
+ *     the device's blocks allocated in the bitmap, flagged the device
+ *     SW_CLOSING, and swapped in any processes swapped out to it.
+ *
+ *     This routine may block.
+ */
+void
+swap_pager_swapoff(int devidx)
+{
+       struct swblock **pswap;
+       struct swblock *swap;
+       daddr_t v;
+       int i, j, k;
+       int paged;
+
+       GIANT_REQUIRED;
+
+       for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
+restart:
+               pswap = &swhash[i];
+               while ((swap = *pswap) != NULL) {
+                       for (j = 0; j < SWAP_META_PAGES; j += paged) {
+                               paged = 1;
+                               v = swap->swb_pages[j];
+                               if (v == SWAPBLK_NONE ||
+                                   BLK2DEVIDX(v) != devidx)
+                                       continue;
+                               /* NOTE(review): assumes swap stays valid here. */
+                               paged = swp_pager_force_pagein(swap, j);
+                               /* Give back the blocks and clear their slots. */
+                               swp_pager_freeswapspace(swap->swb_pages[j],
+                                   paged);
+                               for (k = j; k < j + paged; k++)
+                                       swap->swb_pages[k] = SWAPBLK_NONE;
+                               swap->swb_count -= paged;
+                               if (swap->swb_count <= 0) {
+                                       *pswap = swap->swb_hnext;
+                                       --swap->swb_object->un_pager.swp.swp_bcount;
+                                       uma_zfree(swap_zone, swap);
+                               }
+                               /* swap may have been freed; rescan the bucket. */
+                               goto restart;
+                       }
+                       pswap = &swap->swb_hnext;
+               }
+       }
+}
+
+/*
+ * SWP_PAGER_FORCE_PAGEIN() - force pages on a device to be paged in
+ *
+ *     Make the page at slot idx of the given swap block resident and
+ *     dirty so that it no longer depends on its swap copy.  A page
+ *     that is already fully valid is activated; otherwise it is read
+ *     in with swap_pager_getpages() and passed to vm_page_dontneed().
+ *     The swap metadata associated with the page is left intact.
+ *     Returns the number of pages handled, which is currently always
+ *     1.  Panics if the object is not OBJT_SWAP or the read fails.
+ *
+ *     XXX This routine could be optimized to bring in adjacent pages
+ *         on the device.  (needs to be passed devidx as well)
+ */
+static __inline int
+swp_pager_force_pagein(struct swblock *swap, int idx)
+{
+       vm_object_t object;
+       vm_page_t m;
+       vm_pindex_t pindex;
+
+       object = swap->swb_object;
+       pindex = swap->swb_index + idx;
+
+       if (object->type != OBJT_SWAP)
+               panic("swp_pager_force_pagein: object not backed by swap");
+
+       m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+       if (m->valid == VM_PAGE_BITS_ALL) {
+               /*
+                * The page is already in memory, but must be
+                * dirtied, since we're taking away its backing store.
+                */
+               vm_page_lock_queues();
+               vm_page_activate(m);
+               vm_page_dirty(m);
+               vm_page_wakeup(m);
+               vm_page_unlock_queues();
+               return 1;
+       }
+
+       vm_object_pip_add(object, 1);
+       if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK)
+               panic("swp_pager_force_pagein: read from swap failed");/*XXX*/
+       vm_object_pip_subtract(object, 1);
+       /* Dirty the page under the page queues lock, as in the case above. */
+       vm_page_lock_queues();
+       vm_page_dirty(m);
+       vm_page_dontneed(m);
+       vm_page_wakeup(m);
+       vm_page_unlock_queues();
+
+       return 1;
 }
 
 /************************************************************************
Index: sys/vm/swap_pager.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/swap_pager.h,v
retrieving revision 1.34
diff -u -r1.34 swap_pager.h
--- sys/vm/swap_pager.h 2002/09/05 14:04:34     1.34
+++ sys/vm/swap_pager.h 2002/10/07 13:56:05
@@ -83,9 +83,11 @@
 extern int swap_pager_full;
 extern struct blist *swapblist;
 extern struct uma_zone *swap_zone;
+extern int nswap_lowat, nswap_hiwat;
 
 void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
+void swap_pager_swapoff(int devidx);
 
 int swap_pager_swp_alloc(vm_object_t, int);
 void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int);
Index: sys/vm/vm_glue.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_glue.c,v
retrieving revision 1.157
diff -u -r1.157 vm_glue.c
--- sys/vm/vm_glue.c    2002/09/28 17:15:33     1.157
+++ sys/vm/vm_glue.c    2002/10/07 13:56:06
@@ -324,6 +324,35 @@
        up = (vm_offset_t)p->p_uarea;
        pmap_qenter(up, ma, UAREA_PAGES);
 }
+
+/*
+ * Call faultin() on each process whose UAREA object has swap on devidx.
+ */
+void
+vm_proc_swapin_all(int devidx)
+{
+       struct proc *p;
+
+retry:
+       sx_slock(&allproc_lock);
+       FOREACH_PROC_IN_SYSTEM(p) {
+               PROC_LOCK(p);
+               mtx_lock_spin(&sched_lock);
+               /* NOTE(review): confirm faultin() expects both locks held. */
+               if (p->p_upages_obj != NULL &&
+                   swap_pager_isswapped(p->p_upages_obj, devidx)) {
+                       sx_sunlock(&allproc_lock);
+                       faultin(p);
+                       mtx_unlock_spin(&sched_lock);
+                       PROC_UNLOCK(p);
+                       goto retry;     /* allproc_lock was dropped; rescan */
+               }
+               /* Nothing to fault in for p; unlock it and move on. */
+               mtx_unlock_spin(&sched_lock);
+               PROC_UNLOCK(p);
+       }
+       sx_sunlock(&allproc_lock);
+}
 #endif
 
 /*
Index: sys/vm/vm_pageout.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_pageout.h,v
retrieving revision 1.30
diff -u -r1.30 vm_pageout.h
--- sys/vm/vm_pageout.h 2002/03/19 22:17:45     1.30
+++ sys/vm/vm_pageout.h 2002/10/07 13:56:06
@@ -104,6 +104,12 @@
 extern void vm_wait(void);
 extern void vm_waitpfault(void);
 
+/* XXX This is probably misplaced. */
+#ifndef NO_SWAPPING
+void vm_proc_swapin_all(int);
+int swap_pager_isswapped(vm_object_t, int);
+#endif /* !NO_SWAPPING */
+
 #ifdef _KERNEL
 void vm_pageout_page(vm_page_t, vm_object_t);
 void vm_pageout_cluster(vm_page_t, vm_object_t);
Index: sys/vm/vm_swap.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_swap.c,v
retrieving revision 1.122
diff -u -r1.122 vm_swap.c
--- sys/vm/vm_swap.c    2002/09/25 01:24:17     1.122
+++ sys/vm/vm_swap.c    2002/10/07 13:56:10
@@ -35,6 +35,7 @@
  */
 
 #include "opt_swap.h"
+#include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -56,6 +57,7 @@
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
@@ -163,11 +165,12 @@
 
 /*
  * Create a special vnode op vector for swapdev_vp - we only use
- * VOP_STRATEGY(), everything else returns an error.
+ * VOP_STRATEGY() and reclaim; everything else returns an error.
  */
 vop_t **swapdev_vnodeop_p;
 static struct vnodeopv_entry_desc swapdev_vnodeop_entries[] = {  
        { &vop_default_desc,            (vop_t *) vop_defaultop },
+       { &vop_reclaim_desc,            (vop_t *) vop_null },
        { &vop_strategy_desc,           (vop_t *) swapdev_strategy },
        { NULL, NULL }
 };
@@ -324,7 +327,7 @@
        sp->sw_vp = vp;
        sp->sw_dev = dev2udev(dev);
        sp->sw_device = dev;
-       sp->sw_flags |= SW_FREED;
+       sp->sw_flags = SW_FREED;
        sp->sw_nblks = nblks;
        sp->sw_used = 0;
 
@@ -350,7 +353,121 @@
                vm_swap_size += blk;
        }
 
+       swap_pager_full = 0;
+
        return (0);
+}
+
+/*
+ * SYSCALL: swapoff(devname)
+ *
+ * Disable swapping on the given device.  Fails with the suser() or
+ * namei() error, EINVAL if the path is not an active swap device, or
+ * ENOMEM if free memory plus free swap cannot hold the device's pages.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct swapoff_args {
+       char *name;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+int
+swapoff(td, uap)
+       struct thread *td;
+       struct swapoff_args *uap;
+{
+       struct vnode *vp;
+       struct nameidata nd;
+       struct swdevt *sp;
+       swblk_t dvbase, vsbase;
+       u_long nblks, aligned_nblks, blk;
+       int error, index;
+
+       mtx_lock(&Giant);
+
+       error = suser(td);
+       if (error)
+               goto done;
+
+       NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
+       error = namei(&nd);
+       if (error)
+               goto done;
+       NDFREE(&nd, NDF_ONLY_PNBUF);
+       vp = nd.ni_vp;
+
+       for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) {
+               if (sp->sw_vp == vp)
+                       goto found;
+       }
+       vrele(vp);              /* drop the reference taken by namei() */
+       error = EINVAL;
+       goto done;
+found:
+       nblks = sp->sw_nblks;
+
+       /*
+        * We can turn off this swap device safely only if the
+        * available virtual memory in the system will fit the amount
+        * of data we will have to page back in, plus an epsilon so
+        * the system doesn't become critically low on swap space.
+        */
+       if (cnt.v_free_count + cnt.v_cache_count + vm_swap_size <
+           sp->sw_used + nswap_lowat) {
+               vrele(vp);      /* drop the reference taken by namei() */
+               error = ENOMEM;
+               goto done;
+       }
+
+       /* Prevent further allocations on this device. */
+       sp->sw_flags |= SW_CLOSING;
+       for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {
+               blk = min(nblks - dvbase, dmmax);
+               vsbase = index * dmmax + dvbase * nswdev;
+               vm_swap_size -= blist_fill(swapblist, vsbase, blk);
+       }
+
+       /* Page in the contents of the device and close it. */
+#ifndef NO_SWAPPING
+       vm_proc_swapin_all(index);
+#endif /* !NO_SWAPPING */
+       swap_pager_swapoff(index);
+
+       if (sp->sw_used != 0)
+               panic("swapoff: failed to locate %d swap blocks", sp->sw_used);
+
+       /* NOTE(review): one vrele() here; confirm namei()'s ref is covered. */
+       VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+       vrele(vp);
+       sp->sw_vp = NULL;
+
+       /*
+        * Resize the bitmap based on the new largest swap device,
+        * or free the bitmap if there are no more devices.
+        */
+       for (sp = swdevt, nblks = 0; sp < swdevt + nswdev; sp++) {
+               if (sp->sw_vp == NULL)
+                       continue;
+               nblks = max(nblks, sp->sw_nblks);
+       }
+
+       aligned_nblks = (nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1);
+       nswap = aligned_nblks * nswdev;
+
+       if (nswap == 0) {
+               blist_destroy(swapblist);
+               swapblist = NULL;
+               vrele(swapdev_vp);
+               swapdev_vp = NULL;
+       } else
+               blist_resize(&swapblist, nswap, 0);
+
+done:
+       mtx_unlock(&Giant);
+       return (error);
 }
 
 static int


To Unsubscribe: send mail to [EMAIL PROTECTED]
with "unsubscribe freebsd-hackers" in the body of the message

Reply via email to