Module Name:    src
Committed By:   riastradh
Date:           Sat Sep 11 10:09:31 UTC 2021

Modified Files:
        src/sys/kern: kern_ksyms.c

Log Message:
ksyms(4): Take a complete snapshot on each open.

- Snapshots are stored in pageable anonymous uvm objects.
- Snapshots are reference-counted so they can be reused across opens.
- Opening /dev/ksyms blocks module unload until the snapshot is taken.
- Merely holding /dev/ksyms open does not block module unload.
- /dev/ksyms is now mmappable.

This slightly changes the behaviour of fstat(2) on /dev/ksyms -- it
is a little more useful now!  In particular, st_size is the size of
the snapshot taken at open: the ELF header followed by the symbol
table, the string table, and the CTF data.  Some other fields which
were not very useful to begin with -- st_dev, st_ino, st_mode,
st_nlink, st_*time, st_blksize, st_blocks -- are now different, and
independent of the file system on which the device node resides.

Discussed in

https://mail-index.netbsd.org/source-changes-d/2021/08/17/msg013425.html

This is option (3), adapted to make the ksyms snapshots pageable,
after options (1) and (2) posed practical problems.


To generate a diff of this commit:
cvs rdiff -u -r1.102 -r1.103 src/sys/kern/kern_ksyms.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/kern/kern_ksyms.c
diff -u src/sys/kern/kern_ksyms.c:1.102 src/sys/kern/kern_ksyms.c:1.103
--- src/sys/kern/kern_ksyms.c:1.102	Tue Sep  7 16:56:25 2021
+++ src/sys/kern/kern_ksyms.c	Sat Sep 11 10:09:31 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_ksyms.c,v 1.102 2021/09/07 16:56:25 riastradh Exp $	*/
+/*	$NetBSD: kern_ksyms.c,v 1.103 2021/09/11 10:09:31 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -73,7 +73,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c,v 1.102 2021/09/07 16:56:25 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c,v 1.103 2021/09/11 10:09:31 riastradh Exp $");
 
 #if defined(_KERNEL) && defined(_KERNEL_OPT)
 #include "opt_copy_symtab.h"
@@ -86,6 +86,9 @@ __KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/exec.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kauth.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kmem.h>
@@ -94,6 +97,9 @@ __KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c
 #include <sys/ksyms.h>
 #include <sys/kernel.h>
 #include <sys/intr.h>
+#include <sys/stat.h>
+
+#include <uvm/uvm_extern.h>
 
 #ifdef DDB
 #include <ddb/db_output.h>
@@ -104,6 +110,15 @@ __KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c
 #include "ioconf.h"
 #endif
 
+struct ksyms_snapshot {
+	uint64_t		ks_refcnt;
+	uint64_t		ks_gen;
+	struct uvm_object	*ks_uobj;
+	size_t			ks_size;
+	dev_t			ks_dev;
+	int			ks_maxlen;
+};
+
 #define KSYMS_MAX_ID	98304
 #ifdef KDTRACE_HOOKS
 static uint32_t ksyms_nmap[KSYMS_MAX_ID];	/* sorted symbol table map */
@@ -112,15 +127,20 @@ static uint32_t *ksyms_nmap = NULL;
 #endif
 
 static int ksyms_maxlen;
-static uint64_t ksyms_opencnt;
-static struct ksyms_symtab *ksyms_last_snapshot;
 static bool ksyms_initted;
 static bool ksyms_loaded;
 static kmutex_t ksyms_lock __cacheline_aligned;
 static struct ksyms_symtab kernel_symtab;
+static kcondvar_t ksyms_cv;
+static struct lwp *ksyms_snapshotting;
+static struct ksyms_snapshot *ksyms_snapshot;
+static uint64_t ksyms_snapshot_gen;
 
 static void ksyms_hdr_init(const void *);
 static void ksyms_sizes_calc(void);
+static struct ksyms_snapshot *ksyms_snapshot_alloc(int, size_t, dev_t,
+    uint64_t);
+static void ksyms_snapshot_release(struct ksyms_snapshot *);
 
 #ifdef KSYMS_DEBUG
 #define	FOLLOW_CALLS		1
@@ -245,6 +265,7 @@ ksyms_init(void)
 
 	if (!ksyms_initted) {
 		mutex_init(&ksyms_lock, MUTEX_DEFAULT, IPL_NONE);
+		cv_init(&ksyms_cv, "ksyms");
 		ksyms_initted = true;
 	}
 }
@@ -328,7 +349,6 @@ addsymtab(const char *name, void *symsta
 	tab->sd_minsym = UINTPTR_MAX;
 	tab->sd_maxsym = 0;
 	tab->sd_usroffset = 0;
-	tab->sd_gone = false;
 	tab->sd_ctfstart = ctfstart;
 	tab->sd_ctfsize = ctfsize;
 	tab->sd_nmap = nmap;
@@ -446,9 +466,9 @@ addsymtab(const char *name, void *symsta
 	KASSERT(cold || mutex_owned(&ksyms_lock));
 
 	/*
-	 * Ensure ddb never witnesses an inconsistent state of the
-	 * queue, unless memory is so corrupt that we crash in
-	 * TAILQ_INSERT_TAIL.
+	 * Publish the symtab.  Do this at splhigh to ensure ddb never
+	 * witnesses an inconsistent state of the queue, unless memory
+	 * is so corrupt that we crash in TAILQ_INSERT_TAIL.
 	 */
 	s = splhigh();
 	TAILQ_INSERT_TAIL(&ksyms_symtabs, tab, sd_queue);
@@ -557,6 +577,9 @@ ksyms_addsyms_elf(int symsize, void *sta
 	    kernel_symtab.sd_symstart, kernel_symtab.sd_strstart,
 	    (long)kernel_symtab.sd_symsize/sizeof(Elf_Sym));
 #endif
+
+	/* Should be no snapshot to invalidate yet.  */
+	KASSERT(ksyms_snapshot == NULL);
 }
 
 /*
@@ -577,6 +600,9 @@ ksyms_addsyms_explicit(void *ehdr, void 
 	ksyms_hdr_init(ehdr);
 	addsymtab("netbsd", symstart, symsize, strstart, strsize,
 	    &kernel_symtab, symstart, NULL, 0, ksyms_nmap);
+
+	/* Should be no snapshot to invalidate yet.  */
+	KASSERT(ksyms_snapshot == NULL);
 }
 
 /*
@@ -601,8 +627,6 @@ ksyms_getval_unlocked(const char *mod, c
 #endif
 
 	TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
-		if (__predict_false(st->sd_gone))
-			continue;
 		if (mod != NULL && strcmp(st->sd_name, mod))
 			continue;
 		if ((es = findsym(sym, st, type)) != NULL) {
@@ -636,8 +660,6 @@ ksyms_get_mod(const char *mod)
 
 	mutex_enter(&ksyms_lock);
 	TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
-		if (__predict_false(st->sd_gone))
-			continue;
 		if (mod != NULL && strcmp(st->sd_name, mod))
 			continue;
 		break;
@@ -671,8 +693,6 @@ ksyms_mod_foreach(const char *mod, ksyms
 
 	/* find the module */
 	TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
-		if (__predict_false(st->sd_gone))
-			continue;
 		if (mod != NULL && strcmp(st->sd_name, mod))
 			continue;
 
@@ -716,8 +736,6 @@ ksyms_getname(const char **mod, const ch
 		return ENOENT;
 
 	TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
-		if (st->sd_gone)
-			continue;
 		if (v < st->sd_minsym || v > st->sd_maxsym)
 			continue;
 		sz = st->sd_symsize/sizeof(Elf_Sym);
@@ -762,6 +780,7 @@ ksyms_modload(const char *name, void *sy
     char *strstart, vsize_t strsize)
 {
 	struct ksyms_symtab *st;
+	struct ksyms_snapshot *ks;
 	void *nmap;
 
 	st = kmem_zalloc(sizeof(*st), KM_SLEEP);
@@ -770,7 +789,12 @@ ksyms_modload(const char *name, void *sy
 	mutex_enter(&ksyms_lock);
 	addsymtab(name, symstart, symsize, strstart, strsize, st, symstart,
 	    NULL, 0, nmap);
+	ks = ksyms_snapshot;
+	ksyms_snapshot = NULL;
 	mutex_exit(&ksyms_lock);
+
+	if (ks)
+		ksyms_snapshot_release(ks);
 }
 
 /*
@@ -780,37 +804,48 @@ void
 ksyms_modunload(const char *name)
 {
 	struct ksyms_symtab *st;
-	bool do_free = false;
+	struct ksyms_snapshot *ks;
 	int s;
 
 	mutex_enter(&ksyms_lock);
 	TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
-		if (st->sd_gone)
-			continue;
 		if (strcmp(name, st->sd_name) != 0)
 			continue;
-		st->sd_gone = true;
-		ksyms_sizes_calc();
-		if (ksyms_opencnt == 0) {
-			/*
-			 * Ensure ddb never witnesses an inconsistent
-			 * state of the queue, unless memory is so
-			 * corrupt that we crash in TAILQ_REMOVE.
-			 */
-			s = splhigh();
-			TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue);
-			splx(s);
-			do_free = true;
-		}
 		break;
 	}
-	mutex_exit(&ksyms_lock);
 	KASSERT(st != NULL);
 
-	if (do_free) {
-		kmem_free(st->sd_nmap, st->sd_nmapsize * sizeof(uint32_t));
-		kmem_free(st, sizeof(*st));
-	}
+	/* Wait for any snapshot in progress to complete.  */
+	while (ksyms_snapshotting)
+		cv_wait(&ksyms_cv, &ksyms_lock);
+
+	/*
+	 * Remove the symtab.  Do this at splhigh to ensure ddb never
+	 * witnesses an inconsistent state of the queue, unless memory
+	 * is so corrupt that we crash in TAILQ_REMOVE.
+	 */
+	s = splhigh();
+	TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue);
+	splx(s);
+
+	/* Recompute the ksyms sizes now that we've removed st.  */
+	ksyms_sizes_calc();
+
+	/* Invalidate the global ksyms snapshot.  */
+	ks = ksyms_snapshot;
+	ksyms_snapshot = NULL;
+	mutex_exit(&ksyms_lock);
+
+	/*
+	 * No more references are possible.  Free the name map and the
+	 * symtab itself, which we had allocated in ksyms_modload.
+	 */
+	kmem_free(st->sd_nmap, st->sd_nmapsize * sizeof(uint32_t));
+	kmem_free(st, sizeof(*st));
+
+	/* Release the formerly global ksyms snapshot, if any.  */
+	if (ks)
+		ksyms_snapshot_release(ks);
 }
 
 #ifdef DDB
@@ -830,8 +865,6 @@ ksyms_sift(char *mod, char *sym, int mod
 		return ENOENT;
 
 	TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
-		if (st->sd_gone)
-			continue;
 		if (mod && strcmp(mod, st->sd_name))
 			continue;
 		sb = st->sd_strstart - st->sd_usroffset;
@@ -893,8 +926,6 @@ ksyms_sizes_calc(void)
 
 	ksyms_symsz = ksyms_strsz = 0;
 	TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
-		if (__predict_false(st->sd_gone))
-			continue;
 		delta = ksyms_strsz - st->sd_usroffset;
 		if (delta != 0) {
 			for (i = 0; i < st->sd_symsize/sizeof(Elf_Sym); i++)
@@ -997,164 +1028,401 @@ ksyms_hdr_init(const void *hdraddr)
 	SHTCOPY(".SUNW_ctf");
 }
 
-static int
-ksymsopen(dev_t dev, int oflags, int devtype, struct lwp *l)
+static struct ksyms_snapshot *
+ksyms_snapshot_alloc(int maxlen, size_t size, dev_t dev, uint64_t gen)
 {
-	if (minor(dev) != 0 || !ksyms_loaded)
-		return ENXIO;
+	struct ksyms_snapshot *ks;
 
-	/*
-	 * Create a "snapshot" of the kernel symbol table.  Bumping
-	 * ksyms_opencnt will prevent symbol tables from being freed.
-	 */
-	mutex_enter(&ksyms_lock);
-	if (ksyms_opencnt++)
-		goto out;
-	ksyms_hdr.kh_shdr[SYMTAB].sh_size = ksyms_symsz;
-	ksyms_hdr.kh_shdr[SYMTAB].sh_info = ksyms_symsz / sizeof(Elf_Sym);
-	ksyms_hdr.kh_shdr[STRTAB].sh_offset = ksyms_symsz +
-	    ksyms_hdr.kh_shdr[SYMTAB].sh_offset;
-	ksyms_hdr.kh_shdr[STRTAB].sh_size = ksyms_strsz;
-	ksyms_hdr.kh_shdr[SHCTF].sh_offset = ksyms_strsz +
-	    ksyms_hdr.kh_shdr[STRTAB].sh_offset;
-	ksyms_hdr.kh_shdr[SHCTF].sh_size = ksyms_ctfsz;
-	ksyms_last_snapshot = TAILQ_LAST(&ksyms_symtabs, ksyms_symtab_queue);
-out:	mutex_exit(&ksyms_lock);
+	ks = kmem_zalloc(sizeof(*ks), KM_SLEEP);
+	ks->ks_refcnt = 1;
+	ks->ks_gen = gen;
+	ks->ks_uobj = uao_create(size, 0);
+	ks->ks_size = size;
+	ks->ks_dev = dev;
+	ks->ks_maxlen = maxlen;
 
-	return 0;
+	return ks;
 }
 
-static int
-ksymsclose(dev_t dev, int oflags, int devtype, struct lwp *l)
+static void
+ksyms_snapshot_release(struct ksyms_snapshot *ks)
 {
-	struct ksyms_symtab *st, *next;
-	TAILQ_HEAD(, ksyms_symtab) to_free = TAILQ_HEAD_INITIALIZER(to_free);
-	int s;
+	uint64_t refcnt;
 
-	/* Discard references to symbol tables. */
 	mutex_enter(&ksyms_lock);
-	if (--ksyms_opencnt)
-		goto out;
-	ksyms_last_snapshot = NULL;
-	TAILQ_FOREACH_SAFE(st, &ksyms_symtabs, sd_queue, next) {
-		if (st->sd_gone) {
-			/*
-			 * Ensure ddb never witnesses an inconsistent
-			 * state of the queue, unless memory is so
-			 * corrupt that we crash in TAILQ_REMOVE.
-			 */
-			s = splhigh();
-			TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue);
-			splx(s);
-			TAILQ_INSERT_TAIL(&to_free, st, sd_queue);
-		}
-	}
-	if (!TAILQ_EMPTY(&to_free))
-		ksyms_sizes_calc();
-out:	mutex_exit(&ksyms_lock);
+	refcnt = --ks->ks_refcnt;
+	mutex_exit(&ksyms_lock);
 
-	TAILQ_FOREACH_SAFE(st, &to_free, sd_queue, next) {
-		kmem_free(st->sd_nmap, st->sd_nmapsize * sizeof(uint32_t));
-		kmem_free(st, sizeof(*st));
-	}
+	if (refcnt)
+		return;
 
-	return 0;
+	uao_detach(ks->ks_uobj);
+	kmem_free(ks, sizeof(*ks));
 }
 
 static int
-ksymsread(dev_t dev, struct uio *uio, int ioflag)
+ubc_copyfrombuf(struct uvm_object *uobj, struct uio *uio, const void *buf,
+    size_t n)
 {
+	struct iovec iov = { .iov_base = __UNCONST(buf), .iov_len = n };
+
+	uio->uio_iov = &iov;
+	uio->uio_iovcnt = 1;
+	uio->uio_resid = n;
+
+	return ubc_uiomove(uobj, uio, n, UVM_ADV_SEQUENTIAL, UBC_WRITE);
+}
+
+static int
+ksyms_take_snapshot(struct ksyms_snapshot *ks, struct ksyms_symtab *last)
+{
+	struct uvm_object *uobj = ks->ks_uobj;
+	struct uio uio;
 	struct ksyms_symtab *st;
-	size_t filepos, inpos, off;
 	int error;
 
+	/* Caller must have initiated snapshotting.  */
+	KASSERT(ksyms_snapshotting == curlwp);
+
+	/* Start a uio transfer to reuse incrementally.  */
+	uio.uio_offset = 0;
+	uio.uio_rw = UIO_WRITE; /* write from buffer to uobj */
+	UIO_SETUP_SYSSPACE(&uio);
+
 	/*
-	 * First: Copy out the ELF header.   XXX Lose if ksymsopen()
-	 * occurs during read of the header.
+	 * First: Copy out the ELF header.
 	 */
-	off = uio->uio_offset;
-	if (off < sizeof(struct ksyms_hdr)) {
-		error = uiomove((char *)&ksyms_hdr + off,
-		    sizeof(struct ksyms_hdr) - off, uio);
-		if (error != 0)
-			return error;
-	}
+	error = ubc_copyfrombuf(uobj, &uio, &ksyms_hdr, sizeof(ksyms_hdr));
+	if (error)
+		return error;
 
 	/*
-	 * Copy out the symbol table.
+	 * Copy out the symbol table.  The list of symtabs is
+	 * guaranteed to be nonempty because we always have an entry
+	 * for the main kernel.  We stop at last, not at the end of the
+	 * tailq or NULL, because entries beyond last are not included
+	 * in this snapshot (and may not be fully initialized memory as
+	 * we witness it).
 	 */
-	filepos = sizeof(struct ksyms_hdr);
-	TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
-		if (__predict_false(st->sd_gone))
-			continue;
-		if (uio->uio_resid == 0)
-			return 0;
-		if (uio->uio_offset <= st->sd_symsize + filepos) {
-			inpos = uio->uio_offset - filepos;
-			error = uiomove((char *)st->sd_symstart + inpos,
-			   st->sd_symsize - inpos, uio);
-			if (error != 0)
-				return error;
-		}
-		filepos += st->sd_symsize;
-		if (st == ksyms_last_snapshot)
+	KASSERT(uio.uio_offset == sizeof(struct ksyms_hdr));
+	for (st = TAILQ_FIRST(&ksyms_symtabs);
+	     ;
+	     st = TAILQ_NEXT(st, sd_queue)) {
+		error = ubc_copyfrombuf(uobj, &uio, st->sd_symstart,
+		    st->sd_symsize);
+		if (error)
+			return error;
+		if (st == last)
 			break;
 	}
 
 	/*
 	 * Copy out the string table
 	 */
-	KASSERT(filepos == sizeof(struct ksyms_hdr) +
+	KASSERT(uio.uio_offset == sizeof(struct ksyms_hdr) +
 	    ksyms_hdr.kh_shdr[SYMTAB].sh_size);
 	for (st = TAILQ_FIRST(&ksyms_symtabs);
 	     ;
 	     st = TAILQ_NEXT(st, sd_queue)) {
-		if (uio->uio_resid == 0)
-			return 0;
-		if (uio->uio_offset <= st->sd_strsize + filepos) {
-			inpos = uio->uio_offset - filepos;
-			error = uiomove((char *)st->sd_strstart + inpos,
-			   st->sd_strsize - inpos, uio);
-			if (error != 0)
-				return error;
-		}
-		filepos += st->sd_strsize;
-		if (st == ksyms_last_snapshot)
+		error = ubc_copyfrombuf(uobj, &uio, st->sd_strstart,
+		    st->sd_strsize);
+		if (error)
+			return error;
+		if (st == last)
 			break;
 	}
 
 	/*
 	 * Copy out the CTF table.
 	 */
+	KASSERT(uio.uio_offset == sizeof(struct ksyms_hdr) +
+	    ksyms_hdr.kh_shdr[SYMTAB].sh_size +
+	    ksyms_hdr.kh_shdr[STRTAB].sh_size);
 	st = TAILQ_FIRST(&ksyms_symtabs);
 	if (st->sd_ctfstart != NULL) {
-		if (uio->uio_resid == 0)
-			return 0;
-		if (uio->uio_offset <= st->sd_ctfsize + filepos) {
-			inpos = uio->uio_offset - filepos;
-			error = uiomove((char *)st->sd_ctfstart + inpos,
-			    st->sd_ctfsize - inpos, uio);
-			if (error != 0)
-				return error;
-		}
-		filepos += st->sd_ctfsize;
+		error = ubc_copyfrombuf(uobj, &uio, st->sd_ctfstart,
+		    st->sd_ctfsize);
+		if (error)
+			return error;
 	}
 
+	KASSERT(uio.uio_offset == sizeof(struct ksyms_hdr) +
+	    ksyms_hdr.kh_shdr[SYMTAB].sh_size +
+	    ksyms_hdr.kh_shdr[STRTAB].sh_size +
+	    ksyms_hdr.kh_shdr[SHCTF].sh_size);
+	KASSERT(uio.uio_offset == ks->ks_size);
+
 	return 0;
 }
 
+static const struct fileops ksyms_fileops;
+
 static int
-ksymswrite(dev_t dev, struct uio *uio, int ioflag)
+ksymsopen(dev_t dev, int flags, int devtype, struct lwp *l)
 {
-	return EROFS;
+	struct file *fp = NULL;
+	int fd = -1;
+	struct ksyms_snapshot *ks = NULL;
+	size_t size;
+	struct ksyms_symtab *last;
+	int maxlen;
+	uint64_t gen;
+	int error;
+
+	if (minor(dev) != 0 || !ksyms_loaded)
+		return ENXIO;
+
+	/* Allocate a private file.  */
+	error = fd_allocfile(&fp, &fd);
+	if (error)
+		return error;
+
+	mutex_enter(&ksyms_lock);
+
+	/*
+	 * Wait until we have a snapshot, or until there is no snapshot
+	 * being taken right now so we can take one.
+	 */
+	while ((ks = ksyms_snapshot) == NULL && ksyms_snapshotting) {
+		error = cv_wait_sig(&ksyms_cv, &ksyms_lock);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * If there's a usable snapshot, increment its reference count
+	 * (can't overflow, 64-bit) and just reuse it.
+	 */
+	if (ks) {
+		ks->ks_refcnt++;
+		goto out;
+	}
+
+	/* Find the current length of the symtab object. */
+	size = sizeof(struct ksyms_hdr);
+	size += ksyms_strsz;
+	size += ksyms_symsz;
+	size += ksyms_ctfsz;
+
+	/* Start a new snapshot.  */
+	ksyms_hdr.kh_shdr[SYMTAB].sh_size = ksyms_symsz;
+	ksyms_hdr.kh_shdr[SYMTAB].sh_info = ksyms_symsz / sizeof(Elf_Sym);
+	ksyms_hdr.kh_shdr[STRTAB].sh_offset = ksyms_symsz +
+	    ksyms_hdr.kh_shdr[SYMTAB].sh_offset;
+	ksyms_hdr.kh_shdr[STRTAB].sh_size = ksyms_strsz;
+	ksyms_hdr.kh_shdr[SHCTF].sh_offset = ksyms_strsz +
+	    ksyms_hdr.kh_shdr[STRTAB].sh_offset;
+	ksyms_hdr.kh_shdr[SHCTF].sh_size = ksyms_ctfsz;
+	last = TAILQ_LAST(&ksyms_symtabs, ksyms_symtab_queue);
+	maxlen = ksyms_maxlen;
+	gen = ksyms_snapshot_gen++;
+
+	/*
+	 * Prevent ksyms entries from being removed while we take the
+	 * snapshot.
+	 */
+	KASSERT(ksyms_snapshotting == NULL);
+	ksyms_snapshotting = curlwp;
+	mutex_exit(&ksyms_lock);
+
+	/* Create a snapshot and write the symtab to it.  */
+	ks = ksyms_snapshot_alloc(maxlen, size, dev, gen);
+	error = ksyms_take_snapshot(ks, last);
+
+	/*
+	 * Snapshot creation is done.  Wake up anyone waiting to remove
+	 * entries (module unload).
+	 */
+	mutex_enter(&ksyms_lock);
+	KASSERTMSG(ksyms_snapshotting == curlwp, "lwp %p stole snapshot",
+	    ksyms_snapshotting);
+	ksyms_snapshotting = NULL;
+	cv_broadcast(&ksyms_cv);
+
+	/* If we failed, give up.  */
+	if (error)
+		goto out;
+
+	/* Cache the snapshot for the next reader.  */
+	KASSERT(ksyms_snapshot == NULL);
+	ksyms_snapshot = ks;
+	ks->ks_refcnt++;
+	KASSERT(ks->ks_refcnt == 2);
+
+out:	mutex_exit(&ksyms_lock);
+	if (error) {
+		if (fp)
+			fd_abort(curproc, fp, fd);
+		if (ks)
+			ksyms_snapshot_release(ks);
+	} else {
+		KASSERT(fp);
+		KASSERT(ks);
+		error = fd_clone(fp, fd, flags, &ksyms_fileops, ks);
+		KASSERTMSG(error == EMOVEFD, "error=%d", error);
+	}
+	return error;
+}
+
+static int
+ksymsclose(struct file *fp)
+{
+	struct ksyms_snapshot *ks = fp->f_data;
+
+	ksyms_snapshot_release(ks);
+
+	return 0;
+}
+
+static int
+ksymsread(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
+    int flags)
+{
+	const struct ksyms_snapshot *ks = fp->f_data;
+	size_t count;
+	int error;
+
+	/*
+	 * Since we don't have a per-object lock, we might as well use
+	 * the struct file lock to serialize access to fp->f_offset --
+	 * but if the caller isn't relying on or updating fp->f_offset,
+	 * there's no need to do even that.  We could use ksyms_lock,
+	 * but why bother with a global lock if not needed?  Either
+	 * way, the lock we use here must agree with what ksymsseek
+	 * takes (nothing else in ksyms uses fp->f_offset).
+	 */
+	if (offp == &fp->f_offset)
+		mutex_enter(&fp->f_lock);
+
+	/* Refuse negative offsets.  */
+	if (*offp < 0) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/* Return nothing at or past end of file.  */
+	if (*offp >= ks->ks_size) {
+		error = 0;
+		goto out;
+	}
+
+	/*
+	 * 1. Set up the uio to transfer from offset *offp.
+	 * 2. Transfer as many bytes as we can (at most uio->uio_resid
+	 *    or what's left in the ksyms).
+	 * 3. If requested, update *offp to reflect the number of bytes
+	 *    transferred.
+	 */
+	uio->uio_offset = *offp;
+	count = uio->uio_resid;
+	error = ubc_uiomove(ks->ks_uobj, uio, MIN(count, ks->ks_size - *offp),
+	    UVM_ADV_SEQUENTIAL, UBC_READ|UBC_PARTIALOK);
+	if (flags & FOF_UPDATE_OFFSET)
+		*offp += count - uio->uio_resid;
+
+out:	if (offp == &fp->f_offset)
+		mutex_exit(&fp->f_lock);
+	return error;
+}
+
+static int
+ksymsstat(struct file *fp, struct stat *st)
+{
+	const struct ksyms_snapshot *ks = fp->f_data;
+
+	memset(st, 0, sizeof(*st));
+
+	st->st_dev = NODEV;
+	st->st_ino = 0;
+	st->st_mode = S_IFCHR;
+	st->st_nlink = 1;
+	st->st_uid = kauth_cred_geteuid(fp->f_cred);
+	st->st_gid = kauth_cred_getegid(fp->f_cred);
+	st->st_rdev = ks->ks_dev;
+	st->st_size = ks->ks_size;
+	/* zero time */
+	st->st_blksize = MAXPHYS; /* XXX arbitrary */
+	st->st_blocks = 0;
+	st->st_gen = ks->ks_gen;
+
+	return 0;
+}
+
+static int
+ksymsmmap(struct file *fp, off_t *offp, size_t nbytes, int prot, int *flagsp,
+    int *advicep, struct uvm_object **uobjp, int *maxprotp)
+{
+	const struct ksyms_snapshot *ks = fp->f_data;
+
+	/* uvm_mmap guarantees page-aligned offset and size.  */
+	KASSERT(*offp == round_page(*offp));
+	KASSERT(nbytes == round_page(nbytes));
+
+	/* Refuse negative offsets.  */
+	if (*offp < 0)
+		return EINVAL;
+
+	/* Refuse mappings that pass the end of file.  */
+	if (nbytes > round_page(ks->ks_size) ||
+	    *offp > round_page(ks->ks_size) - nbytes)
+		return EINVAL;	/* XXX ??? */
+
+	/* Success!  */
+	*advicep = UVM_ADV_SEQUENTIAL;
+	*uobjp = ks->ks_uobj;
+	*maxprotp = prot & VM_PROT_READ;
+	return 0;
+}
+
+static int
+ksymsseek(struct file *fp, off_t delta, int whence, off_t *newoffp, int flags)
+{
+	struct ksyms_snapshot *ks = fp->f_data;
+	off_t base, newoff;
+	int error;
+
+	mutex_enter(&fp->f_lock);
+
+	switch (whence) {
+	case SEEK_CUR:
+		base = fp->f_offset;
+		break;
+	case SEEK_END:
+		base = ks->ks_size;
+		break;
+	case SEEK_SET:
+		base = 0;
+		break;
+	default:
+		error = EINVAL;
+		goto out;
+	}
+
+	/* Compute the new offset and validate it.  */
+	newoff = base + delta;	/* XXX arithmetic overflow */
+	if (newoff < 0) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/* Success!  */
+	if (newoffp)
+		*newoffp = newoff;
+	if (flags & FOF_UPDATE_OFFSET)
+		fp->f_offset = newoff;
+	error = 0;
+
+out:	mutex_exit(&fp->f_lock);
+	return error;
 }
 
 __CTASSERT(offsetof(struct ksyms_ogsymbol, kg_name) == offsetof(struct ksyms_gsymbol, kg_name));
 __CTASSERT(offsetof(struct ksyms_gvalue, kv_name) == offsetof(struct ksyms_gsymbol, kg_name));
 
 static int
-ksymsioctl(dev_t dev, u_long cmd, void *data, int fflag, struct lwp *l)
+ksymsioctl(struct file *fp, u_long cmd, void *data)
 {
+	struct ksyms_snapshot *ks = fp->f_data;
 	struct ksyms_ogsymbol *okg = (struct ksyms_ogsymbol *)data;
 	struct ksyms_gsymbol *kg = (struct ksyms_gsymbol *)data;
 	struct ksyms_gvalue *kv = (struct ksyms_gvalue *)data;
@@ -1165,8 +1433,8 @@ ksymsioctl(dev_t dev, u_long cmd, void *
 	char *str = NULL;
 	int len;
 
-	/* Read ksyms_maxlen only once while not holding the lock. */
-	len = ksyms_maxlen;
+	/* Read cached ksyms_maxlen.  */
+	len = ks->ks_maxlen;
 
 	if (cmd == OKIOCGVALUE || cmd == OKIOCGSYMBOL ||
 	    cmd == KIOCGVALUE || cmd == KIOCGSYMBOL) {
@@ -1196,8 +1464,6 @@ ksymsioctl(dev_t dev, u_long cmd, void *
 		 */
 		mutex_enter(&ksyms_lock);
 		TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
-			if (st->sd_gone)
-				continue;
 			if ((sym = findsym(str, st, KSYMS_ANY)) == NULL)
 				continue;
 #ifdef notdef
@@ -1238,8 +1504,6 @@ ksymsioctl(dev_t dev, u_long cmd, void *
 		 */
 		mutex_enter(&ksyms_lock);
 		TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) {
-			if (st->sd_gone)
-				continue;
 			if ((sym = findsym(str, st, KSYMS_ANY)) == NULL)
 				continue;
 #ifdef notdef
@@ -1264,10 +1528,7 @@ ksymsioctl(dev_t dev, u_long cmd, void *
 		/*
 		 * Get total size of symbol table.
 		 */
-		mutex_enter(&ksyms_lock);
-		*(int *)data = ksyms_strsz + ksyms_symsz +
-		    sizeof(struct ksyms_hdr);
-		mutex_exit(&ksyms_lock);
+		*(int *)data = ks->ks_size;
 		break;
 
 	default:
@@ -1280,15 +1541,30 @@ ksymsioctl(dev_t dev, u_long cmd, void *
 
 const struct cdevsw ksyms_cdevsw = {
 	.d_open = ksymsopen,
-	.d_close = ksymsclose,
-	.d_read = ksymsread,
-	.d_write = ksymswrite,
-	.d_ioctl = ksymsioctl,
-	.d_stop = nullstop,
+	.d_close = noclose,
+	.d_read = noread,
+	.d_write = nowrite,
+	.d_ioctl = noioctl,
+	.d_stop = nostop,
 	.d_tty = notty,
 	.d_poll = nopoll,
 	.d_mmap = nommap,
-	.d_kqfilter = nullkqfilter,
+	.d_kqfilter = nokqfilter,
 	.d_discard = nodiscard,
 	.d_flag = D_OTHER | D_MPSAFE
 };
+
+static const struct fileops ksyms_fileops = {
+	.fo_name = "ksyms",
+	.fo_read = ksymsread,
+	.fo_write = fbadop_write,
+	.fo_ioctl = ksymsioctl,
+	.fo_fcntl = fnullop_fcntl,
+	.fo_poll = fnullop_poll,
+	.fo_stat = ksymsstat,
+	.fo_close = ksymsclose,
+	.fo_kqfilter = fnullop_kqfilter,
+	.fo_restart = fnullop_restart,
+	.fo_mmap = ksymsmmap,
+	.fo_seek = ksymsseek,
+};

Reply via email to