> Date: Tue, 30 Sep 2014 23:12:10 +0200 (CEST)
> From: Mark Kettenis <[email protected]>
>
> The diff below intorduces a new flag for mmap(2) that creates mappings
> that cannot fault. Normally, if you mmap a file, and your mapping is
> larger than the mapped file, memory access to full pages beyond the
> end of the file will fault. Depending on the OS you will get a
> SIGSEGV or SIGBUS and if you don't catch those, you die. This is
> especially nasty if you use file descriptor passing to share the file
> descriptor with some other process and this other proces ftruncates
> the file without telling you.
>
> The new xserver that matthieu@ just imported has the new xshm
> extension which mmaps file descriptors passed by clients through file
> descriptor passing. To protect itself from being trivially DOSed by a
> malicious (or careless) client, it keeps a list of mappings and
> installs a SIGBUS signal handler that checks whether the fault address
> matches any of these mappings. In that case it mmaps a private
> anonymous page on top of the faulting address and returns. Since
> OpenBSD generates SIGSEGV instead of SIGBUS in this case, this doesn't
> work for us, so I made sure matthieu@ disabled this functionality for
> now. But the new xshm extension would actually be a nice thing to
> have as it circumvents certain problems with the traditional xshm
> extension that we have because of privsep. And file descriptor
> passing is also being used for DRI3 which we may want to support one
> day. Oh, and Wayland, which some people claim will replace X any day
> now, heavily uses mapping file descriptors passed over sockets as
> well.
>
> We could of course change the xserver code to also trap SIGSEGV. But
> this workaround is rather ugly. So my idea is to make X use this new
> flag and disable the stupid busfault code.
>
> The diff is remarkably simple. We already have the infrastructure in
> place to replace mapped pages with anons to support MAP_PRIVATE and
> copy-on-write. This diff simply leverages that infrastructure to
> replace a page that can't be read from the underlying object by an
> anonymous pages. Some open issues:
>
> * I need to check whether all combinations of flag actually make
> sense. Should we only support __MAP_NOFAULT with non-anonymous
> mappings?
>
> * Should we only fixup the fault for access beyond the end of the
> mapped object (VM_PAGER_BAD) and still fault for actual IO erors
> (VM_PAGER_ERROR)?
>
> * Should the flag be exported without the leading underscores since
> we actually want to encourage its use?
>
> Thoughts?
Even though this diff has been committed, I'm still interested in what
people think about the issues above.
> Index: sys/mman.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mman.h,v
> retrieving revision 1.26
> diff -u -p -r1.26 mman.h
> --- sys/mman.h 10 Jul 2014 19:00:23 -0000 1.26
> +++ sys/mman.h 30 Sep 2014 20:34:42 -0000
> @@ -58,8 +58,9 @@
> #define __MAP_NOREPLACE 0x0800 /* fail if address not available */
> #define MAP_ANON 0x1000 /* allocated from memory, swap space */
> #define MAP_ANONYMOUS MAP_ANON /* alternate POSIX spelling */
> +#define __MAP_NOFAULT 0x2000
>
> -#define MAP_FLAGMASK 0x1ff7
> +#define MAP_FLAGMASK 0x3ff7
>
> #ifdef _KERNEL
> /*
> Index: uvm/uvm.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm.h,v
> retrieving revision 1.56
> diff -u -p -r1.56 uvm.h
> --- uvm/uvm.h 11 Jul 2014 16:35:40 -0000 1.56
> +++ uvm/uvm.h 30 Sep 2014 21:03:43 -0000
> @@ -90,7 +90,8 @@ struct uvm {
> #define UVM_ET_SUBMAP 0x02 /* it is a vm_map submap */
> #define UVM_ET_COPYONWRITE 0x04 /* copy_on_write */
> #define UVM_ET_NEEDSCOPY 0x08 /* needs_copy */
> -#define UVM_ET_HOLE 0x10 /* no backend */
> +#define UVM_ET_HOLE 0x10 /* no backend */
> +#define UVM_ET_NOFAULT 0x20 /* don't fault */
> #define UVM_ET_FREEMAPPED 0x80 /* map entry is on free list (DEBUG) */
>
> #define UVM_ET_ISOBJ(E) (((E)->etype & UVM_ET_OBJ) != 0)
> @@ -98,6 +99,7 @@ struct uvm {
> #define UVM_ET_ISCOPYONWRITE(E) (((E)->etype & UVM_ET_COPYONWRITE) != 0)
> #define UVM_ET_ISNEEDSCOPY(E) (((E)->etype & UVM_ET_NEEDSCOPY) != 0)
> #define UVM_ET_ISHOLE(E) (((E)->etype & UVM_ET_HOLE) != 0)
> +#define UVM_ET_ISNOFAULT(E) (((E)->etype & UVM_ET_NOFAULT) != 0)
>
> #ifdef _KERNEL
>
> Index: uvm/uvm_extern.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
> retrieving revision 1.119
> diff -u -p -r1.119 uvm_extern.h
> --- uvm/uvm_extern.h 11 Jul 2014 16:35:40 -0000 1.119
> +++ uvm/uvm_extern.h 30 Sep 2014 20:08:36 -0000
> @@ -148,14 +148,15 @@ typedef int vm_prot_t;
> #define UVM_ADV_MASK 0x7 /* mask */
>
> /* mapping flags */
> -#define UVM_FLAG_FIXED 0x010000 /* find space */
> -#define UVM_FLAG_OVERLAY 0x020000 /* establish overlay */
> -#define UVM_FLAG_NOMERGE 0x040000 /* don't merge map entries */
> -#define UVM_FLAG_COPYONW 0x080000 /* set copy_on_write flag */
> -#define UVM_FLAG_AMAPPAD 0x100000 /* for bss: pad amap to reduce malloc() */
> -#define UVM_FLAG_TRYLOCK 0x200000 /* fail if we can not lock map */
> -#define UVM_FLAG_HOLE 0x400000 /* no backend */
> -#define UVM_FLAG_QUERY 0x800000 /* do everything, except actual execution
> */
> +#define UVM_FLAG_FIXED 0x0010000 /* find space */
> +#define UVM_FLAG_OVERLAY 0x0020000 /* establish overlay */
> +#define UVM_FLAG_NOMERGE 0x0040000 /* don't merge map entries */
> +#define UVM_FLAG_COPYONW 0x0080000 /* set copy_on_write flag */
> +#define UVM_FLAG_AMAPPAD 0x0100000 /* for bss: pad amap to reduce malloc() */
> +#define UVM_FLAG_TRYLOCK 0x0200000 /* fail if we can not lock map */
> +#define UVM_FLAG_HOLE 0x0400000 /* no backend */
> +#define UVM_FLAG_QUERY 0x0800000 /* do everything, except actual execution
> */
> +#define UVM_FLAG_NOFAULT 0x1000000 /* don't fault */
>
> /* macros to extract info */
> #define UVM_PROTECTION(X) ((X) & UVM_PROT_MASK)
> Index: uvm/uvm_fault.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> retrieving revision 1.77
> diff -u -p -r1.77 uvm_fault.c
> --- uvm/uvm_fault.c 7 Sep 2014 08:17:44 -0000 1.77
> +++ uvm/uvm_fault.c 30 Sep 2014 19:44:28 -0000
> @@ -1114,7 +1114,11 @@ Case2:
> goto ReFault;
> }
>
> - return (EACCES); /* XXX i/o error */
> + if (!UVM_ET_ISNOFAULT(ufi.entry))
> + return (EACCES); /* XXX i/o error */
> +
> + uobjpage = PGO_DONTCARE;
> + promote = TRUE;
> }
>
> /* re-verify the state of the world. */
> @@ -1132,7 +1136,7 @@ Case2:
> }
>
> /* didn't get the lock? release the page and retry. */
> - if (locked == FALSE) {
> + if (locked == FALSE && uobjpage != PGO_DONTCARE) {
> uvm_lock_pageq();
> /* make sure it is in queues */
> uvm_pageactivate(uobjpage);
> Index: uvm/uvm_map.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> retrieving revision 1.175
> diff -u -p -r1.175 uvm_map.c
> --- uvm/uvm_map.c 14 Aug 2014 17:21:38 -0000 1.175
> +++ uvm/uvm_map.c 30 Sep 2014 20:07:53 -0000
> @@ -1142,6 +1142,8 @@ uvm_map(struct vm_map *map, vaddr_t *add
> entry->etype |= UVM_ET_OBJ;
> else if (flags & UVM_FLAG_HOLE)
> entry->etype |= UVM_ET_HOLE;
> + if (flags & UVM_FLAG_NOFAULT)
> + entry->etype |= UVM_ET_NOFAULT;
> if (flags & UVM_FLAG_COPYONW) {
> entry->etype |= UVM_ET_COPYONWRITE;
> if ((flags & UVM_FLAG_OVERLAY) == 0)
> Index: uvm/uvm_mmap.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> retrieving revision 1.98
> diff -u -p -r1.98 uvm_mmap.c
> --- uvm/uvm_mmap.c 12 Jul 2014 18:44:01 -0000 1.98
> +++ uvm/uvm_mmap.c 30 Sep 2014 19:58:55 -0000
> @@ -1004,6 +1004,8 @@ uvm_mmap(vm_map_t map, vaddr_t *addr, vs
>
> if ((flags & MAP_SHARED) == 0)
> uvmflag |= UVM_FLAG_COPYONW;
> + if (flags & __MAP_NOFAULT)
> + uvmflag |= (UVM_FLAG_NOFAULT | UVM_FLAG_OVERLAY);
> }
>
> /* set up mapping flags */
>
>