Author: kib
Date: Wed Sep 11 06:41:15 2013
New Revision: 255467
URL: http://svnweb.freebsd.org/changeset/base/255467

Log:
  Implement sendfile(2) for POSIX shared memory segment file descriptors,
  in addition to regular files.
  
  Requested by: alc
  Discussed with:       emaste
  Tested by:    pho (previous version)
  Sponsored by: The FreeBSD Foundation
  Approved by:  re (hrs)

Modified:
  head/sys/kern/uipc_shm.c
  head/sys/kern/uipc_syscalls.c

Modified: head/sys/kern/uipc_shm.c
==============================================================================
--- head/sys/kern/uipc_shm.c    Wed Sep 11 06:16:12 2013        (r255466)
+++ head/sys/kern/uipc_shm.c    Wed Sep 11 06:41:15 2013        (r255467)
@@ -134,7 +134,7 @@ static struct fileops shm_ops = {
        .fo_close = shm_close,
        .fo_chmod = shm_chmod,
        .fo_chown = shm_chown,
-       .fo_sendfile = invfo_sendfile,
+       .fo_sendfile = vn_sendfile,
        .fo_seek = shm_seek,
        .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };

Modified: head/sys/kern/uipc_syscalls.c
==============================================================================
--- head/sys/kern/uipc_syscalls.c       Wed Sep 11 06:16:12 2013        (r255466)
+++ head/sys/kern/uipc_syscalls.c       Wed Sep 11 06:41:15 2013        (r255467)
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capability.h>
+#include <sys/condvar.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
@@ -57,6 +58,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
+#include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
@@ -86,7 +88,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
-#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
@@ -1850,8 +1852,6 @@ getsockaddr(namp, uaddr, len)
        return (error);
 }
 
-#include <sys/condvar.h>
-
 struct sendfile_sync {
        struct mtx      mtx;
        struct cv       cv;
@@ -1917,6 +1917,10 @@ do_sendfile(struct thread *td, struct se
        cap_rights_t rights;
        int error;
 
+       /*
+        * File offset must be non-negative.  If it goes beyond EOF,
+        * we send only the header/trailer and no payload data.
+        */
        if (uap->offset < 0)
                return (EINVAL);
 
@@ -1978,79 +1982,240 @@ freebsd4_sendfile(struct thread *td, str
 }
 #endif /* COMPAT_FREEBSD4 */
 
-int
-vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
-    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
-    int kflags, struct thread *td)
+static int
+sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
+    off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
 {
-       struct vnode *vp = fp->f_vnode;
-       struct file *sock_fp;
-       struct vm_object *obj = NULL;
-       struct socket *so = NULL;
-       struct mbuf *m = NULL;
-       struct sf_buf *sf;
-       struct vm_page *pg;
-       struct vattr va;
-       struct sendfile_sync *sfs = NULL;
-       cap_rights_t rights;
-       off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
-       int bsize, error, hdrlen = 0, mnw = 0;
+       vm_page_t m;
+       vm_pindex_t pindex;
+       ssize_t resid;
+       int error, readahead, rv;
+
+       pindex = OFF_TO_IDX(off);
+       VM_OBJECT_WLOCK(obj);
+       m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
+           VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
 
-       vn_lock(vp, LK_SHARED | LK_RETRY);
-       if (vp->v_type == VREG) {
-               bsize = vp->v_mount->mnt_stat.f_iosize;
-               if (nbytes == 0) {
-                       error = VOP_GETATTR(vp, &va, td->td_ucred);
-                       if (error != 0) {
-                               VOP_UNLOCK(vp, 0);
-                               obj = NULL;
-                               goto out;
+       /*
+        * Check if page is valid for what we need, otherwise initiate I/O.
+        *
+        * A non-zero nd argument prevents disk I/O; instead we
+        * return to the caller the value specified in nd.  In particular,
+        * if we already turned some pages into mbufs, nd == EAGAIN
+        * and the main function sends those pages off before we come
+        * here again and block.
+        */
+       if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
+               if (vp == NULL)
+                       vm_page_xunbusy(m);
+               VM_OBJECT_WUNLOCK(obj);
+               *res = m;
+               return (0);
+       } else if (nd != 0) {
+               if (vp == NULL)
+                       vm_page_xunbusy(m);
+               error = nd;
+               goto free_page;
+       }
+
+       /*
+        * Get the page from backing store.
+        */
+       error = 0;
+       if (vp != NULL) {
+               VM_OBJECT_WUNLOCK(obj);
+               readahead = sfreadahead * MAXBSIZE;
+
+               /*
+                * Use vn_rdwr() instead of the pager interface for
+                * the vnode, to allow the read-ahead.
+                *
+                * XXXMAC: Because we don't have fp->f_cred here, we
+                * pass in NOCRED.  This is probably wrong, but is
+                * consistent with our original implementation.
+                */
+               error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
+                   UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
+                   bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
+               SFSTAT_INC(sf_iocnt);
+               VM_OBJECT_WLOCK(obj);
+       } else {
+               if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
+                       rv = vm_pager_get_pages(obj, &m, 1, 0);
+                       SFSTAT_INC(sf_iocnt);
+                       m = vm_page_lookup(obj, pindex);
+                       if (m == NULL)
+                               error = EIO;
+                       else if (rv != VM_PAGER_OK) {
+                               vm_page_lock(m);
+                               vm_page_free(m);
+                               vm_page_unlock(m);
+                               m = NULL;
+                               error = EIO;
                        }
-                       rem = va.va_size;
-               } else
-                       rem = nbytes;
+               } else {
+                       pmap_zero_page(m);
+                       m->valid = VM_PAGE_BITS_ALL;
+                       m->dirty = 0;
+               }
+               if (m != NULL)
+                       vm_page_xunbusy(m);
+       }
+       if (error == 0) {
+               *res = m;
+       } else if (m != NULL) {
+free_page:
+               vm_page_lock(m);
+               vm_page_unwire(m, 0);
+
+               /*
+                * See if anyone else might know about this page.  If
+                * not and it is not valid, then free it.
+                */
+               if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
+                       vm_page_free(m);
+               vm_page_unlock(m);
+       }
+       VM_OBJECT_WUNLOCK(obj);
+       KASSERT(error != 0 || (m->wire_count > 0 && m->valid ==
+           VM_PAGE_BITS_ALL),
+           ("wrong page state m %p", m));
+       return (error);
+}
+
+static int
+sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
+    struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
+    int *bsize)
+{
+       struct vattr va;
+       vm_object_t obj;
+       struct vnode *vp;
+       struct shmfd *shmfd;
+       int error;
+
+       vp = *vp_res = NULL;
+       obj = NULL;
+       shmfd = *shmfd_res = NULL;
+       *bsize = 0;
+
+       /*
+        * The file descriptor must be a regular file or a shared memory
+        * segment and have a backing VM object.
+        */
+       if (fp->f_type == DTYPE_VNODE) {
+               vp = fp->f_vnode;
+               vn_lock(vp, LK_SHARED | LK_RETRY);
+               if (vp->v_type != VREG) {
+                       error = EINVAL;
+                       goto out;
+               }
+               *bsize = vp->v_mount->mnt_stat.f_iosize;
+               error = VOP_GETATTR(vp, &va, td->td_ucred);
+               if (error != 0)
+                       goto out;
+               *obj_size = va.va_size;
                obj = vp->v_object;
-               if (obj != NULL) {
-                       /*
-                        * Temporarily increase the backing VM
-                        * object's reference count so that a forced
-                        * reclamation of its vnode does not
-                        * immediately destroy it.
-                        */
-                       VM_OBJECT_WLOCK(obj);
-                       if ((obj->flags & OBJ_DEAD) == 0) {
-                               vm_object_reference_locked(obj);
-                               VM_OBJECT_WUNLOCK(obj);
-                       } else {
-                               VM_OBJECT_WUNLOCK(obj);
-                               obj = NULL;
-                       }
+               if (obj == NULL) {
+                       error = EINVAL;
+                       goto out;
                }
-       } else
-               bsize = 0;      /* silence gcc */
-       VOP_UNLOCK(vp, 0);
-       if (obj == NULL) {
+       } else if (fp->f_type == DTYPE_SHM) {
+               shmfd = fp->f_data;
+               obj = shmfd->shm_object;
+               *obj_size = shmfd->shm_size;
+       } else {
                error = EINVAL;
                goto out;
        }
 
+       VM_OBJECT_WLOCK(obj);
+       if ((obj->flags & OBJ_DEAD) != 0) {
+               VM_OBJECT_WUNLOCK(obj);
+               error = EBADF;
+               goto out;
+       }
+
+       /*
+        * Temporarily increase the backing VM object's reference
+        * count so that a forced reclamation of its vnode does not
+        * immediately destroy it.
+        */
+       vm_object_reference_locked(obj);
+       VM_OBJECT_WUNLOCK(obj);
+       *obj_res = obj;
+       *vp_res = vp;
+       *shmfd_res = shmfd;
+
+out:
+       if (vp != NULL)
+               VOP_UNLOCK(vp, 0);
+       return (error);
+}
+
+static int
+kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
+    struct socket **so)
+{
+       cap_rights_t rights;
+       int error;
+
+       *sock_fp = NULL;
+       *so = NULL;
+
        /*
         * The socket must be a stream socket and connected.
-        * Remember if it a blocking or non-blocking socket.
         */
-       error = getsock_cap(td->td_proc->p_fd, sockfd,
-           cap_rights_init(&rights, CAP_SEND), &sock_fp, NULL);
+       error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights,
+           CAP_SEND), sock_fp, NULL);
+       if (error != 0)
+               return (error);
+       *so = (*sock_fp)->f_data;
+       if ((*so)->so_type != SOCK_STREAM)
+               return (EINVAL);
+       if (((*so)->so_state & SS_ISCONNECTED) == 0)
+               return (ENOTCONN);
+       return (0);
+}
+
+int
+vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+    int kflags, struct thread *td)
+{
+       struct file *sock_fp;
+       struct vnode *vp;
+       struct vm_object *obj;
+       struct socket *so;
+       struct mbuf *m;
+       struct sf_buf *sf;
+       struct vm_page *pg;
+       struct shmfd *shmfd;
+       struct sendfile_sync *sfs;
+       struct vattr va;
+       off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
+       int error, bsize, nd, hdrlen, mnw;
+       bool inflight_called;
+
+       obj = NULL;
+       so = NULL;
+       m = NULL;
+       sfs = NULL;
+       fsbytes = sbytes = 0;
+       hdrlen = mnw = 0;
+       rem = nbytes;
+       inflight_called = false;
+
+       error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
+       if (error != 0)
+               return (error);
+       if (rem == 0)
+               rem = obj_size;
+
+       error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
        if (error != 0)
                goto out;
-       so = sock_fp->f_data;
-       if (so->so_type != SOCK_STREAM) {
-               error = EINVAL;
-               goto out;
-       }
-       if ((so->so_state & SS_ISCONNECTED) == 0) {
-               error = ENOTCONN;
-               goto out;
-       }
+
        /*
         * Do not wait on memory allocations but return ENOMEM for
         * caller to retry later.
@@ -2123,7 +2288,7 @@ vn_sendfile(struct file *fp, int sockfd,
                int done;
 
                if ((nbytes != 0 && nbytes == fsbytes) ||
-                   (nbytes == 0 && va.va_size == fsbytes))
+                   (nbytes == 0 && obj_size == fsbytes))
                        break;
 
                mtail = NULL;
@@ -2197,13 +2362,16 @@ retry_space:
                 */
                space -= hdrlen;
 
-               error = vn_lock(vp, LK_SHARED);
-               if (error != 0)
-                       goto done;
-               error = VOP_GETATTR(vp, &va, td->td_ucred);
-               if (error != 0 || off >= va.va_size) {
-                       VOP_UNLOCK(vp, 0);
-                       goto done;
+               if (vp != NULL) {
+                       error = vn_lock(vp, LK_SHARED);
+                       if (error != 0)
+                               goto done;
+                       error = VOP_GETATTR(vp, &va, td->td_ucred);
+                       if (error != 0 || off >= va.va_size) {
+                               VOP_UNLOCK(vp, 0);
+                               goto done;
+                       }
+                       obj_size = va.va_size;
                }
 
                /*
@@ -2211,7 +2379,6 @@ retry_space:
                 * dumped into socket buffer.
                 */
                while (space > loopbytes) {
-                       vm_pindex_t pindex;
                        vm_offset_t pgoff;
                        struct mbuf *m0;
 
@@ -2221,7 +2388,7 @@ retry_space:
                         * or the passed in nbytes.
                         */
                        pgoff = (vm_offset_t)(off & PAGE_MASK);
-                       rem = va.va_size - offset;
+                       rem = obj_size - offset;
                        if (nbytes != 0)
                                rem = omin(rem, nbytes);
                        rem -= fsbytes + loopbytes;
@@ -2236,59 +2403,15 @@ retry_space:
                         * Attempt to look up the page.  Allocate
                         * if not found or wait and loop if busy.
                         */
-                       pindex = OFF_TO_IDX(off);
-                       VM_OBJECT_WLOCK(obj);
-                       pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
-                           VM_ALLOC_IGN_SBUSY | VM_ALLOC_NORMAL |
-                           VM_ALLOC_WIRED);
-
-                       /*
-                        * Check if page is valid for what we need,
-                        * otherwise initiate I/O.
-                        * If we already turned some pages into mbufs,
-                        * send them off before we come here again and
-                        * block.
-                        */
-                       if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
-                               VM_OBJECT_WUNLOCK(obj);
-                       else if (m != NULL)
-                               error = EAGAIN; /* send what we already got */
-                       else if (flags & SF_NODISKIO)
-                               error = EBUSY;
-                       else {
-                               ssize_t resid;
-                               int readahead = sfreadahead * MAXBSIZE;
-
-                               VM_OBJECT_WUNLOCK(obj);
-
-                               /*
-                                * Get the page from backing store.
-                                * XXXMAC: Because we don't have fp->f_cred
-                                * here, we pass in NOCRED.  This is probably
-                                * wrong, but is consistent with our original
-                                * implementation.
-                                */
-                               error = vn_rdwr(UIO_READ, vp, NULL, readahead,
-                                   trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
-                                   IO_VMIO | ((readahead / bsize) << IO_SEQSHIFT),
-                                   td->td_ucred, NOCRED, &resid, td);
-                               SFSTAT_INC(sf_iocnt);
-                               if (error != 0)
-                                       VM_OBJECT_WLOCK(obj);
-                       }
+                       if (m != NULL)
+                               nd = EAGAIN; /* send what we already got */
+                       else if ((flags & SF_NODISKIO) != 0)
+                               nd = EBUSY;
+                       else
+                               nd = 0;
+                       error = sendfile_readpage(obj, vp, nd, off,
+                           xfsize, bsize, td, &pg);
                        if (error != 0) {
-                               vm_page_lock(pg);
-                               vm_page_unwire(pg, 0);
-                               /*
-                                * See if anyone else might know about
-                                * this page.  If not and it is not valid,
-                                * then free it.
-                                */
-                               if (pg->wire_count == 0 && pg->valid == 0 &&
-                                   !vm_page_busied(pg))
-                                       vm_page_free(pg);
-                               vm_page_unlock(pg);
-                               VM_OBJECT_WUNLOCK(obj);
                                if (error == EAGAIN)
                                        error = 0;      /* not a real error */
                                break;
@@ -2358,7 +2481,8 @@ retry_space:
                        }
                }
 
-               VOP_UNLOCK(vp, 0);
+               if (vp != NULL)
+                       VOP_UNLOCK(vp, 0);
 
                /* Add the buffer chain to the socket buffer. */
                if (m != NULL) {
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to