Author: marius
Date: Tue Feb 23 01:09:35 2016
New Revision: 295905
URL: https://svnweb.freebsd.org/changeset/base/295905

Log:
  In preparation for 10.3-RELEASE, temporarily revert the MFC of r291244
  done as part of r292895 on stable/10, as that change causes hangs with
  ZFS and its cause, at least on amd64, is not yet understood. For further
  information, see:
  https://lists.freebsd.org/pipermail/freebsd-stable/2016-February/084045.html

  PR:             207281
  Discussed with: kib
  Approved by:    re (gjb)

Modified:
  stable/10/sys/kern/vfs_subr.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/kern/vfs_subr.c
==============================================================================
--- stable/10/sys/kern/vfs_subr.c       Tue Feb 23 01:08:39 2016        (r295904)
+++ stable/10/sys/kern/vfs_subr.c       Tue Feb 23 01:09:35 2016        (r295905)
@@ -145,51 +145,24 @@ int vttoif_tab[10] = {
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 
 /*
- * "Free" vnode target.  Free vnodes are rarely completely free, but are
- * just ones that are cheap to recycle.  Usually they are for files which
- * have been stat'd but not read; these usually have inode and namecache
- * data attached to them.  This target is the preferred minimum size of a
- * sub-cache consisting mostly of such files. The system balances the size
- * of this sub-cache with its complement to try to prevent either from
- * thrashing while the other is relatively inactive.  The targets express
- * a preference for the best balance.
- *
- * "Above" this target there are 2 further targets (watermarks) related
- * to recycling of free vnodes.  In the best-operating case, the cache is
- * exactly full, the free list has size between vlowat and vhiwat above the
- * free target, and recycling from it and normal use maintains this state.
- * Sometimes the free list is below vlowat or even empty, but this state
- * is even better for immediate use provided the cache is not full.
- * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
- * ones) to reach one of these states.  The watermarks are currently hard-
- * coded as 4% and 9% of the available space higher.  These and the default
- * of 25% for wantfreevnodes are too large if the memory size is large.
- * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
- * whenever vnlru_proc() becomes active.
+ * Free vnode target.  Free vnodes may simply be files which have been stat'd
+ * but not read.  This is somewhat common, and a small cache of such files
+ * should be kept to avoid recreation costs.
  */
 static u_long wantfreevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
-    &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
+SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
+/* Number of vnodes in the free list. */
 static u_long freevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
-    &freevnodes, 0, "Number of \"free\" vnodes");
+SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
+    "Number of vnodes in the free list");
 
-/*
- * The vfs.vlru_allow_cache_src sysctl variable is no longer used but
- * the sysctl remains to provide ABI compatibility. The new code frees
- * namecache sources as the last chance to satisfy the highest watermark,
- * instead of selecting the source vnodes randomly. This provides good
- * enough behaviour to keep vn_fullpath() working in most situations.
- * The filesystem layout with deep trees, where the deprecated knob was
- * required, is thus handled automatically.
- */
 static int vlru_allow_cache_src;
 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
-    &vlru_allow_cache_src, 0, "Placeholder for API compatibility (unused)");
+    &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
 
 static u_long recycles_count;
 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
-    "Number of vnodes recycled to meet vnode cache targets");
+    "Number of vnodes recycled to avoid exceding kern.maxvnodes");
 
 /*
  * Various variables used for debugging the new implementation of
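[As a sanity check on the figure in the comment removed above: 9% of 75% of the old 8M MAXVNODES_MAX is indeed just over 566000.  A standalone userland C sketch of that arithmetic, not part of the diff:]

#include <stdio.h>

int main(void)
{
	long maxvnodes = 512L * 1024 * 1024 / 64;	/* old MAXVNODES_MAX: 8388608 */
	long cache = maxvnodes * 3 / 4;			/* "75% of MAXVNODES" */

	printf("%ld\n", cache * 9 / 100);		/* 9% of that: 566231 */
	return (0);
}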
@@ -299,13 +272,14 @@ static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
-/* Target for maximum number of vnodes. */
+/*
+ * Number of vnodes we want to exist at any one time.  This is mostly used
+ * to size hash tables in vnode-related code.  It is normally not used in
+ * getnewvnode(), as wantfreevnodes is normally nonzero.
+ *
+ * XXX desiredvnodes is historical cruft and should not exist.
+ */
 int desiredvnodes;
-static int gapvnodes;          /* gap between wanted and desired */
-static int vhiwat;             /* enough extras after expansion */
-static int vlowat;             /* minimal extras before expansion */
-static int vstir;              /* nonzero to stir non-free vnodes */
-static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */
 
 static int
 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
@@ -316,8 +290,6 @@ sysctl_update_desiredvnodes(SYSCTL_HANDL
        if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
                return (error);
        if (old_desiredvnodes != desiredvnodes) {
-               wantfreevnodes = desiredvnodes / 4;
-               /* XXX locking seems to be incomplete. */
                vfs_hash_changesize(desiredvnodes);
                cache_changesize(desiredvnodes);
        }
@@ -326,9 +298,9 @@ sysctl_update_desiredvnodes(SYSCTL_HANDL
 
 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
-    sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
+    sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
-    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
+    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
@@ -359,10 +331,10 @@ PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_tr
  *
  * Reevaluate the following cap on the number of vnodes after the physical
  * memory size exceeds 512GB.  In the limit, as the physical memory size
- * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
+ * grows, the ratio of physical pages to vnodes approaches sixteen to one.
  */
 #ifndef        MAXVNODES_MAX
-#define        MAXVNODES_MAX   (512 * 1024 * 1024 / 64)        /* 8M */
+#define        MAXVNODES_MAX   (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
 #endif
 
 /*
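[Despite the different units, 64 KB per vnode in the reverted definition versus 16 pages per vnode in the restored one, the two MAXVNODES_MAX expressions evaluate to the same cap on 4 KB pages.  A standalone sketch, assuming PAGE_SIZE is 4096; not part of the diff:]

#include <stdio.h>

#define	PAGE_SIZE	4096	/* assumption: amd64/i386 base page size */

int main(void)
{
	long reverted = 512L * 1024 * 1024 / 64;	/* 64 KB of RAM per vnode */
	long restored = 512L * (1024 * 1024 * 1024 / PAGE_SIZE / 16); /* 16 pages per vnode */

	printf("%ld %ld\n", reverted, restored);	/* both print 8388608 */
	return (0);
}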
@@ -433,16 +405,15 @@ vntblinit(void *dummy __unused)
        /*
         * Desiredvnodes is a function of the physical memory size and the
         * kernel's heap size.  Generally speaking, it scales with the
-        * physical memory size.  The ratio of desiredvnodes to the physical
-        * memory size is 1:16 until desiredvnodes exceeds 98,304.
-        * Thereafter, the
-        * marginal ratio of desiredvnodes to the physical memory size is
-        * 1:64.  However, desiredvnodes is limited by the kernel's heap
+        * physical memory size.  The ratio of desiredvnodes to physical pages
+        * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
+        * marginal ratio of desiredvnodes to physical pages is one to
+        * sixteen.  However, desiredvnodes is limited by the kernel's heap
         * size.  The memory required by desiredvnodes vnodes and vm objects
-        * must not exceed 1/7th of the kernel's heap size.
+        * may not exceed one seventh of the kernel's heap size.
         */
-       physvnodes = maxproc + pgtok(cnt.v_page_count) / 64 +
-           3 * min(98304 * 16, pgtok(cnt.v_page_count)) / 64;
+       physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
+           cnt.v_page_count) / 16;
        virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
            sizeof(struct vnode)));
        desiredvnodes = min(physvnodes, virtvnodes);
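[Likewise, the two physvnodes expressions above are arithmetically identical on 4 KB pages, where pgtok(x) is x * 4.  A standalone sketch, assuming PAGE_SIZE 4096 and a sample 4 GB page count, with the maxproc term omitted:]

#include <stdio.h>

#define	PAGE_SIZE	4096			/* assumption: amd64 base pages */
#define	pgtok(x)	((x) * (PAGE_SIZE / 1024))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	long pages = 1048576L;			/* e.g. 4 GB of RAM in 4 KB pages */
	long reverted = pgtok(pages) / 64 +
	    3 * MIN(98304L * 16, pgtok(pages)) / 64;
	long restored = pages / 16 +
	    3 * MIN(98304L * 4, pages) / 16;

	printf("%ld %ld\n", reverted, restored);	/* both print 139264 */
	return (0);
}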
@@ -831,41 +802,35 @@ vattr_null(struct vattr *vap)
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  */
 static int
-vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
+vlrureclaim(struct mount *mp)
 {
        struct vnode *vp;
-       int count, done, target;
+       int done;
+       int trigger;
+       int usevnodes;
+       int count;
 
+       /*
+        * Calculate the trigger point, don't allow user
+        * screwups to blow us up.   This prevents us from
+        * recycling vnodes with lots of resident pages.  We
+        * aren't trying to free memory, we are trying to
+        * free vnodes.
+        */
+       usevnodes = desiredvnodes;
+       if (usevnodes <= 0)
+               usevnodes = 1;
+       trigger = cnt.v_page_count * 2 / usevnodes;
        done = 0;
        vn_start_write(NULL, &mp, V_WAIT);
        MNT_ILOCK(mp);
-       count = mp->mnt_nvnodelistsize;
-       target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
-       target = target / 10 + 1;
-       while (count != 0 && done < target) {
+       count = mp->mnt_nvnodelistsize / 10 + 1;
+       while (count != 0) {
                vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
                while (vp != NULL && vp->v_type == VMARKER)
                        vp = TAILQ_NEXT(vp, v_nmntvnodes);
                if (vp == NULL)
                        break;
-               /*
-                * XXX LRU is completely broken for non-free vnodes.  First
-                * by calling here in mountpoint order, then by moving
-                * unselected vnodes to the end here, and most grossly by
-                * removing the vlruvp() function that was supposed to
-                * maintain the order.  (This function was born broken
-                * since syncer problems prevented it doing anything.)  The
-                * order is closer to LRC (C = Created).
-                *
-                * LRU reclaiming of vnodes seems to have last worked in
-                * FreeBSD-3 where LRU wasn't mentioned under any spelling.
-                * Then there was no hold count, and inactive vnodes were
-                * simply put on the free list in LRU order.  The separate
-                * lists also break LRU.  We prefer to reclaim from the
-                * free list for technical reasons.  This tends to thrash
-                * the free list to keep very unrecently used held vnodes.
-                * The problem is mitigated by keeping the free list large.
-                */
                TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
                TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
                --count;
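[With the sample figures from the previous sketch, the trigger computed at the top of the restored vlrureclaim() comes out to a handful of pages, in line with the "about 8 to 100 (pages)" estimate in a comment removed further down.  A standalone sketch, sample values assumed:]

#include <stdio.h>

int main(void)
{
	long pages = 1048576L;		/* 4 GB of 4 KB pages, as above */
	long desiredvnodes = 139264L;	/* matches the vntblinit() sketch */
	long trigger = pages * 2 / desiredvnodes;

	printf("%ld\n", trigger);	/* 15: skip vnodes with more resident pages */
	return (0);
}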
@@ -874,12 +839,10 @@ vlrureclaim(struct mount *mp, int reclai
                /*
                 * If it's been deconstructed already, it's still
                 * referenced, or it exceeds the trigger, skip it.
-                * Also skip free vnodes.  We are trying to make space
-                * to expand the free list, not reduce it.
                 */
                if (vp->v_usecount ||
-                   (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
-                   ((vp->v_iflag & VI_FREE) != 0) ||
+                   (!vlru_allow_cache_src &&
+                       !LIST_EMPTY(&(vp)->v_cache_src)) ||
                    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
                    vp->v_object->resident_page_count > trigger)) {
                        VI_UNLOCK(vp);
@@ -905,8 +868,8 @@ vlrureclaim(struct mount *mp, int reclai
                 * vnode lock before our VOP_LOCK() call fails.
                 */
                if (vp->v_usecount ||
-                   (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
-                   (vp->v_iflag & VI_FREE) != 0 ||
+                   (!vlru_allow_cache_src &&
+                       !LIST_EMPTY(&(vp)->v_cache_src)) ||
                    (vp->v_object != NULL &&
                    vp->v_object->resident_page_count > trigger)) {
                        VOP_UNLOCK(vp, LK_INTERLOCK);
@@ -939,7 +902,7 @@ relock_mnt:
 }
 
 /*
- * Attempt to reduce the free list by the requested amount.
+ * Attempt to keep the free list at wantfreevnodes length.
  */
 static void
 vnlru_free(int count)
@@ -996,24 +959,6 @@ vnlru_free(int count)
                mtx_lock(&vnode_free_list_mtx);
        }
 }
-
-/* XXX some names and initialization are bad for limits and watermarks. */
-static int
-vspace(void)
-{
-       int space;
-
-       gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
-       vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
-       vlowat = vhiwat / 2;
-       if (numvnodes > desiredvnodes)
-               return (0);
-       space = desiredvnodes - numvnodes;
-       if (freevnodes > wantfreevnodes)
-               space += freevnodes - wantfreevnodes;
-       return (space);
-}
-
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrurecycle() from the bowels of filesystem code has some
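[For reference, the watermarks computed by the vspace() helper removed above sit close to the 4% and 9% mentioned in the comment deleted from the head of the file.  A standalone sketch of that arithmetic, sample values assumed:]

#include <stdio.h>

#define	imax(a, b)	((a) > (b) ? (a) : (b))

int main(void)
{
	int desiredvnodes = 139264;		/* sample value, as above */
	int wantfreevnodes = desiredvnodes / 4;	/* default: 25% */
	int gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
	int vhiwat = gapvnodes / 11;		/* ~9%, just under vlrureclaim()'s 10% */
	int vlowat = vhiwat / 2;		/* ~4.5% */

	printf("%d %d %d\n", gapvnodes, vhiwat, vlowat);	/* 104448 9495 4747 */
	return (0);
}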
@@ -1026,36 +971,18 @@ static void
 vnlru_proc(void)
 {
        struct mount *mp, *nmp;
-       unsigned long ofreevnodes, onumvnodes;
-       int done, force, reclaim_nc_src, trigger, usevnodes;
+       int done;
+       struct proc *p = vnlruproc;
 
-       EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
+       EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
            SHUTDOWN_PRI_FIRST);
 
-       force = 0;
        for (;;) {
-               kproc_suspend_check(vnlruproc);
+               kproc_suspend_check(p);
                mtx_lock(&vnode_free_list_mtx);
-               /*
-                * If numvnodes is too large (due to desiredvnodes being
-                * adjusted using its sysctl, or emergency growth), first
-                * try to reduce it by discarding from the free list.
-                */
-               if (numvnodes > desiredvnodes && freevnodes > 0)
-                       vnlru_free(ulmin(numvnodes - desiredvnodes,
-                           freevnodes));
-               /*
-                * Sleep if the vnode cache is in a good state.  This is
-                * when it is not over-full and has space for about a 4%
-                * or 9% expansion (by growing its size or inexcessively
-                * reducing its free list).  Otherwise, try to reclaim
-                * space for a 10% expansion.
-                */
-               if (vstir && force == 0) {
-                       force = 1;
-                       vstir = 0;
-               }
-               if (vspace() >= vlowat && force == 0) {
+               if (freevnodes > wantfreevnodes)
+                       vnlru_free(freevnodes - wantfreevnodes);
+               if (numvnodes <= desiredvnodes * 9 / 10) {
                        vnlruproc_sig = 0;
                        wakeup(&vnlruproc_sig);
                        msleep(vnlruproc, &vnode_free_list_mtx,
@@ -1064,66 +991,30 @@ vnlru_proc(void)
                }
                mtx_unlock(&vnode_free_list_mtx);
                done = 0;
-               ofreevnodes = freevnodes;
-               onumvnodes = numvnodes;
-               /*
-                * Calculate parameters for recycling.  These are the same
-                * throughout the loop to give some semblance of fairness.
-                * The trigger point is to avoid recycling vnodes with lots
-                * of resident pages.  We aren't trying to free memory; we
-                * are trying to recycle or at least free vnodes.
-                */
-               if (numvnodes <= desiredvnodes)
-                       usevnodes = numvnodes - freevnodes;
-               else
-                       usevnodes = numvnodes;
-               if (usevnodes <= 0)
-                       usevnodes = 1;
-               /*
-               * The trigger value is chosen to give a conservatively
-                * large value to ensure that it alone doesn't prevent
-                * making progress.  The value can easily be so large that
-                * it is effectively infinite in some congested and
-                * misconfigured cases, and this is necessary.  Normally
-                * it is about 8 to 100 (pages), which is quite large.
-                */
-               trigger = cnt.v_page_count * 2 / usevnodes;
-               if (force < 2)
-                       trigger = vsmalltrigger;
-               reclaim_nc_src = force >= 3;
                mtx_lock(&mountlist_mtx);
                for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
                        if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
                                nmp = TAILQ_NEXT(mp, mnt_list);
                                continue;
                        }
-                       done += vlrureclaim(mp, reclaim_nc_src, trigger);
+                       done += vlrureclaim(mp);
                        mtx_lock(&mountlist_mtx);
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        vfs_unbusy(mp);
                }
                mtx_unlock(&mountlist_mtx);
-               if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
-                       uma_reclaim();
                if (done == 0) {
-                       if (force == 0 || force == 1) {
-                               force = 2;
-                               continue;
-                       }
-                       if (force == 2) {
-                               force = 3;
-                               continue;
-                       }
-                       force = 0;
+#if 0
+                       /* These messages are temporary debugging aids */
+                       if (vnlru_nowhere < 5)
+                               printf("vnlru process getting nowhere..\n");
+                       else if (vnlru_nowhere == 5)
+                               printf("vnlru process messages stopped.\n");
+#endif
                        vnlru_nowhere++;
                        tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
                } else
                        kern_yield(PRI_USER);
-               /*
-                * After becoming active to expand above low water, keep
-                * active until above high water.
-                */
-               force = vspace() < vhiwat;
        }
 }
 
@@ -1197,31 +1088,22 @@ vtryrecycle(struct vnode *vp)
        return (0);
 }
 
-static void
-vcheckspace(void)
-{
-
-       if (vspace() < vlowat && vnlruproc_sig == 0) {
-               vnlruproc_sig = 1;
-               wakeup(vnlruproc);
-       }
-}
-
 /*
- * Wait if necessary for space for a new vnode.
+ * Wait for available vnodes.
  */
 static int
 getnewvnode_wait(int suspended)
 {
 
        mtx_assert(&vnode_free_list_mtx, MA_OWNED);
-       if (numvnodes >= desiredvnodes) {
+       if (numvnodes > desiredvnodes) {
                if (suspended) {
                        /*
-                        * The file system is being suspended.  We cannot
-                        * risk a deadlock here, so allow allocation of
-                        * another vnode even if this would give too many.
+                        * File system is being suspended, we cannot risk a
+                        * deadlock here, so allocate new vnode anyway.
                         */
+                       if (freevnodes > wantfreevnodes)
+                               vnlru_free(freevnodes - wantfreevnodes);
                        return (0);
                }
                if (vnlruproc_sig == 0) {
@@ -1231,34 +1113,18 @@ getnewvnode_wait(int suspended)
                msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
                    "vlruwk", hz);
        }
-       /* Post-adjust like the pre-adjust in getnewvnode(). */
-       if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
-               vnlru_free(1);
-       return (numvnodes >= desiredvnodes ? ENFILE : 0);
+       return (numvnodes > desiredvnodes ? ENFILE : 0);
 }
 
-/*
- * This hack is fragile, and probably not needed any more now that the
- * watermark handling works.
- */
 void
 getnewvnode_reserve(u_int count)
 {
        struct thread *td;
 
-       /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
-       /* XXX no longer so quick, but this part is not racy. */
-       mtx_lock(&vnode_free_list_mtx);
-       if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
-               vnlru_free(ulmin(numvnodes + count - desiredvnodes,
-                   freevnodes - wantfreevnodes));
-       mtx_unlock(&vnode_free_list_mtx);
-
        td = curthread;
        /* First try to be quick and racy. */
        if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
                td->td_vp_reserv += count;
-               vcheckspace();  /* XXX no longer so quick, but more racy */
                return;
        } else
                atomic_subtract_long(&numvnodes, count);
@@ -1271,18 +1137,9 @@ getnewvnode_reserve(u_int count)
                        atomic_add_long(&numvnodes, 1);
                }
        }
-       vcheckspace();
        mtx_unlock(&vnode_free_list_mtx);
 }
 
-/*
- * This hack is fragile, especially if desiredvnodes or wantvnodes are
- * misconfigured or changed significantly.  Reducing desiredvnodes below
- * the reserved amount should cause bizarre behaviour like reducing it
- * below the number of active vnodes -- the system will try to reduce
- * numvnodes to match, but should fail, so the subtraction below should
- * not overflow.
- */
 void
 getnewvnode_drop_reserve(void)
 {
@@ -1303,7 +1160,6 @@ getnewvnode(const char *tag, struct moun
        struct vnode *vp;
        struct thread *td;
        struct lock_object *lo;
-       static int cyclecount;
        int error;
 
        CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
@@ -1314,37 +1170,19 @@ getnewvnode(const char *tag, struct moun
                goto alloc;
        }
        mtx_lock(&vnode_free_list_mtx);
-       if (numvnodes < desiredvnodes)
-               cyclecount = 0;
-       else if (cyclecount++ >= freevnodes) {
-               cyclecount = 0;
-               vstir = 1;
-       }
-       /*
-        * Grow the vnode cache if it will not be above its target max
-        * after growing.  Otherwise, if the free list is nonempty, try
-        * to reclaim 1 item from it before growing the cache (possibly
-        * above its target max if the reclamation failed or is delayed).
-        * Otherwise, wait for some space.  In all cases, schedule
-        * vnlru_proc() if we are getting short of space.  The watermarks
-        * should be chosen so that we never wait or even reclaim from
-        * the free list to below its target minimum.
-        */
-       if (numvnodes + 1 <= desiredvnodes)
-               ;
-       else if (freevnodes > 0)
+       /*
+        * Lend our context to reclaim vnodes if they've exceeded the max.
+        */
+       if (freevnodes > wantfreevnodes)
                vnlru_free(1);
-       else {
-               error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
-                   MNTK_SUSPEND));
+       error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
+           MNTK_SUSPEND));
 #if 0  /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
-               if (error != 0) {
-                       mtx_unlock(&vnode_free_list_mtx);
-                       return (error);
-               }
-#endif
+       if (error != 0) {
+               mtx_unlock(&vnode_free_list_mtx);
+               return (error);
        }
-       vcheckspace();
+#endif
        atomic_add_long(&numvnodes, 1);
        mtx_unlock(&vnode_free_list_mtx);
 alloc: