On Fri, 10 Jun 2011, Jeff Roberson wrote:

Author: jeff
Date: Fri Jun 10 22:48:35 2011
New Revision: 222958
URL: http://svn.freebsd.org/changeset/base/222958

Log:
 Implement fully asynchronous partial truncation with softupdates journaling
 to resolve errors which can cause corruption on recovery with the old
 synchronous mechanism.


This diff is enormous and took months of work. I'm sorry to get it in so close to 9.0; I had no idea it would take so long. pho has tested multiple versions of the patch, with and without journaling, for days of test time, and it has probably racked up a week of machine time for me, but there may still be problems given that it is so huge.

There is still a snapshot problem with SUJ that mckusick and I are working on. Expect to see some checkins for that soon.

Thanks,
Jeff


  - Append partial truncation freework structures to indirdeps while
    truncation is proceeding.  These prevent new block pointers from
    becoming valid until truncation completes and serialize truncations.
  - On completion of a partial truncate, journal work waits for zeroed
    pointers to hit indirects.
  - softdep_journal_freeblocks() handles last frag allocation and last
    block zeroing.
  - vtruncbuf/ffs_pages_remove moved into softdep_*_freeblocks() so it
    is only implemented in one place.
  - Block allocation failure handling moved up one level so it does not
    proceed with buf locks held.  This permits us to do more extensive
    reclaims when filesystem space is exhausted.
  - softdep_sync_metadata() is broken into two parts.  The first executes
    once at the start of ffs_syncvnode() and flushes truncations and
    inode dependencies.  The second is called on each locked buf.  This
    eliminates excessive looping and rollbacks.
  - Improve the mechanism in process_worklist_item() that handles
    acquiring vnode locks for handle_workitem_remove() so that it works
    more generally and does not loop excessively over the same worklist
    items on each call.
  - Don't corrupt directories by zeroing the tail in fsck.  Tail zeroing
    is now only done for regular files.
  - Push an fsync complete record for files that need it so the checker
    knows a truncation in the journal is no longer valid.

 Discussed with:        mckusick, kib (ffs_pages_remove and ffs_truncate parts)
 Tested by:     pho

Modified:
 head/sbin/fsck_ffs/suj.c
 head/sys/sys/vnode.h
 head/sys/ufs/ffs/ffs_alloc.c
 head/sys/ufs/ffs/ffs_balloc.c
 head/sys/ufs/ffs/ffs_extern.h
 head/sys/ufs/ffs/ffs_inode.c
 head/sys/ufs/ffs/ffs_softdep.c
 head/sys/ufs/ffs/ffs_vfsops.c
 head/sys/ufs/ffs/ffs_vnops.c
 head/sys/ufs/ffs/fs.h
 head/sys/ufs/ffs/softdep.h
 head/sys/ufs/ufs/inode.h
 head/sys/ufs/ufs/ufsmount.h

Modified: head/sbin/fsck_ffs/suj.c
==============================================================================
--- head/sbin/fsck_ffs/suj.c    Fri Jun 10 22:42:00 2011        (r222957)
+++ head/sbin/fsck_ffs/suj.c    Fri Jun 10 22:48:35 2011        (r222958)
@@ -1604,7 +1604,7 @@ ino_trunc(ino_t ino, off_t size)
         * uninitialized space later.
         */
        off = blkoff(fs, size);
-       if (off) {
+       if (off && DIP(ip, di_mode) != IFDIR) {
                uint8_t *buf;
                long clrsize;

@@ -1775,13 +1775,18 @@ cg_trunc(struct suj_cg *sc)
        struct suj_ino *sino;
        int i;

-       for (i = 0; i < SUJ_HASHSIZE; i++)
-               LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
+       for (i = 0; i < SUJ_HASHSIZE; i++) {
+               LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) {
                        if (sino->si_trunc) {
                                ino_trunc(sino->si_ino,
                                    sino->si_trunc->jt_size);
+                               sino->si_blkadj = 0;
                                sino->si_trunc = NULL;
                        }
+                       if (sino->si_blkadj)
+                               ino_adjblks(sino);
+               }
+       }
}

/*
@@ -1791,7 +1796,6 @@ cg_trunc(struct suj_cg *sc)
static void
cg_check_blk(struct suj_cg *sc)
{
-       struct suj_ino *sino;
        struct suj_blk *sblk;
        int i;

@@ -1799,15 +1803,6 @@ cg_check_blk(struct suj_cg *sc)
        for (i = 0; i < SUJ_HASHSIZE; i++)
                LIST_FOREACH(sblk, &sc->sc_blkhash[i], sb_next)
                        blk_check(sblk);
-       /*
-        * Now that we've freed blocks which are not referenced we
-        * make a second pass over all inodes to adjust their block
-        * counts.
-        */
-       for (i = 0; i < SUJ_HASHSIZE; i++)
-               LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
-                       if (sino->si_blkadj)
-                               ino_adjblks(sino);
}

/*
@@ -1961,14 +1956,7 @@ ino_append(union jrec *rec)
                    "parent %d, diroff %jd\n",
                    refrec->jr_op, refrec->jr_ino, refrec->jr_nlink,
                    refrec->jr_parent, refrec->jr_diroff);
-       /*
-        * Lookup the ino and clear truncate if one is found.  Partial
-        * truncates are always done synchronously so if we discover
-        * an operation that requires a lock the truncation has completed
-        * and can be discarded.
-        */
        sino = ino_lookup(((struct jrefrec *)rec)->jr_ino, 1);
-       sino->si_trunc = NULL;
        sino->si_hasrecs = 1;
        srec = errmalloc(sizeof(*srec));
        srec->sr_rec = rec;
@@ -2174,9 +2162,7 @@ blk_build(struct jblkrec *blkrec)
        struct suj_rec *srec;
        struct suj_blk *sblk;
        struct jblkrec *blkrn;
-       struct suj_ino *sino;
        ufs2_daddr_t blk;
-       off_t foff;
        int frag;

        if (debug)
@@ -2185,17 +2171,6 @@ blk_build(struct jblkrec *blkrec)
                    blkrec->jb_op, blkrec->jb_blkno, blkrec->jb_frags,
                    blkrec->jb_oldfrags, blkrec->jb_ino, blkrec->jb_lbn);

-       /*
-        * Look up the inode and clear the truncate if any lbns after the
-        * truncate lbn are freed or allocated.
-        */
-       sino = ino_lookup(blkrec->jb_ino, 0);
-       if (sino && sino->si_trunc) {
-               foff = lblktosize(fs, blkrec->jb_lbn);
-               foff += lfragtosize(fs, blkrec->jb_frags);
-               if (foff > sino->si_trunc->jt_size)
-                       sino->si_trunc = NULL;
-       }
        blk = blknum(fs, blkrec->jb_blkno);
        frag = fragnum(fs, blkrec->jb_blkno);
        sblk = blk_lookup(blk, 1);
@@ -2242,10 +2217,15 @@ ino_build_trunc(struct jtrncrec *rec)
        struct suj_ino *sino;

        if (debug)
-               printf("ino_build_trunc: ino %d, size %jd\n",
-                   rec->jt_ino, rec->jt_size);
+               printf("ino_build_trunc: op %d ino %d, size %jd\n",
+                   rec->jt_op, rec->jt_ino, rec->jt_size);
        sino = ino_lookup(rec->jt_ino, 1);
-       sino->si_trunc = rec;
+       if (rec->jt_op == JOP_SYNC) {
+               sino->si_trunc = NULL;
+               return;
+       }
+       if (sino->si_trunc == NULL || sino->si_trunc->jt_size > rec->jt_size)
+               sino->si_trunc = rec;
}

/*

Modified: head/sys/sys/vnode.h
==============================================================================
--- head/sys/sys/vnode.h        Fri Jun 10 22:42:00 2011        (r222957)
+++ head/sys/sys/vnode.h        Fri Jun 10 22:48:35 2011        (r222958)
@@ -302,6 +302,7 @@ struct vattr {
#define IO_EXT          0x0400          /* operate on external attributes */
#define IO_NORMAL       0x0800          /* operate on regular data */
#define IO_NOMACCHECK   0x1000          /* MAC checks unnecessary */
+#define        IO_BUFLOCKED    0x2000          /* ffs flag; indir buf is locked */

#define IO_SEQMAX       0x7F            /* seq heuristic max value */
#define IO_SEQSHIFT     16              /* seq heuristic in upper 16 bits */

Modified: head/sys/ufs/ffs/ffs_alloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_alloc.c        Fri Jun 10 22:42:00 2011        (r222957)
+++ head/sys/ufs/ffs/ffs_alloc.c        Fri Jun 10 22:48:35 2011        (r222958)
@@ -217,7 +217,7 @@ nospace:
        (void) chkdq(ip, -btodb(size), cred, FORCE);
        UFS_LOCK(ump);
#endif
-       if (reclaimed == 0) {
+       if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
                reclaimed = 1;
                softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT);
                goto retry;
@@ -418,7 +418,7 @@ nospace:
        /*
         * no space available
         */
-       if (reclaimed == 0) {
+       if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
                reclaimed = 1;
                UFS_UNLOCK(ump);
                if (bp) {

Modified: head/sys/ufs/ffs/ffs_balloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_balloc.c       Fri Jun 10 22:42:00 2011        (r222957)
+++ head/sys/ufs/ffs/ffs_balloc.c       Fri Jun 10 22:48:35 2011        (r222958)
@@ -105,6 +105,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t
        ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
        int unwindidx = -1;
        int saved_inbdflush;
+       int reclaimed;

        ip = VTOI(vp);
        dp = ip->i_din1;
@@ -112,6 +113,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t
        ump = ip->i_ump;
        lbn = lblkno(fs, startoffset);
        size = blkoff(fs, startoffset) + size;
+       reclaimed = 0;
        if (size > fs->fs_bsize)
                panic("ffs_balloc_ufs1: blk too big");
        *bpp = NULL;
@@ -276,6 +278,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t
        /*
         * Fetch through the indirect blocks, allocating as necessary.
         */
+retry:
        for (i = 1;;) {
                error = bread(vp,
                    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
@@ -296,8 +299,15 @@ ffs_balloc_ufs1(struct vnode *vp, off_t
                if (pref == 0)
                        pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
                if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
-                   flags, cred, &newb)) != 0) {
+                   flags | IO_BUFLOCKED, cred, &newb)) != 0) {
                        brelse(bp);
+                       if (++reclaimed == 1) {
+                               UFS_LOCK(ump);
+                               softdep_request_cleanup(fs, vp, cred,
+                                   FLUSH_BLOCKS_WAIT);
+                               UFS_UNLOCK(ump);
+                               goto retry;
+                       }
                        goto fail;
                }
                nb = newb;
@@ -349,10 +359,17 @@ ffs_balloc_ufs1(struct vnode *vp, off_t
        if (nb == 0) {
                UFS_LOCK(ump);
                pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
-               error = ffs_alloc(ip,
-                   lbn, pref, (int)fs->fs_bsize, flags, cred, &newb);
+               error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+                   flags | IO_BUFLOCKED, cred, &newb);
                if (error) {
                        brelse(bp);
+                       if (++reclaimed == 1) {
+                               UFS_LOCK(ump);
+                               softdep_request_cleanup(fs, vp, cred,
+                                   FLUSH_BLOCKS_WAIT);
+                               UFS_UNLOCK(ump);
+                               goto retry;
+                       }
                        goto fail;
                }
                nb = newb;
@@ -506,6 +523,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t
        int deallocated, osize, nsize, num, i, error;
        int unwindidx = -1;
        int saved_inbdflush;
+       int reclaimed;

        ip = VTOI(vp);
        dp = ip->i_din2;
@@ -513,6 +531,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t
        ump = ip->i_ump;
        lbn = lblkno(fs, startoffset);
        size = blkoff(fs, startoffset) + size;
+       reclaimed = 0;
        if (size > fs->fs_bsize)
                panic("ffs_balloc_ufs2: blk too big");
        *bpp = NULL;
@@ -787,6 +806,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t
        /*
         * Fetch through the indirect blocks, allocating as necessary.
         */
+retry:
        for (i = 1;;) {
                error = bread(vp,
                    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
@@ -807,8 +827,15 @@ ffs_balloc_ufs2(struct vnode *vp, off_t
                if (pref == 0)
                        pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
                if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
-                   flags, cred, &newb)) != 0) {
+                   flags | IO_BUFLOCKED, cred, &newb)) != 0) {
                        brelse(bp);
+                       if (++reclaimed == 1) {
+                               UFS_LOCK(ump);
+                               softdep_request_cleanup(fs, vp, cred,
+                                   FLUSH_BLOCKS_WAIT);
+                               UFS_UNLOCK(ump);
+                               goto retry;
+                       }
                        goto fail;
                }
                nb = newb;
@@ -860,10 +887,17 @@ ffs_balloc_ufs2(struct vnode *vp, off_t
        if (nb == 0) {
                UFS_LOCK(ump);
                pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
-               error = ffs_alloc(ip,
-                   lbn, pref, (int)fs->fs_bsize, flags, cred, &newb);
+               error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+                   flags | IO_BUFLOCKED, cred, &newb);
                if (error) {
                        brelse(bp);
+                       if (++reclaimed == 1) {
+                               UFS_LOCK(ump);
+                               softdep_request_cleanup(fs, vp, cred,
+                                   FLUSH_BLOCKS_WAIT);
+                               UFS_UNLOCK(ump);
+                               goto retry;
+                       }
                        goto fail;
                }
                nb = newb;

Modified: head/sys/ufs/ffs/ffs_extern.h
==============================================================================
--- head/sys/ufs/ffs/ffs_extern.h       Fri Jun 10 22:42:00 2011        (r222957)
+++ head/sys/ufs/ffs/ffs_extern.h       Fri Jun 10 22:48:35 2011        (r222958)
@@ -74,6 +74,7 @@ int   ffs_isfreeblock(struct fs *, u_char
void    ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
int     ffs_mountroot(void);
void    ffs_oldfscompat_write(struct fs *, struct ufsmount *);
+void   ffs_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end);
int     ffs_reallocblks(struct vop_reallocblks_args *);
int     ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t,
            ufs2_daddr_t, int, int, int, struct ucred *, struct buf **);
@@ -107,7 +108,6 @@ extern struct vop_vector ffs_fifoops2;

int     softdep_check_suspend(struct mount *, struct vnode *,
          int, int, int, int);
-int    softdep_complete_trunc(struct vnode *, void *);
void    softdep_get_depcounts(struct mount *, int *, int *);
void    softdep_initialize(void);
void    softdep_uninitialize(void);
@@ -139,14 +139,17 @@ void      softdep_setup_blkfree(struct mount
void    softdep_setup_inofree(struct mount *, struct buf *, ino_t,
            struct workhead *);
void    softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *);
-void   *softdep_setup_trunc(struct vnode *vp, off_t length, int flags);
void    softdep_fsync_mountdev(struct vnode *);
int     softdep_sync_metadata(struct vnode *);
+int    softdep_sync_buf(struct vnode *, struct buf *, int);
int     softdep_process_worklist(struct mount *, int);
int     softdep_fsync(struct vnode *);
int     softdep_waitidle(struct mount *);
int     softdep_prealloc(struct vnode *, int);
int     softdep_journal_lookup(struct mount *, struct vnode **);
+void   softdep_journal_freeblocks(struct inode *, struct ucred *, off_t, int);
+void   softdep_journal_fsync(struct inode *);
+

/*
 * Things to request flushing in softdep_request_cleanup()

Modified: head/sys/ufs/ffs/ffs_inode.c
==============================================================================
--- head/sys/ufs/ffs/ffs_inode.c        Fri Jun 10 22:42:00 2011        (r222957)
+++ head/sys/ufs/ffs/ffs_inode.c        Fri Jun 10 22:48:35 2011        (r222958)
@@ -120,7 +120,7 @@ ffs_update(vp, waitfor)
        }
}

-static void
+void
ffs_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
{
        vm_object_t object;
@@ -151,12 +151,12 @@ ffs_truncate(vp, length, flags, cred, td
        ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR];
        ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
        ufs2_daddr_t count, blocksreleased = 0, datablocks;
-       void *cookie;
        struct bufobj *bo;
        struct fs *fs;
        struct buf *bp;
        struct ufsmount *ump;
-       int needextclean, softdepslowdown, extblocks;
+       int softdeptrunc, journaltrunc;
+       int needextclean, extblocks;
        int offset, size, level, nblocks;
        int i, error, allerror;
        off_t osize;
@@ -165,7 +165,6 @@ ffs_truncate(vp, length, flags, cred, td
        fs = ip->i_fs;
        ump = ip->i_ump;
        bo = &vp->v_bufobj;
-       cookie = NULL;

        ASSERT_VOP_LOCKED(vp, "ffs_truncate");

@@ -173,6 +172,11 @@ ffs_truncate(vp, length, flags, cred, td
                return (EINVAL);
        if (length > fs->fs_maxfilesize)
                return (EFBIG);
+#ifdef QUOTA
+       error = getinoquota(ip);
+       if (error)
+               return (error);
+#endif
        /*
         * Historically clients did not have to specify which data
         * they were truncating. So, if not specified, we assume
@@ -191,7 +195,10 @@ ffs_truncate(vp, length, flags, cred, td
         */
        allerror = 0;
        needextclean = 0;
-       softdepslowdown = DOINGSOFTDEP(vp) && softdep_slowdown(vp);
+       softdeptrunc = 0;
+       journaltrunc = DOINGSUJ(vp);
+       if (journaltrunc == 0 && DOINGSOFTDEP(vp) && length == 0)
+               softdeptrunc = !softdep_slowdown(vp);
        extblocks = 0;
        datablocks = DIP(ip, i_blocks);
        if (fs->fs_magic == FS_UFS2_MAGIC && ip->i_din2->di_extsize > 0) {
@@ -199,27 +206,23 @@ ffs_truncate(vp, length, flags, cred, td
                datablocks -= extblocks;
        }
        if ((flags & IO_EXT) && extblocks > 0) {
-               if (DOINGSOFTDEP(vp) && softdepslowdown == 0 && length == 0) {
-                       if ((flags & IO_NORMAL) == 0) {
-                               softdep_setup_freeblocks(ip, length, IO_EXT);
-                               return (0);
-                       }
+               if (length != 0)
+                       panic("ffs_truncate: partial trunc of extdata");
+               if (softdeptrunc || journaltrunc) {
+                       if ((flags & IO_NORMAL) == 0)
+                               goto extclean;
                        needextclean = 1;
                } else {
-                       if (length != 0)
-                               panic("ffs_truncate: partial trunc of extdata");
                        if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
                                return (error);
-                       if (DOINGSUJ(vp))
-                               cookie = softdep_setup_trunc(vp, length, flags);
-                       osize = ip->i_din2->di_extsize;
-                       ip->i_din2->di_blocks -= extblocks;
#ifdef QUOTA
                        (void) chkdq(ip, -extblocks, NOCRED, 0);
#endif
                        vinvalbuf(vp, V_ALT, 0, 0);
                        ffs_pages_remove(vp,
                            OFF_TO_IDX(lblktosize(fs, -extblocks)), 0);
+                       osize = ip->i_din2->di_extsize;
+                       ip->i_din2->di_blocks -= extblocks;
                        ip->i_din2->di_extsize = 0;
                        for (i = 0; i < NXADDR; i++) {
                                oldblks[i] = ip->i_din2->di_extb[i];
@@ -227,7 +230,7 @@ ffs_truncate(vp, length, flags, cred, td
                        }
                        ip->i_flag |= IN_CHANGE;
                        if ((error = ffs_update(vp, 1)))
-                               goto out;
+                               return (error);
                        for (i = 0; i < NXADDR; i++) {
                                if (oldblks[i] == 0)
                                        continue;
@@ -236,10 +239,8 @@ ffs_truncate(vp, length, flags, cred, td
                        }
                }
        }
-       if ((flags & IO_NORMAL) == 0) {
-               error = 0;
-               goto out;
-       }
+       if ((flags & IO_NORMAL) == 0)
+               return (0);
        if (vp->v_type == VLNK &&
            (ip->i_size < vp->v_mount->mnt_maxsymlinklen ||
             datablocks == 0)) {
@@ -252,24 +253,17 @@ ffs_truncate(vp, length, flags, cred, td
                DIP_SET(ip, i_size, 0);
                ip->i_flag |= IN_CHANGE | IN_UPDATE;
                if (needextclean)
-                       softdep_setup_freeblocks(ip, length, IO_EXT);
-               error = ffs_update(vp, 1);
-               goto out;
+                       goto extclean;
+               return ffs_update(vp, 1);
        }
        if (ip->i_size == length) {
                ip->i_flag |= IN_CHANGE | IN_UPDATE;
                if (needextclean)
-                       softdep_setup_freeblocks(ip, length, IO_EXT);
-               error = ffs_update(vp, 0);
-               goto out;
+                       goto extclean;
+               return ffs_update(vp, 0);
        }
        if (fs->fs_ronly)
                panic("ffs_truncate: read-only filesystem");
-#ifdef QUOTA
-       error = getinoquota(ip);
-       if (error)
-               goto out;
-#endif
        if ((ip->i_flags & SF_SNAPSHOT) != 0)
                ffs_snapremove(vp);
        vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
@@ -285,7 +279,7 @@ ffs_truncate(vp, length, flags, cred, td
                error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
                if (error) {
                        vnode_pager_setsize(vp, osize);
-                       goto out;
+                       return (error);
                }
                ip->i_size = length;
                DIP_SET(ip, i_size, length);
@@ -296,11 +290,10 @@ ffs_truncate(vp, length, flags, cred, td
                else
                        bawrite(bp);
                ip->i_flag |= IN_CHANGE | IN_UPDATE;
-               error = ffs_update(vp, 1);
-               goto out;
+               return ffs_update(vp, 1);
        }
        if (DOINGSOFTDEP(vp)) {
-               if (length > 0 || softdepslowdown) {
+               if (softdeptrunc == 0 && journaltrunc == 0) {
                        /*
                         * If a file is only partially truncated, then
                         * we have to clean up the data structures
@@ -311,29 +304,20 @@ ffs_truncate(vp, length, flags, cred, td
                         * so that it will have no data structures left.
                         */
                        if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
-                               goto out;
-                       /*
-                        * We have to journal the truncation before we change
-                        * any blocks so we don't leave the file partially
-                        * truncated.
-                        */
-                       if (DOINGSUJ(vp) && cookie == NULL)
-                               cookie = softdep_setup_trunc(vp, length, flags);
+                               return (error);
                } else {
-#ifdef QUOTA
-                       (void) chkdq(ip, -datablocks, NOCRED, 0);
-#endif
-                       softdep_setup_freeblocks(ip, length, needextclean ?
-                           IO_EXT | IO_NORMAL : IO_NORMAL);
+                       flags = IO_NORMAL | (needextclean ? IO_EXT: 0);
+                       if (journaltrunc)
+                               softdep_journal_freeblocks(ip, cred, length,
+                                   flags);
+                       else
+                               softdep_setup_freeblocks(ip, length, flags);
                        ASSERT_VOP_LOCKED(vp, "ffs_truncate1");
-                       vinvalbuf(vp, needextclean ? 0 : V_NORMAL, 0, 0);
-                       if (!needextclean)
-                               ffs_pages_remove(vp, 0,
-                                   OFF_TO_IDX(lblktosize(fs, -extblocks)));
-                       vnode_pager_setsize(vp, 0);
-                       ip->i_flag |= IN_CHANGE | IN_UPDATE;
-                       error = ffs_update(vp, 0);
-                       goto out;
+                       if (journaltrunc == 0) {
+                               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+                               error = ffs_update(vp, 0);
+                       }
+                       return (error);
                }
        }
        /*
@@ -353,7 +337,7 @@ ffs_truncate(vp, length, flags, cred, td
                flags |= BA_CLRBUF;
                error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
                if (error)
-                       goto out;
+                       return (error);
                /*
                 * When we are doing soft updates and the UFS_BALLOC
                 * above fills in a direct block hole with a full sized
@@ -365,7 +349,7 @@ ffs_truncate(vp, length, flags, cred, td
                if (DOINGSOFTDEP(vp) && lbn < NDADDR &&
                    fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
                    (error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
-                       goto out;
+                       return (error);
                ip->i_size = length;
                DIP_SET(ip, i_size, length);
                size = blksize(fs, ip, lbn);
@@ -411,13 +395,7 @@ ffs_truncate(vp, length, flags, cred, td
                        DIP_SET(ip, i_db[i], 0);
        }
        ip->i_flag |= IN_CHANGE | IN_UPDATE;
-       /*
-        * When doing softupdate journaling we must preserve the size along
-        * with the old pointers until they are freed or we might not
-        * know how many fragments remain.
-        */
-       if (!DOINGSUJ(vp))
-               allerror = ffs_update(vp, 1);
+       allerror = ffs_update(vp, 1);

        /*
         * Having written the new inode to disk, save its new configuration
@@ -541,14 +519,14 @@ done:
#ifdef QUOTA
        (void) chkdq(ip, -blocksreleased, NOCRED, 0);
#endif
-       error = allerror;
-out:
-       if (cookie) {
-               allerror = softdep_complete_trunc(vp, cookie);
-               if (allerror != 0 && error == 0)
-                       error = allerror;
-       }
-       return (error);
+       return (allerror);
+
+extclean:
+       if (journaltrunc)
+               softdep_journal_freeblocks(ip, cred, length, IO_EXT);
+       else
+               softdep_setup_freeblocks(ip, length, IO_EXT);
+       return ffs_update(vp, MNT_WAIT);
}

/*

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c      Fri Jun 10 22:42:00 2011        (r222957)
+++ head/sys/ufs/ffs/ffs_softdep.c      Fri Jun 10 22:48:35 2011        (r222958)
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
#include <sys/buf.h>
#include <sys/kdb.h>
#include <sys/kthread.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
@@ -71,6 +72,7 @@ __FBSDID("$FreeBSD$");
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/conf.h>
+
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
@@ -82,6 +84,8 @@ __FBSDID("$FreeBSD$");
#include <ufs/ufs/ufs_extern.h>

#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>

#include <ddb/ddb.h>

@@ -214,6 +218,25 @@ softdep_setup_allocindir_meta(nbp, ip, b
}

void
+softdep_journal_freeblocks(ip, cred, length, flags)
+       struct inode *ip;
+       struct ucred *cred;
+       off_t length;
+       int flags;
+{
+
+       panic("softdep_journal_freeblocks called");
+}
+
+void
+softdep_journal_fsync(ip)
+       struct inode *ip;
+{
+
+       panic("softdep_journal_fsync called");
+}
+
+void
softdep_setup_freeblocks(ip, length, flags)
        struct inode *ip;
        off_t length;
@@ -282,29 +305,6 @@ softdep_setup_directory_change(bp, dp, i
        panic("softdep_setup_directory_change called");
}

-void *
-softdep_setup_trunc(vp, length, flags)
-       struct vnode *vp;
-       off_t length;
-       int flags;
-{
-
-       panic("%s called", __FUNCTION__);
-
-       return (NULL);
-}
-
-int
-softdep_complete_trunc(vp, cookie)
-       struct vnode *vp;
-       void *cookie;
-{
-
-       panic("%s called", __FUNCTION__);
-
-       return (0);
-}
-
void
softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
        struct mount *mp;
@@ -499,6 +499,13 @@ softdep_sync_metadata(struct vnode *vp)
}

int
+softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
+{
+
+       return (0);
+}
+
+int
softdep_slowdown(vp)
        struct vnode *vp;
{
@@ -614,10 +621,13 @@ FEATURE(softupdates, "FFS soft-updates s
#define D_JSEGDEP       23
#define D_SBDEP         24
#define D_JTRUNC        25
-#define        D_LAST          D_JTRUNC
+#define        D_JFSYNC        26
+#define        D_SENTINAL      27
+#define        D_LAST          D_SENTINAL

unsigned long dep_current[D_LAST + 1];
unsigned long dep_total[D_LAST + 1];
+unsigned long dep_write[D_LAST + 1];


SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats");
@@ -625,13 +635,17 @@ SYSCTL_NODE(_debug_softdep, OID_AUTO, to
    "total dependencies allocated");
SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
    "current dependencies allocated");
+SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
+    "current dependencies written");

#define SOFTDEP_TYPE(type, str, long)                                   \
    static MALLOC_DEFINE(M_ ## type, #str, long);                       \
    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,       \
        &dep_total[D_ ## type], 0, "");                                   \
    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD,     \
-       &dep_current[D_ ## type], 0, "");
+       &dep_current[D_ ## type], 0, "");                         \
+    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD,      \
+       &dep_write[D_ ## type], 0, "");

SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
@@ -660,6 +674,7 @@ SOFTDEP_TYPE(JSEG, jseg, "Journal segmen
SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
+SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");

static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
@@ -694,7 +709,8 @@ static struct malloc_type *memtype[] = {
        M_JSEG,
        M_JSEGDEP,
        M_SBDEP,
-       M_JTRUNC
+       M_JTRUNC,
+       M_JFSYNC
};

static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
@@ -734,10 +750,11 @@ static    void clear_unlinked_inodedep(stru
static  struct inodedep *first_unlinked_inodedep(struct ufsmount *);
static  int flush_pagedep_deps(struct vnode *, struct mount *,
            struct diraddhd *);
-static void free_pagedep(struct pagedep *);
+static int free_pagedep(struct pagedep *);
static  int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
-static int flush_inodedep_deps(struct mount *, ino_t);
+static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
static  int flush_deplist(struct allocdirectlst *, int, int *);
+static int sync_cgs(struct mount *, int);
static  int handle_written_filepage(struct pagedep *, struct buf *);
static  int handle_written_sbdep(struct sbdep *, struct buf *);
static  void initiate_write_sbdep(struct sbdep *);
@@ -750,7 +767,7 @@ static      void handle_written_jaddref(struc
static  void handle_written_jremref(struct jremref *);
static  void handle_written_jseg(struct jseg *, struct buf *);
static  void handle_written_jnewblk(struct jnewblk *);
-static void handle_written_jfreeblk(struct jfreeblk *);
+static void handle_written_jblkdep(struct jblkdep *);
static  void handle_written_jfreefrag(struct jfreefrag *);
static  void complete_jseg(struct jseg *);
static  void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
@@ -758,6 +775,7 @@ static      void jaddref_write(struct jaddref
static  void jremref_write(struct jremref *, struct jseg *, uint8_t *);
static  void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
static  void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
+static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
static  void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
static  void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
static  void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
@@ -768,7 +786,9 @@ static      void handle_allocdirect_partdone(
static  struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
            struct workhead *);
static  void indirdep_complete(struct indirdep *);
-static int indirblk_inseg(struct mount *, ufs2_daddr_t);
+static int indirblk_lookup(struct mount *, ufs2_daddr_t);
+static void indirblk_insert(struct freework *);
+static void indirblk_remove(struct freework *);
static  void handle_allocindir_partdone(struct allocindir *);
static  void initiate_write_filepage(struct pagedep *, struct buf *);
static  void initiate_write_indirdep(struct indirdep*, struct buf *);
@@ -777,10 +797,12 @@ static    void initiate_write_bmsafemap(str
static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
static  void handle_workitem_freefile(struct freefile *);
-static void handle_workitem_remove(struct dirrem *, struct vnode *);
+static int handle_workitem_remove(struct dirrem *, int);
static  struct dirrem *newdirrem(struct buf *, struct inode *,
            struct inode *, int, struct dirrem **);
-static void cancel_indirdep(struct indirdep *, struct buf *, struct inodedep *,
+static struct indirdep *indirdep_lookup(struct mount *, struct inode *,
+           struct buf *);
+static void cancel_indirdep(struct indirdep *, struct buf *,
            struct freeblks *);
static  void free_indirdep(struct indirdep *);
static  void free_diradd(struct diradd *, struct workhead *);
@@ -795,8 +817,13 @@ static     void cancel_diradd(struct diradd
            struct jremref *, struct jremref *);
static  void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
            struct jremref *);
-static void cancel_allocindir(struct allocindir *, struct inodedep *,
-           struct freeblks *);
+static void cancel_allocindir(struct allocindir *, struct buf *bp,
+           struct freeblks *, int);
+static int setup_trunc_indir(struct freeblks *, struct inode *,
+           ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
+static void complete_trunc_indir(struct freework *);
+static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
+           int);
static  void complete_mkdir(struct mkdir *);
static  void free_newdirblk(struct newdirblk *);
static  void free_jremref(struct jremref *);
@@ -806,7 +833,7 @@ static      void free_jsegs(struct jblocks *)
static  void rele_jseg(struct jseg *);
static  void free_jseg(struct jseg *, struct jblocks *);
static  void free_jnewblk(struct jnewblk *);
-static void free_jfreeblk(struct jfreeblk *);
+static void free_jblkdep(struct jblkdep *);
static  void free_jfreefrag(struct jfreefrag *);
static  void free_freedep(struct freedep *);
static  void journal_jremref(struct dirrem *, struct jremref *,
@@ -818,30 +845,33 @@ static    void cancel_jfreefrag(struct jfre
static  inline void setup_freedirect(struct freeblks *, struct inode *,
            int, int);
static  inline void setup_freeext(struct freeblks *, struct inode *, int, int);
-static inline void setup_freeindir(struct freeblks *, struct inode *, int i,
+static inline void setup_freeindir(struct freeblks *, struct inode *, int,
            ufs_lbn_t, int);
static  inline struct freeblks *newfreeblks(struct mount *, struct inode *);
static  void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
-static void softdep_trunc_deps(struct vnode *, struct freeblks *, ufs_lbn_t,
+ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
+static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
+static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
            int, int);
-static         int cancel_pagedep(struct pagedep *, struct inodedep *,
-           struct freeblks *);
-static int deallocate_dependencies(struct buf *, struct inodedep *,
-           struct freeblks *, int off);
+static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
+static         int cancel_pagedep(struct pagedep *, struct freeblks *, int);
+static int deallocate_dependencies(struct buf *, struct freeblks *, int);
+static void newblk_freefrag(struct newblk*);
static  void free_newblk(struct newblk *);
static  void cancel_allocdirect(struct allocdirectlst *,
-           struct allocdirect *, struct freeblks *, int);
+           struct allocdirect *, struct freeblks *);
static  int check_inode_unwritten(struct inodedep *);
static  int free_inodedep(struct inodedep *);
static  void freework_freeblock(struct freework *);
-static void handle_workitem_freeblocks(struct freeblks *, int);
-static void handle_complete_freeblocks(struct freeblks *);
+static void freework_enqueue(struct freework *);
+static int handle_workitem_freeblocks(struct freeblks *, int);
+static int handle_complete_freeblocks(struct freeblks *, int);
static  void handle_workitem_indirblk(struct freework *);
-static void handle_written_freework(struct freework *);
+static void handle_written_freework(struct freework *, int);
static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
static  struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
            struct workhead *);
-static void setup_allocindir_phase2(struct buf *, struct inode *,
+static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
            struct inodedep *, struct allocindir *, ufs_lbn_t);
static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
            ufs2_daddr_t, ufs_lbn_t);
@@ -862,16 +892,20 @@ static    int newblk_lookup(struct mount *,
static  int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
            struct inodedep **);
static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
-static int pagedep_lookup(struct mount *, ino_t, ufs_lbn_t, int,
-           struct pagedep **);
+static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
+           int, struct pagedep **);
static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
            struct mount *mp, int, struct pagedep **);
static  void pause_timer(void *);
static  int request_cleanup(struct mount *, int);
-static int process_worklist_item(struct mount *, int);
+static int process_worklist_item(struct mount *, int, int);
static  void process_removes(struct vnode *);
+static void process_truncates(struct vnode *);
static  void jwork_move(struct workhead *, struct workhead *);
+static void jwork_insert(struct workhead *, struct jsegdep *);
static  void add_to_worklist(struct worklist *, int);
+static void wake_worklist(struct worklist *);
+static void wait_worklist(struct worklist *, char *);
static  void remove_from_worklist(struct worklist *);
static  void softdep_flush(void);
static  int softdep_speedup(void);
@@ -889,17 +923,20 @@ static    struct jremref *newjremref(struct
            struct inode *ip, off_t, nlink_t);
static  struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
            uint16_t);
-static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
+static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
            uint16_t);
-static inline struct jsegdep *inoref_jseg(struct inoref *);
+static inline struct jsegdep *inoref_jseg(struct inoref *);
static  struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
static  struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
            ufs2_daddr_t, int);
+static struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
+static void move_newblock_dep(struct jaddref *, struct inodedep *);
+static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
static  struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
            ufs2_daddr_t, long, ufs_lbn_t);
static  struct freework *newfreework(struct ufsmount *, struct freeblks *,
-           struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int);
-static void jwait(struct worklist *wk);
+           struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
+static int jwait(struct worklist *, int);
static  struct inodedep *inodedep_lookup_ip(struct inode *);
static  int bmsafemap_rollbacks(struct bmsafemap *);
static  struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
@@ -1064,6 +1101,30 @@ jwork_move(dst, src)
        }
}

+static void /* Put jsegdep on dst, or merge it with a D_JSEGDEP already there. */
+jwork_insert(dst, jsegdep)
+       struct workhead *dst; /* work list that receives the dependency */
+       struct jsegdep *jsegdep; /* always consumed: either inserted or freed */
+{
+       struct jsegdep *jsegdepn; /* D_JSEGDEP item already found on dst */
+       struct worklist *wk;
+
+       LIST_FOREACH(wk, dst, wk_list) /* stop at the first D_JSEGDEP on dst */
+               if (wk->wk_type == D_JSEGDEP)
+                       break;
+       if (wk == NULL) { /* no D_JSEGDEP on dst: just insert the new one */
+               WORKLIST_INSERT(dst, &jsegdep->jd_list);
+               return;
+       }
+       jsegdepn = WK_JSEGDEP(wk);
+       if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { /* new one has the lower js_seq */
+               WORKLIST_REMOVE(wk); /* swap: drop the existing item, insert jsegdep */
+               free_jsegdep(jsegdepn);
+               WORKLIST_INSERT(dst, &jsegdep->jd_list);
+       } else /* existing item's js_seq is lower or equal: keep it, free the new one */
+               free_jsegdep(jsegdep);
+}
+
/*
 * Routines for tracking and managing workitems.
 */
@@ -1088,6 +1149,8 @@ workitem_free(item, type)
                panic("workitem_free: type mismatch %s != %s",
                    TYPENAME(item->wk_type), TYPENAME(type));
#endif
+       if (item->wk_state & IOWAITING)
+               wakeup(item);
        ump = VFSTOUFS(item->wk_mp);
        if (--ump->softdep_deps == 0 && ump->softdep_req)
                wakeup(&ump->softdep_deps);
@@ -1101,14 +1164,18 @@ workitem_alloc(item, type, mp)
        int type;
        struct mount *mp;
{
+       struct ufsmount *ump;
+
        item->wk_type = type;
        item->wk_mp = mp;
        item->wk_state = 0;
+
+       ump = VFSTOUFS(mp);
        ACQUIRE_LOCK(&lk);
        dep_current[type]++;
        dep_total[type]++;
-       VFSTOUFS(mp)->softdep_deps++;
-       VFSTOUFS(mp)->softdep_accdeps++;
+       ump->softdep_deps++;
+       ump->softdep_accdeps++;
        FREE_LOCK(&lk);
}

@@ -1270,8 +1337,7 @@ softdep_flush(void)
                        vfslocked = VFS_LOCK_GIANT(mp);
                        progress += softdep_process_worklist(mp, 0);
                        ump = VFSTOUFS(mp);
-                       remaining += ump->softdep_on_worklist -
-                               ump->softdep_on_worklist_inprogress;
+                       remaining += ump->softdep_on_worklist;
                        VFS_UNLOCK_GIANT(vfslocked);
                        mtx_lock(&mountlist_mtx);
                        nmp = TAILQ_NEXT(mp, mnt_list);
@@ -1314,10 +1380,14 @@ softdep_speedup(void)
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
+
+#define        WK_HEAD         0x0001  /* Add to HEAD. */
+#define        WK_NODELAY      0x0002  /* Process immediately. */
+
static void
-add_to_worklist(wk, nodelay)
+add_to_worklist(wk, flags)
        struct worklist *wk;
-       int nodelay;
+       int flags;
{
        struct ufsmount *ump;

@@ -1327,13 +1397,17 @@ add_to_worklist(wk, nodelay)
                panic("add_to_worklist: %s(0x%X) already on list",
                    TYPENAME(wk->wk_type), wk->wk_state);
        wk->wk_state |= ONWORKLIST;
-       if (LIST_EMPTY(&ump->softdep_workitem_pending))
+       if (ump->softdep_on_worklist == 0) {
                LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
-       else
+               ump->softdep_worklist_tail = wk;
+       } else if (flags & WK_HEAD) {
+               LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
+       } else {
                LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to