The branch main has been updated by des:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=69d94f4c7608e41505996559367450706e91fbb8

commit 69d94f4c7608e41505996559367450706e91fbb8
Author:     Dag-Erling Smørgrav <d...@freebsd.org>
AuthorDate: 2023-02-02 17:18:41 +0000
Commit:     Dag-Erling Smørgrav <d...@freebsd.org>
CommitDate: 2023-02-02 17:19:29 +0000

    Add tarfs, a filesystem backed by tarballs.
    
    Sponsored by:   Juniper Networks, Inc.
    Sponsored by:   Klara, Inc.
    Reviewed by:    pauamma, imp
    Differential Revision:  https://reviews.freebsd.org/D37753
---
 etc/mtree/BSD.tests.dist         |    2 +
 share/man/man5/Makefile          |    1 +
 share/man/man5/tarfs.5           |  103 ++++
 sys/conf/files                   |    4 +
 sys/conf/options                 |    4 +
 sys/fs/tarfs/tarfs.h             |  254 +++++++++
 sys/fs/tarfs/tarfs_dbg.h         |   65 +++
 sys/fs/tarfs/tarfs_io.c          |  727 +++++++++++++++++++++++
 sys/fs/tarfs/tarfs_subr.c        |  603 ++++++++++++++++++++
 sys/fs/tarfs/tarfs_vfsops.c      | 1173 ++++++++++++++++++++++++++++++++++++++
 sys/fs/tarfs/tarfs_vnops.c       |  642 +++++++++++++++++++++
 sys/kern/subr_witness.c          |    6 +
 sys/modules/Makefile             |    1 +
 sys/modules/tarfs/Makefile       |   23 +
 tests/sys/fs/Makefile            |    1 +
 tests/sys/fs/tarfs/Makefile      |   10 +
 tests/sys/fs/tarfs/mktar.c       |  238 ++++++++
 tests/sys/fs/tarfs/tarfs_test.sh |   54 ++
 18 files changed, 3911 insertions(+)

diff --git a/etc/mtree/BSD.tests.dist b/etc/mtree/BSD.tests.dist
index 0d05ecaf06fc..b4b18997b7f9 100644
--- a/etc/mtree/BSD.tests.dist
+++ b/etc/mtree/BSD.tests.dist
@@ -757,6 +757,8 @@
         fs
             fusefs
             ..
+            tarfs
+            ..
             tmpfs
             ..
         ..
diff --git a/share/man/man5/Makefile b/share/man/man5/Makefile
index 2d49d981c2f9..f6e91e4ed00b 100644
--- a/share/man/man5/Makefile
+++ b/share/man/man5/Makefile
@@ -70,6 +70,7 @@ MAN=  acct.5 \
        style.Makefile.5 \
        style.mdoc.5 \
        sysctl.conf.5 \
+       tarfs.5 \
        tmpfs.5 \
        unionfs.5
 
diff --git a/share/man/man5/tarfs.5 b/share/man/man5/tarfs.5
new file mode 100644
index 000000000000..b25131c323c1
--- /dev/null
+++ b/share/man/man5/tarfs.5
@@ -0,0 +1,103 @@
+.\"-
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2022 Klara, Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd February 2, 2023
+.Dt TARFS 5
+.Os
+.Sh NAME
+.Nm tarfs
+.Nd tarball filesystem
+.Sh SYNOPSIS
+To compile this driver into the kernel, place the following line in
+your kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "options TARFS"
+.Ed
+.Pp
+Alternatively, to load the driver as a module at boot time, place the
+following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+tarfs_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver implements a read-only filesystem backed by a
+.Xr tar 5
+file.
+Currently, only POSIX archives, optionally compressed with
+.Xr zstd 1 ,
+are supported.
+.Pp
+The preferred I/O size for
+.Nm
+filesystems can be adjusted using the
+.Va vfs.tarfs.ioshift
+sysctl setting and tunable.
+Setting it to 0 will reset it to its default value.
+Note that changes to this setting only apply to filesystems mounted
+after the change.
+.Sh DIAGNOSTICS
+If enabled by the
+.Dv TARFS_DEBUG
+kernel option, the
+.Va vfs.tarfs.debug
+sysctl setting can be used to control debugging output from the
+.Nm
+driver.
+Debugging output for individual sections of the driver can be enabled
+by adding together the relevant values from the table below.
+.Bl -column Value Description
+.It 0x01 Ta Memory allocations
+.It 0x02 Ta Checksum calculations
+.It 0x04 Ta Filesystem operations (vfsops)
+.It 0x08 Ta Path lookups
+.It 0x10 Ta File operations (vnops)
+.It 0x20 Ta General I/O
+.It 0x40 Ta Decompression
+.It 0x80 Ta Decompression index
+.It 0x100 Ta Sparse file mapping
+.El
+.Sh SEE ALSO
+.Xr tar 1 ,
+.Xr zstd 1 ,
+.Xr fstab 5 ,
+.Xr tar 5 ,
+.Xr mount 8 ,
+.Xr sysctl 8
+.Sh HISTORY
+.An -nosplit
+The
+.Nm
+driver was developed by
+.An Stephen J. Kiernan Aq Mt ste...@freebsd.org
+and
+.An Dag-Erling Smørgrav Aq Mt d...@freebsd.org
+for Juniper Networks and Klara Systems.
+This manual page was written by
+.An Dag-Erling Smørgrav Aq Mt d...@freebsd.org
+for Juniper Networks and Klara Systems.
diff --git a/sys/conf/files b/sys/conf/files
index 6cb4abcd9223..08966a9b46e4 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3615,6 +3615,10 @@ fs/smbfs/smbfs_smb.c             optional smbfs
 fs/smbfs/smbfs_subr.c          optional smbfs
 fs/smbfs/smbfs_vfsops.c                optional smbfs
 fs/smbfs/smbfs_vnops.c         optional smbfs
+fs/tarfs/tarfs_io.c            optional tarfs compile-with "${NORMAL_C} -I$S/contrib/zstd/lib/freebsd"
+fs/tarfs/tarfs_subr.c          optional tarfs
+fs/tarfs/tarfs_vfsops.c                optional tarfs
+fs/tarfs/tarfs_vnops.c         optional tarfs
 fs/udf/osta.c                  optional udf
 fs/udf/udf_iconv.c             optional udf_iconv
 fs/udf/udf_vfsops.c            optional udf
diff --git a/sys/conf/options b/sys/conf/options
index 1f5003507539..3b2be66ba602 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -265,6 +265,7 @@ NULLFS              opt_dontuse.h
 PROCFS         opt_dontuse.h
 PSEUDOFS       opt_dontuse.h
 SMBFS          opt_dontuse.h
+TARFS          opt_dontuse.h
 TMPFS          opt_dontuse.h
 UDF            opt_dontuse.h
 UNIONFS                opt_dontuse.h
@@ -273,6 +274,9 @@ ZFS         opt_dontuse.h
 # Pseudofs debugging
 PSEUDOFS_TRACE opt_pseudofs.h
 
+# Tarfs debugging
+TARFS_DEBUG    opt_tarfs.h
+
 # In-kernel GSS-API
 KGSSAPI                opt_kgssapi.h
 KGSSAPI_DEBUG  opt_kgssapi.h
diff --git a/sys/fs/tarfs/tarfs.h b/sys/fs/tarfs/tarfs.h
new file mode 100644
index 000000000000..dffd60ee6d8a
--- /dev/null
+++ b/sys/fs/tarfs/tarfs.h
@@ -0,0 +1,254 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef        _FS_TARFS_TARFS_H_
+#define        _FS_TARFS_TARFS_H_
+
+#ifndef _KERNEL
+#error Should only be included by kernel
+#endif
+
+MALLOC_DECLARE(M_TARFSMNT);
+MALLOC_DECLARE(M_TARFSNODE);
+MALLOC_DECLARE(M_TARFSNAME);
+
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_vfs_tarfs);
+#endif
+
+struct componentname;
+struct mount;
+struct vnode;
+
+/*
+ * Internal representation of a tarfs file system node.
+ *
+ * The node type selects which arm of the trailing anonymous union is
+ * meaningful, as noted on each arm.
+ */
+struct tarfs_node {
+       /*
+        * Tail queue linkage.  NOTE(review): presumably one entry links the
+        * node into the mount's allnodes list and the other into the parent
+        * directory's dirhead list; confirm against tarfs_subr.c.
+        */
+       TAILQ_ENTRY(tarfs_node) entries;
+       TAILQ_ENTRY(tarfs_node) dirents;
+
+       /* Acquired and released via TARFS_NODE_LOCK() / TARFS_NODE_UNLOCK() */
+       struct mtx               lock;
+
+       struct vnode            *vnode;
+       struct tarfs_mount      *tmp;
+       enum vtype               type;
+       ino_t                    ino;
+       /*
+        * NOTE(review): offset / size / physize presumably describe where
+        * the node's data lives in the archive and its logical versus
+        * stored size; confirm against the code that fills them in.
+        */
+       off_t                    offset;
+       size_t                   size;
+       size_t                   physize;
+       char                    *name;
+       size_t                   namelen;
+
+       /* Node attributes */
+       uid_t                    uid;
+       gid_t                    gid;
+       mode_t                   mode;
+       unsigned int             flags;
+       nlink_t                  nlink;
+       struct timespec          atime;
+       struct timespec          mtime;
+       struct timespec          ctime;
+       struct timespec          birthtime;
+       unsigned long            gen;
+
+       /* Block map: nblk counts the struct tarfs_blk entries at blk */
+       size_t                   nblk;
+       struct tarfs_blk        *blk;
+
+       struct tarfs_node       *parent;
+       union {
+               /* VDIR */
+               struct {
+                       TAILQ_HEAD(, tarfs_node) dirhead;
+                       off_t                    lastcookie;
+                       struct tarfs_node       *lastnode;
+               } dir;
+
+               /* VLNK */
+               struct {
+                       char                    *name;
+                       size_t                   namelen;
+               } link;
+
+               /* VBLK or VCHR */
+               dev_t                    rdev;
+
+               /* VREG */
+               struct tarfs_node       *other;
+       };
+};
+
+/*
+ * Entry in sparse file block map.  Each entry describes one run of l
+ * bytes found at offset i on the input (physical) side and appearing
+ * at offset o on the output (logical) side.  A node's map is the array
+ * referenced by tarfs_node.blk, with tarfs_node.nblk entries.
+ */
+struct tarfs_blk {
+       off_t    i;             /* input (physical) offset */
+       off_t    o;             /* output (logical) offset */
+       size_t   l;             /* length */
+};
+
+/*
+ * Decompression buffer.  The storage is embedded in the structure and
+ * is TARFS_ZBUF_SIZE (1048576 bytes, i.e. 1 MiB) long; off and len
+ * describe the contents currently held in buf.
+ */
+#define TARFS_ZBUF_SIZE 1048576
+struct tarfs_zbuf {
+       u_char           buf[TARFS_ZBUF_SIZE];
+       size_t           off; /* offset of contents */
+       size_t           len; /* length of contents */
+};
+
+/*
+ * Internal representation of a tarfs mount point.
+ */
+struct tarfs_mount {
+       /* List of tarfs_node structures and the mutex associated with it */
+       TAILQ_HEAD(, tarfs_node) allnodes;
+       /* Acquired via TARFS_ALLNODES_LOCK() / TARFS_ALLNODES_UNLOCK() */
+       struct mtx               allnode_lock;
+
+       struct tarfs_node       *root;
+       /* Vnode of the tar file itself; tarfs_io_read() reads it directly */
+       struct vnode            *vp;
+       struct mount            *vfs;
+       ino_t                    ino;
+       struct unrhdr           *ino_unr;
+       size_t                   iosize;
+       size_t                   nblocks;
+       size_t                   nfiles;
+       time_t                   mtime; /* default mtime for directories */
+
+       /*
+        * Decompression layer.  When znode is non-NULL, tarfs_io_read()
+        * routes non-raw reads through it instead of through vp.
+        */
+       struct tarfs_zio        *zio;
+       struct vnode            *znode;
+};
+
+/*
+ * State of the decompression layer attached to a tarfs mount: a back
+ * pointer to the mount, the current position in the compressed input
+ * and decompressed output, and an index of (input, output) offset
+ * pairs maintained by tarfs_zio_update_index().
+ */
+struct tarfs_zio {
+       struct tarfs_mount      *tmp;
+
+       /* decompression state */
+#ifdef ZSTDIO
+       struct tarfs_zstd       *zstd; /* decompression state (zstd) */
+#endif
+       off_t                    ipos; /* current input position */
+       off_t                    opos; /* current output position */
+
+       /* index of compression frames */
+       unsigned int             curidx; /* current index position*/
+       unsigned int             nidx; /* number of index entries */
+       unsigned int             szidx; /* index capacity */
+       /* each entry pairs an input offset (i) with an output offset (o) */
+       struct tarfs_idx { off_t i, o; } *idx;
+};
+
+/*
+ * Identifies a tarfs node by inode number and generation number.
+ * NOTE(review): the len / data0 prefix looks like it is meant to
+ * overlay the generic struct fid used for file handles; confirm
+ * against the code that uses this structure.
+ */
+struct tarfs_fid {
+       u_short                  len;   /* length of data in bytes */
+       u_short                  data0; /* force alignment */
+       ino_t                    ino;
+       unsigned long            gen;
+};
+
+/*
+ * Locking helpers.  The NODE macros take a struct tarfs_node pointer and
+ * operate on its lock mutex; the ALLNODES macros take a struct
+ * tarfs_mount pointer and operate on its allnode_lock mutex.
+ *
+ * The ALLNODES macros previously named their parameter tnp while
+ * expanding (tmp), which ignored the argument and silently captured
+ * whatever variable named tmp was in scope at the call site.
+ */
+#define        TARFS_NODE_LOCK(tnp) \
+       mtx_lock(&(tnp)->lock)
+#define        TARFS_NODE_UNLOCK(tnp) \
+       mtx_unlock(&(tnp)->lock)
+#define        TARFS_ALLNODES_LOCK(tmp) \
+       mtx_lock(&(tmp)->allnode_lock)
+#define        TARFS_ALLNODES_UNLOCK(tmp) \
+       mtx_unlock(&(tmp)->allnode_lock)
+
+/*
+ * Data and metadata within tar files are aligned on 512-byte boundaries,
+ * to match the block size of the magnetic tapes they were originally
+ * intended for.
+ *
+ * TARFS_BLKOFF(l) is the offset of l within its block, TARFS_BLKNUM(l)
+ * is the number of the block containing l, and TARFS_SZ2BLKS(sz) is the
+ * number of blocks needed to hold sz bytes, rounding up.
+ */
+#define        TARFS_BSHIFT            9
+#define        TARFS_BLOCKSIZE         ((size_t)(1U << TARFS_BSHIFT))
+#define        TARFS_BLKOFF(l)         ((l) % TARFS_BLOCKSIZE)
+#define        TARFS_BLKNUM(l)         ((l) >> TARFS_BSHIFT)
+#define        TARFS_SZ2BLKS(sz)       (((sz) + TARFS_BLOCKSIZE - 1) / TARFS_BLOCKSIZE)
+
+/*
+ * Our preferred I/O size.
+ */
+extern unsigned int tarfs_ioshift;
+#define        TARFS_IOSHIFT_MIN       TARFS_BSHIFT
+#define        TARFS_IOSHIFT_DEFAULT   PAGE_SHIFT
+#define        TARFS_IOSHIFT_MAX       PAGE_SHIFT
+
+#define        TARFS_ROOTINO           ((ino_t)3)
+#define        TARFS_ZIOINO            ((ino_t)4)
+#define        TARFS_MININO            ((ino_t)65535)
+
+#define        TARFS_COOKIE_DOT        0
+#define        TARFS_COOKIE_DOTDOT     1
+#define        TARFS_COOKIE_EOF        OFF_MAX
+
+#define        TARFS_ZIO_NAME          ".tar"
+#define        TARFS_ZIO_NAMELEN       (sizeof(TARFS_ZIO_NAME) - 1)
+
+extern struct vop_vector tarfs_vnodeops;
+
+/*
+ * Returns the tarfs-private data attached to a mount point.  Both the
+ * mount pointer and its mnt_data field are asserted to be non-NULL.
+ */
+static inline struct tarfs_mount *
+MP_TO_TARFS_MOUNT(struct mount *mp)
+{
+       struct tarfs_mount *tmp;
+
+       MPASS(mp != NULL && mp->mnt_data != NULL);
+       tmp = mp->mnt_data;
+       return (tmp);
+}
+
+/*
+ * Returns the tarfs node attached to a vnode.  Both the vnode pointer
+ * and its v_data field are asserted to be non-NULL.
+ */
+static inline struct tarfs_node *
+VP_TO_TARFS_NODE(struct vnode *vp)
+{
+       struct tarfs_node *tnp;
+
+       MPASS(vp != NULL && vp->v_data != NULL);
+       tnp = vp->v_data;
+       return (tnp);
+}
+
+int    tarfs_alloc_node(struct tarfs_mount *tmp, const char *name,
+           size_t namelen, enum vtype type, off_t off, size_t sz,
+           time_t mtime, uid_t uid, gid_t gid, mode_t mode,
+           unsigned int flags, const char *linkname, dev_t rdev,
+           struct tarfs_node *parent, struct tarfs_node **node);
+int    tarfs_load_blockmap(struct tarfs_node *tnp, size_t realsize);
+void   tarfs_dump_tree(struct tarfs_node *tnp);
+void   tarfs_free_node(struct tarfs_node *tnp);
+struct tarfs_node *
+       tarfs_lookup_dir(struct tarfs_node *tnp, off_t cookie);
+struct tarfs_node *
+       tarfs_lookup_node(struct tarfs_node *tnp, struct tarfs_node *f,
+           struct componentname *cnp);
+void   tarfs_print_node(struct tarfs_node *tnp);
+int    tarfs_read_file(struct tarfs_node *tnp, size_t len, struct uio *uiop);
+
+int    tarfs_io_init(struct tarfs_mount *tmp);
+int    tarfs_io_fini(struct tarfs_mount *tmp);
+int    tarfs_io_read(struct tarfs_mount *tmp, bool raw,
+    struct uio *uiop);
+ssize_t        tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw,
+    void *buf, off_t off, size_t len);
+unsigned int
+       tarfs_strtofflags(const char *str, char **end);
+
+#endif /* _FS_TARFS_TARFS_H_ */
diff --git a/sys/fs/tarfs/tarfs_dbg.h b/sys/fs/tarfs/tarfs_dbg.h
new file mode 100644
index 000000000000..45d11d679719
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_dbg.h
@@ -0,0 +1,65 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef        _FS_TARFS_TARFS_DBG_H_
+#define        _FS_TARFS_TARFS_DBG_H_
+
+#ifndef _KERNEL
+#error Should only be included by kernel
+#endif
+
+#ifdef TARFS_DEBUG
+extern int tarfs_debug;
+
+/*
+ * Debug categories.  Each is a single bit in tarfs_debug; the macros
+ * below paste the category argument onto TARFS_DEBUG_ to select one.
+ */
+#define        TARFS_DEBUG_ALLOC       0x01
+#define        TARFS_DEBUG_CHECKSUM    0x02
+#define        TARFS_DEBUG_FS          0x04
+#define        TARFS_DEBUG_LOOKUP      0x08
+#define        TARFS_DEBUG_VNODE       0x10
+#define        TARFS_DEBUG_IO          0x20
+#define        TARFS_DEBUG_ZIO         0x40
+#define        TARFS_DEBUG_ZIDX        0x80
+#define        TARFS_DEBUG_MAP         0x100
+
+/*
+ * TARFS_DPF(category, fmt, ...) prints a message if the given category
+ * bit is set in tarfs_debug.
+ */
+#define        TARFS_DPF(category, fmt, ...)                                   \
+       do {                                                            \
+               if ((tarfs_debug & TARFS_DEBUG_##category) != 0)        \
+                       printf(fmt, ## __VA_ARGS__);                    \
+       } while (0)
+/*
+ * TARFS_DPF_IFF(category, cond, fmt, ...) does the same, but only if
+ * cond is also true.  cond is evaluated before the category test.
+ */
+#define        TARFS_DPF_IFF(category, cond, fmt, ...)                         \
+       do {                                                            \
+               if ((cond)                                              \
+                   && (tarfs_debug & TARFS_DEBUG_##category) != 0)     \
+                       printf(fmt, ## __VA_ARGS__);                    \
+       } while (0)
+#else
+/* Without TARFS_DEBUG both macros expand to nothing; no argument is evaluated. */
+#define        TARFS_DPF(category, fmt, ...)
+#define        TARFS_DPF_IFF(category, cond, fmt, ...)
+#endif
+
+#endif /* _FS_TARFS_TARFS_DBG_H_ */
diff --git a/sys/fs/tarfs/tarfs_io.c b/sys/fs/tarfs/tarfs_io.c
new file mode 100644
index 000000000000..b957ac11ff51
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_io.c
@@ -0,0 +1,727 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_tarfs.h"
+#include "opt_zstdio.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+#ifdef ZSTDIO
+#define ZSTD_STATIC_LINKING_ONLY
+#include <contrib/zstd/lib/zstd.h>
+#endif
+
+#include <fs/tarfs/tarfs.h>
+#include <fs/tarfs/tarfs_dbg.h>
+
+#ifdef TARFS_DEBUG
+SYSCTL_NODE(_vfs_tarfs, OID_AUTO, zio, CTLFLAG_RD, 0,
+    "Tar filesystem decompression layer");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_inflated);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, inflated, CTLFLAG_RD,
+    &tarfs_zio_inflated, "Amount of compressed data inflated.");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_consumed);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, consumed, CTLFLAG_RD,
+    &tarfs_zio_consumed, "Amount of compressed data consumed.");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_bounced);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, bounced, CTLFLAG_RD,
+    &tarfs_zio_bounced, "Amount of decompressed data bounced.");
+
+/*
+ * Handler for the "reset" sysctl under the zio node.  A read always
+ * reports 0.  A write, whatever value is supplied, zeroes the inflated,
+ * consumed and bounced counters.
+ */
+static int
+tarfs_sysctl_handle_zio_reset(SYSCTL_HANDLER_ARGS)
+{
+       unsigned int value = 0;
+       int rc;
+
+       rc = SYSCTL_OUT(req, &value, sizeof(value));
+       if (rc != 0)
+               return (rc);
+       /* no new value supplied: this was a plain read */
+       if (req->newptr == NULL)
+               return (0);
+       /* consume the new value; its contents are not inspected */
+       rc = SYSCTL_IN(req, &value, sizeof(value));
+       if (rc != 0)
+               return (rc);
+       counter_u64_zero(tarfs_zio_inflated);
+       counter_u64_zero(tarfs_zio_consumed);
+       counter_u64_zero(tarfs_zio_bounced);
+       return (0);
+}
+
+SYSCTL_PROC(_vfs_tarfs_zio, OID_AUTO, reset,
+    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW,
+    NULL, 0, tarfs_sysctl_handle_zio_reset, "IU",
+    "Reset compression counters.");
+#endif
+
+MALLOC_DEFINE(M_TARFSZSTATE, "tarfs zstate", "tarfs decompression state");
+MALLOC_DEFINE(M_TARFSZBUF, "tarfs zbuf", "tarfs decompression buffers");
+
+#define XZ_MAGIC               (uint8_t[]){ 0xfd, 0x37, 0x7a, 0x58, 0x5a }
+#define ZLIB_MAGIC             (uint8_t[]){ 0x1f, 0x8b, 0x08 }
+#define ZSTD_MAGIC             (uint8_t[]){ 0x28, 0xb5, 0x2f, 0xfd }
+
+#ifdef ZSTDIO
+struct tarfs_zstd {
+       ZSTD_DStream *zds;
+};
+#endif
+
+/* XXX review use of curthread / uio_td / td_cred */
+
+/*
+ * Services the read request described by uiop from the tar file.  If
+ * raw is true, or if the mount has no decompression vnode, the request
+ * is issued against the tarball vnode while holding a read range lock
+ * and a shared vnode lock.  Otherwise it is issued against the
+ * decompression vnode while holding an exclusive vnode lock, so the
+ * caller sees the decompressed stream.  Returns 0 on success and a
+ * positive errno value on failure.
+ */
+int
+tarfs_io_read(struct tarfs_mount *tmp, bool raw, struct uio *uiop)
+{
+       struct vnode *vp;
+       void *cookie;
+       off_t start = uiop->uio_offset;
+       size_t count = uiop->uio_resid;
+       int error;
+
+       if (!raw && tmp->znode != NULL) {
+               /* decompressed stream */
+               vp = tmp->znode;
+               error = vn_lock(vp, LK_EXCLUSIVE);
+               if (error == 0) {
+                       error = VOP_READ(vp, uiop,
+                           IO_DIRECT | IO_NODELOCKED,
+                           uiop->uio_td->td_ucred);
+                       VOP_UNLOCK(vp);
+               }
+       } else {
+               /* original file */
+               vp = tmp->vp;
+               cookie = vn_rangelock_rlock(vp, start, start + count);
+               error = vn_lock(vp, LK_SHARED);
+               if (error == 0) {
+                       error = VOP_READ(vp, uiop,
+                           IO_DIRECT | IO_NODELOCKED,
+                           uiop->uio_td->td_ucred);
+                       VOP_UNLOCK(vp);
+               }
+               /* the range lock is dropped whether or not vn_lock() worked */
+               vn_rangelock_unlock(vp, cookie);
+       }
+       TARFS_DPF(IO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__,
+           (size_t)start, count, error, uiop->uio_resid);
+       return (error);
+}
+
+/*
+ * Convenience wrapper around tarfs_io_read() for kernel-space buffers:
+ * reads up to len bytes starting at offset off into buf on behalf of
+ * the current thread.  The raw flag is passed through unchanged.
+ * Returns the number of bytes read on success, 0 on EOF (or if len is
+ * 0), and a negative errno value on failure.
+ */
+ssize_t
+tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw,
+    void *buf, off_t off, size_t len)
+{
+       struct iovec aiov = {
+               .iov_base = buf,
+               .iov_len = len,
+       };
+       struct uio auio = {
+               .uio_iov = &aiov,
+               .uio_iovcnt = 1,
+               .uio_offset = off,
+               .uio_segflg = UIO_SYSSPACE,
+               .uio_rw = UIO_READ,
+               .uio_resid = len,
+               .uio_td = curthread,
+       };
+       ssize_t nread;
+       int error;
+
+       if (len == 0) {
+               TARFS_DPF(IO, "%s(%zu, %zu) null\n", __func__,
+                   (size_t)off, len);
+               return (0);
+       }
+       error = tarfs_io_read(tmp, raw, &auio);
+       if (error != 0) {
+               TARFS_DPF(IO, "%s(%zu, %zu) error %d\n", __func__,
+                   (size_t)off, len, error);
+               return (-error);
+       }
+       /* whatever is left in uio_resid was not transferred */
+       nread = len - auio.uio_resid;
+       if (nread != 0) {
+               /* log the length and up to the first eight bytes read */
+               TARFS_DPF(IO, "%s(%zu, %zu) read %zd | %*D\n", __func__,
+                   (size_t)off, len, nread,
+                   (int)(nread > 8 ? 8 : nread), (uint8_t *)buf, " ");
+       } else {
+               /* len is known to be nonzero here, so this is EOF */
+               TARFS_DPF(IO, "%s(%zu, %zu) eof\n", __func__,
+                   (size_t)off, len);
+       }
+       return (nread);
+}
+
+#ifdef ZSTDIO
+/*
+ * Allocation callback referenced by tarfs_zstd_mem.  Allocates size
+ * bytes of type M_TARFSZSTATE with M_WAITOK; the opaque argument is
+ * ignored.
+ */
+static void *
+tarfs_zstate_alloc(void *opaque, size_t size)
+{
+       void *mem;
+
+       (void)opaque;
+       mem = malloc(size, M_TARFSZSTATE, M_WAITOK);
+       return (mem);
+}
+#endif
+
+#ifdef ZSTDIO
+/*
+ * Deallocation callback referenced by tarfs_zstd_mem.  Releases memory
+ * of type M_TARFSZSTATE; the opaque argument is ignored.
+ */
+static void
+tarfs_zstate_free(void *opaque, void *address)
+{
+
+       free(address, M_TARFSZSTATE);
+       (void)opaque;
+}
+#endif
+
+#ifdef ZSTDIO
+/*
+ * Custom memory allocator descriptor for zstd, pairing the tarfs
+ * allocation and deallocation callbacks with a NULL opaque pointer.
+ * NOTE(review): presumably handed to zstd when the decompression
+ * stream is created; the call site is not visible here.
+ */
+static ZSTD_customMem tarfs_zstd_mem = {
+       .customAlloc = tarfs_zstate_alloc,
+       .customFree = tarfs_zstate_free,
+       .opaque = NULL,
+};
+#endif
+
+/*
+ * Advances the decompression frame index by one position.  If that
+ * moves past the entries recorded so far, a new entry holding the
+ * given input offset i and output offset o is appended, doubling the
+ * capacity of the index first when it is full.  In either case the
+ * entry at the new position is asserted to match i and o.
+ */
+static void
+tarfs_zio_update_index(struct tarfs_zio *zio, off_t i, off_t o)
+{
+
+       zio->curidx++;
+       if (zio->curidx >= zio->nidx) {
+               /* no entry recorded for this position yet */
+               zio->nidx++;
+               if (zio->nidx > zio->szidx) {
+                       /* index is full: double its capacity */
+                       zio->szidx *= 2;
+                       zio->idx = realloc(zio->idx,
+                           zio->szidx * sizeof(*zio->idx),
+                           M_TARFSZSTATE, M_ZERO | M_WAITOK);
+                       TARFS_DPF(ALLOC, "%s: resized zio index\n", __func__);
+               }
+               zio->idx[zio->curidx].i = i;
+               zio->idx[zio->curidx].o = o;
+               TARFS_DPF(ZIDX, "%s: index %u = i %zu o %zu\n", __func__,
+                   zio->curidx, (size_t)zio->idx[zio->curidx].i,
+                   (size_t)zio->idx[zio->curidx].o);
+       }
+       /* an existing entry must agree with what the caller observed */
+       MPASS(zio->idx[zio->curidx].i == i);
+       MPASS(zio->idx[zio->curidx].o == o);
+}
+
+/*
+ * VOP_ACCESS for zio node.  Only a request for exactly VREAD is
+ * considered: it is forwarded to the tarball vnode, which is locked
+ * shared for the duration of the check.  Any other access mode fails
+ * with EPERM.  Returns 0 if access is granted and a positive errno
+ * value otherwise.
+ */
+static int
+tarfs_zaccess(struct vop_access_args *ap)
+{
+       struct vnode *vp = ap->a_vp;
+       struct tarfs_zio *zio = vp->v_data;
+       struct tarfs_mount *tmp = zio->tmp;
+       accmode_t accmode = ap->a_accmode;
+       int error = EPERM;
+
+       if (accmode == VREAD) {
+               error = vn_lock(tmp->vp, LK_SHARED);
+               if (error == 0) {
+                       /* defer the decision to the underlying tarball */
+                       error = VOP_ACCESS(tmp->vp, accmode, ap->a_cred,
+                           ap->a_td);
+                       VOP_UNLOCK(tmp->vp);
+               }
+       }
+       TARFS_DPF(ZIO, "%s(%d) = %d\n", __func__, accmode, error);
+       return (error);
+}
+
+/*
+ * VOP_GETATTR for zio node.  Fetches the attributes of the tarball
+ * vnode (locked shared for the duration of the query) and uses them to
+ * fill in the caller's vattr: mode, ownership, byte count and access,
+ * change and modification times are copied from the tarball, while the
+ * node is reported as a regular file with one link, file ID
+ * TARFS_ZIOINO, the root node's birth time, and a size equal to the
+ * output offset of the last index entry.  Returns 0 on success and a
+ * positive errno value on failure.
+ */
+static int
+tarfs_zgetattr(struct vop_getattr_args *ap)
+{
+       struct vattr tarva;
+       struct vnode *zvp = ap->a_vp;
+       struct tarfs_zio *zio = zvp->v_data;
+       struct tarfs_mount *tmp = zio->tmp;
+       struct vattr *vap = ap->a_vap;
+       int error;
+
+       VATTR_NULL(vap);
+       error = vn_lock(tmp->vp, LK_SHARED);
+       if (error != 0)
+               goto out;
+       error = VOP_GETATTR(tmp->vp, &tarva, ap->a_cred);
+       VOP_UNLOCK(tmp->vp);
+       if (error != 0)
+               goto out;
+
+       /* identity */
+       vap->va_type = VREG;
+       vap->va_nlink = 1;
+       vap->va_fsid = zvp->v_mount->mnt_stat.f_fsid.val[0];
+       vap->va_fileid = TARFS_ZIOINO;
+
+       /* permissions and ownership follow the tarball */
+       vap->va_mode = tarva.va_mode;
+       vap->va_uid = tarva.va_uid;
+       vap->va_gid = tarva.va_gid;
+
+       /* sizes */
+       vap->va_size = zio->idx[zio->nidx - 1].o;
+       vap->va_blocksize = zvp->v_mount->mnt_stat.f_iosize;
+       vap->va_bytes = tarva.va_bytes;
+
+       /* timestamps */
+       vap->va_atime = tarva.va_atime;
+       vap->va_ctime = tarva.va_ctime;
+       vap->va_mtime = tarva.va_mtime;
+       vap->va_birthtime = tmp->root->birthtime;
+out:
+       TARFS_DPF(ZIO, "%s() = %d\n", __func__, error);
+       return (error);
+}
+
+#ifdef ZSTDIO
+/*
+ * VOP_READ for zio node, zstd edition.
+ */
+static int
+tarfs_zread_zstd(struct tarfs_zio *zio, struct uio *uiop)
+{
+       void *ibuf = NULL, *obuf = NULL, *rl = NULL;
+       struct uio auio;
+       struct iovec aiov;
+       struct tarfs_mount *tmp = zio->tmp;
+       struct tarfs_zstd *zstd = zio->zstd;
+       struct thread *td = curthread;
+       ZSTD_inBuffer zib;
+       ZSTD_outBuffer zob;
+       off_t zsize;
+       off_t ipos, opos;
+       size_t ilen, olen;
+       size_t zerror;
+       off_t off = uiop->uio_offset;
+       size_t len = uiop->uio_resid;
+       size_t resid = uiop->uio_resid;
+       size_t bsize;
+       int error;
+       bool reset = false;
+
+       /* do we have to rewind? */
+       if (off < zio->opos) {
+               while (zio->curidx > 0 && off < zio->idx[zio->curidx].o)
+                       zio->curidx--;
+               reset = true;
+       }
+       /* advance to the nearest index entry */
+       if (off > zio->opos) {
+               // XXX maybe do a binary search instead
+               while (zio->curidx < zio->nidx - 1 &&
+                   off >= zio->idx[zio->curidx + 1].o) {
+                       zio->curidx++;
+                       reset = true;
+               }
+       }
+       /* reset the decompression stream if needed */
+       if (reset) {
+               zio->ipos = zio->idx[zio->curidx].i;
+               zio->opos = zio->idx[zio->curidx].o;
+               ZSTD_resetDStream(zstd->zds);
+               TARFS_DPF(ZIDX, "%s: skipping to index %u = i %zu o %zu\n", __func__,
+                   zio->curidx, (size_t)zio->ipos, (size_t)zio->opos);
+       } else {
+               TARFS_DPF(ZIDX, "%s: continuing at i %zu o %zu\n", __func__,
+                   (size_t)zio->ipos, (size_t)zio->opos);
+       }
+
+       /*
+        * Set up a temporary buffer for compressed data.  Use the size
+        * recommended by the zstd library; this is usually 128 kB, but
+        * just in case, make sure it's a multiple of the page size and no
+        * larger than MAXBSIZE.
+        */
+       bsize = roundup(ZSTD_CStreamOutSize(), PAGE_SIZE);
+       if (bsize > MAXBSIZE)
+               bsize = MAXBSIZE;
+       ibuf = malloc(bsize, M_TEMP, M_WAITOK);
+       zib.src = NULL;
+       zib.size = 0;
+       zib.pos = 0;
+
+       /*
+        * Set up the decompression buffer.  If the target is not in
+        * kernel space, we will have to set up a bounce buffer.
+        *
+        * TODO: to avoid using a bounce buffer, map destination pages
+        * using vm_fault_quick_hold_pages().
+        */
+       MPASS(zio->opos <= off);
+       MPASS(uiop->uio_iovcnt == 1);
+       MPASS(uiop->uio_iov->iov_len >= len);
+       if (uiop->uio_segflg == UIO_SYSSPACE) {
+               zob.dst = uiop->uio_iov->iov_base;
+       } else {
+               TARFS_DPF(ALLOC, "%s: allocating %zu-byte bounce buffer\n",
+                   __func__, len);
+               zob.dst = obuf = malloc(len, M_TEMP, M_WAITOK);
+       }
+       zob.size = len;
+       zob.pos = 0;
+
+       /* lock tarball */
+       rl = vn_rangelock_rlock(tmp->vp, zio->ipos, OFF_MAX);
+       error = vn_lock(tmp->vp, LK_SHARED);
+       if (error != 0) {
+               goto fail_unlocked;
+       }
+       /* check size */
+       error = vn_getsize_locked(tmp->vp, &zsize, td->td_ucred);
+       if (error != 0) {
+               goto fail;
+       }
+       if (zio->ipos >= zsize) {
+               /* beyond EOF */
+               goto fail;
+       }
+
+       while (resid > 0) {
+               if (zib.pos == zib.size) {
+                       /* request data from the underlying file */
+                       aiov.iov_base = ibuf;
+                       aiov.iov_len = bsize;
+                       auio.uio_iov = &aiov;
+                       auio.uio_iovcnt = 1;
+                       auio.uio_offset = zio->ipos;
+                       auio.uio_segflg = UIO_SYSSPACE;
+                       auio.uio_rw = UIO_READ;
+                       auio.uio_resid = aiov.iov_len;
+                       auio.uio_td = td;
+                       error = VOP_READ(tmp->vp, &auio,
+                           IO_DIRECT | IO_NODELOCKED,
+                           td->td_ucred);
+                       if (error != 0)
+                               goto fail;
+                       TARFS_DPF(ZIO, "%s: req %zu+%zu got %zu+%zu\n", __func__,
+                           (size_t)zio->ipos, bsize,
+                           (size_t)zio->ipos, bsize - auio.uio_resid);
+                       zib.src = ibuf;
+                       zib.size = bsize - auio.uio_resid;
+                       zib.pos = 0;
+               }
+               MPASS(zib.pos <= zib.size);
+               if (zib.pos == zib.size) {
+                       TARFS_DPF(ZIO, "%s: end of file after i %zu o %zu\n", __func__,
+                           (size_t)zio->ipos, (size_t)zio->opos);
+                       goto fail;
+               }
+               if (zio->opos < off) {
+                       /* to be discarded */
+                       zob.size = min(off - zio->opos, len);
+                       zob.pos = 0;
*** 3111 LINES SKIPPED ***

Reply via email to