A long time ago the posix folks extended the ustar format to allow
representing arbitrarily big files, long file names, precise timestamps,
etc.  We have support to read such archives but no support to write them
out.  Here's a minimal proposal following discussions with Caspar and
other folks.

What the diff below does:

- provide a 'pax' format usable with pax(1) -x pax

- use extended headers to store file names and link names that don't fit
  in a standard ustar header.  Long file names in the ports tree were
  the initial motive for this effort.

What the diff below doesn't do:

- handle all the file attributes that ought to benefit from extended
  headers.  Right now I haven't even written that code and I hope to
  extend handled attributes gradually.

- the diff *doesn't change the default format* used by either pax(1) or
  tar(1).  I think changing the default at some point makes sense, since
  the format is superior and support is widely available.  My opinion is
  that we shouldn't expose this format by default until we're confident
  that the code is reasonably complete and exercised.  I have no idea
  whether this ought to happen before the next release.

- since the diff doesn't change the default for tar(1) and since tar(1)
  doesn't have a generic option to choose the format used, this code is
  unreachable using tar(1).  I'd *love* to have a bikeshed discussion
  about the proper way to handle that, but preferably after the code gets
  reviewed, pushed in the tree and improved. :)

Two more notes:
- no size increase for distrib/special/pax
- pax 101: you can exercise the new format with:
    pax -x pax -w files... > outfile.tar

Thoughts?  ok?


Index: cpio.1
===================================================================
RCS file: /home/cvs/src/bin/pax/cpio.1,v
retrieving revision 1.36
diff -u -p -r1.36 cpio.1
--- cpio.1      16 Jan 2020 16:46:46 -0000      1.36
+++ cpio.1      4 Sep 2023 16:26:53 -0000
@@ -98,6 +98,8 @@ format.
 Old octal character
 .Nm
 format.
+.It Ar pax
+POSIX pax format.
 .It Ar sv4cpio
 SVR4 hex
 .Nm
@@ -173,6 +175,8 @@ format.
 Old octal character
 .Nm
 format.
+.It Ar pax
+POSIX pax format.
 .It Ar sv4cpio
 SVR4 hex
 .Nm
@@ -298,6 +302,8 @@ be used for larger files.
 .It bcpio Ta "4 Gigabytes"
 .It sv4cpio Ta "4 Gigabytes"
 .It cpio Ta "8 Gigabytes"
+.\" XXX should be "unlimited"
+.It pax Ta "8 Gigabytes"
 .It tar Ta "8 Gigabytes"
 .It ustar Ta "8 Gigabytes"
 .El
Index: extern.h
===================================================================
RCS file: /home/cvs/src/bin/pax/extern.h,v
retrieving revision 1.60
diff -u -p -r1.60 extern.h
--- extern.h    23 Mar 2020 20:04:19 -0000      1.60
+++ extern.h    4 Sep 2023 14:59:23 -0000
@@ -296,6 +296,7 @@ int tar_wr(ARCHD *);
 int ustar_id(char *, int);
 int ustar_rd(ARCHD *, char *);
 int ustar_wr(ARCHD *);
+int pax_wr(ARCHD *);
 
 /*
  * tty_subs.c
Index: options.c
===================================================================
RCS file: /home/cvs/src/bin/pax/options.c,v
retrieving revision 1.105
diff -u -p -r1.105 options.c
--- options.c   17 Jan 2023 16:20:28 -0000      1.105
+++ options.c   4 Sep 2023 16:44:55 -0000
@@ -214,6 +214,8 @@ FSUB fsub[] = {
        { },
 /* 9: gzip, to detect failure to use -z */
        { },
+/* 10: POSIX PAX */
+       { },
 #else
 /* 6: compress, to detect failure to use -Z */
        {NULL, 0, 4, 0, 0, 0, 0, compress_id},
@@ -223,6 +225,10 @@ FSUB fsub[] = {
        {NULL, 0, 4, 0, 0, 0, 0, bzip2_id},
 /* 9: gzip, to detect failure to use -z */
        {NULL, 0, 4, 0, 0, 0, 0, gzip_id},
+/* 10: POSIX PAX */
+       {"pax", 5120, BLKMULT, 0, 1, BLKMULT, 0, ustar_id, no_op,
+       ustar_rd, tar_endrd, no_op, pax_wr, tar_endwr, tar_trail,
+       tar_opt},
 #endif
 };
 #define        F_OCPIO 0       /* format when called as cpio -6 */
Index: pax.1
===================================================================
RCS file: /home/cvs/src/bin/pax/pax.1,v
retrieving revision 1.76
diff -u -p -r1.76 pax.1
--- pax.1       31 Mar 2022 17:27:14 -0000      1.76
+++ pax.1       5 Sep 2023 13:46:15 -0000
@@ -868,6 +868,11 @@ standard.
 The default blocksize for this format is 10240 bytes.
 Filenames stored by this format must be 100 characters or less in length;
 the total pathname must be 256 characters or less.
+.It Cm pax
+The pax interchange format specified in the
+.St -p1003.1-2001
+standard.
+The default blocksize for this format is 5120 bytes.
 .El
 .Pp
 .Nm
@@ -1081,9 +1086,10 @@ utility is compliant with the
 specification,
 except that the
 .Cm pax
-archive format and the
+archive format is only partially supported,
+and the
 .Cm listopt
-keyword are unsupported.
+keyword is unsupported.
 .Pp
 The flags
 .Op Fl 0BDEGjOPTUYZz ,
Index: tar.c
===================================================================
RCS file: /home/cvs/src/bin/pax/tar.c,v
retrieving revision 1.73
diff -u -p -r1.73 tar.c
--- tar.c       4 Sep 2023 17:05:34 -0000       1.73
+++ tar.c       4 Sep 2023 23:45:28 -0000
@@ -35,10 +35,12 @@
  */
 
 #include <sys/types.h>
+#include <sys/queue.h>
 #include <sys/stat.h>
 #include <ctype.h>
 #include <errno.h>
 #include <grp.h>
+#include <libgen.h>
 #include <limits.h>
 #include <pwd.h>
 #include <stdio.h>
@@ -50,6 +52,19 @@
 #include "extern.h"
 #include "tar.h"
 
+SLIST_HEAD(xheader, xheader_record);
+struct xheader_record {
+       SLIST_ENTRY(xheader_record)      entry;
+       size_t                           reclen;
+       char                            *record;
+};
+
+/* shortest possible extended record: "5 a=\n" */
+#define MINXHDRSZ      5
+
+/* longest record we'll accept */
+#define MAXXHDRSZ      BLKMULT
+
 /*
  * Routines for reading, writing and header identify of various versions of tar
  */
@@ -60,6 +75,9 @@ static char *name_split(char *, int);
 static int ul_oct(u_long, char *, int, int);
 static int ull_oct(unsigned long long, char *, int, int);
 static int rd_xheader(ARCHD *arcn, int, off_t);
+#ifndef SMALL
+static int wr_xheader(ARCHD *, struct xheader *);
+#endif
 
 static uid_t uid_nobody;
 static uid_t uid_warn;
@@ -891,24 +909,121 @@ reset:
        return(0);
 }
 
-/*
- * ustar_wr()
- *     write a ustar header for the file specified in the ARCHD to the archive
- *     Have to check for file types that cannot be stored and file names that
- *     are too long. Be careful of the term (last arg) to ul_oct, we only use
- *     '\0' for the termination character (this is different than picky tar)
- *     ASSUMED: space after header in header block is zero filled
- * Return:
- *     0 if file has data to be written after the header, 1 if file has NO
- *     data to write after the header, -1 if archive write failed
- */
+#ifndef SMALL
+static int
+xheader_add(struct xheader *xhdr, const char *keyword,
+    const char *value)
+{
+       struct xheader_record *rec;
+       int reclen, tmplen;
+       char *s;
+
+       tmplen = MINXHDRSZ;
+       do {
+               reclen = tmplen;
+               tmplen = snprintf(NULL, 0, "%d %s=%s\n", reclen, keyword,
+                   value);
+       } while (tmplen >= 0 && tmplen != reclen);
+       if (tmplen < 0)
+               return -1;
+
+       rec = calloc(1, sizeof(*rec));
+       if (rec == NULL)
+               return -1;
+       rec->reclen = reclen;
+       if (asprintf(&s, "%d %s=%s\n", reclen, keyword, value) < 0) {
+               free(rec);
+               return -1;
+       }
+       rec->record = s;
 
-int
-ustar_wr(ARCHD *arcn)
+       SLIST_INSERT_HEAD(xhdr, rec, entry);
+
+       return 0;
+}
+
+static void
+xheader_free(struct xheader *xhdr)
+{
+       struct xheader_record *rec;
+
+       while (!SLIST_EMPTY(xhdr)) {
+               rec = SLIST_FIRST(xhdr);
+               SLIST_REMOVE_HEAD(xhdr, entry);
+               free(rec->record);
+               free(rec);
+       }
+}
+
+static int
+wr_xheader(ARCHD *arcn, struct xheader *xhdr)
+{
+       char hdblk[sizeof(HD_USTAR)];
+       HD_USTAR *hd;
+       char buf[sizeof(hd->name) + 1];
+       struct xheader_record *rec;
+       size_t size;
+
+       size = 0;
+       SLIST_FOREACH(rec, xhdr, entry)
+               size += rec->reclen;
+
+       memset(hdblk, 0, sizeof(hdblk));
+       hd = (HD_USTAR *)hdblk;
+       hd->typeflag = XHDRTYPE;
+       strncpy(hd->magic, TMAGIC, TMAGLEN);
+       strncpy(hd->version, TVERSION, TVERSLEN);
+       if (ul_oct(size, hd->size, sizeof(hd->size), 3))
+               return -1;
+
+       /*
+        * Best effort attempt at providing a useful file name for
+        * implementations that don't support pax format. Don't bother
+        * with truncation if the resulting file name doesn't fit.
+        * XXX dirname/basename portability (check return value?)
+        */
+       (void)snprintf(buf, sizeof(buf), "%s/PaxHeaders.%ld/%s",
+           dirname(arcn->name), (long)getpid(), basename(arcn->name));
+       fieldcpy(hd->name, sizeof(hd->name), buf, sizeof(buf));
+
+       if (ul_oct(arcn->sb.st_mode, hd->mode, sizeof(hd->mode), 0) ||
+           ull_oct(arcn->sb.st_mtime < 0 ? 0 : arcn->sb.st_mtime, hd->mtime,
+               sizeof(hd->mtime), 1) ||
+           ul_oct(arcn->sb.st_uid, hd->uid, sizeof(hd->uid), 0) ||
+           ul_oct(arcn->sb.st_gid, hd->gid, sizeof(hd->gid), 0))
+               return -1;
+
+       if (ul_oct(tar_chksm(hdblk, sizeof(HD_USTAR)), hd->chksum,
+          sizeof(hd->chksum), 3))
+               return -1;
+
+       /* write out extended header */
+       if (wr_rdbuf(hdblk, sizeof(HD_USTAR)) < 0)
+               return -1;
+       if (wr_skip(BLKMULT - sizeof(HD_USTAR)) < 0)
+               return -1;
+
+       /* write out extended header records */
+       SLIST_FOREACH(rec, xhdr, entry)
+               if (wr_rdbuf(rec->record, rec->reclen) < 0)
+                       return -1;
+
+       if (wr_skip(TAR_PAD(size)) < 0)
+               return -1;
+
+       return 0;
+}
+#endif
+
+static int
+wr_ustar_or_pax(ARCHD *arcn, int ustar)
 {
        HD_USTAR *hd;
        const char *name;
        char *pt, hdblk[sizeof(HD_USTAR)];
+#ifndef SMALL
+       struct xheader xhdr = SLIST_HEAD_INITIALIZER(xhdr);
+#endif
 
        /*
         * check for those file system types ustar cannot store
@@ -929,8 +1044,19 @@ ustar_wr(ARCHD *arcn)
         */
        if (PAX_IS_LINK(arcn->type) &&
            ((size_t)arcn->ln_nlen > sizeof(hd->linkname))) {
-               paxwarn(1, "Link name too long for ustar %s", arcn->ln_name);
-               return(1);
+               if (ustar) {
+                       paxwarn(1, "Link name too long for ustar %s",
+                           arcn->ln_name);
+                       return(1);
+               }
+#ifndef SMALL
+               else if (xheader_add(&xhdr, "linkpath", arcn->ln_name) == -1) {
+                       paxwarn(1, "Link name too long for pax %s",
+                           arcn->ln_name);
+                       xheader_free(&xhdr);
+                       return(1);
+               }
+#endif
        }
 
        /*
@@ -938,8 +1064,21 @@ ustar_wr(ARCHD *arcn)
         * pt != arcn->name, the name has to be split
         */
        if ((pt = name_split(arcn->name, arcn->nlen)) == NULL) {
-               paxwarn(1, "File name too long for ustar %s", arcn->name);
-               return(1);
+               if (ustar) {
+                       paxwarn(1, "File name too long for ustar %s",
+                           arcn->name);
+                       return(1);
+               }
+#ifndef SMALL
+               else if (xheader_add(&xhdr, "path", arcn->name) == -1) {
+                       paxwarn(1, "File name too long for pax %s",
+                           arcn->name);
+                       xheader_free(&xhdr);
+                       return(1);
+               }
+               /* PAX format, we don't need to split the path */
+               pt = arcn->name;
+#endif
        }
 
        /*
@@ -1074,6 +1213,18 @@ ustar_wr(ARCHD *arcn)
                        strncpy(hd->gname, name, sizeof(hd->gname));
        }
 
+#ifndef SMALL
+       /* write out a pax extended header if needed */
+       if (!SLIST_EMPTY(&xhdr)) {
+               int ret;
+
+               ret = wr_xheader(arcn, &xhdr);
+               xheader_free(&xhdr);
+               if (ret == -1)
+                       return(-1);
+       }
+#endif
+
        /*
         * calculate and store the checksum write the header to the archive
         * return 0 tells the caller to now write the file data, 1 says no data
@@ -1091,6 +1242,9 @@ ustar_wr(ARCHD *arcn)
        return(1);
 
     out:
+#ifndef SMALL
+       xheader_free(&xhdr);
+#endif
        /*
         * header field is out of range
         */
@@ -1099,6 +1253,42 @@ ustar_wr(ARCHD *arcn)
 }
 
 /*
+ * ustar_wr()
+ *     Write out a ustar format archive.
+ *     Have to check for file types that cannot be stored and file names that
+ *     are too long. Be careful of the term (last arg) to ul_oct, we only use
+ *     '\0' for the termination character (this is different than picky tar).
+ *     ASSUMED: space after header in header block is zero filled
+ * Return:
+ *     0 if file has data to be written after the header, 1 if file has NO
+ *     data to write after the header, -1 if archive write failed
+ */
+int
+ustar_wr(ARCHD *arcn)
+{
+       return wr_ustar_or_pax(arcn, 1);
+}
+
+/*
+ * pax_wr()
+ *     Write out a pax format archive.
+ *     Have to check for file types that cannot be stored.  Be careful of the
+ *      term (last arg) to ul_oct, we only use '\0' for the termination
+ *      character (this is different than picky tar).
+ *     ASSUMED: space after header in header block is zero filled
+ * Return:
+ *     0 if file has data to be written after the header, 1 if file has NO
+ *     data to write after the header, -1 if archive write failed
+ */
+#ifndef SMALL
+int
+pax_wr(ARCHD *arcn)
+{
+       return wr_ustar_or_pax(arcn, 0);
+}
+#endif
+
+/*
  * name_split()
  *     see if the name has to be split for storage in a ustar header. We try
  *     to fit the entire name in the name field without splitting if we can.
@@ -1183,12 +1373,6 @@ expandname(char *buf, size_t len, char *
                nlen = fieldcpy(buf, len, name, limit);
        return(nlen);
 }
-
-/* shortest possible extended record: "5 a=\n" */
-#define MINXHDRSZ      5
-
-/* longest record we'll accept */
-#define MAXXHDRSZ      BLKMULT
 
 static int
 rd_time(struct timespec *ts, const char *keyword, char *p)


-- 
jca | PGP : 0x1524E7EE / 5135 92C1 AD36 5293 2BDF  DDCC 0DFA 74AE 1524 E7EE

Reply via email to