EROFS has supported "native sub-filesystem merging" as a single block
device since kernel commit 8b465fecc35a ("erofs: support flattened block
device for multi-blob images").  It allows sub-filesystems (e.g., EROFS
container layers) to be merged into one filesystem and mounted in
a single shot, which is particularly useful for layered container images
in VM-based secure containers where file-backed mounts are unusable.

Additionally, it can be used to pass through external data (such as
a tar file) without attaching an extra block device to the guest.

Intuitively, there are two approaches to generate a single merged block
device for virtualization scenarios:
 - Concatenate fsmeta + sub-image files, for example, to merge
   `[tar index][tar data]`:

    $ mkfs.erofs --tar=i foo.erofs foo.tar
    $ cat foo.tar >> foo.erofs

   This approach is inefficient unless the host filesystem supports
   reflinks, and a major player, EXT4, does not.

 - Use the virtual block device approach on the host (e.g., NBD, TCMU,
   UBLK) or vhost-user-blk to generate merged devices in the guest.
   However, this requires an additional daemon to stay active, which can
   be inconvenient.

Furthermore, I wondered whether any virtual disk format supports this
functionality.  After doing some research on popular formats, I found
that only VMDK [1] and VHD [2] natively support merging split files.
QEMU appears to have supported VMDK split files [3] since very early
versions.

Add a `--vmdk-desc` option to generate valid `twoGbMaxExtentFlat` VMDK
descriptor files, which can be attached with the following QEMU option:
  -drive file=foo.vmdk,format=vmdk,if=virtio

Hopefully, Cloud Hypervisor and other microVMs could support VMDK or
concatenating raw files as a single block device in the future.

[1] https://www.vmware.com/app/vmdk/?src=vmdk
[2] See `Splitting Hard Disk Images` in the VHD Format Specification
    https://www.microsoft.com/en-us/download/details.aspx?id=23850
[3] https://github.com/qemu/qemu/blob/master/block/vmdk.c

Signed-off-by: Gao Xiang <hsiang...@linux.alibaba.com>
---
 include/erofs/internal.h |  3 ++
 lib/Makefile.am          |  3 +-
 lib/vmdk.c               | 74 ++++++++++++++++++++++++++++++++++++++++
 man/mkfs.erofs.1         |  4 +++
 mkfs/main.c              | 15 ++++++++
 5 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 lib/vmdk.c

diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 8916be1..d3debc6 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -511,6 +511,9 @@ static inline int erofs_blk_read(struct erofs_sb_info *sbi, int device_id,
                              erofs_pos(sbi, nblocks));
 }
 
+/* vmdk.c */
+int erofs_dump_vmdk_desc(FILE *f, struct erofs_sb_info *sbi);
+
 #ifdef EUCLEAN
 #define EFSCORRUPTED   EUCLEAN         /* Filesystem is corrupted */
 #else
diff --git a/lib/Makefile.am b/lib/Makefile.am
index bdc74ad..688403b 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -35,7 +35,8 @@ liberofs_la_SOURCES = config.c io.c cache.c super.c inode.c xattr.c exclude.c \
                      namei.c data.c compress.c compressor.c zmap.c decompress.c \
                      compress_hints.c hashmap.c sha256.c blobchunk.c dir.c \
                      fragments.c dedupe.c uuid_unparse.c uuid.c tar.c \
-                     block_list.c rebuild.c diskbuf.c bitops.c dedupe_ext.c
+                     block_list.c rebuild.c diskbuf.c bitops.c dedupe_ext.c \
+                     vmdk.c
 
 liberofs_la_CFLAGS = -Wall ${libuuid_CFLAGS} -I$(top_srcdir)/include
 if ENABLE_LZ4
diff --git a/lib/vmdk.c b/lib/vmdk.c
new file mode 100644
index 0000000..06d4a49
--- /dev/null
+++ b/lib/vmdk.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0
+#include "erofs/internal.h"
+
+static int erofs_vmdk_desc_add_extent(FILE *f, u64 sectors,
+                                     const char *filename, u64 offset)
+{
+       static const char extent_line_fmt[] =
+               "RW %" PRIu64 " FLAT \"%s\" %" PRIu64 "\n";
+
+       while (sectors) {
+               u64 count = min_t(u64, sectors, 0x80000000 >> 9);
+               int ret;
+
+               ret = fprintf(f, extent_line_fmt, count, filename, offset);
+               if (ret < 0)
+                       return -errno;
+               offset += count;
+               sectors -= count;
+       }
+       return 0;
+}
+
+int erofs_dump_vmdk_desc(FILE *f, struct erofs_sb_info *sbi)
+{
+       static const char desc_template_1[] =
+               "# Disk DescriptorFile\n"
+               "version=1\n"
+               "CID=%" PRIx32 "\n"
+               "parentCID=%" PRIx32 "\n"
+               "createType=\"%s\"\n"
+               "\n"
+               "# Extent description\n";
+       static const char desc_template_2[] =
+               "\n"
+               "# The Disk Data Base\n"
+               "#DDB\n"
+               "\n"
+               "ddb.virtualHWVersion = \"%s\"\n"
+               "ddb.geometry.cylinders = \"%" PRIu64 "\"\n"
+               "ddb.geometry.heads = \"%" PRIu32 "\"\n"
+               "ddb.geometry.sectors = \"63\"\n"
+               "ddb.adapterType = \"%s\"\n";
+       static const char subformat[] = "twoGbMaxExtentFlat";
+       static const char adapter_type[] = "ide";
+       u32 cid = ((u32 *)sbi->uuid)[0] ^ ((u32 *)sbi->uuid)[1] ^
+               ((u32 *)sbi->uuid)[2] ^ ((u32 *)sbi->uuid)[3];
+       u32 parent_cid = 0xffffffff;
+       u32 number_heads = 16;
+       char *hw_version = "4";
+       u64 total_sectors, sectors;
+       int ret, i;
+
+       fprintf(f, desc_template_1, cid, parent_cid, subformat);
+       sectors = sbi->primarydevice_blocks << (sbi->blkszbits - 9);
+       ret = erofs_vmdk_desc_add_extent(f, sectors, (char *)sbi->devname, 0);
+       if (ret)
+               return ret;
+       total_sectors = sectors;
+       for (i = 0; i < sbi->extra_devices; ++i) {
+               const char *name = sbi->devs[i].src_path ?:
+                               (const char *)sbi->devs[i].tag;
+
+               sectors = sbi->devs[i].blocks << (sbi->blkszbits - 9);
+               ret = erofs_vmdk_desc_add_extent(f, sectors, name, 0);
+               if (ret)
+                       return ret;
+               total_sectors += sectors;
+       }
+
+       fprintf(f, desc_template_2, hw_version,
+               (u64)DIV_ROUND_UP(total_sectors, 63ULL * number_heads),
+               number_heads, adapter_type);
+       return 0;
+}
diff --git a/man/mkfs.erofs.1 b/man/mkfs.erofs.1
index 48202b6..63f7a2f 100644
--- a/man/mkfs.erofs.1
+++ b/man/mkfs.erofs.1
@@ -270,6 +270,10 @@ together.
 Filter tarball streams through xz, lzma, or lzip. Optionally, raw streams can
 be dumped together.
 .TP
+.BI "\-\-vmdk-desc=" FILE
+Generate a VMDK descriptor file to merge sub-filesystems, which can be used
+for tar index or rebuild mode.
+.TP
 .BI "\-\-xattr-prefix=" PREFIX
 Specify a customized extended attribute namespace prefix for space saving,
 e.g. "trusted.overlay.".  You may give multiple
diff --git a/mkfs/main.c b/mkfs/main.c
index ef83f2e..14ea6ff 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -90,6 +90,7 @@ static struct option long_options[] = {
        {"async-queue-limit", required_argument, NULL, 530},
 #endif
        {"fsalignblks", required_argument, NULL, 531},
+       {"vmdk-desc", required_argument, NULL, 532},
        {0, 0, 0, 0},
 };
 
@@ -210,6 +211,7 @@ static void usage(int argc, char **argv)
                " --unxz[=X]            try to filter the tarball stream 
through xz/lzma/lzip\n"
                "                       (and optionally dump the raw stream to 
X together)\n"
 #endif
+               " --vmdk-desc=X         generate a VMDK descriptor file to merge sub-filesystems\n"
 #ifdef EROFS_MT_ENABLED
                " --workers=#           set the number of worker threads to # 
(default: %u)\n"
 #endif
@@ -254,6 +256,7 @@ static bool valid_fixeduuid;
 static unsigned int dsunit;
 static unsigned int fsalignblks = 1;
 static int tarerofs_decoder;
+static FILE *vmdk_dcf;
 
 static int erofs_mkfs_feat_set_legacy_compress(bool en, const char *val,
                                               unsigned int vallen)
@@ -988,6 +991,13 @@ static int mkfs_parse_options_cfg(int argc, char *argv[])
                                return -EINVAL;
                        }
                        break;
+               case 532:
+                       vmdk_dcf = fopen(optarg, "wb");
+                       if (!vmdk_dcf) {
+                               erofs_err("failed to open vmdk desc `%s`", optarg);
+                               return -EINVAL;
+                       }
+                       break;
                case 'V':
                        version();
                        exit(0);
@@ -1548,6 +1558,11 @@ int main(int argc, char **argv)
                if (!err)
                        erofs_info("superblock checksum 0x%08x written", crc);
        }
+
+       if (!err && vmdk_dcf) {
+               err = erofs_dump_vmdk_desc(vmdk_dcf, &g_sbi);
+               fclose(vmdk_dcf);
+       }
 exit:
        if (root)
                erofs_iput(root);
-- 
2.43.5


Reply via email to