On Wed, Aug 6, 2014 at 4:57 PM, Milos Vyletel <milos.vyle...@gmail.com> wrote: > VMDK's streamOptimized format is different from regular sparse format. > L1(GD) and L2(GT) tables are not predefined but rather generated and > written during image creation mainly because there is no way to tell > how much space data will occupy once they are compressed. Also the > location of header, L1 and L2 tables differ. > > - L2 tables (grain tables) are written after all grains they point to > - L1 tables are written after all grains and L2 tables > - footer at the end is used instead of header in first sector > > Images generated by qemu-img could not be imported (as part of OVA archive) > to either VMWare or OVM because of errors. > > - VMWare during OVA import: > Not a supported disk format (sparse VMDK too old) > > - OVM's vbox-img during conversion: > vbox-img: error: Error while copying the image: VERR_EOF > > This patch fixes streamOptimized support in qemu which was not fully > compatible with VMDK specifications as defined in the latest available version > at https://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf. > > Qemu generated images are identical to the ones generated by VMWare and > OVM (vbox-img) with the exception of DescriptorFile but that is expected > (CID and some additional DDB entries differ). They were also successfully > imported to VMWare vCloud, ESXi and Oracle OVM. 
> > Signed-off-by: Milos Vyletel <milos.vyle...@gmail.com> > --- > v2 changes: > - updated commit message description with errors received > - style/grammar fixes (clean checkpatch pass) > - removed l2_table pointer from VmdkExtent struct > - fixed memory leak in vmdk_write_footer() > > v3 changes: > - removed footer from VmdkExtent structure > - split added vmdk_write_grain_directory function to separate GD and footer > writes > - fix possible problems with opening of images created by older implementation > block/vmdk.c | 355 +++++++++++++++++++++++++++++++++++++++++++++------------ > 1 files changed, 280 insertions(+), 75 deletions(-) > > diff --git a/block/vmdk.c b/block/vmdk.c > index 0517bba..3ea1c31 100644 > --- a/block/vmdk.c > +++ b/block/vmdk.c > @@ -81,6 +81,21 @@ typedef struct { > uint16_t compressAlgorithm; > } QEMU_PACKED VMDK4Header; > > +typedef struct { > + uint64_t val; > + uint32_t size; > + uint32_t type; > + uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)]; > +} QEMU_PACKED VMDK4MetaMarker; > + > +typedef struct { > + VMDK4MetaMarker footer_marker; > + uint32_t magic; > + VMDK4Header header; > + uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)]; > + VMDK4MetaMarker eos_marker; > +} QEMU_PACKED VMDK4Footer; > + > #define L2_CACHE_SIZE 16 > > typedef struct VmdkExtent { > @@ -89,12 +104,14 @@ typedef struct VmdkExtent { > bool compressed; > bool has_marker; > bool has_zero_grain; > + bool has_footer; > int version; > int64_t sectors; > int64_t end_sector; > int64_t flat_start_offset; > int64_t l1_table_offset; > int64_t l1_backup_table_offset; > + uint32_t l1_index; > uint32_t *l1_table; > uint32_t *l1_backup_table; > unsigned int l1_size; > @@ -125,7 +142,6 @@ typedef struct BDRVVmdkState { > > typedef struct VmdkMetaData { > uint32_t offset; > - unsigned int l1_index; > unsigned int l2_index; > unsigned int l2_offset; > int valid; > @@ -555,14 +571,50 @@ static char *vmdk_read_desc(BlockDriverState 
*file, > uint64_t desc_offset, > return buf; > } > > +static int vmdk_read_footer(BlockDriverState *bs, > + VMDK4Footer *footer) > +{ > + int ret; > + > + /* > + * footer starts 3 sectors from end > + * - footer marker > + * - footer > + * - end-of-stream marker > + */ > + ret = bdrv_pread(bs->file, > + (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE, > + footer, sizeof(*footer)); > + if (ret < 0) { > + goto out; > + } > + > + /* Some sanity checks for the footer */ > + if (be32_to_cpu(footer->magic) != VMDK4_MAGIC || > + le32_to_cpu(footer->footer_marker.size) != 0 || > + le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER || > + le64_to_cpu(footer->eos_marker.val) != 0 || > + le32_to_cpu(footer->eos_marker.size) != 0 || > + le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) { > + ret = -EINVAL; > + goto out; > + } > + > + ret = VMDK_OK; > + out: > + return ret; > +} > + > static int vmdk_open_vmdk4(BlockDriverState *bs, > BlockDriverState *file, > int flags, Error **errp) > { > int ret; > + bool has_footer = false; > uint32_t magic; > uint32_t l1_size, l1_entry_sectors; > VMDK4Header header; > + VMDK4Footer footer; > VmdkExtent *extent; > BDRVVmdkState *s = bs->opaque; > int64_t l1_backup_offset = 0; > @@ -593,48 +645,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, > > if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { > /* > - * The footer takes precedence over the header, so read it in. The > - * footer starts at offset -1024 from the end: One sector for the > - * footer, and another one for the end-of-stream marker. > + * The footer takes precedence over the header, so read it in. 
> */ > - struct { > - struct { > - uint64_t val; > - uint32_t size; > - uint32_t type; > - uint8_t pad[512 - 16]; > - } QEMU_PACKED footer_marker; > - > - uint32_t magic; > - VMDK4Header header; > - uint8_t pad[512 - 4 - sizeof(VMDK4Header)]; > - > - struct { > - uint64_t val; > - uint32_t size; > - uint32_t type; > - uint8_t pad[512 - 16]; > - } QEMU_PACKED eos_marker; > - } QEMU_PACKED footer; > - > - ret = bdrv_pread(file, > - bs->file->total_sectors * 512 - 1536, > - &footer, sizeof(footer)); > + ret = vmdk_read_footer(bs, &footer); > if (ret < 0) { > return ret; > } > - > - /* Some sanity checks for the footer */ > - if (be32_to_cpu(footer.magic) != VMDK4_MAGIC || > - le32_to_cpu(footer.footer_marker.size) != 0 || > - le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER || > - le64_to_cpu(footer.eos_marker.val) != 0 || > - le32_to_cpu(footer.eos_marker.size) != 0 || > - le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM) > - { > - return -EINVAL; > - } > - > + has_footer = true; > header = footer.header; > } > > @@ -645,11 +662,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, > error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, > bs->device_name, "vmdk", buf); > return -ENOTSUP; > - } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) { > + } else if (le32_to_cpu(header.version) == 3 && > + (flags & BDRV_O_RDWR) && > + !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) { > /* VMware KB 2064959 explains that version 3 added support for > * persistent changed block tracking (CBT), and backup software can > * read it as version=1 if it doesn't care about the changed area > - * information. So we are safe to enable read only. */ > + * information. So we are safe to enable read only. 
> + * Note that this does not apply to streamOptimized images which > + * are written only once and are used as transport format */ > error_setg(errp, "VMDK version 3 must be read only"); > return -EINVAL; > } > @@ -689,11 +710,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, > if (ret < 0) { > return ret; > } > + if (has_footer) { > + extent->has_footer = has_footer; > + } > + > extent->compressed = > le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; > if (extent->compressed) { > g_free(s->create_type); > s->create_type = g_strdup("streamOptimized"); > + > + if (extent->has_footer && (flags & BDRV_O_RDWR)) { > + bdrv_truncate(file, > + le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE); > + } > } > extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; > extent->version = le32_to_cpu(header.version); > @@ -998,6 +1028,12 @@ static int vmdk_L2update(VmdkExtent *extent, > VmdkMetaData *m_data) > uint32_t offset; > QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset)); > offset = cpu_to_le32(m_data->offset); > + > + /* nothing to update on streamOptimized */ > + if (extent->compressed) { > + return VMDK_OK; > + } > + > /* update L2 table */ > if (bdrv_pwrite_sync( > extent->file, > @@ -1008,7 +1044,7 @@ static int vmdk_L2update(VmdkExtent *extent, > VmdkMetaData *m_data) > } > /* update backup L2 table */ > if (extent->l1_backup_table_offset != 0) { > - m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; > + m_data->l2_offset = extent->l1_backup_table[extent->l1_index]; > if (bdrv_pwrite_sync( > extent->file, > ((int64_t)m_data->l2_offset * 512) > @@ -1024,6 +1060,108 @@ static int vmdk_L2update(VmdkExtent *extent, > VmdkMetaData *m_data) > return VMDK_OK; > } > > +static int vmdk_write_footer(BlockDriverState *bs, VMDK4Footer *footer) > +{ > + int ret; > + uint64_t offset; > + uint32_t grains, gt_count, gd_sectors; > + > + if (!footer) { > + return -EINVAL; > + } > + > + grains = 
DIV_ROUND_UP(le64_to_cpu(footer->header.capacity), > + le64_to_cpu(footer->header.granularity)); > + gt_count = DIV_ROUND_UP(grains, > + le32_to_cpu(footer->header.num_gtes_per_gt)); > + gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); > + > + offset = le64_to_cpu(footer->header.gd_offset) + gd_sectors; > + footer->footer_marker.val = cpu_to_le64(1); > + footer->footer_marker.type = cpu_to_le32(MARKER_FOOTER); > + footer->magic = cpu_to_be32(VMDK4_MAGIC); > + footer->eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM); > + > + ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, > sizeof(*footer)); > + if (ret < 0) { > + return ret; > + } > + > + return VMDK_OK; > +} > + > +static int vmdk_write_grain_directory(VmdkExtent *extent) > +{ > + int i, ret, gd_buf_size; > + uint32_t *gd_buf = NULL, gd_sectors; > + VMDK4MetaMarker gd_marker; > + > + /* write grain directory marker */ > + memset(&gd_marker, 0, sizeof(gd_marker)); > + gd_sectors = DIV_ROUND_UP(extent->l1_size * sizeof(uint32_t), > + BDRV_SECTOR_SIZE); > + gd_marker.val = cpu_to_le64(gd_sectors); > + gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY); > + ret = bdrv_pwrite(extent->file, extent->l1_table_offset, > + &gd_marker, sizeof(gd_marker)); > + if (ret < 0) { > + return ret; > + } > + extent->l1_table_offset += sizeof(gd_marker); > + > + /* write grain directory */ > + gd_buf_size = extent->l1_size * sizeof(uint32_t); > + gd_buf = g_malloc0(gd_buf_size); > + for (i = 0; i < extent->l1_size; i++) { > + gd_buf[i] = cpu_to_le32(extent->l1_table[i]); > + } > + ret = bdrv_pwrite(extent->file, extent->l1_table_offset, > + gd_buf, gd_buf_size); > + if (ret < 0) { > + goto exit; > + } > + > + ret = VMDK_OK; > + exit: > + g_free(gd_buf); > + return ret; > +} > + > +static int vmdk_write_grain_table(VmdkExtent *extent) > +{ > + int i; > + uint32_t *l2_table = NULL; > + VMDK4MetaMarker gtm; > + > + for (i = 0; i < L2_CACHE_SIZE; i++) { > + if (extent->l1_table[extent->l1_index] == > 
extent->l2_cache_offsets[i]) { > + l2_table = extent->l2_cache + (i * extent->l2_size); > + } > + } > + if (!l2_table) { > + return -EINVAL; > + } > + > + memset(&gtm, 0, sizeof(gtm)); > + gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9); > + gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE); > + if (bdrv_pwrite(extent->file, extent->l1_table_offset, > + &gtm, sizeof(gtm)) != sizeof(gtm)) { > + return -EIO; > + } > + extent->l1_table_offset += sizeof(gtm); > + > + extent->l1_table[extent->l1_index] = extent->l1_table_offset >> 9; > + if (bdrv_pwrite(extent->file, extent->l1_table_offset, > + l2_table, extent->l2_size * sizeof(uint32_t) > + ) != extent->l2_size * sizeof(uint32_t)) { > + return -EIO; > + } > + extent->l1_table_offset += extent->l2_size * sizeof(uint32_t); > + > + return VMDK_OK; > +} > + > static int get_cluster_offset(BlockDriverState *bs, > VmdkExtent *extent, > VmdkMetaData *m_data, > @@ -1032,7 +1170,7 @@ static int get_cluster_offset(BlockDriverState *bs, > uint64_t *cluster_offset) > { > unsigned int l1_index, l2_offset, l2_index; > - int min_index, i, j; > + int min_index, i, j, ret; > uint32_t min_count, *l2_table; > bool zeroed = false; > > @@ -1046,6 +1184,22 @@ static int get_cluster_offset(BlockDriverState *bs, > > offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; > l1_index = (offset >> 9) / extent->l1_entry_sectors; > + if (extent->compressed && !extent->l1_table[l1_index]) { > + if (l1_index) { > + /* grain (L2) tables follow compressed data so first L2 table > will > + * be written when we move to next index or when we close image. 
> + * that is why we need to save l1_index in extent itself for easy > + * access from both here and vmdk_close */ > + ret = vmdk_write_grain_table(extent); > + if (ret < 0) { > + return ret; > + } > + } > + /* allocate new L2; set it to GD offset for now */ > + extent->l1_table[l1_index] = extent->l1_table_offset; > + } > + extent->l1_index = l1_index; > + > if (l1_index >= extent->l1_size) { > return VMDK_ERROR; > } > @@ -1092,7 +1246,6 @@ static int get_cluster_offset(BlockDriverState *bs, > > if (m_data) { > m_data->valid = 1; > - m_data->l1_index = l1_index; > m_data->l2_index = l2_index; > m_data->offset = *cluster_offset; > m_data->l2_offset = l2_offset; > @@ -1234,6 +1387,10 @@ static int vmdk_write_extent(VmdkExtent *extent, > int64_t cluster_offset, > ret = ret < 0 ? ret : -EIO; > goto out; > } > + if (extent->compressed) { > + /* update GD offset after each write */ > + extent->l1_table_offset = bdrv_getlength(extent->file); > + } > ret = 0; > out: > g_free(data); > @@ -1532,10 +1689,12 @@ static int vmdk_create_extent(const char *filename, > int64_t filesize, > int ret, i; > BlockDriverState *bs = NULL; > VMDK4Header header; > + VMDK4Footer footer; > Error *local_err = NULL; > uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; > uint32_t *gd_buf = NULL; > int gd_buf_size; > + uint64_t grain_offset, rgd_offset, gd_offset; > > ret = bdrv_create_file(filename, opts, &local_err); > if (ret < 0) { > @@ -1560,28 +1719,38 @@ static int vmdk_create_extent(const char *filename, > int64_t filesize, > } > magic = cpu_to_be32(VMDK4_MAGIC); > memset(&header, 0, sizeof(header)); > - header.version = zeroed_grain ? 2 : 1; > - header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT > - | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) > + memset(&footer, 0, sizeof(footer)); > + > + header.version = (compress ? 3 : zeroed_grain ? 2 : 1); > + header.flags = VMDK4_FLAG_NL_DETECT > + | (compress ? 
VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER > + : VMDK4_FLAG_RGD) > | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); > header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0; > header.capacity = filesize / BDRV_SECTOR_SIZE; > header.granularity = 128; > header.num_gtes_per_gt = BDRV_SECTOR_SIZE; > > - grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity); > + grains = DIV_ROUND_UP(header.capacity, header.granularity); > gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), > BDRV_SECTOR_SIZE); > gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); > gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); > > header.desc_offset = 1; > - header.desc_size = 20; > - header.rgd_offset = header.desc_offset + header.desc_size; > - header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count); > - header.grain_offset = > + header.desc_size = (compress ? 2 : 20); > + rgd_offset = header.desc_offset + header.desc_size; > + header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0); > + gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count); > + header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset); > + grain_offset = > ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count), > header.granularity); > + /* streamOptimized reserves first 128 sectors */ > + header.grain_offset = (compress ? 
header.granularity : grain_offset); > + /* streamOptimzed's grain directory is at the end */ > + gd_offset = header.grain_offset + 1; > + > /* swap endianness for all header fields */ > header.version = cpu_to_le32(header.version); > header.flags = cpu_to_le32(header.flags); > @@ -1618,30 +1787,44 @@ static int vmdk_create_extent(const char *filename, > int64_t filesize, > goto exit; > } > > - /* write grain directory */ > - gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; > - gd_buf = g_malloc0(gd_buf_size); > - for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; > - i < gt_count; i++, tmp += gt_size) { > - gd_buf[i] = cpu_to_le32(tmp); > - } > - ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, > - gd_buf, gd_buf_size); > - if (ret < 0) { > - error_set(errp, QERR_IO_ERROR); > - goto exit; > - } > + if (compress) { > + footer.header = header; > + footer.header.gd_offset = cpu_to_le64(gd_offset); > > - /* write backup grain directory */ > - for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; > - i < gt_count; i++, tmp += gt_size) { > - gd_buf[i] = cpu_to_le32(tmp); > - } > - ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, > - gd_buf, gd_buf_size); > - if (ret < 0) { > - error_set(errp, QERR_IO_ERROR); > - goto exit; > + ret = vmdk_write_footer(bs, &footer); > + if (ret < 0) { > + error_set(errp, QERR_IO_ERROR); > + goto exit; > + } > + } else { > + /* write redundant grain directory (if applicable) */ > + if (le64_to_cpu(header.rgd_offset)) { > + gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; > + gd_buf = g_malloc0(gd_buf_size); > + for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; > + i < gt_count; i++, tmp += gt_size) { > + gd_buf[i] = cpu_to_le32(tmp); > + } > + ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * > + BDRV_SECTOR_SIZE, > + gd_buf, gd_buf_size); > + if (ret < 0) { > + error_set(errp, QERR_IO_ERROR); > + goto exit; > + } > + } > + > + /* write grain directory */ > + for 
(i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; > + i < gt_count; i++, tmp += gt_size) { > + gd_buf[i] = cpu_to_le32(tmp); > + } > + ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * > BDRV_SECTOR_SIZE, > + gd_buf, gd_buf_size); > + if (ret < 0) { > + error_set(errp, QERR_IO_ERROR); > + goto exit; > + } > } > > ret = 0; > @@ -1911,7 +2094,29 @@ exit: > > static void vmdk_close(BlockDriverState *bs) > { > + int ret; > BDRVVmdkState *s = bs->opaque; > + VmdkExtent *extent = &s->extents[0]; > + VMDK4Footer footer; > + > + if (extent->compressed) { > + while (extent < &s->extents[s->num_extents]) { > + vmdk_write_grain_table(extent); > + vmdk_write_grain_directory(extent); > + if (extent->has_footer) { > + memset(&footer, 0, sizeof(footer)); > + ret = bdrv_pread(extent->file, sizeof(uint32_t), > + &footer.header, sizeof(footer.header)); > + if (ret < 0) { > + continue; > + } > + footer.header.gd_offset = > + cpu_to_le64(extent->l1_table_offset >> 9); > + vmdk_write_footer(extent->file, &footer); > + } > + extent++; > + } > + } > > vmdk_free_extents(bs); > g_free(s->create_type); > -- > 1.7.1 >
Please ignore this patch. I found a small bug and will resend with the fix and the correct subject this time. Milos