Re: [Qemu-devel] [RFC] qcow2 journalling draft

Fam Zheng Fri, 06 Sep 2013 03:01:22 -0700

On Tue, 09/03 15:45, Kevin Wolf wrote:
> This contains an extension of the qcow2 spec that introduces journalling
> to the image format, plus some preliminary type definitions and
> function prototypes in the qcow2 code.
> 
> Journalling functionality is a crucial feature for the design of data
> deduplication, and it will improve the core part of qcow2 by avoiding
> cluster leaks on crashes as well as provide an easier way to get a
> reliable implementation of performance features like Delayed COW.
> 
> At this point of the RFC, it would be most important to review the
> on-disk structure. Once we're confident that it can do everything we
> want, we can start going into more detail on the qemu side of things.
> 
> Signed-off-by: Kevin Wolf <kw...@redhat.com>
> ---
>  block/Makefile.objs   |   2 +-
>  block/qcow2-journal.c |  55 ++++++++++++++
>  block/qcow2.h         |  78 +++++++++++++++++++
>  docs/specs/qcow2.txt  | 204 
> +++++++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 337 insertions(+), 2 deletions(-)
>  create mode 100644 block/qcow2-journal.c
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 3bb85b5..59be314 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -1,5 +1,5 @@
>  block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o 
> vpc.o vvfat.o
> -block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o 
> qcow2-cache.o
> +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o 
> qcow2-cache.o qcow2-journal.o
>  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-obj-y += qed-check.o
>  block-obj-y += vhdx.o
> diff --git a/block/qcow2-journal.c b/block/qcow2-journal.c
> new file mode 100644
> index 0000000..5b20239
> --- /dev/null
> +++ b/block/qcow2-journal.c
> @@ -0,0 +1,55 @@
> +/*
> + * qcow2 journalling functions
> + *
> + * Copyright (c) 2013 Kevin Wolf <kw...@redhat.com>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a 
> copy
> + * of this software and associated documentation files (the "Software"), to 
> deal
> + * in the Software without restriction, including without limitation the 
> rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
> FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu-common.h"
> +#include "block/block_int.h"
> +#include "qcow2.h"
> +
> +#define QCOW2_JOURNAL_MAGIC 0x716a6f75726e616cULL  /* "qjournal" */
> +#define QCOW2_JOURNAL_BLOCK_MAGIC 0x716a626b  /* "qjbk" */
> +
> +typedef struct Qcow2JournalHeader {
> +    uint64_t    magic;
> +    uint32_t    journal_size;
> +    uint32_t    block_size;
> +    uint32_t    synced_index;
> +    uint32_t    synced_seq;
> +    uint32_t    committed_seq;
> +    uint32_t    checksum;
> +} QEMU_PACKED Qcow2JournalHeader;
> +
> +/*
> + * One big transaction per journal block. The transaction is committed either
> + * time based or when a microtransaction (single set of operations that must 
> be
> + * performed atomically) doesn't fit in the same block any more.
> + */
> +typedef struct Qcow2JournalBlock {
> +    uint32_t    magic;
> +    uint32_t    checksum;
> +    uint32_t    seq;
> +    uint32_t    desc_offset; /* Allow block header extensions */
> +    uint32_t    desc_bytes;
> +    uint32_t    nb_data_blocks;
> +} QEMU_PACKED Qcow2JournalBlock;
> +
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 1000239..2aee1fd 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -157,6 +157,10 @@ typedef struct Qcow2DiscardRegion {
>      QTAILQ_ENTRY(Qcow2DiscardRegion) next;
>  } Qcow2DiscardRegion;
>  
> +typedef struct Qcow2Journal {
> +
> +} Qcow2Journal;
> +
>  typedef struct BDRVQcowState {
>      int cluster_bits;
>      int cluster_size;
> @@ -479,4 +483,78 @@ int qcow2_cache_get_empty(BlockDriverState *bs, 
> Qcow2Cache *c, uint64_t offset,
>      void **table);
>  int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
>  
> +/* qcow2-journal.c functions */
> +
> +typedef struct Qcow2JournalTransaction Qcow2JournalTransaction;
> +
> +enum Qcow2JournalEntryTypeID {
> +    QJ_DESC_NOOP    = 0,
> +    QJ_DESC_WRITE   = 1,
> +    QJ_DESC_COPY    = 2,
> +
> +    /* required after a cluster is freed and used for other purposes, so that
> +     * new (unjournalled) data won't be overwritten with stale metadata */
> +    QJ_DESC_REVOKE  = 3,
> +};
> +
> +typedef struct Qcow2JournalEntryType {
> +    enum Qcow2JournalEntryTypeID id;
> +    int (*sync)(void *buf, size_t size);
> +} Qcow2JournalEntryType;
> +
> +typedef struct Qcow2JournalDesc {
> +    uint16_t    type;
> +    uint16_t    size;
> +} QEMU_PACKED Qcow2JournalDesc;
> +
> +typedef struct Qcow2JournalDescWrite {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +        uint32_t data_block_index;
> +    } write[];
> +} QEMU_PACKED Qcow2JournalDescData;
> +
> +typedef struct Qcow2JournalDescCopy {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +        uint64_t source_offset;
> +    } copy[];
> +} QEMU_PACKED Qcow2JournalDescCopy;
> +
> +typedef struct Qcow2JournalRevoke {
> +    Qcow2JournalDesc common;
> +    struct {
> +        uint32_t length;
> +        uint64_t target_offset;
> +    } revoke[];
> +} QEMU_PACKED Qcow2JournalDescRevoke;
> +
> +void qcow2_journal_register_entry_type(Qcow2JournalEntryType *type);
> +
> +/* When commit_interval seconds have passed since the last commit, or
> + * uncommitted journal data of at least commit_datasize bytes has accumulated
> + * (whatever occurs first), transactions are committed. */
> +int qcow2_journal_init(Qcow2Journal **journal, uint64_t start_offset,
> +                       int commit_interval, size_t commit_datasize);
> +int qcow2_journal_destroy(Qcow2Journal *journal);
> +
> +/* These functions create microtransactions, i.e. a set of operations that 
> must
> + * be executed atomically. In general, qemu doesn't map this to one qcow2
> + * on-disk transaction (which would leave a lot of space unused), but handles
> + * multiple microtransactions with one on-disk transaction. */
> +Qcow2JournalTransaction *qcow2_journal_begin_transaction(Qcow2Journal 
> *journal);
> +void qcow2_journal_add(Qcow2JournalTransaction *ta, Qcow2JournalDesc *desc);
> +void qcow2_journal_end_transaction(Qcow2JournalTransaction *ta);
> +
> +/* Commits all completed microtransactions (i.e. 
> qcow2_journal_end_transaction
> + * has already been called) */
> +int qcow2_journal_commit(Qcow2Journal *journal);
> +
> +/* Syncs all committed transactions */
> +int qcow2_journal_sync(Qcow2Journal *journal);
> +
>  #endif
> diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
> index 33eca36..7578a4b 100644
> --- a/docs/specs/qcow2.txt
> +++ b/docs/specs/qcow2.txt
> @@ -85,6 +85,10 @@ in the description of a field.
>                                  be written to (unless for regaining
>                                  consistency).
>  
> +                    Bit 2:      Journal dirty. A replay of the main journal 
> is
> +                                needed in order to regain consistency before
> +                                accessing the image.
> +
>                      Bits 2-63:  Reserved (set to 0)


-EBUSY for bit 2: the reserved range should now continue from bit 3. :)

>  
>           80 -  87:  compatible_features
> @@ -103,7 +107,11 @@ in the description of a field.
>                      write to an image with unknown auto-clear features if it
>                      clears the respective bits from this field first.
>  
> -                    Bits 0-63:  Reserved (set to 0)
> +                    Bit 0:      Journal valid bit. This bit indicates that 
> the
> +                                image contains a valid main journal starting 
> at
> +                                journal_offset.
> +
> +                    Bits 1-63:  Reserved (set to 0)
>  
>           96 -  99:  refcount_order
>                      Describes the width of a reference count block entry 
> (width
> @@ -114,6 +122,16 @@ in the description of a field.
>                      Length of the header structure in bytes. For version 2
>                      images, the length is always assumed to be 72 bytes.
>  
> +        104 - 111:  journal_offset
> +                    Offset into the image file at which the main image 
> journal
> +                    starts. Must be aligned to a cluster boundary. 0 means 
> that
> +                    no journal is used.
> +
> +                    This field is only valid if the journal feature bit in
> +                    autoclear_features is set. If the field is invalid or the
> +                    header is too short to contain the field, the field is
> +                    assumed to be 0 (no journal is used)
> +
>  Directly after the image header, optional sections called header extensions 
> can
>  be stored. Each extension has a structure like the following:
>  
> @@ -355,3 +373,187 @@ Snapshot table entry:
>          variable:   Unique ID string for the snapshot (not null terminated)
>  
>          variable:   Name of the snapshot (not null terminated)
> +
> +
> +== Journal ==
> +
> +Journals are used to allow safe updates of metadata without impacting
> +performance by requiring flushes to order updates to different parts of the
> +metadata. They consist of transactions, which in turn contain operations that
> +are effectively executed atomically. A qcow2 image can have a main image
> +journal that deals with cluster management operations, and additional 
> specific
> +journals can be used by other features like data deduplication.
> +
> +
> +As far as the on-disk format is concerned, a transaction is in one of the
> +following states:
> +
> +    Incomplete:     This is the initial state of any transaction, while new
> +                    operations can still be added. When opening an image 
> with a
> +                    dirty journal, incomplete transactions are discarded.
> +
> +    Committed:      When all operations that must be performed atomically
> +                    during the transaction have been written and are stable 
> on
> +                    disk, the transaction can be committed by increasing the
> +                    committed sequence number in the journal header. A
> +                    transaction in this state may not be changed. When 
> opening
> +                    an image with a dirty journal, committed transactions 
> should
> +                    be replayed.
> +
> +    Synced:         A transaction is synced if all of its operations have 
> been
> +                    performed, all data written is stable on disk, and the
> +                    synced sequence number is increased in the journal 
> header.
> +                    Synced transactions are no longer needed in the journal 
> and
> +                    can be overwritten. They are ignored during replay.
> +
> +The use of a sequence number implies that transactions are processed
> +sequentially and an earlier transaction can never be unsynced/uncommitted if 
> a
> +later one is synced/committed.
> +
> +
> +A journal is organised in journal blocks, all of which have a reference count
> +of exactly 1. It starts with a block containing the following journal header:
> +
> +    Byte  0 -  7:   Magic ("qjournal" ASCII string)
> +
> +          8 - 11:   Journal size in bytes, including the header
> +
> +         12 - 15:   Journal block size order (block size in bytes = 1 << 
> order)
> +                    The block size must be at least 512 bytes and must not
> +                    exceed the cluster size.
> +
> +         16 - 19:   Journal block index of the descriptor for the last
> +                    transaction that has been synced, starting with 1 for the
> +                    journal block after the header. 0 is used for empty

I suggest s/header/journal header/, to avoid confusion with the image header.

> +                    journals.
> +
> +         20 - 23:   Sequence number of the last transaction that has been
> +                    synced. 0 is recommended as the initial value.
> +
> +         24 - 27:   Sequence number of the last transaction that has been
> +                    committed. When replaying a journal, all transactions
> +                    after the last synced one up to the last committed one must 
> be
> +                    synced. Note that this may include a wraparound of 
> sequence
> +                    numbers.
> +
I'm not sure what the downside of relatively frequent wraparound is, but is there
any reason not to use 64-bit sequence numbers, which would make it much rarer?

> +         28 -  31:  Checksum (one's complement of the sum of all bytes in the
> +                    header journal block except those of the checksum field)
> +
> +         32 - 511:  Reserved (set to 0)
> +
> +
> +The header is followed by journal blocks that are either descriptor or data
> +blocks. The block index at byte 16 points to the first valid descriptor, 
> except
> +for completely empty journals, where it can be 0. The next descriptor can be
> +found by skipping a descriptor and its associated data blocks. When the 
> journal
> +size is exceeded, a wraparound occurs, essentially forming a ring buffer.
> +
> +A wraparound may not occur in the middle of a single transaction, but only
> +between two transactions. For the necessary padding an empty descriptor with
> +any number of data blocks can be used as the last entry of the ring.
> +
> +The chain of valid descriptors ends if a descriptor is reached whose sequence
> +number isn't the successor of the previous sequence number. This means in
> +particular that the journal must be ordered chronologically and has ascending
> +sequence numbers (except in the case of a sequence number wraparound).

Worth documenting the wraparound case ( (seq_num_t)-1 => 0x1)?

Fam
> +All blocks from the end of the descriptor chain until the starting point are
> +unused.
> +
> +
> +Descriptor blocks describe one transaction each and have the following
> +structure:
> +
> +    Byte  0 -  3:   Magic ("qjbk" ASCII string)
> +
> +          4 -  7:   Checksum (one's complement of the sum of all bytes in the
> +                    descriptor block except those of the checksum field, and
> +                    all bytes in the associated data blocks)
> +
> +          8 - 11:   Sequence number of the transaction
> +
> +         12 - 15:   Byte offset into the descriptor block at which 
> descriptors
> +                    start
> +
> +         16 - 19:   Total length of descriptors in this block in bytes
> +
> +         20 - 23:   Number of following data blocks that are associated with
> +                    this transaction.
> +
> +         24 -  n:   (Future extensions)
> +
> +          n -  m:   Array of descriptors as described below. The exact values
> +                    of n and m are determined by the above fields.
> +
> +All descriptors start with a common part:
> +
> +    Byte  0 -  1:   Descriptor type
> +                        0 - No-op descriptor
> +                        1 - Write data block
> +                        2 - Copy data
> +                        3 - Revoke
> +                        4 - Deduplication hash insertion
> +                        5 - Deduplication hash deletion
> +
> +          2 -  3:   Size of the descriptor in bytes
> +
> +          4 -  n:   Type-specific data
> +
> +The following section specifies the purpose (i.e. the action that is to be
> +performed when syncing) and type-specific data layout of each descriptor 
> type:
> +
> +  * No-op descriptor: No action is to be performed when syncing this 
> descriptor
> +
> +          4 -  n:   Ignored
> +
> +  * Write data block: Write literal data associated with this transaction 
> from
> +    the journal to a given offset.
> +
> +          4 -  7:   Length of the data to write in bytes
> +
> +          8 - 15:   Offset in the image file to write the data to
> +
> +         16 - 19:   Index of the journal block at which the data to write
> +                    starts. The data must be stored sequentially and be fully
> +                    contained in the data blocks associated with the
> +                    transaction.
> +
> +    The type-specific data can be repeated, specifying multiple chunks of 
> data
> +    to be written in one operation. This means the size of the descriptor 
> must
> +    be 4 + 16 * n.
> +
> +  * Copy data: Copy data from one offset in the image to another one. This 
> can
> +    be used for journalling copy-on-write operations.
> +
> +          4 -  7:   Length of the data to copy in bytes
> +
> +          8 - 15:   Target offset in the image file
> +
> +         16 - 23:   Source offset in the image file
> +
> +    The type-specific data can be repeated, specifying multiple chunks of 
> data
> +    to be copied in one operation. This means the size of the descriptor must
> +    be 4 + 20 * n.
> +
> +  * Revoke: Marks operations on a given range in the image file invalid for 
> all
> +    earlier transactions (this does not include the transaction containing 
> the
> +    revoke). They must not be executed on a sync operation (e.g. because the
> +    range in question has been freed and may have been reused for other, not
> +    journalled data structures that must not be overwritten with stale data).
> +    Note that this may mean that operations are to be executed partially.
> +
> +          4 -  7:   Length of the range in bytes
> +
> +          8 - 15:   Offset of the range in the image file
> +
> +    The type-specific data can be repeated, specifying multiple ranges for
> +    which operations should be revoked. This means the size of the descriptor
> +    must be 4 + 12 * n.
> +
> +  * Deduplication hash insertion: Associates a hash value with a cluster.
> +
> +    TODO
> +
> +  * Deduplication hash deletion: Marks a hash value invalid (e.g. because the
> +    hashed data has changed)
> +
> +    TODO
> -- 
> 1.8.1.4
>

Re: [Qemu-devel] [RFC] qcow2 journalling draft

Reply via email to