On Tue, 09/03 15:45, Kevin Wolf wrote: > This contains an extension of the qcow2 spec that introduces journalling > to the image format, plus some preliminary type definitions and > function prototypes in the qcow2 code. > > Journalling functionality is a crucial feature for the design of data > deduplication, and it will improve the core part of qcow2 by avoiding > cluster leaks on crashes as well as provide an easier way to get a > reliable implementation of performance features like Delayed COW. > > At this point of the RFC, it would be most important to review the > on-disk structure. Once we're confident that it can do everything we > want, we can start going into more detail on the qemu side of things. > > Signed-off-by: Kevin Wolf <kw...@redhat.com> > --- > block/Makefile.objs | 2 +- > block/qcow2-journal.c | 55 ++++++++++++++ > block/qcow2.h | 78 +++++++++++++++++++ > docs/specs/qcow2.txt | 204 > +++++++++++++++++++++++++++++++++++++++++++++++++- > 4 files changed, 337 insertions(+), 2 deletions(-) > create mode 100644 block/qcow2-journal.c > > diff --git a/block/Makefile.objs b/block/Makefile.objs > index 3bb85b5..59be314 100644 > --- a/block/Makefile.objs > +++ b/block/Makefile.objs > @@ -1,5 +1,5 @@ > block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o > vpc.o vvfat.o > -block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o > qcow2-cache.o > +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o > qcow2-cache.o qcow2-journal.o > block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o > block-obj-y += qed-check.o > block-obj-y += vhdx.o > diff --git a/block/qcow2-journal.c b/block/qcow2-journal.c > new file mode 100644 > index 0000000..5b20239 > --- /dev/null > +++ b/block/qcow2-journal.c > @@ -0,0 +1,55 @@ > +/* > + * qcow2 journalling functions > + * > + * Copyright (c) 2013 Kevin Wolf <kw...@redhat.com> > + * > + * Permission is hereby granted, free of charge, to any person 
obtaining a > copy > + * of this software and associated documentation files (the "Software"), to > deal > + * in the Software without restriction, including without limitation the > rights > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > + * copies of the Software, and to permit persons to whom the Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > + * THE SOFTWARE. > + */ > + > +#include "qemu-common.h" > +#include "block/block_int.h" > +#include "qcow2.h" > + > +#define QCOW2_JOURNAL_MAGIC 0x716a6f75726e616cULL /* "qjournal" */ > +#define QCOW2_JOURNAL_BLOCK_MAGIC 0x716a626b /* "qjbk" */ > + > +typedef struct Qcow2JournalHeader { > + uint64_t magic; > + uint32_t journal_size; > + uint32_t block_size; > + uint32_t synced_index; > + uint32_t synced_seq; > + uint32_t committed_seq; > + uint32_t checksum; > +} QEMU_PACKED Qcow2JournalHeader; > + > +/* > + * One big transaction per journal block. The transaction is committed either > + * time based or when a microtransaction (single set of operations that must > be > + * performed atomically) doesn't fit in the same block any more. 
> + */ > +typedef struct Qcow2JournalBlock { > + uint32_t magic; > + uint32_t checksum; > + uint32_t seq; > + uint32_t desc_offset; /* Allow block header extensions */ > + uint32_t desc_bytes; > + uint32_t nb_data_blocks; > +} QEMU_PACKED Qcow2JournalBlock; > + > diff --git a/block/qcow2.h b/block/qcow2.h > index 1000239..2aee1fd 100644 > --- a/block/qcow2.h > +++ b/block/qcow2.h > @@ -157,6 +157,10 @@ typedef struct Qcow2DiscardRegion { > QTAILQ_ENTRY(Qcow2DiscardRegion) next; > } Qcow2DiscardRegion; > > +typedef struct Qcow2Journal { > + > +} Qcow2Journal; > + > typedef struct BDRVQcowState { > int cluster_bits; > int cluster_size; > @@ -479,4 +483,78 @@ int qcow2_cache_get_empty(BlockDriverState *bs, > Qcow2Cache *c, uint64_t offset, > void **table); > int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); > > +/* qcow2-journal.c functions */ > + > +typedef struct Qcow2JournalTransaction Qcow2JournalTransaction; > + > +enum Qcow2JournalEntryTypeID { > + QJ_DESC_NOOP = 0, > + QJ_DESC_WRITE = 1, > + QJ_DESC_COPY = 2, > + > + /* required after a cluster is freed and used for other purposes, so that > + * new (unjournalled) data won't be overwritten with stale metadata */ > + QJ_DESC_REVOKE = 3, > +}; > + > +typedef struct Qcow2JournalEntryType { > + enum Qcow2JournalEntryTypeID id; > + int (*sync)(void *buf, size_t size); > +} Qcow2JournalEntryType; > + > +typedef struct Qcow2JournalDesc { > + uint16_t type; > + uint16_t size; > +} QEMU_PACKED Qcow2JournalDesc; > + > +typedef struct Qcow2JournalDescWrite { > + Qcow2JournalDesc common; > + struct { > + uint32_t length; > + uint64_t target_offset; > + uint32_t data_block_index; > + } write[]; > +} QEMU_PACKED Qcow2JournalDescData; > + > +typedef struct Qcow2JournalDescCopy { > + Qcow2JournalDesc common; > + struct { > + uint32_t length; > + uint64_t target_offset; > + uint64_t source_offset; > + } copy[]; > +} QEMU_PACKED Qcow2JournalDescCopy; > + > +typedef struct Qcow2JournalRevoke { > + 
Qcow2JournalDesc common; > + struct { > + uint32_t length; > + uint64_t target_offset; > + } revoke[]; > +} QEMU_PACKED Qcow2JournalDescRevoke; > + > +void qcow2_journal_register_entry_type(Qcow2JournalEntryType *type); > + > +/* When commit_interval seconds have passed since the last commit, or > + * uncommitted journal data of at least commit_datasize bytes has accumulated > + * (whatever occurs first), transactions are committed. */ > +int qcow2_journal_init(Qcow2Journal **journal, uint64_t start_offset, > + int commit_interval, size_t commit_datasize); > +int qcow2_journal_destroy(Qcow2Journal *journal); > + > +/* These functions create microtransactions, i.e. a set of operations that > must > + * be executed atomically. In general, qemu doesn't map this to one qcow2 > + * on-disk transaction (which would leave a lot of space unused), but handles > + * multiple microtransaction with one on-disk transaction. */ > +Qcow2JournalTransaction *qcow2_journal_begin_transaction(Qcow2Journal > *journal); > +void qcow2_journal_add(Qcow2JournalTransaction *ta, Qcow2JournalDesc *desc); > +void qcow2_journal_end_transaction(Qcow2JournalTransaction *ta); > + > +/* Commits all completed microtransactions (i.e. > qcow2_journal_end_transaction > + * has already been called) */ > +int qcow2_journal_commit(Qcow2Journal *journal); > + > +/* Syncs all committed transactions */ > +int qcow2_journal_sync(Qcow2Journal *journal); > + > #endif > diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt > index 33eca36..7578a4b 100644 > --- a/docs/specs/qcow2.txt > +++ b/docs/specs/qcow2.txt > @@ -85,6 +85,10 @@ in the description of a field. > be written to (unless for regaining > consistency). > > + Bit 2: Journal dirty. A replay of the main journal > is > + needed in order to regain consistency before > + accessing the image. > + > Bits 2-63: Reserved (set to 0)
-EBUSY for bit 2, should continue from bit 3. :) > > 80 - 87: compatible_features > @@ -103,7 +107,11 @@ in the description of a field. > write to an image with unknown auto-clear features if it > clears the respective bits from this field first. > > - Bits 0-63: Reserved (set to 0) > + Bit 0: Journal valid bit. This bit indicates that > the > + image contains a valid main journal starting > at > + journal_offset. > + > + Bits 1-63: Reserved (set to 0) > > 96 - 99: refcount_order > Describes the width of a reference count block entry > (width > @@ -114,6 +122,16 @@ in the description of a field. > Length of the header structure in bytes. For version 2 > images, the length is always assumed to be 72 bytes. > > + 104 - 111: journal_offset > + Offset into the image file at which the main image > journal > + starts. Must be aligned to a cluster boundary. 0 means > that > + no journal is used. > + > + This field is only valid if the journal feature bit in > + autoclear_features is set. If the field is invalid or the > + header is too short to contain the field, the field is > + assumed to be 0 (no journal is used) > + > Directly after the image header, optional sections called header extensions > can > be stored. Each extension has a structure like the following: > > @@ -355,3 +373,187 @@ Snapshot table entry: > variable: Unique ID string for the snapshot (not null terminated) > > variable: Name of the snapshot (not null terminated) > + > + > +== Journal == > + > +Journals are used to allow safe updates of metadata without impacting > +performance by requiring flushes to order updates to different parts of the > +metadata. They consist of transactions, which in turn contain operations that > +are effectively executed atomically. A qcow2 image can have a main image > +journal that deals with cluster management operations, and additional > specific > +journals can be used by other features like data deduplication. 
> + > + > +As far as the on-disk format is concerned, a transaction is in one of the > +following states: > + > + Incomplete: This is the initial state of any transaction, while new > + operations can still be added. When opening an image > with a > + dirty journal, incomplete transactions are discarded. > + > + Committed: When all operations that must be performed atomically > + during the transaction have been written and are stable > on > + disk, the transaction can be committed by increasing the > + committed sequence number in the journal header. A > + transaction in this state may not be changed. When > opening > + an image with a dirty journal, committed transactions > should > + be replayed. > + > + Synced: A transaction is synced if all of its operations have > been > + performed, all data written is stable on disk, and the > + synced sequence number is increased in the journal > header. > + Synced transactions are no longer needed in the journal > and > + can be overwritten. They are ignored during replay. > + > +The use of a sequence number implies that transactions are processed > +sequentially and an earlier transaction can never be unsynced/uncommitted if > a > +later one is synced/committed. > + > + > +A journal is organised in journal blocks, all of which have a reference count > +of exactly 1. It starts with a block containing the following journal header: > + > + Byte 0 - 7: Magic ("qjournal" ASCII string) > + > + 8 - 11: Journal size in bytes, including the header > + > + 12 - 15: Journal block size order (block size in bytes = 1 << > order) > + The block size must be at least 512 bytes and must not > + exceed the cluster size. > + > + 16 - 19: Journal block index of the descriptor for the last > + transaction that has been synced, starting with 1 for the > + journal block after the header. 0 is used for empty I suggest s/header/journal header/, for less confusion with image header. > + journals.
> + > + 20 - 23: Sequence number of the last transaction that has been > + synced. 0 is recommended as the initial value. > + > + 24 - 27: Sequence number of the last transaction that has been > + committed. When replaying a journal, all transactions > + after the last synced one up to the last committed one must > be > + synced. Note that this may include a wraparound of > sequence > + numbers. > + I'm not sure of the downside of relatively frequent wraparound, but is there any reason not to use 64-bit sequence numbers, which would make it much rarer? > + 28 - 31: Checksum (one's complement of the sum of all bytes in the > + header journal block except those of the checksum field) > + > + 32 - 511: Reserved (set to 0) > + > + > +The header is followed by journal blocks that are either descriptor or data > +blocks. The block index at byte 16 points to the first valid descriptor, > except > +for completely empty journals, where it can be 0. The next descriptor can be > +found by skipping a descriptor and its associated data blocks. When the > journal > +size is exceeded, a wraparound occurs, essentially forming a ring buffer. > + > +A wraparound may not occur in the middle of a single transaction, but only > +between two transactions. For the necessary padding an empty descriptor with > +any number of data blocks can be used as the last entry of the ring. > + > +The chain of valid descriptors ends if a descriptor is reached whose sequence > +number isn't the successor of the previous sequence number. This means in > +particular that the journal must be ordered chronologically and has ascending > +sequence numbers (except in the case of a sequence number wraparound). Worth documenting the wraparound case ( (seq_num_t)-1 => 0x1)? Fam > +All blocks from the end of the descriptor chain until the starting point are > +unused.
> + > + > +Descriptor blocks describe one transaction each and have the following > +structure: > + > + Byte 0 - 3: Magic ("qjbk" ASCII string) > + > + 4 - 7: Checksum (one's complement of the sum of all bytes in the > + descriptor block except those of the checksum field, and > + all bytes in the associated data blocks) > + > + 8 - 11: Sequence number of the transaction > + > + 12 - 15: Byte offset into the descriptor block at which > descriptors > + start > + > + 16 - 19: Total length of descriptors in this block in bytes > + > + 20 - 23: Number of following data blocks that are associated with > + this transaction. > + > + 24 - n: (Future extensions) > + > + n - m: Array of descriptors as described below. The exact values > + of n and m are determined by the above fields. > + > +All descriptors start with a common part: > + > + Byte 0 - 1: Descriptor type > + 0 - No-op descriptor > + 1 - Write data block > + 2 - Copy data > + 3 - Revoke > + 4 - Deduplication hash insertion > + 5 - Deduplication hash deletion > + > + 2 - 3: Size of the descriptor in bytes > + > + 4 - n: Type-specific data > + > +The following section specifies the purpose (i.e. the action that is to be > +performed when syncing) and type-specific data layout of each descriptor > type: > + > + * No-op descriptor: No action is to be performed when syncing this > descriptor > + > + 4 - n: Ignored > + > + * Write data block: Write literal data associated with this transaction > from > + the journal to a given offset. > + > + 4 - 7: Length of the data to write in bytes > + > + 8 - 15: Offset in the image file to write the data to > + > + 16 - 19: Index of the journal block at which the data to write > + starts. The data must be stored sequentially and be fully > + contained in the data blocks associated with the > + transaction. > + > + The type-specific data can be repeated, specifying multiple chunks of > data > + to be written in one operation. 
This means the size of the descriptor > must > + be 4 + 16 * n. > + > + * Copy data: Copy data from one offset in the image to another one. This > can > + be used for journalling copy-on-write operations. > + > + 4 - 7: Length of the data to copy in bytes > + > + 8 - 15: Target offset in the image file > + > + 16 - 23: Source offset in the image file > + > + The type-specific data can be repeated, specifying multiple chunks of > data > + to be copied in one operation. This means the size of the descriptor must > + be 4 + 20 * n. > + > + * Revoke: Marks operations on a given range in the image file invalid for > all > + earlier transactions (this does not include the transaction containing > the > + revoke). They must not be executed on a sync operation (e.g. because the > + range in question has been freed and may have been reused for other, not > + journalled data structures that must not be overwritten with stale data). > + Note that this may mean that operations are to be executed partially. > + > + 4 - 7: Length of the range in bytes > + > + 8 - 15: Offset of the range in the image file > + > + The type-specific data can be repeated, specifying multiple ranges for > + which operations should be revoked. This means the size of the descriptor > + must be 4 + 12 * n. > + > + * Deduplication hash insertion: Associates a hash value with a cluster. > + > + TODO > + > + * Deduplication hash deletion: Marks a hash value invalid (e.g. because the > + hashed data has changed) > + > + TODO > -- > 1.8.1.4 >