On Wed, Feb 06, 2013 at 01:31:40PM +0100, Benoît Canet wrote:
> Signed-off-by: Benoit Canet <ben...@irqsave.net>
> ---
>  block/qcow2-dedup.c | 436 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2.h       |   5 +
>  2 files changed, 441 insertions(+)
> 
> diff --git a/block/qcow2-dedup.c b/block/qcow2-dedup.c
> index 4e99eb1..5901749 100644
> --- a/block/qcow2-dedup.c
> +++ b/block/qcow2-dedup.c
> @@ -117,3 +117,439 @@ fail:
>      *data = NULL;
>      return ret;
>  }
> +
> +/*
> + * Build a QCowHashNode structure
> + *
> + * @hash:               the given hash
> + * @physical_sect:      the cluster offset in the QCOW2 file
> + * @first_logical_sect: the first logical cluster offset written
> + * @ret:                the build QCowHashNode
> + */
> +static QCowHashNode *qcow2_dedup_build_qcow_hash_node(QCowHash *hash,
> +                                                      uint64_t physical_sect,
> +                                                      uint64_t first_logical_sect)

qcow2_hash_node_new() would be a shorter name.  It uses "new" like
coroutine_new(), qemu_bh_new(), etc.

> +{
> +    QCowHashNode *hash_node;
> +
> +    hash_node = g_new0(QCowHashNode, 1);
> +    memcpy(hash_node->hash.data, hash->data, HASH_LENGTH);
> +    hash_node->physical_sect = physical_sect;
> +    hash_node->first_logical_sect = first_logical_sect;
> +
> +    return hash_node;
> +}
> +
> +/*
> + * Compute the hash of a given cluster
> + *
> + * @data: a buffer containing the cluster data
> + * @hash: a QCowHash where to store the computed hash
> + * @ret:  0 on success, negative on error
> + */
> +static int qcow2_compute_cluster_hash(BlockDriverState *bs,
> +                                      QCowHash *hash,
> +                                      uint8_t *data)
> +{
> +    return 0;
> +}
> +
> +/*
> + * Get a QCowHashNode corresponding to a cluster data
> + *
> + * @phash:                 if phash can be used no hash is computed
> + * @data:                  a buffer containing the cluster
> + * @nb_clusters_processed: the number of cluster to skip in the buffer
> + * @err:                   Error code if any
> + * @ret:                   QCowHashNode of the duplicated cluster or NULL if not found
> + */
> +static QCowHashNode *qcow2_get_hash_node_for_cluster(BlockDriverState *bs,
> +                                                     QcowPersistantHash *phash,
> +                                                     uint8_t *data,
> +                                                     int nb_clusters_processed,
> +                                                     int *err)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    int ret = 0;
> +    *err = 0;
> +
> +    /* no hash has been provided compute it and store it for later usage */
> +    if (!phash->reuse) {
> +        ret = qcow2_compute_cluster_hash(bs,
> +                                         &phash->hash,
> +                                         data +
> +                                         nb_clusters_processed *
> +                                         s->cluster_size);
> +    }
> +
> +    /* do not reuse the hash anymore if it was precomputed */
> +    phash->reuse = false;
> +
> +    if (ret < 0) {
> +        *err = ret;
> +        return NULL;
> +    }
> +
> +    return g_tree_lookup(s->dedup_tree_by_hash, &phash->hash);
> +}
> +
> +/*
> + * Build a QCowHashNode from a given QCowHash and insert it into the tree
> + *
> + * @hash: the given QCowHash
> + */
> +static void qcow2_build_and_insert_hash_node(BlockDriverState *bs,
> +                                             QCowHash *hash)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    QCowHashNode *hash_node;
> +
> +    /* build the hash node with QCOW_FLAG_EMPTY as offsets so we will remember
> +     * to fill these field later with real values.
> +     */
> +    hash_node = qcow2_dedup_build_qcow_hash_node(hash,
> +                                                 QCOW_FLAG_EMPTY,
> +                                                 QCOW_FLAG_EMPTY);
> +    g_tree_insert(s->dedup_tree_by_hash, &hash_node->hash, hash_node);
> +}

Interesting function, it doesn't return hash_node.  Someone will have to look
it up.

Also, we put an incomplete node into dedup_tree_by_hash.  The caller must
ensure that other coroutines do not use dedup_tree_by_hash before we've filled
in real values, or the callers need to check for QCOW_FLAG_EMPTY.  Seems a
little risky, so why insert before completing the hash node?
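
To make this concrete, something like the following is what I would have
expected (untested sketch; qcow2_insert_hash_node() is just a name I made up):

static QCowHashNode *qcow2_insert_hash_node(BlockDriverState *bs,
                                            QCowHash *hash)
{
    BDRVQcowState *s = bs->opaque;
    QCowHashNode *hash_node;

    /* offsets stay QCOW_FLAG_EMPTY until the caller fills in real values */
    hash_node = qcow2_dedup_build_qcow_hash_node(hash,
                                                 QCOW_FLAG_EMPTY,
                                                 QCOW_FLAG_EMPTY);
    g_tree_insert(s->dedup_tree_by_hash, &hash_node->hash, hash_node);

    return hash_node;
}

Returning the node at least saves the extra lookup; whether it should go into
the tree before the offsets are known is a separate question.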

> +
> +/*
> + * Helper used to build a QCowHashElement
> + *
> + * @hash: the QCowHash to use
> + * @ret:  a newly allocated QCowHashElement containing the given hash
> + */
> +static QCowHashElement *qcow2_build_dedup_hash(QCowHash *hash)

I suggest _new() instead of _build_*().

> +{
> +    QCowHashElement *dedup_hash;
> +    dedup_hash = g_new0(QCowHashElement, 1);
> +    memcpy(dedup_hash->hash.data, hash->data, HASH_LENGTH);
> +    return dedup_hash;
> +}
> +
> +/*
> + * Helper used to link a deduplicated cluster in the l2
> + *
> + * @logical_sect:  the cluster sector seen by the guest
> + * @physical_sect: the cluster sector in the QCOW2 file
> + * @overwrite:     true if we must overwrite the L2 table entry
> + * @ret:
> + */
> +static int qcow2_dedup_link_l2(BlockDriverState *bs,
> +                               uint64_t logical_sect,
> +                               uint64_t physical_sect,
> +                               bool overwrite)
> +{
> +    QCowL2Meta m = {
> +        .alloc_offset = physical_sect << 9,
> +        .offset = logical_sect << 9,
> +        .nb_clusters = 1,

I wonder if there's any real-world performance advantage to doing
nb_clusters > 1 when possible...

> +        .nb_available = 0,
> +        .cow_start = {
> +            .offset = 0,
> +            .nb_sectors = 0,
> +        },
> +        .cow_end = {
> +            .offset = 0,
> +            .nb_sectors = 0,
> +        },
> +        .oflag_copied = false,
> +        .overwrite = overwrite,
> +    };
> +    return qcow2_alloc_cluster_link_l2(bs, &m);
> +}
> +
> +/* Clear the QCOW_OFLAG_COPIED from the first L2 entry written for a physical
> + * cluster.
> + *
> + * @hash_node: the duplicated hash node
> + * @ret:       0 on success, negative on error
> + */
> +static int qcow2_clear_l2_copied_flag_if_needed(BlockDriverState *bs,
> +                                                QCowHashNode *hash_node)
> +{
> +    int ret = 0;
> +    uint64_t first_logical_sect = hash_node->first_logical_sect;
> +
> +    /* QCOW_OFLAG_COPIED already cleared -> do nothing */
> +    if (!(first_logical_sect & QCOW_FLAG_FIRST)) {
> +        return 0;
> +    }
> +
> +    /* note : QCOW_FLAG_FIRST == QCOW_OFLAG_COPIED */

Please use QCOW_OFLAG_COPIED instead of redefining it.

> +    first_logical_sect &= ~QCOW_FLAG_FIRST;
> +
> +    /* overwrite first L2 entry to clear QCOW_FLAG_COPIED */
> +    ret = qcow2_dedup_link_l2(bs, first_logical_sect,
> +                              hash_node->physical_sect,
> +                              true);
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* remember that we dont't need to clear QCOW_OFLAG_COPIED again */

s/dont't/don't/

> +    hash_node->first_logical_sect &= first_logical_sect;

Just '=' would be clearer.

> +
> +    return 0;
> +}
> +
> +/* This function deduplicate a cluster
> + *
> + * @logical_sect: The logical sector of the write
> + * @hash_node:    The duplicated cluster hash node
> + * @ret:          0 on success, negative on error
> + */
> +static int qcow2_deduplicate_cluster(BlockDriverState *bs,
> +                                     uint64_t logical_sect,
> +                                     QCowHashNode *hash_node)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    int ret = 0;
> +
> +    /* create new L2 entry */
> +    ret = qcow2_dedup_link_l2(bs, logical_sect,
> +                              hash_node->physical_sect,
> +                              false);
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* Increment the refcount of the cluster */
> +    return update_refcount(bs,
> +                           (hash_node->physical_sect /
> +                            s->cluster_sectors) << s->cluster_bits,
> +                           1, 1);

I think it's better to increment the refcount and then store the new L2 entry.
This way a crash partway through leaves a leaked cluster instead of an L2
entry to an unallocated cluster (which could cause corruption further on).

Cache flushes may be necessary between these steps, I haven't checked.
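
Roughly what I have in mind, as an untested sketch (again, I haven't checked
which cache flushes would be needed between the two steps):

static int qcow2_deduplicate_cluster(BlockDriverState *bs,
                                     uint64_t logical_sect,
                                     QCowHashNode *hash_node)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    /* increment the refcount first: a crash after this point leaks a
     * cluster instead of leaving an L2 entry that points to an
     * unallocated cluster
     */
    ret = update_refcount(bs,
                          (hash_node->physical_sect /
                           s->cluster_sectors) << s->cluster_bits,
                          1, 1);
    if (ret < 0) {
        return ret;
    }

    /* now it is safe to publish the new L2 entry */
    return qcow2_dedup_link_l2(bs, logical_sect,
                               hash_node->physical_sect,
                               false);
}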

> +}
> +
> +/* This function tries to deduplicate a given cluster.
> + *
> + * @sector_num:            the logical sector number we are trying to deduplicate
> + * @phash:                 Used instead of computing the hash if provided
> + * @data:                  the buffer in which to look for a duplicated cluster
> + * @nb_clusters_processed: the number of cluster that must be skipped in data
> + * @ret:                   ret < 0 on error, 1 on deduplication else 0
> + */
> +static int qcow2_try_dedup_cluster(BlockDriverState *bs,
> +                                   QcowPersistantHash *phash,
> +                                   uint64_t sector_num,
> +                                   uint8_t *data,
> +                                   int nb_clusters_processed)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    int ret = 0;
> +    QCowHashNode *hash_node;
> +    uint64_t logical_sect;
> +    uint64_t existing_physical_offset;
> +    int pnum = s->cluster_sectors;
> +
> +    /* search the tree for duplicated cluster */
> +    hash_node = qcow2_get_hash_node_for_cluster(bs,
> +                                                phash,
> +                                                data,
> +                                                nb_clusters_processed,
> +                                                &ret);
> +
> +    /* we won't reuse the hash on error */
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* if cluster is not duplicated store hash for later usage */
> +    if (!hash_node) {
> +        qcow2_build_and_insert_hash_node(bs, &phash->hash);
> +        return 0;
> +    }
> +
> +    logical_sect = sector_num & ~(s->cluster_sectors - 1);
> +    ret = qcow2_get_cluster_offset(bs, logical_sect << 9,
> +                                   &pnum, &existing_physical_offset);
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* if we are rewriting the same cluster at the same place do nothing */
> +    if (existing_physical_offset == hash_node->physical_sect << 9) {
> +        return 1;
> +    }
> +
> +    /* take care of not having refcount > 1 and QCOW_OFLAG_COPIED at once */
> +    ret = qcow2_clear_l2_copied_flag_if_needed(bs, hash_node);
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* do the deduplication */
> +    ret = qcow2_deduplicate_cluster(bs, logical_sect,
> +                                    hash_node);

I think none of these drop the lock so we're guaranteed that metadata doesn't
change beneath our feet.  Good.

> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return 1;
> +}
> +
> +
> +static void add_hash_to_undedupable_list(BlockDriverState *bs,
> +                                         QCowDedupState *ds)
> +{
> +    /* memorise hash for later storage in gtree and disk */
> +    QCowHashElement *dedup_hash = qcow2_build_dedup_hash(&ds->phash.hash);
> +    QTAILQ_INSERT_TAIL(&ds->undedupables, dedup_hash, next);
> +}
> +
> +static int qcow2_dedup_starting_from_begining(BlockDriverState *bs,
> +                                              QCowDedupState *ds,
> +                                              uint64_t sector_num,
> +                                              uint8_t *data,
> +                                              int left_to_process)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    int i;
> +    int ret = 0;
> +
> +    for (i = 0; i < left_to_process; i++) {
> +        ret = qcow2_try_dedup_cluster(bs,
> +                                      &ds->phash,
> +                                      sector_num + i * s->cluster_sectors,
> +                                      data,
> +                                      ds->nb_clusters_processed + i);

I don't understand how the data argument works here.  The data for
left_to_process clusters has been pre-read into data?  Why is data not
incremented (data += s->cluster_size)?
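
If the answer is that qcow2_get_hash_node_for_cluster() indexes into the
buffer with nb_clusters_processed * s->cluster_size, then making that explicit
at the call site would help.  Untested sketch, assuming nb_clusters_processed
is only ever used for buffer indexing (the rest of the loop body stays as it
is):

    for (i = 0; i < left_to_process; i++) {
        /* point at the cluster handled by this iteration instead of relying
         * on the callee to index into the buffer
         */
        uint8_t *cluster_data = data +
            (ds->nb_clusters_processed + i) * s->cluster_size;

        ret = qcow2_try_dedup_cluster(bs,
                                      &ds->phash,
                                      sector_num + i * s->cluster_sectors,
                                      cluster_data,
                                      0);

        /* rest of the loop body unchanged */
    }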