On Wed, Feb 06, 2013 at 01:31:40PM +0100, Benoît Canet wrote:
> Signed-off-by: Benoit Canet <ben...@irqsave.net>
> ---
>  block/qcow2-dedup.c | 436 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2.h       |   5 +
>  2 files changed, 441 insertions(+)
> 
> diff --git a/block/qcow2-dedup.c b/block/qcow2-dedup.c
> index 4e99eb1..5901749 100644
> --- a/block/qcow2-dedup.c
> +++ b/block/qcow2-dedup.c
> @@ -117,3 +117,439 @@ fail:
>      *data = NULL;
>      return ret;
>  }
> +
> +/*
> + * Build a QCowHashNode structure
> + *
> + * @hash:               the given hash
> + * @physical_sect:      the cluster offset in the QCOW2 file
> + * @first_logical_sect: the first logical cluster offset written
> + * @ret:                the build QCowHashNode
> + */
> +static QCowHashNode *qcow2_dedup_build_qcow_hash_node(QCowHash *hash,
> +                                                      uint64_t physical_sect,
> +                                                      uint64_t first_logical_sect)

qcow2_hash_node_new() would be a shorter name.  It uses "new" like
coroutine_new(), qemu_bh_new(), etc.

> +{
> +    QCowHashNode *hash_node;
> +
> +    hash_node = g_new0(QCowHashNode, 1);
> +    memcpy(hash_node->hash.data, hash->data, HASH_LENGTH);
> +    hash_node->physical_sect = physical_sect;
> +    hash_node->first_logical_sect = first_logical_sect;
> +
> +    return hash_node;
> +}
> +
> +/*
> + * Compute the hash of a given cluster
> + *
> + * @data: a buffer containing the cluster data
> + * @hash: a QCowHash where to store the computed hash
> + * @ret:  0 on success, negative on error
> + */
> +static int qcow2_compute_cluster_hash(BlockDriverState *bs,
> +                                      QCowHash *hash,
> +                                      uint8_t *data)
> +{
> +    return 0;
> +}
> +
> +/*
> + * Get a QCowHashNode corresponding to a cluster data
> + *
> + * @phash:                 if phash can be used no hash is computed
> + * @data:                  a buffer containing the cluster
> + * @nb_clusters_processed: the number of cluster to skip in the buffer
> + * @err:                   Error code if any
> + * @ret:                   QCowHashNode of the duplicated cluster or NULL if not found
> + */
> +static QCowHashNode *qcow2_get_hash_node_for_cluster(BlockDriverState *bs,
> +                                                     QcowPersistantHash *phash,
> +                                                     uint8_t *data,
> +                                                     int nb_clusters_processed,
> +                                                     int *err)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    int ret = 0;
> +    *err = 0;
> +
> +    /* no hash has been provided compute it and store it for later usage */
> +    if (!phash->reuse) {
> +        ret = qcow2_compute_cluster_hash(bs,
> +                                         &phash->hash,
> +                                         data +
> +                                         nb_clusters_processed *
> +                                         s->cluster_size);
> +    }
> +
> +    /* do not reuse the hash anymore if it was precomputed */
> +    phash->reuse = false;
> +
> +    if (ret < 0) {
> +        *err = ret;
> +        return NULL;
> +    }
> +
> +    return g_tree_lookup(s->dedup_tree_by_hash, &phash->hash);
> +}
> +
> +/*
> + * Build a QCowHashNode from a given QCowHash and insert it into the tree
> + *
> + * @hash: the given QCowHash
> + */
> +static void qcow2_build_and_insert_hash_node(BlockDriverState *bs,
> +                                             QCowHash *hash)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    QCowHashNode *hash_node;
> +
> +    /* build the hash node with QCOW_FLAG_EMPTY as offsets so we will remember
> +     * to fill these field later with real values.
> +     */
> +    hash_node = qcow2_dedup_build_qcow_hash_node(hash,
> +                                                 QCOW_FLAG_EMPTY,
> +                                                 QCOW_FLAG_EMPTY);
> +    g_tree_insert(s->dedup_tree_by_hash, &hash_node->hash, hash_node);
> +}

Interesting function, it doesn't return hash_node.  Someone will have to look
it up.

Also, we put an incomplete node into dedup_tree_by_hash.  The caller must
ensure that other coroutines do not use dedup_tree_by_hash before we've filled
in real values, or the callers need to check for QCOW_FLAG_EMPTY.  Seems a
little risky, so why insert before completing the hash node?
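
To make this concrete, something like the following is what I would have
expected (untested sketch; qcow2_insert_hash_node() is just a name I made up):

static QCowHashNode *qcow2_insert_hash_node(BlockDriverState *bs,
                                            QCowHash *hash)
{
    BDRVQcowState *s = bs->opaque;
    QCowHashNode *hash_node;

    /* offsets stay QCOW_FLAG_EMPTY until the caller fills in real values */
    hash_node = qcow2_dedup_build_qcow_hash_node(hash,
                                                 QCOW_FLAG_EMPTY,
                                                 QCOW_FLAG_EMPTY);
    g_tree_insert(s->dedup_tree_by_hash, &hash_node->hash, hash_node);

    return hash_node;
}

Returning the node at least saves the extra lookup; whether it should go into
the tree before the offsets are known is a separate question.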

> +
> +/*
> + * Helper used to build a QCowHashElement
> + *
> + * @hash: the QCowHash to use
> + * @ret:  a newly allocated QCowHashElement containing the given hash
> + */
> +static QCowHashElement *qcow2_build_dedup_hash(QCowHash *hash)

I suggest _new() instead of _build_*().

> +{
> +    QCowHashElement *dedup_hash;
> +    dedup_hash = g_new0(QCowHashElement, 1);
> +    memcpy(dedup_hash->hash.data, hash->data, HASH_LENGTH);
> +    return dedup_hash;
> +}
> +
> +/*
> + * Helper used to link a deduplicated cluster in the l2
> + *
> + * @logical_sect:  the cluster sector seen by the guest
> + * @physical_sect: the cluster sector in the QCOW2 file
> + * @overwrite:     true if we must overwrite the L2 table entry
> + * @ret:
> + */
> +static int qcow2_dedup_link_l2(BlockDriverState *bs,
> +                               uint64_t logical_sect,
> +                               uint64_t physical_sect,
> +                               bool overwrite)
> +{
> +    QCowL2Meta m = {
> +        .alloc_offset = physical_sect << 9,
> +        .offset = logical_sect << 9,
> +        .nb_clusters = 1,

I wonder if there's any real-world performance advantage to doing
nb_clusters > 1 when possible...

> +        .nb_available = 0,
> +        .cow_start = {
> +            .offset = 0,
> +            .nb_sectors = 0,
> +        },
> +        .cow_end = {
> +            .offset = 0,
> +            .nb_sectors = 0,
> +        },
> +        .oflag_copied = false,
> +        .overwrite = overwrite,
> +    };
> +    return qcow2_alloc_cluster_link_l2(bs, &m);
> +}
> +
> +/* Clear the QCOW_OFLAG_COPIED from the first L2 entry written for a physical
> + * cluster.
> + *
> + * @hash_node: the duplicated hash node
> + * @ret:       0 on success, negative on error
> + */
> +static int qcow2_clear_l2_copied_flag_if_needed(BlockDriverState *bs,
> +                                                QCowHashNode *hash_node)
> +{
> +    int ret = 0;
> +    uint64_t first_logical_sect = hash_node->first_logical_sect;
> +
> +    /* QCOW_OFLAG_COPIED already cleared -> do nothing */
> +    if (!(first_logical_sect & QCOW_FLAG_FIRST)) {
> +        return 0;
> +    }
> +
> +    /* note : QCOW_FLAG_FIRST == QCOW_OFLAG_COPIED */

Please use QCOW_OFLAG_COPIED instead of redefining it.

> +    first_logical_sect &= ~QCOW_FLAG_FIRST;
> +
> +    /* overwrite first L2 entry to clear QCOW_FLAG_COPIED */
> +    ret = qcow2_dedup_link_l2(bs, first_logical_sect,
> +                              hash_node->physical_sect,
> +                              true);
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* remember that we dont't need to clear QCOW_OFLAG_COPIED again */

s/dont't/don't/

> +    hash_node->first_logical_sect &= first_logical_sect;

Just '=' would be clearer.

> +
> +    return 0;
> +}
> +
> +/* This function deduplicate a cluster
> + *
> + * @logical_sect: The logical sector of the write
> + * @hash_node:    The duplicated cluster hash node
> + * @ret:          0 on success, negative on error
> + */
> +static int qcow2_deduplicate_cluster(BlockDriverState *bs,
> +                                     uint64_t logical_sect,
> +                                     QCowHashNode *hash_node)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    int ret = 0;
> +
> +    /* create new L2 entry */
> +    ret = qcow2_dedup_link_l2(bs, logical_sect,
> +                              hash_node->physical_sect,
> +                              false);
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* Increment the refcount of the cluster */
> +    return update_refcount(bs,
> +                           (hash_node->physical_sect /
> +                            s->cluster_sectors) << s->cluster_bits,
> +                           1, 1);

I think it's better to increment the refcount and then store the new L2 entry.
This way a crash partway through leaves a leaked cluster instead of an L2
entry to an unallocated cluster (which could cause corruption further on).

Cache flushes may be necessary between these steps, I haven't checked.
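
Roughly what I have in mind, as an untested sketch (again, I haven't checked
which cache flushes would be needed between the two steps):

static int qcow2_deduplicate_cluster(BlockDriverState *bs,
                                     uint64_t logical_sect,
                                     QCowHashNode *hash_node)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    /* increment the refcount first: a crash after this point leaks a
     * cluster instead of leaving an L2 entry that points to an
     * unallocated cluster
     */
    ret = update_refcount(bs,
                          (hash_node->physical_sect /
                           s->cluster_sectors) << s->cluster_bits,
                          1, 1);
    if (ret < 0) {
        return ret;
    }

    /* now it is safe to publish the new L2 entry */
    return qcow2_dedup_link_l2(bs, logical_sect,
                               hash_node->physical_sect,
                               false);
}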

> +}
> +
> +/* This function tries to deduplicate a given cluster.
> + *
> + * @sector_num:            the logical sector number we are trying to deduplicate
> + * @phash:                 Used instead of computing the hash if provided
> + * @data:                  the buffer in which to look for a duplicated cluster
> + * @nb_clusters_processed: the number of cluster that must be skipped in data
> + * @ret:                   ret < 0 on error, 1 on deduplication else 0
> + */
> +static int qcow2_try_dedup_cluster(BlockDriverState *bs,
> +                                   QcowPersistantHash *phash,
> +                                   uint64_t sector_num,
> +                                   uint8_t *data,
> +                                   int nb_clusters_processed)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    int ret = 0;
> +    QCowHashNode *hash_node;
> +    uint64_t logical_sect;
> +    uint64_t existing_physical_offset;
> +    int pnum = s->cluster_sectors;
> +
> +    /* search the tree for duplicated cluster */
> +    hash_node = qcow2_get_hash_node_for_cluster(bs,
> +                                                phash,
> +                                                data,
> +                                                nb_clusters_processed,
> +                                                &ret);
> +
> +    /* we won't reuse the hash on error */
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* if cluster is not duplicated store hash for later usage */
> +    if (!hash_node) {
> +        qcow2_build_and_insert_hash_node(bs, &phash->hash);
> +        return 0;
> +    }
> +
> +    logical_sect = sector_num & ~(s->cluster_sectors - 1);
> +    ret = qcow2_get_cluster_offset(bs, logical_sect << 9,
> +                                   &pnum, &existing_physical_offset);
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* if we are rewriting the same cluster at the same place do nothing */
> +    if (existing_physical_offset == hash_node->physical_sect << 9) {
> +        return 1;
> +    }
> +
> +    /* take care of not having refcount > 1 and QCOW_OFLAG_COPIED at once */
> +    ret = qcow2_clear_l2_copied_flag_if_needed(bs, hash_node);
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* do the deduplication */
> +    ret = qcow2_deduplicate_cluster(bs, logical_sect,
> +                                    hash_node);

I think none of these drop the lock so we're guaranteed that metadata doesn't
change beneath our feet.  Good.

> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return 1;
> +}
> +
> +
> +static void add_hash_to_undedupable_list(BlockDriverState *bs,
> +                                         QCowDedupState *ds)
> +{
> +    /* memorise hash for later storage in gtree and disk */
> +    QCowHashElement *dedup_hash = qcow2_build_dedup_hash(&ds->phash.hash);
> +    QTAILQ_INSERT_TAIL(&ds->undedupables, dedup_hash, next);
> +}
> +
> +static int qcow2_dedup_starting_from_begining(BlockDriverState *bs,
> +                                              QCowDedupState *ds,
> +                                              uint64_t sector_num,
> +                                              uint8_t *data,
> +                                              int left_to_process)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    int i;
> +    int ret = 0;
> +
> +    for (i = 0; i < left_to_process; i++) {
> +        ret = qcow2_try_dedup_cluster(bs,
> +                                      &ds->phash,
> +                                      sector_num + i * s->cluster_sectors,
> +                                      data,
> +                                      ds->nb_clusters_processed + i);

I don't understand how the data argument works here.  The data for
left_to_process clusters has been pre-read into data?  Why is data not
incremented (data += s->cluster_size)?
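
If the answer is that qcow2_get_hash_node_for_cluster() indexes into the
buffer with nb_clusters_processed * s->cluster_size, then making that explicit
at the call site would help.  Untested sketch, assuming nb_clusters_processed
is only ever used for buffer indexing (the rest of the loop body stays as it
is):

    for (i = 0; i < left_to_process; i++) {
        /* point at the cluster handled by this iteration instead of relying
         * on the callee to index into the buffer
         */
        uint8_t *cluster_data = data +
            (ds->nb_clusters_processed + i) * s->cluster_size;

        ret = qcow2_try_dedup_cluster(bs,
                                      &ds->phash,
                                      sector_num + i * s->cluster_sectors,
                                      cluster_data,
                                      0);

        /* rest of the loop body unchanged */
    }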