Lazy refcounts is a performance optimization for qcow2 that postpones
refcount metadata updates and instead marks the image dirty.  In the
case of crash or power failure the image will be left in a dirty state
and repaired next time it is opened.

Reducing metadata I/O is important for cache=writethrough and
cache=directsync because these modes guarantee that data is on disk
after each write (hence we cannot take advantage of caching updates in
RAM).  Refcount metadata is not needed for guest->file block address
translation and therefore does not need to be on-disk at the time of
write completion - this is the motivation behind the lazy refcount
optimization.

The lazy refcount optimization must be enabled at image creation time:

  qemu-img create -f qcow2 -o compat=1.1,lazy_refcounts=on a.qcow2 10G
  qemu-system-x86_64 -drive if=virtio,file=a.qcow2,cache=writethrough

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 block/qcow2-cluster.c |    5 +++-
 block/qcow2.c         |   67 ++++++++++++++++++++++++++++++++++++++++++++++---
 block/qcow2.h         |    8 ++++++
 block_int.h           |   26 ++++++++++---------
 4 files changed, 89 insertions(+), 17 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d7e0e19..e179211 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -662,7 +662,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, 
QCowL2Meta *m)
         qcow2_cache_depends_on_flush(s->l2_table_cache);
     }
 
-    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
+    if (qcow2_need_accurate_refcounts(s)) {
+        qcow2_cache_set_dependency(bs, s->l2_table_cache,
+                                   s->refcount_block_cache);
+    }
     ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
     if (ret < 0) {
         goto err;
diff --git a/block/qcow2.c b/block/qcow2.c
index cc30784..b54955c 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -215,6 +215,40 @@ static void report_unsupported_feature(BlockDriverState 
*bs,
 }
 
 /*
+ * Sets the dirty bit and flushes afterwards if necessary.
+ *
+ * The incompatible_features bit is only set if the image file header was
+ * updated successfully.  Therefore it is not required to check the return
+ * value of this function.
+ */
+static int qcow2_mark_dirty(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t val;
+    int ret;
+
+    if (s->incompatible_features & QCOW2_INCOMPATIBLE_FEAT_DIRTY) {
+        return 0; /* already dirty */
+    }
+
+    val = cpu_to_be64(s->incompatible_features |
+                      QCOW2_INCOMPATIBLE_FEAT_DIRTY);
+    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
+                      &val, sizeof(val));
+    if (ret < 0) {
+        return ret;
+    }
+    ret = bdrv_flush(bs->file);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* Only treat image as dirty if the header was updated successfully */
+    s->incompatible_features |= QCOW2_INCOMPATIBLE_FEAT_DIRTY;
+    return 0;
+}
+
+/*
  * Clears the dirty bit and flushes before if necessary.  Only call this
  * function when there are no pending requests, it does not guard against
  * concurrent requests dirtying the image.
@@ -755,6 +789,11 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState 
*bs,
             goto fail;
         }
 
+        if (l2meta.nb_clusters > 0 &&
+            (s->compatible_features & QCOW2_COMPATIBLE_FEAT_LAZY_REFCOUNTS)) {
+            qcow2_mark_dirty(bs);
+        }
+
         cluster_offset = l2meta.cluster_offset;
         assert((cluster_offset & 511) == 0);
 
@@ -1175,6 +1214,11 @@ static int qcow2_create2(const char *filename, int64_t 
total_size,
         header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
     }
 
+    if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
+        header.compatible_features |=
+            cpu_to_be64(QCOW2_COMPATIBLE_FEAT_LAZY_REFCOUNTS);
+    }
+
     ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
     if (ret < 0) {
         goto out;
@@ -1288,6 +1332,8 @@ static int qcow2_create(const char *filename, 
QEMUOptionParameter *options)
                     options->value.s);
                 return -EINVAL;
             }
+        } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
+            flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
         }
         options++;
     }
@@ -1298,6 +1344,12 @@ static int qcow2_create(const char *filename, 
QEMUOptionParameter *options)
         return -EINVAL;
     }
 
+    if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
+        fprintf(stderr, "Lazy refcounts only supported with compatibility "
+                "level 1.1 and above (use compat=1.1 or greater)\n");
+        return -EINVAL;
+    }
+
     return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
                          cluster_size, prealloc, options, version);
 }
@@ -1484,10 +1536,12 @@ static coroutine_fn int 
qcow2_co_flush_to_os(BlockDriverState *bs)
         return ret;
     }
 
-    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
-    if (ret < 0) {
-        qemu_co_mutex_unlock(&s->lock);
-        return ret;
+    if (qcow2_need_accurate_refcounts(s)) {
+        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+        if (ret < 0) {
+            qemu_co_mutex_unlock(&s->lock);
+            return ret;
+        }
     }
     qemu_co_mutex_unlock(&s->lock);
 
@@ -1602,6 +1656,11 @@ static QEMUOptionParameter qcow2_create_options[] = {
         .type = OPT_STRING,
         .help = "Preallocation mode (allowed values: off, metadata)"
     },
+    {
+        .name = BLOCK_OPT_LAZY_REFCOUNTS,
+        .type = OPT_FLAG,
+        .help = "Postpone refcount updates",
+    },
     { NULL }
 };
 
diff --git a/block/qcow2.h b/block/qcow2.h
index 5c7cfb6..b8c0beb 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -111,6 +111,9 @@ enum {
 
     QCOW2_INCOMPATIBLE_FEAT_DIRTY   = 0x1,
     QCOW2_INCOMPATIBLE_FEAT_MASK    = QCOW2_INCOMPATIBLE_FEAT_DIRTY,
+
+    QCOW2_COMPATIBLE_FEAT_LAZY_REFCOUNTS = 0x1,
+    QCOW2_COMPATIBLE_FEAT_MASK      = QCOW2_COMPATIBLE_FEAT_LAZY_REFCOUNTS,
 };
 
 typedef struct Qcow2Feature {
@@ -240,6 +243,11 @@ static inline int qcow2_get_cluster_type(uint64_t l2_entry)
     }
 }
 
+/* Check whether refcounts are eager or lazy */
+static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s)
+{
+    return !(s->incompatible_features & QCOW2_INCOMPATIBLE_FEAT_DIRTY);
+}
 
 // FIXME Need qcow2_ prefix to global functions
 
diff --git a/block_int.h b/block_int.h
index 1fb5352..733aa71 100644
--- a/block_int.h
+++ b/block_int.h
@@ -31,8 +31,9 @@
 #include "qemu-timer.h"
 #include "qapi-types.h"
 
-#define BLOCK_FLAG_ENCRYPT     1
-#define BLOCK_FLAG_COMPAT6     4
+#define BLOCK_FLAG_ENCRYPT          1
+#define BLOCK_FLAG_COMPAT6          4
+#define BLOCK_FLAG_LAZY_REFCOUNTS   8
 
 #define BLOCK_IO_LIMIT_READ     0
 #define BLOCK_IO_LIMIT_WRITE    1
@@ -41,16 +42,17 @@
 #define BLOCK_IO_SLICE_TIME     100000000
 #define NANOSECONDS_PER_SECOND  1000000000.0
 
-#define BLOCK_OPT_SIZE          "size"
-#define BLOCK_OPT_ENCRYPT       "encryption"
-#define BLOCK_OPT_COMPAT6       "compat6"
-#define BLOCK_OPT_BACKING_FILE  "backing_file"
-#define BLOCK_OPT_BACKING_FMT   "backing_fmt"
-#define BLOCK_OPT_CLUSTER_SIZE  "cluster_size"
-#define BLOCK_OPT_TABLE_SIZE    "table_size"
-#define BLOCK_OPT_PREALLOC      "preallocation"
-#define BLOCK_OPT_SUBFMT        "subformat"
-#define BLOCK_OPT_COMPAT_LEVEL  "compat"
+#define BLOCK_OPT_SIZE              "size"
+#define BLOCK_OPT_ENCRYPT           "encryption"
+#define BLOCK_OPT_COMPAT6           "compat6"
+#define BLOCK_OPT_BACKING_FILE      "backing_file"
+#define BLOCK_OPT_BACKING_FMT       "backing_fmt"
+#define BLOCK_OPT_CLUSTER_SIZE      "cluster_size"
+#define BLOCK_OPT_TABLE_SIZE        "table_size"
+#define BLOCK_OPT_PREALLOC          "preallocation"
+#define BLOCK_OPT_SUBFMT            "subformat"
+#define BLOCK_OPT_COMPAT_LEVEL      "compat"
+#define BLOCK_OPT_LAZY_REFCOUNTS    "lazy_refcounts"
 
 typedef struct BdrvTrackedRequest BdrvTrackedRequest;
 
-- 
1.7.10


Reply via email to