On 05.03.21 18:35, Vladimir Sementsov-Ogievskiy wrote:
Compressed writes are not aligned to 512 bytes, which is very slow in
O_DIRECT mode. Let's use the cache.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com>
---
block/coroutines.h | 3 +
block/qcow2.h | 4 ++
block/qcow2-refcount.c | 10 +++
block/qcow2.c | 158 ++++++++++++++++++++++++++++++++++++++---
4 files changed, 164 insertions(+), 11 deletions(-)
diff --git a/block/coroutines.h b/block/coroutines.h
index 4cfb4946e6..cb8e05830e 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -66,4 +66,7 @@ int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
QEMUIOVector *qiov, int64_t pos);
+int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs);
+int generated_co_wrapper qcow2_flush_compressed_cache(BlockDriverState *bs);
+
#endif /* BLOCK_COROUTINES_INT_H */
diff --git a/block/qcow2.h b/block/qcow2.h
index e18d8dfbae..8b8fed0950 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -28,6 +28,7 @@
#include "crypto/block.h"
#include "qemu/coroutine.h"
#include "qemu/units.h"
+#include "qemu/seqcache.h"
#include "block/block_int.h"
//#define DEBUG_ALLOC
@@ -422,6 +423,9 @@ typedef struct BDRVQcow2State {
Qcow2CompressionType compression_type;
GHashTable *inflight_writes_counters;
+
+ SeqCache *compressed_cache;
+ int compressed_flush_ret;
} BDRVQcow2State;
typedef struct Qcow2COWRegion {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 464d133368..218917308e 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,6 +29,7 @@
#include "qemu/bswap.h"
#include "qemu/cutils.h"
#include "trace.h"
+#include "block/coroutines.h"
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
uint64_t max);
@@ -1040,6 +1041,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
qcow2_cache_discard(s->l2_table_cache, table);
}
+ if (s->compressed_cache) {
+ seqcache_discard_cluster(s->compressed_cache, cluster_offset);
+ }
+
if (s->discard_passthrough[type]) {
update_refcount_discard(bs, cluster_offset, s->cluster_size);
}
@@ -1349,6 +1354,11 @@ int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
BDRVQcow2State *s = bs->opaque;
int ret;
+ ret = qcow2_flush_compressed_cache(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
I wonder a bit why this function doesn’t work on a best-effort basis,
but that isn’t your problem.
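(By "best effort" I mean something like the sketch below: attempt every
flush, remember the first error, and return it at the end. Just an
illustration, assuming the rest of qcow2_write_caches() keeps writing the
refcount block cache as it does today; not something for this series.)

int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int result = 0;
    int ret;

    /* Try every cache; keep the first error instead of returning early */
    ret = qcow2_flush_compressed_cache(bs);
    if (ret < 0 && result == 0) {
        result = ret;
    }

    ret = qcow2_cache_write(bs, s->l2_table_cache);
    if (ret < 0 && result == 0) {
        result = ret;
    }

    ret = qcow2_cache_write(bs, s->refcount_block_cache);
    if (ret < 0 && result == 0) {
        result = ret;
    }

    return result;
}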
ret = qcow2_cache_write(bs, s->l2_table_cache);
if (ret < 0) {
return ret;
diff --git a/block/qcow2.c b/block/qcow2.c
index 6ee94421e0..5f3713cd6f 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -42,6 +42,7 @@
#include "qapi/qapi-visit-block-core.h"
#include "crypto.h"
#include "block/aio_task.h"
+#include "block/coroutines.h"
/*
Differences with QCOW:
@@ -1834,6 +1835,10 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
s->inflight_writes_counters =
g_hash_table_new_full(g_int64_hash, g_int64_equal, g_free, g_free);
+ if (!has_data_file(bs) && (bs->file->bs->open_flags & BDRV_O_NOCACHE)) {
Looks a bit like a layering violation, but I have no better proposal and
you gave your reasoning, so, OK.
I wonder whether this should be re-checked during reopen, though.
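(To illustrate the reopen point, a rough sketch of what re-checking could
look like, e.g. called from qcow2_reopen_commit(); the cache would have to
be flushed, and thus empty, before it may be dropped. The function name is
made up, this is not a concrete request:)

static void qcow2_update_compressed_cache(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    bool want_cache = !has_data_file(bs) &&
                      (bs->file->bs->open_flags & BDRV_O_NOCACHE);

    if (want_cache && !s->compressed_cache) {
        s->compressed_cache = seqcache_new(s->cluster_size);
    } else if (!want_cache && s->compressed_cache) {
        /* Only safe because the cache was flushed and is empty here */
        seqcache_free(s->compressed_cache);
        s->compressed_cache = NULL;
    }
}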
+ s->compressed_cache = seqcache_new(s->cluster_size);
+ }
+
return ret;
fail:
@@ -2653,6 +2658,91 @@ fail_nometa:
return ret;
}
+/*
+ * Flush one cluster of the compressed cache, if it exists. If @unfinished is
+ * true and there is no finished cluster to flush, flush the unfinished one.
+ * Otherwise flush only finished clusters.
+ *
+ * Return 0 if there is nothing to flush, 1 if one cluster was successfully
+ * flushed, and <0 on error.
+ */
+static int coroutine_fn
+qcow2_co_compressed_flush_one(BlockDriverState *bs, bool unfinished)
+{
+ BDRVQcow2State *s = bs->opaque;
+ int ret;
+ int64_t align = 1;
+ int64_t offset, bytes;
+ uint8_t *buf;
+
+ if (!s->compressed_cache) {
+ return 0;
+ }
+
+ if (!seqcache_get_next_flush(s->compressed_cache, &offset, &bytes, &buf,
+ &unfinished))
+ {
+ return 0;
+ }
+
+ qcow2_inflight_writes_inc(bs, offset, bytes);
+
+ /*
+ * Don't try to align up an unfinished cached cluster; a parallel write to
+ * it is possible! For finished host clusters, the data in the tail of the
+ * cluster will never be used, so take some good alignment for speed.
+ *
+ * Note also that seqcache guarantees that the allocated size of @buf is
+ * enough to fill the cluster up to the end, so we are safe to align up
+ * with align <= cluster_size.
+ */
+ if (!unfinished) {
+ align = MIN(s->cluster_size,
+ MAX(s->data_file->bs->bl.request_alignment,
+ bdrv_opt_mem_align(bs)));
+ }
+
I’d move the pre-write overlap check here, because its purpose is to
prevent writes to metadata structures as they are happening, not as they
are queued into a cache. (I’d say.)
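(I.e., roughly something like this before the bdrv_co_pwrite() below --
parameters as in the existing check in the compressed write path, if I
remember the signature correctly; whether to check just @bytes or the whole
aligned-up range that is actually written out is a detail:)

    ret = qcow2_pre_write_overlap_check(bs, 0, offset,
                                        QEMU_ALIGN_UP(offset + bytes, align)
                                            - offset,
                                        true);
    if (ret < 0) {
        /* Treat it like the write error below: record it in
         * s->compressed_flush_ret and drop the cluster */
    }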
+ BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+ ret = bdrv_co_pwrite(s->data_file, offset,
+ QEMU_ALIGN_UP(offset + bytes, align) - offset,
I remember you said you wanted to describe more of the properties of the
buffer returned by seqcache_get_next_flush(). That would be nice here,
because intuitively one would assume that the buffer is @bytes bytes
long, which aligning here will exceed. (It’s fine, but it would be
nicer if there was a comment that assured that it’s fine.)
+ buf, 0);
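(Something along these lines right next to the write would already help;
the wording is only a suggestion, derived from the "allocated size of @buf"
note above:)

    /*
     * @buf is allocated to cover the whole host cluster, not only @bytes
     * of valid data, so writing the aligned-up length never reads past
     * the end of the buffer; the padding bytes are simply unspecified.
     */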
+ if (ret < 0 && s->compressed_flush_ret == 0) {
+ /*
+ * The data that was "written" earlier is lost. We don't want to
+ * bother with storing it somehow to retry flushing later. Also, how
+ * much data would we have to store, if we didn't drop it after a
+ * failed flush? After this point qcow2_co_flush_compressed_cache()
+ * (and qcow2_flush_compressed_cache()) never return success.
+ */

Yeah, there is little point in trying something like writing it again
and again in the hope that maybe at some point it’ll work.
It is a shame that the error isn’t returned by the original compressed
write, but what can you do.
+ s->compressed_flush_ret = ret;
+ }
+
+ seqcache_discard_cluster(s->compressed_cache, offset);
+
+ qcow2_inflight_writes_dec(bs, offset, bytes);
+
+ return ret < 0 ? ret : 1;
+}
+
+int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs)
+{
+ BDRVQcow2State *s = bs->opaque;
+
+ if (s->compressed_cache) {
+ uint64_t nb_clusters = seqcache_nb_clusters(s->compressed_cache);
+
+ /*
+ * If parallel writes are possible, we don't want to loop forever. So
+ * flush only the clusters that were available at the start of the flush
+ * operation.
+ */
+ while (nb_clusters--) {
+ qcow2_co_compressed_flush_one(bs, true);
+ }
+ }
+
+ return s->compressed_flush_ret;
+}
+
static int qcow2_inactivate(BlockDriverState *bs)
{
BDRVQcow2State *s = bs->opaque;
@@ -2667,6 +2757,13 @@ static int qcow2_inactivate(BlockDriverState *bs)
bdrv_get_device_or_node_name(bs));
}
+ ret = qcow2_flush_compressed_cache(bs);
+ if (ret) {
+ result = ret;
+ error_report("Failed to flush the compressed write cache: %s",
+ strerror(-ret));
+ }
+
ret = qcow2_cache_flush(bs, s->l2_table_cache);
if (ret) {
result = ret;
@@ -2699,6 +2796,12 @@ static void qcow2_close(BlockDriverState *bs)
qcow2_inactivate(bs);
}
+ /*
+ * The cache should have been flushed in qcow2_inactivate() and should be
+ * empty in inactive mode, so we are safe to free it.
+ */
+ seqcache_free(s->compressed_cache);
+
cache_clean_timer_del(bs);
qcow2_cache_destroy(s->l2_table_cache);
qcow2_cache_destroy(s->refcount_block_cache);
@@ -4558,18 +4661,42 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
goto fail;
}
- qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
+ if (s->compressed_cache) {
Why is this conditional?
+ /*
+ * It's important to do seqcache_write() in the same critical section
+ * (under s->lock) as qcow2_alloc_compressed_cluster_offset(), so that
+ * the cache is filled sequentially.
+ */
Yes.
+ seqcache_write(s->compressed_cache, cluster_offset, out_len, out_buf);
- qemu_co_mutex_unlock(&s->lock);
+ qemu_co_mutex_unlock(&s->lock);
- BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
- ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+ ret = qcow2_co_compressed_flush_one(bs, false);
The qcow2 doc says a compressed cluster can span multiple host clusters.
I don’t know whether that can happen with this driver, but if it does,
wouldn’t that mean we’d need to flush two clusters here? Oh, no, never
mind. Only the first one would be finished and thus flushed, not the
second one.
I could have now removed the above paragraph, but it made me think, so I
kept it:
Hm. Actually, if we unconditionally flush here, doesn’t that mean that
we’ll never have a finished cluster in the cache for longer than the
span between the seqcache_write() and this
qcow2_co_compressed_flush_one()? I.e., the
qcow2_co_flush_compressed_cache() is supposed to never flush any
finished cluster, but only the currently active unfinished cluster (if
there is one), right?
+ if (ret < 0) {
+ /*
+ * If ret < 0, it is probably not this request that failed, but some
+ * previous one that was cached. Moreover, when we write to the
+ * cache we could probably always report success, because
+ * seqcache_write() never fails. Still, it's better to inform the
+ * compressed backup now than to wait until the final flush.
+ */
Yup.
+ goto fail;
+ }
+ } else {
+ qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
- qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+ qemu_co_mutex_unlock(&s->lock);
- if (ret < 0) {
- goto fail;
+ BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+ ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+
+ qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+
+ if (ret < 0) {
+ goto fail;
+ }
}
+
success:
ret = 0;
fail:
@@ -4681,10 +4808,19 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,
out_buf = qemu_blockalign(bs, s->cluster_size);
- BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
- ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
- if (ret < 0) {
- goto fail;
+ /*
+ * seqcache_read() may return fewer bytes than csize, as csize may exceed
+ * the actual compressed data size. So we are OK if seqcache_read()
+ * returns something > 0.
I was about to ask what happens when a compressed cluster spans two host
clusters (I could have imagined that in theory the second one could have
been discarded, but not the first one, so reading from the cache would
really be short -- we would have needed to check that we only fell short
in the range of 512 bytes, not more).
But then I realized that in this version of the series, all finished
clusters are immediately discarded and only the current unfinished one
is kept. Does it even make sense to try seqcache_read() here, then?
+ */
+ if (!s->compressed_cache ||
+ seqcache_read(s->compressed_cache, coffset, csize, buf) <= 0)
+ {

(As for writing, I don’t think this can ever occur.)

Max
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
+ ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
+ if (ret < 0) {
+ goto fail;
+ }
}
if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {