On 05.03.21 18:35, Vladimir Sementsov-Ogievskiy wrote:
Compressed writes are not aligned to 512 bytes, which is very slow in
O_DIRECT mode. Let's use the cache.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com>
---
block/coroutines.h | 3 +
block/qcow2.h | 4 ++
block/qcow2-refcount.c | 10 +++
block/qcow2.c | 158 ++++++++++++++++++++++++++++++++++++++---
4 files changed, 164 insertions(+), 11 deletions(-)
diff --git a/block/coroutines.h b/block/coroutines.h
index 4cfb4946e6..cb8e05830e 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -66,4 +66,7 @@ int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
QEMUIOVector *qiov, int64_t pos);
+int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs);
+int generated_co_wrapper qcow2_flush_compressed_cache(BlockDriverState *bs);
+
#endif /* BLOCK_COROUTINES_INT_H */
diff --git a/block/qcow2.h b/block/qcow2.h
index e18d8dfbae..8b8fed0950 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -28,6 +28,7 @@
#include "crypto/block.h"
#include "qemu/coroutine.h"
#include "qemu/units.h"
+#include "qemu/seqcache.h"
#include "block/block_int.h"
//#define DEBUG_ALLOC
@@ -422,6 +423,9 @@ typedef struct BDRVQcow2State {
Qcow2CompressionType compression_type;
GHashTable *inflight_writes_counters;
+
+ SeqCache *compressed_cache;
+ int compressed_flush_ret;
} BDRVQcow2State;
typedef struct Qcow2COWRegion {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 464d133368..218917308e 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,6 +29,7 @@
#include "qemu/bswap.h"
#include "qemu/cutils.h"
#include "trace.h"
+#include "block/coroutines.h"
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
uint64_t max);
@@ -1040,6 +1041,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
qcow2_cache_discard(s->l2_table_cache, table);
}
+ if (s->compressed_cache) {
+ seqcache_discard_cluster(s->compressed_cache, cluster_offset);
+ }
+
if (s->discard_passthrough[type]) {
update_refcount_discard(bs, cluster_offset, s->cluster_size);
}
@@ -1349,6 +1354,11 @@ int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
BDRVQcow2State *s = bs->opaque;
int ret;
+ ret = qcow2_flush_compressed_cache(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
I wonder a bit why this function doesn’t work on a best-effort basis,
but that isn’t your problem.
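(By "best effort" I mean something like the sketch below: attempt every
flush, remember the first error, and return it at the end. Just an
illustration, assuming the rest of qcow2_write_caches() keeps writing the
refcount block cache as it does today; not something for this series.)

int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int result = 0;
    int ret;

    /* Try every cache; keep the first error instead of returning early */
    ret = qcow2_flush_compressed_cache(bs);
    if (ret < 0 && result == 0) {
        result = ret;
    }

    ret = qcow2_cache_write(bs, s->l2_table_cache);
    if (ret < 0 && result == 0) {
        result = ret;
    }

    ret = qcow2_cache_write(bs, s->refcount_block_cache);
    if (ret < 0 && result == 0) {
        result = ret;
    }

    return result;
}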
ret = qcow2_cache_write(bs, s->l2_table_cache);
if (ret < 0) {
return ret;
diff --git a/block/qcow2.c b/block/qcow2.c
index 6ee94421e0..5f3713cd6f 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -42,6 +42,7 @@
#include "qapi/qapi-visit-block-core.h"
#include "crypto.h"
#include "block/aio_task.h"
+#include "block/coroutines.h"
/*
Differences with QCOW:
@@ -1834,6 +1835,10 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
s->inflight_writes_counters =
g_hash_table_new_full(g_int64_hash, g_int64_equal, g_free, g_free);
+ if (!has_data_file(bs) && (bs->file->bs->open_flags & BDRV_O_NOCACHE)) {
Looks a bit like a layering violation, but I have no better proposal and
you gave your reasoning, so, OK.
I wonder whether this should be re-checked during reopen, though.
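(To illustrate the reopen point, a rough sketch of what re-checking could
look like, e.g. called from qcow2_reopen_commit(); the cache would have to
be flushed, and thus empty, before it may be dropped. The function name is
made up, this is not a concrete request:)

static void qcow2_update_compressed_cache(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    bool want_cache = !has_data_file(bs) &&
                      (bs->file->bs->open_flags & BDRV_O_NOCACHE);

    if (want_cache && !s->compressed_cache) {
        s->compressed_cache = seqcache_new(s->cluster_size);
    } else if (!want_cache && s->compressed_cache) {
        /* Only safe because the cache was flushed and is empty here */
        seqcache_free(s->compressed_cache);
        s->compressed_cache = NULL;
    }
}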
+ s->compressed_cache = seqcache_new(s->cluster_size);
+ }
+
return ret;
fail:
@@ -2653,6 +2658,91 @@ fail_nometa:
return ret;
}
+/*
+ * Flush one cluster of the compressed cache, if it exists. If @unfinished is
+ * true and there is no finished cluster to flush, flush the unfinished one.
+ * Otherwise flush only finished clusters.
+ *
+ * Return 0 if there is nothing to flush, 1 if one cluster was successfully
+ * flushed, and <0 on error.
+ */
+static int coroutine_fn
+qcow2_co_compressed_flush_one(BlockDriverState *bs, bool unfinished)
+{
+ BDRVQcow2State *s = bs->opaque;
+ int ret;
+ int64_t align = 1;
+ int64_t offset, bytes;
+ uint8_t *buf;
+
+ if (!s->compressed_cache) {
+ return 0;
+ }
+
+ if (!seqcache_get_next_flush(s->compressed_cache, &offset, &bytes, &buf,
+ &unfinished))
+ {
+ return 0;
+ }
+
+ qcow2_inflight_writes_inc(bs, offset, bytes);
+
+ /*
+ * Don't try to align up an unfinished cached cluster; a parallel write to
+ * it is possible! For finished host clusters, the data in the tail of the
+ * cluster will never be used, so take some good alignment for speed.
+ *
+ * Note also that seqcache guarantees that the allocated size of @buf is
+ * enough to fill the cluster up to the end, so we are safe to align up
+ * with align <= cluster_size.
+ */
+ if (!unfinished) {
+ align = MIN(s->cluster_size,
+ MAX(s->data_file->bs->bl.request_alignment,
+ bdrv_opt_mem_align(bs)));
+ }
+
I’d move the pre-write overlap check here, because its purpose is to
prevent writes to metadata structures as they are happening, not as they
are queued into a cache. (I’d say.)
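(I.e., roughly something like this before the bdrv_co_pwrite() below --
parameters as in the existing check in the compressed write path, if I
remember the signature correctly; whether to check just @bytes or the whole
aligned-up range that is actually written out is a detail:)

    ret = qcow2_pre_write_overlap_check(bs, 0, offset,
                                        QEMU_ALIGN_UP(offset + bytes, align)
                                            - offset,
                                        true);
    if (ret < 0) {
        /* Treat it like the write error below: record it in
         * s->compressed_flush_ret and drop the cluster */
    }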
+ BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+ ret = bdrv_co_pwrite(s->data_file, offset,
+ QEMU_ALIGN_UP(offset + bytes, align) - offset,
I remember you said you wanted to describe more of the properties of the
buffer returned by seqcache_get_next_flush(). That would be nice here,
because intuitively one would assume that the buffer is @bytes bytes
long, which aligning here will exceed. (It’s fine, but it would be
nicer if there was a comment that assured that it’s fine.)
+ buf, 0);
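(Something along these lines right next to the write would already help;
the wording is only a suggestion, derived from the "allocated size of @buf"
note above:)

    /*
     * @buf is allocated to cover the whole host cluster, not only @bytes
     * of valid data, so writing the aligned-up length never reads past
     * the end of the buffer; the padding bytes are simply unspecified.
     */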
+ if (ret < 0 && s->compressed_flush_ret == 0) {
+ /*
+ * The data that was "written" earlier is lost. We don't want to
+ * bother with storing it somehow to retry flushing later. Also, how
+ * much data would we have to store, if we didn't drop it after a
+ * failed flush? After this point qcow2_co_flush_compressed_cache()
+ * (and qcow2_flush_compressed_cache()) never return success.
+ */

Yeah, there is little point in trying something like writing it again
and again in the hope that maybe at some point it’ll work.
It is a shame that the error isn’t returned by the original compressed
write, but what can you do.
+ s->compressed_flush_ret = ret;
+ }
+
+ seqcache_discard_cluster(s->compressed_cache, offset);
+
+ qcow2_inflight_writes_dec(bs, offset, bytes);
+
+ return ret < 0 ? ret : 1;
+}
+
+int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs)
+{
+ BDRVQcow2State *s = bs->opaque;
+
+ if (s->compressed_cache) {
+ uint64_t nb_clusters = seqcache_nb_clusters(s->compressed_cache);
+
+ /*
+ * If parallel writes are possible, we don't want to loop forever. So
+ * flush only the clusters that were available at the start of the flush
+ * operation.
+ */
+ while (nb_clusters--) {
+ qcow2_co_compressed_flush_one(bs, true);
+ }
+ }
+
+ return s->compressed_flush_ret;
+}
+
static int qcow2_inactivate(BlockDriverState *bs)
{
BDRVQcow2State *s = bs->opaque;
@@ -2667,6 +2757,13 @@ static int qcow2_inactivate(BlockDriverState *bs)
bdrv_get_device_or_node_name(bs));
}
+ ret = qcow2_flush_compressed_cache(bs);
+ if (ret) {
+ result = ret;
+ error_report("Failed to flush the compressed write cache: %s",
+ strerror(-ret));
+ }
+
ret = qcow2_cache_flush(bs, s->l2_table_cache);
if (ret) {
result = ret;
@@ -2699,6 +2796,12 @@ static void qcow2_close(BlockDriverState *bs)
qcow2_inactivate(bs);
}
+ /*
+ * The cache should have been flushed in qcow2_inactivate() and should be
+ * empty in inactive mode, so we are safe to free it.
+ */
+ seqcache_free(s->compressed_cache);
+
cache_clean_timer_del(bs);
qcow2_cache_destroy(s->l2_table_cache);
qcow2_cache_destroy(s->refcount_block_cache);
@@ -4558,18 +4661,42 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
goto fail;
}
- qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
+ if (s->compressed_cache) {
Why is this conditional?
+ /*
+ * It's important to do seqcache_write() in the same critical section
+ * (under s->lock) as qcow2_alloc_compressed_cluster_offset(), so that
+ * the cache is filled sequentially.
+ */
Yes.
+ seqcache_write(s->compressed_cache, cluster_offset, out_len, out_buf);
- qemu_co_mutex_unlock(&s->lock);
+ qemu_co_mutex_unlock(&s->lock);
- BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
- ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+ ret = qcow2_co_compressed_flush_one(bs, false);
The qcow2 doc says a compressed cluster can span multiple host clusters.
I don’t know whether that can happen with this driver, but if it does,
wouldn’t that mean we’d need to flush two clusters here? Oh, no, never
mind. Only the first one would be finished and thus flushed, not the
second one.
I could have now removed the above paragraph, but it made me think, so I
kept it:
Hm. Actually, if we unconditionally flush here, doesn’t that mean that
we’ll never have a finished cluster in the cache for longer than the
span between the seqcache_write() and this
qcow2_co_compressed_flush_one()? I.e., the
qcow2_co_flush_compressed_cache() is supposed to never flush any
finished cluster, but only the currently active unfinished cluster (if
there is one), right?
+ if (ret < 0) {
+ /*
+ * If ret < 0, it is probably not this request that failed, but some
+ * previous one that was cached. Moreover, when we write to the
+ * cache we could probably always report success, because
+ * seqcache_write() never fails. Still, it's better to inform the
+ * compressed backup now than to wait until the final flush.
+ */
Yup.
+ goto fail;
+ }
+ } else {
+ qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
- qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+ qemu_co_mutex_unlock(&s->lock);
- if (ret < 0) {
- goto fail;
+ BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+ ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+
+ qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+
+ if (ret < 0) {
+ goto fail;
+ }
}
+
success:
ret = 0;
fail:
@@ -4681,10 +4808,19 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,
out_buf = qemu_blockalign(bs, s->cluster_size);
- BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
- ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
- if (ret < 0) {
- goto fail;
+ /*
+ * seqcache_read() may return fewer bytes than csize, as csize may exceed
+ * the actual compressed data size. So we are OK if seqcache_read()
+ * returns something > 0.
I was about to ask what happens when a compressed cluster spans two host
clusters (I could have imagined that in theory the second one could have
been discarded, but not the first one, so reading from the cache would
really be short -- we would have needed to check that we only fell short
in the range of 512 bytes, not more).
But then I realized that in this version of the series, all finished
clusters are immediately discarded and only the current unfinished one
is kept. Does it even make sense to try seqcache_read() here, then?
+ */
+ if (!s->compressed_cache ||
+ seqcache_read(s->compressed_cache, coffset, csize, buf) <= 0)
+ {

(As for writing, I don’t think this can ever occur.)

Max
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
+ ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
+ if (ret < 0) {
+ goto fail;
+ }
}
if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {