qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 3/3] qemu-img: Speed up checksum


From: Nir Soffer
Subject: [PATCH 3/3] qemu-img: Speed up checksum
Date: Thu, 1 Sep 2022 17:32:23 +0300

Add coroutine based loop inspired by `qemu-img convert` design.

Changes compared to `qemu-img convert`:

- State for the entire image is kept in ImgChecksumState

- State for a single worker coroutine is kept in ImgChecksumWorker.

- "Writes" are always in-order, ensured using a queue.

- Calling block status once per image extent, when the current extent is
  consumed by the workers.

- Using a 1 MiB buffer size - testing shows that this gives the best read
  performance with both buffered and direct I/O.

- Number of coroutines is not configurable. Testing does not show
  improvement when using more than 8 coroutines.

- Progress includes the entire image, not only the allocated parts.

Comparing to the simple read loop shows that this version is up to 4.67
times faster when computing a checksum for an image full of zeroes. For
real images it is 1.59 times faster with direct I/O, and with buffered
I/O there is no difference.

Test results on Dell PowerEdge R640 in a CentOS Stream 9 container:

| image    | size | i/o       | before         | after          | change |
|----------|------|-----------|----------------|----------------|--------|
| zero [1] |   6g | buffered  | 1.600s ±0.014s | 0.342s ±0.016s |  x4.67 |
| zero     |   6g | direct    | 4.684s ±0.093s | 2.211s ±0.009s |  x2.12 |
| real [2] |   6g | buffered  | 1.841s ±0.075s | 1.806s ±0.036s |  x1.02 |
| real     |   6g | direct    | 3.094s ±0.079s | 1.947s ±0.017s |  x1.59 |
| nbd  [3] |   6g | buffered  | 2.455s ±0.183s | 1.808s ±0.016s |  x1.36 |
| nbd      |   6g | direct    | 3.540s ±0.020s | 1.749s ±0.018s |  x2.02 |

[1] raw image full of zeroes
[2] raw fedora 35 image with additional random data, 50% full
[3] image [2] exported by qemu-nbd via unix socket

Signed-off-by: Nir Soffer <nsoffer@redhat.com>
---
 qemu-img.c | 343 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 270 insertions(+), 73 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 7edcfe4bc8..bfa8e2862f 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1613,48 +1613,288 @@ out:
     qemu_vfree(buf2);
     blk_unref(blk2);
 out2:
     blk_unref(blk1);
 out3:
     qemu_progress_end();
     return ret;
 }
 
 #ifdef CONFIG_BLKHASH
+
+#define CHECKSUM_COROUTINES 8
+#define CHECKSUM_BUF_SIZE (1 * MiB)
+#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX)
+
+typedef struct ImgChecksumState ImgChecksumState;
+
+typedef struct ImgChecksumWorker {
+    QTAILQ_ENTRY(ImgChecksumWorker) entry;
+    ImgChecksumState *state;
+    Coroutine *co;
+    uint8_t *buf;
+
+    /* The current chunk. */
+    int64_t offset;
+    int64_t length;
+    bool zero;
+
+    /* Always true for zero extent, false for data extent. Set to true
+     * when reading the chunk completes. */
+    bool ready;
+} ImgChecksumWorker;
+
+struct ImgChecksumState {
+    const char *filename;
+    BlockBackend *blk;
+    BlockDriverState *bs;
+    int64_t total_size;
+
+    /* Current extent, modified in checksum_co_next. */
+    int64_t offset;
+    int64_t length;
+    bool zero;
+
+    int running_coroutines;
+    CoMutex lock;
+    ImgChecksumWorker workers[CHECKSUM_COROUTINES];
+
+    /* Ensure in-order updates. Updates are scheduled at the tail of the
+     * queue and processed from the head of the queue when a worker is
+     * ready. */
+    QTAILQ_HEAD(, ImgChecksumWorker) update_queue;
+
+    struct blkhash *hash;
+    int ret;
+};
+
+static int checksum_block_status(ImgChecksumState *s)
+{
+    int64_t length;
+    int status;
+
+    /* Must be called when current extent is consumed. */
+    assert(s->length == 0);
+
+    status = bdrv_block_status_above(s->bs, NULL, s->offset,
+                                     s->total_size - s->offset, &length, NULL,
+                                     NULL);
+    if (status < 0) {
+        error_report("Error checking status at offset %" PRId64 " for %s",
+                     s->offset, s->filename);
+        s->ret = status;
+        return -1;
+    }
+
+    assert(length > 0);
+
+    s->length = length;
+    s->zero = !!(status & BDRV_BLOCK_ZERO);
+
+    return 0;
+}
+
+/**
+ * Grab the next chunk from the current extent, getting the next extent if
+ * needed, and schedule the next update at the end of the update queue.
+ *
+ * Return true if the worker has work to do, false if the worker has
+ * finished or there was an error getting the next extent.
+ */
+static coroutine_fn bool checksum_co_next(ImgChecksumWorker *w)
+{
+    ImgChecksumState *s = w->state;
+
+    qemu_co_mutex_lock(&s->lock);
+
+    if (s->offset == s->total_size || s->ret != -EINPROGRESS) {
+        qemu_co_mutex_unlock(&s->lock);
+        return false;
+    }
+
+    if (s->length == 0 && checksum_block_status(s)) {
+        qemu_co_mutex_unlock(&s->lock);
+        return false;
+    }
+
+    /* Grab one chunk from current extent. */
+    w->offset = s->offset;
+    w->length = MIN(s->length,
+                    s->zero ?  CHECKSUM_ZERO_SIZE : CHECKSUM_BUF_SIZE);
+    w->zero = s->zero;
+    w->ready = s->zero;
+
+    /* Advance state to the next chunk. */
+    s->offset += w->length;
+    s->length -= w->length;
+
+    /* Schedule this chunk update. */
+    QTAILQ_INSERT_TAIL(&s->update_queue, w, entry);
+
+    qemu_co_mutex_unlock(&s->lock);
+
+    return true;
+}
+
+/* Wait until this chunk is in the head of the update queue. */
+static void coroutine_fn checksum_co_wait_for_update(ImgChecksumWorker *w)
+{
+    ImgChecksumState *s = w->state;
+
+    /* Must be called only when we are ready to update the hash. */
+    assert(w->ready);
+
+    while (QTAILQ_FIRST(&s->update_queue) != w) {
+        qemu_coroutine_yield();
+    }
+
+    QTAILQ_REMOVE(&s->update_queue, w, entry);
+}
+
+/* Enter the next worker coroutine if the worker is ready. */
+static void coroutine_fn checksum_co_enter_next(ImgChecksumWorker *w)
+{
+    ImgChecksumState *s = w->state;
+    ImgChecksumWorker *next;
+
+    if (!QTAILQ_EMPTY(&s->update_queue)) {
+        next = QTAILQ_FIRST(&s->update_queue);
+        if (next->ready)
+            qemu_coroutine_enter(next->co);
+    }
+}
+
+static int coroutine_fn checksum_co_read(ImgChecksumWorker *w)
+{
+    ImgChecksumState *s = w->state;
+    int err;
+
+    err = blk_co_pread(s->blk, w->offset, w->length, w->buf, 0);
+    if (err < 0) {
+        error_report("Error reading at offset %" PRId64 " of %s: %s",
+                     w->offset, s->filename, strerror(-err));
+        /* Unschedule this chunk. */
+        QTAILQ_REMOVE(&s->update_queue, w, entry);
+        s->ret = err;
+        return -1;
+    }
+
+    w->ready = true;
+
+    return 0;
+}
+
+static int checksum_update_hash(ImgChecksumWorker *w)
+{
+    ImgChecksumState *s = w->state;
+    int err;
+
+    if (w->zero) {
+        err = blkhash_zero(s->hash, w->length);
+        if (err) {
+            error_report("Error zeroing hash at offset %" PRId64 " of %s: %s",
+                         w->offset, s->filename, strerror(err));
+            s->ret = -err;
+            return -1;
+        }
+    } else {
+        err = blkhash_update(s->hash, w->buf, w->length);
+        if (err) {
+            error_report("Error updating hash at offset %" PRId64 " of %s: %s",
+                         w->offset, s->filename, strerror(err));
+            s->ret = -err;
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+static void coroutine_fn checksum_co_loop(void *opaque)
+{
+    ImgChecksumWorker *w = opaque;
+    ImgChecksumState *s = w->state;
+
+    s->running_coroutines++;
+    w->buf = blk_blockalign(s->blk, CHECKSUM_BUF_SIZE);
+
+    while (checksum_co_next(w)) {
+        if (!w->zero && checksum_co_read(w)) {
+            break;
+        }
+        checksum_co_wait_for_update(w);
+        if (s->ret == -EINPROGRESS) {
+            if (checksum_update_hash(w)) {
+                break;
+            }
+            qemu_progress_print(((float) w->length / s->total_size) * 100, 100);
+        }
+        checksum_co_enter_next(w);
+    }
+
+    qemu_vfree(w->buf);
+    w->co = NULL;
+    s->running_coroutines--;
+
+    if (s->running_coroutines == 0 && s->ret == -EINPROGRESS) {
+        /* the checksum job finished successfully */
+        s->ret = 0;
+    }
+}
+
+static int checksum_loop(ImgChecksumState *s)
+{
+    int i;
+
+    s->ret = -EINPROGRESS;
+
+    qemu_co_mutex_init(&s->lock);
+    QTAILQ_INIT(&s->update_queue);
+
+    for (i = 0; i < CHECKSUM_COROUTINES; i++) {
+        ImgChecksumWorker *w = &s->workers[i];
+        w->state = s;
+        w->co = qemu_coroutine_create(checksum_co_loop, w);
+        qemu_coroutine_enter(w->co);
+    }
+
+    while (s->running_coroutines) {
+        main_loop_wait(false);
+    }
+
+    return s->ret;
+}
+
 /*
  * Compute image checksum.
  */
 static int img_checksum(int argc, char **argv)
 {
     const char *digest_name = "sha256";
     const size_t block_size = 64 * KiB;
 
     const char *format = NULL;
     const char *cache = BDRV_DEFAULT_CACHE;
-    const char *filename;
-    BlockBackend *blk;
-    BlockDriverState *bs;
-    uint8_t *buf = NULL;
-    int64_t pnum;
     bool progress = false;
+    bool image_opts = false;
+
     int flags = 0;
-    bool writethrough;
-    int64_t total_size;
-    int64_t offset = 0;
+    bool writethrough = false;
     int c;
-    bool image_opts = false;
-    struct blkhash *h = NULL;
-    unsigned char digest[64];
-    unsigned int digest_len;
+
     int ret = 1;
     int err;
 
+    ImgChecksumState s = {0};
+    unsigned char digest[64];
+    unsigned int digest_len;
+
     for (;;) {
         static const struct option long_options[] = {
             {"help", no_argument, 0, 'h'},
             {"object", required_argument, 0, OPTION_OBJECT},
             {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
             {0, 0, 0, 0}
         };
         c = getopt_long(argc, argv, ":hf:T:p",
                         long_options, NULL);
         if (c == -1) {
@@ -1697,119 +1937,76 @@ static int img_checksum(int argc, char **argv)
         case OPTION_IMAGE_OPTS:
             image_opts = true;
             break;
         }
     }
 
     if (optind != argc - 1) {
         error_exit("Expecting image file name");
     }
 
-    filename = argv[optind++];
+    s.filename = argv[optind++];
 
     err = bdrv_parse_cache_mode(cache, &flags, &writethrough);
     if (err < 0) {
         error_report("Invalid source cache option: %s", cache);
         return ret;
     }
 
-    blk = img_open(image_opts, filename, format, flags, writethrough, false,
-                   false);
-    if (!blk) {
+    s.blk = img_open(image_opts, s.filename, format, flags, writethrough,
+                     false, false);
+    if (!s.blk) {
         return ret;
     }
 
     /* Initialize before using goto out. */
     qemu_progress_init(progress, 2.0);
 
-    bs = blk_bs(blk);
-    buf = blk_blockalign(blk, IO_BUF_SIZE);
+    s.bs = blk_bs(s.blk);
 
-    total_size = blk_getlength(blk);
-    if (total_size < 0) {
+    s.total_size = blk_getlength(s.blk);
+    if (s.total_size < 0) {
         error_report("Can't get size of %s: %s",
-                     filename, strerror(-total_size));
+                     s.filename, strerror(-s.total_size));
         goto out;
     }
 
-    h = blkhash_new(block_size, digest_name);
-    if (!h) {
+    s.hash = blkhash_new(block_size, digest_name);
+    if (!s.hash) {
         error_report("Can't create blkhash: %s", strerror(errno));
         goto out;
     }
 
     qemu_progress_print(0, 100);
 
-    while (offset < total_size) {
-        int status;
-        int64_t chunk;
-
-        status = bdrv_block_status_above(bs, NULL, offset,
-                                         total_size - offset, &pnum, NULL,
-                                         NULL);
-        if (status < 0) {
-            error_report("Error checking status at offset %" PRId64 " for %s",
-                         offset, filename);
-            goto out;
-        }
-
-        assert(pnum);
-        chunk = pnum;
-
-        if (status & BDRV_BLOCK_ZERO) {
-            chunk = MIN(chunk, SIZE_MAX);
-            err = blkhash_zero(h, chunk);
-            if (err) {
-                error_report("Error zeroing hash at offset %" PRId64
-                             " of %s: %s",
-                             offset, filename, strerror(err));
-                goto out;
-            }
-        } else {
-            chunk = MIN(chunk, IO_BUF_SIZE);
-            err = blk_pread(blk, offset, chunk, buf, 0);
-            if (err < 0) {
-                error_report("Error reading at offset %" PRId64 " of %s: %s",
-                             offset, filename, strerror(-err));
-                goto out;
-            }
-            err = blkhash_update(h, buf, chunk);
-            if (err) {
-                error_report("Error updating hash at offset %" PRId64
-                             " of %s: %s",
-                             offset, filename, strerror(err));
-                goto out;
-            }
-        }
-
-        offset += chunk;
-        qemu_progress_print(((float) chunk / total_size) * 100, 100);
+    err = checksum_loop(&s);
+    if (err) {
+        goto out;
     }
 
-    err = blkhash_final(h, digest, &digest_len);
+    err = blkhash_final(s.hash, digest, &digest_len);
     if (err) {
         error_report("Error finalizing hash of %s: %s",
-                     filename, strerror(err));
+                     s.filename, strerror(err));
         goto out;
     }
 
     for (unsigned i = 0; i < digest_len; i++) {
         printf("%02x", digest[i]);
     }
-    printf("  %s%s", filename, progress ? "" : "\n");
+    printf("  %s%s", s.filename, progress ? "" : "\n");
 
     ret = 0;
 
 out:
-    blkhash_free(h);
-    qemu_vfree(buf);
-    blk_unref(blk);
+    blkhash_free(s.hash);
+    blk_unref(s.blk);
     qemu_progress_end();
 
     return ret;
 }
 #endif /* CONFIG_BLKHASH */
 
 /* Convenience wrapper around qmp_block_dirty_bitmap_merge */
 static void do_dirty_bitmap_merge(const char *dst_node, const char *dst_name,
                                   const char *src_node, const char *src_name,
                                   Error **errp)
-- 
2.37.2




reply via email to

[Prev in Thread] Current Thread [Next in Thread]