[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [RFC PATCH] drive-backup 'stream' mode
From: |
Wolfgang Richter |
Subject: |
[Qemu-devel] [RFC PATCH] drive-backup 'stream' mode |
Date: |
Fri, 11 Oct 2013 11:18:09 -0400 |
Idea: Introduce a mode for drive-backup that duplicates writes to
another target, not CoW. It is useful for introspecting (my use
case), and for keeping a remote block device in sync with writes
(helps with migration or backup).
Issue with current modes: All of the current modes are well-designed
to support point-in-time snapshots, but none of them handle keeping
another drive up-to-date as new writes continuously occur. The 'None'
mode documentation is a bit ambiguous in this regard, but what it
actually implements is a very low overhead CoW snapshot.
Patch: Fixes ambiguity in the 'None' mode documentation, introduces a
new mode 'stream' which duplicates writes without reading any data
from the original disk.
I put the logic for copying the write into a new coroutine called
'backup_do_stream' as it needs almost nothing from the original
'backup_do_cow' function (no bit map, no reads from a block device,
etc.). The other major change is that tracked requests also contain a
handle to the QIOV involved in the write (and it is passed along).
This is based off of v1.6.0 code.
diff --git a/block.c b/block.c
index 01b66d8..159f825 100644
--- a/block.c
+++ b/block.c
@@ -1872,12 +1872,14 @@ static void tracked_request_end(BdrvTrackedRequest *req)
static void tracked_request_begin(BdrvTrackedRequest *req,
BlockDriverState *bs,
int64_t sector_num,
- int nb_sectors, bool is_write)
+ int nb_sectors, bool is_write,
+ QEMUIOVector *qiov)
{
*req = (BdrvTrackedRequest){
.bs = bs,
.sector_num = sector_num,
.nb_sectors = nb_sectors,
+ .qiov = qiov,
.is_write = is_write,
.co = qemu_coroutine_self(),
};
@@ -2528,7 +2530,7 @@ static int coroutine_fn
bdrv_co_do_readv(BlockDriverState *bs,
wait_for_overlapping_requests(bs, sector_num, nb_sectors);
}
- tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
+ tracked_request_begin(&req, bs, sector_num, nb_sectors, false, NULL);
if (flags & BDRV_REQ_COPY_ON_READ) {
int pnum;
@@ -2634,7 +2636,7 @@ static int coroutine_fn
bdrv_co_do_writev(BlockDriverState *bs,
wait_for_overlapping_requests(bs, sector_num, nb_sectors);
}
- tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
+ tracked_request_begin(&req, bs, sector_num, nb_sectors, true, qiov);
ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
diff --git a/block/backup.c b/block/backup.c
index 6ae8a05..686a53f 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -84,6 +84,37 @@ static void cow_request_end(CowRequest *req)
qemu_co_queue_restart_all(&req->wait_queue);
}
+/* Stream mode: duplicate a guest write (@qiov, @nb_sectors sectors starting
+ * at @sector_num) to the job's target. Returns bdrv_co_writev()'s result. */
+static int coroutine_fn backup_do_stream(BlockDriverState *bs,
+                                         int64_t sector_num, int nb_sectors,
+                                         QEMUIOVector *qiov)
+{
+    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    CowRequest cow_request;
+    int ret = 0;
+    int64_t start = sector_num, end = sector_num + nb_sectors;
+
+    qemu_co_rwlock_rdlock(&job->flush_rwlock);
+
+    wait_for_overlapping_requests(job, start, end);
+    cow_request_begin(&cow_request, job, start, end);
+
+    ret = bdrv_co_writev(job->target, sector_num, nb_sectors, qiov);
+
+    /* Publish progress; guest I/O counts too. Advance by the sectors written,
+     * not by their start position. The offset field is an opaque progress
+     * value, not a disk offset. */
+    job->sectors_read += nb_sectors;
+    job->common.offset += nb_sectors * BDRV_SECTOR_SIZE;
+
+    cow_request_end(&cow_request);
+
+    qemu_co_rwlock_unlock(&job->flush_rwlock);
+
+    return ret;
+}
+
static int coroutine_fn backup_do_cow(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
bool *error_is_read)
@@ -181,7 +212,12 @@ static int coroutine_fn backup_before_write_notify(
{
BdrvTrackedRequest *req = opaque;
- return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL);
+    BackupBlockJob *job = (BackupBlockJob *)req->bs->job;
+    /* Dispatch on the job's sync mode, not on the enum constant itself. */
+    return job->sync_mode == MIRROR_SYNC_MODE_STREAM
+        ? backup_do_stream(req->bs, req->sector_num, req->nb_sectors,
+                           req->qiov)
+        : backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL);
}
static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -248,7 +284,8 @@ static void coroutine_fn backup_run(void *opaque)
bdrv_add_before_write_notifier(bs, &before_write);
- if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
+ if (job->sync_mode == MIRROR_SYNC_MODE_NONE ||
+ job->sync_mode == MIRROR_SYNC_MODE_STREAM) {
while (!block_job_is_cancelled(&job->common)) {
/* Yield until the job is cancelled. We just let our before_write
* notify callback service CoW requests. */
index e45f2a0..13ab769 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -63,6 +63,7 @@ typedef struct BdrvTrackedRequest {
BlockDriverState *bs;
int64_t sector_num;
int nb_sectors;
+ QEMUIOVector *qiov;
bool is_write;
QLIST_ENTRY(BdrvTrackedRequest) list;
Coroutine *co; /* owner, used for deadlock detection */
diff --git a/qapi-schema.json b/qapi-schema.json
index a51f7d2..9a3008a 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1311,12 +1311,14 @@
#
# @full: copies data from all images to the destination
#
-# @none: only copy data written from now on
+# @none: only copy old data before it is overwritten from now on (CoW)
+#
+# @stream: copy every new write to target
#
# Since: 1.3
##
{ 'enum': 'MirrorSyncMode',
- 'data': ['top', 'full', 'none'] }
+ 'data': ['top', 'full', 'none', 'stream'] }
##
# @BlockJobInfo:
diff --git a/qmp-commands.hx b/qmp-commands.hx
index cf47e3f..39056bd 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -944,7 +944,8 @@ Arguments:
(json-string, optional)
- "sync": what parts of the disk image should be copied to the destination;
possibilities include "full" for all the disk, "top" for only the sectors
- allocated in the topmost image, or "none" to only replicate new I/O
+ allocated in the topmost image, "none" to CoW on new I/O, or "stream"
+ to send every new write to the target
(MirrorSyncMode).
- "mode": whether and how QEMU should create a new image
(NewImageMode, optional, default 'absolute-paths')
- [Qemu-devel] [RFC PATCH] drive-backup 'stream' mode,
Wolfgang Richter <=