Re: [PATCH 2/2] qcow2: improve savevm performance
From: Denis V. Lunev
Subject: Re: [PATCH 2/2] qcow2: improve savevm performance
Date: Thu, 11 Jun 2020 11:25:35 +0300
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.8.0
On 6/11/20 11:04 AM, Vladimir Sementsov-Ogievskiy wrote:
> 10.06.2020 22:00, Denis V. Lunev wrote:
>> This patch does two things:
>> - it creates an intermediate buffer for all writes from the QEMU migration
>> code to the QCOW2 image,
>> - this buffer is sent to disk asynchronously, allowing several writes to
>> run in parallel.
>>
>> In general, the migration code is remarkably inefficient (by observation):
>> buffers are not aligned and are sent in arbitrary pieces, often less than
>> 100 bytes per chunk, which results in read-modify-write operations when
>> the image is opened without caching. It should also be noted that all
>> writes go into unallocated image blocks, which additionally suffer from
>> partial writes to such new clusters.
>>
>> Snapshot creation time (2 GB Fedora-31 VM running over NVMe storage):
>>                  original    fixed
>> cached:            1.79s     1.27s
>> non-cached:        3.29s     0.81s
>>
>> The difference over HDD would be more significant :)
>>
>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>
> If I follow correctly, you make qcow2_save_vmstate implicitly asynchronous:
> it may return immediately after creating a task, and the task keeps
> executing in parallel.
>
> I think the block layer is unprepared for such behavior; it relies on the
> fact that .bdrv_save_vmstate is synchronous.
>
> For example, look at bdrv_co_rw_vmstate(). It calls drv->bdrv_save_vmstate
> inside a bdrv_inc_in_flight()/bdrv_dec_in_flight() pair. That means that
> with this patch we may break drained sections.
>
Drained sections are not involved in the equation - the guest
is stopped at that moment.
If we are talking about the in_flight count, it should not be a problem
even if the guest were running. We could just put the inc/dec into
qcow2_co_vmstate_task_entry().
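A minimal sketch of that, based on the task entry function from this
patch (illustrative only, not a respin):

static coroutine_fn int qcow2_co_vmstate_task_entry(AioTask *task)
{
    int err = 0;
    Qcow2VMStateTask *t = container_of(task, Qcow2VMStateTask, task);

    /* keep the request accounted for while it runs in the background */
    bdrv_inc_in_flight(t->bs);

    if (t->bytes != 0) {
        QEMUIOVector local_qiov;
        qemu_iovec_init_buf(&local_qiov, t->buf, t->bytes);
        err = t->bs->drv->bdrv_co_pwritev_part(t->bs, t->offset, t->bytes,
                                               &local_qiov, 0, 0);
    }

    qemu_vfree(t->buf);
    bdrv_dec_in_flight(t->bs);
    return err;
}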
> Next, it's a kind of cache for the vmstate-write operation. It seems to me
> that it's not directly related to qcow2. So we could implement it in the
> generic block layer, where we can handle in_flight requests. Can we keep
> the .bdrv_save_vmstate handlers of the format drivers as is, keep them
> synchronous, but instead change the generic interface to be (optionally?)
> cached?
This has already been discussed in the previous thread. .bdrv_save_vmstate
is implemented in QCOW2 and sheepdog only, so the other formats are
mostly not interested.
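For reference, the buffering part of such a generic layer would look
roughly like the sketch below. This is purely hypothetical: BdrvVMStateBuf
and bdrv_co_buffered_save_vmstate are made-up names, the AioTaskPool
parallelism is left out, and the driver handler is still called
synchronously.

typedef struct BdrvVMStateBuf {
    uint8_t *buf;     /* aligned staging buffer */
    size_t bytes;     /* bytes currently accumulated */
    size_t size;      /* buffer capacity, e.g. 1 MiB */
    int64_t pos;      /* vmstate offset corresponding to buf[0] */
} BdrvVMStateBuf;

static coroutine_fn int bdrv_co_buffered_save_vmstate(BlockDriverState *bs,
                                                      BdrvVMStateBuf *b,
                                                      QEMUIOVector *qiov,
                                                      int64_t pos)
{
    size_t off = 0;

    /* Assumes strictly sequential writes, which is what migration emits. */
    while (off < qiov->size) {
        size_t to_copy = MIN(qiov->size - off, b->size - b->bytes);

        qemu_iovec_to_buf(qiov, off, b->buf + b->bytes, to_copy);
        b->bytes += to_copy;
        off += to_copy;

        if (b->bytes == b->size) {
            QEMUIOVector local_qiov;
            int ret;

            /* Flush through the driver's synchronous handler. */
            qemu_iovec_init_buf(&local_qiov, b->buf, b->bytes);
            ret = bs->drv->bdrv_save_vmstate(bs, &local_qiov, b->pos);
            if (ret < 0) {
                return ret;
            }
            b->pos += b->bytes;
            b->bytes = 0;
        }
    }
    return 0;
}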
>
>> ---
>> block/qcow2.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++-
>> block/qcow2.h | 4 ++
>> 2 files changed, 113 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index 0cd2e6757e..e6232f32e2 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -4797,11 +4797,43 @@ static int qcow2_make_empty(BlockDriverState *bs)
>> return ret;
>> }
>> +
>> +typedef struct Qcow2VMStateTask {
>> + AioTask task;
>> +
>> + BlockDriverState *bs;
>> + int64_t offset;
>> + void *buf;
>> + size_t bytes;
>> +} Qcow2VMStateTask;
>> +
>> +typedef struct Qcow2SaveVMState {
>> + AioTaskPool *pool;
>> + Qcow2VMStateTask *t;
>> +} Qcow2SaveVMState;
>> +
>> static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
>> {
>> BDRVQcow2State *s = bs->opaque;
>> + Qcow2SaveVMState *state = s->savevm_state;
>> int ret;
>> + if (state != NULL) {
>> + aio_task_pool_start_task(state->pool, &state->t->task);
>> +
>> + aio_task_pool_wait_all(state->pool);
>> + ret = aio_task_pool_status(state->pool);
>> +
>> + aio_task_pool_free(state->pool);
>> + g_free(state);
>> +
>> + s->savevm_state = NULL;
>> +
>> + if (ret < 0) {
>> + return ret;
>> + }
>> + }
>> +
>> qemu_co_mutex_lock(&s->lock);
>> ret = qcow2_write_caches(bs);
>> qemu_co_mutex_unlock(&s->lock);
>> @@ -5098,14 +5130,89 @@ static int qcow2_has_zero_init(BlockDriverState *bs)
>> }
>> }
>> +
>> +static coroutine_fn int qcow2_co_vmstate_task_entry(AioTask *task)
>> +{
>> + int err = 0;
>> + Qcow2VMStateTask *t = container_of(task, Qcow2VMStateTask, task);
>> +
>> + if (t->bytes != 0) {
>> + QEMUIOVector local_qiov;
>> + qemu_iovec_init_buf(&local_qiov, t->buf, t->bytes);
>> + err = t->bs->drv->bdrv_co_pwritev_part(t->bs, t->offset, t->bytes,
>> + &local_qiov, 0, 0);
>> + }
>> +
>> + qemu_vfree(t->buf);
>> + return err;
>> +}
>> +
>> +static Qcow2VMStateTask *qcow2_vmstate_task_create(BlockDriverState *bs,
>> + int64_t pos, size_t size)
>> +{
>> + BDRVQcow2State *s = bs->opaque;
>> + Qcow2VMStateTask *t = g_new(Qcow2VMStateTask, 1);
>> +
>> + *t = (Qcow2VMStateTask) {
>> + .task.func = qcow2_co_vmstate_task_entry,
>> + .buf = qemu_blockalign(bs, size),
>> + .offset = qcow2_vm_state_offset(s) + pos,
>> + .bs = bs,
>> + };
>> +
>> + return t;
>> +}
>> +
>> static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
>> int64_t pos)
>> {
>> BDRVQcow2State *s = bs->opaque;
>> + Qcow2SaveVMState *state = s->savevm_state;
>> + Qcow2VMStateTask *t;
>> + size_t buf_size = MAX(s->cluster_size, 1 * MiB);
>> + size_t to_copy;
>> + size_t off;
>> BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
>> - return bs->drv->bdrv_co_pwritev_part(bs, qcow2_vm_state_offset(s) + pos,
>> - qiov->size, qiov, 0, 0);
>> +
>> + if (state == NULL) {
>> + state = g_new(Qcow2SaveVMState, 1);
>> + *state = (Qcow2SaveVMState) {
>> + .pool = aio_task_pool_new(QCOW2_MAX_WORKERS),
>> + .t = qcow2_vmstate_task_create(bs, pos, buf_size),
>> + };
>> +
>> + s->savevm_state = state;
>> + }
>> +
>> + if (aio_task_pool_status(state->pool) != 0) {
>> + return aio_task_pool_status(state->pool);
>> + }
>> +
>> + t = state->t;
>> + if (t->offset + t->bytes != qcow2_vm_state_offset(s) + pos) {
>> + /* Normally this branch is not reachable from migration */
>> + return bs->drv->bdrv_co_pwritev_part(bs,
>> + qcow2_vm_state_offset(s) + pos, qiov->size, qiov, 0, 0);
>> + }
>> +
>> + off = 0;
>> + while (1) {
>> + to_copy = MIN(qiov->size - off, buf_size - t->bytes);
>> + qemu_iovec_to_buf(qiov, off, t->buf + t->bytes, to_copy);
>> + t->bytes += to_copy;
>> + if (t->bytes < buf_size) {
>> + return 0;
>> + }
>> +
>> + aio_task_pool_start_task(state->pool, &t->task);
>> +
>> + pos += to_copy;
>> + off += to_copy;
>> + state->t = t = qcow2_vmstate_task_create(bs, pos, buf_size);
>> + }
>> +
>> + return 0;
>> }
>> static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
>> diff --git a/block/qcow2.h b/block/qcow2.h
>> index 7ce2c23bdb..146cfed739 100644
>> --- a/block/qcow2.h
>> +++ b/block/qcow2.h
>> @@ -291,6 +291,8 @@ typedef struct Qcow2BitmapHeaderExt {
>> #define QCOW2_MAX_THREADS 4
>> +typedef struct Qcow2SaveVMState Qcow2SaveVMState;
>> +
>> typedef struct BDRVQcow2State {
>> int cluster_bits;
>> int cluster_size;
>> @@ -384,6 +386,8 @@ typedef struct BDRVQcow2State {
>> * is to convert the image with the desired compression type set.
>> */
>> Qcow2CompressionType compression_type;
>> +
>> + Qcow2SaveVMState *savevm_state;
>> } BDRVQcow2State;
>> typedef struct Qcow2COWRegion {
>>
>
>