qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v10 18/21] job.c: enable job lock/unlock and remove Aiocontext locks


From: Vladimir Sementsov-Ogievskiy
Subject: Re: [PATCH v10 18/21] job.c: enable job lock/unlock and remove Aiocontext locks
Date: Wed, 27 Jul 2022 18:53:10 +0300
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Thunderbird/91.8.1

On 7/25/22 10:38, Emanuele Giuseppe Esposito wrote:
Change the job_{lock/unlock} and macros to use job_mutex.

Now that they are not nop anymore, remove the aiocontext
to avoid deadlocks.

Therefore:
- when possible, remove completely the aiocontext lock/unlock pair
- if it is used by some other function too, reduce the locking
   section as much as possible, leaving the job API outside.
- change AIO_WAIT_WHILE in AIO_WAIT_WHILE_UNLOCKED, since we
   are not using the aiocontext lock anymore

There is only one JobDriver callback, ->free() that assumes that
the aiocontext lock is held (because it calls bdrv_unref), so for
now keep that under aiocontext lock.

Also remove real_job_{lock/unlock}, as they are replaced by the
public functions.

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
  blockdev.c                       | 74 +++++-----------------------
  include/qemu/job.h               | 22 ++++-----
  job-qmp.c                        | 46 +++--------------
  job.c                            | 84 ++++++--------------------------
  tests/unit/test-bdrv-drain.c     |  4 +-
  tests/unit/test-block-iothread.c |  2 +-
  tests/unit/test-blockjob.c       | 15 ++----
  7 files changed, 52 insertions(+), 195 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 5b79093155..2cd84d206c 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -155,12 +155,7 @@ void blockdev_mark_auto_del(BlockBackend *blk)
      for (job = block_job_next_locked(NULL); job;
           job = block_job_next_locked(job)) {
          if (block_job_has_bdrv(job, blk_bs(blk))) {
-            AioContext *aio_context = job->job.aio_context;
-            aio_context_acquire(aio_context);
-
              job_cancel_locked(&job->job, false);
-
-            aio_context_release(aio_context);
          }
      }
@@ -1836,14 +1831,7 @@ static void drive_backup_abort(BlkActionState *common)
      DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common);
if (state->job) {
-        AioContext *aio_context;
-
-        aio_context = bdrv_get_aio_context(state->bs);
-        aio_context_acquire(aio_context);
-
          job_cancel_sync(&state->job->job, true);
-
-        aio_context_release(aio_context);
      }
  }
@@ -1937,14 +1925,7 @@ static void blockdev_backup_abort(BlkActionState *common)
      BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, 
common);
if (state->job) {
-        AioContext *aio_context;
-
-        aio_context = bdrv_get_aio_context(state->bs);
-        aio_context_acquire(aio_context);
-
          job_cancel_sync(&state->job->job, true);
-
-        aio_context_release(aio_context);
      }
  }
@@ -3306,19 +3287,14 @@ out:
  }
/*
- * Get a block job using its ID and acquire its AioContext.
- * Called with job_mutex held.
+ * Get a block job using its ID. Called with job_mutex held.
   */
-static BlockJob *find_block_job_locked(const char *id,
-                                       AioContext **aio_context,
-                                       Error **errp)
+static BlockJob *find_block_job_locked(const char *id, Error **errp)
  {
      BlockJob *job;
    assert(id != NULL);

-    *aio_context = NULL;
-
      job = block_job_get_locked(id);
if (!job) {
@@ -3327,36 +3303,30 @@ static BlockJob *find_block_job_locked(const char *id,
          return NULL;
      }
- *aio_context = block_job_get_aio_context(job);
-    aio_context_acquire(*aio_context);
-
      return job;
  }
void qmp_block_job_set_speed(const char *device, int64_t speed, Error **errp)
  {
-    AioContext *aio_context;
      BlockJob *job;
JOB_LOCK_GUARD();
-    job = find_block_job_locked(device, &aio_context, errp);
+    job = find_block_job_locked(device, errp);
if (!job) {
          return;
      }
block_job_set_speed_locked(job, speed, errp);
-    aio_context_release(aio_context);
  }
void qmp_block_job_cancel(const char *device,
                            bool has_force, bool force, Error **errp)
  {
-    AioContext *aio_context;
      BlockJob *job;
JOB_LOCK_GUARD();
-    job = find_block_job_locked(device, &aio_context, errp);
+    job = find_block_job_locked(device, errp);
if (!job) {
          return;
@@ -3369,22 +3339,19 @@ void qmp_block_job_cancel(const char *device,
      if (job_user_paused_locked(&job->job) && !force) {
          error_setg(errp, "The block job for device '%s' is currently paused",
                     device);
-        goto out;
+        return;
      }
trace_qmp_block_job_cancel(job);
      job_user_cancel_locked(&job->job, force, errp);
-out:
-    aio_context_release(aio_context);
  }
void qmp_block_job_pause(const char *device, Error **errp)
  {
-    AioContext *aio_context;
      BlockJob *job;
JOB_LOCK_GUARD();
-    job = find_block_job_locked(device, &aio_context, errp);
+    job = find_block_job_locked(device, errp);
if (!job) {
          return;
@@ -3392,16 +3359,14 @@ void qmp_block_job_pause(const char *device, Error 
**errp)
trace_qmp_block_job_pause(job);
      job_user_pause_locked(&job->job, errp);
-    aio_context_release(aio_context);
  }
void qmp_block_job_resume(const char *device, Error **errp)
  {
-    AioContext *aio_context;
      BlockJob *job;
JOB_LOCK_GUARD();
-    job = find_block_job_locked(device, &aio_context, errp);
+    job = find_block_job_locked(device, errp);
if (!job) {
          return;
@@ -3409,16 +3374,14 @@ void qmp_block_job_resume(const char *device, Error 
**errp)
trace_qmp_block_job_resume(job);
      job_user_resume_locked(&job->job, errp);
-    aio_context_release(aio_context);
  }
void qmp_block_job_complete(const char *device, Error **errp)
  {
-    AioContext *aio_context;
      BlockJob *job;
JOB_LOCK_GUARD();
-    job = find_block_job_locked(device, &aio_context, errp);
+    job = find_block_job_locked(device, errp);
if (!job) {
          return;
@@ -3426,16 +3389,14 @@ void qmp_block_job_complete(const char *device, Error 
**errp)
trace_qmp_block_job_complete(job);
      job_complete_locked(&job->job, errp);
-    aio_context_release(aio_context);
  }
void qmp_block_job_finalize(const char *id, Error **errp)
  {
-    AioContext *aio_context;
      BlockJob *job;
JOB_LOCK_GUARD();
-    job = find_block_job_locked(id, &aio_context, errp);
+    job = find_block_job_locked(id, errp);
if (!job) {
          return;
@@ -3445,24 +3406,16 @@ void qmp_block_job_finalize(const char *id, Error 
**errp)
      job_ref_locked(&job->job);
      job_finalize_locked(&job->job, errp);
- /*
-     * Job's context might have changed via job_finalize (and job_txn_apply
-     * automatically acquires the new one), so make sure we release the correct
-     * one.
-     */
-    aio_context = block_job_get_aio_context(job);
      job_unref_locked(&job->job);
-    aio_context_release(aio_context);
  }
void qmp_block_job_dismiss(const char *id, Error **errp)
  {
-    AioContext *aio_context;
      BlockJob *bjob;
      Job *job;
JOB_LOCK_GUARD();
-    bjob = find_block_job_locked(id, &aio_context, errp);
+    bjob = find_block_job_locked(id, errp);
if (!bjob) {
          return;
@@ -3471,7 +3424,6 @@ void qmp_block_job_dismiss(const char *id, Error **errp)
      trace_qmp_block_job_dismiss(bjob);
      job = &bjob->job;
      job_dismiss_locked(&job, errp);
-    aio_context_release(aio_context);
  }
void qmp_change_backing_file(const char *device,
@@ -3753,15 +3705,11 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
      for (job = block_job_next_locked(NULL); job;
           job = block_job_next_locked(job)) {
          BlockJobInfo *value;
-        AioContext *aio_context;
if (block_job_is_internal(job)) {
              continue;
          }
-        aio_context = block_job_get_aio_context(job);
-        aio_context_acquire(aio_context);
-        value = block_job_query(job, errp);
-        aio_context_release(aio_context);
+        value = block_job_query_locked(job, errp);
          if (!value) {
              qapi_free_BlockJobInfoList(head);
              return NULL;
diff --git a/include/qemu/job.h b/include/qemu/job.h
index c144aabefc..676f69bb2e 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -75,13 +75,14 @@ typedef struct Job {
      ProgressMeter progress;
- /** Protected by AioContext lock */
+    /** Protected by job_mutex */
/**
       * AioContext to run the job coroutine in.
-     * This field can be read when holding either the BQL (so we are in
-     * the main loop) or the job_mutex.
-     * It can be only written when we hold *both* BQL and job_mutex.
+     * The job Aiocontext can be read when holding *either*
+     * the BQL (so we are in the main loop) or the job_mutex.
+     * It can only be written when we hold *both* BQL
+     * and the job_mutex.

You reword comment you've added some patches ago. Could you please merge this 
to original patch?

       */
      AioContext *aio_context;
@@ -106,7 +107,7 @@ typedef struct Job {
      /**
       * Set to false by the job while the coroutine has yielded and may be
       * re-entered by job_enter(). There may still be I/O or event loop 
activity
-     * pending. Accessed under block_job_mutex (in blockjob.c).
+     * pending. Accessed under job_mutex.
       *
       * When the job is deferred to the main loop, busy is true as long as the
       * bottom half is still pending.
@@ -322,9 +323,9 @@ typedef enum JobCreateFlags {
extern QemuMutex job_mutex;

-#define JOB_LOCK_GUARD() /* QEMU_LOCK_GUARD(&job_mutex) */
+#define JOB_LOCK_GUARD() QEMU_LOCK_GUARD(&job_mutex)
-#define WITH_JOB_LOCK_GUARD() /* WITH_QEMU_LOCK_GUARD(&job_mutex) */
+#define WITH_JOB_LOCK_GUARD() WITH_QEMU_LOCK_GUARD(&job_mutex)
/**
   * job_lock:
@@ -672,7 +673,7 @@ void job_user_cancel_locked(Job *job, bool force, Error 
**errp);
   * Returns the return value from the job if the job actually completed
   * during the call, or -ECANCELED if it was canceled.
   *
- * Callers must hold the AioContext lock of job->aio_context.
+ * Called with job_lock held.

That's wrong, it should be called with job_lock not held :)

   */
  int job_cancel_sync(Job *job, bool force);
@@ -697,8 +698,7 @@ void job_cancel_sync_all(void);
   * function).
   *
   * Returns the return value from the job.
- *
- * Callers must hold the AioContext lock of job->aio_context.
+ * Called with job_lock held.

and this,

   */
  int job_complete_sync(Job *job, Error **errp);
@@ -734,7 +734,7 @@ void job_dismiss_locked(Job **job, Error **errp);
   * Returns 0 if the job is successfully completed, -ECANCELED if the job was
   * cancelled before completing, and -errno in other error cases.
   *
- * Callers must hold the AioContext lock of job->aio_context.
+ * Called with job_lock held.

and this.

   */
  int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp),
                      Error **errp);
diff --git a/job-qmp.c b/job-qmp.c
index cfaf34ffb7..96d67246d2 100644
--- a/job-qmp.c
+++ b/job-qmp.c
@@ -30,36 +30,27 @@

[..]

  }
@@ -501,8 +481,12 @@ void job_unref_locked(Job *job)

           assert(!job->txn);
if (job->driver->free) {
+            AioContext *aio_context = job->aio_context;
              job_unlock();
+            /* FIXME: aiocontext lock is required because cb calls blk_unref */
+            aio_context_acquire(aio_context);
              job->driver->free(job);
+            aio_context_release(aio_context);

So, job_unref_locked() must be called with aio_context not locked, otherwise we 
deadlock here? That should be documented in the function declaration comment.

For example in qemu-img.c in run_block_job() we do exactly that: call 
job_unref_locked() inside an aio-context lock critical section.


              job_lock();
          }
@@ -581,21 +565,17 @@ void job_enter_cond_locked(Job *job, bool(*fn)(Job *job))
          return;
      }
- real_job_lock();
      if (job->busy) {
-        real_job_unlock();
          return;
      }
if (fn && !fn(job)) {
-        real_job_unlock();
          return;
      }
assert(!job->deferred_to_main_loop);
      timer_del(&job->sleep_timer);
      job->busy = true;
-    real_job_unlock();
      job_unlock();
      aio_co_wake(job->co);
      job_lock();
@@ -626,13 +606,11 @@ static void coroutine_fn job_do_yield_locked(Job *job, 
uint64_t ns)
  {
      AioContext *next_aio_context;
- real_job_lock();
      if (ns != -1) {
          timer_mod(&job->sleep_timer, ns);
      }
      job->busy = false;
      job_event_idle_locked(job);
-    real_job_unlock();
      job_unlock();
      qemu_coroutine_yield();
      job_lock();
@@ -922,6 +900,7 @@ static void job_clean(Job *job)
  static int job_finalize_single_locked(Job *job)
  {
      int job_ret;
+    AioContext *ctx = job->aio_context;
    assert(job_is_completed_locked(job));

@@ -929,6 +908,7 @@ static int job_finalize_single_locked(Job *job)
      job_update_rc_locked(job);
job_unlock();
+    aio_context_acquire(ctx);

Hmm, and this function and all its callers now should be called with 
aio-context lock not locked?

For example job_exit is scheduled as as BH. Aren't BHs called with aio-context 
lock held?

if (!job->ret) {
          job_commit(job);
@@ -937,6 +917,7 @@ static int job_finalize_single_locked(Job *job)
      }
      job_clean(job);
+ aio_context_release(ctx);
      job_lock();
if (job->cb) {

[..]


--
Best regards,
Vladimir



reply via email to

[Prev in Thread] Current Thread [Next in Thread]