Re: [Qemu-devel] [PATCH] block/mirror: change the semantic of 'force' of block-job-cancel

John Snow Tue, 30 Jan 2018 12:22:12 -0800


On 01/30/2018 03:38 AM, Liang Li wrote:
> When doing drive mirror to a low speed shared storage, if there was heavy
> BLK IO write workload in VM after the 'ready' event, drive mirror block job
> can't be canceled immediately, it would keep running until the heavy BLK IO
> workload stopped in the VM.
> 
> Because libvirt depends on block-job-cancel for block live migration, the
> current block-job-cancel has the semantic to make sure data is in sync after
> the 'ready' event.  This semantic can't meet some requirement, for example,
> people may use drive mirror for realtime backup while need the ability of
> block live migration. If drive mirror can't not be cancelled immediately,
> it means block live migration need to wait, because libvirt make use drive
> mirror to implement block live migration and only one drive mirror block
> job is allowed at the same time for a give block dev.
> 
> We need a new interface for 'force cancel', which could quit block job
> immediately if don't care about whether data is in sync or not.
> 
> 'force' is not used by libvirt currently, to make things simple, change
> it's semantic slightly, hope it will not break some use case which need its
> original semantic.
> 
> Cc: Paolo Bonzini <pbonz...@redhat.com>
> Cc: Jeff Cody <jc...@redhat.com>
> Cc: Kevin Wolf <kw...@redhat.com>
> Cc: Max Reitz <mre...@redhat.com>
> Cc: Eric Blake <ebl...@redhat.com>
> Cc: John Snow <js...@redhat.com>
> Reported-by: Huaitong Han <huanhuait...@didichuxing.com>
> Signed-off-by: Huaitong Han <huanhuait...@didichuxing.com>
> Signed-off-by: Liang Li <liliang...@didichuxing.com>
> ---
>  block/mirror.c            |  9 +++------
>  blockdev.c                |  4 ++--
>  blockjob.c                | 11 ++++++-----
>  hmp-commands.hx           |  3 ++-
>  include/block/blockjob.h  |  9 ++++++++-
>  qapi/block-core.json      |  6 ++++--
>  tests/test-blockjob-txn.c |  8 ++++----
>  7 files changed, 29 insertions(+), 21 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index c9badc1..c22dff9 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -869,11 +869,8 @@ static void coroutine_fn mirror_run(void *opaque)
>  
>          ret = 0;
>          trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
> -        if (!s->synced) {
> -            block_job_sleep_ns(&s->common, delay_ns);
> -            if (block_job_is_cancelled(&s->common)) {
> -                break;
> -            }
> +        if (block_job_is_cancelled(&s->common) && s->common.force) {
> +            break;


what's the justification for removing the sleep in the case that
!s->synced && !block_job_is_cancelled(...) ?

>          } else if (!should_complete) {
>              delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
>              block_job_sleep_ns(&s->common, delay_ns);
> @@ -887,7 +884,7 @@ immediate_exit:
>           * or it was cancelled prematurely so that we do not guarantee that
>           * the target is a copy of the source.
>           */
> -        assert(ret < 0 || (!s->synced && 
> block_job_is_cancelled(&s->common)));
> +        assert(ret < 0 || block_job_is_cancelled(&s->common));

This assertion gets weaker in the case where force isn't provided, is
that desired?

>          assert(need_drain);
>          mirror_wait_for_all_io(s);
>      }
> diff --git a/blockdev.c b/blockdev.c
> index 8e977ee..039f156 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -145,7 +145,7 @@ void blockdev_mark_auto_del(BlockBackend *blk)
>          aio_context_acquire(aio_context);
>  
>          if (bs->job) {
> -            block_job_cancel(bs->job);
> +            block_job_cancel(bs->job, false);
>          }
>  
>          aio_context_release(aio_context);
> @@ -3802,7 +3802,7 @@ void qmp_block_job_cancel(const char *device,
>      }
>  
>      trace_qmp_block_job_cancel(job);
> -    block_job_cancel(job);
> +    block_job_cancel(job, force);
>  out:
>      aio_context_release(aio_context);
>  }
> diff --git a/blockjob.c b/blockjob.c
> index f5cea84..0aacb50 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -365,7 +365,7 @@ static void block_job_completed_single(BlockJob *job)
>      block_job_unref(job);
>  }
>  
> -static void block_job_cancel_async(BlockJob *job)
> +static void block_job_cancel_async(BlockJob *job, bool force)
>  {
>      if (job->iostatus != BLOCK_DEVICE_IO_STATUS_OK) {
>          block_job_iostatus_reset(job);
> @@ -376,6 +376,7 @@ static void block_job_cancel_async(BlockJob *job)
>          job->pause_count--;
>      }
>      job->cancelled = true;
> +    job->force = force;
>  }
>  

I suppose this is okay; we're setting a permanent property of the job as
part of a limited operation.

Granted, the job should disappear afterwards, so it should never
conflict, but it made me wonder for a moment.

What happens if we cancel with force = true and then immediately cancel
again with force = false? What happens? Can it cause issues?

>  static int block_job_finish_sync(BlockJob *job,
> @@ -437,7 +438,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
>       * on the caller, so leave it. */
>      QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
>          if (other_job != job) {
> -            block_job_cancel_async(other_job);
> +            block_job_cancel_async(other_job, true);

What's the rationale for deciding that transactional cancellations are
always force=true?

Granted, this doesn't matter currently because mirror isn't a
transactional job, but that makes the decision all the more curious.

>          }
>      }
>      while (!QLIST_EMPTY(&txn->jobs)) {
> @@ -542,10 +543,10 @@ void block_job_user_resume(BlockJob *job)
>      }
>  }
>  
> -void block_job_cancel(BlockJob *job)
> +void block_job_cancel(BlockJob *job, bool force)
>  {
>      if (block_job_started(job)) {
> -        block_job_cancel_async(job);
> +        block_job_cancel_async(job, force);
>          block_job_enter(job);
>      } else {
>          block_job_completed(job, -ECANCELED);
> @@ -557,7 +558,7 @@ void block_job_cancel(BlockJob *job)
>   * function pointer casts there. */
>  static void block_job_cancel_err(BlockJob *job, Error **errp)
>  {
> -    block_job_cancel(job);
> +    block_job_cancel(job, false);
>  }
>  
>  int block_job_cancel_sync(BlockJob *job)
> diff --git a/hmp-commands.hx b/hmp-commands.hx
> index 15620c9..c8bb414 100644
> --- a/hmp-commands.hx
> +++ b/hmp-commands.hx
> @@ -106,7 +106,8 @@ ETEXI
>          .args_type  = "force:-f,device:B",
>          .params     = "[-f] device",
>          .help       = "stop an active background block operation (use -f"
> -                      "\n\t\t\t if the operation is currently paused)",
> +                      "\n\t\t\t if you want to abort the operation 
> immediately"
> +                      "\n\t\t\t instead of keep running until data is in 
> sync )",
>          .cmd        = hmp_block_job_cancel,
>      },
>  
> diff --git a/include/block/blockjob.h b/include/block/blockjob.h
> index 00403d9..4a96c42 100644
> --- a/include/block/blockjob.h
> +++ b/include/block/blockjob.h
> @@ -63,6 +63,12 @@ typedef struct BlockJob {
>      bool cancelled;
>  
>      /**
> +     * Set to true if the job should be abort immediately without waiting
> +     * for data is in sync.
> +     */
> +    bool force;
> +
> +    /**
>       * Counter for pause request. If non-zero, the block job is either 
> paused,
>       * or if busy == true will pause itself as soon as possible.
>       */
> @@ -218,10 +224,11 @@ void block_job_start(BlockJob *job);
>  /**
>   * block_job_cancel:
>   * @job: The job to be canceled.
> + * @force: Quit a job without waiting data is in sync.
>   *
>   * Asynchronously cancel the specified job.
>   */
> -void block_job_cancel(BlockJob *job);
> +void block_job_cancel(BlockJob *job, bool force);
>  
>  /**
>   * block_job_complete:
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 8225308..7c4dbfe 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -2098,8 +2098,10 @@
>  #          the name of the parameter), but since QEMU 2.7 it can have
>  #          other values.
>  #
> -# @force: whether to allow cancellation of a paused job (default
> -#         false).  Since 1.3.
> +# @force: #optional whether to allow cancellation a job without waiting data 
> is
> +#         in sync, please not that since 2.12 it's semantic is not exactly 
> the
> +#         same as before, from 1.3 to 2.11 it means whether to allow 
> cancellation
> +#         of a paused job (default false).  Since 1.3.
>  #
>  # Returns: Nothing on success
>  #          If no background operation is active on this device, 
> DeviceNotActive
> diff --git a/tests/test-blockjob-txn.c b/tests/test-blockjob-txn.c
> index 3591c96..53daadb 100644
> --- a/tests/test-blockjob-txn.c
> +++ b/tests/test-blockjob-txn.c
> @@ -125,7 +125,7 @@ static void test_single_job(int expected)
>      block_job_start(job);
>  
>      if (expected == -ECANCELED) {
> -        block_job_cancel(job);
> +        block_job_cancel(job, false);
>      }
>  
>      while (result == -EINPROGRESS) {
> @@ -173,10 +173,10 @@ static void test_pair_jobs(int expected1, int expected2)
>      block_job_txn_unref(txn);
>  
>      if (expected1 == -ECANCELED) {
> -        block_job_cancel(job1);
> +        block_job_cancel(job1, false);
>      }
>      if (expected2 == -ECANCELED) {
> -        block_job_cancel(job2);
> +        block_job_cancel(job2, false);
>      }
>  
>      while (result1 == -EINPROGRESS || result2 == -EINPROGRESS) {
> @@ -231,7 +231,7 @@ static void test_pair_jobs_fail_cancel_race(void)
>      block_job_start(job1);
>      block_job_start(job2);
>  
> -    block_job_cancel(job1);
> +    block_job_cancel(job1, false);
>  
>      /* Now make job2 finish before the main loop kicks jobs.  This simulates
>       * the race between a pending kick and another job completing.
> 

--js

Re: [Qemu-devel] [PATCH] block/mirror: change the semantic of 'force' of block-job-cancel

Reply via email to