On Fri, Feb 22, 2019 at 12:28:33PM +0100, Cédric Le Goater wrote:
> When migration of a VM is initiated, a first copy of the RAM is
> transferred to the destination before the VM is stopped, but there is
> no guarantee that the EQ pages in which the event notifications are
> queued have not been modified.
> 
> To make sure migration will capture a consistent memory state, the
> XIVE device should perform a XIVE quiesce sequence to stop the flow of
> event notifications and stabilize the EQs. This is the purpose of the
> KVM_DEV_XIVE_EQ_SYNC control which will also mark the EQ pages dirty
> to force their transfer.
> 
> Signed-off-by: Cédric Le Goater <c...@kaod.org>
> ---
>  arch/powerpc/include/uapi/asm/kvm.h        |  1 +
>  arch/powerpc/kvm/book3s_xive_native.c      | 67 ++++++++++++++++++++++
>  Documentation/virtual/kvm/devices/xive.txt | 29 ++++++++++
>  3 files changed, 97 insertions(+)
> 
> diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
> b/arch/powerpc/include/uapi/asm/kvm.h
> index 289c504b7c1d..cd78ad1020fe 100644
> --- a/arch/powerpc/include/uapi/asm/kvm.h
> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> @@ -678,6 +678,7 @@ struct kvm_ppc_cpu_char {
>  /* POWER9 XIVE Native Interrupt Controller */
>  #define KVM_DEV_XIVE_GRP_CTRL                1
>  #define   KVM_DEV_XIVE_RESET         1
> +#define   KVM_DEV_XIVE_EQ_SYNC               2
>  #define KVM_DEV_XIVE_GRP_SOURCE              2       /* 64-bit source 
> attributes */
>  #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG       3       /* 64-bit source 
> attributes */
>  #define KVM_DEV_XIVE_GRP_EQ_CONFIG   4       /* 64-bit eq attributes */
> diff --git a/arch/powerpc/kvm/book3s_xive_native.c 
> b/arch/powerpc/kvm/book3s_xive_native.c
> index dd2a9d411fe7..3debc876d5a0 100644
> --- a/arch/powerpc/kvm/book3s_xive_native.c
> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> @@ -640,6 +640,70 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive)
>       return 0;
>  }
>  
> +static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
> +{
> +     int j;
> +
> +     for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
> +             struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
> +             struct xive_irq_data *xd;
> +             u32 hw_num;
> +
> +             if (!state->valid)
> +                     continue;
> +             if (state->act_priority == MASKED)

Is this correct?  If you masked an irq, then immediately did a sync,
couldn't there still be some of the irqs in flight?  I thought the
reason we needed a sync was that masking and other such operations
_didn't_ implicitly synchronize.

> +                     continue;
> +
> +             arch_spin_lock(&sb->lock);
> +             kvmppc_xive_select_irq(state, &hw_num, &xd);
> +             xive_native_sync_source(hw_num);
> +             xive_native_sync_queue(hw_num);
> +             arch_spin_unlock(&sb->lock);
> +     }
> +}
> +
> +static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
> +{
> +     struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +     unsigned int prio;
> +
> +     if (!xc)
> +             return -ENOENT;
> +
> +     for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
> +             struct xive_q *q = &xc->queues[prio];
> +
> +             if (!q->qpage)
> +                     continue;
> +
> +             /* Mark EQ page dirty for migration */
> +             mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qpage));
> +     }
> +     return 0;
> +}
> +
> +static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
> +{
> +     struct kvm *kvm = xive->kvm;
> +     struct kvm_vcpu *vcpu;
> +     unsigned int i;
> +
> +     pr_devel("%s\n", __func__);
> +
> +     for (i = 0; i <= xive->max_sbid; i++) {
> +             if (xive->src_blocks[i])
> +                     kvmppc_xive_native_sync_sources(xive->src_blocks[i]);
> +     }
> +
> +     mutex_lock(&kvm->lock);
> +     kvm_for_each_vcpu(i, vcpu, kvm) {
> +             kvmppc_xive_native_vcpu_eq_sync(vcpu);
> +     }
> +     mutex_unlock(&kvm->lock);
> +
> +     return 0;
> +}
> +
>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>                                      struct kvm_device_attr *attr)
>  {
> @@ -650,6 +714,8 @@ static int kvmppc_xive_native_set_attr(struct kvm_device 
> *dev,
>               switch (attr->attr) {
>               case KVM_DEV_XIVE_RESET:
>                       return kvmppc_xive_reset(xive);
> +             case KVM_DEV_XIVE_EQ_SYNC:
> +                     return kvmppc_xive_native_eq_sync(xive);
>               }
>               break;
>       case KVM_DEV_XIVE_GRP_SOURCE:
> @@ -688,6 +754,7 @@ static int kvmppc_xive_native_has_attr(struct kvm_device 
> *dev,
>       case KVM_DEV_XIVE_GRP_CTRL:
>               switch (attr->attr) {
>               case KVM_DEV_XIVE_RESET:
> +             case KVM_DEV_XIVE_EQ_SYNC:
>                       return 0;
>               }
>               break;
> diff --git a/Documentation/virtual/kvm/devices/xive.txt 
> b/Documentation/virtual/kvm/devices/xive.txt
> index 267634eae9e0..a26be635cff9 100644
> --- a/Documentation/virtual/kvm/devices/xive.txt
> +++ b/Documentation/virtual/kvm/devices/xive.txt
> @@ -23,6 +23,12 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>      queues. To be used by kexec and kdump.
>      Errors: none
>  
> +    1.2 KVM_DEV_XIVE_EQ_SYNC (write only)
> +    Sync all the sources and queues and mark the EQ pages dirty. This
> +    is to make sure that a consistent memory state is captured when
> +    migrating the VM.
> +    Errors: none
> +
>    2. KVM_DEV_XIVE_GRP_SOURCE (write only)
>    Initializes a new source in the XIVE device and mask it.
>    Attributes:
> @@ -95,3 +101,26 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>      -ENOENT: Unknown source number
>      -EINVAL: Not initialized source number, invalid priority or
>               invalid CPU number.
> +
> +* Migration:
> +
> +  Saving the state of a VM using the XIVE native exploitation mode
> +  should follow a specific sequence. When the VM is stopped:
> +
> +  1. Mask all sources (PQ=01) to stop the flow of events.
> +
> +  2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to
> +  flush any in-flight event notification and to stabilize the EQs. At
> +  this stage, the EQ pages are marked dirty to make sure they are
> +  transferred in the migration sequence.
> +
> +  3. Capture the state of the source targeting, the EQs configuration
> +  and the state of thread interrupt context registers.
> +
> +  Restore is similar:
> +
> +  1. Restore the EQ configuration, as targeting depends on it.
> +  2. Restore targeting
> +  3. Restore the thread interrupt contexts
> +  4. Restore the source states
> +  5. Let the vCPU run

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: signature.asc
Description: PGP signature

Reply via email to