On 2/25/19 3:53 AM, David Gibson wrote:
> On Fri, Feb 22, 2019 at 12:28:33PM +0100, Cédric Le Goater wrote:
>> When migration of a VM is initiated, a first copy of the RAM is
>> transferred to the destination before the VM is stopped, but there is
>> no guarantee that the EQ pages in which the event notifications are
>> queued have not been modified.
>>
>> To make sure migration will capture a consistent memory state, the
>> XIVE device should perform a XIVE quiesce sequence to stop the flow of
>> event notifications and stabilize the EQs. This is the purpose of the
>> KVM_DEV_XIVE_EQ_SYNC control which will also mark the EQ pages dirty
>> to force their transfer.
>>
>> Signed-off-by: Cédric Le Goater <c...@kaod.org>
>> ---
>>  arch/powerpc/include/uapi/asm/kvm.h        |  1 +
>>  arch/powerpc/kvm/book3s_xive_native.c      | 67 ++++++++++++++++++++++
>>  Documentation/virtual/kvm/devices/xive.txt | 29 ++++++++++
>>  3 files changed, 97 insertions(+)
>>
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
>> b/arch/powerpc/include/uapi/asm/kvm.h
>> index 289c504b7c1d..cd78ad1020fe 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -678,6 +678,7 @@ struct kvm_ppc_cpu_char {
>>  /* POWER9 XIVE Native Interrupt Controller */
>>  #define KVM_DEV_XIVE_GRP_CTRL               1
>>  #define   KVM_DEV_XIVE_RESET                1
>> +#define   KVM_DEV_XIVE_EQ_SYNC              2
>>  #define KVM_DEV_XIVE_GRP_SOURCE             2       /* 64-bit source 
>> attributes */
>>  #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG      3       /* 64-bit source 
>> attributes */
>>  #define KVM_DEV_XIVE_GRP_EQ_CONFIG  4       /* 64-bit eq attributes */
>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c 
>> b/arch/powerpc/kvm/book3s_xive_native.c
>> index dd2a9d411fe7..3debc876d5a0 100644
>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>> @@ -640,6 +640,70 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive)
>>      return 0;
>>  }
>>  
>> +static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block 
>> *sb)
>> +{
>> +    int j;
>> +
>> +    for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
>> +            struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
>> +            struct xive_irq_data *xd;
>> +            u32 hw_num;
>> +
>> +            if (!state->valid)
>> +                    continue;
>> +            if (state->act_priority == MASKED)
> 
> Is this correct?  If you masked an irq, then immediately did a sync,
> couldn't there still be some of the irqs in flight?  I thought the
> reason we needed a sync was that masking and other such operations
> _didn't_ implicitly synchronize.

The struct kvmppc_xive_irq_state reflects the state of the EAS
configuration and not the state of the source. The source is masked by
setting the PQ bits to '-Q', which is what is being done before calling
the KVM_DEV_XIVE_EQ_SYNC control.

If a source EAS is configured, OPAL syncs the XIVE IC of the source and
the XIVE IC of the previous target if any.  

So I think we are fine.

C.
  

  
>> +                    continue;
>> +
>> +            arch_spin_lock(&sb->lock);
>> +            kvmppc_xive_select_irq(state, &hw_num, &xd);
>> +            xive_native_sync_source(hw_num);
>> +            xive_native_sync_queue(hw_num);
>> +            arch_spin_unlock(&sb->lock);
>> +    }
>> +}
>> +
>> +static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
>> +{
>> +    struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>> +    unsigned int prio;
>> +
>> +    if (!xc)
>> +            return -ENOENT;
>> +
>> +    for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
>> +            struct xive_q *q = &xc->queues[prio];
>> +
>> +            if (!q->qpage)
>> +                    continue;
>> +
>> +            /* Mark EQ page dirty for migration */
>> +            mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qpage));
>> +    }
>> +    return 0;
>> +}
>> +
>> +static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
>> +{
>> +    struct kvm *kvm = xive->kvm;
>> +    struct kvm_vcpu *vcpu;
>> +    unsigned int i;
>> +
>> +    pr_devel("%s\n", __func__);
>> +
>> +    for (i = 0; i <= xive->max_sbid; i++) {
>> +            if (xive->src_blocks[i])
>> +                    kvmppc_xive_native_sync_sources(xive->src_blocks[i]);
>> +    }
>> +
>> +    mutex_lock(&kvm->lock);
>> +    kvm_for_each_vcpu(i, vcpu, kvm) {
>> +            kvmppc_xive_native_vcpu_eq_sync(vcpu);
>> +    }
>> +    mutex_unlock(&kvm->lock);
>> +
>> +    return 0;
>> +}
>> +
>>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>>                                     struct kvm_device_attr *attr)
>>  {
>> @@ -650,6 +714,8 @@ static int kvmppc_xive_native_set_attr(struct kvm_device 
>> *dev,
>>              switch (attr->attr) {
>>              case KVM_DEV_XIVE_RESET:
>>                      return kvmppc_xive_reset(xive);
>> +            case KVM_DEV_XIVE_EQ_SYNC:
>> +                    return kvmppc_xive_native_eq_sync(xive);
>>              }
>>              break;
>>      case KVM_DEV_XIVE_GRP_SOURCE:
>> @@ -688,6 +754,7 @@ static int kvmppc_xive_native_has_attr(struct kvm_device 
>> *dev,
>>      case KVM_DEV_XIVE_GRP_CTRL:
>>              switch (attr->attr) {
>>              case KVM_DEV_XIVE_RESET:
>> +            case KVM_DEV_XIVE_EQ_SYNC:
>>                      return 0;
>>              }
>>              break;
>> diff --git a/Documentation/virtual/kvm/devices/xive.txt 
>> b/Documentation/virtual/kvm/devices/xive.txt
>> index 267634eae9e0..a26be635cff9 100644
>> --- a/Documentation/virtual/kvm/devices/xive.txt
>> +++ b/Documentation/virtual/kvm/devices/xive.txt
>> @@ -23,6 +23,12 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>>      queues. To be used by kexec and kdump.
>>      Errors: none
>>  
>> +    1.2 KVM_DEV_XIVE_EQ_SYNC (write only)
>> +    Sync all the sources and queues and mark the EQ pages dirty. This
>> +    is to make sure that a consistent memory state is captured when
>> +    migrating the VM.
>> +    Errors: none
>> +
>>    2. KVM_DEV_XIVE_GRP_SOURCE (write only)
>>    Initializes a new source in the XIVE device and mask it.
>>    Attributes:
>> @@ -95,3 +101,26 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>>      -ENOENT: Unknown source number
>>      -EINVAL: Not initialized source number, invalid priority or
>>               invalid CPU number.
>> +
>> +* Migration:
>> +
>> +  Saving the state of a VM using the XIVE native exploitation mode
>> +  should follow a specific sequence. When the VM is stopped :
>> +
>> +  1. Mask all sources (PQ=01) to stop the flow of events.
>> +
>> +  2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to
>> +  flush any in-flight event notification and to stabilize the EQs. At
>> +  this stage, the EQ pages are marked dirty to make sure they are
>> +  transferred in the migration sequence.
>> +
>> +  3. Capture the state of the source targeting, the EQs configuration
>> +  and the state of thread interrupt context registers.
>> +
>> +  Restore is similar :
>> +
>> +  1. Restore the EQ configuration, as targeting depends on it.
>> +  2. Restore targeting
>> +  3. Restore the thread interrupt contexts
>> +  4. Restore the source states
>> +  5. Let the vCPU run
> 

Reply via email to