Re: [PATCH 1/4] KVM: x86: always fill in vcpu->arch.hv_clock

2016-09-19 Thread Roman Kagan
On Mon, Sep 19, 2016 at 01:39:10PM +0200, Paolo Bonzini wrote:
> We will use it in the next patches for KVM_GET_CLOCK and as a basis for the
> contents of the Hyper-V TSC page.  Get the values from the Linux
> timekeeper even if kvmclock is not enabled.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/kvm/x86.c | 109 +
>  1 file changed, 59 insertions(+), 50 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH 4/4] KVM: x86: Hyper-V tsc page setup

2016-09-19 Thread Roman Kagan
On Mon, Sep 19, 2016 at 01:39:13PM +0200, Paolo Bonzini wrote:
> Previously the TSC page was implemented but filled with empty
> values.  This patch sets up the TSC page scale and offset based
> on the vCPU TSC, tsc_khz and the HV_X64_MSR_TIME_REF_COUNT value.
> 
> A valid TSC page drops the count of HV_X64_MSR_TIME_REF_COUNT MSR
> reads to zero, which potentially improves performance.
> 
> Signed-off-by: Andrey Smetanin 
> Reviewed-by: Peter Hornyack 
> CC: Paolo Bonzini 
> CC: Radim Krčmář 
> CC: Roman Kagan 
> CC: Denis V. Lunev 
> [Computation of TSC page parameters rewritten to use the Linux timekeeper
>  parameters. - Paolo]
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/include/asm/kvm_host.h |   2 +
>  arch/x86/kvm/hyperv.c   | 162
>  arch/x86/kvm/hyperv.h   |   3 +
>  arch/x86/kvm/x86.c  |   8 +-
>  4 files changed, 155 insertions(+), 20 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns

2016-09-19 Thread Roman Kagan
On Mon, Sep 19, 2016 at 01:39:12PM +0200, Paolo Bonzini wrote:
> Introduce a function that reads the exact nanoseconds value that is
> provided to the guest in kvmclock.  This crystallizes the notion of
> kvmclock as a thin veneer over a stable TSC, that the guest will
> (hopefully) convert with NTP.  In other words, kvmclock is *not* a
> paravirtualized host-to-guest NTP.
> 
> Drop the get_kernel_ns() function, that was used both to get the base
> value of the master clock and to get the current value of kvmclock.
> The former use is replaced by ktime_get_boot_ns(), the latter is
> the purpose of the new function get_kvmclock_ns().
> 
> This also allows KVM to provide a Hyper-V time reference counter that
> is synchronized with the time that is computed from the TSC page.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/entry/vdso/vclock_gettime.c |  2 +-
>  arch/x86/include/asm/pvclock.h   |  5 ++--
>  arch/x86/kernel/pvclock.c|  2 +-
>  arch/x86/kvm/hyperv.c|  2 +-
>  arch/x86/kvm/x86.c   | 48 +++-
>  arch/x86/kvm/x86.h   |  6 +
>  6 files changed, 43 insertions(+), 22 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns

2016-09-02 Thread Roman Kagan
On Thu, Sep 01, 2016 at 05:26:14PM +0200, Paolo Bonzini wrote:
> Introduce a function that reads the exact nanoseconds value that is
> provided to the guest in kvmclock.  This crystallizes the notion of
> kvmclock as a thin veneer over a stable TSC, that the guest will
> (hopefully) convert with NTP.  In other words, kvmclock is *not* a
> paravirtualized host-to-guest NTP.
> 
> Drop the get_kernel_ns() function, that was used both to get the base
> value of the master clock and to get the current value of kvmclock.
> The former use is replaced by ktime_get_boot_ns(), the latter is
> the purpose of the new function get_kvmclock_ns().
> 
> This also allows KVM to provide a Hyper-V time reference counter that
> is synchronized with the time that is computed from the TSC page.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/entry/vdso/vclock_gettime.c |  2 +-
>  arch/x86/include/asm/pvclock.h   |  5 ++--
>  arch/x86/kernel/pvclock.c|  2 +-
>  arch/x86/kvm/hyperv.c|  2 +-
>  arch/x86/kvm/x86.c   | 48 +++-
>  arch/x86/kvm/x86.h   |  6 +
>  6 files changed, 43 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
> index 94d54d0defa7..02223cb4bcfd 100644
> --- a/arch/x86/entry/vdso/vclock_gettime.c
> +++ b/arch/x86/entry/vdso/vclock_gettime.c
> @@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode)
>   return 0;
>   }
>  
> - ret = __pvclock_read_cycles(pvti);
> + ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
>   } while (pvclock_read_retry(pvti, version));
>  
>   /* refer to vread_tsc() comment for rationale */
> diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
> index d019f0cc80ec..3ad741b84072 100644
> --- a/arch/x86/include/asm/pvclock.h
> +++ b/arch/x86/include/asm/pvclock.h
> @@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
>  }
>  
>  static __always_inline
> -cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
> +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
> +   u64 tsc)
>  {
> - u64 delta = rdtsc_ordered() - src->tsc_timestamp;
> + u64 delta = tsc - src->tsc_timestamp;
>   cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
>src->tsc_shift);
>   return src->system_time + offset;
> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
> index 3599404e3089..5b2cc889ce34 100644
> --- a/arch/x86/kernel/pvclock.c
> +++ b/arch/x86/kernel/pvclock.c
> @@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
>  
>   do {
>   version = pvclock_read_begin(src);
> - ret = __pvclock_read_cycles(src);
> + ret = __pvclock_read_cycles(src, rdtsc_ordered());
>   flags = src->flags;
>   } while (pvclock_read_retry(src, version));
>  

Perhaps adding an argument to __pvclock_read_cycles deserves a separate
patch, to get timekeeping folks' ack on it?

> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index 01bd7b7a6866..ed5b77f39ffb 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -386,7 +386,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
>  
>  static u64 get_time_ref_counter(struct kvm *kvm)
>  {
> - return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
> + return div_u64(get_kvmclock_ns(kvm), 100);

Since this does slightly different calculation than the real hyperv tsc
ref page clock is supposed to, I wonder if we are safe WRT precision
errors leading to occasional monotonicity violations?

>  }
>  
>  static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b5853b86b67d..2edcfa054cbe 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1425,7 +1425,7 @@ static void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
>  
>   raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
>   offset = kvm_compute_tsc_offset(vcpu, data);
> - ns = get_kernel_ns();
> + ns = ktime_get_boot_ns();
>   elapsed = ns - kvm->arch.last_tsc_nsec;
>  
>   if (vcpu->arch.virtual_tsc_khz) {
> @@ -1716,6 +1716,34 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  #endif
>  }
>  
> +static u64 __get_kvmclock_ns(struct kvm *kvm)
> +{
> + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
> + struct kvm_arch *ka = &kvm->arch;
> + s64 ns;
> +
> + if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
> + u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> + ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
> + } else {
> + ns = ktime_get_boot_ns() + ka->kvmclock_offset;
> + }
> +
> + return ns;
> +}

Re: [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns

2016-09-02 Thread Roman Kagan
On Fri, Sep 02, 2016 at 04:09:42PM +0200, Paolo Bonzini wrote:
> On 02/09/2016 15:52, Roman Kagan wrote:
> > On Thu, Sep 01, 2016 at 05:26:14PM +0200, Paolo Bonzini wrote:
> >> --- a/arch/x86/kvm/hyperv.c
> >> +++ b/arch/x86/kvm/hyperv.c
> >> @@ -386,7 +386,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
> >>  
> >>  static u64 get_time_ref_counter(struct kvm *kvm)
> >>  {
> >> -  return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
> >> +  return div_u64(get_kvmclock_ns(kvm), 100);
> > 
> > Since this does slightly different calculation than the real hyperv tsc
> > ref page clock is supposed to, I wonder if we are safe WRT precision
> > errors leading to occasional monotonicity violations?
> 
> The Hyper-V scale is
> 
>  tsc_to_system_mul * 2^(32+tsc_shift) / 100
> 
> and the only source of error could be from doing here
> 
>  (tsc * tsc_to_system_mul >> (32-tsc_shift)) / 100
> 
> vs
> 
>  tsc * ((tsc_to_system_mul >> (32-tsc_shift)) / 100)
>^^
>  this is scale / 2^64
> 
> in the TSC ref page clock.  If my reasoning is correct the error will be
> at most 100 units of the scale value, which is a relative error of
> around 1 parts per 2^49.
> 
> Likewise for the offset, the improvement from
> 
> (tsc - base_tsc) * tsc_to_system_mul >> (32-tsc_shift)
>  + base_system_time
> 
> vs. using a single offset as in the TSC ref page is one nanosecond---and
> the ref page only has a resolution of 100 ns.

AFAICS it's not a matter of resolution.  If one calculation flips from
value T to T+1 at tsc1, while the other flips at tsc2, then during the
window between tsc1 and tsc2 we can observe a monotonicity violation.
If the window is a few cycles (i.e. less than a vmexit) we're probably
safe, but if it's not, this may be a problem.
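
To make the worry concrete, here is a user-space toy (not kernel code;
the mul/shift values are made up for a ~1.5 GHz TSC, and the scanned
TSC window is arbitrary) that counts points where the two roundings
disagree, i.e. where one clock already reads T+1 while the other still
reads T:

#include <stdio.h>
#include <stdint.h>

/* kvmclock-style readout: (tsc * mul >> (32 - shift)) / 100 */
static uint64_t kvmclock_100ns(uint64_t tsc, uint32_t mul, int shift)
{
	return (uint64_t)(((unsigned __int128)tsc * mul) >> (32 - shift)) / 100;
}

/* TSC-page-style readout: tsc * scale / 2^64 */
static uint64_t tscpage_100ns(uint64_t tsc, uint64_t scale)
{
	return (uint64_t)(((unsigned __int128)tsc * scale) >> 64);
}

int main(void)
{
	const uint32_t mul = 2863311531u;	/* ~2/3 ns per TSC tick */
	const int shift = 0;
	/* scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100, floored */
	const uint64_t scale =
		(uint64_t)(((unsigned __int128)mul << (32 + shift)) / 100);
	const uint64_t start = 100000000000000ull;	/* ~18h of uptime */
	uint64_t tsc, hits = 0;

	for (tsc = start; tsc < start + 10000000; tsc++)
		if (kvmclock_100ns(tsc, mul, shift) !=
		    tscpage_100ns(tsc, scale))
			hits++;
	printf("%llu disagreements in 10M ticks\n",
	       (unsigned long long)hits);
	return 0;
}

Each disagreement is exactly such a window; whether any of them is wide
enough to straddle a vmexit is the question.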

Roman.


Re: [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns

2016-09-02 Thread Roman Kagan
On Thu, Sep 01, 2016 at 05:26:14PM +0200, Paolo Bonzini wrote:
> Introduce a function that reads the exact nanoseconds value that is
> provided to the guest in kvmclock.  This crystallizes the notion of
> kvmclock as a thin veneer over a stable TSC, that the guest will
> (hopefully) convert with NTP.  In other words, kvmclock is *not* a
> paravirtualized host-to-guest NTP.

It might also be worth documenting this approach in a comment somewhere
around pvclock_gtod_notify / update_pvclock_gtod or friends.  And, if
those could be simplified due to this change, it would be great, too.

Roman.


Re: [PATCH 1/4] KVM: x86: always fill in vcpu->arch.hv_clock

2016-09-02 Thread Roman Kagan
On Thu, Sep 01, 2016 at 05:26:12PM +0200, Paolo Bonzini wrote:
> We will use it in the next patches for KVM_GET_CLOCK and as a basis for the
> contents of the Hyper-V TSC page.  Get the values from the Linux
> timekeeper even if kvmclock is not enabled.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/kvm/x86.c | 109 +
>  1 file changed, 59 insertions(+), 50 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 19f9f9e05c2a..65974dd0565f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1716,6 +1716,60 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  #endif
>  }
>  
> +static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
> +{
> + struct kvm_vcpu_arch *vcpu = &v->arch;
> + struct pvclock_vcpu_time_info guest_hv_clock;
> +
> + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
> + &guest_hv_clock, sizeof(guest_hv_clock))))
> + return;
> +
> + /* This VCPU is paused, but it's legal for a guest to read another
> +  * VCPU's kvmclock, so we really have to follow the specification where
> +  * it says that version is odd if data is being modified, and even after
> +  * it is consistent.
> +  *
> +  * Version field updates must be kept separate.  This is because
> +  * kvm_write_guest_cached might use a "rep movs" instruction, and
> +  * writes within a string instruction are weakly ordered.  So there
> +  * are three writes overall.
> +  *
> +  * As a small optimization, only write the version field in the first
> +  * and third write.  The vcpu->pv_time cache is still valid, because the
> +  * version field is the first in the struct.
> +  */
> + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
> +
> + vcpu->hv_clock.version = guest_hv_clock.version + 1;
> + kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> + &vcpu->hv_clock,
> + sizeof(vcpu->hv_clock.version));
> +
> + smp_wmb();
> +
> + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
> + vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
> +
> + if (vcpu->pvclock_set_guest_stopped_request) {
> + vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
> + vcpu->pvclock_set_guest_stopped_request = false;
> + }
> +
> + trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
> +
> + kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> + &vcpu->hv_clock,
> + sizeof(vcpu->hv_clock));
> +
> + smp_wmb();
> +
> + vcpu->hv_clock.version++;
> + kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> + &vcpu->hv_clock,
> + sizeof(vcpu->hv_clock.version));
> +}
> +
>  static int kvm_guest_time_update(struct kvm_vcpu *v)
>  {
>   unsigned long flags, tgt_tsc_khz;
> @@ -1723,7 +1777,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>   struct kvm_arch *ka = &v->kvm->arch;
>   s64 kernel_ns;
>   u64 tsc_timestamp, host_tsc;
> - struct pvclock_vcpu_time_info guest_hv_clock;
>   u8 pvclock_flags;
>   bool use_master_clock;
>  
> @@ -1777,8 +1830,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>  
>   local_irq_restore(flags);
>  
> - if (!vcpu->pv_time_enabled)
> - return 0;

Strictly speaking, you only need .hv_clock updated if either kvmclock or
tsc_ref_page is enabled, so you may want to still skip the calculations
otherwise.
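
Something along these lines, as a sketch (kvm_hv_tsc_page_enabled() is
a made-up helper for whatever form the check ends up taking):

	/* skip the guest-visible update if nobody consumes hv_clock */
	if (!vcpu->pv_time_enabled && !kvm_hv_tsc_page_enabled(v->kvm))
		return 0;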

> + /* With all the info we got, fill in the values */
>  
>   if (kvm_has_tsc_control)
>   tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
> @@ -1790,64 +1842,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>   vcpu->hw_tsc_khz = tgt_tsc_khz;
>   }
>  
> - /* With all the info we got, fill in the values */
>   vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
>   vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
>   vcpu->last_guest_tsc = tsc_timestamp;
>  
> - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
> - &guest_hv_clock, sizeof(guest_hv_clock))))
> - return 0;
> -
> - /* This VCPU is paused, but it's legal for a guest to read another
> -  * VCPU's kvmclock, so we really have to follow the specification where
> -  * it says that version is odd if data is being modified, and even after
> -  * it is consistent.
> -  *
> -  * Version field updates must be kept separate.  This is because
> -  * kvm_write_guest_cached might use a "rep movs" instruction, and
> -  * writes within a string instruction are weakly ordered.  So there
> -  * are three writes overall.
> -  *
> -  * As a small optimization, only write the version field in the first
> -  * and third write.  The vcpu->pv_time cache is still valid, because the
> -  * version field is the first in the struct.
> -  */

Re: [PATCH 2/4] KVM: x86: initialize kvmclock_offset

2016-09-02 Thread Roman Kagan
On Thu, Sep 01, 2016 at 05:26:13PM +0200, Paolo Bonzini wrote:
> Make the guest's kvmclock count up from zero, not from the host boot
> time.  The guest cannot rely on that anyway because it changes on
> migration, the numbers are easier on the eye and finally it matches the
> desired semantics of the Hyper-V time reference counter.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/kvm/x86.c | 1 +
>  1 file changed, 1 insertion(+)

Reviewed-by: Roman Kagan 


Re: [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns

2016-09-02 Thread Roman Kagan
On Fri, Sep 02, 2016 at 06:57:10PM +0200, Paolo Bonzini wrote:
> 
> 
> On 02/09/2016 18:55, Roman Kagan wrote:
> >> > I'll change patch 4 to store the parameters and use them when accessing
> >> > the time reference counter MSR.  I'll still keep the procedure that goes
> >> > through kvmclock.  It's a bit more involved for the scale, but
> >> > vcpu->last_guest_tsc only provides a part of the offset computation; the
> >> > other half is vcpu->hv_clock.system_time and it's not stored anywhere.
> > Erm... It is stored right there, in vcpu->hv_clock.system_time, you can
> > access it just fine when you populate tsc_ref_page values.  Am I missing
> > anything?
> 
> No.  It's not stored anywhere outside vcpu->hv_clock.  My reasoning goes
> that if I have to use vcpu->hv_clock.system_time I might as well use
> vcpu->hv_clock for everything. :)

Well, there're plenty of ways to bring the necessary values to where
they are needed.

But that's a matter of simplicity, not correctness, so I won't mind that
much.

Thanks,
Roman.


Re: [PATCH 4/4] KVM: x86: Hyper-V tsc page setup

2016-09-02 Thread Roman Kagan
On Thu, Sep 01, 2016 at 05:26:15PM +0200, Paolo Bonzini wrote:
> Previously the TSC page was implemented but filled with empty
> values.  This patch sets up the TSC page scale and offset based
> on the vCPU TSC, tsc_khz and the HV_X64_MSR_TIME_REF_COUNT value.
> 
> A valid TSC page drops the count of HV_X64_MSR_TIME_REF_COUNT MSR
> reads to zero, which potentially improves performance.
> 
> Signed-off-by: Andrey Smetanin 
> Reviewed-by: Peter Hornyack 
> CC: Paolo Bonzini 
> CC: Radim Krčmář 
> CC: Roman Kagan 
> CC: Denis V. Lunev 
> [Computation of TSC page parameters rewritten to use the Linux timekeeper
>  parameters. - Paolo]
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/kvm/hyperv.c | 141 --
>  arch/x86/kvm/hyperv.h |   3 ++
>  arch/x86/kvm/x86.c|   8 +--
>  3 files changed, 133 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index ed5b77f39ffb..e089d1f52dc0 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -756,6 +756,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
>   return 0;
>  }
>  
> +/*
> + * The kvmclock and Hyper-V TSC page use similar formulas.  Because the KVM
> + * one is more precise, it is a little more complex.  However, converting

I'm not sure you're right regarding which one is more precise :)
Hyper-V uses a right shift of 64, which is higher precision than the
typical kvmclock shift of around 22.
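
For reference, the guest-side readout consumes exactly that shift; a
sketch modeled on the TLFS sequence protocol (the MSR fallback helper
name is made up, and the invalid-sequence handling is simplified):

static u64 hv_read_ref_counter(const struct ms_hyperv_tsc_page *tsc_pg)
{
	u64 scale, offset, tsc;
	u32 seq;

	do {
		seq = READ_ONCE(tsc_pg->tsc_sequence);
		if (!seq)	/* page invalid: fall back to the MSR */
			return hv_read_msr_time_ref_count();
		tsc = rdtsc_ordered();
		scale = READ_ONCE(tsc_pg->tsc_scale);
		offset = READ_ONCE(tsc_pg->tsc_offset);
	} while (READ_ONCE(tsc_pg->tsc_sequence) != seq);

	/* 100ns units: tsc * scale / 2^64 + offset */
	return mul_u64_u64_shr(tsc, scale, 64) + offset;
}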

> + * between them is possible:
> + *
> + * kvmclock formula:
> + *nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
> + *   + system_time
> + *
> + * Hyper-V formula:
> + *nsec/100 = ticks * scale / 2^64 + offset
> + *
> + * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
> + * By dividing the kvmclock formula by 100 and equating what's left we get:
> + *ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + *scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + *scale= tsc_to_system_mul * 2^(32+tsc_shift) / 100
> + *
> + * Now expand the kvmclock formula and divide by 100:
> + *nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
> + *   - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
> + *   + system_time
> + *nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + *   - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + *   + system_time / 100
> + *
> + * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
> + *nsec/100 = ticks * scale / 2^64
> + *   - tsc_timestamp * scale / 2^64
> + *   + system_time / 100
> + *
> + * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
> + *offset = system_time / 100 - tsc_timestamp * scale / 2^64
> + *
> + * These two equivalencies are implemented in this function.
> + */
> +static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
> + HV_REFERENCE_TSC_PAGE *tsc_ref)
> +{
> + u64 max_mul;
> +
> + if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
> + return false;
> +
> + /*
> +  * check if scale would overflow, if so we use the time ref counter
> +  *tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
> +  *tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
> +  *tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
> +  */
> + max_mul = 100ull << (32 - hv_clock->tsc_shift);
> + if (hv_clock->tsc_to_system_mul >= max_mul)
> + return false;
> +
> + /*
> +  * Otherwise compute the scale and offset according to the formulas
> +  * derived above.
> +  */
> + tsc_ref->tsc_scale =
> + mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
> + hv_clock->tsc_to_system_mul,
> + 100);
> +
> + tsc_ref->tsc_offset = hv_clock->system_time;
> + do_div(tsc_ref->tsc_offset, 100);
> + tsc_ref->tsc_offset -=
> + mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
> + return true;

Although this is correct, we may want to make it slightly easier to
follow: note that the hv_clock contents are essentially populated using
vcpu->hw_tsc_khz and vcpu->last_guest_tsc, so at this point we can just
directly calculate ->tsc_scale and ->tsc_offset from them.  If we also
stash them somewhere on vcpu we can ma

Re: [PATCH v5 5/5] KVM: x86: hyperv: implement PV IPI send hypercalls

2018-08-28 Thread Roman Kagan
> + if ((irq.vector < HV_IPI_LOW_VECTOR) ||
> + (irq.vector > HV_IPI_HIGH_VECTOR))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +
> + if (all_cpus) {
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + /* We fail only when APIC is disabled */
> + if (!kvm_apic_set_irq(vcpu, &irq, NULL))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + }
> + goto ret_success;
> + }
> +
> + for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, 64) {
> + for_each_set_bit(i, (unsigned long *)&sparse_banks[bank], 64) {
> + u32 vp_index = bank * 64 + i;
> + struct kvm_vcpu *vcpu =
> + get_vcpu_by_vpidx(kvm, vp_index);
> +
> + /* Unknown vCPU specified */
> +     if (!vcpu)
> + continue;
> +
> + /* We fail only when APIC is disabled */
> + kvm_apic_set_irq(vcpu, &irq, NULL);
> + }
> + }
> +
> +ret_success:
> + return HV_STATUS_SUCCESS;
> +}
> +

I still think that splitting kvm_hv_send_ipi into three functions would
make it more readable, but that's a matter of taste of course, so I'm OK
if Radim insists otherwise.
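
For the record, the split I have in mind would look roughly like this
(names and boundaries are just a suggestion):

/* parse the non-ex input, memory or fast/register form */
static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa,
			   bool fast);
/* parse the ex input with its variable-length VP set */
static u64 kvm_hv_send_ipi_ex(struct kvm_vcpu *vcpu, u64 ingpa);
/* common tail: deliver the vector to the resolved set of vCPUs */
static u64 kvm_hv_send_ipi_to_vp_set(struct kvm *kvm, u32 vector,
				     u64 *sparse_banks, u64 valid_bank_mask,
				     bool all_cpus);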

Reviewed-by: Roman Kagan 


Re: [PATCH v4 RESEND 5/5] KVM: x86: hyperv: implement PV IPI send hypercalls

2018-08-23 Thread Roman Kagan
On Wed, Aug 22, 2018 at 12:18:32PM +0200, Vitaly Kuznetsov wrote:
> Using a hypercall for sending IPIs is faster because it allows specifying
> any number of vCPUs (even > 64 with a sparse CPU set); the whole procedure
> takes only one VMEXIT.
> 
> Current Hyper-V TLFS (v5.0b) claims that HvCallSendSyntheticClusterIpi
> hypercall can't be 'fast' (passing parameters through registers) but
> apparently this is not true: Windows always uses it as 'fast', so we need
> to support that.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  Documentation/virtual/kvm/api.txt |   8 +++
>  arch/x86/kvm/hyperv.c | 109 ++
>  arch/x86/kvm/trace.h  |  42 +++
>  arch/x86/kvm/x86.c|   1 +
>  include/uapi/linux/kvm.h  |   1 +
>  5 files changed, 161 insertions(+)

Looks like I forgot to respond to this one, sorry.

> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 7b83b176c662..832ea72d43c1 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -4690,3 +4690,11 @@ This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush
>  hypercalls:
>  HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx,
>  HvFlushVirtualAddressList, HvFlushVirtualAddressListEx.
> +
> +8.19 KVM_CAP_HYPERV_SEND_IPI
> +
> +Architectures: x86
> +
> +This capability indicates that KVM supports paravirtualized Hyper-V IPI send
> +hypercalls:
> +HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index d1a911132b59..3183cf9bcb63 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1360,6 +1360,101 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
>   ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
>  }
>  
> +static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
> +bool ex, bool fast)
> +{
> + struct kvm *kvm = current_vcpu->kvm;
> + struct hv_send_ipi_ex send_ipi_ex;
> + struct hv_send_ipi send_ipi;
> + struct kvm_vcpu *vcpu;
> + unsigned long valid_bank_mask;
> + u64 sparse_banks[64];
> + int sparse_banks_len, bank, i;
> + struct kvm_lapic_irq irq = {.delivery_mode = APIC_DM_FIXED};
> + bool all_cpus;
> +
> + if (!ex) {
> + if (!fast) {
> + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi,
> + sizeof(send_ipi))))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + sparse_banks[0] = send_ipi.cpu_mask;
> + irq.vector = send_ipi.vector;
> + } else {
> + /* 'reserved' part of hv_send_ipi should be 0 */
> + if (unlikely(ingpa >> 32 != 0))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + sparse_banks[0] = outgpa;
> + irq.vector = (u32)ingpa;
> + }
> + all_cpus = false;
> + valid_bank_mask = BIT_ULL(0);
> +
> + trace_kvm_hv_send_ipi(irq.vector, sparse_banks[0]);
> + } else {
> + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex,
> + sizeof(send_ipi_ex))))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +
> + trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
> +  send_ipi_ex.vp_set.format,
> +  send_ipi_ex.vp_set.valid_bank_mask);
> +
> + irq.vector = send_ipi_ex.vector;
> + valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
> + sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
> + sizeof(sparse_banks[0]);
> +
> + all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
> +
> + if (!sparse_banks_len)
> + goto ret_success;
> +
> + if (!all_cpus &&
> + kvm_read_guest(kvm,
> +ingpa + offsetof(struct hv_send_ipi_ex,
> + vp_set.bank_contents),
> +sparse_banks,
> +sparse_banks_len))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + }
> +
> + if ((irq.vector < HV_IPI_LOW_VECTOR) ||
> + (irq.vector > HV_IPI_HIGH_VECTOR))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +
> + if (all_cpus) {
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + /* We fail only when APIC is disabled */
> + if (!kvm_apic_set_irq(vcpu, &irq, NULL))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + 

Re: [PATCH v2 1/5] x86/kvm: rename HV_X64_MSR_APIC_ASSIST_PAGE to HV_X64_MSR_VP_ASSIST_PAGE

2018-03-07 Thread Roman Kagan
On Wed, Mar 07, 2018 at 05:19:44PM +0100, Radim Krčmář wrote:
> 2018-02-26 18:11+0100, Vitaly Kuznetsov:
> > From: Ladi Prosek 
> > 
> > The assist page has been used only for the paravirtual EOI so far, hence
> > the "APIC" in the MSR name. Renaming to match the Hyper-V TLFS where it's
> > called "Virtual VP Assist MSR".
> > 
> > Signed-off-by: Ladi Prosek 
> > Signed-off-by: Vitaly Kuznetsov 
> > ---
> >  arch/x86/include/uapi/asm/hyperv.h | 10 +-
> >  arch/x86/kvm/hyperv.c  |  8 
> >  arch/x86/kvm/lapic.h   |  2 +-
> >  arch/x86/kvm/x86.c |  2 +-
> >  4 files changed, 11 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
> > index 1c12aaf33915..45cc62352040 100644
> > --- a/arch/x86/include/uapi/asm/hyperv.h
> > +++ b/arch/x86/include/uapi/asm/hyperv.h
> > @@ -189,7 +189,7 @@
> >  #define HV_X64_MSR_EOI 0x4070
> >  #define HV_X64_MSR_ICR 0x4071
> >  #define HV_X64_MSR_TPR 0x4072
> > -#define HV_X64_MSR_APIC_ASSIST_PAGE0x4073
> > +#define HV_X64_MSR_VP_ASSIST_PAGE  0x4073
> >  
> >  /* Define synthetic interrupt controller model specific registers. */
> >  #define HV_X64_MSR_SCONTROL0x4080
> > @@ -275,10 +275,10 @@ struct hv_tsc_emulation_status {
> >  #define HVCALL_POST_MESSAGE0x005c
> >  #define HVCALL_SIGNAL_EVENT0x005d
> >  
> > -#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x0001
> > -#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT  12
> > -#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK   \
> > -   (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
> 
> Removing definitions from userspace api isn't a good idea.
> 
> I have no idea why hyper.h is a userspace api, though -- Linux doesn't
> define any of those, so we could copy the definitions to a private
> header, rename, and never look at this file again.

That was a thinko when it was moved to uapi, and it has already been
identified as a problem, so now QEMU has its own header with the
definitions it needs, and I'm unaware of any other userspace project
that depends on this stuff.  So I've been planning to remove it from
uapi but still haven't got around to posting the patch :(

Roman.


Re: [PATCH v6 3/7] KVM: x86: hyperv: consistently use 'hv_vcpu' for 'struct kvm_vcpu_hv' variables

2018-09-27 Thread Roman Kagan
On Wed, Sep 26, 2018 at 07:02:55PM +0200, Vitaly Kuznetsov wrote:
> Rename 'hv' to 'hv_vcpu' in kvm_hv_set_msr/kvm_hv_get_msr(); 'hv' is
> 'reserved' for 'struct kvm_hv' variables across the file.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 18 +++++++++---------
>  1 file changed, 9 insertions(+), 9 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v6 4/7] KVM: x86: hyperv: keep track of mismatched VP indexes

2018-09-27 Thread Roman Kagan
On Wed, Sep 26, 2018 at 07:02:56PM +0200, Vitaly Kuznetsov wrote:
> In most common cases VP index of a vcpu matches its vcpu index. Userspace
> is, however, free to set any mapping it wishes and we need to account for
> that when we need to find a vCPU with a particular VP index. To keep search
> algorithms optimal in both cases introduce 'num_mismatched_vp_indexes'
> counter showing how many vCPUs with mismatching VP index we have. In case
> the counter is zero we can assume vp_index == vcpu_idx.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/include/asm/kvm_host.h |  3 +++
>  arch/x86/kvm/hyperv.c   | 26 +++---
>  2 files changed, 26 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 09b2e3e2cf1b..711f79f1b5e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -781,6 +781,9 @@ struct kvm_hv {
>   u64 hv_reenlightenment_control;
>   u64 hv_tsc_emulation_control;
>   u64 hv_tsc_emulation_status;
> +
> + /* How many vCPUs have VP index != vCPU index */
> + atomic_t num_mismatched_vp_indexes;
>  };
>  
>  enum kvm_irqchip_mode {
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index c8764faf783b..6a19c8e3c432 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1045,11 +1045,31 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
>   struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
>  
>   switch (msr) {
> - case HV_X64_MSR_VP_INDEX:
> - if (!host || (u32)data >= KVM_MAX_VCPUS)
> + case HV_X64_MSR_VP_INDEX: {
> + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
> + int vcpu_idx = kvm_vcpu_get_idx(vcpu);
> + u32 new_vp_index = (u32)data;
> +
> + if (!host || new_vp_index >= KVM_MAX_VCPUS)
>   return 1;
> - hv_vcpu->vp_index = (u32)data;
> +
> + if (new_vp_index == hv_vcpu->vp_index)
> + return 0;
> +
> + /*
> +  * VP index is changing, increment num_mismatched_vp_indexes in
> +  * case it was equal to vcpu_idx before; on the other hand, if
> +  * the new VP index matches vcpu_idx num_mismatched_vp_indexes
> +  * needs to be decremented.

It may be worth mentioning that the initial balance is provided by
kvm_hv_vcpu_postcreate setting vp_index = vcpu_idx.
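The counter then enables a lookup fast path along these lines (a
simplified sketch, not the exact code):

static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
{
	struct kvm_vcpu *vcpu;
	int i;

	/* fast path: no vCPU has a remapped VP index */
	if (vpidx < KVM_MAX_VCPUS &&
	    !atomic_read(&kvm->arch.hyperv.num_mismatched_vp_indexes))
		return kvm_get_vcpu(kvm, vpidx);

	kvm_for_each_vcpu(i, vcpu, kvm)
		if (vcpu->arch.hyperv.vp_index == vpidx)
			return vcpu;
	return NULL;
}

so keeping it balanced from vCPU creation onwards is what makes the
fast path safe.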

> +  */
> + if (hv_vcpu->vp_index == vcpu_idx)
> + atomic_inc(&hv->num_mismatched_vp_indexes);
> + else if (new_vp_index == vcpu_idx)
> + atomic_dec(&hv->num_mismatched_vp_indexes);

> +
> + hv_vcpu->vp_index = new_vp_index;
>   break;
> + }
>   case HV_X64_MSR_VP_ASSIST_PAGE: {
>   u64 gfn;
>   unsigned long addr;

Reviewed-by: Roman Kagan 


Re: [PATCH v6 5/7] KVM: x86: hyperv: valid_bank_mask should be 'u64'

2018-09-27 Thread Roman Kagan
On Wed, Sep 26, 2018 at 07:02:57PM +0200, Vitaly Kuznetsov wrote:
> This probably doesn't matter much (KVM_MAX_VCPUS is much lower nowadays)
> but valid_bank_mask is really u64 and not unsigned long.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v6 6/7] KVM: x86: hyperv: optimize kvm_hv_flush_tlb() for vp_index == vcpu_idx case

2018-09-27 Thread Roman Kagan
On Wed, Sep 26, 2018 at 07:02:58PM +0200, Vitaly Kuznetsov wrote:
> VP index almost always matches the vCPU index, and when it does it's
> faster to walk the sparse set instead of all vCPUs.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 96 +++
>  1 file changed, 52 insertions(+), 44 deletions(-)
> 
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index eeb12eacd525..cc0535a078f7 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1277,32 +1277,37 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
>   return kvm_hv_get_msr(vcpu, msr, pdata, host);
>  }
>  
> -static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
> +static __always_inline bool hv_vcpu_in_sparse_set(struct kvm_vcpu_hv *hv_vcpu,
> +   u64 sparse_banks[],
> +   u64 valid_bank_mask)
>  {
> - int i = 0, j;
> + int bank = hv_vcpu->vp_index / 64, sbank;
>  
> - if (!(valid_bank_mask & BIT_ULL(bank_no)))
> - return -1;
> + if (bank >= 64)
> + return false;
>  
> - for (j = 0; j < bank_no; j++)
> - if (valid_bank_mask & BIT_ULL(j))
> - i++;
> + if (!(valid_bank_mask & BIT_ULL(bank)))
> + return false;
>  
> - return i;
> + /* Sparse bank number equals the number of set bits before it */
> + sbank = bitmap_weight((unsigned long *)&valid_bank_mask, bank);
> +
> + return !!(sparse_banks[sbank] & BIT_ULL(hv_vcpu->vp_index % 64));
>  }
>  
>  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
>   u16 rep_cnt, bool ex)
>  {
>   struct kvm *kvm = current_vcpu->kvm;
> - struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv;
> + struct kvm_hv *hv = &kvm->arch.hyperv;
> + struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
>   struct hv_tlb_flush_ex flush_ex;
>   struct hv_tlb_flush flush;
>   struct kvm_vcpu *vcpu;
>   unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
> - u64 valid_bank_mask = 0;
> + u64 valid_bank_mask;
>   u64 sparse_banks[64];
> - int sparse_banks_len, i;
> + int sparse_banks_len, i, bank, sbank;
>   bool all_cpus;
>  
>   if (!ex) {
> @@ -1312,6 +1317,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
>   trace_kvm_hv_flush_tlb(flush.processor_mask,
>  flush.address_space, flush.flags);
>  
> + valid_bank_mask = BIT_ULL(0);
>   sparse_banks[0] = flush.processor_mask;
>   all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
>   } else {
> @@ -1344,52 +1350,54 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
>   return HV_STATUS_INVALID_HYPERCALL_INPUT;
>   }
>  
> - cpumask_clear(&hv_current->tlb_lush);
> + /*
> +  * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
> +  * analyze it here, flush TLB regardless of the specified address space.
> +  */
> + cpumask_clear(&hv_vcpu->tlb_lush);

Maybe squash this hv_current -> hv_vcpu renaming into patch 3?
(And yes this "lush" is funny, too ;)

>  
>   if (all_cpus) {
>   kvm_make_vcpus_request_mask(kvm,
>   KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
> - NULL, &hv_current->tlb_lush);
> + NULL, &hv_vcpu->tlb_lush);
>   goto ret_success;
>   }
>  
> - kvm_for_each_vcpu(i, vcpu, kvm) {
> - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
> - int bank = hv->vp_index / 64, sbank = 0;
> -
> - /* Banks >64 can't be represented */
> - if (bank >= 64)
> - continue;
> -
> - /* Non-ex hypercalls can only address first 64 vCPUs */
> - if (!ex && bank)
> - continue;
> -
> - if (ex) {
> - /*
> -  * Check is the bank of this vCPU is in sparse
> -  * set and get the sparse bank number.
> -  */
> - sbank = get_sparse_bank_no(valid_bank_mask, bank);
> -
> - if (sbank < 0)
> - continue;
> + if (atomic_read(&hv->num_mismatched_vp_indexes)) {
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (hv_vcpu_in_sparse_set(&vcpu->arch.hyperv,
> +   sparse_banks,
> +   valid_bank_mask))
> + __set_bit(i, vcpu_bitmap);
>   }
> + goto flush_request;
> + }
>  
> - if (!(sparse_ba

Re: [PATCH v6 7/7] KVM: x86: hyperv: implement PV IPI send hypercalls

2018-09-27 Thread Roman Kagan
On Wed, Sep 26, 2018 at 07:02:59PM +0200, Vitaly Kuznetsov wrote:
> Using a hypercall for sending IPIs is faster because it allows specifying
> any number of vCPUs (even > 64 with a sparse CPU set); the whole procedure
> takes only one VMEXIT.
> 
> Current Hyper-V TLFS (v5.0b) claims that HvCallSendSyntheticClusterIpi
> hypercall can't be 'fast' (passing parameters through registers) but
> apparently this is not true: Windows always uses it as 'fast', so we need
> to support that.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  Documentation/virtual/kvm/api.txt |   7 ++
>  arch/x86/kvm/hyperv.c | 115 ++
>  arch/x86/kvm/trace.h  |  42 +++
>  arch/x86/kvm/x86.c|   1 +
>  include/uapi/linux/kvm.h  |   1 +
>  5 files changed, 166 insertions(+)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 647f94128a85..1659b75d577d 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -4772,3 +4772,10 @@ CPU when the exception is taken. If this virtual SError is taken to EL1 using
>  AArch64, this value will be reported in the ISS field of ESR_ELx.
>  
>  See KVM_CAP_VCPU_EVENTS for more details.
> +8.20 KVM_CAP_HYPERV_SEND_IPI
> +
> +Architectures: x86
> +
> +This capability indicates that KVM supports paravirtualized Hyper-V IPI send
> +hypercalls:
> +HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index cc0535a078f7..4b4a6d015ade 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1405,6 +1405,107 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
>   ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
>  }
>  
> +static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
> +bool ex, bool fast)
> +{
> + struct kvm *kvm = current_vcpu->kvm;
> + struct kvm_hv *hv = &kvm->arch.hyperv;
> + struct hv_send_ipi_ex send_ipi_ex;
> + struct hv_send_ipi send_ipi;
> + struct kvm_vcpu *vcpu;
> + unsigned long valid_bank_mask;
> + u64 sparse_banks[64];
> + int sparse_banks_len, bank, i, sbank;
> + struct kvm_lapic_irq irq = {.delivery_mode = APIC_DM_FIXED};
> + bool all_cpus;
> +
> + if (!ex) {
> + if (!fast) {
> + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi,
> + sizeof(send_ipi))))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + sparse_banks[0] = send_ipi.cpu_mask;
> + irq.vector = send_ipi.vector;
> + } else {
> + /* 'reserved' part of hv_send_ipi should be 0 */
> + if (unlikely(ingpa >> 32 != 0))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + sparse_banks[0] = outgpa;
> + irq.vector = (u32)ingpa;
> + }
> + all_cpus = false;
> + valid_bank_mask = BIT_ULL(0);
> +
> + trace_kvm_hv_send_ipi(irq.vector, sparse_banks[0]);
> + } else {
> + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex,
> + sizeof(send_ipi_ex))))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +
> + trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
> +  send_ipi_ex.vp_set.format,
> +  send_ipi_ex.vp_set.valid_bank_mask);
> +
> + irq.vector = send_ipi_ex.vector;
> + valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
> + sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
> + sizeof(sparse_banks[0]);
> +
> + all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
> +
> + if (!sparse_banks_len)
> + goto ret_success;
> +
> + if (!all_cpus &&
> + kvm_read_guest(kvm,
> +ingpa + offsetof(struct hv_send_ipi_ex,
> + vp_set.bank_contents),
> +sparse_banks,
> +sparse_banks_len))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + }
> +
> + if ((irq.vector < HV_IPI_LOW_VECTOR) ||
> + (irq.vector > HV_IPI_HIGH_VECTOR))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +
> + if (all_cpus || atomic_read(&hv->num_mismatched_vp_indexes)) {
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (all_cpus || hv_vcpu_in_sparse_set(
> + &vcpu->arch.hyperv, sparse_banks,
> + valid_bank_mask)) {
> +  

Re: [PATCH v2 1/4] x86/hyper-v: move synic/stimer control structures definitions to hyperv-tlfs.h

2018-11-26 Thread Roman Kagan
[ Sorry for having missed v1 ]

On Mon, Nov 26, 2018 at 04:47:29PM +0100, Vitaly Kuznetsov wrote:
> We implement Hyper-V SynIC and synthetic timers in KVM too so there's some
> room for code sharing.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/include/asm/hyperv-tlfs.h | 69 ++
>  drivers/hv/hv.c|  2 +-
>  drivers/hv/hyperv_vmbus.h  | 68 -
>  3 files changed, 70 insertions(+), 69 deletions(-)
> 
> diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
> index 4139f7650fe5..b032c05cd3ee 100644
> --- a/arch/x86/include/asm/hyperv-tlfs.h
> +++ b/arch/x86/include/asm/hyperv-tlfs.h
> @@ -731,6 +731,75 @@ struct hv_enlightened_vmcs {
>  #define HV_STIMER_AUTOENABLE (1ULL << 3)
>  #define HV_STIMER_SINT(config)   (__u8)(((config) >> 16) & 0x0F)
>  
> +/* Define synthetic interrupt controller flag constants. */
> +#define HV_EVENT_FLAGS_COUNT (256 * 8)
> +#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long))
> +
> +/*
> + * Synthetic timer configuration.
> + */
> +union hv_stimer_config {
> + u64 as_uint64;
> + struct {
> + u64 enable:1;
> + u64 periodic:1;
> + u64 lazy:1;
> + u64 auto_enable:1;
> + u64 apic_vector:8;
> + u64 direct_mode:1;
> + u64 reserved_z0:3;
> + u64 sintx:4;
> + u64 reserved_z1:44;
> + };
> +};
> +
> +
> +/* Define the synthetic interrupt controller event flags format. */
> +union hv_synic_event_flags {
> + unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT];
> +};
> +
> +/* Define SynIC control register. */
> +union hv_synic_scontrol {
> + u64 as_uint64;
> + struct {
> + u64 enable:1;
> + u64 reserved:63;
> + };
> +};
> +
> +/* Define synthetic interrupt source. */
> +union hv_synic_sint {
> + u64 as_uint64;
> + struct {
> + u64 vector:8;
> + u64 reserved1:8;
> + u64 masked:1;
> + u64 auto_eoi:1;
> + u64 reserved2:46;
> + };
> +};
> +
> +/* Define the format of the SIMP register */
> +union hv_synic_simp {
> + u64 as_uint64;
> + struct {
> + u64 simp_enabled:1;
> + u64 preserved:11;
> + u64 base_simp_gpa:52;
> + };
> +};
> +
> +/* Define the format of the SIEFP register */
> +union hv_synic_siefp {
> + u64 as_uint64;
> + struct {
> + u64 siefp_enabled:1;
> + u64 preserved:11;
> + u64 base_siefp_gpa:52;
> + };
> +};
> +
>  struct hv_vpset {
>   u64 format;
>   u64 valid_bank_mask;

I personally tend to prefer masks over bitfields, so I'd rather do the
consolidation in the opposite direction: use the definitions in
hyperv-tlfs.h and replace those unions/bitfields elsewhere.  (I vaguely
remember posting such a patchset a couple of years ago but I lacked the
motivation to complete it).

Roman.


Re: [PATCH v2 3/4] x86/kvm/hyper-v: direct mode for synthetic timers

2018-11-27 Thread Roman Kagan
On Mon, Nov 26, 2018 at 04:47:31PM +0100, Vitaly Kuznetsov wrote:
> Turns out Hyper-V on KVM (as of 2016) will only use synthetic timers
> if direct mode is available. With direct mode we notify the guest by
> asserting APIC irq instead of sending a SynIC message.
> 
> The implementation uses existing vec_bitmap for letting lapic code
> know that we're interested in the particular IRQ's EOI request. We assume
> that the same APIC irq won't be used by the guest for both direct mode
> stimer and as sint source (especially with AutoEOI semantics). It is
> unclear how things should be handled if that's not true.
> 
> Direct mode is also somewhat less expensive; in my testing
> stimer_send_msg() takes not less than 1500 cpu cycles and
> stimer_notify_direct() can usually be done in 300-400. WS2016 without
> Hyper-V, however, always sticks to the non-direct version.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
> - Changes since v1: avoid open-coding stimer_mark_pending() in
>   kvm_hv_synic_send_eoi() [Paolo Bonzini]
> ---
>  arch/x86/kvm/hyperv.c| 67 +++-
>  arch/x86/kvm/trace.h | 10 +++---
>  arch/x86/kvm/x86.c   |  1 +
>  include/uapi/linux/kvm.h |  1 +
>  4 files changed, 67 insertions(+), 12 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v2 3/4] x86/kvm/hyper-v: direct mode for synthetic timers

2018-11-27 Thread Roman Kagan
On Mon, Nov 26, 2018 at 05:44:24PM +0100, Paolo Bonzini wrote:
> On 26/11/18 16:47, Vitaly Kuznetsov wrote:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 5cd5647120f2..b21b5ceb8d26 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -2997,6 +2997,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> > case KVM_CAP_HYPERV_TLBFLUSH:
> > case KVM_CAP_HYPERV_SEND_IPI:
> > case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
> > +   case KVM_CAP_HYPERV_STIMER_DIRECT:
> > case KVM_CAP_PCI_SEGMENT:
> > case KVM_CAP_DEBUGREGS:
> > case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 2b7a652c9fa4..b8da14cee8e5 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -975,6 +975,7 @@ struct kvm_ppc_resize_hpt {
> >  #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
> >  #define KVM_CAP_EXCEPTION_PAYLOAD 164
> >  #define KVM_CAP_ARM_VM_IPA_SIZE 165
> > +#define KVM_CAP_HYPERV_STIMER_DIRECT 166
> 
> I wonder if all these capabilities shouldn't be replaced by a single
> KVM_GET_HYPERV_SUPPORTED_CPUID ioctl, or something like that.

Hmm, why?  Are we running short of cap numbers?

Capabilities are a well-established and unambiguous negotiation
mechanism, why invent another one?  Besides, not all features map
conveniently onto cpuid bits, e.g. currently we have two versions of
SynIC support, which differ in the way the userspace deals with it, but
not in the cpuid bits we expose in the guest.  IMO such an ioctl would
bring more complexity rather than less.
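
Concretely, feature negotiation with a capability is a one-liner for
userspace; a sketch (vm_fd is assumed to be an open VM descriptor):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int hv_stimer_direct_supported(int vm_fd)
{
	/* > 0 means the kernel implements the feature */
	return ioctl(vm_fd, KVM_CHECK_EXTENSION,
		     KVM_CAP_HYPERV_STIMER_DIRECT) > 0;
}

The VMM then decides on its own which CPUID bits to expose to the
guest.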

Roman.


Re: [PATCH v2 4/4] x86/kvm/hyper-v: avoid open-coding stimer_mark_pending() in kvm_hv_notify_acked_sint()

2018-11-27 Thread Roman Kagan
On Mon, Nov 26, 2018 at 04:47:32PM +0100, Vitaly Kuznetsov wrote:
> The stimers_pending optimization only helps us avoid multiple
> kvm_make_request() calls.  This doesn't happen very often and these
> calls are very cheap in the first place, so remove the open-coded version
> of stimer_mark_pending() from kvm_hv_notify_acked_sint().

Frankly speaking, I've yet to see a guest that configures more than one
SynIC timer.  So it was indeed a bit of overengineering in the first
place.

> Suggested-by: Paolo Bonzini 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 12 +++-
>  1 file changed, 3 insertions(+), 9 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v2 1/4] x86/hyper-v: move synic/stimer control structures definitions to hyperv-tlfs.h

2018-11-27 Thread Roman Kagan
On Tue, Nov 27, 2018 at 02:10:49PM +0100, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> > On Mon, Nov 26, 2018 at 04:47:29PM +0100, Vitaly Kuznetsov wrote:
> > I personally tend to prefer masks over bitfields, so I'd rather do the
> > consolidation in the opposite direction: use the definitions in
> > hyperv-tlfs.h and replace those unions/bitfields elsewhere.  (I vaguely
> > remember posting such a patchset a couple of years ago but I lacked the
> > motivation to complete it).
> 
> Are there any known advantages of using masks over bitfields or the
> resulting binary code is the same?

Strictly speaking, bitwise ops are portable while bitfields are not, but
I guess this is not much of an issue with gcc, which can be depended on
to produce the right thing.

I came to dislike bitfields for the false feeling of atomicity of
assignment, while most of the time they are read-modify-write operations.

And no, I don't feel strong about it, so if nobody backs me on this I
give up :)
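
For illustration, the two styles side by side on the stimer config (the
mask name is made up for the example):

/* bitfield style: the assignment looks atomic but compiles to a
 * read-modify-write of the containing u64 */
static void stimer_enable_bitfield(union hv_stimer_config *config)
{
	config->enable = 1;
}

/* mask style: the read-modify-write is explicit in the source */
#define HV_STIMER_CONFIG_ENABLE	BIT_ULL(0)

static void stimer_enable_mask(u64 *config)
{
	*config |= HV_STIMER_CONFIG_ENABLE;
}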

Roman.


Re: [PATCH v2 3/4] x86/kvm/hyper-v: direct mode for synthetic timers

2018-11-27 Thread Roman Kagan
On Tue, Nov 27, 2018 at 02:54:21PM +0100, Paolo Bonzini wrote:
> On 27/11/18 09:37, Roman Kagan wrote:
> > On Mon, Nov 26, 2018 at 05:44:24PM +0100, Paolo Bonzini wrote:
> >> On 26/11/18 16:47, Vitaly Kuznetsov wrote:
> >>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >>> index 5cd5647120f2..b21b5ceb8d26 100644
> >>> --- a/arch/x86/kvm/x86.c
> >>> +++ b/arch/x86/kvm/x86.c
> >>> @@ -2997,6 +2997,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> >>>   case KVM_CAP_HYPERV_TLBFLUSH:
> >>>   case KVM_CAP_HYPERV_SEND_IPI:
> >>>   case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
> >>> + case KVM_CAP_HYPERV_STIMER_DIRECT:
> >>>   case KVM_CAP_PCI_SEGMENT:
> >>>   case KVM_CAP_DEBUGREGS:
> >>>   case KVM_CAP_X86_ROBUST_SINGLESTEP:
> >>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> >>> index 2b7a652c9fa4..b8da14cee8e5 100644
> >>> --- a/include/uapi/linux/kvm.h
> >>> +++ b/include/uapi/linux/kvm.h
> >>> @@ -975,6 +975,7 @@ struct kvm_ppc_resize_hpt {
> >>>  #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163
> >>>  #define KVM_CAP_EXCEPTION_PAYLOAD 164
> >>>  #define KVM_CAP_ARM_VM_IPA_SIZE 165
> >>> +#define KVM_CAP_HYPERV_STIMER_DIRECT 166
> >>
> >> I wonder if all these capabilities shouldn't be replaced by a single
> >> KVM_GET_HYPERV_SUPPORTED_CPUID ioctl, or something like that.
> > 
> > Hmm, why?  Are we running short of cap numbers?
> > 
> > Capabilities are a well-established and unambiguous negotiation
> > mechanism, why invent another one?  Besides, not all features map
> > conveniently onto cpuid bits, e.g. currently we have two versions of
> > SynIC support, which differ in the way the userspace deals with it, but
> > not in the cpuid bits we expose in the guest.  IMO such an ioctl would
> > bring more complexity rather than less.
> 
> Yes, in that case (for bugfixes basically) you'd have to use
> capabilities.  But if the feature is completely hidden to userspace
> except for the CPUID bit, it seems like a ioctl would be more consistent
> with the rest of the KVM API.

So there will be some features that are more equal than others?

I think that it's the current scheme which is more consistent: there are
features that are implemented in KVM, and there are caps for negotiating
them with userspace, and then -- separately -- there's a mix of bits to
show to the guest in response to CPUID, which only the userspace has to
bother with.

Thanks,
Roman.


Re: [PATCH v2 1/4] x86/hyper-v: move synic/stimer control structures definitions to hyperv-tlfs.h

2018-11-28 Thread Roman Kagan
On Wed, Nov 28, 2018 at 02:07:42PM +0100, Thomas Gleixner wrote:
> On Wed, 28 Nov 2018, Vitaly Kuznetsov wrote:
> 
> > Nadav Amit  writes:
> > 
> > >
> > > On a different note: how come all of the hyper-v structs are not marked
> > > with the "packed" attribute?
> > 
> > "packed" should not be needed with proper padding; I vaguely remember
> > someone (from x86@?) arguing _against_ "packed".
> 
> Packed needs to be used, when describing fixed format data structures in
> hardware or other ABIs, so the compiler cannot put alignment holes into
> them.
> 
> Using packed for generic data structures might result in suboptimal layouts
> and prevents layout randomization.

Sorry for my illiteracy, I didn't watch this field closely so I had to
google about layout randomization.  Is my understanding correct that one
can't rely any more on the compiler to naturally align the struct
members with minimal padding?  My life will never be the same...

Roman.


Re: [PATCH] x86/hyper-v: define structures from TLFS as packed

2018-11-30 Thread Roman Kagan
On Fri, Nov 30, 2018 at 01:15:11PM +0100, Vitaly Kuznetsov wrote:
> Without 'packed', the compiler is free to add padding and to re-order
> structure fields for randomization/optimization.  Since structures from
> hyperv-tlfs.h are used for hypervisor-guest communication, we need to
> ultimately forbid such practices.

Note that __packed also reduces the structure alignment to 1, which is
not necessarily what you want.

E.g. some of these structures are passed by pointer to the hypercall,
> which requires its arguments to be 8-byte-aligned.  I'm also not sure
> that passing an unaligned argument to [rw]msr is OK; need to double-check.
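
The alignment side effect is easy to demonstrate with a user-space toy
(C11, nothing Hyper-V specific):

#include <stdio.h>
#include <stdint.h>

struct status { uint64_t val; };
struct status_packed { uint64_t val; } __attribute__((packed));

int main(void)
{
	/* prints "8 1": __packed drops the required alignment to 1 */
	printf("%zu %zu\n", _Alignof(struct status),
	       _Alignof(struct status_packed));
	return 0;
}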

Roman.

> Suggested-by: Nadav Amit 
> Signed-off-by: Vitaly Kuznetsov 
> ---
> - This is a follow-up to my "[PATCH v2 0/4] x86/kvm/hyper-v: Implement
>  Direct Mode for synthetic timers" series, as suggested by Thomas I'm
>  routing it to KVM tree to avoid merge conflicts.
> ---
>  arch/x86/include/asm/hyperv-tlfs.h | 50 +++---
>  1 file changed, 25 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
> index ebfed56976d2..6a60fa17f6f2 100644
> --- a/arch/x86/include/asm/hyperv-tlfs.h
> +++ b/arch/x86/include/asm/hyperv-tlfs.h
> @@ -271,7 +271,7 @@ union hv_x64_msr_hypercall_contents {
>   u64 enable:1;
>   u64 reserved:11;
>   u64 guest_physical_address:52;
> - };
> + } __packed;
>  };
>  
>  /*
> @@ -283,7 +283,7 @@ struct ms_hyperv_tsc_page {
>   volatile u64 tsc_scale;
>   volatile s64 tsc_offset;
>   u64 reserved2[509];
> -};
> +}  __packed;
>  
>  /*
>   * The guest OS needs to register the guest ID with the hypervisor.
> @@ -324,7 +324,7 @@ struct hv_reenlightenment_control {
>   __u64 enabled:1;
>   __u64 reserved2:15;
>   __u64 target_vp:32;
> -};
> +}  __packed;
>  
>  #define HV_X64_MSR_TSC_EMULATION_CONTROL 0x4107
>  #define HV_X64_MSR_TSC_EMULATION_STATUS  0x4108
> @@ -332,12 +332,12 @@ struct hv_reenlightenment_control {
>  struct hv_tsc_emulation_control {
>   __u64 enabled:1;
>   __u64 reserved:63;
> -};
> +} __packed;
>  
>  struct hv_tsc_emulation_status {
>   __u64 inprogress:1;
>   __u64 reserved:63;
> -};
> +} __packed;
>  
>  #define HV_X64_MSR_HYPERCALL_ENABLE  0x0001
>  #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT  12
> @@ -409,7 +409,7 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
>   __u32 res1;
>   __u64 tsc_scale;
>   __s64 tsc_offset;
> -} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
> +}  __packed HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
>  
>  /* Define the number of synthetic interrupt sources. */
>  #define HV_SYNIC_SINT_COUNT  (16)
> @@ -466,7 +466,7 @@ union hv_message_flags {
>   struct {
>   __u8 msg_pending:1;
>   __u8 reserved:7;
> - };
> + } __packed;
>  };
>  
>  /* Define port identifier type. */
> @@ -488,7 +488,7 @@ struct hv_message_header {
>   __u64 sender;
>   union hv_port_id port;
>   };
> -};
> +} __packed;
>  
>  /* Define synthetic interrupt controller message format. */
>  struct hv_message {
> @@ -496,12 +496,12 @@ struct hv_message {
>   union {
>   __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
>   } u;
> -};
> +} __packed;
>  
>  /* Define the synthetic interrupt message page layout. */
>  struct hv_message_page {
>   struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
> -};
> +} __packed;
>  
>  /* Define timer message payload structure. */
>  struct hv_timer_message_payload {
> @@ -509,7 +509,7 @@ struct hv_timer_message_payload {
>   __u32 reserved;
>   __u64 expiration_time;  /* When the timer expired */
>   __u64 delivery_time;/* When the message was delivered */
> -};
> +} __packed;
>  
>  /* Define virtual processor assist page structure. */
>  struct hv_vp_assist_page {
> @@ -519,7 +519,7 @@ struct hv_vp_assist_page {
>   __u64 nested_enlightenments_control[2];
>   __u32 enlighten_vmentry;
>   __u64 current_nested_vmcs;
> -};
> +} __packed;
>  
>  struct hv_enlightened_vmcs {
>   u32 revision_id;
> @@ -693,7 +693,7 @@ struct hv_enlightened_vmcs {
>   u32 nested_flush_hypercall:1;
>   u32 msr_bitmap:1;
>   u32 reserved:30;
> - } hv_enlightenments_control;
> + }  __packed hv_enlightenments_control;
>   u32 hv_vp_id;
>  
>   u64 hv_vm_id;
> @@ -703,7 +703,7 @@ struct hv_enlightened_vmcs {
>   u64 padding64_5[7];
>   u64 xss_exit_bitmap;
>   u64 padding64_6[7];
> -};
> +} __packed;
>  
>  #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE  0
>  #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0)
> @@ -744,7 +744,7 @@ union hv_stimer_config {
>   u64 reserved_z0:3;
>   u64 sintx:4;
>   u64 reserved_z1:44;
> - };
> + } __packed;

Re: [PATCH] x86/hyper-v: define structures from TLFS as packed

2018-11-30 Thread Roman Kagan
On Fri, Nov 30, 2018 at 02:44:54PM +0100, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> 
> > On Fri, Nov 30, 2018 at 01:15:11PM +0100, Vitaly Kuznetsov wrote:
> >> Without 'packed', the compiler is free to add padding and to re-order
> >> structure fields for randomization/optimization.  Since structures from
> >> hyperv-tlfs.h are used for hypervisor-guest communication, we need to
> >> ultimately forbid such practices.
> >
> > Note that __packed also reduces the structure alignment to 1, which is
> > not necessarily what you want.
> >
> > E.g. some of these structures are passed by pointer to the hypercall,
> > which requires its arguments to be 8byte-aligned.
> 
> Hm,
> 
> I thought we always take precautions for Hyper-V hypercall arguments, in
> particular
> 
> PV IPI/TLB flush use pre-allocated hyperv_pcpu_input_arg,
> hv_post_message() uses pre-allocated message page, other call sites use
> fast hypercalls where we use registers.

Looks so indeed.

> I also checked this patch before sending out, WS2016 guest boots without
> issues. Any particular places you're worried about?

It's Linux guests on Hyper-V that need to be checked.

> >  I'm also not sure
> > that passing unaligned argument to [rw]msr is ok, need to
> > double-check.
> 
> My understanding is that the rdmsr/wrmsr instructions are registers-only.

Indeed.

> We can, of course, just add __aligned(8) to some structures but I'd like
> to find the reason first.

I guess you're right, in the current code the relevant structures are
made sufficiently aligned by other means.  False alarm, sorry.
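
To illustrate the alignment point, a minimal sketch (assuming the
kernel's __packed/__aligned attribute macros; neither struct is from the
patch):

#include <linux/types.h>

/* __packed removes padding but also drops the natural alignment to 1,
 * so an instance may legally land on an odd address: */
struct hv_arg_packed {
	u64 address_space;
	u64 flags;
} __packed;			/* __alignof__() == 1 */

/* adding __aligned(8) keeps the fixed layout and restores the 8-byte
 * alignment a hypercall argument would need: */
struct hv_arg_packed_aligned {
	u64 address_space;
	u64 flags;
} __packed __aligned(8);	/* __alignof__() == 8 */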

Roman.


Re: [PATCH v2] x86/hyper-v: Mark TLFS structures packed

2018-12-02 Thread Roman Kagan
On Mon, Dec 03, 2018 at 12:35:35AM +0100, Vitaly Kuznetsov wrote:
> Nadav Amit  writes:
> 
> [skip]
> 
> >
> > Having said that, something else is sort of strange in the TLFS definitions,
> > I think (I really know little about this whole protocol). Look at the
> > following definitions from hyperv-tlfs.h:
> >
> >> struct hv_vpset {
> >> u64 format;
> >> u64 valid_bank_mask;
> >> u64 bank_contents[];
> >> };
> >> 
> >> struct hv_tlb_flush_ex {
> >> u64 address_space;
> >> u64 flags;
> >> struct hv_vpset hv_vp_set;
> >> u64 gva_list[];
> >> };
> >
> > It seems you have two flexible array members at the end of hv_tlb_flush_ex.
> > This causes bank_contents[x] and gva_list[x] to overlap. So unless they have
> > the same meaning, this asks for trouble IMHO.
> >
> 
> This is weird but intentional :-) We're just following Hyper-V spec
> here.
> 
> E.g. HvFlushVirtualAddressListEx hypercall has the following input ABI:
> 
> [Fixed len head][[Fixed len VP set spec]Var len VP set][Var len addr List]
> 
> "Fixed len VP set spec" defines the true length of "Var len VP set" and
> "Address List" starts right after that. The length of the whole
> structure is also known.
> 
> So bank_contents[] and gva_list[] do overlap (and have different
> meaning). We take special precautions when forming the structure
> (e.g. fill_gva_list() takes 'offset').

This basically means that the argument of this hypercall can't be
represented by a C struct.  gva_list just can't be used.  So I'd rather
remove it from the struct (but leave a comment to that end perhaps), and
construct the message in place (as is done now anyway).
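
A sketch of what constructing the message in place looks like, given the
structs quoted above (the helper and its arguments are made up for
illustration):

#include <linux/string.h>
#include <linux/types.h>

/* gva_list[] has no fixed offset: it starts right after however many
 * bank_contents[] qwords the VP set occupies, so the buffer is filled
 * with an explicit running offset instead of through the struct. */
static size_t example_build_flush_ex(u8 *buf,
				     const struct hv_tlb_flush_ex *head,
				     const u64 *banks, int nbanks,
				     const u64 *gvas, int ngvas)
{
	size_t offset = sizeof(*head);	/* fixed head + fixed VP-set spec */

	memcpy(buf, head, offset);
	memcpy(buf + offset, banks, nbanks * sizeof(u64)); /* bank_contents[] */
	offset += nbanks * sizeof(u64);
	memcpy(buf + offset, gvas, ngvas * sizeof(u64));   /* the GVA list */
	return offset + ngvas * sizeof(u64);
}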

Roman.


Re: [PATCH v2 3/4] x86/kvm/hyper-v: direct mode for synthetic timers

2018-12-03 Thread Roman Kagan
On Mon, Nov 26, 2018 at 04:47:31PM +0100, Vitaly Kuznetsov wrote:
> @@ -379,6 +398,14 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int 
> vector)
>   for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
>   if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
>   kvm_hv_notify_acked_sint(vcpu, i);
> +
> + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) {
> + stimer = &hv_vcpu->stimer[i];
> + if (stimer->msg_pending && stimer->config.enable &&
> + stimer->config.direct_mode &&
> + stimer->config.apic_vector == vector)
> + stimer_mark_pending(stimer, false);
> + }
>  }

While debugging another issue with synic timers, it just occurred to me
that with direct timers no extra processing is necessary on EOI: unlike
traditional synic timers which may have failed to deliver a message and
want to be notified when they can retry, direct timers just set the irq
directly in the apic.

So this hunk shouldn't be needed, should it?

Roman.


Re: [PATCH v2 2/5] KVM: x86: hyperv: introduce vp_index_to_vcpu_idx mapping

2018-06-29 Thread Roman Kagan
On Fri, Jun 29, 2018 at 01:37:44PM +0200, Vitaly Kuznetsov wrote:
> The problem we're trying to solve here is: with PV TLB flush and IPI we
> need to walk through the supplied list of VP_INDEXes and get VCPU
> ids. Usually they match. But in case they don't [...]

Why wouldn't they *in practice*?  Only if the userspace wanted to be
funny and assigned VP_INDEXes randomly?  I'm not sure we need to
optimize for this case.

Note that the userspace can actually do nasty things with these
VP_INDEXes, like, say, have them non-unique.  We need to be resilient to
it, but don't need to optimize for it.

I think I'd rather have a warning in kvm_hv_set_msr if the VP_INDEX
being assigned is not equal to the vcpu index, and start worrying about
optimization only if this warning starts being triggered by real
hypervisor applications.
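
Concretely, such a warning could look like this (a sketch; the helper
name is made up, kvm_vcpu_get_idx() is the existing list-walking helper):

#include <linux/kernel.h>
#include <linux/kvm_host.h>

/* Hypothetical helper to be called from kvm_hv_set_msr() when handling
 * HV_X64_MSR_VP_INDEX: warn once if userspace assigns a VP_INDEX that
 * differs from the vcpu index, making vpidx lookups take the slow path. */
static void example_warn_vp_index(struct kvm_vcpu *vcpu, u64 vp_index)
{
	if (vp_index != kvm_vcpu_get_idx(vcpu))
		pr_warn_once("KVM: VP_INDEX %llu != vcpu index %d\n",
			     vp_index, kvm_vcpu_get_idx(vcpu));
}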

Anyway I don't see an urgent need to bloat this patchset with optimizing
this translation; it can be done separately, if needed.

Roman.


Re: [PATCH v2 2/5] KVM: x86: hyperv: introduce vp_index_to_vcpu_idx mapping

2018-06-29 Thread Roman Kagan
On Fri, Jun 29, 2018 at 03:10:14PM +0200, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> 
> > On Fri, Jun 29, 2018 at 01:37:44PM +0200, Vitaly Kuznetsov wrote:
> >> The problem we're trying to solve here is: with PV TLB flush and IPI we
> >> need to walk through the supplied list of VP_INDEXes and get VCPU
> >> ids. Usually they match. But in case they don't [...]
> >
> > Why wouldn't they *in practice*?  Only if the userspace wanted to be
> > funny and assigned VP_INDEXes randomly?  I'm not sure we need to
> > optimize for this case.
> 
> Can someone please remind me why we allow userspace to change it in the
> first place?

I can ;)

We used not to, and reported KVM's vcpu index as the VP_INDEX.  However,
later we realized that VP_INDEX needed to be persistent across
migrations and otherwise also known to userspace.  Relying on the kernel
to always initialize its indices in the same order was unacceptable, and
we came up with no better way of synchronizing VP_INDEX between the
userspace and the kernel than to let the former set it explicitly.

However, this is basically a future-proofing feature; in practice, both
QEMU and KVM initialize their indices in the same order.

Roman.


Re: [PATCH v3 3/5] KVM: x86: hyperv: use get_vcpu_by_vpidx() in kvm_hv_flush_tlb()

2018-06-29 Thread Roman Kagan
On Fri, Jun 29, 2018 at 04:14:53PM +0200, Vitaly Kuznetsov wrote:
> VP_INDEX almost always matches VCPU id and get_vcpu_by_vpidx() is fast,
> use it instead of traversing full vCPU list every time.
> 
> To support the change switch kvm_make_vcpus_request_mask() to checking
> vcpu_id instead of vcpu index,

I'm afraid you can't do this: vcpu_id (== apic id) can be sparse, i.e.
it's not very well suited for bitmaps and can exceed the max number of
vcpus.
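
To make the sizing point concrete, a sketch using the kernel's bitmap
macros (the request-mask names are made up):

#include <linux/bitmap.h>
#include <linux/kvm_host.h>

/* vcpu indices are dense, so this is enough for a per-vcpu request mask: */
DECLARE_BITMAP(req_by_vcpu_idx, KVM_MAX_VCPUS);

/* vcpu_id is the APIC ID: it may be sparse and exceed KVM_MAX_VCPUS, so
 * a mask indexed by it has to cover the whole ID space instead: */
DECLARE_BITMAP(req_by_vcpu_id, KVM_MAX_VCPU_ID);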

Roman.


Re: [PATCH v2 2/5] KVM: x86: hyperv: introduce vp_index_to_vcpu_idx mapping

2018-06-29 Thread Roman Kagan
On Fri, Jun 29, 2018 at 05:25:56PM +0200, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> 
> > On Fri, Jun 29, 2018 at 03:10:14PM +0200, Vitaly Kuznetsov wrote:
> >> Roman Kagan  writes:
> >> 
> >> > On Fri, Jun 29, 2018 at 01:37:44PM +0200, Vitaly Kuznetsov wrote:
> >> >> The problem we're trying to solve here is: with PV TLB flush and IPI we
> >> >> need to walk through the supplied list of VP_INDEXes and get VCPU
> >> >> ids. Usually they match. But in case they don't [...]
> >> >
> >> > Why wouldn't they *in practice*?  Only if the userspace wanted to be
> >> > funny and assigned VP_INDEXes randomly?  I'm not sure we need to
> >> > optimize for this case.
> >> 
> >> Can someone please remind me why we allow userspace to change it in the
> >> first place?
> >
> > I can ;)
> >
> > We used not to, and reported KVM's vcpu index as the VP_INDEX.  However,
> > later we realized that VP_INDEX needed to be persistent across
> > migrations and otherwise also known to userspace.  Relying on the kernel
> > to always initialize its indices in the same order was unacceptable, and
> > we came up with no better way of synchronizing VP_INDEX between the
> > userspace and the kernel than to let the former set it explicitly.
> >
> > However, this is basically a future-proofing feature; in practice, both
> > QEMU and KVM initialize their indices in the same order.
> 
> 
> Thanks!
> 
> But in the theoretical case when these indices start to differ after
> migration, users will notice a slowdown which will be hard to explain,
> right?

That's exactly why I suggested a warning on VP_INDEX != vcpu index in
kvm_hv_set_msr.

> Does it justify the need for vp_idx_to_vcpu_idx?

I'd personally prefer being pointed at a scenario where this becomes
relevant first.

Roman.


Re: [PATCH v3 3/5] KVM: x86: hyperv: use get_vcpu_by_vpidx() in kvm_hv_flush_tlb()

2018-06-29 Thread Roman Kagan
On Fri, Jun 29, 2018 at 05:21:47PM +0200, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> 
> > On Fri, Jun 29, 2018 at 04:14:53PM +0200, Vitaly Kuznetsov wrote:
> >> VP_INDEX almost always matches VCPU id and get_vcpu_by_vpidx() is fast,
> >> use it instead of traversing full vCPU list every time.
> >> 
> >> To support the change switch kvm_make_vcpus_request_mask() to checking
> >> vcpu_id instead of vcpu index,
> >
> > I'm afraid you can't do this: vcpu_id (== apic id) can be sparse, i.e.
> > it's not very well suited for bitmaps and can exceed the max number of
> > vcpus.
> 
> True. The bitmap should be of KVM_MAX_VCPU_ID size, not
> KVM_MAX_VCPUS.
> 
> Unfortunately there's no convenient way to get VCPU idx from VCPU
> id, kvm_vcpu_get_idx() just walks the whole list :-( I see two possible
> options:
> 1) Add vcpu_idx fields to struct kvm_vcpu
> 2) Keep the change expecting masks of KVM_MAX_VCPU_ID in
> kvm_make_vcpus_request_mask(). KVM_MAX_VCPU_ID is currently 1023 so our
> bitmaps will be 16 longs long. Not sure if it's too much.

3) rework get_vcpu_by_vpidx into get_vcpu_idx_by_vpidx followed by
get_cpu, and use the former for your purposes
4) duplicate get_vcpu_by_vpidx logic in get_vcpu_idx_by_vpidx

Roman.

P.S. I'm starting to wonder how safe this get_vcpu_* thing is WRT vcpu
removal, but that's a different story anyway.


Re: [PATCH v3 3/5] KVM: x86: hyperv: use get_vcpu_by_vpidx() in kvm_hv_flush_tlb()

2018-06-29 Thread Roman Kagan
On Fri, Jun 29, 2018 at 07:26:36PM +0300, Roman Kagan wrote:
> On Fri, Jun 29, 2018 at 05:21:47PM +0200, Vitaly Kuznetsov wrote:
> > Roman Kagan  writes:
> > 
> > > On Fri, Jun 29, 2018 at 04:14:53PM +0200, Vitaly Kuznetsov wrote:
> > >> VP_INDEX almost always matches VCPU id and get_vcpu_by_vpidx() is fast,
> > >> use it instead of traversing full vCPU list every time.
> > >> 
> > >> To support the change switch kvm_make_vcpus_request_mask() to checking
> > >> vcpu_id instead of vcpu index,
> > >
> > > I'm afraid you can't do this: vcpu_id (== apic id) can be sparse, i.e.
> > > it's not very well suited for bitmaps and can exceed the max number of
> > > vcpus.
> > 
> > True. The bitmap should be of KVM_MAX_VCPU_ID size, not
> > KVM_MAX_VCPUS.
> > 
> > Unfortunately there's no convenient way to get VCPU idx from VCPU
> > id, kvm_vcpu_get_idx() just walks the whole list :-( I see two possible
> > options:
> > 1) Add vcpu_idx fields to struct kvm_vcpu
> > 2) Keep the change expecting masks of KVM_MAX_VCPU_ID in
> > kvm_make_vcpus_request_mask(). KVM_MAX_VCPU_ID is currently 1023 so our
> > bitmaps will be 16 longs long. Not sure if it's too much.
> 
> 3) rework get_vcpu_by_vpidx into get_vcpu_idx_by_vpidx followed by
> get_cpu, and use the former for your purposes

s/get_cpu/kvm_get_vcpu/ of course

> 4) duplicate get_vcpu_by_vpidx logic in get_vcpu_idx_by_vpidx
> 
> Roman.
> 
> P.S. I'm starting to wonder how safe this get_vcpu_* thing is WRT vcpu
> removal, but that's a different story anyway.


Re: [PATCH v4 1/5] KVM: x86: hyperv: enforce vp_index < KVM_MAX_VCPUS

2018-07-09 Thread Roman Kagan
On Tue, Jul 03, 2018 at 03:42:02PM +0200, Vitaly Kuznetsov wrote:
> Hyper-V TLFS (5.0b) states:
> 
> > Virtual processors are identified by using an index (VP index). The
> > maximum number of virtual processors per partition supported by the
> > current implementation of the hypervisor can be obtained through CPUID
> > leaf 0x40000005. A virtual processor index must be less than the
> > maximum number of virtual processors per partition.
> 
> Forbid userspace to set VP_INDEX above KVM_MAX_VCPUS. get_vcpu_by_vpidx()
> can now be optimized to bail early when supplied vpidx is >= KVM_MAX_VCPUS.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v4 2/5] KVM: x86: hyperv: optimize 'all cpus' case in kvm_hv_flush_tlb()

2018-07-09 Thread Roman Kagan
On Tue, Jul 03, 2018 at 03:42:03PM +0200, Vitaly Kuznetsov wrote:
> We can use 'NULL' to represent 'all cpus' case in
> kvm_make_vcpus_request_mask() and avoid building vCPU mask with
> all vCPUs.
> 
> Suggested-by: Radim Krčmář 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 42 +++---
>  virt/kvm/kvm_main.c   |  6 ++
>  2 files changed, 25 insertions(+), 23 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v4 3/5] KVM: x86: hyperv: use get_vcpu_by_vpidx() in kvm_hv_flush_tlb()

2018-07-09 Thread Roman Kagan
On Tue, Jul 03, 2018 at 03:42:04PM +0200, Vitaly Kuznetsov wrote:
> VP_INDEX almost always matches VCPU id and get_vcpu_by_vpidx() is fast,
> use it instead of traversing full vCPU list every time.
> 
> To support the change split off get_vcpu_idx_by_vpidx() from
> get_vcpu_by_vpidx().
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 78 
> ---
>  1 file changed, 31 insertions(+), 47 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v4 4/5] x86/hyper-v: rename ipi_arg_{ex,non_ex} structures

2018-07-09 Thread Roman Kagan
On Tue, Jul 03, 2018 at 03:42:05PM +0200, Vitaly Kuznetsov wrote:
> These structures are going to be used from KVM code so let's make
> their names reflect their Hyper-V origin.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/hyperv/hv_apic.c  | 12 ++--
>  arch/x86/include/asm/hyperv-tlfs.h | 16 +---
>  2 files changed, 15 insertions(+), 13 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v2 2/5] KVM: x86: hyperv: introduce vp_index_to_vcpu_idx mapping

2018-06-29 Thread Roman Kagan
On Thu, Jun 28, 2018 at 03:53:10PM +0200, Vitaly Kuznetsov wrote:
> While it is easy to get VP index from vCPU index the reverse task is hard.
> Basically, to solve it we have to walk all vCPUs checking if their VP index
> matches. For hypercalls like HvFlushVirtualAddress{List,Space}* and the
> upcoming HvSendSyntheticClusterIpi* where a single CPU may be specified in
> the whole set this is obviously sub-optimal.
> 
> As VP index can be set to anything <= U32_MAX by userspace using plain
> [0..MAX_VP_INDEX] array is not a viable option. Use condensed sorted
> array with logarithmic search complexity instead. Use RCU to make read
> access as fast as possible and maintain atomicity of updates.

Quoting TLFS 5.0C section 7.8.1:

> Virtual processors are identified by using an index (VP index). The
> maximum number of virtual processors per partition supported by the
> current implementation of the hypervisor can be obtained through CPUID
> leaf 0x40000005. A virtual processor index must be less than the
> maximum number of virtual processors per partition.

so this is a dense index, and VP_INDEX >= KVM_MAX_VCPUS is invalid.  I
think we're better off enforcing this in kvm_hv_set_msr and keep the
translation simple.  If the algorithm in get_vcpu_by_vpidx is not good
enough (and yes it can be made to return NULL early on vpidx >=
KVM_MAX_VCPUS instead of taking the slow path) then a simple index array
of KVM_MAX_VCPUS entries should certainly do.

Roman.


Re: [PATCH v2 2/5] KVM: x86: hyperv: introduce vp_index_to_vcpu_idx mapping

2018-06-29 Thread Roman Kagan
On Fri, Jun 29, 2018 at 12:26:23PM +0200, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> 
> > On Thu, Jun 28, 2018 at 03:53:10PM +0200, Vitaly Kuznetsov wrote:
> >> While it is easy to get VP index from vCPU index the reverse task is hard.
> >> Basically, to solve it we have to walk all vCPUs checking if their VP index
> >> matches. For hypercalls like HvFlushVirtualAddress{List,Space}* and the
> >> upcoming HvSendSyntheticClusterIpi* where a single CPU may be specified in
> >> the whole set this is obviously sub-optimal.
> >> 
> >> As VP index can be set to anything <= U32_MAX by userspace using plain
> >> [0..MAX_VP_INDEX] array is not a viable option. Use condensed sorted
> >> array with logarithmic search complexity instead. Use RCU to make read
> >> access as fast as possible and maintain atomicity of updates.
> >
> > Quoting TLFS 5.0C section 7.8.1:
> >
> >> Virtual processors are identified by using an index (VP index). The
> >> maximum number of virtual processors per partition supported by the
> >> current implementation of the hypervisor can be obtained through CPUID
> >> leaf 0x40000005. A virtual processor index must be less than the
> >> maximum number of virtual processors per partition.
> >
> > so this is a dense index, and VP_INDEX >= KVM_MAX_VCPUS is invalid.  I
> > think we're better off enforcing this in kvm_hv_set_msr and keep the
> > translation simple.  If the algorithm in get_vcpu_by_vpidx is not good
> > enough (and yes it can be made to return NULL early on vpidx >=
> > KVM_MAX_VCPUS instead of taking the slow path) then a simple index array
> > of KVM_MAX_VCPUS entries should certainly do.
> 
> Sure, we can use pre-allocated [0..KVM_MAX_VCPUS] array instead and put
> limits on what userspace can assign VP_INDEX to. Howver, while thinking
> about it I decided to go with the more complex condensed array approach
> because the tendency is for KVM_MAX_VCPUS to grow and we will be
> pre-allocating more and more memory for no particular reason (so I think
> even 'struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]' in 'struct kvm' will need
> to be converted to something else eventually). 

We're talking of kilobytes here.  I guess this is going to be the least
of the scalability problems.

> Anyway, I'm flexible and if you think we should go this way now I'll do
> this in v3. We can re-think this when we later decide to raise
> KVM_MAX_VCPUS significantly.

Although there's no strict requirement for that I think every sensible
userspace will allocate VP_INDEX linearly resulting in it being equal to
KVM's vcpu index.  So we've yet to see a case where get_vcpu_by_vpidx
doesn't take the fast path.  If it ever starts appearing in the profiles
we may consider optimizing it but ATM I don't even think introducing
the translation array is justified.
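
For reference, the fast path in question looks roughly like this (a
sketch, not the verbatim kernel code; vcpu_to_hv_vcpu() is the era's
existing helper):

#include <linux/kvm_host.h>

static struct kvm_vcpu *example_get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
{
	struct kvm_vcpu *vcpu = NULL;
	int i;

	/* fast path: VP_INDEX was assigned linearly, so it equals the
	 * vcpu index and a direct array lookup succeeds */
	if (vpidx < KVM_MAX_VCPUS)
		vcpu = kvm_get_vcpu(kvm, vpidx);
	if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
		return vcpu;
	/* slow path: walk every vcpu */
	kvm_for_each_vcpu(i, vcpu, kvm)
		if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
			return vcpu;
	return NULL;
}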

Roman.


Re: [PATCH] idr: fix invalid ptr dereference on item delete

2018-05-18 Thread Roman Kagan
On Fri, May 18, 2018 at 10:50:25AM -0700, Matthew Wilcox wrote:
> It'd be nice if you cc'd the person who wrote the code you're patching.
> You'd get a response a lot quicker than waiting until I happened to
> notice the email in a different forum.

I sent it to someone called "Matthew Wilcox ".
Also I cc'd that guy when I only started to point the finger at IDR as
the suspected culprit in that syzcaller report.  I thought it was him
who wrote the code...

> Thanks for finding the situation that leads to the bug.  Your fix is
> incorrect; it's legitimate to store a NULL value at offset 0, and
> your patch makes it impossible to delete.  Fortunately, the test-suite
> covers that case ;-)

How do you build it?  I wish I had it when debugging but I got linking
errors due to missing spin_lock_init, so I decided it wasn't up-to-date.

Thanks,
Roman.

P.S. I'll forward your message to all the recipients of my patch, to let
them know it's wrong and you have a better one.


Re: [PATCH] idr: fix invalid ptr dereference on item delete

2018-05-18 Thread Roman Kagan
On Thu, May 10, 2018 at 10:16:34PM +0300, Roman Kagan wrote:
> If an IDR contains a single entry at index==0, the underlying radix tree
> has a single item in its root node, in which case
> __radix_tree_lookup(index!=0) doesn't set its *@nodep argument (in
> addition to returning NULL).
> 
> However, the tree itself is not empty, i.e. the tree root doesn't have
> IDR_FREE tag.
> 
> As a result, on an attempt to remove an index!=0 entry from such an IDR,
> radix_tree_delete_item doesn't return early and calls
> __radix_tree_delete with invalid parameters which are then dereferenced.
> 
> Reported-by: syzbot+35666cba7f0a337e2...@syzkaller.appspotmail.com
> Signed-off-by: Roman Kagan 
> ---
>  lib/radix-tree.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index da9e10c827df..10ff1bfae952 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -2040,8 +2040,9 @@ void *radix_tree_delete_item(struct radix_tree_root 
> *root,
>   void *entry;
>  
>   entry = __radix_tree_lookup(root, index, &node, &slot);
> - if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE,
> - get_slot_offset(node, slot))))
> + if (!entry && (!is_idr(root) || !node ||
> +node_tag_get(root, node, IDR_FREE,
> + get_slot_offset(node, slot))))
>   return NULL;
>  
>   if (item && entry != item)

Turned out Matthew didn't receive my messages; now that he's found this
patch elsewhere he's responded with a correct fix:

- Forwarded message from Matthew Wilcox  -

Date: Fri, 18 May 2018 10:50:25 -0700
From: Matthew Wilcox 
To: Roman Kagan 
Cc: Andrew Morton , linux-kernel@vger.kernel.org
Subject: Re: [PATCH] idr: fix invalid ptr dereference on item delete

It'd be nice if you cc'd the person who wrote the code you're patching.
You'd get a response a lot quicker than waiting until I happened to
notice the email in a different forum.

Thanks for finding the situation that leads to the bug.  Your fix is
incorrect; it's legitimate to store a NULL value at offset 0, and
your patch makes it impossible to delete.  Fortunately, the test-suite
covers that case ;-)

Andrew, can you take this through your tree for extra testing?

--- >8 ---

From: Matthew Wilcox 

If the radix tree underlying the IDR happens to be full and we attempt
to remove an id which is larger than any id in the IDR, we will call
__radix_tree_delete() with an uninitialised 'slot' pointer, at which
point anything could happen.  This was easiest to hit with a single entry
at id 0 and attempting to remove a non-0 id, but it could have happened
with 64 entries and attempting to remove an id >= 64.

Fixes: 0a835c4f090a ("Reimplement IDR and IDA using the radix tree")
Reported-by: syzbot+35666cba7f0a337e2...@syzkaller.appspotmail.com
Debugged-by: Roman Kagan 
Signed-off-by: Matthew Wilcox 

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index da9e10c827df..4dd4fbc7279c 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -2036,10 +2036,12 @@ void *radix_tree_delete_item(struct radix_tree_root 
*root,
 unsigned long index, void *item)
 {
struct radix_tree_node *node = NULL;
-   void __rcu **slot;
+   void __rcu **slot = NULL;
void *entry;
 
entry = __radix_tree_lookup(root, index, &node, &slot);
+   if (!slot)
+   return NULL;
if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE,
get_slot_offset(node, slot))))
return NULL;
diff --git a/tools/testing/radix-tree/idr-test.c 
b/tools/testing/radix-tree/idr-test.c
index 1c18617951dd..410ca58bbe9c 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -254,6 +254,13 @@ void idr_checks(void)
idr_remove(&idr, 0xfedcba98U);
idr_remove(&idr, 0);
 
+   assert(idr_alloc(&idr, DUMMY_PTR, 0, 0, GFP_KERNEL) == 0);
+   idr_remove(&idr, 1);
+   for (i = 1; i < RADIX_TREE_MAP_SIZE; i++)
+   assert(idr_alloc(&idr, DUMMY_PTR, 0, 0, GFP_KERNEL) == i);
+   idr_remove(&idr, 1 << 30);
+   idr_destroy(&idr);
+
for (i = INT_MAX - 3UL; i < INT_MAX + 1UL; i++) {
struct item *item = item_create(i, 0);
assert(idr_alloc(&idr, item, i, i + 10, GFP_KERNEL) == i);


Re: [PATCH v2 1/2] x86/hyper-v: Stop caring about EOI for direct stimers

2018-12-13 Thread Roman Kagan
On Wed, Dec 12, 2018 at 05:50:16PM +0100, Vitaly Kuznetsov wrote:
> Turns out we over-engineered Direct Mode for stimers a bit: unlike
> traditional stimers where we may want to try to re-inject the message upon
> EOI, Direct Mode stimers just set the irq in APIC and kvm_apic_set_irq()
> fails only when APIC is disabled (see APIC_DM_FIXED case in
> __apic_accept_irq()). Remove the redundant part.
> 
> Suggested-by: Roman Kagan 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 36 +++-
>  1 file changed, 3 insertions(+), 33 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v2 2/2] x86/kvm/hyper-v: disallow setting illegal vectors for direct mode stimers

2018-12-13 Thread Roman Kagan
On Wed, Dec 12, 2018 at 05:50:17PM +0100, Vitaly Kuznetsov wrote:
> APIC vectors used for direct mode stimers should be valid for lAPIC and
> just like genuine Hyper-V we should #GP when an illegal one is specified.
> 
> Add the appropriate check to stimer_set_config()
> 
> Suggested-by: Roman Kagan 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 5 +
>  1 file changed, 5 insertions(+)

Reviewed-by: Roman Kagan 


Re: [PATCH v2 3/4] x86/kvm/hyper-v: direct mode for synthetic timers

2018-12-10 Thread Roman Kagan
On Mon, Nov 26, 2018 at 04:47:31PM +0100, Vitaly Kuznetsov wrote:
> Turns out Hyper-V on KVM (as of 2016) will only use synthetic timers
> if direct mode is available. With direct mode we notify the guest by
> asserting APIC irq instead of sending a SynIC message.
> 
> The implementation uses existing vec_bitmap for letting lapic code
> know that we're interested in the particular IRQ's EOI request. We assume
> that the same APIC irq won't be used by the guest for both direct mode
> stimer and as sint source (especially with AutoEOI semantics). It is
> unclear how things should be handled if that's not true.
> 
> Direct mode is also somewhat less expensive; in my testing
> stimer_send_msg() takes not less than 1500 cpu cycles and
> stimer_notify_direct() can usually be done in 300-400. WS2016 without
> Hyper-V, however, always sticks to non-direct version.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
> - Changes since v1: avoid open-coding stimer_mark_pending() in
>   kvm_hv_synic_send_eoi() [Paolo Bonzini]
> ---
>  arch/x86/kvm/hyperv.c| 67 +++-
>  arch/x86/kvm/trace.h | 10 +++---
>  arch/x86/kvm/x86.c   |  1 +
>  include/uapi/linux/kvm.h |  1 +
>  4 files changed, 67 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index eaec15c738df..9533133be566 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -38,6 +38,9 @@
>  
>  #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
>  
> +static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
> + bool vcpu_kick);
> +
>  static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
>  {
>   return atomic64_read(&synic->sint[sint]);
> @@ -53,8 +56,21 @@ static inline int synic_get_sint_vector(u64 sint_value)
>  static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
> int vector)
>  {
> + struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
> + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
> + struct kvm_vcpu_hv_stimer *stimer;
>   int i;
>  
> + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) {
> + stimer = &hv_vcpu->stimer[i];
> + if (stimer->config.enable && stimer->config.direct_mode &&
> + stimer->config.apic_vector == vector)
> + return true;
> + }
> +
> + if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
> + return false;
> +
>   for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
>   if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
>   return true;
> @@ -80,14 +96,14 @@ static bool synic_has_vector_auto_eoi(struct 
> kvm_vcpu_hv_synic *synic,
>  static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
>   int vector)
>  {
> - if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
> - return;
> -
>   if (synic_has_vector_connected(synic, vector))
>   __set_bit(vector, synic->vec_bitmap);
>   else
>   __clear_bit(vector, synic->vec_bitmap);
>  
> + if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
> + return;
> +

Just noticed that the patch seems to assume that "direct" timers are
allowed to use any vectors including 0-15.  I guess this is incorrect,
and instead stimer_set_config should error out on direct mode with a
vector less than HV_SYNIC_FIRST_VALID_VECTOR.

Roman.


Re: [PATCH v2 3/4] x86/kvm/hyper-v: direct mode for synthetic timers

2018-12-10 Thread Roman Kagan
On Mon, Dec 10, 2018 at 01:54:18PM +0100, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> > Just noticed that the patch seems to assume that "direct" timers are
> > allowed to use any vectors including 0-15.  I guess this is incorrect,
> > and instead stimer_set_config should error out on direct mode with a
> > vector less than HV_SYNIC_FIRST_VALID_VECTOR.
> 
> The spec is really vague about this and I'm not sure that this has
> anything to do with HV_SYNIC_FIRST_VALID_VECTOR (as these are actually
> not "synic" vectors, I *think* that SynIC doesn't even need to be
> enabled to make them work).
> 
> I checked and Hyper-V 2016 uses vector '0xff', not sure if it proves
> your point :-)
> 
> Do you envision any issues in KVM if we keep allowing vectors <
> HV_SYNIC_FIRST_VALID_VECTOR?

It's actually lapic that treats vectors 0..15 as illegal.  Nothing
Hyper-V specific here.
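
A sketch of the check being argued for (compare the follow-up patch
"disallow setting illegal vectors for direct mode stimers" quoted
earlier in this digest; the helper here is illustrative and the exact
condition in the patch may differ):

#include <linux/types.h>

/* vectors 0..15 are reserved by the local APIC, so a direct-mode stimer
 * config naming one should be rejected, surfacing as #GP to the guest: */
static bool example_stimer_vector_ok(bool direct_mode, u8 apic_vector)
{
	return !direct_mode || apic_vector >= 16;
}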

Roman.


Re: [PATCH] x86/hyper-v: Stop caring about EOI for direct stimers

2018-12-10 Thread Roman Kagan
[ Sorry, missed this one ]

On Wed, Dec 05, 2018 at 04:36:21PM +0100, Vitaly Kuznetsov wrote:
> Turns out we over-engineered Direct Mode for stimers a bit: unlike
> traditional stimers where we may want to try to re-inject the message upon
> EOI, Direct Mode stimers just set the irq in APIC and kvm_apic_set_irq()
> fails only when APIC is disabled (see APIC_DM_FIXED case in
> __apic_accept_irq()). Remove the redundant part.
> 
> Suggested-by: Roman Kagan 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 36 +++-
>  1 file changed, 3 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index e6a2a085644a..0a16a77e6ac3 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -56,21 +56,8 @@ static inline int synic_get_sint_vector(u64 sint_value)
>  static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
> int vector)
>  {
> - struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
> - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
> - struct kvm_vcpu_hv_stimer *stimer;
>   int i;
>  
> - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) {
> - stimer = &hv_vcpu->stimer[i];
> - if (stimer->config.enable && stimer->config.direct_mode &&
> - stimer->config.apic_vector == vector)
> - return true;
> - }
> -
> - if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
> - return false;
> -
>   for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
>   if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
>   return true;
> @@ -96,14 +83,14 @@ static bool synic_has_vector_auto_eoi(struct 
> kvm_vcpu_hv_synic *synic,
>  static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
>   int vector)
>  {
> + if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
> + return;
> +
>   if (synic_has_vector_connected(synic, vector))
>   __set_bit(vector, synic->vec_bitmap);
>   else
>   __clear_bit(vector, synic->vec_bitmap);
>  
> - if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
> - return;
> -
>   if (synic_has_vector_auto_eoi(synic, vector))
>   __set_bit(vector, synic->auto_eoi_bitmap);
>   else
> @@ -382,9 +369,7 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 
> sint)
>  
>  void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
>  {
> - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
>   struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
> - struct kvm_vcpu_hv_stimer *stimer;
>   int i;
>  
>   trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
> @@ -392,14 +377,6 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int 
> vector)
>   for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
>   if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
>   kvm_hv_notify_acked_sint(vcpu, i);
> -
> - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) {
> - stimer = &hv_vcpu->stimer[i];
> - if (stimer->msg_pending && stimer->config.enable &&
> - stimer->config.direct_mode &&
> - stimer->config.apic_vector == vector)
> - stimer_mark_pending(stimer, false);
> - }
>  }
>  
>  static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi)
> @@ -566,8 +543,6 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
>  static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
>bool host)
>  {
> - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
> - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
>   union hv_stimer_config new_config = {.as_uint64 = config},
>   old_config = {.as_uint64 = stimer->config.as_uint64};
>  
> @@ -580,11 +555,6 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer 
> *stimer, u64 config,
>   new_config.enable = 0;
>   stimer->config.as_uint64 = new_config.as_uint64;
>  
> - if (old_config.direct_mode)
> - synic_update_vector(&hv_vcpu->synic, old_config.apic_vector);
> - if (new_config.direct_mode)
> - synic_update_vector(&hv_vcpu->synic, new_config.apic_vector);
> -
>   stimer_mark_pending(stimer, false);
>   return 0;
>  }

As discussed in another thread, it seems worthwhile to make
stimer_set_config reject vectors 0..15.

Besides, I'd rather squash this patch into the one that introduced direct
timers, before it reaches Linus' tree.

Roman.


Re: [PATCH v2 4/7] x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID

2018-12-11 Thread Roman Kagan
On Mon, Dec 10, 2018 at 06:21:56PM +0100, Vitaly Kuznetsov wrote:
> With every new Hyper-V Enlightenment we implement we're forced to add a
> KVM_CAP_HYPERV_* capability. While this approach works it is fairly
> inconvenient: the majority of the enlightenments we do have corresponding
> CPUID feature bit(s) and userspace has to know this anyways to be able to
> expose the feature to the guest.
> 
> Add KVM_GET_SUPPORTED_HV_CPUID ioctl (backed by KVM_CAP_HYPERV_CPUID, "one
> cap to rule them all!") returning all Hyper-V CPUID feature leaves.
> 
> Using the existing KVM_GET_SUPPORTED_CPUID doesn't seem to be possible:
> Hyper-V CPUID feature leaves intersect with KVM's (e.g. 0x40000000,
> 0x40000001) and we would probably confuse userspace in case we decide to
> return these twice.
> 
> KVM_CAP_HYPERV_CPUID's number is interim: we're intended to drop
> KVM_CAP_HYPERV_STIMER_DIRECT and use its number instead.
> 
> Suggested-by: Paolo Bonzini 
> Signed-off-by: Vitaly Kuznetsov 
> ---
> Changes since v1:
> - Document KVM_GET_SUPPORTED_HV_CPUID and KVM_CAP_HYPERV_CPUID.
> - Fix error handling in kvm_vcpu_ioctl_get_hv_cpuid()
> ---
>  Documentation/virtual/kvm/api.txt |  57 ++
>  arch/x86/kvm/hyperv.c | 121 ++
>  arch/x86/kvm/hyperv.h |   2 +
>  arch/x86/kvm/x86.c|  20 +
>  include/uapi/linux/kvm.h  |   4 +
>  5 files changed, 204 insertions(+)
> 
> diff --git a/Documentation/virtual/kvm/api.txt 
> b/Documentation/virtual/kvm/api.txt
> index cd209f7730af..5b72de0af24d 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3753,6 +3753,63 @@ Coalesced pio is based on coalesced mmio. There is 
> little difference
>  between coalesced mmio and pio except that coalesced pio records accesses
>  to I/O ports.
>  
> +4.117 KVM_GET_SUPPORTED_HV_CPUID
> +
> +Capability: KVM_CAP_HYPERV_CPUID
> +Architectures: x86
> +Type: vcpu ioctl
> +Parameters: struct kvm_cpuid2 (in/out)
> +Returns: 0 on success, -1 on error
> +
> +struct kvm_cpuid2 {
> + __u32 nent;
> + __u32 padding;
> + struct kvm_cpuid_entry2 entries[0];
> +};
> +
> +struct kvm_cpuid_entry2 {
> + __u32 function;
> + __u32 index;
> + __u32 flags;
> + __u32 eax;
> + __u32 ebx;
> + __u32 ecx;
> + __u32 edx;
> + __u32 padding[3];
> +};
> +
> +This ioctl returns x86 cpuid features leaves related to Hyper-V emulation in
> +KVM.  Userspace can use the information returned by this ioctl to construct
> +cpuid information presented to guests consuming Hyper-V enlightenments (e.g.
> +Windows or Hyper-V guests).
> +
> +CPUID feature leaves returned by this ioctl are defined by Hyper-V Top Level
> +Functional Specification (TLFS). These leaves can't be obtained with
> +KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature
> +leaves (0x40000000, 0x40000001).
> +
> +Currently, the following list of CPUID leaves are returned:
> + HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS
> + HYPERV_CPUID_INTERFACE
> + HYPERV_CPUID_VERSION
> + HYPERV_CPUID_FEATURES
> + HYPERV_CPUID_ENLIGHTMENT_INFO
> + HYPERV_CPUID_IMPLEMENT_LIMITS
> + HYPERV_CPUID_NESTED_FEATURES
> +
> +HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was
> +enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).

IOW the output of ioctl(KVM_GET_SUPPORTED_HV_CPUID) depends on
whether ioctl(KVM_ENABLE_CAP, KVM_CAP_HYPERV_ENLIGHTENED_VMCS) has
already been called on that vcpu?  I wonder if this fits the intended
usage?
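
(For reference, the calling convention documented above as a userspace
sketch; error handling is trimmed and it assumes the ioctl lands as
described:)

#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* grow the buffer until the kernel stops returning E2BIG */
static struct kvm_cpuid2 *example_get_hv_cpuid(int vcpu_fd)
{
	int nent = 8;	/* starting guess; illustrative */

	for (;;) {
		struct kvm_cpuid2 *cpuid;

		cpuid = calloc(1, sizeof(*cpuid) +
				  nent * sizeof(struct kvm_cpuid_entry2));
		if (!cpuid)
			return NULL;
		cpuid->nent = nent;
		if (ioctl(vcpu_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid) == 0)
			return cpuid;	/* ->nent now holds the real count */
		free(cpuid);
		if (errno != E2BIG)
			return NULL;
		nent *= 2;	/* too small: retry with a bigger array */
	}
}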

Roman.

> +
> +Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure
> +with the 'nent' field indicating the number of entries in the variable-size
> +array 'entries'.  If the number of entries is too low to describe all Hyper-V
> +feature leaves, an error (E2BIG) is returned. If the number is more or equal
> +to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the
> +number of valid entries in the 'entries' array, which is then filled.
> +
> +'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently 
> reserved,
> +userspace should not expect to get any particular value there.
> +
>  5. The kvm_run structure
>  
>  
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index cecf907e4c49..04c3cdf3389e 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1804,3 +1804,124 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct 
> kvm_hyperv_eventfd *args)
>   return kvm_hv_eventfd_deassign(kvm, args->conn_id);
>   return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
>  }
> +
> +int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 
> *cpuid,
> + struct kvm_cpuid_entry2 __user *entries)
> +{
> + uint16_t evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu);
> + struct kvm_cpuid_entry2 cpuid_entries[] = {
>

Re: [PATCH v2 4/7] x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID

2018-12-11 Thread Roman Kagan
On Tue, Dec 11, 2018 at 02:28:14PM +0100, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> 
> > On Mon, Dec 10, 2018 at 06:21:56PM +0100, Vitaly Kuznetsov wrote:
> 
> >> +
> >> +Currently, the following list of CPUID leaves are returned:
> >> + HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS
> >> + HYPERV_CPUID_INTERFACE
> >> + HYPERV_CPUID_VERSION
> >> + HYPERV_CPUID_FEATURES
> >> + HYPERV_CPUID_ENLIGHTMENT_INFO
> >> + HYPERV_CPUID_IMPLEMENT_LIMITS
> >> + HYPERV_CPUID_NESTED_FEATURES
> >> +
> >> +HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS 
> >> was
> >> +enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).
> >
> > IOW the output of ioctl(KVM_GET_SUPPORTED_HV_CPUID) depends on
> > whether ioctl(KVM_ENABLE_CAP, KVM_CAP_HYPERV_ENLIGHTENED_VMCS) has
> > already been called on that vcpu?  I wonder if this fits the intended
> > usage?
> 
> I added HYPERV_CPUID_NESTED_FEATURES in the list (and made the new ioctl
> per-cpu and not per-vm) for consistency. *In theory*
> KVM_CAP_HYPERV_ENLIGHTENED_VMCS is also enabled per-vcpu so some
> hypothetical userspace can later check enabled eVMCS versions (which can
> differ across vCPUs!) with KVM_GET_SUPPORTED_HV_CPUID. We will also have
> direct TLB flush and other nested features there so to avoid adding new
> KVM_CAP_* for them we need the CPUID.

This is different from how KVM_GET_SUPPORTED_CPUID is used: QEMU assumes
that its output doesn't change between calls, and even caches the result
calling the ioctl only once.

> Another thing I'm thinking about is something like 'hv_all' cpu flag for
> Qemu which would enable everything by setting guest CPUIDs to what
> KVM_GET_SUPPORTED_HV_CPUID returns. In that case it would also be
> convenient to have HYPERV_CPUID_NESTED_FEATURES properly filled (or not
> filled when eVMCS was not enabled).

I think this is orthogonal to the way you obtain capability info from
the kernel.

Roman.


Re: [PATCH v2 4/7] x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID

2018-12-11 Thread Roman Kagan
On Tue, Dec 11, 2018 at 04:04:01PM +0100, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> 
> > On Tue, Dec 11, 2018 at 02:28:14PM +0100, Vitaly Kuznetsov wrote:
> >> Roman Kagan  writes:
> >> 
> >> > On Mon, Dec 10, 2018 at 06:21:56PM +0100, Vitaly Kuznetsov wrote:
> >> 
> >> >> +
> >> >> +Currently, the following list of CPUID leaves are returned:
> >> >> + HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS
> >> >> + HYPERV_CPUID_INTERFACE
> >> >> + HYPERV_CPUID_VERSION
> >> >> + HYPERV_CPUID_FEATURES
> >> >> + HYPERV_CPUID_ENLIGHTMENT_INFO
> >> >> + HYPERV_CPUID_IMPLEMENT_LIMITS
> >> >> + HYPERV_CPUID_NESTED_FEATURES
> >> >> +
> >> >> +HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened 
> >> >> VMCS was
> >> >> +enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).
> >> >
> >> > IOW the output of ioctl(KVM_GET_SUPPORTED_HV_CPUID) depends on
> >> > whether ioctl(KVM_ENABLE_CAP, KVM_CAP_HYPERV_ENLIGHTENED_VMCS) has
> >> > already been called on that vcpu?  I wonder if this fits the intended
> >> > usage?
> >> 
> >> I added HYPERV_CPUID_NESTED_FEATURES in the list (and made the new ioctl
> >> per-cpu and not per-vm) for consistency. *In theory*
> >> KVM_CAP_HYPERV_ENLIGHTENED_VMCS is also enabled per-vcpu so some
> >> hypothetical userspace can later check enabled eVMCS versions (which can
> >> differ across vCPUs!) with KVM_GET_SUPPORTED_HV_CPUID. We will also have
> >> direct tlb flush and other nested features there so to avoid addning new
> >> KVM_CAP_* for them we need the CPUID.
> >
> > This is different from how KVM_GET_SUPPORTED_CPUID is used: QEMU assumes
> > that its output doesn't change between calls, and even caches the result
> > calling the ioctl only once.
> >
> 
> Yes, I'm not sure if we have to have full consistency between
> KVM_GET_SUPPORTED_CPUID and KVM_GET_SUPPORTED_HV_CPUID.

Neither do I.  I just noticed the difference and thought it might
matter.

> >> Another thing I'm thinking about is something like 'hv_all' cpu flag for
> >> Qemu which would enable everything by setting guest CPUIDs to what
> >> KVM_GET_SUPPORTED_HV_CPUID returns. In that case it would also be
> >> convenient to have HYPERV_CPUID_NESTED_FEATURES properly filled (or not
> >> filled when eVMCS was not enabled).
> >
> > I think this is orthogonal to the way you obtain capability info from
> > the kernel.
> 
> Not necessarily. If very dumb userspace does 'host passthrough' for
> Hyper-V features without doing anything (e.g. not enabling Enlightened
> VMCS) it will just put the result of KVM_GET_SUPPORTED_HV_CPUID in guest
> facing CPUIDs and it will all work. In case eVMCS was previously enabled
> it again just copies everything and this still works.
> 
> We don't probably need this for Qemu though. If you think it would be
> better to have HYPERV_CPUID_NESTED_FEATURES returned regardless of eVMCS
> enablement I'm ready to budge)

I have no opinion on this.  I hope Paolo, who requested the feature, can
explain the desired semantics :)

Roman.


[PATCH] drivers/nvdimm/e820: turn off write cache by default

2022-09-29 Thread Roman Kagan
When regular DRAM is registered for use as PMEM via the "memmap" command
line parameter, there's no write cache in front of the backing store of
this PMEM (as there's no backing store itself), so there's no point
doing expensive cache flushes on sync etc.

Mark the regions being registered with e820 as ND_REGION_PERSIST_CACHE
so that write cache is off by default for the respective DAX devices.
This also matches the assumed behavior of the flag
ND_REGION_PERSIST_CACHE:

  Platform ensures entire CPU store data path is flushed to pmem on
  system power loss.

for the only use case where such regions actually kind of persist the
data -- across kexec.

Signed-off-by: Roman Kagan 
---
 drivers/nvdimm/e820.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index 4cd18be9d0e9..3af63b7b6d23 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -28,6 +28,7 @@ static int e820_register_one(struct resource *res, void *data)
ndr_desc.numa_node = numa_map_to_online_node(nid);
ndr_desc.target_node = nid;
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
+   set_bit(ND_REGION_PERSIST_CACHE, &ndr_desc.flags);
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
return -ENXIO;
return 0;
-- 
2.37.3




Re: [PATCH 2/3] x86/kvm/hyper-v: remove stale entries from vec_bitmap/auto_eoi_bitmap on vector change

2018-02-28 Thread Roman Kagan
On Wed, Feb 28, 2018 at 02:44:00PM +0100, Vitaly Kuznetsov wrote:
> When a new vector is written to SINx we update vec_bitmap/auto_eoi_bitmap
> but we forget to remove old vector from these masks (in case it is not
> present in some other SINTx).
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/include/uapi/asm/hyperv.h |  2 ++
>  arch/x86/kvm/hyperv.c  | 32 ++--
>  2 files changed, 24 insertions(+), 10 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH 3/3] x86/kvm/hyper-v: inject #GP only when invalid SINTx vector is unmasked

2018-02-28 Thread Roman Kagan
On Wed, Feb 28, 2018 at 02:44:01PM +0100, Vitaly Kuznetsov wrote:
> Hyper-V 2016 on KVM with SynIC enabled doesn't boot with the following
> trace:
> 
> kvm_entry:vcpu 0
> kvm_exit: reason MSR_WRITE rip 0xfffff8000131c1e5 info 0 0
> kvm_hv_synic_set_msr: vcpu_id 0 msr 0x40000090 data 0x10000 host 0
> kvm_msr:  msr_write 40000090 = 0x10000 (#GP)
> kvm_inj_exception:#GP (0x0)

I don't remember having seen this...  Does this happen with the mainline
QEMU, which doesn't set the SintPollingModeAvailable (17) bit in cpuid
0x40000003:edx?

> 
> KVM acts according to the following statement from TLFS:
> 
> "
> 11.8.4 SINTx Registers
> ...
> Valid values for vector are 16-255 inclusive. Specifying an invalid
> vector number results in #GP.
> "
> 
> However, I checked and genuine Hyper-V doesn't #GP when we write 0x10000
> to SINTx. I checked with Microsoft and they confirmed that if either the
> Masked bit (bit 16) or the Polling bit (bit 18) is set to 1, then they
> ignore the value of Vector. Make KVM act accordingly.

I wonder if that cpuid setting affects this behavior?  Also curious what
exactly the guest is trying to achieve writing this bogus value?

> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/include/uapi/asm/hyperv.h | 1 +
>  arch/x86/kvm/hyperv.c  | 7 ++-
>  2 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/uapi/asm/hyperv.h 
> b/arch/x86/include/uapi/asm/hyperv.h
> index 62c778a303a1..a492dc357bd7 100644
> --- a/arch/x86/include/uapi/asm/hyperv.h
> +++ b/arch/x86/include/uapi/asm/hyperv.h
> @@ -326,6 +326,7 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
>  #define HV_SYNIC_SIEFP_ENABLE(1ULL << 0)
>  #define HV_SYNIC_SINT_MASKED (1ULL << 16)
>  #define HV_SYNIC_SINT_AUTO_EOI   (1ULL << 17)
> +#define HV_SYNIC_SINT_POLLING(1ULL << 18)
>  #define HV_SYNIC_SINT_VECTOR_MASK(0xFF)
>  
>  #define HV_SYNIC_STIMER_COUNT(4)
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index 6d14f808145d..d3d866c32976 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -95,9 +95,14 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, 
> int sint,
> u64 data, bool host)
>  {
>   int vector, old_vector;
> + bool masked, polling;
>  
>   vector = data & HV_SYNIC_SINT_VECTOR_MASK;
> - if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host)
> + masked = data & HV_SYNIC_SINT_MASKED;
> + polling = data & HV_SYNIC_SINT_POLLING;
> +
> + if (vector < HV_SYNIC_FIRST_VALID_VECTOR &&
> + !host && !masked && !polling)
>   return 1;
>   /*
>* Guest may configure multiple SINTs to use the same vector, so

I'm not sure this is enough to implement the polling mode: per spec,

> Setting the polling bit will have the effect of unmasking an interrupt
> source, except that an actual interrupt is not generated.

However, if the guest sets a valid vector with the masked bit cleared,
we'll consider it a usual SINT and add it to the masks, inject
interrupts, etc., regardless of the polling bit.

I must admit I'm confused by the above quote from the spec: is the
polling bit supposed to come together with the masked bit?  If so, then
we probably should validate it here (but your logs indicate otherwise).
In general I'm missing the utility of this mode: why should an interrupt
controller be involved in polling at all?

Roman.


Re: [PATCH 3/3] x86/kvm/hyper-v: inject #GP only when invalid SINTx vector is unmasked

2018-02-28 Thread Roman Kagan
On Wed, Feb 28, 2018 at 04:35:59PM +0100, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> 
> > On Wed, Feb 28, 2018 at 02:44:01PM +0100, Vitaly Kuznetsov wrote:
> >> Hyper-V 2016 on KVM with SynIC enabled doesn't boot with the following
> >> trace:
> >> 
> >> kvm_entry:vcpu 0
> >> kvm_exit: reason MSR_WRITE rip 0xfffff8000131c1e5 info 0 0
> >> kvm_hv_synic_set_msr: vcpu_id 0 msr 0x40000090 data 0x10000 host 0
> >> kvm_msr:  msr_write 40000090 = 0x10000 (#GP)
> >> kvm_inj_exception:#GP (0x0)
> >
> > I don't remember having seen this...  Does this happen with the mainline
> > QEMU, which doesn't set the SintPollingModeAvailable (17) bit in cpuid
> > 0x40000003:edx?
> 
> Yes, you need to have the Hyper-V role enabled, and the kvm-intel module
> needs to be loaded with 'nesting' support enabled.

Thanks, reproduced now.

> >> 
> >> KVM acts according to the following statement from TLFS:
> >> 
> >> "
> >> 11.8.4 SINTx Registers
> >> ...
> >> Valid values for vector are 16-255 inclusive. Specifying an invalid
> >> vector number results in #GP.
> >> "
> >> 
> >> However, I checked and genuine Hyper-V doesn't #GP when we write 0x10000
> >> to SINTx. I checked with Microsoft and they confirmed that if either the
> >> Masked bit (bit 16) or the Polling bit (bit 18) is set to 1, then they
> >> ignore the value of Vector. Make KVM act accordingly.
> >
> > I wonder if that cpuid setting affects this behavior?  Also curious what
> > exactly the guest is trying to achieve writing this bogus value?
> 
> The value is actually the default value which is supposed to be there:
> 
> "At virtual processor creation time, the default value of all SINTx
> (synthetic interrupt source) registers is 0x0000000000010000." so I
> guess this is just an initialization procedure.

Oh, sorry, I tripped over that polling thing throughout the patch, so I
missed that the value you observe actually has only the masked bit set,
which is indeed the initial value (no idea why the guest *writes* it
though, it's the hypervisor's responsibility to put it there on vCPU
reset).

> >> 
> >> Signed-off-by: Vitaly Kuznetsov 
> >> ---
> >>  arch/x86/include/uapi/asm/hyperv.h | 1 +
> >>  arch/x86/kvm/hyperv.c  | 7 ++-
> >>  2 files changed, 7 insertions(+), 1 deletion(-)
> >> 
> >> diff --git a/arch/x86/include/uapi/asm/hyperv.h 
> >> b/arch/x86/include/uapi/asm/hyperv.h
> >> index 62c778a303a1..a492dc357bd7 100644
> >> --- a/arch/x86/include/uapi/asm/hyperv.h
> >> +++ b/arch/x86/include/uapi/asm/hyperv.h
> >> @@ -326,6 +326,7 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
> >>  #define HV_SYNIC_SIEFP_ENABLE (1ULL << 0)
> >>  #define HV_SYNIC_SINT_MASKED  (1ULL << 16)
> >>  #define HV_SYNIC_SINT_AUTO_EOI(1ULL << 17)
> >> +#define HV_SYNIC_SINT_POLLING (1ULL << 18)
> >>  #define HV_SYNIC_SINT_VECTOR_MASK (0xFF)
> >>  
> >>  #define HV_SYNIC_STIMER_COUNT (4)
> >> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> >> index 6d14f808145d..d3d866c32976 100644
> >> --- a/arch/x86/kvm/hyperv.c
> >> +++ b/arch/x86/kvm/hyperv.c
> >> @@ -95,9 +95,14 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic 
> >> *synic, int sint,
> >>  u64 data, bool host)
> >>  {
> >>int vector, old_vector;
> >> +  bool masked, polling;
> >>  
> >>vector = data & HV_SYNIC_SINT_VECTOR_MASK;
> >> -  if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host)
> >> +  masked = data & HV_SYNIC_SINT_MASKED;
> >> +  polling = data & HV_SYNIC_SINT_POLLING;
> >> +
> >> +  if (vector < HV_SYNIC_FIRST_VALID_VECTOR &&
> >> +  !host && !masked && !polling)
> >>return 1;
> >>/*
> >> * Guest may configure multiple SINTs to use the same vector, so
> >
> > I'm not sure this is enough to implement the polling mode: per spec,
> >
> 
> Oh, no, I wasn't trying to -- and by the way we don't currently announce
> SintPollingModeAvailable so guests are not supposed to do that. This is
> rather a future proof to 'not forget'.
> 
> >> Setting the polling bit will have the effect of unmasking an interrupt
> >> source, except that an actual interrupt is not generated.

Re: [PATCH 1/3] x86/kvm/hyper-v: add reenlightenment MSRs support

2018-02-28 Thread Roman Kagan
On Wed, Feb 28, 2018 at 02:43:59PM +0100, Vitaly Kuznetsov wrote:
> Nested Hyper-V/Windows guest running on top of KVM will use TSC page
> clocksource in two cases:
> - L0 exposes invariant TSC (CPUID.80000007H:EDX[8]).
> - L0 provides Hyper-V Reenlightenment support (CPUID.40000003H:EAX[13]).
> 
> Exposing invariant TSC effectively blocks migration to hosts with different
> TSC frequencies,

I wonder if TSC scaling on the destination host wouldn't allow relaxing
this requirement?

> providing reenlightenment support will be needed when we
> start migrating nested workloads.
> 
> Implement rudimentary support for reenlightenment MSRs. For now, these are
> just read/write MSRs with no effect.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/include/asm/kvm_host.h |  4 
>  arch/x86/kvm/hyperv.c   | 21 +
>  arch/x86/kvm/x86.c  | 12 +++-
>  3 files changed, 36 insertions(+), 1 deletion(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v2 3/3] x86/kvm/hyper-v: inject #GP only when invalid SINTx vector is unmasked

2018-03-01 Thread Roman Kagan
On Thu, Mar 01, 2018 at 03:15:14PM +0100, Vitaly Kuznetsov wrote:
> Hyper-V 2016 on KVM with SynIC enabled doesn't boot with the following
> trace:
> 
> kvm_entry:vcpu 0
> kvm_exit: reason MSR_WRITE rip 0xfffff8000131c1e5 info 0 0
> kvm_hv_synic_set_msr: vcpu_id 0 msr 0x40000090 data 0x10000 host 0
> kvm_msr:  msr_write 40000090 = 0x10000 (#GP)
> kvm_inj_exception:#GP (0x0)
> 
> KVM acts according to the following statement from TLFS:
> 
> "
> 11.8.4 SINTx Registers
> ...
> Valid values for vector are 16-255 inclusive. Specifying an invalid
> vector number results in #GP.
> "
> 
> However, I checked and genuine Hyper-V doesn't #GP when we write 0x10000
> to SINTx. I checked with Microsoft and they confirmed that if either the
> Masked bit (bit 16) or the Polling bit (bit 18) is set to 1, then they
> ignore the value of Vector. Make KVM act accordingly.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
> Changes since v1:
> - Drop 'polling' bit check for now as we don't support this mode. We'll
>   need to bring some form of this check back when polling mode is
>   implemented [Roman Kagan].
> - Add a comment explaining "!host && !masked" in synic_set_sint()
> ---
>  arch/x86/kvm/hyperv.c | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)

Reviewed-by: Roman Kagan 


Re: [PATCH 0/5] KVM: x86: hyperv: PV TLB flush for Windows guests

2018-04-03 Thread Roman Kagan
On Mon, Apr 02, 2018 at 06:10:54PM +0200, Vitaly Kuznetsov wrote:
> This is both a new feature and a bugfix.
> 
> Bugfix description: 
> 
> It was found that Windows 2016 guests on KVM crash when they have > 64
> vCPUs, non-flat topology (>1 core/thread per socket; in case it has >64
> sockets Windows just ignores vCPUs above 64) and Hyper-V enlightenments
> (any) are enabled. The most common error reported is "PAGE FAULT IN
> NONPAGED AREA" but I saw different messages. Apparently, Windows doesn't
> expect to run on a Hyper-V server without PV TLB flush support as there's
> no such Hyper-V servers out there (it's only WS2016 supporting > 64 vCPUs
> AFAIR).
> 
> Adding PV TLB flush support to KVM helps, Windows 2016 guests now boot 
> normally (I tried '-smp 128,sockets=64,cores=1,threads=2' and 
> '-smp 128,sockets=8,cores=16,threads=1' but other topologies should work
> too).
> 
> Feature description:
> 
> PV TLB flush helps a lot when running overcommitted. KVM gained support for
> it recently but it is only available for Linux guests. Windows guests use
> emulated Hyper-V interface and PV TLB flush needs to be added there.
> 
> I tested WS2016 guest with 128 vCPUs running on a 12 pCPU server. The test
> was running 64 threads doing 100 mmap()/munmap() for 16384 pages with a
> tiny random nanosleep in between (I used Cygwin. It would be great if
> someone could point me to a good Windows-native TLB thrashing test).
> 
> The results are:
> Before:
> real0m44.362s
> user0m1.796s
> sys 6m43.218s
> 
> After:
> real0m24.425s
> user0m1.811s
> sys 0m40.625s
> 
> When running without overcommit (single 12 vCPU guest on 12 pCPU server) the
> results of the same test are very close:
> Before:
> real0m21.237s
> user0m1.531s
> sys 0m19.984s
> 
> After:
> real0m21.082s
> user0m1.546s
> sys 0m20.030s

I vaguely remember Denis Plotnikov (cc-d) made a similar attempt a couple
of years ago.  IIRC the outcome was that win2012r2 (back then) guests
started to also use this mechanism for local tlb flushes via self-IPI,
which led to noticeable degradation on certain workloads.

Denis do you have any details to share?

Thanks,
Roman.


Re: [Qemu-devel] [RFC qemu 0/4] A PV solution for live migration optimization

2016-03-09 Thread Roman Kagan
On Fri, Mar 04, 2016 at 06:51:21PM +, Dr. David Alan Gilbert wrote:
> * Paolo Bonzini (pbonz...@redhat.com) wrote:
> > 
> > 
> > On 04/03/2016 15:26, Li, Liang Z wrote:
> > >> > 
> > >> > The memory usage will keep increasing due to ever growing caches, etc, 
> > >> > so
> > >> > you'll be left with very little free memory fairly soon.
> > >> > 
> > > I don't think so.
> > > 
> > 
> > Roman is right.  For example, here I am looking at a 64 GB (physical)
> > machine which was booted about 30 minutes ago, and which is running
> > disk-heavy workloads (installing VMs).
> > 
> > Since I have started writing this email (2 minutes?), the amount of free
> > memory has already gone down from 37 GB to 33 GB.  I expect that by the
> > time I have finished running the workload, in two hours, it will not
> > have any free memory.
> 
> But what about a VM sitting idle, or that just has more RAM assigned to it
> than is currently using.
>  I've got a host here that's been up for 46 days and has been doing some
> heavy VM debugging a few days ago, but today:
> 
> # free -m
>               total        used        free      shared  buff/cache   available
> Mem:          96536        1146       44834         184       50555       94735
> 
> I very rarely use all it's RAM, so it's got a big chunk of free RAM, and yes
> it's got a big chunk of cache as well.

One of the promises of virtualization is better resource utilization.
People tend to avoid purchasing VMs so oversized that they never touch
a significant amount of their RAM.  (Well, at least this is how things
stand in the hosting market; I guess the enterprise market is similar
in this regard.)

That said, I'm not at all opposed to optimizing the migration of free
memory; what I'm trying to say is that creating brand new infrastructure
specifically for that case doesn't look justified when the existing one
can cover it in addition to much more common scenarios.

Roman.


Re: [Qemu-devel] [RFC qemu 0/4] A PV solution for live migration optimization

2016-03-09 Thread Roman Kagan
On Mon, Mar 07, 2016 at 01:40:06PM +0200, Michael S. Tsirkin wrote:
> On Mon, Mar 07, 2016 at 06:49:19AM +, Li, Liang Z wrote:
> > > > No. And it's exactly what I mean. The ballooned memory is still
> > > > processed during live migration without skipping. The live migration 
> > > > code is
> > > in migration/ram.c.
> > > 
> > > So if guest acknowledged VIRTIO_BALLOON_F_MUST_TELL_HOST, we can
> > > teach qemu to skip these pages.
> > > Want to write a patch to do this?
> > > 
> > 
> > Yes, we really can teach qemu to skip these pages and it's not hard.  
> > The problem is the poor performance, this PV solution
> 
> Balloon is always PV. And do not call patches solutions please.
> 
> > is aimed to make it more
> > efficient and reduce the performance impact on guest.
> 
> We need to get a bit beyond this.  You are making multiple
> changes, it seems to make sense to split it all up, and analyse each
> change separately.

Couldn't agree more.

There are three stages in this optimization:

1) choosing which pages to skip

2) communicating them from guest to host

3) skip transferring uninteresting pages to the remote side on migration

For (3) there seems to be a low-hanging fruit: amend
migration/ram.c:is_zero_range() to consult /proc/self/pagemap.  This
would work for guest RAM that hasn't been touched yet or that has been
ballooned out.
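
A minimal userspace sketch of such a pagemap check, assuming 4K pages
(an unbacked page has neither the "present" bit 63 nor the "swapped"
bit 62 set in its pagemap entry):

#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

/* pagemap_fd: open("/proc/self/pagemap", O_RDONLY);
 * returns 1 if the page at vaddr is backed by RAM or swap, 0 if not */
static int page_is_backed(int pagemap_fd, uintptr_t vaddr)
{
        uint64_t ent;
        off_t off = (off_t)(vaddr / 4096) * sizeof(ent);

        if (pread(pagemap_fd, &ent, sizeof(ent), off) != sizeof(ent))
                return -1;
        return (ent >> 62) & 3 ? 1 : 0;
}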

For (1) I've been trying to make a point that skipping clean pages is
much more likely to result in noticeable benefit than free pages only.

As for (2), we do seem to have a problem with the existing balloon:
according to your measurements it's very slow; besides, I guess it plays
badly with transparent huge pages (as both the guest and the host work
with one 4k page at a time).  This is a problem for other use cases of
balloon (e.g. as a facility for resource management); tackling that
appears a more natural application for optimization efforts.

Thanks,
Roman.


Re: [Qemu-devel] [RFC qemu 0/4] A PV solution for live migration optimization

2016-03-09 Thread Roman Kagan
On Wed, Mar 09, 2016 at 05:41:39PM +0200, Michael S. Tsirkin wrote:
> On Wed, Mar 09, 2016 at 05:28:54PM +0300, Roman Kagan wrote:
> > For (1) I've been trying to make a point that skipping clean pages is
> > much more likely to result in noticable benefit than free pages only.
> 
> I guess when you say clean you mean zero?

No I meant clean, i.e. those that could be evicted from RAM without
causing I/O.

> Yea. In fact, one can zero out any number of pages
> quickly by putting them in balloon and immediately
> taking them out.
> 
> Access will fault a zero page in, then COW kicks in.

I must be missing something obvious, but how is that different from
inflating and then immediately deflating the balloon?

> We could have a new zero VQ (or some other option)
> to pass these pages guest to host, but this only
> works well if page size matches the host page size.

I'm afraid I don't yet understand what kind of pages that would be and
how they are different from ballooned pages.

I still tend to think that ballooning is a sensible solution to the
problem at hand; it's just the granularity that makes things slow and
stands in the way.

Roman.


Re: [Qemu-devel] [RFC qemu 0/4] A PV solution for live migration optimization

2016-03-10 Thread Roman Kagan
On Wed, Mar 09, 2016 at 02:38:52PM -0500, Rik van Riel wrote:
> On Wed, 2016-03-09 at 20:04 +0300, Roman Kagan wrote:
> > On Wed, Mar 09, 2016 at 05:41:39PM +0200, Michael S. Tsirkin wrote:
> > > On Wed, Mar 09, 2016 at 05:28:54PM +0300, Roman Kagan wrote:
> > > > For (1) I've been trying to make a point that skipping clean
> > > > pages is
> > > > much more likely to result in noticable benefit than free pages
> > > > only.
> > > 
> > > I guess when you say clean you mean zero?
> > 
> > No I meant clean, i.e. those that could be evicted from RAM without
> > causing I/O.
> > 
> 
> Programs in the guest may have that memory mmapped.
> This could include things like libraries and executables.
> 
> How do you deal with the guest page cache containing
> references to now non-existent memory?
> 
> How do you re-populate the memory on the destination
> host?

I guess the confusion is due to the context I stripped from the previous
messages...  Actually I've been talking about doing full-fledged balloon
inflation before the migration, so, when it's deflated the guest will
fault in that data from the filesystem as usual.

Roman.


Re: [Qemu-devel] [RFC qemu 0/4] A PV solution for live migration optimization

2016-03-10 Thread Roman Kagan
On Wed, Mar 09, 2016 at 07:39:18PM +0200, Michael S. Tsirkin wrote:
> On Wed, Mar 09, 2016 at 08:04:39PM +0300, Roman Kagan wrote:
> > On Wed, Mar 09, 2016 at 05:41:39PM +0200, Michael S. Tsirkin wrote:
> > > On Wed, Mar 09, 2016 at 05:28:54PM +0300, Roman Kagan wrote:
> > > > For (1) I've been trying to make a point that skipping clean pages is
> > > > much more likely to result in noticable benefit than free pages only.
> > > 
> > > I guess when you say clean you mean zero?
> > 
> > No I meant clean, i.e. those that could be evicted from RAM without
> > causing I/O.
> 
> They must be migrated unless guest actually evicts them.

If the balloon is inflated the guest will.

> It's not at all clear to me that it's always preferable
> to drop all clean pages from pagecache. It is clearly is
> going to slow the guest down significantly.

That's a matter for optimization.  The current value for
/proc/meminfo:MemAvailable (which is being proposed as a member of
balloon stats, too) is a conservative estimate which will probably cover
a good deal of cases.
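
For illustration, fetching that estimate from the guest is trivial
(sketch; error handling kept minimal):

#include <stdio.h>

/* returns MemAvailable in kB, or -1 on failure */
static long mem_available_kb(void)
{
        char line[128];
        long kb = -1;
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f))
                if (sscanf(line, "MemAvailable: %ld kB", &kb) == 1)
                        break;
        fclose(f);
        return kb;
}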

> > I must be missing something obvious, but how is that different from
> > inflating and then immediately deflating the balloon?
> 
> It's exactly the same except
> - we do not initiate this from host - it's guest doing
>   things for its own reasons
> - a bit less guest/host interaction this way

I don't quite understand why you need to deflate the balloon until the
VM is on the destination host.  deflate_on_oom will do it if the guest
is really tight on memory; otherwise there appears to be no reason for
it.  But then inflation followed immediately by deflation doubles the
guest/host interactions rather than reduces them, no?

> > it's just the granularity that makes things slow and
> > stands in the way.
> 
> So we could request a specific page size/alignment from guest.
> Send guest request to give us memory in aligned units of 2Mbytes,
> and then host can treat each of these as a single huge page.

I'd guess just coalescing contiguous pages would already speed things
up.  I'll try to find some time to experiment with it.
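
The coalescing itself is trivial; a sketch with hypothetical names,
assuming a sorted list of page frame numbers:

#include <stddef.h>

struct pfn_range { unsigned long start, count; };

/* collapse sorted PFNs into ranges; returns the number of ranges */
static size_t coalesce_pfns(const unsigned long *pfns, size_t n,
                            struct pfn_range *out)
{
        size_t nr = 0;

        for (size_t i = 0; i < n; i++) {
                if (nr && pfns[i] == out[nr - 1].start + out[nr - 1].count)
                        out[nr - 1].count++;    /* extends previous range */
                else
                        out[nr++] = (struct pfn_range){ pfns[i], 1 };
        }
        return nr;
}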

Roman.


Re: [PATCH] x86/kvm/hyper-v: avoid spurious pending stimer on vCPU init

2019-03-18 Thread Roman Kagan
On Wed, Mar 13, 2019 at 06:13:42PM +0100, Vitaly Kuznetsov wrote:
> When userspace initializes guest vCPUs it may want to zero all supported
> MSRs including Hyper-V related ones including HV_X64_MSR_STIMERn_CONFIG/
> HV_X64_MSR_STIMERn_COUNT. With commit f3b138c5d89a ("kvm/x86: Update SynIC
> timers on guest entry only") we began doing stimer_mark_pending()
> unconditionally on every config change.
> 
> The issue I'm observing manifests itself as following:
> - Qemu writes 0 to STIMERn_{CONFIG,COUNT} MSRs and marks all stimers as
>   pending in stimer_pending_bitmap, arms KVM_REQ_HV_STIMER;
> - kvm_hv_has_stimer_pending() starts returning true;
> - kvm_vcpu_has_events() starts returning true;
> - kvm_arch_vcpu_runnable() starts returning true;
> - when kvm_arch_vcpu_ioctl_run() gets into
>   (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) case:
>   - kvm_vcpu_block() gets in 'kvm_vcpu_check_block(vcpu) < 0' and returns
> immediately, avoiding normal wait path;
>   - -EAGAIN is returned from kvm_arch_vcpu_ioctl_run() immediately forcing
> userspace to retry.
> 
> So instead of normal wait path we get a busy loop on all secondary vCPUs
> before they get INIT signal. This seems to be undesirable, especially given
> that this happens even when Hyper-V extensions are not used.
> 
> Generally, it seems to be pointless to mark an stimer as pending in
> stimer_pending_bitmap and arm KVM_REQ_HV_STIMER as the only thing
> kvm_hv_process_stimers() will do is clear the corresponding bit. We may
> just not mark disabled timers as pending instead.
> 
> Fixes: f3b138c5d89a ("kvm/x86: Update SynIC timers on guest entry only")
> Signed-off-by: Vitaly Kuznetsov 
> ---
>  arch/x86/kvm/hyperv.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)

Reviewed-by: Roman Kagan 


[PATCH v2] x86/hyperv: make vapic support x2apic mode

2019-10-02 Thread Roman Kagan
Now that there's Hyper-V IOMMU driver, Linux can switch to x2apic mode
when supported by the vcpus.

However, the apic access functions for Hyper-V enlightened apic assume
xapic mode only.

As a result, Linux fails to bring up secondary cpus when run as a guest
in QEMU/KVM with both hv_apic and x2apic enabled.

I didn't manage to make my instance of Hyper-V expose x2apic to the
guest; nor does the Hyper-V spec document the expected behavior.  However,
a Windows guest running in QEMU/KVM with hv_apic and x2apic and a big
number of vcpus (so that it turns on x2apic mode) does use enlightened
apic MSRs passing unshifted 32bit destination id and falls back to the
regular x2apic MSRs for less frequently used apic fields.

So implement the same behavior, by replacing enlightened apic access
functions (only those where it makes a difference) with their
x2apic-aware versions when x2apic is in use.

Fixes: 29217a474683 ("iommu/hyper-v: Add Hyper-V stub IOMMU driver")
Fixes: 6b48cb5f8347 ("X86/Hyper-V: Enlighten APIC access")
Cc: sta...@vger.kernel.org
Signed-off-by: Roman Kagan 
---
v1 -> v2:
- add ifdefs to handle !CONFIG_X86_X2APIC

 arch/x86/hyperv/hv_apic.c | 54 ---
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 5c056b8aebef..eb1434ae9e46 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -84,6 +84,44 @@ static void hv_apic_write(u32 reg, u32 val)
}
 }
 
+#ifdef CONFIG_X86_X2APIC
+static void hv_x2apic_icr_write(u32 low, u32 id)
+{
+   wrmsr(HV_X64_MSR_ICR, low, id);
+}
+
+static u32 hv_x2apic_read(u32 reg)
+{
+   u32 reg_val, hi;
+
+   switch (reg) {
+   case APIC_EOI:
+   rdmsr(HV_X64_MSR_EOI, reg_val, hi);
+   return reg_val;
+   case APIC_TASKPRI:
+   rdmsr(HV_X64_MSR_TPR, reg_val, hi);
+   return reg_val;
+
+   default:
+   return native_apic_msr_read(reg);
+   }
+}
+
+static void hv_x2apic_write(u32 reg, u32 val)
+{
+   switch (reg) {
+   case APIC_EOI:
+   wrmsr(HV_X64_MSR_EOI, val, 0);
+   break;
+   case APIC_TASKPRI:
+   wrmsr(HV_X64_MSR_TPR, val, 0);
+   break;
+   default:
+   native_apic_msr_write(reg, val);
+   }
+}
+#endif /* CONFIG_X86_X2APIC */
+
 static void hv_apic_eoi_write(u32 reg, u32 val)
 {
struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()];
@@ -262,9 +300,19 @@ void __init hv_apic_init(void)
if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
pr_info("Hyper-V: Using MSR based APIC access\n");
apic_set_eoi_write(hv_apic_eoi_write);
-   apic->read  = hv_apic_read;
-   apic->write = hv_apic_write;
-   apic->icr_write = hv_apic_icr_write;
+#ifdef CONFIG_X86_X2APIC
+   if (x2apic_enabled()) {
+   apic->read  = hv_x2apic_read;
+   apic->write = hv_x2apic_write;
+   apic->icr_write = hv_x2apic_icr_write;
+   } else {
+#endif
+   apic->read  = hv_apic_read;
+   apic->write = hv_apic_write;
+   apic->icr_write = hv_apic_icr_write;
+#ifdef CONFIG_X86_X2APIC
+   }
+#endif
apic->icr_read  = hv_apic_icr_read;
}
 }
-- 
2.21.0



Re: [PATCH v2] x86/hyperv: make vapic support x2apic mode

2019-10-03 Thread Roman Kagan
On Thu, Oct 03, 2019 at 12:54:03PM +0200, Vitaly Kuznetsov wrote:
> Roman Kagan  writes:
> 
> > Now that there's Hyper-V IOMMU driver, Linux can switch to x2apic mode
> > when supported by the vcpus.
> >
> > However, the apic access functions for Hyper-V enlightened apic assume
> > xapic mode only.
> >
> > As a result, Linux fails to bring up secondary cpus when run as a guest
> > in QEMU/KVM with both hv_apic and x2apic enabled.
> >
> > I didn't manage to make my instance of Hyper-V expose x2apic to the
> > guest; nor does the Hyper-V spec document the expected behavior.  However,
> > a Windows guest running in QEMU/KVM with hv_apic and x2apic and a big
> > number of vcpus (so that it turns on x2apic mode) does use enlightened
> > apic MSRs passing unshifted 32bit destination id and falls back to the
> > regular x2apic MSRs for less frequently used apic fields.
> >
> > So implement the same behavior, by replacing enlightened apic access
> > functions (only those where it makes a difference) with their
> > x2apic-aware versions when x2apic is in use.
> >
> > Fixes: 29217a474683 ("iommu/hyper-v: Add Hyper-V stub IOMMU driver")
> > Fixes: 6b48cb5f8347 ("X86/Hyper-V: Enlighten APIC access")
> > Cc: sta...@vger.kernel.org
> > Signed-off-by: Roman Kagan 
> > ---
> > v1 -> v2:
> > - add ifdefs to handle !CONFIG_X86_X2APIC
> >
> >  arch/x86/hyperv/hv_apic.c | 54 ---
> >  1 file changed, 51 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
> > index 5c056b8aebef..eb1434ae9e46 100644
> > --- a/arch/x86/hyperv/hv_apic.c
> > +++ b/arch/x86/hyperv/hv_apic.c
> > @@ -84,6 +84,44 @@ static void hv_apic_write(u32 reg, u32 val)
> > }
> >  }
> >  
> > +#ifdef CONFIG_X86_X2APIC
> > +static void hv_x2apic_icr_write(u32 low, u32 id)
> > +{
> > +   wrmsr(HV_X64_MSR_ICR, low, id);
> > +}
> 
> AFAIU you're trying to mirror native_x2apic_icr_write() here but this is
> different from what hv_apic_icr_write() does
> (SET_APIC_DEST_FIELD(id)).

Right.  In xapic mode the ICR2 aka the high 4 bytes of ICR is programmed
with the destination id in the highest byte; in x2apic mode the whole
ICR2 is set to the 32bit destination id.
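
To illustrate (these helpers are mine, not from the patch): where the
destination id ends up in the 64-bit value written to HV_X64_MSR_ICR in
each mode:

static inline u64 icr_xapic(u32 low, u32 id)
{
        return ((u64)(id << 24) << 32) | low;   /* dest id in bits 56-63 */
}

static inline u64 icr_x2apic(u32 low, u32 id)
{
        return ((u64)id << 32) | low;           /* whole id in bits 32-63 */
}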

> Is it actually correct? (I think you've tested this and it is but)

As I wrote in the commit log, I haven't tested it in the sense that I
ran a Linux guest in a Hyper-V VM exposing x2apic to the guest, because
I didn't manage to configure it to do so.  OTOH I did run a Windows
guest in QEMU/KVM with hv_apic and x2apic enabled and saw it write
destination ids unshifted to the ICR2 part of ICR, so I assume it's
correct.

> Michael, could you please shed some light here?

Would be appreciated, indeed.

> > +static u32 hv_x2apic_read(u32 reg)
> > +{
> > +   u32 reg_val, hi;
> > +
> > +   switch (reg) {
> > +   case APIC_EOI:
> > +   rdmsr(HV_X64_MSR_EOI, reg_val, hi);
> > +   return reg_val;
> > +   case APIC_TASKPRI:
> > +   rdmsr(HV_X64_MSR_TPR, reg_val, hi);
> > +   return reg_val;
> > +
> > +   default:
> > +   return native_apic_msr_read(reg);
> > +   }
> > +}
> > +
> > +static void hv_x2apic_write(u32 reg, u32 val)
> > +{
> > +   switch (reg) {
> > +   case APIC_EOI:
> > +   wrmsr(HV_X64_MSR_EOI, val, 0);
> > +   break;
> > +   case APIC_TASKPRI:
> > +   wrmsr(HV_X64_MSR_TPR, val, 0);
> > +   break;
> > +   default:
> > +   native_apic_msr_write(reg, val);
> > +   }
> > +}
> > +#endif /* CONFIG_X86_X2APIC */
> > +
> >  static void hv_apic_eoi_write(u32 reg, u32 val)
> >  {
> > struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()];
> > @@ -262,9 +300,19 @@ void __init hv_apic_init(void)
> > if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
> > pr_info("Hyper-V: Using MSR based APIC access\n");
> > apic_set_eoi_write(hv_apic_eoi_write);
> > -   apic->read  = hv_apic_read;
> > -   apic->write = hv_apic_write;
> > -   apic->icr_write = hv_apic_icr_write;
> > +#ifdef CONFIG_X86_X2APIC
> > +   if (x2apic_enabled()) {
> > +   apic->read  = hv_x2apic_read;
> > +   apic->write = hv_x2apic_write;
> > +   apic->icr_write = hv_x2apic_icr_write;
> > + 

[PATCH] x86/hyperv: make vapic support x2apic mode

2019-09-30 Thread Roman Kagan
Now that there's Hyper-V IOMMU driver, Linux can switch to x2apic mode
when supported by the vcpus.

However, the apic access functions for Hyper-V enlightened apic assume
xapic mode only.

As a result, Linux fails to bring up secondary cpus when run as a guest
in QEMU/KVM with both hv_apic and x2apic enabled.

I didn't manage to make my instance of Hyper-V expose x2apic to the
guest; nor does the Hyper-V spec document the expected behavior.  However,
a Windows guest running in QEMU/KVM with hv_apic and x2apic and a big
number of vcpus (so that it turns on x2apic mode) does use enlightened
apic MSRs passing unshifted 32bit destination id and falls back to the
regular x2apic MSRs for less frequently used apic fields.

So implement the same behavior, by replacing enlightened apic access
functions (only those where it makes a difference) with their
x2apic-aware versions when x2apic is in use.

Fixes: 29217a474683 ("iommu/hyper-v: Add Hyper-V stub IOMMU driver")
Fixes: 6b48cb5f8347 ("X86/Hyper-V: Enlighten APIC access")
Cc: sta...@vger.kernel.org
Signed-off-by: Roman Kagan 
---
 arch/x86/hyperv/hv_apic.c | 48 ---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 5c056b8aebef..9564fec00375 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -53,6 +53,11 @@ static void hv_apic_icr_write(u32 low, u32 id)
wrmsrl(HV_X64_MSR_ICR, reg_val);
 }
 
+static void hv_x2apic_icr_write(u32 low, u32 id)
+{
+   wrmsr(HV_X64_MSR_ICR, low, id);
+}
+
 static u32 hv_apic_read(u32 reg)
 {
u32 reg_val, hi;
@@ -70,6 +75,23 @@ static u32 hv_apic_read(u32 reg)
}
 }
 
+static u32 hv_x2apic_read(u32 reg)
+{
+   u32 reg_val, hi;
+
+   switch (reg) {
+   case APIC_EOI:
+   rdmsr(HV_X64_MSR_EOI, reg_val, hi);
+   return reg_val;
+   case APIC_TASKPRI:
+   rdmsr(HV_X64_MSR_TPR, reg_val, hi);
+   return reg_val;
+
+   default:
+   return native_apic_msr_read(reg);
+   }
+}
+
 static void hv_apic_write(u32 reg, u32 val)
 {
switch (reg) {
@@ -84,6 +106,20 @@ static void hv_apic_write(u32 reg, u32 val)
}
 }
 
+static void hv_x2apic_write(u32 reg, u32 val)
+{
+   switch (reg) {
+   case APIC_EOI:
+   wrmsr(HV_X64_MSR_EOI, val, 0);
+   break;
+   case APIC_TASKPRI:
+   wrmsr(HV_X64_MSR_TPR, val, 0);
+   break;
+   default:
+   native_apic_msr_write(reg, val);
+   }
+}
+
 static void hv_apic_eoi_write(u32 reg, u32 val)
 {
struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()];
@@ -262,9 +298,15 @@ void __init hv_apic_init(void)
if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
pr_info("Hyper-V: Using MSR based APIC access\n");
apic_set_eoi_write(hv_apic_eoi_write);
-   apic->read  = hv_apic_read;
-   apic->write = hv_apic_write;
-   apic->icr_write = hv_apic_icr_write;
+   if (x2apic_enabled()) {
+   apic->read  = hv_x2apic_read;
+   apic->write = hv_x2apic_write;
+   apic->icr_write = hv_x2apic_icr_write;
+   } else {
+   apic->read  = hv_apic_read;
+   apic->write = hv_apic_write;
+   apic->icr_write = hv_apic_icr_write;
+   }
apic->icr_read  = hv_apic_icr_read;
}
 }
-- 
2.21.0



Re: [PATCH] x86/hyperv: make vapic support x2apic mode

2019-10-01 Thread Roman Kagan
On Tue, Oct 01, 2019 at 06:44:08AM +0800, kbuild test robot wrote:
> url:
> https://github.com/0day-ci/linux/commits/Roman-Kagan/x86-hyperv-make-vapic-support-x2apic-mode/20191001-044238
> config: x86_64-randconfig-e004-201939 (attached as .config)
> compiler: gcc-7 (Debian 7.4.0-13) 7.4.0
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=x86_64 
> 
> If you fix the issue, kindly add following tag
> Reported-by: kbuild test robot 
> 
> All errors (new ones prefixed by >>):
> 
>arch/x86/hyperv/hv_apic.c: In function 'hv_x2apic_read':
> >> arch/x86/hyperv/hv_apic.c:91:10: error: implicit declaration of function 
> >> 'native_apic_msr_read'; did you mean 'native_apic_icr_read'? 
> >> [-Werror=implicit-function-declaration]
>   return native_apic_msr_read(reg);
>  ^~~~
>  native_apic_icr_read
>arch/x86/hyperv/hv_apic.c: In function 'hv_x2apic_write':
> >> arch/x86/hyperv/hv_apic.c:119:3: error: implicit declaration of function 
> >> 'native_apic_msr_write'; did you mean 'native_apic_icr_write'? 
> >> [-Werror=implicit-function-declaration]
>   native_apic_msr_write(reg, val);
>   ^
>   native_apic_icr_write
>cc1: some warnings being treated as errors

Oops, !CONFIG_X86_X2APIC needs to be handled.  Will post v2.

Roman.


Re: [PATCH v3 15/16] kvm: x86: ioapic: Lazy update IOAPIC EOI

2019-10-09 Thread Roman Kagan
On Wed, Oct 09, 2019 at 11:21:41AM +0200, Paolo Bonzini wrote:
> On 13/09/19 21:01, Suthikulpanit, Suravee wrote:
> > /*
> > +* In case APICv accelerates the EOI write and does not trap it,
> > +* the in-kernel IOAPIC will not be able to receive the EOI.
> > +* In this case, we do a lazy update of the pending EOI when
> > +* trying to set an IOAPIC irq.
> > +*/
> > +   if (kvm_apicv_eoi_accelerate(ioapic->kvm, edge))
> > +   ioapic_lazy_update_eoi(ioapic, irq);
> > +
> 
> This is okay for the RTC, and in fact I suggest that you make it work
> like this even for Intel.  This will get rid of kvm_apicv_eoi_accelerate
> and be nicer overall.
> 
> However, it cannot work for the in-kernel PIT, because it is currently
> checking ps->irq_ack before kvm_set_irq.  Unfortunately, the in-kernel
> PIT is relying on the ack notifier to timely queue the pt->worker work
> item and reinject the missed tick.
> 
> Thus, you cannot enable APICv if ps->reinject is true.
> 
> Perhaps you can make kvm->arch.apicv_state a disabled counter?  Then
> Hyper-V can increment it when enabled, PIT can increment it when
> ps->reinject becomes true and decrement it when it becomes false;
> finally, svm.c can increment it when an SVM guest is launched and
> increment/decrement it around ExtINT handling?

This can benefit Hyper-V emulation too.  The point is that only the
AutoEOI feature in SynIC is incompatible with APICv.  So the VM can
use APICv until the guest configures its first AutoEOI SINT.  If the
hypervisor sets HV_DEPRECATING_AEOI_RECOMMENDED (bit 9) in the
HYPERV_CPUID_ENLIGHTMENT_INFO (0x40000004) cpuid leaf this may never
happen, so we will not be pessimizing guests on modern hardware by
merely enabling SynIC.  I started looking into this recently and would
be happy to piggy-back on this series.
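
Roughly the idea, with a hypothetical helper for flipping the APICv
state (the real hook would have to come from this series):

        /* in the SINT MSR write handler */
        if (!host && (data & HV_SYNIC_SINT_AUTO_EOI) &&
            !synic->autoeoi_used++)
                kvm_hv_deactivate_apicv(vcpu->kvm);     /* hypothetical */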

Roman.

> (This conflicts with some of the suggestions I made earlier, which
> implied the existence of apicv_state, but everything should if anything
> become easier).
> 
> Paolo


[PATCH v3] x86/hyperv: make vapic support x2apic mode

2019-10-09 Thread Roman Kagan
Now that there's Hyper-V IOMMU driver, Linux can switch to x2apic mode
when supported by the vcpus.

However, the apic access functions for Hyper-V enlightened apic assume
xapic mode only.

As a result, Linux fails to bring up secondary cpus when run as a guest
in QEMU/KVM with both hv_apic and x2apic enabled.

According to Michael Kelley, when in x2apic mode, the Hyper-V synthetic
apic MSRs behave exactly the same as the corresponding architectural
x2apic MSRs, so there's no need to override the apic accessors.  The
only exception is hv_apic_eoi_write, which benefits from lazy EOI when
available; however, its implementation works for both xapic and x2apic
modes.

Fixes: 29217a474683 ("iommu/hyper-v: Add Hyper-V stub IOMMU driver")
Fixes: 6b48cb5f8347 ("X86/Hyper-V: Enlighten APIC access")
Cc: sta...@vger.kernel.org
Suggested-by: Michael Kelley 
Signed-off-by: Roman Kagan 
---
v2 -> v3:
- do not introduce x2apic-capable hv_apic accessors; leave original
  x2apic accessors instead

v1 -> v2:
- add ifdefs to handle !CONFIG_X86_X2APIC

 arch/x86/hyperv/hv_apic.c | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 5c056b8aebef..26eeff5bd535 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -261,10 +261,19 @@ void __init hv_apic_init(void)
 
if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
pr_info("Hyper-V: Using MSR based APIC access\n");
+   /*
+* With x2apic, architectural x2apic MSRs are equivalent to the
+* respective synthetic MSRs, so there's no need to override
+* the apic accessors.  The only exception is
+* hv_apic_eoi_write, because it benefits from lazy EOI when
+* available, but it works for both xapic and x2apic modes.
+*/
apic_set_eoi_write(hv_apic_eoi_write);
-   apic->read  = hv_apic_read;
-   apic->write = hv_apic_write;
-   apic->icr_write = hv_apic_icr_write;
-   apic->icr_read  = hv_apic_icr_read;
+   if (!x2apic_enabled()) {
+   apic->read  = hv_apic_read;
+   apic->write = hv_apic_write;
+   apic->icr_write = hv_apic_icr_write;
+   apic->icr_read  = hv_apic_icr_read;
+   }
}
 }
-- 
2.21.0



Re: [PATCH v2] x86/hyperv: make vapic support x2apic mode

2019-10-04 Thread Roman Kagan
On Fri, Oct 04, 2019 at 03:01:51AM +, Michael Kelley wrote:
> From: Roman Kagan  Sent: Thursday, October 3, 2019 5:53 
> AM
> > >
> > > AFAIU you're trying to mirror native_x2apic_icr_write() here but this is
> > > different from what hv_apic_icr_write() does
> > > (SET_APIC_DEST_FIELD(id)).
> > 
> > Right.  In xapic mode the ICR2 aka the high 4 bytes of ICR is programmed
> > with the destination id in the highest byte; in x2apic mode the whole
> > ICR2 is set to the 32bit destination id.
> > 
> > > Is it actually correct? (I think you've tested this and it is but)
> > 
> > As I wrote in the commit log, I haven't tested it in the sense that I
> > ran a Linux guest in a Hyper-V VM exposing x2apic to the guest, because
> > I didn't manage to configure it to do so.  OTOH I did run a Windows
> > guest in QEMU/KVM with hv_apic and x2apic enabled and saw it write
> > destination ids unshifted to the ICR2 part of ICR, so I assume it's
> > correct.
> > 
> > > Michael, could you please shed some light here?
> > 
> > Would be appreciated, indeed.
> > 
> 
> The newest version of Hyper-V provides an x2apic in a guest VM when the
> number of vCPUs in the VM is > 240.  This version of Hyper-V is beginning
> to be deployed in Azure to enable the M416v2 VM size, but the functionality
> is not yet available for the on-premises version of Hyper-V.  However, I can
> test this configuration internally with the above patch -- give me a few days.
> 
> An additional complication is that when running on Intel processors that offer
> vAPIC functionality, the Hyper-V "hints" value does *not* recommend using the
> MSR-based APIC accesses.  In this case, memory-mapped access to the x2apic
> registers is faster than the synthetic MSRs.

I guess you mean "using regular x2apic MSRs compared to the synthetic
MSRs".  Indeed they do essentially the same thing, and there's no reason
for one set of MSRs to be significantly faster than the other.  However,
hv_apic_eoi_write makes use of "apic assists" aka lazy EOI which is
certainly a win, and I'm not sure if it works without hv_apic.
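
For reference, the lazy EOI part is what makes hv_apic_eoi_write
special; from memory (so treat it as a sketch), it elides the EOI write
entirely when the VP assist page says none is needed:

static void hv_apic_eoi_write(u32 reg, u32 val)
{
        struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()];

        /* the hypervisor already performed the EOI on our behalf */
        if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1))
                return;

        wrmsr(HV_X64_MSR_EOI, val, 0);
}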

> I've already looked at a VM that has
> the x2apic, and indeed that is the case, so the above code wouldn't run
> anyway.  But I can temporarily code around that for testing purposes and see
> if everything works.

Thanks!
Roman.


Re: [RFC v2 0/2] kvm: Use host timekeeping in guest.

2019-10-10 Thread Roman Kagan
On Thu, Oct 10, 2019 at 04:30:53PM +0900, Suleiman Souhlal wrote:
> This RFC is to try to solve the following problem:
> 
> We have some applications that are currently running in their
> own namespace, that still talk to other processes on the
> machine, using IPC, and expect to run on the same machine.
> 
> We want to move them into a virtual machine, for the usual
> benefits of virtualization.
> 
> However, some of these programs use CLOCK_MONOTONIC and
> CLOCK_BOOTTIME timestamps, as part of their protocol, when talking
> to the host.
> 
> Generally speaking, we have multiple event sources, for example
> sensors, input devices, display controller vsync, etc and we would
> like to rely on them in the guest for various scenarios.
> 
> As a specific example, we are trying to run some wayland clients
> (in the guest) who talk to the server (in the host), and the server
> gives input events based on host time. Additionally, there are also
> vsync events that the clients use for timing their rendering.
> 
> Another use case we have are timestamps from IIO sensors and cameras.
> There are applications that need to determine how the timestamps
> relate to the current time and the only way to get current time is
> clock_gettime(), which would return a value from a different time
> domain than the timestamps.
> 
> In this case, it is not feasible to change these programs, due to
> the number of the places we would have to change.
> 
> We spent some time thinking about this, and the best solution we
> could come up with was the following:
> 
> Make the guest kernel return the same CLOCK_MONOTONIC and
> CLOCK_GETTIME timestamps as the host.
> 
> To do that, I am changing kvmclock to request to the host to copy
> its timekeeping parameters (mult, base, cycle_last, etc), so that
> the guest timekeeper can use the same values, so that time can
> be synchronized between the guest and the host.

I wonder how feasible it is to map the host's vdso into the guest and
thus make the guest use the *same* (as opposed to "synchronized") clock
as the host's userspace?  Another benefit is that it's essentially an
ABI so is not changed as liberally as internal structures like
timekeeper, etc.  There is probably certain complication in handling the
syscall fallback in the vdso when used in the guest kernel, though.

You'll also need to ensure neither tsc scaling and nor offsetting is
applied to the VM once this clock is enabled.

Roman.


[PATCH v4] x86/hyperv: make vapic support x2apic mode

2019-10-10 Thread Roman Kagan
Now that there's Hyper-V IOMMU driver, Linux can switch to x2apic mode
when supported by the vcpus.

However, the apic access functions for Hyper-V enlightened apic assume
xapic mode only.

As a result, Linux fails to bring up secondary cpus when run as a guest
in QEMU/KVM with both hv_apic and x2apic enabled.

According to Michael Kelley, when in x2apic mode, the Hyper-V synthetic
apic MSRs behave exactly the same as the corresponding architectural
x2apic MSRs, so there's no need to override the apic accessors.  The
only exception is hv_apic_eoi_write, which benefits from lazy EOI when
available; however, its implementation works for both xapic and x2apic
modes.

Fixes: 29217a474683 ("iommu/hyper-v: Add Hyper-V stub IOMMU driver")
Fixes: 6b48cb5f8347 ("X86/Hyper-V: Enlighten APIC access")
Cc: sta...@vger.kernel.org
Suggested-by: Michael Kelley 
Reviewed-by: Vitaly Kuznetsov 
Reviewed-by: Michael Kelley 
Signed-off-by: Roman Kagan 
---
v3 -> v4:
- adjust the log message [Vitaly, Michael]

v2 -> v3:
- do not introduce x2apic-capable hv_apic accessors; leave original
  x2apic accessors instead

v1 -> v2:
- add ifdefs to handle !CONFIG_X86_X2APIC

 arch/x86/hyperv/hv_apic.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 5c056b8aebef..e01078e93dd3 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -260,11 +260,21 @@ void __init hv_apic_init(void)
}
 
if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
-   pr_info("Hyper-V: Using MSR based APIC access\n");
+   pr_info("Hyper-V: Using enlightened APIC (%s mode)",
+   x2apic_enabled() ? "x2apic" : "xapic");
+   /*
+* With x2apic, architectural x2apic MSRs are equivalent to the
+* respective synthetic MSRs, so there's no need to override
+* the apic accessors.  The only exception is
+* hv_apic_eoi_write, because it benefits from lazy EOI when
+* available, but it works for both xapic and x2apic modes.
+*/
apic_set_eoi_write(hv_apic_eoi_write);
-   apic->read  = hv_apic_read;
-   apic->write = hv_apic_write;
-   apic->icr_write = hv_apic_icr_write;
-   apic->icr_read  = hv_apic_icr_read;
+   if (!x2apic_enabled()) {
+   apic->read  = hv_apic_read;
+   apic->write = hv_apic_write;
+   apic->icr_write = hv_apic_icr_write;
+   apic->icr_read  = hv_apic_icr_read;
+   }
}
 }
-- 
2.21.0



Re: [PATCH 1/2] x86: kvm: avoid -Wsometimes-uninitialized warning

2019-07-12 Thread Roman Kagan
On Fri, Jul 12, 2019 at 11:12:29AM +0200, Arnd Bergmann wrote:
> clang points out that running a 64-bit guest on a 32-bit host
> would lead to uninitialized variables:
> 
> arch/x86/kvm/hyperv.c:1610:6: error: variable 'ingpa' is used uninitialized 
> whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
> if (!longmode) {
> ^
> arch/x86/kvm/hyperv.c:1632:55: note: uninitialized use occurs here
> trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
>  ^
> arch/x86/kvm/hyperv.c:1610:2: note: remove the 'if' if its condition is 
> always true
> if (!longmode) {
> ^~~
> arch/x86/kvm/hyperv.c:1595:18: note: initialize the variable 'ingpa' to 
> silence this warning
> u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
> ^
>  = 0
> arch/x86/kvm/hyperv.c:1610:6: error: variable 'outgpa' is used uninitialized 
> whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
> arch/x86/kvm/hyperv.c:1610:6: error: variable 'param' is used uninitialized 
> whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
> 
> Since that combination is not supported anyway, change the condition
> to tell the compiler how the code is actually executed.

Hmm, the compiler *is* told all it needs:


arch/x86/kvm/x86.h:
...
static inline int is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
return vcpu->arch.efer & EFER_LMA;
#else
return 0;
#endif
}

static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
{
int cs_db, cs_l;

if (!is_long_mode(vcpu))
return false;
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
return cs_l;
}
...

so in !CONFIG_X86_64 case is_64_bit_mode() unconditionally returns
false, and the branch setting the values of the variables is always
taken.

> Signed-off-by: Arnd Bergmann 
> ---
>  arch/x86/kvm/hyperv.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index a39e38f13029..950436c502ba 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1607,7 +1607,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
>  
>   longmode = is_64_bit_mode(vcpu);
>  
> - if (!longmode) {
> + if (!IS_ENABLED(CONFIG_X86_64) || !longmode) {
>   param = ((u64)kvm_rdx_read(vcpu) << 32) |
>   (kvm_rax_read(vcpu) & 0x);
>   ingpa = ((u64)kvm_rbx_read(vcpu) << 32) |

So this is rather a workaround for the compiler giving a false positive.
I suggest at least rephrasing the log message to indicate this.

Roman.


Re: [PATCH] [v2] x86: kvm: avoid -Wsometimes-uninitialized warning

2019-07-12 Thread Roman Kagan
On Fri, Jul 12, 2019 at 03:32:43PM +0200, Arnd Bergmann wrote:
> clang points out that running a 64-bit guest on a 32-bit host
> would lead to uninitialized variables:
> 
> arch/x86/kvm/hyperv.c:1610:6: error: variable 'ingpa' is used uninitialized 
> whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
> if (!longmode) {
> ^
> arch/x86/kvm/hyperv.c:1632:55: note: uninitialized use occurs here
> trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
>  ^
> arch/x86/kvm/hyperv.c:1610:2: note: remove the 'if' if its condition is 
> always true
> if (!longmode) {
> ^~~
> arch/x86/kvm/hyperv.c:1595:18: note: initialize the variable 'ingpa' to 
> silence this warning
> u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
> ^
>  = 0
> arch/x86/kvm/hyperv.c:1610:6: error: variable 'outgpa' is used uninitialized 
> whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
> arch/x86/kvm/hyperv.c:1610:6: error: variable 'param' is used uninitialized 
> whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
> 
> Since that combination is not supported anyway, change the condition
> to tell the compiler how the code is actually executed.
> 
> Signed-off-by: Arnd Bergmann 
> ---
> v2: make the change inside of is_64_bit_mode().
> ---
>  arch/x86/kvm/hyperv.c | 6 ++
>  arch/x86/kvm/x86.h| 4 
>  2 files changed, 6 insertions(+), 4 deletions(-)

Reviewed-by: Roman Kagan 

However, I still think the log message could state more explicitly
that it was the compiler's fault, and that the patch is a workaround for
it.

Otherwise, later on someone may decide to restore the similarity of
is_64_bit_mode() to the other inlines in this file, and will be extremely
unlikely to test a clang 32-bit build...

Roman.


Re: [PATCH] [v3] x86: kvm: avoid -Wsometimes-uninitialized warning

2019-07-12 Thread Roman Kagan
On Fri, Jul 12, 2019 at 04:13:09PM +0200, Arnd Bergmann wrote:
> Clang notices a code path in which some variables are never
> initialized, but fails to figure out that this can never happen
> on i386 because is_64_bit_mode() always returns false.
> 
> arch/x86/kvm/hyperv.c:1610:6: error: variable 'ingpa' is used uninitialized 
> whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
> if (!longmode) {
> ^
> arch/x86/kvm/hyperv.c:1632:55: note: uninitialized use occurs here
> trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
>  ^
> arch/x86/kvm/hyperv.c:1610:2: note: remove the 'if' if its condition is 
> always true
> if (!longmode) {
> ^~~
> arch/x86/kvm/hyperv.c:1595:18: note: initialize the variable 'ingpa' to 
> silence this warning
> u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
> ^
>  = 0
> arch/x86/kvm/hyperv.c:1610:6: error: variable 'outgpa' is used uninitialized 
> whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
> arch/x86/kvm/hyperv.c:1610:6: error: variable 'param' is used uninitialized 
> whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
> 
> Flip the condition around to avoid the conditional execution on i386.
> 
> Signed-off-by: Arnd Bergmann 
> ---
> v3: reword commit log, simplify patch again
> v2: make the change inside of is_64_bit_mode().
> ---
>  arch/x86/kvm/hyperv.c | 20 +---
>  1 file changed, 9 insertions(+), 11 deletions(-)

Reviewed-by: Roman Kagan 


Re: [PATCH v4 RESEND 3/5] KVM: x86: hyperv: use get_vcpu_by_vpidx() in kvm_hv_flush_tlb()

2018-09-25 Thread Roman Kagan
On Mon, Sep 24, 2018 at 06:55:28PM +0200, Paolo Bonzini wrote:
> On 24/09/2018 18:24, Paolo Bonzini wrote:
> > Hi Paolo,
> > 
> > could you please clarify what needs to be done to get this merged? In
> > particular, are you OK with my comment above or do we need to do
> > something with it (e.g. get back to the 'logarythmic search' from v2)?
> > 
> > In kvm/queue I can see only 'x86/hyper-v: rename ipi_arg_{ex,non_ex}
> > structures' patch from this series applied.
> 
> Hi,
> 
> my plan was to apply only 1/2/5 for now.  I singled out the rename patch
> because that one could be included in 4.19-rc kernels as a cleanup.

Is this supposed to mean you're not happy with the approach taken in
Vitaly's patch?  Can you explain why?  I take my part of guilt for it so
I'd like to know, too.

Speaking of the options we have, the choice depends on the assumptions
we make.  (And I guess when you spoke of quadratic complexity you
referred to the algorithm to convert the vp_index mask into the KVM cpu
mask.)

If we can assume that in all relevant cases vp_index coincides with the
cpu index (which I think we can) then Vitaly's approach is the most
efficient.

If, on the opposite, we want to optimize for random mapping between
vp_index and cpu index, then it's probably better instead to iterate
over vcpus and test if their vp_index belongs to the requested mask.

Neither of the above is quadratic.
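
A sketch of the latter option, simplified to a single 64-bit mask (the
real hypercall uses sparse banks; names are illustrative):

        struct kvm_vcpu *vcpu;
        int i;

        kvm_for_each_vcpu(i, vcpu, kvm) {
                u32 vp_index = vcpu_to_hv_vcpu(vcpu)->vp_index;

                if (vp_index < 64 && (vcpu_mask & BIT_ULL(vp_index)))
                        __set_bit(i, vcpu_bitmap);      /* mark for flush */
        }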

Dunno if we need to specifically consider intermediate situations.

Anyway, using a heavier vp_index -> cpu index translation looks like
overkill to me.

What do you think?

Thanks,
Roman.


Re: [PATCH v4 RESEND 3/5] KVM: x86: hyperv: use get_vcpu_by_vpidx() in kvm_hv_flush_tlb()

2018-09-25 Thread Roman Kagan
On Tue, Sep 25, 2018 at 11:29:57AM +0200, Paolo Bonzini wrote:
> On 25/09/2018 10:57, Roman Kagan wrote:
> > If we can assume that in all relevant cases vp_index coincides with the
> > cpu index (which I think we can) then Vitaly's approach is the most
> > efficient.
> > 
> > If, on the opposite, we want to optimize for random mapping between
> > vp_index and cpu index, then it's probably better instead to iterate
> > over vcpus and test if their vp_index belongs to the requested mask.
> 
> Yes, that would work too.  Perhaps we can do both?  You can have a
> kvm->num_mismatched_vp_indexes count to choose between the two.

Makes sense to me.

Roman.


Re: [PATCH v6 4/7] KVM: x86: hyperv: keep track of mismatched VP indexes

2018-10-01 Thread Roman Kagan
On Mon, Oct 01, 2018 at 05:48:54PM +0200, Paolo Bonzini wrote:
> On 27/09/2018 11:17, Vitaly Kuznetsov wrote:
> > Roman Kagan  writes:
> > 
> >> On Wed, Sep 26, 2018 at 07:02:56PM +0200, Vitaly Kuznetsov wrote:
> >>> In most common cases VP index of a vcpu matches its vcpu index. Userspace
> >>> is, however, free to set any mapping it wishes and we need to account for
> >>> that when we need to find a vCPU with a particular VP index. To keep 
> >>> search
> >>> algorithms optimal in both cases introduce 'num_mismatched_vp_indexes'
> >>> counter showing how many vCPUs with mismatching VP index we have. In case
> >>> the counter is zero we can assume vp_index == vcpu_idx.
> >>>
> >>> Signed-off-by: Vitaly Kuznetsov 
> >>> ---
> >>>  arch/x86/include/asm/kvm_host.h |  3 +++
> >>>  arch/x86/kvm/hyperv.c   | 26 +++---
> >>>  2 files changed, 26 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/arch/x86/include/asm/kvm_host.h 
> >>> b/arch/x86/include/asm/kvm_host.h
> >>> index 09b2e3e2cf1b..711f79f1b5e6 100644
> >>> --- a/arch/x86/include/asm/kvm_host.h
> >>> +++ b/arch/x86/include/asm/kvm_host.h
> >>> @@ -781,6 +781,9 @@ struct kvm_hv {
> >>>   u64 hv_reenlightenment_control;
> >>>   u64 hv_tsc_emulation_control;
> >>>   u64 hv_tsc_emulation_status;
> >>> +
> >>> + /* How many vCPUs have VP index != vCPU index */
> >>> + atomic_t num_mismatched_vp_indexes;
> >>>  };
> >>>  
> >>>  enum kvm_irqchip_mode {
> >>> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> >>> index c8764faf783b..6a19c8e3c432 100644
> >>> --- a/arch/x86/kvm/hyperv.c
> >>> +++ b/arch/x86/kvm/hyperv.c
> >>> @@ -1045,11 +1045,31 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, 
> >>> u32 msr, u64 data, bool host)
> >>>   struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
> >>>  
> >>>   switch (msr) {
> >>> - case HV_X64_MSR_VP_INDEX:
> >>> - if (!host || (u32)data >= KVM_MAX_VCPUS)
> >>> + case HV_X64_MSR_VP_INDEX: {
> >>> + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
> >>> + int vcpu_idx = kvm_vcpu_get_idx(vcpu);
> >>> + u32 new_vp_index = (u32)data;
> >>> +
> >>> + if (!host || new_vp_index >= KVM_MAX_VCPUS)
> >>>   return 1;
> >>> - hv_vcpu->vp_index = (u32)data;
> >>> +
> >>> + if (new_vp_index == hv_vcpu->vp_index)
> >>> + return 0;
> >>> +
> >>> + /*
> >>> +  * VP index is changing, increment num_mismatched_vp_indexes in
> >>> +  * case it was equal to vcpu_idx before; on the other hand, if
> >>> +  * the new VP index matches vcpu_idx num_mismatched_vp_indexes
> >>> +  * needs to be decremented.
> >>
> >> It may be worth mentioning that the initial balance is provided by
> >> kvm_hv_vcpu_postcreate setting vp_index = vcpu_idx.
> >>
> > 
> > Of course, yes, will update the comment in case I'll be re-submitting.
> 
>   /*
>* VP index is initialized to hv_vcpu->vp_index by
>* kvm_hv_vcpu_postcreate so they initially match.  Now the
>* VP index is changing, adjust num_mismatched_vp_indexes if
>* it now matches or no longer matches vcpu_idx.
>*/
> 
> ?

To my taste - perfect :)

Roman.


Re: [PATCH v6 4/7] KVM: x86: hyperv: keep track of mismatched VP indexes

2018-10-01 Thread Roman Kagan
On Mon, Oct 01, 2018 at 03:54:26PM +, Roman Kagan wrote:
> On Mon, Oct 01, 2018 at 05:48:54PM +0200, Paolo Bonzini wrote:
> > On 27/09/2018 11:17, Vitaly Kuznetsov wrote:
> > > Roman Kagan  writes:
> > > 
> > >> On Wed, Sep 26, 2018 at 07:02:56PM +0200, Vitaly Kuznetsov wrote:
> > >>> In most common cases VP index of a vcpu matches its vcpu index. 
> > >>> Userspace
> > >>> is, however, free to set any mapping it wishes and we need to account 
> > >>> for
> > >>> that when we need to find a vCPU with a particular VP index. To keep 
> > >>> search
> > >>> algorithms optimal in both cases introduce 'num_mismatched_vp_indexes'
> > >>> counter showing how many vCPUs with mismatching VP index we have. In 
> > >>> case
> > >>> the counter is zero we can assume vp_index == vcpu_idx.
> > >>>
> > >>> Signed-off-by: Vitaly Kuznetsov 
> > >>> ---
> > >>>  arch/x86/include/asm/kvm_host.h |  3 +++
> > >>>  arch/x86/kvm/hyperv.c   | 26 +++---
> > >>>  2 files changed, 26 insertions(+), 3 deletions(-)
> > >>>
> > >>> diff --git a/arch/x86/include/asm/kvm_host.h 
> > >>> b/arch/x86/include/asm/kvm_host.h
> > >>> index 09b2e3e2cf1b..711f79f1b5e6 100644
> > >>> --- a/arch/x86/include/asm/kvm_host.h
> > >>> +++ b/arch/x86/include/asm/kvm_host.h
> > >>> @@ -781,6 +781,9 @@ struct kvm_hv {
> > >>> u64 hv_reenlightenment_control;
> > >>> u64 hv_tsc_emulation_control;
> > >>> u64 hv_tsc_emulation_status;
> > >>> +
> > >>> +   /* How many vCPUs have VP index != vCPU index */
> > >>> +   atomic_t num_mismatched_vp_indexes;
> > >>>  };
> > >>>  
> > >>>  enum kvm_irqchip_mode {
> > >>> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> > >>> index c8764faf783b..6a19c8e3c432 100644
> > >>> --- a/arch/x86/kvm/hyperv.c
> > >>> +++ b/arch/x86/kvm/hyperv.c
> > >>> @@ -1045,11 +1045,31 @@ static int kvm_hv_set_msr(struct kvm_vcpu 
> > >>> *vcpu, u32 msr, u64 data, bool host)
> > >>> struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
> > >>>  
> > >>> switch (msr) {
> > >>> -   case HV_X64_MSR_VP_INDEX:
> > >>> -   if (!host || (u32)data >= KVM_MAX_VCPUS)
> > >>> +   case HV_X64_MSR_VP_INDEX: {
> > >>> +   struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
> > >>> +   int vcpu_idx = kvm_vcpu_get_idx(vcpu);
> > >>> +   u32 new_vp_index = (u32)data;
> > >>> +
> > >>> +   if (!host || new_vp_index >= KVM_MAX_VCPUS)
> > >>> return 1;
> > >>> -   hv_vcpu->vp_index = (u32)data;
> > >>> +
> > >>> +   if (new_vp_index == hv_vcpu->vp_index)
> > >>> +   return 0;
> > >>> +
> > >>> +   /*
> > >>> +* VP index is changing, increment 
> > >>> num_mismatched_vp_indexes in
> > >>> +* case it was equal to vcpu_idx before; on the other 
> > >>> hand, if
> > >>> +* the new VP index matches vcpu_idx 
> > >>> num_mismatched_vp_indexes
> > >>> +* needs to be decremented.
> > >>
> > >> It may be worth mentioning that the initial balance is provided by
> > >> kvm_hv_vcpu_postcreate setting vp_index = vcpu_idx.
> > >>
> > > 
> > > Of course, yes, will update the comment in case I'll be re-submitting.
> > 
> > /*
> >  * VP index is initialized to hv_vcpu->vp_index by
  ^
  vcpu_idx

> >  * kvm_hv_vcpu_postcreate so they initially match.  Now the
> >  * VP index is changing, adjust num_mismatched_vp_indexes if
> >  * it now matches or no longer matches vcpu_idx.
> >  */
> > 
> > ?
> 
> To my taste - perfect :)

Well, almost :)

Roman.


Re: [PATCH] idr: fix invalid ptr dereference on item delete

2018-05-18 Thread Roman Kagan
On Fri, May 18, 2018 at 03:31:38PM -0700, Andrew Morton wrote:
> On Fri, 18 May 2018 10:50:25 -0700 Matthew Wilcox  wrote:
> 
> > If the radix tree underlying the IDR happens to be full and we attempt
> > to remove an id which is larger than any id in the IDR, we will call
> > __radix_tree_delete() with an uninitialised 'slot' pointer, at which
> > point anything could happen.  This was easiest to hit with a single entry
> > at id 0 and attempting to remove a non-0 id, but it could have happened
> > with 64 entries and attempting to remove an id >= 64.
> > 
> > Fixes: 0a835c4f090a ("Reimplement IDR and IDA using the radix tree")
> > Reported-by: syzbot+35666cba7f0a337e2...@syzkaller.appspotmail.com
> > Debugged-by: Roman Kagan 
> > Signed-off-by: Matthew Wilcox 
> 
> Neither of the changelogs I'm seeing attempts to describe the end-user
> impact of the bug.  People like to know that so they can decide which
> kernel version(s) need patching, so please always remember it.

That's my fault, Matthew may not have seen the original discussion among
the KVM folks.

> Looking at the syzbot report, the impact is at least "privileged user
> can trigger a WARN", but I assume there could be worse,

Unfortunately it is worse: the syzkaller test boils down to opening
/dev/kvm, creating an eventfd, and calling a couple of KVM ioctls.  None
of this requires superuser.  And the result is dereferencing an
uninitialized pointer, which is likely a crash.
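
The reproducer boils down to this (sketch; error checking dropped,
struct layout per the 4.17 uapi headers):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        int vm = ioctl(kvm, KVM_CREATE_VM, 0);
        struct kvm_hyperv_eventfd he = { .conn_id = 0, .fd = eventfd(0, 0) };

        ioctl(vm, KVM_HYPERV_EVENTFD, &he);     /* assign at conn_id 0 */
        he.conn_id = 5;
        he.fd = -1;
        he.flags = KVM_HYPERV_EVENTFD_DEASSIGN;
        ioctl(vm, KVM_HYPERV_EVENTFD, &he);     /* hits the IDR bug */
        return 0;
}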

> as-yet-undiscovered impacts.  So I'm thinking a cc:stable is needed,
> yes?

Well, the specific path caught by syzbot is via the KVM_HYPERV_EVENTFD
ioctl, which is new in 4.17.  But I guess there are other user-triggerable
paths, so cc:stable is probably justified.

Thanks,
Roman.


[PATCH] idr: fix invalid ptr dereference on item delete

2018-05-10 Thread Roman Kagan
If an IDR contains a single entry at index==0, the underlying radix tree
has a single item in its root node, in which case
__radix_tree_lookup(index!=0) doesn't set its *@nodep argument (in
addition to returning NULL).

However, the tree itself is not empty, i.e. the tree root doesn't have
IDR_FREE tag.

As a result, on an attempt to remove an index!=0 entry from such an IDR,
radix_tree_delete_item doesn't return early and calls
__radix_tree_delete with invalid parameters which are then dereferenced.

Reported-by: syzbot+35666cba7f0a337e2...@syzkaller.appspotmail.com
Signed-off-by: Roman Kagan 
---
 lib/radix-tree.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index da9e10c827df..10ff1bfae952 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -2040,8 +2040,9 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
void *entry;
 
entry = __radix_tree_lookup(root, index, &node, &slot);
-   if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE,
-   get_slot_offset(node, slot))))
+   if (!entry && (!is_idr(root) || !node ||
+  node_tag_get(root, node, IDR_FREE,
+   get_slot_offset(node, slot))))
return NULL;
 
if (item && entry != item)
-- 
2.17.0



Re: [PATCH] idr: fix invalid ptr dereference on item delete

2018-05-10 Thread Roman Kagan
On Fri, May 11, 2018 at 07:40:26AM +0200, Dmitry Vyukov wrote:
> On Fri, May 11, 2018 at 1:54 AM, Paolo Bonzini  wrote:
> > On 10/05/2018 21:16, Roman Kagan wrote:
> >> If an IDR contains a single entry at index==0, the underlying radix tree
> >> has a single item in its root node, in which case
> >> __radix_tree_lookup(index!=0) doesn't set its *@nodep argument (in
> >> addition to returning NULL).
> >>
> >> However, the tree itself is not empty, i.e. the tree root doesn't have
> >> IDR_FREE tag.
> >>
> >> As a result, on an attempt to remove an index!=0 entry from such an IDR,
> >> radix_tree_delete_item doesn't return early and calls
> >> __radix_tree_delete with invalid parameters which are then dereferenced.
> >>
> >> Reported-by: syzbot+35666cba7f0a337e2...@syzkaller.appspotmail.com
> >> Signed-off-by: Roman Kagan 
> >> ---
> >>  lib/radix-tree.c | 5 +++--
> >>  1 file changed, 3 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> >> index da9e10c827df..10ff1bfae952 100644
> >> --- a/lib/radix-tree.c
> >> +++ b/lib/radix-tree.c
> >> @@ -2040,8 +2040,9 @@ void *radix_tree_delete_item(struct radix_tree_root 
> >> *root,
> >>   void *entry;
> >>
> >>   entry = __radix_tree_lookup(root, index, &node, &slot);
> >> - if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE,
> >> - get_slot_offset(node, slot))))
> >> + if (!entry && (!is_idr(root) || !node ||
> >> +node_tag_get(root, node, IDR_FREE,
> >> + get_slot_offset(node, slot))))
> >>   return NULL;
> >>
> >>   if (item && entry != item)
> >>
> >
> > I cannot really vouch for the patch, but if it is correct it's
> > definitely stuff for stable.  The KVM testcase is only for 4.17-rc but
> > this is a really nasty bug in a core data structure.
> >
> > Cc: sta...@vger.kernel.org
> >
> > Should radix-tree be compilable in userspace, so that we can add unit
> > tests for it?...
> 
> Good point.
> 
> For my education, what/where are the tests that run as user-space code?

Actually there are userspace tests for it under tools/testing/radix-tree,
but I didn't manage to get them to build.  Looks like the recent
introduction of a spin_lock in the radix_tree structure (for XArray
work?) broke them.

Roman.


Re: WARNING in __mutex_unlock_slowpath

2018-05-08 Thread Roman Kagan
On Mon, May 07, 2018 at 07:19:04PM +0200, Paolo Bonzini wrote:
> On 29/04/2018 19:00, syzbot wrote:
> > syzbot hit the following crash on upstream commit
> > bf8f5de17442bba5f811e7e724980730e079ee11 (Sat Apr 28 17:05:04 2018 +)
> > MAINTAINERS: add myself as maintainer of AFFS
> > syzbot dashboard link:
> > https://syzkaller.appspot.com/bug?extid=35666cba7f0a337e2e79
> > 
> > C reproducer: https://syzkaller.appspot.com/x/repro.c?id=5686569910403072
> > syzkaller reproducer:
> > https://syzkaller.appspot.com/x/repro.syz?id=5767017265102848
> > Raw console output:
> > https://syzkaller.appspot.com/x/log.txt?id=6346308495343616
> > Kernel config:
> > https://syzkaller.appspot.com/x/.config?id=7043958930931867332
> > compiler: gcc (GCC) 8.0.1 20180413 (experimental)
> > 
> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
> > Reported-by: syzbot+35666cba7f0a337e2...@syzkaller.appspotmail.com
> > It will help syzbot understand when the bug is fixed. See footer for
> > details.
> > If you forward the report, please keep this part and the footer.
> > 
> > [ cut here ]
> > DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current)
> > WARNING: CPU: 0 PID: 4525 at kernel/locking/mutex.c:1032
> > __mutex_unlock_slowpath+0x62e/0x8a0 kernel/locking/mutex.c:1032
> > Kernel panic - not syncing: panic_on_warn set ...
> 
> This doesn't make much sense, unless it's a "generic" memory corruption,
> but at least the reproducer seems to be simple, just (in pseudocode)
> 
>   ioctl(kvm_vm_fd, KVM_HYPERV_EVENTFD,
> { fd = some_eventfd, conn_id = 0, flags = 0 })
>   ioctl(kvm_vm_fd, KVM_HYPERV_EVENTFD,
> { fd = -1, conn_id = 5, flags = KVM_HYPERV_EVENTFD_DEASSIGN })
> 
> Roman, Cathy, can you give it a quick look?  (Reproducing the reproducer
> link: https://syzkaller.appspot.com/x/repro.c?id=5686569910403072).
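
In concrete uapi terms the pseudocode above corresponds to roughly the
following (a sketch only, not the syzkaller reproducer; struct
kvm_hyperv_eventfd and the ioctl are from linux/kvm.h, vm_fd is assumed
to be an open KVM VM descriptor):

        struct kvm_hyperv_eventfd hvevfd = {
                .conn_id = 0,
                .fd = eventfd(0, EFD_CLOEXEC),  /* "some_eventfd" */
        };

        ioctl(vm_fd, KVM_HYPERV_EVENTFD, &hvevfd);  /* assign conn_id 0 */

        hvevfd.conn_id = 5;     /* a conn_id that was never assigned */
        hvevfd.fd = -1;
        hvevfd.flags = KVM_HYPERV_EVENTFD_DEASSIGN;
        ioctl(vm_fd, KVM_HYPERV_EVENTFD, &hvevfd);  /* hits the warning */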

Something seems broken in the IDR machinery: IDR with a single id==0
entry reliably crashes when attempting to idr_remove a non-zero id.
Other combinations look fine: removing the existing id==0 entry;
removing a non-existing entry from an IDR with at least one id!=0 entry.
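
In kernel terms the failing sequence boils down to something like this
(a minimal sketch rather than the actual reproducer; error handling
omitted, some_ptr is any non-NULL pointer):

        struct idr idr;

        idr_init(&idr);
        idr_alloc(&idr, some_ptr, 0, 1, GFP_KERNEL); /* only id 0 in use */
        idr_remove(&idr, 5);         /* id != 0: crashes before the fix */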

I still haven't pinpointed the root cause.
Cc-ing Matthew.

Roman.


Re: [PATCH] KVM: hyperv: idr_find needs RCU protection

2018-05-08 Thread Roman Kagan
On Mon, May 07, 2018 at 07:25:00PM +0200, Paolo Bonzini wrote:
> Even though the eventfd is released after the KVM SRCU grace period
> elapses, the conn_to_evt data structure itself is not; it uses RCU
> internally, instead.  Fix the read-side critical section to happen
> under rcu_read_lock/unlock; the result is still protected by
> vcpu->kvm->srcu.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/kvm/hyperv.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)

Reviewed-by: Roman Kagan 


Re: [PATCH 12/15] hyperv: move VMBus connection ids to uapi

2017-01-09 Thread Roman Kagan
On Mon, Jan 02, 2017 at 09:19:57AM +0100, Paolo Bonzini wrote:
> On 28/12/2016 18:09, Roman Kagan wrote:
> > Am I correct assuming that QEMU is currently the only user of
> > arch/x86/include/uapi/asm/hyperv.h?
> > 
> > Then I think we're fine withdrawing it from uapi as a whole and letting
> > QEMU pull it in through its header-harvesting scripts (as it does now
> > anyway).  This would lift all licensing and long-term API stability
> > expectations.
> 
> Actually, QEMU's header-harvesting scripts use uapi/ headers
> exclusively, since they are built on "make headers_install".
> 
> The extra cleanups that QEMU does on top are to allow compilation of the
> headers on non-Linux machines.  They don't really do much more than
> changing Linux (linux/types.h) integer types to the C99 (stdint.h)
> equivalents.

Ouch, I stand corrected.

So what should we do with it then?  I'm sorta lost...

We certainly can give it up and live with a private copy of the
definitions in the QEMU tree but that doesn't sound optimal in any
sense.

Thanks,
Roman.


Re: [PATCH 12/15] hyperv: move VMBus connection ids to uapi

2017-01-09 Thread Roman Kagan
On Mon, Jan 09, 2017 at 12:40:48AM -0800, h...@zytor.com wrote:
> On January 9, 2017 12:32:23 AM PST, Roman Kagan  wrote:
> >On Mon, Jan 02, 2017 at 09:19:57AM +0100, Paolo Bonzini wrote:
> >> On 28/12/2016 18:09, Roman Kagan wrote:
> >> > Am I correct assuming that QEMU is currently the only user of
> >> > arch/x86/include/uapi/asm/hyperv.h?
> >> > 
> >> > Then I think we're fine withdrawing it from uapi as a whole and
> >> > letting QEMU pull it in through its header-harvesting scripts (as it
> >> > does now anyway).  This would lift all licensing and long-term API
> >> > stability expectations.
> >> 
> >> Actually, QEMU's header-harvesting scripts use uapi/ headers
> >> exclusively, since they are built on "make headers_install".
> >> 
> >> The extra cleanups that QEMU does on top are to allow compilation of
> >> the headers on non-Linux machines.  They don't really do much more than
> >> changing Linux (linux/types.h) integer types to the C99 (stdint.h)
> >> equivalents.
> >
> >Ouch, I stand corrected.
> >
> >So what should we do with it then?  I'm sorta lost...
> >
> >We certainly can give it up and live with a private copy of the
> >definitions in the QEMU tree but that doesn't sound optimal in any
> >sense.
> >
> >Thanks,
> >Roman.
> 
> Why do that through header mangling rather than typedef?

Sorry for not being clear, I actually asked what to do with the Hyper-V
and VMBus protocol definitions.

The typedef vs mangling is a different matter; I guess mangling was
chosen to avoid conflicts with system-provided definitions on non-Linux
systems, but I think Paolo can tell more.

Roman.


[PATCH] x86:pvclock: add missing barriers

2016-06-08 Thread Roman Kagan
Gradual removal of excessive barriers in pvclock reading functions
(commits 502dfeff239e8313bfbe906ca0a1a6827ac8481b,
a3eb97bd80134ba07864ca00747466c02118aca1) ended up removing too much:
although rdtsc is now ordered WRT other loads, there's no protection
against the compiler reordering the loads of ->version with the loads of
other fields.

E.g. on my system gcc-5.3.1 generates code which loads ->system_time and
->flags outside of the ->version test loop.

(Re)introduce the compiler barriers around accesses to the contents of
pvclock.  While at this, make the function a bit more compact by
removing unnecessary local variables.

Signed-off-by: Roman Kagan 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Paolo Bonzini 
Cc: sta...@vger.kernel.org
---
 arch/x86/include/asm/pvclock.h | 17 +
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index fdcc040..65c4de2 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -80,18 +80,11 @@ static __always_inline
 unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
   cycle_t *cycles, u8 *flags)
 {
-   unsigned version;
-   cycle_t ret, offset;
-   u8 ret_flags;
-
-   version = src->version;
-
-   offset = pvclock_get_nsec_offset(src);
-   ret = src->system_time + offset;
-   ret_flags = src->flags;
-
-   *cycles = ret;
-   *flags = ret_flags;
+   unsigned version = src->version;
+   barrier();
+   *cycles = src->system_time + pvclock_get_nsec_offset(src);
+   *flags = src->flags;
+   barrier();
return version;
 }
 
-- 
2.5.5
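
For context on why the barrier()s matter: the caller spins until it sees
a stable, even ->version, and without the compiler barriers gcc may
hoist the data loads out of that loop, which is exactly the
miscompilation described above.  A sketch of the retry loop, modeled on
pvclock_clocksource_read() of this era:

        do {
                version = __pvclock_read_cycles(src, &ret, &flags);
        } while ((src->version & 1) || (version != src->version));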



Re: [PATCH] x86:pvclock: add missing barriers

2016-06-08 Thread Roman Kagan
On Wed, Jun 08, 2016 at 09:45:09PM +0200, Borislav Petkov wrote:
> On Wed, Jun 08, 2016 at 09:11:39PM +0300, Roman Kagan wrote:
> > Gradual removal of excessive barriers in pvclock reading functions
> > (commits 502dfeff239e8313bfbe906ca0a1a6827ac8481b,
> > a3eb97bd80134ba07864ca00747466c02118aca1) ended up removing too much:
> > although rdtsc is now ordered WRT other loads, there's no protection
> > against the compiler reordering the loads of ->version with the loads of
> > other fields.
> > 
> > E.g. on my system gcc-5.3.1 generates code which loads ->system_time and
> > ->flags outside of the ->version test loop.
> > 
> > (Re)introduce the compiler barriers around accesses to the contents of
> > pvclock.  While at this, make the function a bit more compact by
> > removing unnecessary local variables.
> > 
> > Signed-off-by: Roman Kagan 
> > Cc: Thomas Gleixner 
> > Cc: Ingo Molnar 
> > Cc: "H. Peter Anvin" 
> > Cc: x...@kernel.org
> > Cc: Andy Lutomirski 
> > Cc: Borislav Petkov 
> > Cc: Paolo Bonzini 
> > Cc: sta...@vger.kernel.org
> > ---
> >  arch/x86/include/asm/pvclock.h | 17 +
> >  1 file changed, 5 insertions(+), 12 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
> > index fdcc040..65c4de2 100644
> > --- a/arch/x86/include/asm/pvclock.h
> > +++ b/arch/x86/include/asm/pvclock.h
> > @@ -80,18 +80,11 @@ static __always_inline
> >  unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
> >cycle_t *cycles, u8 *flags)
> >  {
> > -   unsigned version;
> > -   cycle_t ret, offset;
> > -   u8 ret_flags;
> > -
> > -   version = src->version;
> > -
> > -   offset = pvclock_get_nsec_offset(src);
> > -   ret = src->system_time + offset;
> > -   ret_flags = src->flags;
> > -
> > -   *cycles = ret;
> > -   *flags = ret_flags;
> > +   unsigned version = src->version;
> > +   barrier();
> > +   *cycles = src->system_time + pvclock_get_nsec_offset(src);
> > +   *flags = src->flags;
> > +   barrier();
> > return version;
> 
> I have a similar patchset in my mbox starting here:
> 
> https://lkml.kernel.org/r/1464329832-4638-1-git-send-email-mngh...@gmail.com
> 
> Care to take a look?

Just did, thanks for the link.

The difference is whether you want the reader to see a consistent view of
the pvclock data (as in my patch) or also the most up-to-date one
(as in Minfei Huang's patch), at the cost of extra lfence instructions
(on my machine this is a 30% slowdown).

I'm not sure if the latter is really necessary.  If it is, then the
lfence or mfence in rdtsc_ordered() becomes excessive.  Perhaps we'd
have to revert to rdtsc_barrier() to surround pvclock data access, and
plain rdtsc() instead of rdtsc_ordered().

Roman.
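
(One possible reading of the rdtsc_barrier() alternative floated above,
purely a hypothetical sketch since rdtsc_barrier() had already been
removed when rdtsc_ordered() was introduced:)

        version = src->version;
        rdtsc_barrier();        /* loads may not cross: lfence/mfence */
        tsc = rdtsc();          /* plain rdtsc, not rdtsc_ordered() */
        system_time = src->system_time;
        flags = src->flags;
        rdtsc_barrier();        /* ...before ->version is checked again */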


Re: [PATCH] x86:pvclock: add missing barriers

2016-06-09 Thread Roman Kagan
On Thu, Jun 09, 2016 at 12:01:13AM +0300, Roman Kagan wrote:
> On Wed, Jun 08, 2016 at 09:45:09PM +0200, Borislav Petkov wrote:
> > On Wed, Jun 08, 2016 at 09:11:39PM +0300, Roman Kagan wrote:
> > > --- a/arch/x86/include/asm/pvclock.h
> > > +++ b/arch/x86/include/asm/pvclock.h
> > > @@ -80,18 +80,11 @@ static __always_inline
> > >  unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
> > >  cycle_t *cycles, u8 *flags)
> > >  {
> > > - unsigned version;
> > > - cycle_t ret, offset;
> > > - u8 ret_flags;
> > > -
> > > - version = src->version;
> > > -
> > > - offset = pvclock_get_nsec_offset(src);
> > > - ret = src->system_time + offset;
> > > - ret_flags = src->flags;
> > > -
> > > - *cycles = ret;
> > > - *flags = ret_flags;
> > > + unsigned version = src->version;
> > > + barrier();
> > > + *cycles = src->system_time + pvclock_get_nsec_offset(src);
> > > + *flags = src->flags;
> > > + barrier();
> > >   return version;
> > 
> > I have a similar patchset in my mbox starting here:
> > 
> > https://lkml.kernel.org/r/1464329832-4638-1-git-send-email-mngh...@gmail.com
> > 
> > Care to take a look?
> 
> Just did, thanks for the link.
> 
> > The difference is whether you want the reader to see a consistent view of
> > the pvclock data (as in my patch) or also the most up-to-date one
> > (as in Minfei Huang's patch), at the cost of extra lfence instructions
> > (on my machine this is a 30% slowdown).

Sorry, I should have looked better.  Minfei's patch inserts smp_rmb()-s,
which on x86 are just barrier()-s, so that patch results in code
equivalent to mine.  So I'll jump onto that thread instead of pursuing
this one.

Roman.


Re: [PATCH] pvclock: introduce seqcount-like API

2016-06-09 Thread Roman Kagan
On Thu, Jun 09, 2016 at 01:23:23PM +0200, Paolo Bonzini wrote:
> The version field in struct pvclock_vcpu_time_info basically implements
> a seqcount.  Wrap it with the usual read_begin and read_retry functions,
> and use these APIs instead of peppering the code with smp_rmb()s.
> While at it, change it to the more pedantically correct virt_rmb().
> 
> With this change, __pvclock_read_cycles can be simplified noticeably.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/entry/vdso/vclock_gettime.c | 25 +--
>  arch/x86/include/asm/pvclock.h   | 39 +---
>  arch/x86/kernel/pvclock.c| 17 ++--
>  3 files changed, 34 insertions(+), 47 deletions(-)
[...]
> --- a/arch/x86/include/asm/pvclock.h
> +++ b/arch/x86/include/asm/pvclock.h
[...]
> @@ -69,23 +87,12 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
>  }
>  
>  static __always_inline
> -unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
> -cycle_t *cycles, u8 *flags)
> +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
>  {
> - unsigned version;
> - cycle_t offset;
> - u64 delta;
> -
> - version = src->version;
> - /* Make the latest version visible */
> - smp_rmb();

This is on top of Minfei's patch, right?  It isn't in Linus' tree yet so
I wonder if it makes sense to merge the two patches into one.

Will you post it to stable, too?

Anyway

Reviewed-by: Roman Kagan 

Roman.
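
(For reference, with this API the callers collapse into a seqcount-style
loop; a sketch of the resulting shape, using the names the patch
introduces:)

        do {
                version = pvclock_read_begin(src);
                ret = __pvclock_read_cycles(src);
                flags = src->flags;
        } while (pvclock_read_retry(src, version));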


Re: [PATCH] pvclock: introduce seqcount-like API

2016-06-09 Thread Roman Kagan
On Thu, Jun 09, 2016 at 02:47:54PM +0200, Paolo Bonzini wrote:
> On 09/06/2016 14:43, Roman Kagan wrote:
> > On Thu, Jun 09, 2016 at 01:23:23PM +0200, Paolo Bonzini wrote:
> >> The version field in struct pvclock_vcpu_time_info basically implements
> >> a seqcount.  Wrap it with the usual read_begin and read_retry functions,
> >> and use these APIs instead of peppering the code with smp_rmb()s.
> >> While at it, change it to the more pedantically correct virt_rmb().
> >>
> >> With this change, __pvclock_read_cycles can be simplified noticeably.
> >>
> >> Signed-off-by: Paolo Bonzini 
> >> ---
> >>  arch/x86/entry/vdso/vclock_gettime.c | 25 +--
> >>  arch/x86/include/asm/pvclock.h   | 39 +---
> >>  arch/x86/kernel/pvclock.c| 17 ++--
> >>  3 files changed, 34 insertions(+), 47 deletions(-)
> > [...]
> >> --- a/arch/x86/include/asm/pvclock.h
> >> +++ b/arch/x86/include/asm/pvclock.h
> > [...]
> >> @@ -69,23 +87,12 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
> >>  }
> >>  
> >>  static __always_inline
> >> -unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
> >> - cycle_t *cycles, u8 *flags)
> >> +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
> >>  {
> >> -  unsigned version;
> >> -  cycle_t offset;
> >> -  u64 delta;
> >> -
> >> -  version = src->version;
> >> -  /* Make the latest version visible */
> >> -  smp_rmb();
> > 
> > This is on top of Minfei's patch, right?  It isn't in Linus' tree yet so
> > I wonder if it makes sense to merge the two patches into one.
> > 
> > Will you post it to stable, too?
> 
> Not this one, because Minfei's patch is enough to fix the bug, but I do

Has it landed in any public tree?  I'm unable to find any.  There
appears to be another version of the patch on the list, so I'm confused.

Roman.


Re: [RFC PATCH v3 0/6] KVM: x86: avoid redundant REQ_EVENT

2016-12-20 Thread Roman Kagan
On Mon, Dec 19, 2016 at 03:30:17PM +0100, Paolo Bonzini wrote:
> 
> 
> On 19/12/2016 14:05, Denis Plotnikov wrote:
> >>
> > This new patch-set does avoid unnecessary interrupt re-injections -
> > checked.
> > 
> > The test (MS Windows, sending messages between multiple windows using
> > a Windows-specific interface),
> > which showed performance growth with "[PATCH v2] KVM: x86: avoid
> > redundant REQ_EVENT" showed pretty much the same performance level with
> > this new patch-set.
> > The test score difference (+2.4% relative to [PATCH v2]) is within the
> > tolerance range (5%).
> > 
> > The test scores on a mainline v4.9 kernel (CPU Intel i5-2400, guest
> > Windows Server 2012, 2 VCPUs, 2 GB):
> > 
> > without patch: 31510 (+/- 4%)
> >    with patch: 36270 (+/- 5%)
> > 
> > difference = (36270-31510)/31510 * 100 = +15% -- looks good!
> 
> Awesome!  I hope it also qualifies as less ugly. :)

Absolutely.  FWIW

Reviewed-by: Roman Kagan 

Roman.


[PATCH 00/15] hyperv: more stuff to uapi + cleanup

2016-12-20 Thread Roman Kagan
Expose more Hyper-V-related definitions in the uapi header for
consumption by userspace.

While doing so, get rid of a number of duplications between the KVM and
the guest driver code.  Also a few other cleanups are made which are not
strictly necessary for the main purpose of the series but appear
reasonable to do at the same time.

The most controversial is the last patch, which modifies definitions
already published in the uapi header, in the hope that no userspace
application has started relying on them; I'm OK with dropping it if this
is unacceptable.

Roman Kagan (15):
  hyperv: consolidate TSC ref page definitions
  hyperv: uapi-fy synic event flags definitions
  hyperv: use standard bitops
  hyperv: define VMBus message type
  hyperv: GFP_ATOMIC -> GFP_KERNEL
  hyperv: avoid unnecessary vmalloc
  hyperv: dedup cpuid definitions
  hyperv: dedup crash msr related definitions
  hyperv: unify Hyper-V msr definitions
  hyperv: uapi-fy PostMessage and SignalEvent hypercall structures
  hyperv: uapi-fy monitored notification structures
  hyperv: move VMBus connection ids to uapi
  hyperv: move function close to its only callsite
  hyperv_vmbus: drop unused definitions
  hyperv: redefine hv_message without bitfields

 arch/x86/include/asm/kvm_host.h|   2 +-
 arch/x86/include/uapi/asm/hyperv.h | 101 +++---
 drivers/hv/hyperv_vmbus.h  | 399 +
 include/linux/hyperv.h |  24 +--
 arch/x86/kvm/hyperv.c  |  14 +-
 drivers/hv/channel.c   |   8 +-
 drivers/hv/channel_mgmt.c  |  30 +--
 drivers/hv/connection.c|  65 ++
 drivers/hv/hv.c| 300 +---
 drivers/hv/vmbus_drv.c |  67 +++
 10 files changed, 288 insertions(+), 722 deletions(-)

-- 
2.9.3



[PATCH 14/15] hyperv_vmbus: drop unused definitions

2016-12-20 Thread Roman Kagan
None of these is used in the kernel.

Signed-off-by: Roman Kagan 
---
 drivers/hv/hyperv_vmbus.h | 119 --
 1 file changed, 119 deletions(-)

diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index fcb5d91..8ce6d64 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -39,125 +39,6 @@
  */
 #define HV_UTIL_NEGO_TIMEOUT 55
 
-/* Define version of the synthetic interrupt controller. */
-#define HV_SYNIC_VERSION   (1)
-
-#define HV_ANY_VP  (0x)
-
-/* Define invalid partition identifier. */
-#define HV_PARTITION_ID_INVALID ((u64)0x0)
-
-/* Define port type. */
-enum hv_port_type {
-   HVPORT_MSG  = 1,
-   HVPORT_EVENT= 2,
-   HVPORT_MONITOR  = 3
-};
-
-/* Define port information structure. */
-struct hv_port_info {
-   enum hv_port_type port_type;
-   u32 padding;
-   union {
-   struct {
-   u32 target_sint;
-   u32 target_vp;
-   u64 rsvdz;
-   } message_port_info;
-   struct {
-   u32 target_sint;
-   u32 target_vp;
-   u16 base_flag_number;
-   u16 flag_count;
-   u32 rsvdz;
-   } event_port_info;
-   struct {
-   u64 monitor_address;
-   u64 rsvdz;
-   } monitor_port_info;
-   };
-};
-
-struct hv_connection_info {
-   enum hv_port_type port_type;
-   u32 padding;
-   union {
-   struct {
-   u64 rsvdz;
-   } message_connection_info;
-   struct {
-   u64 rsvdz;
-   } event_connection_info;
-   struct {
-   u64 monitor_address;
-   } monitor_connection_info;
-   };
-};
-
-/*
- * Versioning definitions used for guests reporting themselves to the
- * hypervisor, and visa versa.
- */
-
-/* Version info reported by guest OS's */
-enum hv_guest_os_vendor {
-   HVGUESTOS_VENDOR_MICROSOFT  = 0x0001
-};
-
-enum hv_guest_os_microsoft_ids {
-   HVGUESTOS_MICROSOFT_UNDEFINED   = 0x00,
-   HVGUESTOS_MICROSOFT_MSDOS   = 0x01,
-   HVGUESTOS_MICROSOFT_WINDOWS3X   = 0x02,
-   HVGUESTOS_MICROSOFT_WINDOWS9X   = 0x03,
-   HVGUESTOS_MICROSOFT_WINDOWSNT   = 0x04,
-   HVGUESTOS_MICROSOFT_WINDOWSCE   = 0x05
-};
-
-
-/* #defines */
-
-#define HV_PRESENT_BIT 0x8000
-
-#define HV_CPU_POWER_MANAGEMENT (1 << 0)
-#define HV_RECOMMENDATIONS_MAX 4
-
-#define HV_X64_MAX 5
-#define HV_CAPS_MAX 8
-
-
-/* Service definitions */
-
-#define HV_SERVICE_PARENT_PORT (0)
-#define HV_SERVICE_PARENT_CONNECTION   (0)
-
-#define HV_SERVICE_CONNECT_RESPONSE_SUCCESS(0)
-#define HV_SERVICE_CONNECT_RESPONSE_INVALID_PARAMETER  (1)
-#define HV_SERVICE_CONNECT_RESPONSE_UNKNOWN_SERVICE(2)
-#define HV_SERVICE_CONNECT_RESPONSE_CONNECTION_REJECTED(3)
-
-#define HV_SERVICE_CONNECT_REQUEST_MESSAGE_ID  (1)
-#define HV_SERVICE_CONNECT_RESPONSE_MESSAGE_ID (2)
-#define HV_SERVICE_DISCONNECT_REQUEST_MESSAGE_ID   (3)
-#define HV_SERVICE_DISCONNECT_RESPONSE_MESSAGE_ID  (4)
-#define HV_SERVICE_MAX_MESSAGE_ID  (4)
-
-#define HV_SERVICE_PROTOCOL_VERSION (0x0010)
-#define HV_CONNECT_PAYLOAD_BYTE_COUNT 64
-
-/* #define VMBUS_REVISION_NUMBER   6 */
-
-/* Our local vmbus's port and connection id. Anything >0 is fine */
-/* #define VMBUS_PORT_ID   11 */
-
-/* 628180B8-308D-4c5e-B7DB-1BEB62E62EF4 */
-static const uuid_le VMBUS_SERVICE_ID = {
-   .b = {
-   0xb8, 0x80, 0x81, 0x62, 0x8d, 0x30, 0x5e, 0x4c,
-   0xb7, 0xdb, 0x1b, 0xeb, 0x62, 0xe6, 0x2e, 0xf4
-   },
-};
-
-
 
 struct hv_context {
/* We only support running on top of Hyper-V
-- 
2.9.3



[PATCH 12/15] hyperv: move VMBus connection ids to uapi

2016-12-20 Thread Roman Kagan
Userspace will need them too.

Signed-off-by: Roman Kagan 
---
 arch/x86/include/uapi/asm/hyperv.h |  9 +
 drivers/hv/hyperv_vmbus.h  | 10 --
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index e081615..5d6e525 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -419,4 +419,13 @@ struct hv_monitor_page {
__u8 rsvdz4[1984];
 };
 
+/* VMBus expects pre-established communication with the following IDs */
+#define VMBUS_MESSAGE_CONNECTION_ID1
+#define VMBUS_MESSAGE_PORT_ID  1
+#define VMBUS_EVENT_CONNECTION_ID  2
+#define VMBUS_EVENT_PORT_ID2
+#define VMBUS_MONITOR_CONNECTION_ID3
+#define VMBUS_MONITOR_PORT_ID  3
+#define VMBUS_MESSAGE_SINT 2
+
 #endif
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 7f247f2..c0a65f7 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -114,16 +114,6 @@ enum hv_guest_os_microsoft_ids {
 };
 
 
-enum {
-   VMBUS_MESSAGE_CONNECTION_ID = 1,
-   VMBUS_MESSAGE_PORT_ID   = 1,
-   VMBUS_EVENT_CONNECTION_ID   = 2,
-   VMBUS_EVENT_PORT_ID = 2,
-   VMBUS_MONITOR_CONNECTION_ID = 3,
-   VMBUS_MONITOR_PORT_ID   = 3,
-   VMBUS_MESSAGE_SINT  = 2,
-};
-
 /* #defines */
 
 #define HV_PRESENT_BIT 0x8000
-- 
2.9.3



[PATCH 13/15] hyperv: move function close to its only callsite

2016-12-20 Thread Roman Kagan
Signed-off-by: Roman Kagan 
---
 drivers/hv/hyperv_vmbus.h | 44 
 drivers/hv/hv.c   | 39 +++
 2 files changed, 39 insertions(+), 44 deletions(-)

diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index c0a65f7..fcb5d91 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -118,50 +118,6 @@ enum hv_guest_os_microsoft_ids {
 
 #define HV_PRESENT_BIT 0x8000
 
-/*
- * The guest OS needs to register the guest ID with the hypervisor.
- * The guest ID is a 64 bit entity and the structure of this ID is
- * specified in the Hyper-V specification:
- *
- * 
- * http://msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
- *
- * While the current guideline does not specify how Linux guest ID(s)
- * need to be generated, our plan is to publish the guidelines for
- * Linux and other guest operating systems that currently are hosted
- * on Hyper-V. The implementation here conforms to this yet
- * unpublished guidelines.
- *
- *
- * Bit(s)
- * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
- * 62:56 - Os Type; Linux is 0x100
- * 55:48 - Distro specific identification
- * 47:16 - Linux kernel version number
- * 15:0  - Distro specific identification
- *
- *
- */
-
-#define HV_LINUX_VENDOR_ID 0x8100
-
-/*
- * Generate the guest ID based on the guideline described above.
- */
-
-static inline  __u64 generate_guest_id(__u8 d_info1, __u32 kernel_version,
-   __u16 d_info2)
-{
-   __u64 guest_id = 0;
-
-   guest_id = (((__u64)HV_LINUX_VENDOR_ID) << 48);
-   guest_id |= (((__u64)(d_info1)) << 48);
-   guest_id |= (((__u64)(kernel_version)) << 16);
-   guest_id |= ((__u64)(d_info2));
-
-   return guest_id;
-}
-
-
 #define HV_CPU_POWER_MANAGEMENT (1 << 0)
 #define HV_RECOMMENDATIONS_MAX 4
 
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index b9f50de..3598090 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -45,6 +45,45 @@ struct hv_context hv_context = {
 #define HV_MIN_DELTA_TICKS 1
 
 /*
+ * The guest OS needs to register the guest ID with the hypervisor.
+ * The guest ID is a 64 bit entity and the structure of this ID is
+ * specified in the Hyper-V specification:
+ *
+ * http://msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
+ *
+ * While the current guideline does not specify how Linux guest ID(s)
+ * need to be generated, our plan is to publish the guidelines for
+ * Linux and other guest operating systems that currently are hosted
+ * on Hyper-V. The implementation here conforms to this yet
+ * unpublished guidelines.
+ *
+ * Bit(s)
+ * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
+ * 62:56 - Os Type; Linux is 0x100
+ * 55:48 - Distro specific identification
+ * 47:16 - Linux kernel version number
+ * 15:0  - Distro specific identification
+ */
+
+#define HV_LINUX_VENDOR_ID 0x8100
+
+/*
+ * Generate the guest ID based on the guideline described above.
+ */
+
+static u64 generate_guest_id(u8 d_info1, u32 kernel_version, u16 d_info2)
+{
+   u64 guest_id;
+
+   guest_id = ((u64)HV_LINUX_VENDOR_ID) << 48;
+   guest_id |= ((u64)d_info1) << 48;
+   guest_id |= ((u64)kernel_version) << 16;
+   guest_id |= (u64)d_info2;
+
+   return guest_id;
+}
+
+/*
  * query_hypervisor_info - Get version info of the windows hypervisor
  */
 unsigned int host_info_eax;
-- 
2.9.3
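
(For reference, a sketch of how the moved helper's result is consumed in
hv_init(), per drivers/hv/hv.c of this era:)

        guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0);
        wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);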



[PATCH 15/15] hyperv: redefine hv_message without bitfields

2016-12-20 Thread Roman Kagan
This brings more symmetry to the API.

The downside is that this changes the userspace-visible structure.
Hopefully no userspace code had a chance to use it yet.

Signed-off-by: Roman Kagan 
---
 arch/x86/include/uapi/asm/hyperv.h | 32 +---
 drivers/hv/hyperv_vmbus.h  |  2 +-
 arch/x86/kvm/hyperv.c  | 10 +-
 drivers/hv/channel_mgmt.c  |  6 +++---
 drivers/hv/vmbus_drv.c |  2 +-
 5 files changed, 15 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 5d6e525..89ef9c2 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -276,7 +276,8 @@ struct hv_ref_tsc_page {
 /* Define synthetic interrupt controller message constants. */
 #define HV_MESSAGE_SIZE (256)
 #define HV_MESSAGE_PAYLOAD_BYTE_COUNT  (240)
-#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30)
+
+#define HV_MESSAGE_FLAG_PENDING (1)
 
 /* Define hypervisor message types. */
 enum hv_message_type {
@@ -308,42 +309,19 @@ enum hv_message_type {
HVMSG_X64_LEGACY_FP_ERROR   = 0x80010005
 };
 
-/* Define synthetic interrupt controller message flags. */
-union hv_message_flags {
-   __u8 asu8;
-   struct {
-   __u8 msg_pending:1;
-   __u8 reserved:7;
-   };
-};
-
-/* Define port identifier type. */
-union hv_port_id {
-   __u32 asu32;
-   struct {
-   __u32 id:24;
-   __u32 reserved:8;
-   } u;
-};
-
 /* Define synthetic interrupt controller message header. */
 struct hv_message_header {
__u32 message_type;
__u8 payload_size;
-   union hv_message_flags message_flags;
+   __u8 message_flags;
__u8 reserved[2];
-   union {
-   __u64 sender;
-   union hv_port_id port;
-   };
+   __u64 origination_id;
 };
 
 /* Define synthetic interrupt controller message format. */
 struct hv_message {
struct hv_message_header header;
-   union {
-   __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
-   } u;
+   __u64 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT / 8];
 };
 
 /* Define the synthetic interrupt message page layout. */
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 8ce6d64..87713cc 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -260,7 +260,7 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
 */
mb();
 
-   if (msg->header.message_flags.msg_pending) {
+   if (msg->header.message_flags & HV_MESSAGE_FLAG_PENDING) {
/*
 * This will cause message queue rescan to
 * possibly deliver another msg from the
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c7db112..546dddc 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -137,7 +137,7 @@ static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic,
msg_page = kmap_atomic(page);
 
msg = &msg_page->sint_message[sint];
-   msg->header.message_flags.msg_pending = 0;
+   msg->header.message_flags &= ~HV_MESSAGE_FLAG_PENDING;
 
kunmap_atomic(msg_page);
kvm_release_page_dirty(page);
@@ -564,10 +564,10 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
dst_msg = &msg_page->sint_message[sint];
if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE,
 src_msg->header.message_type) != HVMSG_NONE) {
-   dst_msg->header.message_flags.msg_pending = 1;
+   dst_msg->header.message_flags |= HV_MESSAGE_FLAG_PENDING;
r = -EAGAIN;
} else {
-   memcpy(&dst_msg->u.payload, &src_msg->u.payload,
+   memcpy(&dst_msg->payload, &src_msg->payload,
   src_msg->header.payload_size);
dst_msg->header.message_type = src_msg->header.message_type;
dst_msg->header.payload_size = src_msg->header.payload_size;
@@ -588,7 +588,7 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
struct hv_message *msg = &stimer->msg;
struct hv_timer_message_payload *payload =
-   (struct hv_timer_message_payload *)&msg->u.payload;
+   (struct hv_timer_message_payload *)&msg->payload;
 
payload->expiration_time = stimer->exp_time;
payload->delivery_time = get_time_ref_counter(vcpu->kvm);
@@ -653,7 +653,7 @@ static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
 {
struct hv_message *msg = &stimer->msg;
struct hv_timer_message_payload *payload =
-  

[PATCH 02/15] hyperv: uapi-fy synic event flags definitions

2016-12-20 Thread Roman Kagan
Move definitions related to the Hyper-V SynIC event flags to a header
where they can be consumed by userspace.

While doing so, also clean up their use by switching to standard bitops
and struct-based dereferencing.  The latter is also done for message
pages.

Signed-off-by: Roman Kagan 
---
 arch/x86/include/uapi/asm/hyperv.h | 13 +
 drivers/hv/hyperv_vmbus.h  | 24 ++--
 drivers/hv/channel_mgmt.c  | 10 +++
 drivers/hv/connection.c| 47 ++-
 drivers/hv/vmbus_drv.c | 57 ++
 5 files changed, 54 insertions(+), 97 deletions(-)

diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 6098ab5..af542a3 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -363,4 +363,17 @@ struct hv_timer_message_payload {
 #define HV_STIMER_AUTOENABLE   (1ULL << 3)
 #define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F)
 
+/* Define synthetic interrupt controller flag constants. */
+#define HV_EVENT_FLAGS_COUNT   (256 * 8)
+
+/* Define the synthetic interrupt controller event flags format. */
+struct hv_synic_event_flags {
+   __u64 flags[HV_EVENT_FLAGS_COUNT / 64];
+};
+
+/* Define the synthetic interrupt flags page layout. */
+struct hv_synic_event_flags_page {
+   struct hv_synic_event_flags sintevent_flags[HV_SYNIC_SINT_COUNT];
+};
+
 #endif
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 4516498..4fab154 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -26,7 +26,6 @@
 #define _HYPERV_VMBUS_H
 
 #include 
-#include 
 #include 
 #include 
 
@@ -75,11 +74,6 @@ enum hv_cpuid_function {
 
 #define HV_ANY_VP  (0x)
 
-/* Define synthetic interrupt controller flag constants. */
-#define HV_EVENT_FLAGS_COUNT   (256 * 8)
-#define HV_EVENT_FLAGS_BYTE_COUNT  (256)
-#define HV_EVENT_FLAGS_DWORD_COUNT (256 / sizeof(u32))
-
 /* Define invalid partition identifier. */
 #define HV_PARTITION_ID_INVALID ((u64)0x0)
 
@@ -146,20 +140,6 @@ union hv_timer_config {
};
 };
 
-/* Define the number of message buffers associated with each port. */
-#define HV_PORT_MESSAGE_BUFFER_COUNT   (16)
-
-/* Define the synthetic interrupt controller event flags format. */
-union hv_synic_event_flags {
-   u8 flags8[HV_EVENT_FLAGS_BYTE_COUNT];
-   u32 flags32[HV_EVENT_FLAGS_DWORD_COUNT];
-};
-
-/* Define the synthetic interrupt flags page layout. */
-struct hv_synic_event_flags_page {
-   union hv_synic_event_flags sintevent_flags[HV_SYNIC_SINT_COUNT];
-};
-
 /* Define SynIC control register. */
 union hv_synic_scontrol {
u64 as_uint64;
@@ -434,8 +414,8 @@ struct hv_context {
 
bool synic_initialized;
 
-   void *synic_message_page[NR_CPUS];
-   void *synic_event_page[NR_CPUS];
+   struct hv_message_page *synic_message_page[NR_CPUS];
+   struct hv_synic_event_flags_page *synic_event_page[NR_CPUS];
/*
 * Hypervisor's notion of virtual processor ID is different from
 * Linux' notion of CPU ID. This information can only be retrieved
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 26b4192..49eaae2 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -654,7 +654,6 @@ static void init_vp_index(struct vmbus_channel *channel, 
u16 dev_type)
 static void vmbus_wait_for_unload(void)
 {
int cpu;
-   void *page_addr;
struct hv_message *msg;
struct vmbus_channel_message_header *hdr;
u32 message_type;
@@ -673,9 +672,8 @@ static void vmbus_wait_for_unload(void)
break;
 
for_each_online_cpu(cpu) {
-   page_addr = hv_context.synic_message_page[cpu];
-   msg = (struct hv_message *)page_addr +
-   VMBUS_MESSAGE_SINT;
+   msg = &hv_context.synic_message_page[cpu]->
+   sint_message[VMBUS_MESSAGE_SINT];
 
message_type = READ_ONCE(msg->header.message_type);
if (message_type == HVMSG_NONE)
@@ -699,8 +697,8 @@ static void vmbus_wait_for_unload(void)
 * messages after we reconnect.
 */
for_each_online_cpu(cpu) {
-   page_addr = hv_context.synic_message_page[cpu];
-   msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
+   msg = &hv_context.synic_message_page[cpu]->
+   sint_message[VMBUS_MESSAGE_SINT];
msg->header.message_type = HVMSG_NONE;
}
 }
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 6ce8b87..aaa2103 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -381,17 +381

[PATCH 06/15] hyperv: avoid unnecessary vmalloc

2016-12-20 Thread Roman Kagan
Make hypercall and tsc page allocation similar to the rest of the
Hyper-V shared memory stuff instead of vmalloc-ing them.

Also perform cleanup unconditionally which is safe.

TODO: skipping the free in case of a crash is probably no longer
necessary either.

Signed-off-by: Roman Kagan 
---
 drivers/hv/hv.c | 42 ++
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 6bbc0b09..b40c7d9 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include "hyperv_vmbus.h"
@@ -208,14 +209,15 @@ int hv_init(void)
/* See if the hypercall page is already set */
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
 
-   virtaddr = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_EXEC);
-
-   if (!virtaddr)
+   virtaddr = (void *)get_zeroed_page(GFP_KERNEL);
+   if (!virtaddr || set_memory_x((unsigned long)virtaddr, 1))
goto cleanup;
+   hv_context.hypercall_page = virtaddr;
 
hypercall_msr.enable = 1;
 
-   hypercall_msr.guest_physical_address = vmalloc_to_pfn(virtaddr);
+   hypercall_msr.guest_physical_address =
+   virt_to_phys(virtaddr) >> PAGE_SHIFT;
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
 
/* Confirm that hypercall page did get setup. */
@@ -225,14 +227,12 @@ int hv_init(void)
if (!hypercall_msr.enable)
goto cleanup;
 
-   hv_context.hypercall_page = virtaddr;
-
 #ifdef CONFIG_X86_64
if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) {
union hv_x64_msr_hypercall_contents tsc_msr;
void *va_tsc;
 
-   va_tsc = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+   va_tsc = (void *)get_zeroed_page(GFP_KERNEL);
if (!va_tsc)
goto cleanup;
hv_context.tsc_page = va_tsc;
@@ -240,7 +240,8 @@ int hv_init(void)
rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
 
tsc_msr.enable = 1;
-   tsc_msr.guest_physical_address = vmalloc_to_pfn(va_tsc);
+   tsc_msr.guest_physical_address =
+   virt_to_phys(va_tsc) >> PAGE_SHIFT;
 
wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
@@ -249,14 +250,9 @@ int hv_init(void)
return 0;
 
 cleanup:
-   if (virtaddr) {
-   if (hypercall_msr.enable) {
-   hypercall_msr.as_uint64 = 0;
-   wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
-   }
-
-   vfree(virtaddr);
-   }
+   hypercall_msr.as_uint64 = 0;
+   wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+   free_page((unsigned long)virtaddr);
 
return -ENOTSUPP;
 }
@@ -273,13 +269,11 @@ void hv_cleanup(bool crash)
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
 
-   if (hv_context.hypercall_page) {
-   hypercall_msr.as_uint64 = 0;
-   wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
-   if (!crash)
-   vfree(hv_context.hypercall_page);
-   hv_context.hypercall_page = NULL;
-   }
+   hypercall_msr.as_uint64 = 0;
+   wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+   if (!crash)
+   free_page((unsigned long)hv_context.hypercall_page);
+   hv_context.hypercall_page = NULL;
 
 #ifdef CONFIG_X86_64
/*
@@ -298,7 +292,7 @@ void hv_cleanup(bool crash)
hypercall_msr.as_uint64 = 0;
wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64);
if (!crash)
-   vfree(hv_context.tsc_page);
+   free_page((unsigned long)hv_context.tsc_page);
hv_context.tsc_page = NULL;
}
 #endif
-- 
2.9.3


