Gerd Hoffmann wrote:
> Signed-off-by: Gerd Hoffmann <[EMAIL PROTECTED]>
>   

Looks basically OK, but I have some comments.

> ---
>  arch/x86/Kconfig          |    4 +
>  arch/x86/kernel/Makefile  |    1 +
>  arch/x86/kernel/pvclock.c |  148 +++++++++++++++++++++++++++++++++++++++++++++
>  include/asm-x86/pvclock.h |    6 ++
>  4 files changed, 159 insertions(+), 0 deletions(-)
>  create mode 100644 arch/x86/kernel/pvclock.c
>  create mode 100644 include/asm-x86/pvclock.h
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index fe361ae..deb3049 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -417,6 +417,10 @@ config PARAVIRT
>         over full virtualization.  However, when run without a hypervisor
>         the kernel is theoretically slower and slightly larger.
>  
> +config PARAVIRT_CLOCK
> +     bool
> +     default n
> +
>  endif
>  
>  config MEMTEST_BOOTPARAM
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 5e618c3..77807d4 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -82,6 +82,7 @@ obj-$(CONFIG_VMI)           += vmi_32.o vmiclock_32.o
>  obj-$(CONFIG_KVM_GUEST)              += kvm.o
>  obj-$(CONFIG_KVM_CLOCK)              += kvmclock.o
>  obj-$(CONFIG_PARAVIRT)               += paravirt.o paravirt_patch_$(BITS).o
> +obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
>  
>  obj-$(CONFIG_PCSPKR_PLATFORM)        += pcspeaker.o
>  
> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
> new file mode 100644
> index 0000000..33e526f
> --- /dev/null
> +++ b/arch/x86/kernel/pvclock.c
> @@ -0,0 +1,148 @@
> +/*  paravirtual clock -- common code used by kvm/xen
> +
> +    This program is free software; you can redistribute it and/or modify
> +    it under the terms of the GNU General Public License as published by
> +    the Free Software Foundation; either version 2 of the License, or
> +    (at your option) any later version.
> +
> +    This program is distributed in the hope that it will be useful,
> +    but WITHOUT ANY WARRANTY; without even the implied warranty of
> +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +    GNU General Public License for more details.
> +
> +    You should have received a copy of the GNU General Public License
> +    along with this program; if not, write to the Free Software
> +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> +*/
> +
> +#include <linux/kernel.h>
> +#include <linux/percpu.h>
> +#include <asm/pvclock.h>
> +
> +/*
> + * These are periodically updated
> + *    xen: magic shared_info page
> + *    kvm: gpa registered via msr
> + * and then copied here.
> + */
> +struct pvclock_shadow_time {
> +     u64 tsc_timestamp;     /* TSC at last update of time vals.  */
> +     u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
> +     u32 tsc_to_nsec_mul;
> +     int tsc_shift;
> +     u32 version;
> +};
> +
> +/*
> + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
> + * yielding a 64-bit result.
> + */
> +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
> +{
> +     u64 product;
> +#ifdef __i386__
> +     u32 tmp1, tmp2;
> +#endif
> +
> +     if (shift < 0)
> +             delta >>= -shift;
> +     else
> +             delta <<= shift;
> +
> +#ifdef __i386__
> +     __asm__ (
> +             "mul  %5       ; "
> +             "mov  %4,%%eax ; "
> +             "mov  %%edx,%4 ; "
> +             "mul  %5       ; "
> +             "xor  %5,%5    ; "
> +             "add  %4,%%eax ; "
> +             "adc  %5,%%edx ; "
> +             : "=A" (product), "=r" (tmp1), "=r" (tmp2)
> +             : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
> +#elif __x86_64__
> +     __asm__ (
> +             "mul %%rdx ; shrd $32,%%rdx,%%rax"
> +             : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
> +#else
> +#error implement me!
> +#endif
> +
> +     return product;
> +}
> +
> +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
> +{
> +     u64 delta = native_read_tsc() - shadow->tsc_timestamp;
> +     return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
> +}
> +
> +/*
> + * Reads a consistent set of time-base values from hypervisor,
> + * into a shadow data area.
> + */
> +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
> +                                     struct kvm_vcpu_time_info *src)
>   

I think the kvm_* types should be renamed.  xen_* would make some sense 
since the ABI originated with Xen, but something generic would be OK 
too.  The important thing to get across is that the structure defines a 
guest<->host ABI which happens to be shared by Xen and KVM, and it isn't 
an in-kernel API like the rest of paravirt_ops.

And having defined a common structure, we may as well use it in the 
hypervisor-specific code/headers so there's no strange casting going on.
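
To make that concrete, here's a rough sketch of what a hypervisor-neutral set of 
names could look like.  The fields are taken straight from the copy loop in 
pvclock_get_time_values() below; the pvclock_* names and the exact widths/padding 
are only my guesses, the real thing obviously has to stay bit-for-bit identical to 
the layout Xen and KVM already expose:

/* guest<->host ABI, shared by Xen and KVM; layout must match the existing one */
struct pvclock_vcpu_time_info {
	u32 version;		/* odd while the host is updating the record */
	u32 pad0;
	u64 tsc_timestamp;	/* host TSC value when system_time was sampled */
	u64 system_time;	/* ns since guest boot, as of tsc_timestamp */
	u32 tsc_to_system_mul;	/* multiplier for the tsc-delta -> ns conversion */
	s8  tsc_shift;		/* shift applied to the tsc delta before the mul */
	u8  pad[3];
} __attribute__((__packed__));

struct pvclock_wall_clock {
	u32 version;		/* same even/odd update protocol as above */
	u32 sec;		/* wallclock at system boot, seconds */
	u32 nsec;		/* wallclock at system boot, nanoseconds */
} __attribute__((__packed__));

Then kvmclock.c and the Xen time code can both hand a struct 
pvclock_vcpu_time_info * around directly and the casts disappear.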
> +{
> +     do {
> +             dst->version = src->version;
> +             rmb();          /* fetch version before data */
> +             dst->tsc_timestamp     = src->tsc_timestamp;
> +             dst->system_timestamp  = src->system_time;
> +             dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
> +             dst->tsc_shift         = src->tsc_shift;
> +             rmb();          /* test version after fetching data */
> +     } while ((src->version & 1) || (dst->version != src->version));
> +
> +     return dst->version;
> +}
> +
> +/*
> + * This is our read_clock function. The host puts an tsc timestamp each time
> + * it updates a new time. Without the tsc adjustment, we can have a situation
> + * in which a vcpu starts to run earlier (smaller system_time), but probes
> + * time later (compared to another vcpu), leading to backwards time
> + */
>   

This comment is confusing.  Are you explaining why the tsc is read 
within the loop?  I think it can be clarified.
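
If the point is the per-vcpu tsc adjustment, then something along these lines 
(my wording, just a suggestion) would be clearer:

/*
 * system_time is only updated by the host every so often, so by itself
 * it is stale, and the staleness differs between vcpus.  Add the ns
 * that have elapsed since the host last stamped it, computed from the
 * tsc delta; otherwise a task hopping between vcpus could see time
 * jump backwards when it lands on a vcpu with an older system_time.
 */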

> +
> +cycle_t pvclock_clocksource_read(struct kvm_vcpu_time_info *src)
> +{
> +     struct pvclock_shadow_time shadow;
> +     unsigned version;
> +     cycle_t ret, offset;
> +
> +     do {
> +             version = pvclock_get_time_values(&shadow, src);
> +             barrier();
> +             offset = pvclock_get_nsec_offset(&shadow);
> +             ret = shadow.system_timestamp + offset;
> +             barrier();
> +     } while (version != src->version);
> +
> +     return ret;
> +}
> +
> +void pvclock_read_wallclock(struct kvm_wall_clock *wall_clock,
> +                         struct kvm_vcpu_time_info *vcpu_time,
>   

Ditto type names.

> +                         struct timespec *ts)
> +{
> +     u32 version;
> +     u64 delta;
> +     struct timespec now;
> +
> +     /* get wallclock at system boot */
> +     do {
> +             version = wall_clock->wc_version;
> +             rmb();          /* fetch version before time */
> +             now.tv_sec  = wall_clock->wc_sec;
> +             now.tv_nsec = wall_clock->wc_nsec;
> +             rmb();          /* fetch time before checking version */
> +     } while ((wall_clock->wc_version & 1) || (version != wall_clock->wc_version));
> +
> +     delta = pvclock_clocksource_read(vcpu_time);    /* time since system boot */
> +     delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
> +
> +     now.tv_nsec = do_div(delta, NSEC_PER_SEC);
> +     now.tv_sec = delta;
> +
> +     set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
> +}
> diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h
> new file mode 100644
> index 0000000..2b9812f
> --- /dev/null
> +++ b/include/asm-x86/pvclock.h
> @@ -0,0 +1,6 @@
> +#include <linux/clocksource.h>
> +#include <asm/kvm_para.h>
> +cycle_t pvclock_clocksource_read(struct kvm_vcpu_time_info *src);
> +void pvclock_read_wallclock(struct kvm_wall_clock *wall,
> +                         struct kvm_vcpu_time_info *vcpu,
> +                         struct timespec *ts);
>   

No #ifdef guards?
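
I'd expect the usual pattern here (guard name is just a guess, and if the types 
get renamed as above the kvm_para.h include presumably changes too):

#ifndef _ASM_X86_PVCLOCK_H
#define _ASM_X86_PVCLOCK_H

#include <linux/clocksource.h>
#include <asm/kvm_para.h>

cycle_t pvclock_clocksource_read(struct kvm_vcpu_time_info *src);
void pvclock_read_wallclock(struct kvm_wall_clock *wall,
			    struct kvm_vcpu_time_info *vcpu,
			    struct timespec *ts);

#endif /* _ASM_X86_PVCLOCK_H */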

    J