QEMU support for direct pmtimer reads. Hopefully it's safe, since it's a
read-only register?
With self-disable C2 plus this patch, I'm seeing less CPU usage when idle with
CONFIG_CPU_IDLE enabled. It is quite noticeable on SMP guests. Windows XP is
comparable to standard (I have never seen it consume less than 10% either way;
it is usually 20-30%).
On migration, the destination host may either lack ACPI or have the timer
at a different I/O port, so emulation is necessary.
Or, with luck, the pmtimer is at the same address. Since the 24-bit counter
overflow period is only ~4.6 seconds, it's probably worthwhile to wait
for synchronization before restarting the guest. That is not implemented, though.
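Also simplify and fix the overflow emulation: the old code advanced the
overflow time in steps of 0x800000 ticks (half of the 24-bit counter range),
so the overflow was raised every 2.3 seconds instead of the expected 4.6 seconds.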
Index: kvm-userspace.tip/bios/rombios32.c
===================================================================
--- kvm-userspace.tip.orig/bios/rombios32.c
+++ kvm-userspace.tip/bios/rombios32.c
@@ -391,7 +391,7 @@ uint8_t bios_uuid[16];
unsigned long ebda_cur_addr;
#endif
int acpi_enabled;
-uint32_t pm_io_base, smb_io_base;
+uint32_t pm_io_base, pmtmr_base, smb_io_base;
int pm_sci_int;
unsigned long bios_table_cur_addr;
unsigned long bios_table_end_addr;
@@ -819,6 +819,12 @@ static void pci_bios_init_device(PCIDevi
pci_config_writeb(d, PCI_INTERRUPT_LINE, 9);
pm_io_base = PM_IO_BASE;
+ pmtmr_base = cmos_readb(0x60);
+ pmtmr_base |= cmos_readb(0x61) << 8;
+ pmtmr_base |= cmos_readb(0x62) << 16;
+ pmtmr_base |= cmos_readb(0x63) << 24;
+ if (!pmtmr_base)
+ pmtmr_base = pm_io_base + 0x08;
pci_config_writel(d, 0x40, pm_io_base | 1);
pci_config_writeb(d, 0x80, 0x01); /* enable PM io space */
smb_io_base = SMB_IO_BASE;
@@ -1381,7 +1387,7 @@ void acpi_bios_init(void)
fadt->acpi_disable = 0xf0;
fadt->pm1a_evt_blk = cpu_to_le32(pm_io_base);
fadt->pm1a_cnt_blk = cpu_to_le32(pm_io_base + 0x04);
- fadt->pm_tmr_blk = cpu_to_le32(pm_io_base + 0x08);
+ fadt->pm_tmr_blk = cpu_to_le32(pmtmr_base);
fadt->pm1_evt_len = 4;
fadt->pm1_cnt_len = 2;
fadt->pm_tmr_len = 4;
Index: kvm-userspace.tip/qemu/hw/acpi.c
===================================================================
--- kvm-userspace.tip.orig/qemu/hw/acpi.c
+++ kvm-userspace.tip/qemu/hw/acpi.c
@@ -40,6 +40,10 @@ typedef struct PIIX4PMState {
uint16_t pmsts;
uint16_t pmen;
uint16_t pmcntrl;
+ uint32_t pmtimer_base;
+ uint8_t direct_access;
+ int32_t pmtimer_offset;
+ uint32_t pmtimer_io_offset;
uint8_t apmc;
uint8_t apms;
QEMUTimer *tmr_timer;
@@ -82,42 +86,51 @@ static uint32_t get_pmtmr(PIIX4PMState *
{
uint32_t d;
d = muldiv64(qemu_get_clock(vm_clock), PM_FREQ, ticks_per_sec);
- return d & 0xffffff;
+ d += s->pmtimer_offset;
+ d &= 0xffffff;
+ return d;
}
static int get_pmsts(PIIX4PMState *s)
{
- int64_t d;
- int pmsts;
- pmsts = s->pmsts;
- d = muldiv64(qemu_get_clock(vm_clock), PM_FREQ, ticks_per_sec);
- if (d >= s->tmr_overflow_time)
- s->pmsts |= TMROF_EN;
- return pmsts;
+ return s->pmsts;
+}
+
+static void schedule_pmtmr_sci(PIIX4PMState *s)
+{
+ int64_t expire_time;
+ uint32_t pmtmr, left;
+
+ if (s->direct_access)
+ qemu_kvm_get_pmtimer(&pmtmr);
+ else
+ pmtmr = get_pmtmr(s);
+
+ left = (1 << 24) - pmtmr;
+ expire_time = muldiv64(left, ticks_per_sec, PM_FREQ);
+ expire_time += qemu_get_clock(vm_clock);
+ qemu_mod_timer(s->tmr_timer, expire_time);
}
static void pm_update_sci(PIIX4PMState *s)
{
int sci_level, pmsts;
- int64_t expire_time;
pmsts = get_pmsts(s);
sci_level = (((pmsts & s->pmen) &
(RTC_EN | PWRBTN_EN | GBL_EN | TMROF_EN)) != 0);
qemu_set_irq(s->irq, sci_level);
/* schedule a timer interruption if needed */
- if ((s->pmen & TMROF_EN) && !(pmsts & TMROF_EN)) {
- expire_time = muldiv64(s->tmr_overflow_time, ticks_per_sec, PM_FREQ);
- qemu_mod_timer(s->tmr_timer, expire_time);
- s->tmr_overflow_time += 0x800000;
- } else {
+ if ((s->pmen & TMROF_EN) && !(s->pmsts & TMROF_EN))
+ schedule_pmtmr_sci(s);
+ else
qemu_del_timer(s->tmr_timer);
- }
}
static void pm_tmr_timer(void *opaque)
{
PIIX4PMState *s = opaque;
+ s->pmsts |= TMROF_EN;
pm_update_sci(s);
}
@@ -152,18 +165,8 @@ static void pm_ioport_writew(void *opaqu
addr &= 0x3f;
switch(addr) {
case 0x00:
- {
- int64_t d;
- int pmsts;
- pmsts = get_pmsts(s);
- if (pmsts & val & TMROF_EN) {
- /* if TMRSTS is reset, then compute the new overflow time */
- d = muldiv64(qemu_get_clock(vm_clock), PM_FREQ, ticks_per_sec);
- s->tmr_overflow_time = (d + 0x800000LL) & ~0x7fffffLL;
- }
- s->pmsts &= ~val;
- pm_update_sci(s);
- }
+ s->pmsts &= ~val;
+ pm_update_sci(s);
break;
case 0x02:
s->pmen = val;
@@ -235,14 +238,10 @@ static uint32_t pm_ioport_readl(void *op
uint32_t val;
addr &= 0x3f;
- switch(addr) {
- case 0x08:
+ if (addr == s->pmtimer_io_offset)
val = get_pmtmr(s);
- break;
- default:
+ else
val = 0;
- break;
- }
#ifdef DEBUG
printf("PM readl port=0x%04x val=0x%08x\n", addr, val);
#endif
@@ -433,9 +432,9 @@ static uint32_t smb_ioport_readb(void *o
return val;
}
-static void pm_io_space_update(PIIX4PMState *s)
+static void pm_io_space_update(PIIX4PMState *s, int migration)
{
- uint32_t pm_io_base;
+ uint32_t pm_io_base, pmtmr_len;
if (s->dev.config[0x80] & 1) {
pm_io_base = le32_to_cpu(*(uint32_t *)(s->dev.config + 0x40));
@@ -443,14 +442,29 @@ static void pm_io_space_update(PIIX4PMSt
/* XXX: need to improve memory and ioport allocation */
#if defined(DEBUG)
- printf("PM: mapping to 0x%x\n", pm_io_base);
+ printf("PM: mapping to 0x%x mig=%d\n", pm_io_base, migration);
#endif
register_ioport_write(pm_io_base, 64, 1, pm_ioport_writeb, s);
register_ioport_read(pm_io_base, 64, 1, pm_ioport_readb, s);
register_ioport_write(pm_io_base, 64, 2, pm_ioport_writew, s);
register_ioport_read(pm_io_base, 64, 2, pm_ioport_readw, s);
- register_ioport_write(pm_io_base, 64, 4, pm_ioport_writel, s);
- register_ioport_read(pm_io_base, 64, 4, pm_ioport_readl, s);
+
+ if (migration) {
+ s->pmtimer_io_offset = 0x08;
+ pmtmr_len = 64;
+ } else if (host_pmtimer_base) {
+ s->pmtimer_base = host_pmtimer_base;
+ s->pmtimer_io_offset = 0x0;
+ pmtmr_len = 4;
+ s->direct_access = 1;
+ } else {
+ s->pmtimer_base = pm_io_base;
+ s->pmtimer_io_offset = 0x08;
+ pmtmr_len = 64;
+ }
+
+ register_ioport_write(s->pmtimer_base, pmtmr_len, 4, pm_ioport_writel,
s);
+ register_ioport_read(s->pmtimer_base, pmtmr_len, 4, pm_ioport_readl,
s);
}
}
@@ -459,12 +473,13 @@ static void pm_write_config(PCIDevice *d
{
pci_default_write_config(d, address, val, len);
if (address == 0x80)
- pm_io_space_update((PIIX4PMState *)d);
+ pm_io_space_update((PIIX4PMState *)d, 0);
}
static void pm_save(QEMUFile* f,void *opaque)
{
PIIX4PMState *s = opaque;
+ uint32_t pmtmr_val;
pci_device_save(&s->dev, f);
@@ -475,6 +490,14 @@ static void pm_save(QEMUFile* f,void *op
qemu_put_8s(f, &s->apms);
qemu_put_timer(f, s->tmr_timer);
qemu_put_be64(f, s->tmr_overflow_time);
+ qemu_put_be32(f, s->pmtimer_base);
+ if (s->direct_access) {
+ if (qemu_kvm_get_pmtimer(&pmtmr_val) < 0)
+ pmtmr_val = 1 << 30;
+ } else
+ pmtmr_val = get_pmtmr(s);
+
+ qemu_put_be32(f, pmtmr_val);
}
static int pm_load(QEMUFile* f,void* opaque,int version_id)
@@ -482,7 +505,7 @@ static int pm_load(QEMUFile* f,void* opa
PIIX4PMState *s = opaque;
int ret;
- if (version_id > 1)
+ if (version_id > 2)
return -EINVAL;
ret = pci_device_load(&s->dev, f);
@@ -496,10 +519,19 @@ static int pm_load(QEMUFile* f,void* opa
qemu_get_8s(f, &s->apms);
qemu_get_timer(f, s->tmr_timer);
s->tmr_overflow_time=qemu_get_be64(f);
+ if (version_id >= 2) {
+ uint32_t pmtmr_val;
- pm_io_space_update(s);
+ s->pmtimer_base = qemu_get_be32(f);
+ pmtmr_val = qemu_get_be32(f);
+ if (pmtmr_val & (1 << 30))
+ return -EINVAL;
+ s->pmtimer_offset = pmtmr_val - get_pmtmr(s);
+ }
- return 0;
+ pm_io_space_update(s, 1);
+
+ return 0;
}
i2c_bus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base,
@@ -548,7 +580,7 @@ i2c_bus *piix4_pm_init(PCIBus *bus, int
s->tmr_timer = qemu_new_timer(vm_clock, pm_tmr_timer, s);
- register_savevm("piix4_pm", 0, 1, pm_save, pm_load, s);
+ register_savevm("piix4_pm", 0, 2, pm_save, pm_load, s);
s->smbus = i2c_init_bus();
s->irq = sci_irq;
Index: kvm-userspace.tip/qemu/hw/pc.c
===================================================================
--- kvm-userspace.tip.orig/qemu/hw/pc.c
+++ kvm-userspace.tip/qemu/hw/pc.c
@@ -253,6 +253,11 @@ static void cmos_init(ram_addr_t ram_siz
}
rtc_set_memory(s, 0x5f, smp_cpus - 1);
+ rtc_set_memory(s, 0x60, host_pmtimer_base);
+ rtc_set_memory(s, 0x61, host_pmtimer_base >> 8);
+ rtc_set_memory(s, 0x62, host_pmtimer_base >> 16);
+ rtc_set_memory(s, 0x63, host_pmtimer_base >> 24);
+
if (ram_size > (16 * 1024 * 1024))
val = (ram_size / 65536) - ((16 * 1024 * 1024) / 65536);
else
Index: kvm-userspace.tip/qemu/qemu-kvm-x86.c
===================================================================
--- kvm-userspace.tip.orig/qemu/qemu-kvm-x86.c
+++ kvm-userspace.tip/qemu/qemu-kvm-x86.c
@@ -11,12 +11,17 @@
#include <string.h>
#include "hw/hw.h"
+#include "sysemu.h"
#include "qemu-kvm.h"
#include <libkvm.h>
#include <pthread.h>
#include <sys/utsname.h>
#include <linux/kvm_para.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
#define MSR_IA32_TSC 0x10
@@ -545,6 +550,59 @@ static int get_para_features(kvm_context
return features;
}
+int kvm_arch_qemu_init(void)
+{
+ int fd, ret;
+ char buf[16384];
+ char *line, *saveptr;
+ struct kvm_pmtimer pmtmr;
+
+ if (no_direct_pmtimer)
+ return 0;
+
+ fd = open("/proc/ioports", O_RDONLY);
+ if (fd == -1) {
+ perror("open /proc/ioports");
+ exit(0);
+ }
+ ret = read(fd, buf, 16384);
+ if (ret == -1) {
+ perror("read /proc/ioports");
+ exit(0);
+ }
+
+ line = strtok_r(buf, "\n", &saveptr);
+ do {
+ char *pmstr;
+ line = pmstr = strtok_r(NULL, "\n", &saveptr);
+ if (pmstr && strstr(pmstr, "ACPI PM_TMR")) {
+ pmstr = strtok(line, "-");
+ while (*pmstr == ' ')
+ pmstr++;
+ host_pmtimer_base = strtoul(pmstr, NULL, 16);
+ /*
+ * Fail now instead of during migration
+ */
+ if (kvm_get_pmtimer(kvm_context, &pmtmr) < 0)
+ host_pmtimer_base = 0;
+ break;
+ }
+ } while (line);
+
+ return 0;
+}
+
+int qemu_kvm_get_pmtimer(uint32_t *value)
+{
+ int ret = 0;
+ struct kvm_pmtimer pmtmr;
+
+ ret = kvm_get_pmtimer(kvm_context, &pmtmr);
+ *value = pmtmr.val & 0xffffff;
+
+ return ret;
+}
+
int kvm_arch_qemu_init_env(CPUState *cenv)
{
struct kvm_cpuid_entry cpuid_ent[100];
Index: kvm-userspace.tip/qemu/qemu-kvm.c
===================================================================
--- kvm-userspace.tip.orig/qemu/qemu-kvm.c
+++ kvm-userspace.tip/qemu/qemu-kvm.c
@@ -677,6 +677,7 @@ int kvm_qemu_create_context(void)
r = kvm_arch_qemu_create_context();
if(r <0)
kvm_qemu_destroy();
+ kvm_arch_qemu_init();
return 0;
}
Index: kvm-userspace.tip/qemu/qemu-kvm.h
===================================================================
--- kvm-userspace.tip.orig/qemu/qemu-kvm.h
+++ kvm-userspace.tip/qemu/qemu-kvm.h
@@ -49,6 +49,7 @@ void kvm_cpu_destroy_phys_mem(target_phy
unsigned long size);
int kvm_arch_qemu_create_context(void);
+int kvm_arch_qemu_init(void);
void kvm_arch_save_regs(CPUState *env);
void kvm_arch_load_regs(CPUState *env);
@@ -60,6 +61,8 @@ int kvm_arch_has_work(CPUState *env);
int kvm_arch_try_push_interrupts(void *opaque);
void kvm_arch_update_regs_for_sipi(CPUState *env);
void kvm_arch_cpu_reset(CPUState *env);
+int qemu_kvm_get_pmtimer(uint32_t *value);
+
CPUState *qemu_kvm_cpu_env(int index);
Index: kvm-userspace.tip/qemu/sysemu.h
===================================================================
--- kvm-userspace.tip.orig/qemu/sysemu.h
+++ kvm-userspace.tip/qemu/sysemu.h
@@ -94,6 +94,7 @@ extern int win2k_install_hack;
extern int alt_grab;
extern int usb_enabled;
extern int smp_cpus;
+extern unsigned int host_pmtimer_base;
extern int cursor_hide;
extern int graphic_rotate;
extern int no_quit;
@@ -101,6 +102,7 @@ extern int semihosting_enabled;
extern int autostart;
extern int old_param;
extern int hpagesize;
+extern int no_direct_pmtimer;
extern const char *bootp_filename;
Index: kvm-userspace.tip/qemu/vl.c
===================================================================
--- kvm-userspace.tip.orig/qemu/vl.c
+++ kvm-userspace.tip/qemu/vl.c
@@ -209,6 +209,7 @@ int win2k_install_hack = 0;
int usb_enabled = 0;
static VLANState *first_vlan;
int smp_cpus = 1;
+unsigned int host_pmtimer_base;
const char *vnc_display;
#if defined(TARGET_SPARC)
#define MAX_CPUS 16
@@ -235,6 +236,7 @@ int time_drift_fix = 0;
unsigned int kvm_shadow_memory = 0;
const char *mem_path = NULL;
int hpagesize = 0;
+int no_direct_pmtimer = 0;
const char *cpu_vendor_string;
#ifdef TARGET_ARM
int old_param = 0;
@@ -7931,6 +7933,7 @@ enum {
QEMU_OPTION_tdf,
QEMU_OPTION_kvm_shadow_memory,
QEMU_OPTION_mempath,
+ QEMU_OPTION_no_direct_pmtimer,
};
typedef struct QEMUOption {
@@ -8058,6 +8061,7 @@ const QEMUOption qemu_options[] = {
{ "clock", HAS_ARG, QEMU_OPTION_clock },
{ "startdate", HAS_ARG, QEMU_OPTION_startdate },
{ "mem-path", HAS_ARG, QEMU_OPTION_mempath },
+ { "no-direct-pmtimer", 0, QEMU_OPTION_no_direct_pmtimer },
{ NULL },
};
@@ -8962,6 +8966,9 @@ int main(int argc, char **argv)
case QEMU_OPTION_mempath:
mem_path = optarg;
break;
+ case QEMU_OPTION_no_direct_pmtimer:
+ no_direct_pmtimer = 1;
+ break;
case QEMU_OPTION_name:
qemu_name = optarg;
break;
--
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html