[PATCH v2 5/6] VMX: Validate capability MSRs
Check for required-0 or required-1 bits as well as known field value restrictions. Also check the consistency between VMX_*_CTLS and VMX_TRUE_*_CTLS and between CR0/4_FIXED0 and CR0/4_FIXED1. Signed-off-by: Jan Kiszka --- x86/vmx.c | 74 ++- x86/vmx.h | 5 +++-- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/x86/vmx.c b/x86/vmx.c index f01e443..5c3498a 100644 --- a/x86/vmx.c +++ b/x86/vmx.c @@ -661,6 +661,77 @@ static void test_vmptrst(void) report("test vmptrst", (!ret) && (vmcs1 == vmcs2)); } +struct vmx_ctl_msr { + const char *name; + u32 index, true_index; + u32 default1; +} vmx_ctl_msr[] = { + { "MSR_IA32_VMX_PINBASED_CTLS", MSR_IA32_VMX_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PIN, 0x16 }, + { "MSR_IA32_VMX_PROCBASED_CTLS", MSR_IA32_VMX_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_PROC, 0x401e172 }, + { "MSR_IA32_VMX_PROCBASED_CTLS2", MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_PROCBASED_CTLS2, 0 }, + { "MSR_IA32_VMX_EXIT_CTLS", MSR_IA32_VMX_EXIT_CTLS, + MSR_IA32_VMX_TRUE_EXIT, 0x36dff }, + { "MSR_IA32_VMX_ENTRY_CTLS", MSR_IA32_VMX_ENTRY_CTLS, + MSR_IA32_VMX_TRUE_ENTRY, 0x11ff }, +}; + +static void test_vmx_caps(void) +{ + u64 val, default1, fixed0, fixed1; + union vmx_ctrl_msr ctrl, true_ctrl; + unsigned int n; + bool ok; + + printf("\nTest suite: VMX capability reporting\n"); + + report("MSR_IA32_VMX_BASIC", + (basic.revision & (1ul << 31)) == 0 && + basic.size > 0 && basic.size <= 4096 && + (basic.type == 0 || basic.type == 6) && + basic.reserved1 == 0 && basic.reserved2 == 0); + + val = rdmsr(MSR_IA32_VMX_MISC); + report("MSR_IA32_VMX_MISC", + (!(ctrl_cpu_rev[1].clr & CPU_URG) || val & (1ul << 5)) && + ((val >> 16) & 0x1ff) <= 256 && + (val & 0xc0007e00) == 0); + + for (n = 0; n < ARRAY_SIZE(vmx_ctl_msr); n++) { + ctrl.val = rdmsr(vmx_ctl_msr[n].index); + default1 = vmx_ctl_msr[n].default1; + ok = (ctrl.set & default1) == default1 && + ((ctrl.set ^ ctrl.clr) & ~ctrl.clr) == 0; + if (ok && basic.ctrl) { + true_ctrl.val = rdmsr(vmx_ctl_msr[n].true_index); + 
ok = ctrl.clr == true_ctrl.clr && + ((ctrl.set ^ true_ctrl.set) & ~default1) == 0; + } + report(vmx_ctl_msr[n].name, ok); + } + + fixed0 = rdmsr(MSR_IA32_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_IA32_VMX_CR0_FIXED1); + report("MSR_IA32_VMX_IA32_VMX_CR0_FIXED0/1", + ((fixed0 ^ fixed1) & ~fixed1) == 0); + + fixed0 = rdmsr(MSR_IA32_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_IA32_VMX_CR4_FIXED1); + report("MSR_IA32_VMX_IA32_VMX_CR4_FIXED0/1", + ((fixed0 ^ fixed1) & ~fixed1) == 0); + + val = rdmsr(MSR_IA32_VMX_VMCS_ENUM); + report("MSR_IA32_VMX_VMCS_ENUM", + (val & 0x3e) >= 0x2a && + (val & 0xfc01Ull) == 0); + + val = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP); + report("MSR_IA32_VMX_EPT_VPID_CAP", + (val & 0xf07ef9eebebeUll) == 0); +} + /* This function can only be called in guest */ static void __attribute__((__used__)) hypercall(u32 hypercall_no) { @@ -803,7 +874,7 @@ static int test_run(struct vmx_test *test) regs = test->guest_regs; vmcs_write(GUEST_RFLAGS, regs.rflags | 0x2); launched = 0; - printf("\nTest suite : %s\n", test->name); + printf("\nTest suite: %s\n", test->name); vmx_run(); if (vmx_off()) { printf("%s : vmxoff failed.\n", __func__); @@ -842,6 +913,7 @@ int main(void) goto exit; } test_vmxoff(); + test_vmx_caps(); while (vmx_tests[++i].name != NULL) if (test_run(&vmx_tests[i])) diff --git a/x86/vmx.h b/x86/vmx.h index 00f2842..87457b1 100644 --- a/x86/vmx.h +++ b/x86/vmx.h @@ -46,12 +46,13 @@ union vmx_basic { struct { u32 revision; u32 size:13, - : 3, + reserved1: 3, width:1, dual:1, type:4, insouts:1, - ctrl:1; + ctrl:1, + reserved2:8; }; }; -- 1.8.1.1.298.ge7eed54 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 3/6] VMX: Test both interception and execution of instructions
Extend the instruction interception test to also check for interception-free execution. Signed-off-by: Jan Kiszka --- x86/vmx_tests.c | 121 +--- 1 file changed, 72 insertions(+), 49 deletions(-) diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c index 5485e4c..f02de3d 100644 --- a/x86/vmx_tests.c +++ b/x86/vmx_tests.c @@ -807,7 +807,6 @@ static int iobmp_exit_handler() #define INSN_CPU0 0 #define INSN_CPU1 1 #define INSN_ALWAYS_TRAP 2 -#define INSN_NEVER_TRAP3 #define FIELD_EXIT_QUAL(1 << 1) #define FIELD_INSN_INFO(1 << 2) @@ -818,7 +817,7 @@ asm( "insn_mwait: mwait;ret\n\t" "insn_rdpmc: rdpmc;ret\n\t" "insn_rdtsc: rdtsc;ret\n\t" - "insn_cr3_load: mov %rax,%cr3;ret\n\t" + "insn_cr3_load: mov cr3,%rax; mov %rax,%cr3;ret\n\t" "insn_cr3_store: mov %cr3,%rax;ret\n\t" #ifdef __x86_64__ "insn_cr8_load: mov %rax,%cr8;ret\n\t" @@ -848,6 +847,7 @@ extern void insn_cpuid(); extern void insn_invd(); u32 cur_insn; +u64 cr3; struct insn_table { const char *name; @@ -901,55 +901,56 @@ static struct insn_table insn_table[] = { static int insn_intercept_init() { - u32 ctrl_cpu[2]; + u32 ctrl_cpu; - ctrl_cpu[0] = vmcs_read(CPU_EXEC_CTRL0); - ctrl_cpu[0] |= CPU_HLT | CPU_INVLPG | CPU_MWAIT | CPU_RDPMC | CPU_RDTSC | - CPU_CR3_LOAD | CPU_CR3_STORE | -#ifdef __x86_64__ - CPU_CR8_LOAD | CPU_CR8_STORE | -#endif - CPU_MONITOR | CPU_PAUSE | CPU_SECONDARY; - ctrl_cpu[0] &= ctrl_cpu_rev[0].clr; - vmcs_write(CPU_EXEC_CTRL0, ctrl_cpu[0]); - ctrl_cpu[1] = vmcs_read(CPU_EXEC_CTRL1); - ctrl_cpu[1] |= CPU_WBINVD | CPU_RDRAND; - ctrl_cpu[1] &= ctrl_cpu_rev[1].clr; - vmcs_write(CPU_EXEC_CTRL1, ctrl_cpu[1]); + ctrl_cpu = ctrl_cpu_rev[0].set | CPU_SECONDARY; + ctrl_cpu &= ctrl_cpu_rev[0].clr; + vmcs_write(CPU_EXEC_CTRL0, ctrl_cpu); + vmcs_write(CPU_EXEC_CTRL1, ctrl_cpu_rev[1].set); + cr3 = read_cr3(); return VMX_TEST_START; } static void insn_intercept_main() { - cur_insn = 0; - while(insn_table[cur_insn].name != NULL) { - vmx_set_test_stage(cur_insn); - if ((insn_table[cur_insn].type == INSN_CPU0 - && 
!(ctrl_cpu_rev[0].clr & insn_table[cur_insn].flag)) - || (insn_table[cur_insn].type == INSN_CPU1 - && !(ctrl_cpu_rev[1].clr & insn_table[cur_insn].flag))) { - printf("\tCPU_CTRL1.CPU_%s is not supported.\n", - insn_table[cur_insn].name); + char msg[80]; + + for (cur_insn = 0; insn_table[cur_insn].name != NULL; cur_insn++) { + vmx_set_test_stage(cur_insn * 2); + if ((insn_table[cur_insn].type == INSN_CPU0 && +!(ctrl_cpu_rev[0].clr & insn_table[cur_insn].flag)) || + (insn_table[cur_insn].type == INSN_CPU1 && +!(ctrl_cpu_rev[1].clr & insn_table[cur_insn].flag))) { + printf("\tCPU_CTRL%d.CPU_%s is not supported.\n", + insn_table[cur_insn].type - INSN_CPU0, + insn_table[cur_insn].name); continue; } + + if ((insn_table[cur_insn].type == INSN_CPU0 && +!(ctrl_cpu_rev[0].set & insn_table[cur_insn].flag)) || + (insn_table[cur_insn].type == INSN_CPU1 && +!(ctrl_cpu_rev[1].set & insn_table[cur_insn].flag))) { + /* skip hlt, it stalls the guest and is tested below */ + if (insn_table[cur_insn].insn_func != insn_hlt) + insn_table[cur_insn].insn_func(); + snprintf(msg, sizeof(msg), "execute %s", +insn_table[cur_insn].name); + report(msg, vmx_get_test_stage() == cur_insn * 2); + } else if (insn_table[cur_insn].type != INSN_ALWAYS_TRAP) + printf("\tCPU_CTRL%d.CPU_%s always traps.\n", + insn_table[cur_insn].type - INSN_CPU0, + insn_table[cur_insn].name); + + vmcall(); + insn_table[cur_insn].insn_func(); - switch (insn_table[cur_insn].type) { - case INSN_CPU0: - case INSN_CPU1: - case INSN_ALWAYS_TRAP: - if (vmx_get_test_stage() != cur_insn + 1) - report(insn_table[cur_insn].name, 0); - else - report(insn_table[cur_insn].name, 1); - break; - case INSN_NEVER_TRAP: - if (vmx_get_test_stage
[PATCH v2 6/6] VMX: Test behavior on set and cleared save/load debug controls
This particularly checks the case when debug controls are not to be loaded/saved on host-guest transitions. We have to fake results related to IA32_DEBUGCTL as support for this MSR is missing KVM. The test already contains all bits required once KVM adds support. Signed-off-by: Jan Kiszka --- x86/vmx.h | 2 + x86/vmx_tests.c | 112 2 files changed, 114 insertions(+) diff --git a/x86/vmx.h b/x86/vmx.h index 87457b1..022f1df 100644 --- a/x86/vmx.h +++ b/x86/vmx.h @@ -305,6 +305,7 @@ enum Reason { #define X86_EFLAGS_ZF 0x0040 /* Zero Flag */ enum Ctrl_exi { + EXI_SAVE_DBGCTLS= 1UL << 2, EXI_HOST_64 = 1UL << 9, EXI_LOAD_PERF = 1UL << 12, EXI_INTA= 1UL << 15, @@ -316,6 +317,7 @@ enum Ctrl_exi { }; enum Ctrl_ent { + ENT_LOAD_DBGCTLS= 1UL << 2, ENT_GUEST_64= 1UL << 9, ENT_LOAD_PAT= 1UL << 14, ENT_LOAD_EFER = 1UL << 15, diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c index f02de3d..a97b6d6 100644 --- a/x86/vmx_tests.c +++ b/x86/vmx_tests.c @@ -1398,6 +1398,116 @@ static int interrupt_exit_handler(void) return VMX_TEST_VMEXIT; } +static int dbgctls_init(struct vmcs *vmcs) +{ + u64 dr7 = 0x402; + u64 zero = 0; + + msr_bmp_init(); + asm volatile( + "mov %0,%%dr0\n\t" + "mov %0,%%dr1\n\t" + "mov %0,%%dr2\n\t" + "mov %1,%%dr7\n\t" + : : "r" (zero), "r" (dr7)); + wrmsr(MSR_IA32_DEBUGCTLMSR, 0x1); + vmcs_write(GUEST_DR7, 0x404); + vmcs_write(GUEST_DEBUGCTL, 0x2); + + vmcs_write(ENT_CONTROLS, vmcs_read(ENT_CONTROLS) | ENT_LOAD_DBGCTLS); + vmcs_write(EXI_CONTROLS, vmcs_read(EXI_CONTROLS) | EXI_SAVE_DBGCTLS); + + return VMX_TEST_START; +} + +static void dbgctls_main(void) +{ + u64 dr7, debugctl; + + asm volatile("mov %%dr7,%0" : "=r" (dr7)); + debugctl = rdmsr(MSR_IA32_DEBUGCTLMSR); + /* Commented out: KVM does not support DEBUGCTL so far */ + report("Load debug controls", dr7 == 0x404 /* && debugctl == 0x2 */); + + dr7 = 0x408; + asm volatile("mov %0,%%dr7" : : "r" (dr7)); + wrmsr(MSR_IA32_DEBUGCTLMSR, 0x3); + + vmx_set_test_stage(0); + vmcall(); + report("Save debug controls", 
vmx_get_test_stage() == 1); + + if (ctrl_enter_rev.set & ENT_LOAD_DBGCTLS || + ctrl_exit_rev.set & EXI_SAVE_DBGCTLS) { + printf("\tDebug controls are always loaded/saved\n"); + return; + } + vmx_set_test_stage(2); + vmcall(); + + asm volatile("mov %%dr7,%0" : "=r" (dr7)); + debugctl = rdmsr(MSR_IA32_DEBUGCTLMSR); + /* Commented out: KVM does not support DEBUGCTL so far */ + report("Guest=host debug controls", dr7 == 0x402 /* && debugctl == 0x1 */); + + dr7 = 0x408; + asm volatile("mov %0,%%dr7" : : "r" (dr7)); + wrmsr(MSR_IA32_DEBUGCTLMSR, 0x3); + + vmx_set_test_stage(3); + vmcall(); + report("Don't save debug controls", vmx_get_test_stage() == 4); +} + +static int dbgctls_exit_handler(void) +{ + unsigned int reason = vmcs_read(EXI_REASON) & 0xff; + u32 insn_len = vmcs_read(EXI_INST_LEN); + u64 guest_rip = vmcs_read(GUEST_RIP); + u64 dr7, debugctl; + + asm volatile("mov %%dr7,%0" : "=r" (dr7)); + debugctl = rdmsr(MSR_IA32_DEBUGCTLMSR); + + switch (reason) { + case VMX_VMCALL: + switch (vmx_get_test_stage()) { + case 0: + if (dr7 == 0x400 && debugctl == 0 && + vmcs_read(GUEST_DR7) == 0x408 /* && + Commented out: KVM does not support DEBUGCTL so far + vmcs_read(GUEST_DEBUGCTL) == 0x3 */) + vmx_inc_test_stage(); + break; + case 2: + dr7 = 0x402; + asm volatile("mov %0,%%dr7" : : "r" (dr7)); + wrmsr(MSR_IA32_DEBUGCTLMSR, 0x1); + vmcs_write(GUEST_DR7, 0x404); + vmcs_write(GUEST_DEBUGCTL, 0x2); + + vmcs_write(ENT_CONTROLS, + vmcs_read(ENT_CONTROLS) & ~ENT_LOAD_DBGCTLS); + vmcs_write(EXI_CONTROLS, + vmcs_read(EXI_CONTROLS) & ~EXI_SAVE_DBGCTLS); + break; + case 3: + if (dr7 == 0x400 && debugctl == 0 && + vmcs_read(GUEST_DR7) == 0x404 /* && + Commented out: KVM does not support DEBUGCTL so far + vmcs_read(GUEST_DEBUGCTL) == 0x2 */) + vmx_inc_test_stage(); + break; +
[PATCH v2 0/6] kvm-unit-tests: more instr. interceptions, debug control migration
Changes in v2 according to review remarks: - refactored get/set_stage interface - unified vmx_ctrl_* unions - used vmx_ctrl_msr in capability test - changed commented-out debugctl tests Jan Kiszka (6): VMX: Add tests for CR3 and CR8 interception VMX: Rework test stage interface VMX: Test both interception and execution of instructions VMX: Unify vmx_ctrl_* unions to vmx_ctrl_msr VMX: Validate capability MSRs VMX: Test behavior on set and cleared save/load debug controls x86/vmx.c | 108 - x86/vmx.h | 44 ++--- x86/vmx_tests.c | 495 3 files changed, 443 insertions(+), 204 deletions(-) -- 1.8.1.1.298.ge7eed54 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 4/6] VMX: Unify vmx_ctrl_* unions to vmx_ctrl_msr
Signed-off-by: Jan Kiszka --- x86/vmx.c | 8 x86/vmx.h | 31 +-- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/x86/vmx.c b/x86/vmx.c index ba6a02b..f01e443 100644 --- a/x86/vmx.c +++ b/x86/vmx.c @@ -49,10 +49,10 @@ bool launched; u64 host_rflags; union vmx_basic basic; -union vmx_ctrl_pin ctrl_pin_rev; -union vmx_ctrl_cpu ctrl_cpu_rev[2]; -union vmx_ctrl_exit ctrl_exit_rev; -union vmx_ctrl_ent ctrl_enter_rev; +union vmx_ctrl_msr ctrl_pin_rev; +union vmx_ctrl_msr ctrl_cpu_rev[2]; +union vmx_ctrl_msr ctrl_exit_rev; +union vmx_ctrl_msr ctrl_enter_rev; union vmx_ept_vpid ept_vpid; extern struct descriptor_table_ptr gdt64_desc; diff --git a/x86/vmx.h b/x86/vmx.h index 1a8ae4c..00f2842 100644 --- a/x86/vmx.h +++ b/x86/vmx.h @@ -55,28 +55,7 @@ union vmx_basic { }; }; -union vmx_ctrl_pin { - u64 val; - struct { - u32 set, clr; - }; -}; - -union vmx_ctrl_cpu { - u64 val; - struct { - u32 set, clr; - }; -}; - -union vmx_ctrl_exit { - u64 val; - struct { - u32 set, clr; - }; -}; - -union vmx_ctrl_ent { +union vmx_ctrl_msr { u64 val; struct { u32 set, clr; @@ -508,10 +487,10 @@ enum Ctrl1 { extern struct regs regs; extern union vmx_basic basic; -extern union vmx_ctrl_pin ctrl_pin_rev; -extern union vmx_ctrl_cpu ctrl_cpu_rev[2]; -extern union vmx_ctrl_exit ctrl_exit_rev; -extern union vmx_ctrl_ent ctrl_enter_rev; +extern union vmx_ctrl_msr ctrl_pin_rev; +extern union vmx_ctrl_msr ctrl_cpu_rev[2]; +extern union vmx_ctrl_msr ctrl_exit_rev; +extern union vmx_ctrl_msr ctrl_enter_rev; extern union vmx_ept_vpid ept_vpid; void vmx_set_test_stage(u32 s); -- 1.8.1.1.298.ge7eed54 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 2/6] VMX: Rework test stage interface
Consistently access the stage only via the helper functions. To enforce this, move them from vmx_tests.c to vmx.c. At this chance, introduce a stage incrementation helper. Signed-off-by: Jan Kiszka --- x86/vmx.c | 26 ++ x86/vmx.h | 4 + x86/vmx_tests.c | 250 +++- 3 files changed, 151 insertions(+), 129 deletions(-) diff --git a/x86/vmx.c b/x86/vmx.c index 1182eef..ba6a02b 100644 --- a/x86/vmx.c +++ b/x86/vmx.c @@ -62,6 +62,32 @@ extern void *vmx_return; extern void *entry_sysenter; extern void *guest_entry; +static volatile u32 stage; + +void vmx_set_test_stage(u32 s) +{ + barrier(); + stage = s; + barrier(); +} + +u32 vmx_get_test_stage(void) +{ + u32 s; + + barrier(); + s = stage; + barrier(); + return s; +} + +void vmx_inc_test_stage(void) +{ + barrier(); + stage++; + barrier(); +} + static int make_vmcs_current(struct vmcs *vmcs) { bool ret; diff --git a/x86/vmx.h b/x86/vmx.h index 69a5385..1a8ae4c 100644 --- a/x86/vmx.h +++ b/x86/vmx.h @@ -514,6 +514,10 @@ extern union vmx_ctrl_exit ctrl_exit_rev; extern union vmx_ctrl_ent ctrl_enter_rev; extern union vmx_ept_vpid ept_vpid; +void vmx_set_test_stage(u32 s); +u32 vmx_get_test_stage(void); +void vmx_inc_test_stage(void); + static inline int vmcs_clear(struct vmcs *vmcs) { bool ret; diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c index 149a591..5485e4c 100644 --- a/x86/vmx_tests.c +++ b/x86/vmx_tests.c @@ -14,7 +14,6 @@ u64 ia32_pat; u64 ia32_efer; -volatile u32 stage; void *io_bitmap_a, *io_bitmap_b; u16 ioport; @@ -27,23 +26,6 @@ static inline void vmcall() asm volatile("vmcall"); } -static inline void set_stage(u32 s) -{ - barrier(); - stage = s; - barrier(); -} - -static inline u32 get_stage() -{ - u32 s; - - barrier(); - s = stage; - barrier(); - return s; -} - void basic_guest_main() { } @@ -122,23 +104,23 @@ void preemption_timer_main() { tsc_val = rdtsc(); if (ctrl_exit_rev.clr & EXI_SAVE_PREEMPT) { - set_stage(0); + vmx_set_test_stage(0); vmcall(); - if (get_stage() == 1) + if (vmx_get_test_stage() == 1) 
vmcall(); } - set_stage(1); - while (get_stage() == 1) { + vmx_set_test_stage(1); + while (vmx_get_test_stage() == 1) { if (((rdtsc() - tsc_val) >> preempt_scale) > 10 * preempt_val) { - set_stage(2); + vmx_set_test_stage(2); vmcall(); } } tsc_val = rdtsc(); asm volatile ("hlt"); vmcall(); - set_stage(5); + vmx_set_test_stage(5); vmcall(); } @@ -155,13 +137,13 @@ int preemption_timer_exit_handler() insn_len = vmcs_read(EXI_INST_LEN); switch (reason) { case VMX_PREEMPT: - switch (get_stage()) { + switch (vmx_get_test_stage()) { case 1: case 2: report("busy-wait for preemption timer", ((rdtsc() - tsc_val) >> preempt_scale) >= preempt_val); - set_stage(3); + vmx_set_test_stage(3); vmcs_write(PREEMPT_TIMER_VALUE, preempt_val); return VMX_TEST_RESUME; case 3: @@ -170,7 +152,7 @@ int preemption_timer_exit_handler() report("preemption timer during hlt", ((rdtsc() - tsc_val) >> preempt_scale) >= preempt_val && guest_halted); - set_stage(4); + vmx_set_test_stage(4); vmcs_write(PIN_CONTROLS, vmcs_read(PIN_CONTROLS) & ~PIN_PREEMPT); vmcs_write(GUEST_ACTV_STATE, ACTV_ACTIVE); @@ -187,11 +169,11 @@ int preemption_timer_exit_handler() break; case VMX_VMCALL: vmcs_write(GUEST_RIP, guest_rip + insn_len); - switch (get_stage()) { + switch (vmx_get_test_stage()) { case 0: report("Keep preemption value", vmcs_read(PREEMPT_TIMER_VALUE) == preempt_val); - set_stage(1); + vmx_set_test_stage(1); vmcs_write(PREEMPT_TIMER_VALUE, preempt_val); ctrl_exit = (vmcs_read(EXI_CONTROLS) | EXI_SAVE_PREEMPT) & ctrl_exit_rev.clr; @@ -203,12 +185,12 @@ int preemption_timer_exit_handler() return VMX_TEST_RESUME; case 2:
[PATCH v2 1/6] VMX: Add tests for CR3 and CR8 interception
Need to fix FIELD_* constants for this to make the exit qualification check work. Signed-off-by: Jan Kiszka --- x86/vmx.h | 2 ++ x86/vmx_tests.c | 32 +--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/x86/vmx.h b/x86/vmx.h index 26dd161..69a5385 100644 --- a/x86/vmx.h +++ b/x86/vmx.h @@ -357,6 +357,8 @@ enum Ctrl0 { CPU_RDTSC = 1ul << 12, CPU_CR3_LOAD= 1ul << 15, CPU_CR3_STORE = 1ul << 16, + CPU_CR8_LOAD= 1ul << 19, + CPU_CR8_STORE = 1ul << 20, CPU_TPR_SHADOW = 1ul << 21, CPU_NMI_WINDOW = 1ul << 22, CPU_IO = 1ul << 24, diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c index a40cb18..149a591 100644 --- a/x86/vmx_tests.c +++ b/x86/vmx_tests.c @@ -820,8 +820,8 @@ static int iobmp_exit_handler() #define INSN_ALWAYS_TRAP 2 #define INSN_NEVER_TRAP3 -#define FIELD_EXIT_QUAL0 -#define FIELD_INSN_INFO1 +#define FIELD_EXIT_QUAL(1 << 1) +#define FIELD_INSN_INFO(1 << 2) asm( "insn_hlt: hlt;ret\n\t" @@ -829,6 +829,12 @@ asm( "insn_mwait: mwait;ret\n\t" "insn_rdpmc: rdpmc;ret\n\t" "insn_rdtsc: rdtsc;ret\n\t" + "insn_cr3_load: mov %rax,%cr3;ret\n\t" + "insn_cr3_store: mov %cr3,%rax;ret\n\t" +#ifdef __x86_64__ + "insn_cr8_load: mov %rax,%cr8;ret\n\t" + "insn_cr8_store: mov %cr8,%rax;ret\n\t" +#endif "insn_monitor: monitor;ret\n\t" "insn_pause: pause;ret\n\t" "insn_wbinvd: wbinvd;ret\n\t" @@ -840,6 +846,12 @@ extern void insn_invlpg(); extern void insn_mwait(); extern void insn_rdpmc(); extern void insn_rdtsc(); +extern void insn_cr3_load(); +extern void insn_cr3_store(); +#ifdef __x86_64__ +extern void insn_cr8_load(); +extern void insn_cr8_store(); +#endif extern void insn_monitor(); extern void insn_pause(); extern void insn_wbinvd(); @@ -856,7 +868,7 @@ struct insn_table { u32 reason; ulong exit_qual; u32 insn_info; - // Use FIELD_EXIT_QUAL and FIELD_INSN_INFO to efines + // Use FIELD_EXIT_QUAL and FIELD_INSN_INFO to define // which field need to be tested, reason is always tested u32 test_field; }; @@ -877,6 +889,16 @@ static struct insn_table insn_table[] = { 
{"MWAIT", CPU_MWAIT, insn_mwait, INSN_CPU0, 36, 0, 0, 0}, {"RDPMC", CPU_RDPMC, insn_rdpmc, INSN_CPU0, 15, 0, 0, 0}, {"RDTSC", CPU_RDTSC, insn_rdtsc, INSN_CPU0, 16, 0, 0, 0}, + {"CR3 load", CPU_CR3_LOAD, insn_cr3_load, INSN_CPU0, 28, 0x3, 0, + FIELD_EXIT_QUAL}, + {"CR3 store", CPU_CR3_STORE, insn_cr3_store, INSN_CPU0, 28, 0x13, 0, + FIELD_EXIT_QUAL}, +#ifdef __x86_64__ + {"CR8 load", CPU_CR8_LOAD, insn_cr8_load, INSN_CPU0, 28, 0x8, 0, + FIELD_EXIT_QUAL}, + {"CR8 store", CPU_CR8_STORE, insn_cr8_store, INSN_CPU0, 28, 0x18, 0, + FIELD_EXIT_QUAL}, +#endif {"MONITOR", CPU_MONITOR, insn_monitor, INSN_CPU0, 39, 0, 0, 0}, {"PAUSE", CPU_PAUSE, insn_pause, INSN_CPU0, 40, 0, 0, 0}, // Flags for Secondary Processor-Based VM-Execution Controls @@ -894,6 +916,10 @@ static int insn_intercept_init() ctrl_cpu[0] = vmcs_read(CPU_EXEC_CTRL0); ctrl_cpu[0] |= CPU_HLT | CPU_INVLPG | CPU_MWAIT | CPU_RDPMC | CPU_RDTSC | + CPU_CR3_LOAD | CPU_CR3_STORE | +#ifdef __x86_64__ + CPU_CR8_LOAD | CPU_CR8_STORE | +#endif CPU_MONITOR | CPU_PAUSE | CPU_SECONDARY; ctrl_cpu[0] &= ctrl_cpu_rev[0].clr; vmcs_write(CPU_EXEC_CTRL0, ctrl_cpu[0]); -- 1.8.1.1.298.ge7eed54 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/6] VMX: Add tests for CR3 and CR8 interception
Il 17/06/2014 09:04, Jan Kiszka ha scritto: -#define FIELD_EXIT_QUAL0 -#define FIELD_INSN_INFO1 +#define FIELD_EXIT_QUAL(1 << 1) +#define FIELD_INSN_INFO(1 << 2) Heh, you probably wanted 1<<0 and 1<<1. I'll fix it up. :) Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/6] VMX: Add tests for CR3 and CR8 interception
On 2014-06-17 09:41, Paolo Bonzini wrote: > Il 17/06/2014 09:04, Jan Kiszka ha scritto: >> >> -#define FIELD_EXIT_QUAL0 >> -#define FIELD_INSN_INFO1 >> +#define FIELD_EXIT_QUAL(1 << 1) >> +#define FIELD_INSN_INFO(1 << 2) > > Heh, you probably wanted 1<<0 and 1<<1. I'll fix it up. :) Yeah... thanks in advance. Jan -- Siemens AG, Corporate Technology, CT RTC ITP SES-DE Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/6] kvm_stat: Rework platform detection
The current platform detection is a little bit messy. We look for lines in /proc/cpuinfo starting with 'flags' OR 'vendor-id', and scan both for values we know will only occur in one or the other. We also keep scanning once we've found a value, which could be a feature, but isn't in this case. We'd also like to add another platform, powerpc, which will just make it worse. So clean it up in preparation. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 44 +--- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index fe2eae3..98c81a8 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -169,27 +169,41 @@ generic_exit_reasons = { 23: 'EPR', } -vendor_exit_reasons = { +x86_exit_reasons = { 'vmx': vmx_exit_reasons, 'svm': svm_exit_reasons, -'IBM/S390': generic_exit_reasons, } -syscall_numbers = { -'IBM/S390': 331, -} - -sc_perf_evt_open = 298 - +sc_perf_evt_open = None exit_reasons = None -for line in file('/proc/cpuinfo').readlines(): -if line.startswith('flags') or line.startswith('vendor_id'): -for flag in line.split(): -if flag in vendor_exit_reasons: -exit_reasons = vendor_exit_reasons[flag] -if flag in syscall_numbers: -sc_perf_evt_open = syscall_numbers[flag] +def x86_init(flag): +globals().update({ +'sc_perf_evt_open' : 298, +'exit_reasons' : x86_exit_reasons[flag], +}) + +def s390_init(): +globals().update({ +'sc_perf_evt_open' : 331, +'exit_reasons' : generic_exit_reasons, +}) + +def detect_platform(): +for line in file('/proc/cpuinfo').readlines(): +if line.startswith('flags'): +for flag in line.split(): +if flag in x86_exit_reasons: +x86_init(flag) +return +elif line.startswith('vendor_id'): +for flag in line.split(): +if flag == 'IBM/S390': +s390_init() +return + +detect_platform() + filters = { 'kvm_exit': ('exit_reason', exit_reasons) } -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo 
info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/6] kvm_stat: Abstract ioctl numbers
Unfortunately ioctl numbers are platform specific, so abstract them out of the code so they can be overridden. As it happens x86 and s390 share the same values, so nothing needs to change yet. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 2468a22..2e0f5ed 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -180,6 +180,12 @@ filters = { 'kvm_userspace_exit': ('reason', generic_exit_reasons) } +ioctl_numbers = { +'SET_FILTER' : 0x40082406, +'ENABLE' : 0x2400, +'DISABLE': 0x2401, +} + def x86_init(flag): globals().update({ 'sc_perf_evt_open' : 298, @@ -304,14 +310,14 @@ class Event(object): raise Exception('perf_event_open failed') if filter: import fcntl -fcntl.ioctl(fd, 0x40082406, filter) +fcntl.ioctl(fd, ioctl_numbers['SET_FILTER'], filter) self.fd = fd def enable(self): import fcntl -fcntl.ioctl(self.fd, 0x2400, 0) +fcntl.ioctl(self.fd, ioctl_numbers['ENABLE'], 0) def disable(self): import fcntl -fcntl.ioctl(self.fd, 0x2401, 0) +fcntl.ioctl(self.fd, ioctl_numbers['DISABLE'], 0) class TracepointProvider(object): def __init__(self): -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/6] kvm_stat: Fix tracepoint filter definition for s390
Although we have the exit_reasons defined for s390, as far as I can tell they never take effect. That is because there is no 'kvm_exit' tracepoint defined for s390. What is defined, for all platforms, is 'kvm_userspace_exit'. That tracepoint uses the generic_exit_reason, but the filter parameter is 'reason'. So invert the way we setup filters, define it by default for the generic tracepoint 'kvm_userspace_exit', and let x86 override it. Doing it this way will also work for powerpc when we add it. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 98c81a8..2468a22 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -175,18 +175,22 @@ x86_exit_reasons = { } sc_perf_evt_open = None -exit_reasons = None + +filters = { +'kvm_userspace_exit': ('reason', generic_exit_reasons) +} def x86_init(flag): globals().update({ 'sc_perf_evt_open' : 298, -'exit_reasons' : x86_exit_reasons[flag], +'filters' : { +'kvm_exit': ('exit_reason', x86_exit_reasons[flag]) +}, }) def s390_init(): globals().update({ 'sc_perf_evt_open' : 331, -'exit_reasons' : generic_exit_reasons, }) def detect_platform(): @@ -204,10 +208,6 @@ def detect_platform(): detect_platform() -filters = { -'kvm_exit': ('exit_reason', exit_reasons) -} - def invert(d): return dict((x[1], x[0]) for x in d.iteritems()) -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/6] kvm_stat: Only consider online cpus
In kvm_stat we grovel through /sys to find out how many cpus are in the system. However if a cpu is offline it will still be present in /sys, and the perf_event_open() will fail. Modify the logic to only return online cpus. We need to be careful on systems which don't support cpu hotplug, the online file will not be present at all. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 24 ++-- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index d7e97e7..2a788bc 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -311,18 +311,30 @@ class TracepointProvider(object): self.select(fields) def fields(self): return self._fields + +def _online_cpus(self): +l = [] +pattern = r'cpu([0-9]+)' +basedir = '/sys/devices/system/cpu' +for entry in os.listdir(basedir): +match = re.match(pattern, entry) +if not match: +continue +path = os.path.join(basedir, entry, 'online') +if os.path.exists(path) and open(path).read().strip() != '1': +continue +l.append(int(match.group(1))) +return l + def _setup(self, _fields): self._fields = _fields -cpure = r'cpu([0-9]+)' -self.cpus = [int(re.match(cpure, x).group(1)) - for x in os.listdir('/sys/devices/system/cpu') - if re.match(cpure, x)] +cpus = self._online_cpus() import resource -nfiles = len(self.cpus) * 1000 +nfiles = len(cpus) * 1000 resource.setrlimit(resource.RLIMIT_NOFILE, (nfiles, nfiles)) events = [] self.group_leaders = [] -for cpu in self.cpus: +for cpu in cpus: group = Group(cpu) for name in _fields: tracepoint = name -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/6] kvm_stat: Fix the non-x86 exit reasons
In kvm_stat we have a dictionary of exit reasons for s390. Firstly these are not s390 specific, they are the generic exit reasons. So rename the dictionary to reflect that. Secondly, the values are defined using hex, but in the kernel header they are decimal. That means values above 9 in kvm_stat are incorrect. As far as I can tell this does not matter in practice because s390 does not define a kvm_exit trace point. While we're there, fix the whitespace to match the rest of the file. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 49 +++-- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 2a788bc..fe2eae3 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -141,33 +141,38 @@ svm_exit_reasons = { 0x400: 'NPF', } -s390_exit_reasons = { - 0x000: 'UNKNOWN', - 0x001: 'EXCEPTION', - 0x002: 'IO', - 0x003: 'HYPERCALL', - 0x004: 'DEBUG', - 0x005: 'HLT', - 0x006: 'MMIO', - 0x007: 'IRQ_WINDOW_OPEN', - 0x008: 'SHUTDOWN', - 0x009: 'FAIL_ENTRY', - 0x010: 'INTR', - 0x011: 'SET_TPR', - 0x012: 'TPR_ACCESS', - 0x013: 'S390_SIEIC', - 0x014: 'S390_RESET', - 0x015: 'DCR', - 0x016: 'NMI', - 0x017: 'INTERNAL_ERROR', - 0x018: 'OSI', - 0x019: 'PAPR_HCALL', +# From include/uapi/linux/kvm.h, KVM_EXIT_xxx +generic_exit_reasons = { + 0: 'UNKNOWN', + 1: 'EXCEPTION', + 2: 'IO', + 3: 'HYPERCALL', + 4: 'DEBUG', + 5: 'HLT', + 6: 'MMIO', + 7: 'IRQ_WINDOW_OPEN', + 8: 'SHUTDOWN', + 9: 'FAIL_ENTRY', +10: 'INTR', +11: 'SET_TPR', +12: 'TPR_ACCESS', +13: 'S390_SIEIC', +14: 'S390_RESET', +15: 'DCR', +16: 'NMI', +17: 'INTERNAL_ERROR', +18: 'OSI', +19: 'PAPR_HCALL', +20: 'S390_UCONTROL', +21: 'WATCHDOG', +22: 'S390_TSCH', +23: 'EPR', } vendor_exit_reasons = { 'vmx': vmx_exit_reasons, 'svm': svm_exit_reasons, -'IBM/S390': s390_exit_reasons, +'IBM/S390': generic_exit_reasons, } syscall_numbers = { -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org 
More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/6] kvm_stat: Add powerpc support
Add support for powerpc platforms. We use uname -m, which allows us to detect ppc, ppc64 and ppc64le/el. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 2e0f5ed..db867fe 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -12,7 +12,7 @@ # the COPYING file in the top-level directory. import curses -import sys, os, time, optparse +import sys, os, time, optparse, ctypes class DebugfsProvider(object): def __init__(self): @@ -199,7 +199,21 @@ def s390_init(): 'sc_perf_evt_open' : 331, }) +def ppc_init(): +globals().update({ +'sc_perf_evt_open' : 319, +'ioctl_numbers' : { +'SET_FILTER' : 0x80002406 | (ctypes.sizeof(ctypes.c_char_p) << 16), +'ENABLE' : 0x20002400, +'DISABLE': 0x20002401, +} +}) + def detect_platform(): +if os.uname()[4].startswith('ppc'): +ppc_init() +return + for line in file('/proc/cpuinfo').readlines(): if line.startswith('flags'): for flag in line.split(): @@ -220,7 +234,7 @@ def invert(d): for f in filters: filters[f] = (filters[f][0], invert(filters[f][1])) -import ctypes, struct, array +import struct, array libc = ctypes.CDLL('libc.so.6') syscall = libc.syscall -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 5/6] VMX: Validate capability MSRs
Il 17/06/2014 09:04, Jan Kiszka ha scritto: + default1 = vmx_ctl_msr[n].default1; + ok = (ctrl.set & default1) == default1 && + ((ctrl.set ^ ctrl.clr) & ~ctrl.clr) == 0; Thanks, now I can understand what's going on. :) It can still be simplified though. This is just (ctrl.set & ~ctrl.clr) == 0: (a ^ b) & ~b == 0 -> (a & ~b) ^ (b & ~b) == 0 -> a & ~b == 0 This expresses just as well the concept that set=1, clr=0 is an impossible combination. Also, it's a bit easier to read if the second line is written as a separate statement: ok = ok && (ctrl.set & ~ctrl.clr) == 0; + if (ok && basic.ctrl) { + true_ctrl.val = rdmsr(vmx_ctl_msr[n].true_index); + ok = ctrl.clr == true_ctrl.clr && + ((ctrl.set ^ true_ctrl.set) & ~default1) == 0; This is ((ctrl.set ^ true_ctrl.set) & ~default1 == 0 -> ((ctrl.set & ~default1) ^ (true_ctrl.set & ~default1) == 0 -> ((ctrl.set & ~default1) == (true_ctrl.set & ~default1) A bit longer, but clearer: the difference between ctrl.set and true_ctrl.set is only that true_ctrl.set can clear some default-1 bits. Or you can simplify it further: -> (ctrl.set | default1) == (true_ctrl.set | default1) -> ctrl.set == (true_ctrl.set | default1) Also clearer: the difference between ctrl.set and true_ctrl.set is only that default-1 bits must be 1 in ctrl.set. Pick the one you prefer. Again, using a separate statement makes it easier to read in my opinion; in fact I would also write both statements as ok = ok && ... even though it's redundant for the clr test. Can you submit v3 of this patch only? Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 6/6] kvm_stat: Add powerpc support
On 17.06.14 09:54, Michael Ellerman wrote: Add support for powerpc platforms. We use uname -m, which allows us to detect ppc, ppc64 and ppc64le/el. Signed-off-by: Michael Ellerman Could you please add support for PR KVM tracepoints along the way? There we do know the exit reason for every single guest <-> host transition. I would like to move to a similar model with HV in the future, so we can hopefully just reuse this by then. Alex --- scripts/kvm/kvm_stat | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 2e0f5ed..db867fe 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -12,7 +12,7 @@ # the COPYING file in the top-level directory. import curses -import sys, os, time, optparse +import sys, os, time, optparse, ctypes class DebugfsProvider(object): def __init__(self): @@ -199,7 +199,21 @@ def s390_init(): 'sc_perf_evt_open' : 331, }) +def ppc_init(): +globals().update({ +'sc_perf_evt_open' : 319, +'ioctl_numbers' : { +'SET_FILTER' : 0x80002406 | (ctypes.sizeof(ctypes.c_char_p) << 16), +'ENABLE' : 0x20002400, +'DISABLE': 0x20002401, +} +}) + def detect_platform(): +if os.uname()[4].startswith('ppc'): +ppc_init() +return + for line in file('/proc/cpuinfo').readlines(): if line.startswith('flags'): for flag in line.split(): @@ -220,7 +234,7 @@ def invert(d): for f in filters: filters[f] = (filters[f][0], invert(filters[f][1])) -import ctypes, struct, array +import struct, array libc = ctypes.CDLL('libc.so.6') syscall = libc.syscall -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/6] PPC: Add asm helpers for BE 32bit load/store
On 17.06.14 02:51, Paul Mackerras wrote: On Wed, Jun 11, 2014 at 12:33:46PM +0200, Alexander Graf wrote: >From assembly code we might not only have to explicitly BE access 64bit values, but sometimes also 32bit ones. Add helpers that allow for easy use of lwzx/stwx in their respective byte-reverse or native form. Signed-off-by: Alexander Graf CC: Benjamin Herrenschmidt --- arch/powerpc/include/asm/asm-compat.h | 4 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h index 4b237aa..21164ff 100644 --- a/arch/powerpc/include/asm/asm-compat.h +++ b/arch/powerpc/include/asm/asm-compat.h @@ -34,10 +34,14 @@ #define PPC_MIN_STKFRM112 #ifdef __BIG_ENDIAN__ +#define LWZX_BEstringify_in_c(lwzx) #define LDX_BEstringify_in_c(ldx) +#define STDX_BEstringify_in_c(stwx) This should be STWX_BE, shouldn't it? And there I thought I would get away without anyone noticing it :). Yes, of course. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/6] KVM: PPC: Book3S HV: Access XICS in BE
On 17.06.14 03:02, Paul Mackerras wrote: On Wed, Jun 11, 2014 at 12:33:50PM +0200, Alexander Graf wrote: On the exit path from the guest we check what type of interrupt we received if we received one. This means we're doing hardware access to the XICS interrupt controller. However, when running on a little endian system, this access is byte reversed. So let's make sure to swizzle the bytes back again and virtually make XICS accesses big endian. ... @@ -2241,7 +2253,8 @@ kvmppc_read_intr: 42: /* It's not an IPI and it's for the host, stash it in the PACA * before exit, it will be picked up by the host ICP driver */ - stw r0, HSTATE_SAVED_XIRR(r13) + li r4, HSTATE_SAVED_XIRR + STWX_BE r0, r13, r4 This is a paca field, not something mandated by PAPR or shared with the guest, so why do we need to keep it BE? If you do make it BE, don't you also need to fix kvmppc_get_xics_latch()? Yikes. Yes. Thanks a lot for the catch! Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 14.06.14 23:08, Madhavan Srinivasan wrote: This patch adds kernel side support for software breakpoint. Design is that, by using an illegal instruction, we trap to hypervisor via Emulation Assistance interrupt, where we check for the illegal instruction and accordingly we return to Host or Guest. Patch mandates use of "abs" instruction (primary opcode 31 and extended opcode 360) as sw breakpoint instruction. Based on PowerISA v2.01, ABS instruction has been dropped from the architecture and treated an illegal instruction. Signed-off-by: Madhavan Srinivasan --- arch/powerpc/kvm/book3s.c| 3 ++- arch/powerpc/kvm/book3s_hv.c | 23 +++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index c254c27..b40fe5d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,7 +789,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - return -EINVAL; + vcpu->guest_debug = dbg->control; + return 0; } void kvmppc_decrementer_func(unsigned long data) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7a12edb..688421d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,14 @@ /* Used as a "null" value for timebase values */ #define TB_NIL(~(u64)0) +/* + * SW_BRK_DBG_INT is debug Instruction for supporting Software Breakpoint. + * Instruction mnemonic is ABS, primary opcode is 31 and extended opcode is 360. + * Based on PowerISA v2.01, ABS instruction has been dropped from the architecture + * and treated an illegal instruction. + */ +#define SW_BRK_DBG_INT 0x7c0002d0 The instruction we use to trap needs to get exposed to user space via a ONE_REG property. Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. 
+ static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -721,12 +729,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* * This occurs if the guest executes an illegal instruction. -* We just generate a program interrupt to the guest, since -* we don't emulate any guest instructions at this stage. +* To support software breakpoint, we check for the sw breakpoint +* instruction to return back to host, if not we just generate a +* program interrupt to the guest. */ case BOOK3S_INTERRUPT_H_EMUL_ASSIST: - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - r = RESUME_GUEST; + if (vcpu->arch.last_inst == SW_BRK_DBG_INT) { Don't access last_inst directly. Instead use the provided helpers. + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = vcpu->arch.pc; + r = RESUME_HOST; + } else { + kvmppc_core_queue_program(vcpu, 0x8); magic numbers + r = RESUME_GUEST; + } break; /* * This occurs if the guest (kernel or userspace), does something that Please enable PR KVM as well while you're at it. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On vcpu schedule, the condition checked for tlb pollution is too loose. The tlb entries of a vcpu become polluted (vs stale) only when a different vcpu within the same logical partition runs in-between. Optimize the tlb invalidation condition taking into account the logical partition id. With the new invalidation condition, a guest shows 4% performance improvement on P5020DS while running a memory stress application with the cpu oversubscribed, the other guest running a cpu intensive workload. Guest - old invalidation condition real 3.89 user 3.87 sys 0.01 Guest - enhanced invalidation condition real 3.75 user 3.73 sys 0.01 Host real 3.70 user 1.85 sys 0.00 The memory stress application accesses 4KB pages backed by 75% of available TLB0 entries: char foo[ENTRIES][4096] __attribute__ ((aligned (4096))); int main() { char bar; int i, j; for (i = 0; i < ITERATIONS; i++) for (j = 0; j < ENTRIES; j++) bar = foo[j][0]; return 0; } Signed-off-by: Mihai Caraman Cc: Scott Wood --- v2: - improve patch name and description - add performance results arch/powerpc/kvm/e500mc.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 17e4562..d3b814b0 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -111,10 +111,12 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) } static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); +static DEFINE_PER_CPU(int, last_lpid_on_cpu); static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + bool update_last = false, inval_tlb = false; kvmppc_booke_vcpu_load(vcpu, cpu); @@ -140,12 +142,24 @@ static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_GDEAR, vcpu->arch.shared->dar); mtspr(SPRN_GESR, vcpu->arch.shared->esr); - if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || - __get_cpu_var(last_vcpu_on_cpu) != vcpu) { - 
kvmppc_e500_tlbil_all(vcpu_e500); + if (vcpu->arch.oldpir != mfspr(SPRN_PIR)) { + /* stale tlb entries */ + inval_tlb = update_last = true; + } else if (__get_cpu_var(last_vcpu_on_cpu) != vcpu) { + update_last = true; + /* polluted tlb entries */ + inval_tlb = __get_cpu_var(last_lpid_on_cpu) == + vcpu->kvm->arch.lpid; + } + + if (update_last) { __get_cpu_var(last_vcpu_on_cpu) = vcpu; + __get_cpu_var(last_lpid_on_cpu) = vcpu->kvm->arch.lpid; } + if (inval_tlb) + kvmppc_e500_tlbil_all(vcpu_e500); + kvmppc_load_guest_fp(vcpu); } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Why I advise against using ivshmem
Hello all, On 06/17/2014 04:54 AM, Stefan Hajnoczi wrote: ivshmem has a performance disadvantage for guest-to-host communication. Since the shared memory is exposed as PCI BARs, the guest has to memcpy into the shared memory. vhost-user can access guest memory directly and avoid the copy inside the guest. Actually, you can avoid this memory copy using frameworks like DPDK. Unless someone steps up and maintains ivshmem, I think it should be deprecated and dropped from QEMU. Then I can maintain ivshmem for QEMU. If this is ok, I will send a patch for MAINTAINERS file. -- David Marchand -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: market share??
On Thu, Jun 12, 2014 at 06:48:14PM +0200, urgrue wrote: > Does anyone have ANY idea where I could find out about market share and > adoption rates of linux hypervisors (kvm vs xen vs vmware, mainly)? > I've not found anything despite extensive searching, which is really kind of > problematic when you're trying to convince the bosses... A few years ago IDC published this analysis of KVM. A lot has changed since then but it's fun to look back and read: http://www-03.ibm.com/systems/resources/systems_virtualization_idc__kvmforservervirtualization.pdf For example, it says KVM has opportunities in the cloud. Today KVM is the default and best supported OpenStack hypervisor. I would focus on more than just KVM (the hypervisor). Consider what sort of virtualization solution you need. Are you doing server consolidation inside your organization? Are you doing virtual server hosting? Are you doing virtual desktop infrastructure? Then look at the management tools and ecosystem for that problem space. It doesn't matter how good the hypervisor is if your management tooling is a pain to use or doesn't support the storage or networking architecture you use. Next, evaluate it. KVM is free, try it out, prove it works. At that point you'll know which choice is right for you. Stefan pgpuijseQchYk.pgp Description: PGP signature
Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule
On 13.06.14 21:42, Scott Wood wrote: On Fri, 2014-06-13 at 16:55 +0200, Alexander Graf wrote: On 13.06.14 16:43, mihai.cara...@freescale.com wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, June 12, 2014 8:05 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org; Wood Scott-B07421 Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule On 06/12/2014 04:00 PM, Mihai Caraman wrote: On vcpu schedule, the condition checked for tlb pollution is too tight. The tlb entries of one vcpu are polluted when a different vcpu from the same partition runs in-between. Relax the current tlb invalidation condition taking into account the lpid. Can you quantify the performance improvement from this? We've had bugs in this area before, so let's make sure it's worth it before making this more complicated. Signed-off-by: Mihai Caraman freescale.com> Your mailer is broken? :) This really should be an @. I think this should work. Scott, please ack. Alex, you were right. I screwed up the patch description by inverting relax and tight terms :) It should have been more like this: KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule On vcpu schedule, the condition checked for tlb pollution is too loose. The tlb entries of a vcpu are polluted (vs stale) only when a different vcpu within the same logical partition runs in-between. Optimize the tlb invalidation condition taking into account the lpid. Can't we give every vcpu its own lpid? Or don't we trap on global invalidates? That would significantly increase the odds of exhausting LPIDs, especially on large chips like t4240 with similarly large VMs. If we were to do that, the LPIDs would need to be dynamically assigned (like PIDs), and should probably be a separate numberspace per physical core. True, I didn't realize we only have so few of them. 
It would however save us from most flushing as long as we have spare LPIDs available :). Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: Book3s PR: Disable AIL mode with OPAL
On 12.06.14 05:56, Paul Mackerras wrote: On Tue, Jun 10, 2014 at 07:23:00PM +0200, Alexander Graf wrote: When we're using PR KVM we must not allow the CPU to take interrupts in virtual mode, as the SLB does not contain host kernel mappings when running inside the guest context. To make sure we get good performance for non-KVM tasks but still properly functioning PR KVM, let's just disable AIL whenever a vcpu is scheduled in. This patch fixes running PR KVM on POWER8 bare metal for me. We already handle this for the situation where we're running under a hypervisor with the calls to pSeries_disable_reloc_on_exc() and pSeries_enable_reloc_on_exc() in kvmppc_core_init_vm_pr() and kvmppc_core_destroy_vm_pr() respectively. The obvious approach to fixing this problem would be to generalize those calls, perhaps via a ppc_md callback, to work on the powernv platform too. If you don't want to do that, for instance because those calls are defined to operate across the whole machine rather than a single CPU thread, and you prefer to affect just the one thread we're running on, then I think you need to explain that in the commit message. It's what I've done at first, yes. Unfortunately the pSeries call is system global, while we need to do the AIL switching per cpu on bare metal. Once you start considering CPU hotplug and how that affects the secondary bringup paths you start wondering whether it's worth the hassle :). This way you can have a single guest running which only slows down (disables AIL) on its own vcpu, while all the others still enjoy the benefits of AIL. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: > > Also, why don't we use twi always or something else that actually is > defined as illegal instruction? I would like to see this shared with > book3s_32 PR. twi will be directed to the guest on HV no ? We want a real illegal because those go to the host (for potential emulation by the HV). I'm trying to see if I can get the architect to set one in stone in a future proof way. Cheers, Ben. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 17.06.14 11:22, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. twi will be directed to the guest on HV no ? We want a real illegal because those go to the host (for potential emulation by the HV). Ah, good point. I guess we need different one for PR and HV then to ensure compatibility with older ISAs on PR. Alex I'm trying to see if I can get the architect to set one in stone in a future proof way. Cheers, Ben. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tue, 2014-06-17 at 11:25 +0200, Alexander Graf wrote: > On 17.06.14 11:22, Benjamin Herrenschmidt wrote: > > On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: > >> Also, why don't we use twi always or something else that actually is > >> defined as illegal instruction? I would like to see this shared with > >> book3s_32 PR. > > twi will be directed to the guest on HV no ? We want a real illegal > > because those go to the host (for potential emulation by the HV). > > Ah, good point. I guess we need different one for PR and HV then to > ensure compatibility with older ISAs on PR. Well, we also need to be careful with what happens if a PR guest puts that instruction in, do that stop its HV guest/host ? What if it's done in userspace ? Do that stop the kernel ? :-) Maddy, I haven't checked, does your patch ensure that we only ever stop if the instruction is at a recorded bkpt address ? It still means that a userspace process can practically DOS its kernel by issuing a lot of these causing a crapload of exits. Cheers, Ben. > Alex > > > I'm > > trying to see if I can get the architect to set one in stone in a future > > proof way. > > > > Cheers, > > Ben. > > > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 17.06.14 11:32, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 11:25 +0200, Alexander Graf wrote: On 17.06.14 11:22, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. twi will be directed to the guest on HV no ? We want a real illegal because those go to the host (for potential emulation by the HV). Ah, good point. I guess we need different one for PR and HV then to ensure compatibility with older ISAs on PR. Well, we also need to be careful with what happens if a PR guest puts that instruction in, do that stop its HV guest/host ? What if it's done in userspace ? Do that stop the kernel ? :-) The way SW breakpointing is handled is that when we see one, it gets deflected into user space. User space then has an array of breakpoints it configured itself. If the breakpoint is part of that list, it consumes it. If not, it injects a debug interrupt (program in this case) into the guest. That way we can overlay that one instruction with as many layers as we like :). We only get a performance hit on execution of that instruction. Maddy, I haven't checked, does your patch ensure that we only ever stop if the instruction is at a recorded bkpt address ? It still means that a userspace process can practically DOS its kernel by issuing a lot of these causing a crapload of exits. Only user space knows about its breakpoint addresses, so we have to deflect. However since time still ticks on, we only increase jitter of the guest. The process would still get scheduled away after the same amount of real time, no? Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Why I advise against using ivshmem
Il 17/06/2014 11:03, David Marchand ha scritto: Unless someone steps up and maintains ivshmem, I think it should be deprecated and dropped from QEMU. Then I can maintain ivshmem for QEMU. If this is ok, I will send a patch for MAINTAINERS file. Typically, adding yourself to maintainers is done only after having proved your ability to be a maintainer. :) So, let's stop talking and go back to code! You can start doing what was suggested elsewhere in the thread: get the server and uio driver merged into the QEMU tree, document the protocol in docs/specs/ivshmem_device_spec.txt, and start fixing bugs such as the ones that Markus reported. Since ivshmem is basically KVM-only (it has a soft dependency on ioeventfd), CC the patches to kvm@vger.kernel.org and I'll merge them via the KVM tree for now. I'll (more than) gladly give maintainership away in due time. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] KVM: PPC: Book3s PR: Disable AIL mode with OPAL
When we're using PR KVM we must not allow the CPU to take interrupts in virtual mode, as the SLB does not contain host kernel mappings when running inside the guest context. To make sure we get good performance for non-KVM tasks but still properly functioning PR KVM, let's just disable AIL whenever a vcpu is scheduled in. This is fundamentally different from how we deal with AIL on pSeries type machines where we disable AIL for the whole machine as soon as a single KVM VM is up. The reason for that is easy - on pSeries we do not have control over per-cpu configuration of AIL. We also don't want to mess with CPU hotplug races and AIL configuration, so setting it per CPU is easier and more flexible. This patch fixes running PR KVM on POWER8 bare metal for me. Signed-off-by: Alexander Graf --- v1 -> v2: - Extend patch description diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 3da412e..8ea7da4 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -71,6 +71,12 @@ static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) svcpu->in_use = 0; svcpu_put(svcpu); #endif + + /* Disable AIL if supported */ + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_207S)) + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_AIL); + vcpu->cpu = smp_processor_id(); #ifdef CONFIG_PPC_BOOK3S_32 current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu; @@ -91,6 +97,12 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); + + /* Enable AIL if supported */ + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_207S)) + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_AIL_3); + vcpu->cpu = -1; } -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] KVM: PPC: Book3s PR: Disable AIL mode with OPAL
On Tue, Jun 17, 2014 at 12:03:58PM +0200, Alexander Graf wrote: > When we're using PR KVM we must not allow the CPU to take interrupts > in virtual mode, as the SLB does not contain host kernel mappings > when running inside the guest context. > > To make sure we get good performance for non-KVM tasks but still > properly functioning PR KVM, let's just disable AIL whenever a vcpu > is scheduled in. > > This is fundamentally different from how we deal with AIL on pSeries > type machines where we disable AIL for the whole machine as soon as > a single KVM VM is up. > > The reason for that is easy - on pSeries we do not have control over > per-cpu configuration of AIL. We also don't want to mess with CPU hotplug > races and AIL configuration, so setting it per CPU is easier and more > flexible. > > This patch fixes running PR KVM on POWER8 bare metal for me. > > Signed-off-by: Alexander Graf Acked-by: Paul Mackerras -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/6] KVM: PPC: Book3S HV: Access XICS in BE
On 17.06.14 10:37, Alexander Graf wrote: On 17.06.14 03:02, Paul Mackerras wrote: On Wed, Jun 11, 2014 at 12:33:50PM +0200, Alexander Graf wrote: On the exit path from the guest we check what type of interrupt we received if we received one. This means we're doing hardware access to the XICS interrupt controller. However, when running on a little endian system, this access is byte reversed. So let's make sure to swizzle the bytes back again and virtuall make XICS accesses big endian. ... @@ -2241,7 +2253,8 @@ kvmppc_read_intr: 42:/* It's not an IPI and it's for the host, stash it in the PACA * before exit, it will be picked up by the host ICP driver */ -stwr0, HSTATE_SAVED_XIRR(r13) +lir4, HSTATE_SAVED_XIRR +STWX_BEr0, r13, r4 This is a paca field, not something mandated by PAPR or shared with the guest, so why do we need to keep it BE? If you do make it BE, don't you also need to fix kvmppc_get_xics_latch()? Yikes. Yes. Thanks a lot for the catch! Eh, no. What we do is we read (good on BE, byte reversed) into r0. Then we swab32() from r0 to r3 on LE, mr from r0 to r3 on BE. r3 gets truncated along the way. The reason we maintain r0 as wrong-endian is that we write it back using the cache inhibited stwcix instruction: stwcix r0, r6, r7 /* EOI it */ So during the lifetime of r0 as XIRR it's always byte-reversed on LE. That's why we store it using STWX_BE into hstate, because that's the time when we actually swab32() it for further interpretation. Alternatively I could clobber a different register and maintain the byte swapped variant in there if you prefer. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Divide error in kvm_unlock_kick()
I see kernel 3.15 is now out, so I retested with 3.15 guest and host. I'm still getting exactly the same guest kernel panic: a divide error in kvm_unlock_kick with -cpu host, but not with -cpu qemu64: divide error: [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 781 Comm: mkdir Not tainted 3.15.0-guest #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Bochs 01/01/2011 task: 88007cbf6180 ti: 88088000 task.ti: 88088000 RIP: 0010:[] [] kvm_unlock_kick+0x63/0x6b RSP: :88007fc83d38 EFLAGS: 00010046 RAX: 0005 RBX: RCX: 0002 RDX: 0002 RSI: 88007fd11d80 RDI: 81994840 RBP: 88007fd11d80 R08: R09: 81994840 R10: 88007c480c88 R11: 0005 R12: cec0 R13: 88007d38332a R14: 0002 R15: 88007d382d00 FS: 7fdabf7fd700() GS:88007fc8() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7fd0643f6509 CR3: 7c028000 CR4: 000406e0 Stack: 00011d80 0002 88007fd11d80 8156f83f 810dba53 0046 88007fd0 88007d3bbe70 81845da8 0003 Call Trace: [] ? _raw_spin_unlock+0x32/0x55 [] ? try_to_wake_up+0x1ed/0x20f [] ? autoremove_wake_function+0x9/0x2a [] ? __wake_up_common+0x47/0x73 [] ? __wake_up+0x33/0x44 [] ? irq_work_run+0x72/0x8f [] ? smp_irq_work_interrupt+0x26/0x2b [] ? irq_work_interrupt+0x6d/0x80 [] ? try_to_wake_up+0x1fe/0x20f [] ? native_apic_msr_read+0x6/0x4e [] ? _raw_spin_unlock_irqrestore+0x3d/0x65 [] ? rcu_process_callbacks+0x15e/0x47d [] ? execute_in_process_context+0x55/0x55 [] ? __do_softirq+0xe0/0x1e6 [] ? irq_exit+0x3c/0x81 [] ? smp_apic_timer_interrupt+0x3b/0x46 [] ? 
apic_timer_interrupt+0x6d/0x80 Code: 0c c5 c0 b8 87 81 49 8d 04 0c 48 8b 30 48 39 ee 75 ca 8a 40 08 38 d8 75 c3 48 c7 c0 22 b0 00 00 31 db 0f b7 0c 08 b8 05 00 00 00 <0f> 01 c1 5b 5d 41 5c c3 4c 8d 54 24 08 48 83 e4 f0 b9 0a 00 00 RIP [] kvm_unlock_kick+0x63/0x6b RSP ---[ end trace 949b1bf47cc57d09 ]--- Kernel panic - not syncing: Fatal exception in interrupt Shutting down cpus with NMI Kernel Offset: 0x0 from 0x8100 (relocation range: 0x8000-0x9fff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt I'm at a complete loss as to what to do next to debug this. Any help would be extremely gratefully received! I've put 3.15 host and guest configs here: http://cdw.me.uk/tmp/3.15-guest-config.txt http://cdw.me.uk/tmp/3.15-host-config.txt dmesg just after boot here: http://cdw.me.uk/tmp/3.15-guest-dmesg.txt http://cdw.me.uk/tmp/3.15-host-dmesg.txt and /proc/cpuinfo from both host and guest here: http://cdw.me.uk/tmp/3.15-guest-cpuinfo.txt http://cdw.me.uk/tmp/3.15-host-cpuinfo.txt The qemu command line was qemu-system-x86 -enable-kvm -cpu host -machine q35 -m 2048 -name omega \ -smp sockets=1,cores=4 -pidfile /run/omega.pid -runas nobody \ -serial stdio -vga none -vnc none -kernel /boot/vmlinuz-guest \ -append "console=ttyS0 root=/dev/vda" \ -drive file=/dev/guest/omega,cache=none,format=raw,if=virtio \ -device virtio-rng-pci \ -device virtio-net-pci,netdev=nic,mac=02:14:72:3c:69:54 \ -netdev tap,id=nic,fd=3,vhost=on 3<>/dev/tapNNN but removing the -machine q35 and -device virtio-rng-pci doesn't affect the crash. Dropping to -smp 1, running with -cpu qemu64, or compiling the guest kernel without paravirtualised spinlock support does remove the panic, albeit at the cost of performance. Best wishes, Chris. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tuesday 17 June 2014 02:24 PM, Alexander Graf wrote: > > On 14.06.14 23:08, Madhavan Srinivasan wrote: >> This patch adds kernel side support for software breakpoint. >> Design is that, by using an illegal instruction, we trap to hypervisor >> via Emulation Assistance interrupt, where we check for the illegal >> instruction >> and accordingly we return to Host or Guest. Patch mandates use of >> "abs" instruction >> (primary opcode 31 and extended opcode 360) as sw breakpoint instruction. >> Based on PowerISA v2.01, ABS instruction has been dropped from the >> architecture >> and treated an illegal instruction. >> >> Signed-off-by: Madhavan Srinivasan >> --- >> arch/powerpc/kvm/book3s.c| 3 ++- >> arch/powerpc/kvm/book3s_hv.c | 23 +++ >> 2 files changed, 21 insertions(+), 5 deletions(-) >> >> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c >> index c254c27..b40fe5d 100644 >> --- a/arch/powerpc/kvm/book3s.c >> +++ b/arch/powerpc/kvm/book3s.c >> @@ -789,7 +789,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu >> *vcpu, >> int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, >> struct kvm_guest_debug *dbg) >> { >> -return -EINVAL; >> +vcpu->guest_debug = dbg->control; >> +return 0; >> } >> void kvmppc_decrementer_func(unsigned long data) >> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c >> index 7a12edb..688421d 100644 >> --- a/arch/powerpc/kvm/book3s_hv.c >> +++ b/arch/powerpc/kvm/book3s_hv.c >> @@ -67,6 +67,14 @@ >> /* Used as a "null" value for timebase values */ >> #define TB_NIL(~(u64)0) >> +/* >> + * SW_BRK_DBG_INT is debug Instruction for supporting Software >> Breakpoint. >> + * Instruction mnemonic is ABS, primary opcode is 31 and extended >> opcode is 360. >> + * Based on PowerISA v2.01, ABS instruction has been dropped from the >> architecture >> + * and treated an illegal instruction. 
>> + */ >> +#define SW_BRK_DBG_INT 0x7c0002d0 > > The instruction we use to trap needs to get exposed to user space via a > ONE_REG property. > Yes. I got to know about that from Bharat (patchset "ppc debug: Add debug stub support"). I will change it. > Also, why don't we use twi always or something else that actually is > defined as illegal instruction? I would like to see this shared with > book3s_32 PR. > >> + >> static void kvmppc_end_cede(struct kvm_vcpu *vcpu); >> static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); >> @@ -721,12 +729,19 @@ static int kvmppc_handle_exit_hv(struct >> kvm_run *run, struct kvm_vcpu *vcpu, >> break; >> /* >>* This occurs if the guest executes an illegal instruction. >> - * We just generate a program interrupt to the guest, since >> - * we don't emulate any guest instructions at this stage. >> + * To support software breakpoint, we check for the sw breakpoint >> + * instruction to return back to host, if not we just generate a >> + * program interrupt to the guest. >>*/ >> case BOOK3S_INTERRUPT_H_EMUL_ASSIST: >> -kvmppc_core_queue_program(vcpu, SRR1_PROGILL); >> -r = RESUME_GUEST; >> +if (vcpu->arch.last_inst == SW_BRK_DBG_INT) { > > Don't access last_inst directly. Instead use the provided helpers. > Ok. Will look and replace it. >> +run->exit_reason = KVM_EXIT_DEBUG; >> +run->debug.arch.address = vcpu->arch.pc; >> +r = RESUME_HOST; >> +} else { >> +kvmppc_core_queue_program(vcpu, 0x8); > > magic numbers ^ I did not understand this? >> +r = RESUME_GUEST; >> +} >> break; >> /* >>* This occurs if the guest (kernel or userspace), does >> something that > > Please enable PR KVM as well while you're at it. > My bad, I did not try the PR KVM. I will try it out. > > Alex > Thanks for review Regards Maddy -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 17.06.14 13:07, Madhavan Srinivasan wrote: On Tuesday 17 June 2014 02:24 PM, Alexander Graf wrote: On 14.06.14 23:08, Madhavan Srinivasan wrote: This patch adds kernel side support for software breakpoint. Design is that, by using an illegal instruction, we trap to hypervisor via Emulation Assistance interrupt, where we check for the illegal instruction and accordingly we return to Host or Guest. Patch mandates use of "abs" instruction (primary opcode 31 and extended opcode 360) as sw breakpoint instruction. Based on PowerISA v2.01, ABS instruction has been dropped from the architecture and treated an illegal instruction. Signed-off-by: Madhavan Srinivasan --- arch/powerpc/kvm/book3s.c| 3 ++- arch/powerpc/kvm/book3s_hv.c | 23 +++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index c254c27..b40fe5d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,7 +789,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { -return -EINVAL; +vcpu->guest_debug = dbg->control; +return 0; } void kvmppc_decrementer_func(unsigned long data) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7a12edb..688421d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,14 @@ /* Used as a "null" value for timebase values */ #define TB_NIL(~(u64)0) +/* + * SW_BRK_DBG_INT is debug Instruction for supporting Software Breakpoint. + * Instruction mnemonic is ABS, primary opcode is 31 and extended opcode is 360. + * Based on PowerISA v2.01, ABS instruction has been dropped from the architecture + * and treated an illegal instruction. + */ +#define SW_BRK_DBG_INT 0x7c0002d0 The instruction we use to trap needs to get exposed to user space via a ONE_REG property. Yes. 
I got to know about that from Bharat (patchset "ppc debug: Add debug stub support"). I will change it. Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -721,12 +729,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* * This occurs if the guest executes an illegal instruction. - * We just generate a program interrupt to the guest, since - * we don't emulate any guest instructions at this stage. + * To support software breakpoint, we check for the sw breakpoint + * instruction to return back to host, if not we just generate a + * program interrupt to the guest. */ case BOOK3S_INTERRUPT_H_EMUL_ASSIST: -kvmppc_core_queue_program(vcpu, SRR1_PROGILL); -r = RESUME_GUEST; +if (vcpu->arch.last_inst == SW_BRK_DBG_INT) { Don't access last_inst directly. Instead use the provided helpers. Ok. Will look and replace it. +run->exit_reason = KVM_EXIT_DEBUG; +run->debug.arch.address = vcpu->arch.pc; +r = RESUME_HOST; +} else { +kvmppc_core_queue_program(vcpu, 0x8); magic numbers ^ I did not understand this? You're replacing the readable SRR1_PROGILL with the unreadable 0x8. That's bad. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tuesday 17 June 2014 03:02 PM, Benjamin Herrenschmidt wrote: > On Tue, 2014-06-17 at 11:25 +0200, Alexander Graf wrote: >> On 17.06.14 11:22, Benjamin Herrenschmidt wrote: >>> On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. >>> twi will be directed to the guest on HV no ? We want a real illegal >>> because those go to the host (for potential emulation by the HV). >> >> Ah, good point. I guess we need different one for PR and HV then to >> ensure compatibility with older ISAs on PR. > > Well, we also need to be careful with what happens if a PR guest puts > that instruction in, do that stop its HV guest/host ? > Damn, my mail client is messed up. did not see the mail till now. I havent tried this incase of PR guest kernel. I will need to try this before commenting. > What if it's done in userspace ? Do that stop the kernel ? :-) > Basically flow is that, when we see this instruction, we return to host, and host checks for address in the SW array and if not it returns to kernel. > Maddy, I haven't checked, does your patch ensure that we only ever stop > if the instruction is at a recorded bkpt address ? It still means that a > userspace process can practically DOS its kernel by issuing a lot of > these causing a crapload of exits. > This is valid, userspace can create a mess, need to handle this, meaning incase if we dont find a valid SW breakpoint for this address in the HOST, we need to route it to guest and kill it at app. Regards Maddy > Cheers, > Ben. > >> Alex >> >>> I'm >>> trying to see if I can get the architect to set one in stone in a future >>> proof way. >>> >>> Cheers, >>> Ben. 
>>> >> >> -- >> To unsubscribe from this list: send the line "unsubscribe kvm" in >> the body of a message to majord...@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tuesday 17 June 2014 04:38 PM, Alexander Graf wrote: > > On 17.06.14 13:07, Madhavan Srinivasan wrote: >> On Tuesday 17 June 2014 02:24 PM, Alexander Graf wrote: >>> On 14.06.14 23:08, Madhavan Srinivasan wrote: This patch adds kernel side support for software breakpoint. Design is that, by using an illegal instruction, we trap to hypervisor via Emulation Assistance interrupt, where we check for the illegal instruction and accordingly we return to Host or Guest. Patch mandates use of "abs" instruction (primary opcode 31 and extended opcode 360) as sw breakpoint instruction. Based on PowerISA v2.01, ABS instruction has been dropped from the architecture and treated an illegal instruction. Signed-off-by: Madhavan Srinivasan --- arch/powerpc/kvm/book3s.c| 3 ++- arch/powerpc/kvm/book3s_hv.c | 23 +++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index c254c27..b40fe5d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,7 +789,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { -return -EINVAL; +vcpu->guest_debug = dbg->control; +return 0; } void kvmppc_decrementer_func(unsigned long data) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7a12edb..688421d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,14 @@ /* Used as a "null" value for timebase values */ #define TB_NIL(~(u64)0) +/* + * SW_BRK_DBG_INT is debug Instruction for supporting Software Breakpoint. + * Instruction mnemonic is ABS, primary opcode is 31 and extended opcode is 360. + * Based on PowerISA v2.01, ABS instruction has been dropped from the architecture + * and treated an illegal instruction. + */ +#define SW_BRK_DBG_INT 0x7c0002d0 >>> The instruction we use to trap needs to get exposed to user space via a >>> ONE_REG property. 
>>> >> Yes. I got to know about that from Bharat (patchset "ppc debug: Add >> debug stub support"). I will change it. >> >>> Also, why don't we use twi always or something else that actually is >>> defined as illegal instruction? I would like to see this shared with >>> book3s_32 PR. >>> + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -721,12 +729,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* * This occurs if the guest executes an illegal instruction. - * We just generate a program interrupt to the guest, since - * we don't emulate any guest instructions at this stage. + * To support software breakpoint, we check for the sw breakpoint + * instruction to return back to host, if not we just generate a + * program interrupt to the guest. */ case BOOK3S_INTERRUPT_H_EMUL_ASSIST: -kvmppc_core_queue_program(vcpu, SRR1_PROGILL); -r = RESUME_GUEST; +if (vcpu->arch.last_inst == SW_BRK_DBG_INT) { >>> Don't access last_inst directly. Instead use the provided helpers. >>> >> Ok. Will look and replace it. >> +run->exit_reason = KVM_EXIT_DEBUG; +run->debug.arch.address = vcpu->arch.pc; +r = RESUME_HOST; +} else { +kvmppc_core_queue_program(vcpu, 0x8); >>> magic numbers >> ^ >> I did not understand this? > > You're replacing the readable SRR1_PROGILL with the unreadable 0x8. > That's bad. > Oops. My bad. Will undo that. I guess I messed up when was re basing it. > > Alex > Thanks for review Regards Maddy -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tuesday 17 June 2014 03:13 PM, Alexander Graf wrote: > > On 17.06.14 11:32, Benjamin Herrenschmidt wrote: >> On Tue, 2014-06-17 at 11:25 +0200, Alexander Graf wrote: >>> On 17.06.14 11:22, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: > Also, why don't we use twi always or something else that actually is > defined as illegal instruction? I would like to see this shared with > book3s_32 PR. twi will be directed to the guest on HV no ? We want a real illegal because those go to the host (for potential emulation by the HV). >>> Ah, good point. I guess we need different one for PR and HV then to >>> ensure compatibility with older ISAs on PR. >> Well, we also need to be careful with what happens if a PR guest puts >> that instruction in, do that stop its HV guest/host ? >> >> What if it's done in userspace ? Do that stop the kernel ? :-) > > The way SW breakpointing is handled is that when we see one, it gets > deflected into user space. User space then has an array of breakpoints > it configured itself. If the breakpoint is part of that list, it > consumes it. If not, it injects a debug interrupt (program in this case) > into the guest. > > That way we can overlay that one instruction with as many layers as we > like :). We only get a performance hit on execution of that instruction. > >> Maddy, I haven't checked, does your patch ensure that we only ever stop >> if the instruction is at a recorded bkpt address ? It still means that a >> userspace process can practically DOS its kernel by issuing a lot of >> these causing a crapload of exits. > > Only user space knows about its breakpoint addresses, so we have to > deflect. However since time still ticks on, we only increase jitter of > the guest. The process would still get scheduled away after the same ^^^ Where is this taken care. I am still trying to understand. Kindly can you explain or point to the code. Will help. > amount of real time, no? 
> > > Alex > Thanks for review. Regards Maddy -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
KVM call for 2014-06-24
Hi Please, send any topic that you are interested in covering. * Machine as QOM (Marcel) (repeat) - Solve the 'sensible' issues. - Choose as first tasks the ones that will be more useful. - Select solutions agreed by everyone. Thanks, Juan. Call details: 15:00 CEST 13:00 UTC 09:00 EDT Every two weeks If you need phone number details, contact me privately. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 17.06.14 13:20, Madhavan Srinivasan wrote: On Tuesday 17 June 2014 03:13 PM, Alexander Graf wrote: On 17.06.14 11:32, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 11:25 +0200, Alexander Graf wrote: On 17.06.14 11:22, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. twi will be directed to the guest on HV no ? We want a real illegal because those go to the host (for potential emulation by the HV). Ah, good point. I guess we need different one for PR and HV then to ensure compatibility with older ISAs on PR. Well, we also need to be careful with what happens if a PR guest puts that instruction in, do that stop its HV guest/host ? What if it's done in userspace ? Do that stop the kernel ? :-) The way SW breakpointing is handled is that when we see one, it gets deflected into user space. User space then has an array of breakpoints it configured itself. If the breakpoint is part of that list, it consumes it. If not, it injects a debug interrupt (program in this case) into the guest. That way we can overlay that one instruction with as many layers as we like :). We only get a performance hit on execution of that instruction. Maddy, I haven't checked, does your patch ensure that we only ever stop if the instruction is at a recorded bkpt address ? It still means that a userspace process can practically DOS its kernel by issuing a lot of these causing a crapload of exits. Only user space knows about its breakpoint addresses, so we have to deflect. However since time still ticks on, we only increase jitter of the guest. The process would still get scheduled away after the same ^^^ Where is this taken care. I am still trying to understand. Kindly can you explain or point to the code. Will help. We tell the guest via VPA about its steal time which includes QEMU time. 
Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Coupling between KVM_IRQFD and KVM_SET_GSI_ROUTING?
Hello, I have a question related to the relationship between the KVM_IRQFD and KVM_SET_GSI_ROUTING ioctls. When reading the KVM API documentation I do not see that there is any dependency between KVM_IRQFD and KVM_SET_GSI_ROUTING. According to the text it seems only the gsi field is used and interpreted as the irqchip pin. However irqchip.c kvm_set_irq code relies on an existing and not dummy routing table. My question is: does anyone agree on the fact the user-side must set a consistent routing table using KVM_SET_GSI_ROUTING before using KVM_IRQFD? The other alternative would have been to build a default identity GSI routing table in the kernel (gsi = irqchip.pin). If so, shouldn't we clarify the KVM API documentation? Thank you in advance Best Regards Eric -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] ARM: KVM: add irqfd and irq routing support
On 06/06/2014 02:16 PM, Christoffer Dall wrote: > On Mon, Jun 02, 2014 at 02:54:12PM +0100, Marc Zyngier wrote: >> Hi Eric, >> >> On Mon, Jun 02 2014 at 8:29:56 am BST, Eric Auger >> wrote: >>> This patch enables irqfd and irq routing on ARM. >>> >>> It turns on CONFIG_HAVE_KVM_EVENTFD and CONFIG_HAVE_KVM_IRQ_ROUTING >>> >>> irqfd framework enables to assign physical IRQs to guests. >>> >>> 1) user-side uses KVM_IRQFD VM ioctl to pass KVM a kvm_irqfd struct that >>> associates a VM, an eventfd, an IRQ number (aka. the GSI). When an actor >>> signals the eventfd (typically a VFIO platform driver), the irqfd subsystem >>> injects the specified IRQ into the VM (the "GSI" takes the semantic of a >>> virtual IRQ for that guest). >> >> Just so I can understand how this works: Who EOIs (handles) the physical >> interrupt? If it is the VFIO driver, then I don't see how you prevent >> the interrupt from firing again immediately (unless this is an edge >> interrupt?). >> >>> 2) the other use case is user-side does 1) and uses KVM_SET_GSI_ROUTING >>> VM ioctl to create an association between a VM, a physical IRQ (aka GSI) and >>> a virtual IRQ (aka irchip.pin). This creates a so-called GSI routing entry. >>> When someone triggers the eventfd, irqfd handles it but uses the specified >>> routing and eventually injects irqchip.pin virtual IRQ into the guest. In >>> that >>> context the GSI takes the semantic of a physical IRQ while the irqchip.pin >>> takes the semantic of a virtual IRQ. >>> >>> in 1) routing is used by irqfd but an identity routing is created by default >>> making the gsi = irqchip.pin. Note on ARM there is a single interrupt >>> controller kind, the GIC. >>> >>> GSI routing mostly is implemented in generic irqchip.c. >>> The tiny ARM specific part is directly implemented in the virtual interrupt >>> controller (vgic.c) as it is done for powerpc for instance. This option was >>> prefered compared to implementing other #ifdef in irq_comm.c (x86 and ia64). 
>>> Hence irq_comm.c is not used at all. >>> >>> Routing currently is not used for anything else than irqfd IRQ injection. >>> Only >>> SPI can be injected. This means the vgic is not totally hidden behind the >>> irqchip. There are separate discussions on PPI/SGI routing. >>> >>> Only level sensitive IRQs are supported (with a registered resampler). As a >>> reminder the resampler is a second eventfd called by irqfd framework when >>> the >>> virtual IRQ is completed by the guest. This eventfd is supposed to be >>> handled >>> on user-side >>> >>> MSI routing is not supported yet. >>> >>> This work was tested with Calxeda Midway xgmac main interrupt (with and >>> without >>> explicit user routing) with qemu-system-arm and QEMU VFIO platform device. >>> >>> changes v1 -> v2: >>> 2 fixes: >>> - v1 assumed gsi/irqchip.pin was already incremented by >>> VGIC_NR_PRIVATE_IRQS. >>> This is now vgic_set_assigned_irq that increments it before injection. >>> - v2 now handles the case where a pending assigned irq is cleared through >>> MMIO access. The irq is properly acked allowing the resamplefd handler >>> to possibly unmask the physical IRQ. >>> >>> Signed-off-by: Eric Auger >>> >>> Conflicts: >>> Documentation/virtual/kvm/api.txt >>> arch/arm/kvm/Kconfig >>> >>> Conflicts: >>> Documentation/virtual/kvm/api.txt >>> --- >>> Documentation/virtual/kvm/api.txt | 4 +- >>> arch/arm/include/uapi/asm/kvm.h | 8 +++ >>> arch/arm/kvm/Kconfig | 2 + >>> arch/arm/kvm/Makefile | 1 + >>> arch/arm/kvm/irq.h| 25 +++ >>> virt/kvm/arm/vgic.c | 141 >>> -- >>> 6 files changed, 174 insertions(+), 7 deletions(-) >>> create mode 100644 arch/arm/kvm/irq.h >>> >>> diff --git a/Documentation/virtual/kvm/api.txt >>> b/Documentation/virtual/kvm/api.txt >>> index b4f5365..b376334 100644 >>> --- a/Documentation/virtual/kvm/api.txt >>> +++ b/Documentation/virtual/kvm/api.txt >>> @@ -1339,7 +1339,7 @@ KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or >>> guest IRQ is allowed. 
>>> 4.52 KVM_SET_GSI_ROUTING >>> >>> Capability: KVM_CAP_IRQ_ROUTING >>> -Architectures: x86 ia64 s390 >>> +Architectures: x86 ia64 s390 arm >>> Type: vm ioctl >>> Parameters: struct kvm_irq_routing (in) >>> Returns: 0 on success, -1 on error >>> @@ -2126,7 +2126,7 @@ into the hash PTE second double word). >>> 4.75 KVM_IRQFD >>> >>> Capability: KVM_CAP_IRQFD >>> -Architectures: x86 s390 >>> +Architectures: x86 s390 arm >>> Type: vm ioctl >>> Parameters: struct kvm_irqfd (in) >>> Returns: 0 on success, -1 on error >>> diff --git a/arch/arm/include/uapi/asm/kvm.h >>> b/arch/arm/include/uapi/asm/kvm.h >>> index ef0c878..89b864d 100644 >>> --- a/arch/arm/include/uapi/asm/kvm.h >>> +++ b/arch/arm/include/uapi/asm/kvm.h >>> @@ -192,6 +192,14 @@ struct kvm_arch_memory_slot { >>> /* Highest supported SPI, from VGIC_NR_IRQS */ >>> #define KVM_ARM_IRQ_GIC_MAX
Re: [PATCH] kvm-unit-tests/x86: fix unittests.cfg typo
On Thu, Apr 17, 2014 at 04:17:42PM +0200, Andrew Jones wrote: > Signed-off-by: Andrew Jones > --- > x86/unittests.cfg | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/x86/unittests.cfg b/x86/unittests.cfg > index 7930c026a38d6..d78fe0eafe2b6 100644 > --- a/x86/unittests.cfg > +++ b/x86/unittests.cfg > @@ -156,5 +156,5 @@ extra_params = -cpu host,+vmx > arch = x86_64 > > [debug] > -file = debug.flag > +file = debug.flat > arch = x86_64 > -- > 1.8.1.4 > ping? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Tuesday, June 17, 2014 12:09 PM > To: Wood Scott-B07421 > Cc: Caraman Mihai Claudiu-B02008; kvm-...@vger.kernel.org; > kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org > Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition > on vcpu schedule > > > On 13.06.14 21:42, Scott Wood wrote: > > On Fri, 2014-06-13 at 16:55 +0200, Alexander Graf wrote: > >> On 13.06.14 16:43, mihai.cara...@freescale.com wrote: > -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Thursday, June 12, 2014 8:05 PM > To: Caraman Mihai Claudiu-B02008 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org; Wood Scott-B07421 > Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation > condition > on vcpu schedule > > On 06/12/2014 04:00 PM, Mihai Caraman wrote: > > On vcpu schedule, the condition checked for tlb pollution is too > tight. > > The tlb entries of one vcpu are polluted when a different vcpu from > the > > same partition runs in-between. Relax the current tlb invalidation > > condition taking into account the lpid. > > Can you quantify the performance improvement from this? We've had bugs > > in this area before, so let's make sure it's worth it before making > this > > more complicated. > > > > Signed-off-by: Mihai Caraman freescale.com> > Your mailer is broken? :) > This really should be an @. > > I think this should work. Scott, please ack. > >>> Alex, you were right. I screwed up the patch description by inverting > relax > >>> and tight terms :) It should have been more like this: > >>> > >>> KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule > >>> > >>> On vcpu schedule, the condition checked for tlb pollution is too > loose. > >>> The tlb entries of a vcpu are polluted (vs stale) only when a > different vcpu > >>> within the same logical partition runs in-between. 
Optimize the tlb > invalidation > >>> condition taking into account the lpid. > >> Can't we give every vcpu its own lpid? Or don't we trap on global > >> invalidates? > > That would significantly increase the odds of exhausting LPIDs, > > especially on large chips like t4240 with similarly large VMs. If we > > were to do that, the LPIDs would need to be dynamically assigned (like > > PIDs), and should probably be a separate numberspace per physical core. > > True, I didn't realize we only have so few of them. It would however > save us from most flushing as long as we have spare LPIDs available :). Yes, we had this proposal on the table for e6500 multithreaded core. This core lacks tlb write conditional instruction, so an OS needs to use locks to protect itself against concurrent tlb writes executed from sibling threads. When we expose hw threads as single-threaded vcpus (useful when the user opts not to pin vcpus), the guest can no longer protect itself optimally (it can protect tlb writes across all threads but this is not acceptable). So instead, we found a solution at hypervisor level by assigning different logical partition ids to guest's vcpus running simultaneously on sibling hw threads. Currently in FSL SDK we allocate two lpids to each guest. I am also a proponent of using all LPID space (63 values) per (multi-threaded) physical core, which will lead to fewer invalidates on vcpu schedule and will accommodate the solution described above. -Mike
Re: KVM call for 2014-06-24
Juan Quintela wrote: > Hi I sent the first mail to the wrong address (please notice that qemu-devel was missing). Fixed in this resend. Sorry for the inconvenience. Later, Juan. > Please, send any topic that you are interested in covering. > > * Machine as QOM (Marcel) (repeat) > - Solve the 'sensible' issues. > - Choose as first tasks the ones that will be more useful. > - Select solutions agreed by everyone. > > > Thanks, Juan. > > Call details: > > 15:00 CEST > 13:00 UTC > 09:00 EDT > > Every two weeks > > If you need phone number details, contact me privately. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/6] KVM: PPC: Book3S HV: Access XICS in BE
On Tue, Jun 17, 2014 at 12:22:32PM +0200, Alexander Graf wrote: > > Eh, no. What we do is we read (good on BE, byte reversed) into r0. Then we > swab32() from r0 to r3 on LE, mr from r0 to r3 on BE. > > r3 gets truncated along the way. > > The reason we maintain r0 as wrong-endian is that we write it back using the > cache inhibited stwcix instruction: > > >stwcix r0, r6, r7 /* EOI it */ > > So during the lifetime of r0 as XIRR it's always byte-reversed on LE. That's > why we store it using STWX_BE into hstate, because that's the time when we > actually swab32() it for further interpretation. So the STWX_BE is more like a be32_to_cpu than a cpu_to_be32, which is what the name STWX_BE would suggest. Sounds like it at least deserves a comment, or (as you suggest) rearrange the register usage so a normal store works. Paul. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/6] KVM: PPC: Book3S HV: Access XICS in BE
On 17.06.14 14:13, Paul Mackerras wrote: On Tue, Jun 17, 2014 at 12:22:32PM +0200, Alexander Graf wrote: Eh, no. What we do is we read (good on BE, byte reversed) into r0. Then we swab32() from r0 to r3 on LE, mr from r0 to r3 on BE. r3 gets truncated along the way. The reason we maintain r0 as wrong-endian is that we write it back using the cache inhibited stwcix instruction: stwcix r0, r6, r7 /* EOI it */ So during the lifetime of r0 as XIRR it's always byte-reversed on LE. That's why we store it using STWX_BE into hstate, because that's the time when we actually swab32() it for further interpretation. So the STWX_BE is more like a be32_to_cpu than a cpu_to_be32, which is what the name STWX_BE would suggest. Sounds like it at least deserves a comment, or (as you suggest) rearrange the register usage so a normal store works. Yes, I have this now: From a94a66437ec714ec5650f6d8fec050a33e4477ca Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 11 Jun 2014 10:37:52 +0200 Subject: [PATCH] KVM: PPC: Book3S HV: Access XICS in BE On the exit path from the guest we check what type of interrupt we received if we received one. This means we're doing hardware access to the XICS interrupt controller. However, when running on a little endian system, this access is byte reversed. So let's make sure to swizzle the bytes back again and virtually make XICS accesses big endian. Signed-off-by: Alexander Graf --- v1 -> v2: - Make code easier to follow diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 1a2f471..9829e18 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -36,6 +36,13 @@ #define NAPPING_CEDE1 #define NAPPING_NOVCPU2 +.macro bswap32 regd, regs +srwi\regd,\regs,24 +rlwimi\regd,\regs,24,16,23 +rlwimi\regd,\regs,8,8,15 +rlwimi\regd,\regs,24,0,7 +.endm + /* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. 
@@ -2325,7 +2332,12 @@ kvmppc_read_intr: cmpdir6, 0 beq-1f lwzcixr0, r6, r7 -rlwinm.r3, r0, 0, 0xff +#ifdef __LITTLE_ENDIAN__ +bswap32r4, r0 +#else +mrr4, r0 +#endif +rlwinm.r3, r4, 0, 0xff sync beq1f/* if nothing pending in the ICP */ @@ -2360,7 +2372,7 @@ kvmppc_read_intr: 42:/* It's not an IPI and it's for the host, stash it in the PACA * before exit, it will be picked up by the host ICP driver */ -stwr0, HSTATE_SAVED_XIRR(r13) +stwr4, HSTATE_SAVED_XIRR(r13) lir3, 1 b1b Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH][RFC] kvm-unit-tests: report: add report_skip
Add report_skip(), which is report(), but with another condition allowing it to output PASS/FAIL/SKIP, rather than only PASS/FAIL. This allows report output to stay more consistent between systems/configurations that may or may not support all tests. Currently I only have a downstream use case for this, but maybe report output sequences of the following form report(msg1, cond1) -> FAIL report(msg2, cond1 && cond2) -> FAIL report(msg3, cond1 && cond3) -> FAIL would look better as report(msg1, cond1) -> FAIL report_skip(msg2, !cond1, cond1 && cond2) -> SKIP report_skip(msg3, !cond1, cond1 && cond3) -> SKIP Signed-off-by: Andrew Jones --- lib/libcflat.h | 1 + lib/report.c | 34 ++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/lib/libcflat.h b/lib/libcflat.h index f734fdee2df18..01f2769ad1773 100644 --- a/lib/libcflat.h +++ b/lib/libcflat.h @@ -61,5 +61,6 @@ extern long atol(const char *ptr); #define NULL ((void *)0UL) void report(const char *msg_fmt, bool pass, ...); +void report_skip(const char *msg_fmt, bool can_skip, bool pass, ...); int report_summary(void); #endif diff --git a/lib/report.c b/lib/report.c index ff562a13c541e..77db22b6c2d8d 100644 --- a/lib/report.c +++ b/lib/report.c @@ -5,32 +5,50 @@ * * Authors: * Jan Kiszka + * Andrew Jones * * This work is licensed under the terms of the GNU LGPL, version 2. */ #include "libcflat.h" -static unsigned int tests, failures; +static unsigned int tests, failures, skipped; -void report(const char *msg_fmt, bool pass, ...) +void va_report_skip(const char *msg_fmt, bool can_skip, bool pass, va_list va) { + char *fail = can_skip ? "SKIP" : "FAIL"; char buf[2000]; - va_list va; tests++; - printf("%s: ", pass ? "PASS" : "FAIL"); - va_start(va, pass); + printf("%s: ", pass ? "PASS" : fail); vsnprintf(buf, sizeof(buf), msg_fmt, va); - va_end(va); puts(buf); puts("\n"); - if (!pass) + if (!pass && can_skip) + skipped++; + else if (!pass) failures++; } +void report(const char *msg_fmt, bool pass, ...) 
+{ + va_list va; + va_start(va, pass); + va_report_skip(msg_fmt, false, pass, va); + va_end(va); +} + +void report_skip(const char *msg_fmt, bool can_skip, bool pass, ...) +{ + va_list va; + va_start(va, pass); + va_report_skip(msg_fmt, can_skip, pass, va); + va_end(va); +} + int report_summary(void) { - printf("\nSUMMARY: %d tests, %d failures\n", tests, failures); + printf("\nSUMMARY: %d tests, %d failures, %d skipped\n", + tests, failures, skipped); return failures > 0 ? 1 : 0; } -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] kvm-unit-tests: report: add report_skip
Il 17/06/2014 14:21, Andrew Jones ha scritto: would look better as report(msg1, cond1) -> FAIL report_skip(msg2, !cond1, cond1 && cond2) -> SKIP report_skip(msg3, !cond1, cond1 && cond3) -> SKIP I think a lot of the time there is other code before the report that you want to skip. Is anything more than a "report_skip(msg1);" (that prints a SKIP message) that useful? So you can do if (!(msr & 0x1000)) { report_skip("Frob bit 12 of msr"); return; } Unless what you really want is not a SKIP but an expected failure, then I agree with the idea. Something like report_xfail(msg2, cond1, cond2) would print: PASS if cond1 = false, cond2 = true FAIL if cond1 = false, cond2 = false XPASS if cond1 = true, cond2 = true XFAIL if cond1 = true, cond2 = false An XPASS would ultimately exit with status 1, just like a FAIL. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] kvm-unit-tests: report: add report_skip
On Tue, Jun 17, 2014 at 02:46:30PM +0200, Paolo Bonzini wrote: > Il 17/06/2014 14:21, Andrew Jones ha scritto: > >would look better as > > > >report(msg1, cond1) -> FAIL > >report_skip(msg2, !cond1, cond1 && cond2) -> SKIP > >report_skip(msg3, !cond1, cond1 && cond3) -> SKIP > > I think a lot of the time there is other code before the report that you > want to skip. Is anything more a "report_skip(msg1);" (that print a SKIP > message) that useful? So you can do > > if (!(msr & 0x1000) { > report_skip("Frob bit 12 of msr"); This is enough to output "msg: SKIP" in a format consistent with PASS and FAIL. However, it's not necessarily enough to output the test case too. E.g. we'd need if (cond1) { /* do stuff to prepare for report conditions */ report("msg1", cond2); report("msg2", cond3); } else { report_skip("msg1"); report_skip("msg2"); } It'd be nice to avoid the message redundancy in the cases that we're able to. However, ... > return; > } > > Unless what you really want is not a SKIP but an expected failure, then I ...yes, I was thinking along the lines of an 'expected failure' report, not the above issue, and 'XFAIL' does convey that idea better. > agree with the idea. Something like > > report_xfail(msg2, cond1, cond2) > > would print: > > PASS if cond1 = false, cond2 = true > FAIL if cond1 = false, cond2 = false > XPASS if cond1 = true, cond2 = true > XFAIL if cond1 = true, cond2 = false > > An XPASS would ultimately exit with status 1, just like a FAIL. Yeah, XPASS is good. report_skip would make PASS ambiguous. Not good. I'll drop the idea of outputting SKIP, switch to report_xfail, and send a v2. 
Thanks, drew > > Paolo > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH kvm-unit-tests] report: add report_xfail for expected failures
Add report_xfail(), which is report(), but with another condition allowing it to output PASS/FAIL/XPASS/XFAIL, rather than only PASS/FAIL. This allows report output to stay more consistent between systems/configurations that may or may not support all tests. Signed-off-by: Andrew Jones --- lib/libcflat.h | 1 + lib/report.c | 37 + 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/lib/libcflat.h b/lib/libcflat.h index cb6663d33ef5a..c9577350ec275 100644 --- a/lib/libcflat.h +++ b/lib/libcflat.h @@ -63,5 +63,6 @@ extern long atol(const char *ptr); #define NULL ((void *)0UL) void report(const char *msg_fmt, bool pass, ...); +void report_xfail(const char *msg_fmt, bool xfail, bool pass, ...); int report_summary(void); #endif diff --git a/lib/report.c b/lib/report.c index ff562a13c541e..43d7a65ec8812 100644 --- a/lib/report.c +++ b/lib/report.c @@ -5,32 +5,53 @@ * * Authors: * Jan Kiszka + * Andrew Jones * * This work is licensed under the terms of the GNU LGPL, version 2. */ #include "libcflat.h" -static unsigned int tests, failures; +static unsigned int tests, failures, xfailures; -void report(const char *msg_fmt, bool pass, ...) +void va_report_xfail(const char *msg_fmt, bool xfail, bool cond, va_list va) { + char *pass = xfail ? "XPASS" : "PASS"; + char *fail = xfail ? "XFAIL" : "FAIL"; char buf[2000]; - va_list va; tests++; - printf("%s: ", pass ? "PASS" : "FAIL"); - va_start(va, pass); + printf("%s: ", cond ? pass : fail); vsnprintf(buf, sizeof(buf), msg_fmt, va); - va_end(va); puts(buf); puts("\n"); - if (!pass) + if (xfail && cond) + failures++; + else if (xfail) + xfailures++; + else if (!cond) failures++; } +void report(const char *msg_fmt, bool pass, ...) +{ + va_list va; + va_start(va, pass); + va_report_xfail(msg_fmt, false, pass, va); + va_end(va); +} + +void report_xfail(const char *msg_fmt, bool xfail, bool pass, ...) 
+{ + va_list va; + va_start(va, pass); + va_report_xfail(msg_fmt, xfail, pass, va); + va_end(va); +} + int report_summary(void) { - printf("\nSUMMARY: %d tests, %d failures\n", tests, failures); + printf("\nSUMMARY: %d tests, %d failures, %d xfailures\n", + tests, failures, xfailures); return failures > 0 ? 1 : 0; } -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 17.06.14 13:13, Madhavan Srinivasan wrote: On Tuesday 17 June 2014 04:38 PM, Alexander Graf wrote: On 17.06.14 13:07, Madhavan Srinivasan wrote: On Tuesday 17 June 2014 02:24 PM, Alexander Graf wrote: On 14.06.14 23:08, Madhavan Srinivasan wrote: This patch adds kernel side support for software breakpoint. Design is that, by using an illegal instruction, we trap to hypervisor via Emulation Assistance interrupt, where we check for the illegal instruction and accordingly we return to Host or Guest. Patch mandates use of "abs" instruction (primary opcode 31 and extended opcode 360) as sw breakpoint instruction. Based on PowerISA v2.01, ABS instruction has been dropped from the architecture and treated an illegal instruction. Signed-off-by: Madhavan Srinivasan --- arch/powerpc/kvm/book3s.c| 3 ++- arch/powerpc/kvm/book3s_hv.c | 23 +++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index c254c27..b40fe5d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,7 +789,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { -return -EINVAL; +vcpu->guest_debug = dbg->control; +return 0; } void kvmppc_decrementer_func(unsigned long data) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7a12edb..688421d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,14 @@ /* Used as a "null" value for timebase values */ #define TB_NIL(~(u64)0) +/* + * SW_BRK_DBG_INT is debug Instruction for supporting Software Breakpoint. + * Instruction mnemonic is ABS, primary opcode is 31 and extended opcode is 360. + * Based on PowerISA v2.01, ABS instruction has been dropped from the architecture + * and treated an illegal instruction. 
+ */ +#define SW_BRK_DBG_INT 0x7c0002d0 The instruction we use to trap needs to get exposed to user space via a ONE_REG property. Yes. I got to know about that from Bharat (patchset "ppc debug: Add debug stub support"). I will change it. Also please make sure to pick an instruction that preferably looks identical regardless of guest endianness. Segher suggested 0x0000. Does that trap properly for you? Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On Tue, 2014-06-17 at 12:02 +0300, Mihai Caraman wrote: > On vcpu schedule, the condition checked for tlb pollution is too loose. > The tlb entries of a vcpu become polluted (vs stale) only when a different > vcpu within the same logical partition runs in-between. Optimize the tlb > invalidation condition taking into account the logical partition id. > > With the new invalidation condition, a guest shows 4% performance improvement > on P5020DS while running a memory stress application with the cpu > oversubscribed, > the other guest running a cpu intensive workload. See https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-June/118547.html -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule
On Thu, 2014-06-12 at 19:04 +0200, Alexander Graf wrote: > On 06/12/2014 04:00 PM, Mihai Caraman wrote: > > @@ -140,12 +142,24 @@ static void kvmppc_core_vcpu_load_e500mc(struct > > kvm_vcpu *vcpu, int cpu) > > mtspr(SPRN_GDEAR, vcpu->arch.shared->dar); > > mtspr(SPRN_GESR, vcpu->arch.shared->esr); > > > > - if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || > > - __get_cpu_var(last_vcpu_on_cpu) != vcpu) { > > - kvmppc_e500_tlbil_all(vcpu_e500); > > + if (vcpu->arch.oldpir != mfspr(SPRN_PIR)) { > > + /* tlb entries deprecated */ > > + inval_tlb = update_last = true; > > + } else if (__get_cpu_var(last_vcpu_on_cpu) != vcpu) { > > + update_last = true; > > + /* tlb entries polluted */ > > + inval_tlb = __get_cpu_var(last_lpid_on_cpu) == > > + vcpu->kvm->arch.lpid; > > + } What about the following sequence on one CPU: LPID 1, vcpu A LPID 2, vcpu C LPID 1, vcpu B LPID 2, vcpu C doesn't invalidate LPID 1, vcpu A doesn't invalidate In the last line, vcpu A last ran on this cpu (oldpir matches), but LPID 2 last ran on this cpu (last_lpid_on_cpu does not match) -- but an invalidation has never happened since vcpu B from LPID 1 ran on this cpu. -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] KVM: x86: Wrong emulation on 'xadd X, X'
Paolo Bonzini writes: > Il 16/06/2014 19:38, Bandan Das ha scritto: >> Nadav Amit writes: >> >>> The emulator does not emulate the xadd instruction correctly if the two >>> operands are the same. In this (unlikely) situation the result should be >>> the >>> sum of X and X (2X) when it is currently X. The solution is to first >>> perform >>> writeback to the source, before writing to the destination. The only >>> instruction which should be affected is xadd, as the other instructions that >>> perform writeback to the source use the extended accumlator (e.g., RAX:RDX). >>> >>> Signed-off-by: Nadav Amit >>> --- >>> arch/x86/kvm/emulate.c | 10 +- >>> 1 file changed, 5 insertions(+), 5 deletions(-) >>> >>> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c >>> index f0b0a10..3c8d867 100644 >>> --- a/arch/x86/kvm/emulate.c >>> +++ b/arch/x86/kvm/emulate.c >>> @@ -4711,17 +4711,17 @@ special_insn: >>> goto done; >>> >>> writeback: >>> - if (!(ctxt->d & NoWrite)) { >>> - rc = writeback(ctxt, &ctxt->dst); >>> - if (rc != X86EMUL_CONTINUE) >>> - goto done; >>> - } >>> if (ctxt->d & SrcWrite) { >>> BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == >>> OP_MEM_STR); >> While we are here, I think we should replace this BUG_ON with a warning >> and return X86EMUL_UNHANDLEABLE if the condition is true. > > Sure, please post a patch and I'll apply it right away. Well, I meant it more as a review and "suggested changes" to this patchset Nadav posted, but yeah, if you prefer, I can post a change myself. I will make a pass through other uses of BUG() in the code too. > Paolo > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: PPC: Book3S PR: Handle hyp doorbell exits
If we're running PR KVM in HV mode, we may get hypervisor doorbell interrupts. Handle those the same way we treat normal doorbells. Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_pr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 8ea7da4..3b82e86 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -988,6 +988,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_DECREMENTER: case BOOK3S_INTERRUPT_HV_DECREMENTER: case BOOK3S_INTERRUPT_DOORBELL: + case BOOK3S_INTERRUPT_H_DOORBELL: vcpu->stat.dec_exits++; r = RESUME_GUEST; break; -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/4] KVM: PPC: Book3S HV: Fix ABIv2 indirect branch issue
On 12.06.14 10:16, Anton Blanchard wrote: To establish addressability quickly, ABIv2 requires the target address of the function being called to be in r12. Signed-off-by: Anton Blanchard Thanks, applied to kvm-ppc-queue. Alex --- Index: b/arch/powerpc/kvm/book3s_hv_rmhandlers.S === --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1920,8 +1920,8 @@ hcall_try_real_mode: lwaxr3,r3,r4 cmpwi r3,0 beq guest_exit_cont - add r3,r3,r4 - mtctr r3 + add r12,r3,r4 + mtctr r12 mr r3,r9 /* get vcpu pointer */ ld r4,VCPU_GPR(R4)(r9) bctrl -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/4] KVM: PPC: Assembly functions exported to modules need _GLOBAL_TOC()
On 12.06.14 10:16, Anton Blanchard wrote: Both kvmppc_hv_entry_trampoline and kvmppc_entry_trampoline are assembly functions that are exported to modules and also require a valid r2. As such we need to use _GLOBAL_TOC so we provide a global entry point that establishes the TOC (r2). Signed-off-by: Anton Blanchard Thanks, applied to kvm-ppc-queue. I've not applied patches 1 and 2 for now, as they break BE module support. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: PPC: Book3S PR: Fix ABIv2 on LE
We switched to ABIv2 on Little Endian systems now which gets rid of the dotted function names. Branch to the actual functions when we see such a system. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index e2c29e3..d044b8b 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -25,7 +25,11 @@ #include #if defined(CONFIG_PPC_BOOK3S_64) +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else #define FUNC(name) GLUE(.,name) +#endif #define GET_SHADOW_VCPU(reg)addi reg, r13, PACA_SVCPU #elif defined(CONFIG_PPC_BOOK3S_32) diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 9eec675..cdcbb63 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -36,7 +36,11 @@ #if defined(CONFIG_PPC_BOOK3S_64) +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else #define FUNC(name) GLUE(.,name) +#endif #elif defined(CONFIG_PPC_BOOK3S_32) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/7] KVM: PPC: Book3S HV: Make HTAB code LE host aware
When running on an LE host all data structures are kept in little endian byte order. However, the HTAB still needs to be maintained in big endian. So every time we access any HTAB we need to make sure we do so in the right byte order. Fix up all accesses to manually byte swap. Signed-off-by: Alexander Graf --- v1 -> v2: - Add __be32 hints - Fix H_REMOVE --- arch/powerpc/include/asm/kvm_book3s.h| 4 +- arch/powerpc/include/asm/kvm_book3s_64.h | 15 +++- arch/powerpc/kvm/book3s_64_mmu_hv.c | 128 ++- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 146 ++- 4 files changed, 164 insertions(+), 129 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index a20cc0b..028f4b1 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -161,9 +161,9 @@ extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing, bool *writable); extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode); -extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, +extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index); -void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, +void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index); extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, unsigned long *nb_ret); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index c7871f3..e504f88 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -59,20 +59,29 @@ extern unsigned long kvm_rma_pages; /* These bits are reserved in the guest view of the HPTE */ #define HPTE_GR_RESERVED HPTE_GR_MODIFIED -static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) +static inline long try_lock_hpte(__be64 *hpte, unsigned long bits) { 
unsigned long tmp, old; + __be64 be_lockbit, be_bits; + + /* +* We load/store in native endian, but the HTAB is in big endian. If +* we byte swap all data we apply on the PTE we're implicitly correct +* again. +*/ + be_lockbit = cpu_to_be64(HPTE_V_HVLOCK); + be_bits = cpu_to_be64(bits); asm volatile(" ldarx %0,0,%2\n" " and.%1,%0,%3\n" " bne 2f\n" -" ori %0,%0,%4\n" +" or %0,%0,%4\n" " stdcx. %0,0,%2\n" " beq+2f\n" " mr %1,%3\n" "2:isync" : "=&r" (tmp), "=&r" (old) -: "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) +: "r" (hpte), "r" (be_bits), "r" (be_lockbit) : "cc", "memory"); return old == 0; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8056107..2d154d9 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -450,7 +450,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned long slb_v; unsigned long pp, key; unsigned long v, gr; - unsigned long *hptep; + __be64 *hptep; int index; int virtmode = vcpu->arch.shregs.msr & (data ? 
MSR_DR : MSR_IR); @@ -473,13 +473,13 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, preempt_enable(); return -ENOENT; } - hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); - v = hptep[0] & ~HPTE_V_HVLOCK; + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; gr = kvm->arch.revmap[index].guest_rpte; /* Unlock the HPTE */ asm volatile("lwsync" : : : "memory"); - hptep[0] = v; + hptep[0] = cpu_to_be64(v); preempt_enable(); gpte->eaddr = eaddr; @@ -583,7 +583,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long *hptep, hpte[3], r; + unsigned long hpte[3], r; + __be64 *hptep; unsigned long mmu_seq, psize, pte_size; unsigned long gpa_base, gfn_base; unsigned long gpa, gfn, hva, pfn; @@ -606,16 +607,16 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (ea != vcpu->arch.pgfault_addr) return RESUME_GUEST;
[PATCH 3/7] KVM: PPC: Book3S HV: Access guest VPA in BE
There are a few shared data structures between the host and the guest. Most of them get registered through the VPA interface. These data structures are defined to always be in big endian byte order, so let's make sure we always access them in big endian. Signed-off-by: Alexander Graf --- v1 -> v2: - Add __be hints - Fix dtl_idx --- arch/powerpc/kvm/book3s_hv.c | 22 +++--- arch/powerpc/kvm/book3s_hv_ras.c | 6 +++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1562acf..f7bcc72 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -270,7 +270,7 @@ struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) { vpa->__old_status |= LPPACA_OLD_SHARED_PROC; - vpa->yield_count = 1; + vpa->yield_count = cpu_to_be32(1); } static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, @@ -293,8 +293,8 @@ static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, struct reg_vpa { u32 dummy; union { - u16 hword; - u32 word; + __be16 hword; + __be32 word; } length; }; @@ -333,9 +333,9 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, if (va == NULL) return H_PARAMETER; if (subfunc == H_VPA_REG_VPA) - len = ((struct reg_vpa *)va)->length.hword; + len = be16_to_cpu(((struct reg_vpa *)va)->length.hword); else - len = ((struct reg_vpa *)va)->length.word; + len = be32_to_cpu(((struct reg_vpa *)va)->length.word); kvmppc_unpin_guest_page(kvm, va, vpa, false); /* Check length */ @@ -540,18 +540,18 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, return; memset(dt, 0, sizeof(struct dtl_entry)); dt->dispatch_reason = 7; - dt->processor_id = vc->pcpu + vcpu->arch.ptid; - dt->timebase = now + vc->tb_offset; - dt->enqueue_to_dispatch_time = stolen; - dt->srr0 = kvmppc_get_pc(vcpu); - dt->srr1 = vcpu->arch.shregs.msr; + dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid); + 
dt->timebase = cpu_to_be64(now + vc->tb_offset); + dt->enqueue_to_dispatch_time = cpu_to_be32(stolen); + dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu)); + dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr); ++dt; if (dt == vcpu->arch.dtl.pinned_end) dt = vcpu->arch.dtl.pinned_addr; vcpu->arch.dtl_ptr = dt; /* order writing *dt vs. writing vpa->dtl_idx */ smp_wmb(); - vpa->dtl_idx = ++vcpu->arch.dtl_index; + vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index); vcpu->arch.dtl.dirty = true; } diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 768a9f9..6a3ba8a 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -45,14 +45,14 @@ static void reload_slb(struct kvm_vcpu *vcpu) return; /* Sanity check */ - n = min_t(u32, slb->persistent, SLB_MIN_SIZE); + n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE); if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end) return; /* Load up the SLB from that */ for (i = 0; i < n; ++i) { - unsigned long rb = slb->save_area[i].esid; - unsigned long rs = slb->save_area[i].vsid; + unsigned long rb = be64_to_cpu(slb->save_area[i].esid); + unsigned long rs = be64_to_cpu(slb->save_area[i].vsid); rb = (rb & ~0xFFFul) | i; /* insert entry number */ asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/7] KVM: PPC: Book3S HV: Enable for little endian hosts
Now that we've fixed all the issues that HV KVM code had on little endian hosts, we can enable it in the kernel configuration for users to play with. Signed-off-by: Alexander Graf --- arch/powerpc/kvm/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index d6a53b9..8aeeda1 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -75,7 +75,6 @@ config KVM_BOOK3S_64 config KVM_BOOK3S_64_HV tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 - depends on !CPU_LITTLE_ENDIAN select KVM_BOOK3S_HV_POSSIBLE select MMU_NOTIFIER select CMA -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/7] KVM: PPC: Book3S HV: Access host lppaca and shadow slb in BE
Some data structures are always stored in big endian. Among those are the LPPACA fields as well as the shadow slb. These structures might be shared with a hypervisor. So whenever we access those fields, make sure we do so in big endian byte order. Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7e7fa01..75c7e22 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -32,10 +32,6 @@ #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) -#ifdef __LITTLE_ENDIAN__ -#error Need to fix lppaca and SLB shadow accesses in little endian mode -#endif - /* Values in HSTATE_NAPPING(r13) */ #define NAPPING_CEDE 1 #define NAPPING_NOVCPU 2 @@ -595,9 +591,10 @@ kvmppc_got_guest: ld r3, VCPU_VPA(r4) cmpdi r3, 0 beq 25f - lwz r5, LPPACA_YIELDCOUNT(r3) + li r6, LPPACA_YIELDCOUNT + LWZX_BE r5, r3, r6 addir5, r5, 1 - stw r5, LPPACA_YIELDCOUNT(r3) + STWX_BE r5, r3, r6 li r6, 1 stb r6, VCPU_VPA_DIRTY(r4) 25: @@ -1442,9 +1439,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM) ld r8, VCPU_VPA(r9)/* do they have a VPA? */ cmpdi r8, 0 beq 25f - lwz r3, LPPACA_YIELDCOUNT(r8) + li r4, LPPACA_YIELDCOUNT + LWZX_BE r3, r8, r4 addir3, r3, 1 - stw r3, LPPACA_YIELDCOUNT(r8) + STWX_BE r3, r8, r4 li r3, 1 stb r3, VCPU_VPA_DIRTY(r9) 25: @@ -1757,8 +1755,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 33:ld r8,PACA_SLBSHADOWPTR(r13) .rept SLB_NUM_BOLTED - ld r5,SLBSHADOW_SAVEAREA(r8) - ld r6,SLBSHADOW_SAVEAREA+8(r8) + li r3, SLBSHADOW_SAVEAREA + LDX_BE r5, r8, r3 + addir3, r3, 8 + LDX_BE r6, r8, r3 andis. r7,r5,SLB_ESID_V@h beq 1f slbmte r6,r5 -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/7] KVM: PPC: Book3S HV: Access XICS in BE
On the exit path from the guest we check what type of interrupt we received if we received one. This means we're doing hardware access to the XICS interrupt controller. However, when running on a little endian system, this access is byte reversed. So let's make sure to swizzle the bytes back again and virtually make XICS accesses big endian. Signed-off-by: Alexander Graf --- v1 -> v2: - Make code easier to follow and use hardware for bswap --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 18 ++ 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 75c7e22..1a71f60 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2325,7 +2325,18 @@ kvmppc_read_intr: cmpdi r6, 0 beq-1f lwzcix r0, r6, r7 - rlwinm. r3, r0, 0, 0xff + /* +* Save XIRR for later. Since we get in in reverse endian on LE +* systems, save it byte reversed and fetch it back in host endian. +*/ + li r3, HSTATE_SAVED_XIRR + STWX_BE r0, r3, r13 +#ifdef __LITTLE_ENDIAN__ + lwz r3, HSTATE_SAVED_XIRR(r13) +#else + mr r3, r0 +#endif + rlwinm. r3, r3, 0, 0xff sync beq 1f /* if nothing pending in the ICP */ @@ -2357,10 +2368,9 @@ kvmppc_read_intr: li r3, -1 1: blr -42:/* It's not an IPI and it's for the host, stash it in the PACA -* before exit, it will be picked up by the host ICP driver +42:/* It's not an IPI and it's for the host. We saved a copy of XIRR in +* the PACA earlier, it will be picked up by the host ICP driver */ - stw r0, HSTATE_SAVED_XIRR(r13) li r3, 1 b 1b -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/7] PPC: Add asm helpers for BE 32bit load/store
>From assembly code we might not only have to explicitly BE access 64bit values, but sometimes also 32bit ones. Add helpers that allow for easy use of lwzx/stwx in their respective byte-reverse or native form. Signed-off-by: Alexander Graf CC: Benjamin Herrenschmidt --- v1 -> v2: - fix typo in STWX_BE --- arch/powerpc/include/asm/asm-compat.h | 4 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h index 4b237aa..21be8ae 100644 --- a/arch/powerpc/include/asm/asm-compat.h +++ b/arch/powerpc/include/asm/asm-compat.h @@ -34,10 +34,14 @@ #define PPC_MIN_STKFRM 112 #ifdef __BIG_ENDIAN__ +#define LWZX_BEstringify_in_c(lwzx) #define LDX_BE stringify_in_c(ldx) +#define STWX_BEstringify_in_c(stwx) #define STDX_BEstringify_in_c(stdx) #else +#define LWZX_BEstringify_in_c(lwbrx) #define LDX_BE stringify_in_c(ldbrx) +#define STWX_BEstringify_in_c(stwbrx) #define STDX_BEstringify_in_c(stdbrx) #endif -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/7] KVM: PPC: Book3S HV: Enable on little endian hosts
So far we've been able to successfully run HV KVM on big endian hosts, but once you dive into little endian land things start to fall apart. This patch set enables HV KVM for little endian hosts. This should be the final piece left missing to get little endian systems fully on par with big endian ones in the KVM world - modulo bugs. For now guest threading support is still slightly flaky, but I'm sure that's only a minor breakage somewhere that we'll find soon. v1 -> v2: - fix typo in STWX_BE - Add __be hints - Fix H_REMOVE - Fix dtl_idx - Make XICS code easier to follow and use memory for bswap Alex Alexander Graf (7): PPC: Add asm helpers for BE 32bit load/store KVM: PPC: Book3S HV: Make HTAB code LE host aware KVM: PPC: Book3S HV: Access guest VPA in BE KVM: PPC: Book3S HV: Access host lppaca and shadow slb in BE KVM: PPC: Book3S HV: Access XICS in BE KVM: PPC: Book3S HV: Fix ABIv2 on LE KVM: PPC: Book3S HV: Enable for little endian hosts arch/powerpc/include/asm/asm-compat.h| 4 + arch/powerpc/include/asm/kvm_book3s.h| 4 +- arch/powerpc/include/asm/kvm_book3s_64.h | 15 +++- arch/powerpc/kvm/Kconfig | 1 - arch/powerpc/kvm/book3s_64_mmu_hv.c | 128 ++- arch/powerpc/kvm/book3s_hv.c | 22 ++--- arch/powerpc/kvm/book3s_hv_ras.c | 6 +- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 146 ++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 60 - 9 files changed, 220 insertions(+), 166 deletions(-) -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/7] KVM: PPC: Book3S HV: Fix ABIv2 on LE
We use ABIv2 on Little Endian systems which gets rid of the dotted function names. Branch to the actual functions when we see such a system. Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 22 ++ 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 1a71f60..1ff3ebd 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -36,6 +36,12 @@ #define NAPPING_CEDE 1 #define NAPPING_NOVCPU 2 +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else +#define FUNC(name) GLUE(.,name) +#endif + /* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. @@ -668,9 +674,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM) mr r31, r4 addir3, r31, VCPU_FPRS_TM - bl .load_fp_state + bl FUNC(load_fp_state) addir3, r31, VCPU_VRS_TM - bl .load_vr_state + bl FUNC(load_vr_state) mr r4, r31 lwz r7, VCPU_VRSAVE_TM(r4) mtspr SPRN_VRSAVE, r7 @@ -1414,9 +1420,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM) /* Save FP/VSX. 
*/ addir3, r9, VCPU_FPRS_TM - bl .store_fp_state + bl FUNC(store_fp_state) addir3, r9, VCPU_VRS_TM - bl .store_vr_state + bl FUNC(store_vr_state) mfspr r6, SPRN_VRSAVE stw r6, VCPU_VRSAVE_TM(r9) 1: @@ -2405,11 +2411,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) mtmsrd r8 isync addir3,r3,VCPU_FPRS - bl .store_fp_state + bl FUNC(store_fp_state) #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION addir3,r31,VCPU_VRS - bl .store_vr_state + bl FUNC(store_vr_state) END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif mfspr r6,SPRN_VRSAVE @@ -2441,11 +2447,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) mtmsrd r8 isync addir3,r4,VCPU_FPRS - bl .load_fp_state + bl FUNC(load_fp_state) #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION addir3,r31,VCPU_VRS - bl .load_vr_state + bl FUNC(load_vr_state) END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif lwz r7,VCPU_VRSAVE(r31) -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] KVM: x86: Wrong emulation on 'xadd X, X'
Il 17/06/2014 17:35, Bandan Das ha scritto: Well, I meant it more as a review and "suggested changes" to this patchset Nadav posted, but yeah, if you prefer, I can post a change myself. I will make a pass through other uses of BUG() in the code too. I'd prefer that, there's no need to make Nadav do even more work. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Hang on reboot in FreeBSD guest on Linux KVM host
On Jun 17, 2014, at 12:05 AM, Gleb Natapov wrote: > On Tue, Jun 17, 2014 at 06:21:23AM +0200, Paolo Bonzini wrote: >> Il 16/06/2014 18:47, John Nielsen ha scritto: >>> On Jun 16, 2014, at 10:39 AM, Paolo Bonzini wrote: >>> Il 16/06/2014 18:09, John Nielsen ha scritto: >>> The only substantial difference on the hardware side is the CPU. >>> The hosts where the problem occurs use "Intel(R) Xeon(R) CPU >>> E5-2650 v2 @ 2.60GHz", while the hosts that don't show the >>> problem use the prior revision, "Intel(R) Xeon(R) CPU E5-2650 0 @ >>> 2.00GHz". Can you do "grep . /sys/module/kvm_intel/parameters/*" on both hosts please? >>> >>> No differences that I can see. Output below. >> >> Not really: >> >>> Working host: >>> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz >>> # grep . /sys/module/kvm_intel/parameters/* >>> /sys/module/kvm_intel/parameters/enable_apicv:N >>> >>> Problem host: >>> Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz >>> # grep . /sys/module/kvm_intel/parameters/* >>> /sys/module/kvm_intel/parameters/enable_apicv:Y >> >> So we have a clue. Let me study the code more, I'll try to get back with a >> suggestion. Wow, can't believe I missed that. Good catch! > Does disabling apicv on E5-2650 v2 make reboot problem go away? Yes it does! # modprobe kvm_intel /sys/module/kvm_intel/parameters/enable_apicv:Y # /usr/bin/qemu-system-x86_64 -machine accel=kvm -name bsdtest -m 512 -smp 2,sockets=1,cores=1,threads=2 -drive file=./20140613_FreeBSD_9.2-RELEASE_ufs.qcow2,if=none,id=drive0,format=qcow2 -device virtio-blk-pci,scsi=off,drive=drive0 -vnc 0.0.0.0:0 -net none [problem occurs] # rmmod kvm_intel # modprobe kvm_intel enable_apicv=N /sys/module/kvm_intel/parameters/enable_apicv:N # /usr/bin/qemu-system-x86_64 -machine accel=kvm -name bsdtest -m 512 -smp 2,sockets=1,cores=1,threads=2 -drive file=./20140613_FreeBSD_9.2-RELEASE_ufs.qcow2,if=none,id=drive0,format=qcow2 -device virtio-blk-pci,scsi=off,drive=drive0 -vnc 0.0.0.0:0 -net none [problem does not occur] Thank you. 
This both narrows the problem considerably and provides an acceptable workaround. It would still be nice to see it fixed, of course. Keep me CC'ed as I'm not on the KVM list. JN -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PULL 044/103] Add kvm_eventfds_enabled function
From: Nikolay Nikolaev Add a function to check if the eventfd capability is present in KVM in the host kernel. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Paolo Bonzini --- include/sysemu/kvm.h | 11 +++ kvm-all.c| 4 kvm-stub.c | 1 + 3 files changed, 16 insertions(+) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index e79e92c..c4556ad 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -43,6 +43,7 @@ extern bool kvm_allowed; extern bool kvm_kernel_irqchip; extern bool kvm_async_interrupts_allowed; extern bool kvm_halt_in_kernel_allowed; +extern bool kvm_eventfds_allowed; extern bool kvm_irqfds_allowed; extern bool kvm_msi_via_irqfd_allowed; extern bool kvm_gsi_routing_allowed; @@ -83,6 +84,15 @@ extern bool kvm_readonly_mem_allowed; #define kvm_halt_in_kernel() (kvm_halt_in_kernel_allowed) /** + * kvm_eventfds_enabled: + * + * Returns: true if we can use eventfds to receive notifications + * from a KVM CPU (ie the kernel supports eventds and we are running + * with a configuration where it is meaningful to use them). 
+ */ +#define kvm_eventfds_enabled() (kvm_eventfds_allowed) + +/** * kvm_irqfds_enabled: * * Returns: true if we can use irqfds to inject interrupts into @@ -128,6 +138,7 @@ extern bool kvm_readonly_mem_allowed; #define kvm_irqchip_in_kernel() (false) #define kvm_async_interrupts_enabled() (false) #define kvm_halt_in_kernel() (false) +#define kvm_eventfds_enabled() (false) #define kvm_irqfds_enabled() (false) #define kvm_msi_via_irqfd_enabled() (false) #define kvm_gsi_routing_allowed() (false) diff --git a/kvm-all.c b/kvm-all.c index 4e19eff..92f56d8 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -113,6 +113,7 @@ KVMState *kvm_state; bool kvm_kernel_irqchip; bool kvm_async_interrupts_allowed; bool kvm_halt_in_kernel_allowed; +bool kvm_eventfds_allowed; bool kvm_irqfds_allowed; bool kvm_msi_via_irqfd_allowed; bool kvm_gsi_routing_allowed; @@ -1541,6 +1542,9 @@ int kvm_init(MachineClass *mc) (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); #endif +kvm_eventfds_allowed = +(kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); + ret = kvm_arch_init(s); if (ret < 0) { goto err; diff --git a/kvm-stub.c b/kvm-stub.c index ac33d86..8e7737c 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -22,6 +22,7 @@ KVMState *kvm_state; bool kvm_kernel_irqchip; bool kvm_async_interrupts_allowed; +bool kvm_eventfds_allowed; bool kvm_irqfds_allowed; bool kvm_msi_via_irqfd_allowed; bool kvm_gsi_routing_allowed; -- MST -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH kvm-unit-tests] report: add report_xfail for expected failures
Il 17/06/2014 16:36, Andrew Jones ha scritto: Add report_xfail(), which is report(), but with another condition allowing it to output PASS/FAIL/XPASS/XFAIL, rather than only PASS/FAIL. This allows report output to stay more consistent between systems/configurations that may or may not support all tests. Signed-off-by: Andrew Jones --- lib/libcflat.h | 1 + lib/report.c | 37 + 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/lib/libcflat.h b/lib/libcflat.h index cb6663d33ef5a..c9577350ec275 100644 --- a/lib/libcflat.h +++ b/lib/libcflat.h @@ -63,5 +63,6 @@ extern long atol(const char *ptr); #define NULL ((void *)0UL) void report(const char *msg_fmt, bool pass, ...); +void report_xfail(const char *msg_fmt, bool xfail, bool pass, ...); int report_summary(void); #endif diff --git a/lib/report.c b/lib/report.c index ff562a13c541e..43d7a65ec8812 100644 --- a/lib/report.c +++ b/lib/report.c @@ -5,32 +5,53 @@ * * Authors: * Jan Kiszka + * Andrew Jones * * This work is licensed under the terms of the GNU LGPL, version 2. */ #include "libcflat.h" -static unsigned int tests, failures; +static unsigned int tests, failures, xfailures; -void report(const char *msg_fmt, bool pass, ...) +void va_report_xfail(const char *msg_fmt, bool xfail, bool cond, va_list va) { + char *pass = xfail ? "XPASS" : "PASS"; + char *fail = xfail ? "XFAIL" : "FAIL"; char buf[2000]; - va_list va; tests++; - printf("%s: ", pass ? "PASS" : "FAIL"); - va_start(va, pass); + printf("%s: ", cond ? pass : fail); vsnprintf(buf, sizeof(buf), msg_fmt, va); - va_end(va); puts(buf); puts("\n"); - if (!pass) + if (xfail && cond) + failures++; + else if (xfail) + xfailures++; + else if (!cond) failures++; } +void report(const char *msg_fmt, bool pass, ...) +{ + va_list va; + va_start(va, pass); + va_report_xfail(msg_fmt, false, pass, va); + va_end(va); +} + +void report_xfail(const char *msg_fmt, bool xfail, bool pass, ...) 
+{ + va_list va; + va_start(va, pass); + va_report_xfail(msg_fmt, xfail, pass, va); + va_end(va); +} + int report_summary(void) { - printf("\nSUMMARY: %d tests, %d failures\n", tests, failures); + printf("\nSUMMARY: %d tests, %d failures, %d xfailures\n", + tests, failures, xfailures); return failures > 0 ? 1 : 0; } Looks good, will apply in the next few days. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm-unit-tests/x86: fix unittests.cfg typo
Il 17/04/2014 16:17, Andrew Jones ha scritto: Signed-off-by: Andrew Jones --- x86/unittests.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x86/unittests.cfg b/x86/unittests.cfg index 7930c026a38d6..d78fe0eafe2b6 100644 --- a/x86/unittests.cfg +++ b/x86/unittests.cfg @@ -156,5 +156,5 @@ extra_params = -cpu host,+vmx arch = x86_64 [debug] -file = debug.flag +file = debug.flat arch = x86_64 Applying, thanks. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, June 17, 2014 6:36 PM > To: Caraman Mihai Claudiu-B02008 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org > Subject: Re: [PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation > condition on vcpu schedule > > On Tue, 2014-06-17 at 12:02 +0300, Mihai Caraman wrote: > > On vcpu schedule, the condition checked for tlb pollution is too loose. > > The tlb entries of a vcpu become polluted (vs stale) only when a > different > > vcpu within the same logical partition runs in-between. Optimize the > tlb > > invalidation condition taking into account the logical partition id. > > > > With the new invalidation condition, a guest shows 4% performance > improvement > > on P5020DS while running a memory stress application with the cpu > oversubscribed, > > the other guest running a cpu intensive workload. > > See > https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-June/118547.html Thanks. The original code needs just a simple adjustment to benefit from this optimization, please review v3. - Mike
Re: [PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On Tue, 2014-06-17 at 14:04 -0500, Caraman Mihai Claudiu-B02008 wrote: > > -Original Message- > > From: Wood Scott-B07421 > > Sent: Tuesday, June 17, 2014 6:36 PM > > To: Caraman Mihai Claudiu-B02008 > > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > > d...@lists.ozlabs.org > > Subject: Re: [PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation > > condition on vcpu schedule > > > > On Tue, 2014-06-17 at 12:02 +0300, Mihai Caraman wrote: > > > On vcpu schedule, the condition checked for tlb pollution is too loose. > > > The tlb entries of a vcpu become polluted (vs stale) only when a > > different > > > vcpu within the same logical partition runs in-between. Optimize the > > tlb > > > invalidation condition taking into account the logical partition id. > > > > > > With the new invalidation condition, a guest shows 4% performance > > improvement > > > on P5020DS while running a memory stress application with the cpu > > oversubscribed, > > > the other guest running a cpu intensive workload. > > > > See > > https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-June/118547.html > > Thanks. The original code needs just a simple adjustment to benefit from > this optimization, please review v3. Where is v3? Or is it forthcoming? -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On vcpu schedule, the condition checked for tlb pollution is too loose. The tlb entries of a vcpu become polluted (vs stale) only when a different vcpu within the same logical partition runs in-between. Optimize the tlb invalidation condition keeping last_vcpu_on_cpu per logical partition id. With the new invalidation condition, a guest shows 4% performance improvement on P5020DS while running a memory stress application with the cpu oversubscribed, the other guest running a cpu intensive workload. Guest - old invalidation condition real 3.89 user 3.87 sys 0.01 Guest - enhanced invalidation condition real 3.75 user 3.73 sys 0.01 Host real 3.70 user 1.85 sys 0.00 The memory stress application accesses 4KB pages backed by 75% of available TLB0 entries: char foo[ENTRIES][4096] __attribute__ ((aligned (4096))); int main() { char bar; int i, j; for (i = 0; i < ITERATIONS; i++) for (j = 0; j < ENTRIES; j++) bar = foo[j][0]; return 0; } Signed-off-by: Mihai Caraman Cc: Scott Wood --- v3: - use existing logic while keeping last_vcpu_per_cpu per lpid v2: - improve patch name and description - add performance results arch/powerpc/kvm/e500mc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 17e4562..95e33e3 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -110,7 +110,7 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) { } -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], last_vcpu_on_cpu); static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) { @@ -141,9 +141,9 @@ static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_GESR, vcpu->arch.shared->esr); if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || - __get_cpu_var(last_vcpu_on_cpu) != vcpu) { + __get_cpu_var(last_vcpu_on_cpu)[vcpu->kvm->arch.lpid] != vcpu) { kvmppc_e500_tlbil_all(vcpu_e500); - 
__get_cpu_var(last_vcpu_on_cpu) = vcpu; + __get_cpu_var(last_vcpu_on_cpu)[vcpu->kvm->arch.lpid] = vcpu; } kvmppc_load_guest_fp(vcpu); -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On Tue, 2014-06-17 at 22:09 +0300, Mihai Caraman wrote: > On vcpu schedule, the condition checked for tlb pollution is too loose. > The tlb entries of a vcpu become polluted (vs stale) only when a different > vcpu within the same logical partition runs in-between. Optimize the tlb > invalidation condition keeping last_vcpu_on_cpu per logical partition id. > > With the new invalidation condition, a guest shows 4% performance improvement > on P5020DS while running a memory stress application with the cpu > oversubscribed, > the other guest running a cpu intensive workload. > > Guest - old invalidation condition > real 3.89 > user 3.87 > sys 0.01 > > Guest - enhanced invalidation condition > real 3.75 > user 3.73 > sys 0.01 > > Host > real 3.70 > user 1.85 > sys 0.00 > > The memory stress application accesses 4KB pages backed by 75% of available > TLB0 entries: > > char foo[ENTRIES][4096] __attribute__ ((aligned (4096))); > > int main() > { > char bar; > int i, j; > > for (i = 0; i < ITERATIONS; i++) > for (j = 0; j < ENTRIES; j++) > bar = foo[j][0]; > > return 0; > } > > Signed-off-by: Mihai Caraman > Cc: Scott Wood > --- > v3: > - use existing logic while keeping last_vcpu_per_cpu per lpid > > v2: > - improve patch name and description > - add performance results > > > arch/powerpc/kvm/e500mc.c | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c > index 17e4562..95e33e3 100644 > --- a/arch/powerpc/kvm/e500mc.c > +++ b/arch/powerpc/kvm/e500mc.c > @@ -110,7 +110,7 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 > old_msr) > { > } > > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], last_vcpu_on_cpu); Hmm, I didn't know you could express types like that. Is this special syntax that only works for typeof? 
No space after * Name should be adjusted to match, something like last_vcpu_of_lpid (with the _on_cpu being implied by the fact that it's PER_CPU). -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
> > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], > last_vcpu_on_cpu); > > Hmm, I didn't know you could express types like that. Is this special > syntax that only works for typeof? Yes, AFAIK. > No space after * Checkpatch complains about the missing space ;) > > Name should be adjusted to match, something like last_vcpu_of_lpid (with > the _on_cpu being implied by the fact that it's PER_CPU). I was thinking to the long name but it was not appealing, I will change it to last_vcpu_of_lpid. -Mike
Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On Tue, 2014-06-17 at 14:42 -0500, Caraman Mihai Claudiu-B02008 wrote: > > > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > > > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], > > last_vcpu_on_cpu); > > > > Hmm, I didn't know you could express types like that. Is this special > > syntax that only works for typeof? > > Yes, AFAIK. > > > No space after * > > Checkpatch complains about the missing space ;) Checkpatch is wrong, which isn't surprising given that this is unusual syntax. We don't normally put a space after * when used to represent a pointer. -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, June 17, 2014 10:48 PM > To: Caraman Mihai Claudiu-B02008 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org > Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation > condition on vcpu schedule > > On Tue, 2014-06-17 at 14:42 -0500, Caraman Mihai Claudiu-B02008 wrote: > > > > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > > > > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], > > > last_vcpu_on_cpu); > > > > > > Hmm, I didn't know you could express types like that. Is this > special > > > syntax that only works for typeof? > > > > Yes, AFAIK. > > > > > No space after * > > > > Checkpatch complains about the missing space ;) > > Checkpatch is wrong, which isn't surprising given that this is unusual > syntax. We don't normally put a space after * when used to represent a > pointer. This is not something new. See [PATCH 04/10] percpu: cleanup percpu array definitions: https://lkml.org/lkml/2009/6/24/26 -Mike -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 01/11] qspinlock: A simple generic 4-byte queue spinlock
> > + new = tail | (val & _Q_LOCKED_MASK); > > + > > + old = atomic_cmpxchg(&lock->val, val, new); > > + if (old == val) > > + break; > > + > > + val = old; > > + } > > + > > + /* > > +* we won the trylock; forget about queueing. > > +*/ > > + if (new == _Q_LOCKED_VAL) > > + goto release; > > + > > + /* > > +* if there was a previous node; link it and wait. > > +*/ > > + if (old & ~_Q_LOCKED_MASK) { > > + prev = decode_tail(old); > > + ACCESS_ONCE(prev->next) = node; > > + > > + arch_mcs_spin_lock_contended(&node->locked); Could you add a comment here: /* We are spinning forever until the previous node updates locked - which it does once the it has updated lock->val with our tail number. */ > > + } > > + > > + /* > > +* we're at the head of the waitqueue, wait for the owner to go away. > > +* > > +* *,x -> *,0 > > +*/ > > + while ((val = atomic_read(&lock->val)) & _Q_LOCKED_MASK) > > + cpu_relax(); > > + > > + /* > > +* claim the lock: > > +* > > +* n,0 -> 0,1 : lock, uncontended > > +* *,0 -> *,1 : lock, contended > > +*/ > > + for (;;) { > > + new = _Q_LOCKED_VAL; > > + if (val != tail) > > + new |= val; > ..snip.. > > Could you help a bit in explaining it in English please? After looking at the assembler code I finally figured out how we can get here. And the 'contended' part threw me off. Somehow I imagined there are two more more CPUs stampeding here and trying to update the lock->val. But in reality the other CPUs are stuck in the arch_mcs_spin_lock_contended spinning on their local value. Perhaps you could add this comment. /* Once queue_spin_unlock is called (which _subtracts_ _Q_LOCKED_VAL from the lock->val and still preserving the tail data), the winner gets to claim the ticket. Since we still need the other CPUs to continue and preserve the strict ordering in which they setup node->next, we: 1) update lock->val to the tail value (so tail CPU and its index) with _Q_LOCKED_VAL. 2). 
Once we are done, we poke the other CPU (the one that linked to us) by writing to node->locked (below) so they can make progress and loop on lock->val changing from _Q_LOCKED_MASK to zero). */ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On Tue, 2014-06-17 at 15:02 -0500, Caraman Mihai Claudiu-B02008 wrote: > > -Original Message- > > From: Wood Scott-B07421 > > Sent: Tuesday, June 17, 2014 10:48 PM > > To: Caraman Mihai Claudiu-B02008 > > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > > d...@lists.ozlabs.org > > Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation > > condition on vcpu schedule > > > > On Tue, 2014-06-17 at 14:42 -0500, Caraman Mihai Claudiu-B02008 wrote: > > > > > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > > > > > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], > > > > last_vcpu_on_cpu); > > > > > > > > Hmm, I didn't know you could express types like that. Is this > > special > > > > syntax that only works for typeof? > > > > > > Yes, AFAIK. > > > > > > > No space after * > > > > > > Checkpatch complains about the missing space ;) > > > > Checkpatch is wrong, which isn't surprising given that this is unusual > > syntax. We don't normally put a space after * when used to represent a > > pointer. > > This is not something new. See [PATCH 04/10] percpu: cleanup percpu array > definitions: > > https://lkml.org/lkml/2009/6/24/26 I didn't say it was new, just unusual, and checkpatch doesn't recognize it. Checkpatch shouldn't be blindly and silently obeyed when it says something strange. -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 01/11] qspinlock: A simple generic 4-byte queue spinlock
> + * The basic principle of a queue-based spinlock can best be understood > + * by studying a classic queue-based spinlock implementation called the > + * MCS lock. The paper below provides a good description for this kind > + * of lock. > + * > + * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf > + * > + * This queue spinlock implementation is based on the MCS lock, however to > make > + * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing > + * API, we must modify it some. > + * > + * In particular; where the traditional MCS lock consists of a tail pointer > + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to > + * unlock the next pending (next->locked), we compress both these: {tail, > + * next->locked} into a single u32 value. > + * > + * Since a spinlock disables recursion of its own context and there is a > limit > + * to the contexts that can nest; namely: task, softirq, hardirq, nmi, we can > + * encode the tail as and index indicating this context and a cpu number. > + * > + * We can further change the first spinner to spin on a bit in the lock word > + * instead of its node; whereby avoiding the need to carry a node from lock > to > + * unlock, and preserving API. You also made changes (compared to the MCS) in that the unlock path is not spinning waiting for the successor and that the job of passing the lock is not done in the unlock path either. Instead all of that is now done in the path of the lock acquirer logic. Could you update the comment to say that please? Thanks. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, June 17, 2014 11:05 PM > To: Caraman Mihai Claudiu-B02008 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org > Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation > condition on vcpu schedule > > On Tue, 2014-06-17 at 15:02 -0500, Caraman Mihai Claudiu-B02008 wrote: > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Tuesday, June 17, 2014 10:48 PM > > > To: Caraman Mihai Claudiu-B02008 > > > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > > > d...@lists.ozlabs.org > > > Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation > > > condition on vcpu schedule > > > > > > On Tue, 2014-06-17 at 14:42 -0500, Caraman Mihai Claudiu-B02008 > wrote: > > > > > > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > > > > > > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], > > > > > last_vcpu_on_cpu); > > > > > > > > > > Hmm, I didn't know you could express types like that. Is this > > > special > > > > > syntax that only works for typeof? > > > > > > > > Yes, AFAIK. > > > > > > > > > No space after * > > > > > > > > Checkpatch complains about the missing space ;) > > > > > > Checkpatch is wrong, which isn't surprising given that this is > unusual > > > syntax. We don't normally put a space after * when used to represent > a > > > pointer. > > > > This is not something new. See [PATCH 04/10] percpu: cleanup percpu > array > > definitions: > > > > https://lkml.org/lkml/2009/6/24/26 > > I didn't say it was new, just unusual, and checkpatch doesn't recognize > it. Checkpatch shouldn't be blindly and silently obeyed when it says > something strange. I agree with you about the syntax and I know other cases where checkpatch is a moron. For similar corner cases checkpatch maintainers did not wanted (or found it difficult) to make an exception. I would also like to see Alex opinion on this. -Mike
Re: [PATCH 03/11] qspinlock: Add pending bit
On Sun, Jun 15, 2014 at 02:47:00PM +0200, Peter Zijlstra wrote: > Because the qspinlock needs to touch a second cacheline; add a pending > bit and allow a single in-word spinner before we punt to the second > cacheline. Could you add this in the description please: And by second cacheline we mean the local 'node'. That is the: mcs_nodes[0] and mcs_nodes[idx] Perhaps it might be better then to split this in the header file as this is trying to not be a slowpath code - but rather - a pre-slow-path-lets-try-if-we can do another cmpxchg in case the unlocker has just unlocked itself. So something like: diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index e8a7ae8..29cc9c7 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -75,11 +75,21 @@ extern void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val); */ static __always_inline void queue_spin_lock(struct qspinlock *lock) { - u32 val; + u32 val, new; val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); if (likely(val == 0)) return; + + /* One more attempt - but if we fail mark it as pending. */ + if (val == _Q_LOCKED_VAL) { + new = Q_LOCKED_VAL |_Q_PENDING_VAL; + + old = atomic_cmpxchg(&lock->val, val, new); + if (old == _Q_LOCKED_VAL) /* YEEY! */ + return; + val = old; + } queue_spin_lock_slowpath(lock, val); } and then the slowpath preserves most of the old logic path (with the pending bit stuff)? 
> > Signed-off-by: Peter Zijlstra > --- > include/asm-generic/qspinlock_types.h | 12 ++- > kernel/locking/qspinlock.c| 109 > +++--- > 2 files changed, 97 insertions(+), 24 deletions(-) > > --- a/include/asm-generic/qspinlock_types.h > +++ b/include/asm-generic/qspinlock_types.h > @@ -39,8 +39,9 @@ typedef struct qspinlock { > * Bitfields in the atomic value: > * > * 0- 7: locked byte > - * 8- 9: tail index > - * 10-31: tail cpu (+1) > + * 8: pending > + * 9-10: tail index > + * 11-31: tail cpu (+1) > */ > #define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ > << _Q_ ## type ## _OFFSET) > @@ -48,7 +49,11 @@ typedef struct qspinlock { > #define _Q_LOCKED_BITS 8 > #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) > > -#define _Q_TAIL_IDX_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) > +#define _Q_PENDING_OFFSET(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) > +#define _Q_PENDING_BITS 1 > +#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) > + > +#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) > #define _Q_TAIL_IDX_BITS 2 > #define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) > > @@ -57,5 +62,6 @@ typedef struct qspinlock { > #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) > > #define _Q_LOCKED_VAL(1U << _Q_LOCKED_OFFSET) > +#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) > > #endif /* __ASM_GENERIC_QSPINLOCK_TYPES_H */ > --- a/kernel/locking/qspinlock.c > +++ b/kernel/locking/qspinlock.c > @@ -83,24 +83,28 @@ static inline struct mcs_spinlock *decod > return per_cpu_ptr(&mcs_nodes[idx], cpu); > } > > +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) > + > /** > * queue_spin_lock_slowpath - acquire the queue spinlock > * @lock: Pointer to queue spinlock structure > * @val: Current value of the queue spinlock 32-bit word > * > - * (queue tail, lock bit) > - * > - * fast :slow : > unlock > - *: : > - * uncontended (0,0) --:--> (0,1) :--> > (*,0) > - *: | ^./ : > - *: v \ | : > - * uncontended:(n,x) --+--> (n,0) | : > - * queue: | ^--' | : > - *: v | : > - * 
contended :(*,x) --+--> (*,0) -> (*,1) ---' : > - * queue: ^--' : > + * (queue tail, pending bit, lock bit) > * > + * fast :slow : > unlock > + * : : > + * uncontended (0,0,0) -:--> (0,0,1) --:--> > (*,*,0) > + * : | ^.--. / : > + * : v \ \| : > + * pending :(0,1,1) +--> (0,1,0) \ | : > + * : | ^--' |
Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On 17.06.14 22:36, mihai.cara...@freescale.com wrote: -Original Message- From: Wood Scott-B07421 Sent: Tuesday, June 17, 2014 11:05 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule On Tue, 2014-06-17 at 15:02 -0500, Caraman Mihai Claudiu-B02008 wrote: -Original Message- From: Wood Scott-B07421 Sent: Tuesday, June 17, 2014 10:48 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule On Tue, 2014-06-17 at 14:42 -0500, Caraman Mihai Claudiu-B02008 wrote: -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], last_vcpu_on_cpu); Hmm, I didn't know you could express types like that. Is this special syntax that only works for typeof? Yes, AFAIK. No space after * Checkpatch complains about the missing space ;) Checkpatch is wrong, which isn't surprising given that this is unusual syntax. We don't normally put a space after * when used to represent a pointer. This is not something new. See [PATCH 04/10] percpu: cleanup percpu array definitions: https://lkml.org/lkml/2009/6/24/26 I didn't say it was new, just unusual, and checkpatch doesn't recognize it. Checkpatch shouldn't be blindly and silently obeyed when it says something strange. I agree with you about the syntax and I know other cases where checkpatch is a moron. For similar corner cases checkpatch maintainers did not want (or found it difficult) to make an exception. I would also like to see Alex's opinion on this. I usually like to apply common sense :). 
Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 03/11] qspinlock: Add pending bit
On 06/17/2014 04:36 PM, Konrad Rzeszutek Wilk wrote: On Sun, Jun 15, 2014 at 02:47:00PM +0200, Peter Zijlstra wrote: Because the qspinlock needs to touch a second cacheline; add a pending bit and allow a single in-word spinner before we punt to the second cacheline. Could you add this in the description please: And by second cacheline we mean the local 'node'. That is the: mcs_nodes[0] and mcs_nodes[idx] Perhaps it might be better then to split this in the header file as this is trying to not be a slowpath code - but rather - a pre-slow-path-lets-try-if-we can do another cmpxchg in case the unlocker has just unlocked itself. So something like: diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index e8a7ae8..29cc9c7 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -75,11 +75,21 @@ extern void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val); */ static __always_inline void queue_spin_lock(struct qspinlock *lock) { - u32 val; + u32 val, new; val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); if (likely(val == 0)) return; + + /* One more attempt - but if we fail mark it as pending. */ + if (val == _Q_LOCKED_VAL) { + new = Q_LOCKED_VAL |_Q_PENDING_VAL; + + old = atomic_cmpxchg(&lock->val, val, new); + if (old == _Q_LOCKED_VAL) /* YEEY! */ + return; No, it can leave like that. The unlock path will not clear the pending bit. We are trying to make the fastpath as simple as possible as it may be inlined. The complexity of the queue spinlock is in the slowpath. Moreover, an cmpxchg followed immediately followed by another cmpxchg will just increase the level of memory contention when a lock is fairly contended. The chance of second cmpxchg() succeeding will be pretty low. -Longman -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 04/11] qspinlock: Extract out the exchange of tail code word
On Sun, Jun 15, 2014 at 02:47:01PM +0200, Peter Zijlstra wrote: > From: Waiman Long > > This patch extracts the logic for the exchange of new and previous tail > code words into a new xchg_tail() function which can be optimized in a > later patch. And also adds a third try on acquiring the lock. That I think should be a seperate patch. And instead of saying 'later patch' you should spell out the name of the patch. Especially as this might not be obvious from somebody doing git bisection. > > Signed-off-by: Waiman Long > Signed-off-by: Peter Zijlstra > --- > include/asm-generic/qspinlock_types.h |2 + > kernel/locking/qspinlock.c| 58 > +- > 2 files changed, 38 insertions(+), 22 deletions(-) > > --- a/include/asm-generic/qspinlock_types.h > +++ b/include/asm-generic/qspinlock_types.h > @@ -61,6 +61,8 @@ typedef struct qspinlock { > #define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) > #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) > > +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) > + > #define _Q_LOCKED_VAL(1U << _Q_LOCKED_OFFSET) > #define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) > > --- a/kernel/locking/qspinlock.c > +++ b/kernel/locking/qspinlock.c > @@ -86,6 +86,31 @@ static inline struct mcs_spinlock *decod > #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) > > /** > + * xchg_tail - Put in the new queue tail code word & retrieve previous one > + * @lock : Pointer to queue spinlock structure > + * @tail : The new queue tail code word > + * Return: The previous queue tail code word > + * > + * xchg(lock, tail) > + * > + * p,*,* -> n,*,* ; prev = xchg(lock, node) > + */ > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > +{ > + u32 old, new, val = atomic_read(&lock->val); > + > + for (;;) { > + new = (val & _Q_LOCKED_PENDING_MASK) | tail; > + old = atomic_cmpxchg(&lock->val, val, new); > + if (old == val) > + break; > + > + val = old; > + } > + return old; > +} > + > +/** > * queue_spin_lock_slowpath - acquire 
the queue spinlock > * @lock: Pointer to queue spinlock structure > * @val: Current value of the queue spinlock 32-bit word > @@ -182,36 +207,25 @@ void queue_spin_lock_slowpath(struct qsp > node->next = NULL; > > /* > - * we already touched the queueing cacheline; don't bother with pending > - * stuff. > - * > - * trylock || xchg(lock, node) > - * > - * 0,0,0 -> 0,0,1 ; trylock > - * p,y,x -> n,y,x ; prev = xchg(lock, node) > + * We touched a (possibly) cold cacheline in the per-cpu queue node; > + * attempt the trylock once more in the hope someone let go while we > + * weren't watching. >*/ > - for (;;) { > - new = _Q_LOCKED_VAL; > - if (val) > - new = tail | (val & _Q_LOCKED_PENDING_MASK); > - > - old = atomic_cmpxchg(&lock->val, val, new); > - if (old == val) > - break; > - > - val = old; > - } > + if (queue_spin_trylock(lock)) > + goto release; So now are three of them? One in queue_spin_lock, then at the start of this function when checking for the pending bit, and the once more here. And that is because the local cache line might be cold for the 'mcs_index' struct? That all seems to be a bit of experimental. But then we are already in the slowpath so we could as well do: for (i = 0; i < 10; i++) if (queue_spin_trylock(lock)) goto release; And would have the same effect. > > /* > - * we won the trylock; forget about queueing. > + * we already touched the queueing cacheline; don't bother with pending > + * stuff. I guess we could also just erase the pending bit if we wanted too. The optimistic spinning will still hit go to the queue label as lock->val will have the tail value. > + * > + * p,*,* -> n,*,* >*/ > - if (new == _Q_LOCKED_VAL) > - goto release; > + old = xchg_tail(lock, tail); > > /* >* if there was a previous node; link it and wait. 
>*/ > - if (old & ~_Q_LOCKED_PENDING_MASK) { > + if (old & _Q_TAIL_MASK) { > prev = decode_tail(old); > ACCESS_ONCE(prev->next) = node; > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 03/11] qspinlock: Add pending bit
On Tue, Jun 17, 2014 at 04:51:57PM -0400, Waiman Long wrote: > On 06/17/2014 04:36 PM, Konrad Rzeszutek Wilk wrote: > >On Sun, Jun 15, 2014 at 02:47:00PM +0200, Peter Zijlstra wrote: > >>Because the qspinlock needs to touch a second cacheline; add a pending > >>bit and allow a single in-word spinner before we punt to the second > >>cacheline. > >Could you add this in the description please: > > > >And by second cacheline we mean the local 'node'. That is the: > >mcs_nodes[0] and mcs_nodes[idx] > > > >Perhaps it might be better then to split this in the header file > >as this is trying to not be a slowpath code - but rather - a > >pre-slow-path-lets-try-if-we can do another cmpxchg in case > >the unlocker has just unlocked itself. > > > >So something like: > > > >diff --git a/include/asm-generic/qspinlock.h > >b/include/asm-generic/qspinlock.h > >index e8a7ae8..29cc9c7 100644 > >--- a/include/asm-generic/qspinlock.h > >+++ b/include/asm-generic/qspinlock.h > >@@ -75,11 +75,21 @@ extern void queue_spin_lock_slowpath(struct qspinlock > >*lock, u32 val); > > */ > > static __always_inline void queue_spin_lock(struct qspinlock *lock) > > { > >-u32 val; > >+u32 val, new; > > > > val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); > > if (likely(val == 0)) > > return; > >+ > >+/* One more attempt - but if we fail mark it as pending. */ > >+if (val == _Q_LOCKED_VAL) { > >+new = Q_LOCKED_VAL |_Q_PENDING_VAL; > >+ > >+old = atomic_cmpxchg(&lock->val, val, new); > >+if (old == _Q_LOCKED_VAL) /* YEEY! */ > >+return; > > No, it can leave like that. The unlock path will not clear the pending bit. Err, you are right. It needs to go back in the slowpath. > We are trying to make the fastpath as simple as possible as it may be > inlined. The complexity of the queue spinlock is in the slowpath. Sure, but then it shouldn't be called slowpath anymore as it is not slow. 
It is a combination of fast path (the potential chance of grabbing the lock and setting the pending lock) and the real slow path (the queuing). Perhaps it should be called 'queue_spinlock_complex' ? > > Moreover, an cmpxchg followed immediately followed by another cmpxchg will > just increase the level of memory contention when a lock is fairly > contended. The chance of second cmpxchg() succeeding will be pretty low. Then why even do the pending bit - which is what the slowpath does for the first time. And if it grabs it (And sets the pending bit) it immediately exits. Why not percolate that piece of code into this header. And then leave all that slow code (queuing, mcs_lock access, etc) in the slowpath. > > -Longman > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v11 14/16] pvqspinlock: Add qspinlock para-virtualization support
On Sun, Jun 15, 2014 at 03:16:54PM +0200, Peter Zijlstra wrote: > On Thu, Jun 12, 2014 at 04:48:41PM -0400, Waiman Long wrote: > > I don't have a good understanding of the kernel alternatives mechanism. > > I didn't either; I do now, cost me a whole day reading up on > alternative/paravirt code patching. > > See the patches I just send out; I got the 'native' case with paravirt > enabled to be one NOP worse than the native case without paravirt -- for > queue_spin_unlock. > > The lock slowpath is several nops and some pointless movs more expensive. You could use the asm goto which would optimize the fast path to be the 'native' case. That way you wouldn't have the the nops and movs in the path. (And asm goto also uses the alternative_asm macros). -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 03/11] qspinlock: Add pending bit
On Tue, Jun 17, 2014 at 05:07:29PM -0400, Konrad Rzeszutek Wilk wrote: > On Tue, Jun 17, 2014 at 04:51:57PM -0400, Waiman Long wrote: > > On 06/17/2014 04:36 PM, Konrad Rzeszutek Wilk wrote: > > >On Sun, Jun 15, 2014 at 02:47:00PM +0200, Peter Zijlstra wrote: > > >>Because the qspinlock needs to touch a second cacheline; add a pending > > >>bit and allow a single in-word spinner before we punt to the second > > >>cacheline. > > >Could you add this in the description please: > > > > > >And by second cacheline we mean the local 'node'. That is the: > > >mcs_nodes[0] and mcs_nodes[idx] > > > > > >Perhaps it might be better then to split this in the header file > > >as this is trying to not be a slowpath code - but rather - a > > >pre-slow-path-lets-try-if-we can do another cmpxchg in case > > >the unlocker has just unlocked itself. > > > > > >So something like: > > > > > >diff --git a/include/asm-generic/qspinlock.h > > >b/include/asm-generic/qspinlock.h > > >index e8a7ae8..29cc9c7 100644 > > >--- a/include/asm-generic/qspinlock.h > > >+++ b/include/asm-generic/qspinlock.h > > >@@ -75,11 +75,21 @@ extern void queue_spin_lock_slowpath(struct qspinlock > > >*lock, u32 val); > > > */ > > > static __always_inline void queue_spin_lock(struct qspinlock *lock) > > > { > > >- u32 val; > > >+ u32 val, new; > > > > > > val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); > > > if (likely(val == 0)) > > > return; > > >+ > > >+ /* One more attempt - but if we fail mark it as pending. */ > > >+ if (val == _Q_LOCKED_VAL) { > > >+ new = Q_LOCKED_VAL |_Q_PENDING_VAL; > > >+ > > >+ old = atomic_cmpxchg(&lock->val, val, new); > > >+ if (old == _Q_LOCKED_VAL) /* YEEY! */ > > >+ return; > > > > No, it can leave like that. The unlock path will not clear the pending bit. > > Err, you are right. It needs to go back in the slowpath. 
What I should have written is: if (old == 0) /* YEEY */ return; As that would be the same thing as this patch does on the pending bit - that is if we can on the second compare and exchange set the pending bit (and the lock) and the lock has been released - we are good. And it is a quick path. > > > We are trying to make the fastpath as simple as possible as it may be > > inlined. The complexity of the queue spinlock is in the slowpath. > > Sure, but then it shouldn't be called slowpath anymore as it is not > slow. It is a combination of fast path (the potential chance of > grabbing the lock and setting the pending lock) and the real slow > path (the queuing). Perhaps it should be called 'queue_spinlock_complex' ? > I forgot to mention - that was the crux of my comments - just change the slowpath to complex name at that point to better reflect what it does. > > > > Moreover, an cmpxchg followed immediately followed by another cmpxchg will > > just increase the level of memory contention when a lock is fairly > > contended. The chance of second cmpxchg() succeeding will be pretty low. > > Then why even do the pending bit - which is what the slowpath does > for the first time. And if it grabs it (And sets the pending bit) it > immediately exits. Why not perculate that piece of code in-to this header. > > And the leave all that slow code (queing, mcs_lock access, etc) in the > slowpath. > > > > > -Longman > > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 08/12] target-mips: Call kvm_mips_reset_vcpu() from mips_cpu_reset()
When KVM is enabled call kvm_mips_reset_vcpu() from mips_cpu_reset() as done for other targets since commit 50a2c6e55fa2 (kvm: reset state from the CPU's reset method). Signed-off-by: James Hogan Cc: Aurelien Jarno Cc: Paolo Bonzini Cc: Gleb Natapov --- Changes in v5: - New patch, based on commit 50a2c6e55fa2 (kvm: reset state from the CPU's reset method). --- target-mips/cpu.c | 8 1 file changed, 8 insertions(+) diff --git a/target-mips/cpu.c b/target-mips/cpu.c index dd954fc55a10..b3e0e6cce7b6 100644 --- a/target-mips/cpu.c +++ b/target-mips/cpu.c @@ -19,7 +19,9 @@ */ #include "cpu.h" +#include "kvm_mips.h" #include "qemu-common.h" +#include "sysemu/kvm.h" static void mips_cpu_set_pc(CPUState *cs, vaddr value) @@ -87,6 +89,12 @@ static void mips_cpu_reset(CPUState *s) tlb_flush(s, 1); cpu_state_reset(env); + +#ifndef CONFIG_USER_ONLY +if (kvm_enabled()) { +kvm_mips_reset_vcpu(cpu); +} +#endif } static void mips_cpu_realizefn(DeviceState *dev, Error **errp) -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 09/12] hw/mips: In KVM mode, inject IRQ2 (I/O) interrupts via ioctls
From: Sanjay Lal COP0 emulation is in-kernel for KVM, so inject IRQ2 (I/O) interrupts via ioctls. Signed-off-by: Sanjay Lal Signed-off-by: James Hogan Reviewed-by: Aurelien Jarno Reviewed-by: Andreas Färber --- Changes in v5: - Fix typo in subject (s/interupts/interrupts/) Changes in v3: - Pass MIPSCPU to kvm_mips_set_[ipi_]interrupt (Andreas Färber). Changes in v2: - Expand commit message - Remove #ifdef CONFIG_KVM since it's guarded by kvm_enabled() already --- hw/mips/mips_int.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/hw/mips/mips_int.c b/hw/mips/mips_int.c index 7dbd24d3d6e5..d740046ba151 100644 --- a/hw/mips/mips_int.c +++ b/hw/mips/mips_int.c @@ -23,6 +23,8 @@ #include "hw/hw.h" #include "hw/mips/cpudevs.h" #include "cpu.h" +#include "sysemu/kvm.h" +#include "kvm_mips.h" static void cpu_mips_irq_request(void *opaque, int irq, int level) { @@ -35,8 +37,17 @@ static void cpu_mips_irq_request(void *opaque, int irq, int level) if (level) { env->CP0_Cause |= 1 << (irq + CP0Ca_IP); + +if (kvm_enabled() && irq == 2) { +kvm_mips_set_interrupt(cpu, irq, level); +} + } else { env->CP0_Cause &= ~(1 << (irq + CP0Ca_IP)); + +if (kvm_enabled() && irq == 2) { +kvm_mips_set_interrupt(cpu, irq, level); +} } if (env->CP0_Cause & CP0Ca_IP_mask) { -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 06/12] kvm: Allow arch to set sigmask length
MIPS/Linux is unusual in having 128 signals rather than just 64 like most other architectures. This means its sigmask is 16 bytes instead of 8, so allow arches to override the sigmask->len value passed to the KVM_SET_SIGNAL_MASK ioctl in kvm_set_signal_mask() by calling kvm_set_sigmask_len() from kvm_arch_init(). Otherwise default to 8 bytes. Signed-off-by: James Hogan Cc: Aurelien Jarno Cc: Sanjay Lal Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Peter Maydell --- Changes in v3: - Rewrote to allow sigmask length to be set by kvm_arch_init(), so that MIPS can set it to 16 as it has 128 signals. This is better than cluttering kvm-all.c with TARGET_* ifdefs (Peter Maydell). Changes in v2: - Expand commit message - Reword comment --- include/sysemu/kvm.h | 2 ++ kvm-all.c| 11 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index e79e92c50e3f..de4bdaa40e56 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -325,6 +325,8 @@ int kvm_check_extension(KVMState *s, unsigned int extension); uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function, uint32_t index, int reg); +void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len); + #if !defined(CONFIG_USER_ONLY) int kvm_physical_memory_addr_from_host(KVMState *s, void *ram_addr, hwaddr *phys_addr); diff --git a/kvm-all.c b/kvm-all.c index 4e19eff0efee..0d485b644539 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -99,6 +99,7 @@ struct KVMState * they're not. 
Linux, glibc and *BSD all treat ioctl numbers as * unsigned, and treating them as signed here can break things */ unsigned irq_set_ioctl; +unsigned int sigmask_len; #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing *irq_routes; int nr_allocated_irq_routes; @@ -1397,6 +1398,8 @@ int kvm_init(MachineClass *mc) assert(TARGET_PAGE_SIZE <= getpagesize()); page_size_init(); +s->sigmask_len = 8; + #ifdef KVM_CAP_SET_GUEST_DEBUG QTAILQ_INIT(&s->kvm_sw_breakpoints); #endif @@ -1575,6 +1578,11 @@ err: return ret; } +void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) +{ +s->sigmask_len = sigmask_len; +} + static void kvm_handle_io(uint16_t port, void *data, int direction, int size, uint32_t count) { @@ -2095,6 +2103,7 @@ void kvm_remove_all_breakpoints(CPUState *cpu) int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) { +KVMState *s = kvm_state; struct kvm_signal_mask *sigmask; int r; @@ -2104,7 +2113,7 @@ int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset)); -sigmask->len = 8; +sigmask->len = s->sigmask_len; memcpy(sigmask->sigset, sigset, sizeof(*sigset)); r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask); g_free(sigmask); -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 12/12] MAINTAINERS: Add entry for MIPS KVM
Add MAINTAINERS entry for MIPS KVM. Signed-off-by: James Hogan --- Changes in v4: - Add MAINTAINERS entry for MIPS KVM. --- MAINTAINERS | 5 + 1 file changed, 5 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 51a6f51842be..0a637c90c679 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -176,6 +176,11 @@ M: Peter Maydell S: Maintained F: target-arm/kvm.c +MIPS +M: James Hogan +S: Maintained +F: target-mips/kvm.c + PPC M: Alexander Graf S: Maintained -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 03/12] hw/mips: Add API to convert KVM guest KSEG0 <-> GPA
From: Sanjay Lal Add API for converting physical addresses to KVM guest KSEG0 addresses, and fix the existing API for converting KSEG0 addresses to physical addresses to work in the KVM case. Both have the same sized KSEG0, so it's just a case of fixing the mask. In KVM trap and emulate mode both the guest kernel and guest userspace execute in useg: Guest User address space: 0x..0x3fff Guest Kernel Unmapped: 0x4000..0x5fff Guest Kernel Mapped:0x6000..0x7fff Signed-off-by: Sanjay Lal Signed-off-by: James Hogan Cc: Aurelien Jarno --- Changes in v5: - KSEG0 doesn't actually change size, so fix mask in cpu_mips_kseg0_to_phys() instead of having the KVM specific cpu_mips_kvm_um_kseg0_to_phys(). Changes in v2: - Expand commit message - Remove unnecessary include --- hw/mips/addr.c| 7 ++- include/hw/mips/cpudevs.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/mips/addr.c b/hw/mips/addr.c index 99488f1d2a6f..ff3b952600bb 100644 --- a/hw/mips/addr.c +++ b/hw/mips/addr.c @@ -25,10 +25,15 @@ uint64_t cpu_mips_kseg0_to_phys(void *opaque, uint64_t addr) { -return addr & 0x7fffll; +return addr & 0x1fffll; } uint64_t cpu_mips_phys_to_kseg0(void *opaque, uint64_t addr) { return addr | ~0x7fffll; } + +uint64_t cpu_mips_kvm_um_phys_to_kseg0(void *opaque, uint64_t addr) +{ +return addr | 0x4000ll; +} diff --git a/include/hw/mips/cpudevs.h b/include/hw/mips/cpudevs.h index 6bea24bf102d..b2626f2922f4 100644 --- a/include/hw/mips/cpudevs.h +++ b/include/hw/mips/cpudevs.h @@ -5,6 +5,8 @@ /* mips_addr.c */ uint64_t cpu_mips_kseg0_to_phys(void *opaque, uint64_t addr); uint64_t cpu_mips_phys_to_kseg0(void *opaque, uint64_t addr); +uint64_t cpu_mips_kvm_um_phys_to_kseg0(void *opaque, uint64_t addr); + /* mips_int.c */ void cpu_mips_irq_init_cpu(CPUMIPSState *env); -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 02/12] hw/mips/cputimer: Don't start periodic timer in KVM mode
From: Sanjay Lal Compare/Count timer interrupts are handled in-kernel for KVM. Therefore don't bother creating the timer at init time if KVM is enabled. This will conveniently avoid attempts to set the timeout when cpu_mips_store_count() is called at reset with KVM enabled, treating the timer as stopped so that CP0_Count is modified directly. Signed-off-by: Sanjay Lal [james.ho...@imgtec.com: Update after "target-mips: Reset CPU timer consistently" which moves timer start to reset time] Signed-off-by: James Hogan Cc: Aurelien Jarno Cc: Paolo Bonzini --- Changes in v5: - Update commit message and comment in cpu_mips_clock_init() after the patch "target-mips: Reset CPU timer consistently") (Paolo Bonzini). Changes in v2: - Expand commit message - Rebase on v1.7.0 - Wrap comment --- hw/mips/cputimer.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/mips/cputimer.c b/hw/mips/cputimer.c index 6900a745c60c..577c9aeab877 100644 --- a/hw/mips/cputimer.c +++ b/hw/mips/cputimer.c @@ -23,6 +23,7 @@ #include "hw/hw.h" #include "hw/mips/cpudevs.h" #include "qemu/timer.h" +#include "sysemu/kvm.h" #define TIMER_FREQ 100 * 1000 * 1000 @@ -146,5 +147,11 @@ static void mips_timer_cb (void *opaque) void cpu_mips_clock_init (CPUMIPSState *env) { -env->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &mips_timer_cb, env); +/* + * If we're in KVM mode, don't create the periodic timer, that is handled in + * kernel. + */ +if (!kvm_enabled()) { +env->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &mips_timer_cb, env); +} } -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 01/12] target-mips: Reset CPU timer consistently
The MIPS CPU timer (CP0 Count/Compare registers & QEMU timer) is reset at machine initialisation, including starting the timeout. Both registers however are placed before mvp in CPUMIPSState so they will both be zeroed on reset by the memset in mips_cpu_reset() including soon after init. This doesn't take into account that the timer may be running, in which case env->CP0_Count will represent the delta against the VM clock and the timeout will need updating. At init time (cpu_mips_clock_init()), lets only create the timer. Setting Count = 1 and starting the timer (cpu_mips_store_count()) can be done at reset time from cpu_state_reset(), which is after the memset. There is also no need to set CP0_Compare = 0 as that is already handled by the memset. Note that a reset occurs from mips_cpu_realizefn() which is before the machine init callback has had a chance to set up the CPU interrupts and the CPU timer, so env->timer will be NULL. This case is handled explicitly in cpu_mips_store_count(), treating the timer as disabled (which will also be the right thing to do when KVM support is added). Reported-by: Paolo Bonzini Signed-off-by: James Hogan Cc: Aurelien Jarno --- Changes in v5: - New patch (Paolo Bonzini). --- hw/mips/cputimer.c | 9 ++--- target-mips/translate.c | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/hw/mips/cputimer.c b/hw/mips/cputimer.c index c8b4b000cd0e..6900a745c60c 100644 --- a/hw/mips/cputimer.c +++ b/hw/mips/cputimer.c @@ -85,7 +85,12 @@ uint32_t cpu_mips_get_count (CPUMIPSState *env) void cpu_mips_store_count (CPUMIPSState *env, uint32_t count) { -if (env->CP0_Cause & (1 << CP0Ca_DC)) +/* + * This gets called from cpu_state_reset(), potentially before timer init. + * So env->timer may be NULL, which is also the case with KVM enabled so + * treat timer as disabled in that case. 
+ */ +if (env->CP0_Cause & (1 << CP0Ca_DC) || !env->timer) env->CP0_Count = count; else { /* Store new count register */ @@ -142,6 +147,4 @@ static void mips_timer_cb (void *opaque) void cpu_mips_clock_init (CPUMIPSState *env) { env->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &mips_timer_cb, env); -env->CP0_Compare = 0; -cpu_mips_store_count(env, 1); } diff --git a/target-mips/translate.c b/target-mips/translate.c index 2c4c80103d14..b63f30fc3a76 100644 --- a/target-mips/translate.c +++ b/target-mips/translate.c @@ -16091,6 +16091,8 @@ void cpu_state_reset(CPUMIPSState *env) /* Count register increments in debug mode, EJTAG version 1 */ env->CP0_Debug = (1 << CP0DB_CNT) | (0x1 << CP0DB_VER); +cpu_mips_store_count(env, 1); + if (env->CP0_Config3 & (1 << CP0C3_MT)) { int i; -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 07/12] target-mips: kvm: Add main KVM support for MIPS
From: Sanjay Lal Implement the main KVM arch API for MIPS. Signed-off-by: Sanjay Lal Signed-off-by: James Hogan Cc: Aurelien Jarno Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Andreas Färber Cc: Peter Maydell --- Changes in v5: - Rename kvm_arch_reset_vcpu to kvm_mips_reset_vcpu based on commit 50a2c6e55fa2 (kvm: reset state from the CPU's reset method). - Rename kvm_mips_te_{put,get}_cp0_registers() functions to drop the "te_" since they're not really specific to T&E. - Pass level through from kvm_arch_put_registers() to kvm_mips_put_cp0_registers() rather than hard coding it to KVM_PUT_FULL_STATE. - Fix KVM_REG_MIPS_CP0_* definitions to set KVM_REG_MIPS and KVM_REG_SIZE_U32/KVM_REG_SIZE_U64 (using a macro). - Remove unused KVM_REG_MIPS_CP0_* definitions for now. - Correct type of kvm_mips_{get,put}_one_{,ul}reg() reg_id argument to uint64_t. Various high bits must be set to disambiguate the architecture and register size. - Add register accessors for always-64-bit registers (rather than ulong registers). These are needed for virtual KVM registers for controlling the KVM Compare/Count timer. - Simplify register access functions slightly. - Save and restore KVM timer state with the rest of the state, and also when VM clock is started or stopped. When the KVM timer state is restored (or VM clock restarted) it is resumed with the stored count at the monotonic time when the VM clock was last stopped. If the VM clock hasn't been stopped it resumes from the monotonic time when the state was saved (i.e. as if the timer was never stopped). Changes since RFC patch on kernel KVM thread "[PATCH v2 00/23] MIPS: KVM: Fixes and guest timer rewrite"): - Simplified, removing extra state for storing VM time of save/restore, at the cost of losing/gaining time when VM gets stopped and started (Paolo Bonzini). - Save and restore the UserLocal and HWREna CP0 registers. 
- Improve get/put KVM register error handling with DPRINTFs and fall through so that getting/putting of all the registers is attempted even if one of them fails due to being unimplemented in the kernel. Changes in v4: (No functional changes, assembly output unchanged) - Use int32_t instead of int32 (which is for softfloat) in kvm register accessors (Andreas Färber). - Use uint64_t instead of __u64 (which is really just for kernel headers) in the kvm register accessors (Andreas Färber). - Cast pointer to uintptr_t rather than target_ulong in kvm register accessors. - Remove some redundant casts in kvm register accessors. Changes in v3: - s/dprintf/DPRINTF/ (Andreas Färber). - Use "cs" rather than "cpu" or "env" for CPUState variable names (Andreas Färber). - Use CPUMIPSState rather than CPUArchState (Andreas Färber). - Pass MIPSCPU to cpu_mips_io_interrupts_pending() rather than CPUMIPSState (Andreas Färber). - Remove spurious parentheses around cpu_mips_io_interrupts_pending() call (Andreas Färber). - Pass MIPSCPU to kvm_mips_set_[ipi_]interrupt (Andreas Färber). - Make use of error_report (Andreas Färber) and clean up error messages a little to include __func__. - Remove inline kvm_mips_{put,get}_one_[ul]reg() declarations from kvm_mips.h. They're only used in target-mips/kvm.c anyway. - Make kvm_arch_{put,get}_registers static within target-mips/kvm.c and remove from kvm_mips.h. - Set sigmask length to 16 from kvm_arch_init() since MIPS Linux has 128 signals. This is better than cluttering kvm_all.c with TARGET_* ifdefs (Peter Maydell). Changes in v2: - Expand commit message - Checkpatch cleanups. - Some interrupt bug fixes from Yann Le Du - Add get/set register functionality from Yann Le Du - Use new 64 bit compatible ABI from Cavium from Sanjay Lal - Add dummy kvm_arch_init_irq_routing() The common KVM code insists on calling kvm_arch_init_irq_routing() as soon as it sees kernel header support for it (regardless of whether QEMU supports it). 
Provide a dummy function to satisfy this. - Remove request_interrupt_window code (Peter Maydell) --- target-mips/kvm.c | 683 + target-mips/kvm_mips.h | 26 ++ 2 files changed, 709 insertions(+) create mode 100644 target-mips/kvm.c create mode 100644 target-mips/kvm_mips.h diff --git a/target-mips/kvm.c b/target-mips/kvm.c new file mode 100644 index 000000000000..844e5bbe5f92 --- /dev/null +++ b/target-mips/kvm.c @@ -0,0 +1,683 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * KVM/MIPS: MIPS specific KVM APIs + * + * Copyright (C) 2012-2014 Imagination Technologies Ltd. + * Authors: Sanjay Lal +*/ + +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <linux/kvm.h> + +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "qemu/timer.h" +#include "sysemu/sysemu.h"
[PATCH v5 10/12] hw/mips: malta: Add KVM support
In KVM mode the bootrom is loaded and executed from the last 1MB of DRAM. Based on "[PATCH 12/12] KVM/MIPS: General KVM support and support for SMP Guests" by Sanjay Lal . Signed-off-by: James Hogan Reviewed-by: Aurelien Jarno Cc: Peter Maydell Cc: Sanjay Lal --- Changes in v5: - Kseg0 doesn't actually change size, so use cpu_mips_kseg0_to_phys() rather than having the KVM specific cpu_mips_kvm_um_kseg0_to_phys(). Changes in v3: - Remove unnecessary includes, especially linux/kvm.h which isn't a good idea on non-Linux (Peter Maydell). Changes in v2: - Removal of cps / GIC / SMP support - Minimal bootloader modified to execute safely from RAM - Remove "Writing bootloader to final 1MB of RAM" printf --- hw/mips/mips_malta.c | 73 ++-- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/hw/mips/mips_malta.c b/hw/mips/mips_malta.c index f4a7d4712952..8bc5392b4223 100644 --- a/hw/mips/mips_malta.c +++ b/hw/mips/mips_malta.c @@ -51,6 +51,7 @@ #include "sysemu/qtest.h" #include "qemu/error-report.h" #include "hw/empty_slot.h" +#include "sysemu/kvm.h" //#define DEBUG_BOARD_INIT @@ -603,29 +604,31 @@ static void network_init(PCIBus *pci_bus) */ static void write_bootloader (CPUMIPSState *env, uint8_t *base, - int64_t kernel_entry) + int64_t run_addr, int64_t kernel_entry) { uint32_t *p; /* Small bootloader */ p = (uint32_t *)base; -stl_p(p++, 0x0bf00160); /* j 0x1fc00580 */ + +stl_p(p++, 0x08000000 | /* j 0x1fc00580 */ + ((run_addr + 0x580) & 0x0fffffff) >> 2); stl_p(p++, 0x00000000); /* nop */ /* YAMON service vector */ -stl_p(base + 0x500, 0xbfc00580); /* start: */ -stl_p(base + 0x504, 0xbfc0083c); /* print_count: */ -stl_p(base + 0x520, 0xbfc00580); /* start: */ -stl_p(base + 0x52c, 0xbfc00800); /* flush_cache: */ -stl_p(base + 0x534, 0xbfc00808); /* print: */ -stl_p(base + 0x538, 0xbfc00800); /* reg_cpu_isr: */ -stl_p(base + 0x53c, 0xbfc00800); /* unred_cpu_isr: */ -stl_p(base + 0x540, 0xbfc00800); /* reg_ic_isr: */ -stl_p(base + 0x544, 0xbfc00800); /* unred_ic_isr: */ 
-stl_p(base + 0x548, 0xbfc00800); /* reg_esr: */ -stl_p(base + 0x54c, 0xbfc00800); /* unreg_esr: */ -stl_p(base + 0x550, 0xbfc00800); /* getchar: */ -stl_p(base + 0x554, 0xbfc00800); /* syscon_read: */ +stl_p(base + 0x500, run_addr + 0x0580); /* start: */ +stl_p(base + 0x504, run_addr + 0x083c); /* print_count: */ +stl_p(base + 0x520, run_addr + 0x0580); /* start: */ +stl_p(base + 0x52c, run_addr + 0x0800); /* flush_cache: */ +stl_p(base + 0x534, run_addr + 0x0808); /* print: */ +stl_p(base + 0x538, run_addr + 0x0800); /* reg_cpu_isr: */ +stl_p(base + 0x53c, run_addr + 0x0800); /* unred_cpu_isr: */ +stl_p(base + 0x540, run_addr + 0x0800); /* reg_ic_isr: */ +stl_p(base + 0x544, run_addr + 0x0800); /* unred_ic_isr: */ +stl_p(base + 0x548, run_addr + 0x0800); /* reg_esr: */ +stl_p(base + 0x54c, run_addr + 0x0800); /* unreg_esr: */ +stl_p(base + 0x550, run_addr + 0x0800); /* getchar: */ +stl_p(base + 0x554, run_addr + 0x0800); /* syscon_read: */ /* Second part of the bootloader */ @@ -701,7 +704,7 @@ static void write_bootloader (CPUMIPSState *env, uint8_t *base, p = (uint32_t *) (base + 0x800); stl_p(p++, 0x03e00008); /* jr ra */ stl_p(p++, 0x24020000); /* li v0,0 */ - /* 808 YAMON print */ +/* 808 YAMON print */ stl_p(p++, 0x03e06821); /* move t5,ra */ stl_p(p++, 0x00805821); /* move t3,a0 */ stl_p(p++, 0x00a05021); /* move t2,a1 */ @@ -774,6 +777,7 @@ static int64_t load_kernel (void) uint32_t *prom_buf; long prom_size; int prom_index = 0; +uint64_t (*xlate_to_kseg0) (void *opaque, uint64_t addr); #ifdef TARGET_WORDS_BIGENDIAN big_endian = 1; @@ -788,6 +792,11 @@ static int64_t load_kernel (void) loaderparams.kernel_filename); exit(1); } +if (kvm_enabled()) { +xlate_to_kseg0 = cpu_mips_kvm_um_phys_to_kseg0; +} else { +xlate_to_kseg0 = cpu_mips_phys_to_kseg0; +} /* load initrd */ initrd_size = 0; @@ -820,7 +829,7 @@ static int64_t load_kernel (void) prom_set(prom_buf, prom_index++, "%s", loaderparams.kernel_filename); if (initrd_size > 0) { prom_set(prom_buf, prom_index++, 
"rd_start=0x%" PRIx64 " rd_size=%li %s", - cpu_mips_phys_to_kseg0(NULL, initrd_o