[PATCH v2 5/6] VMX: Validate capability MSRs
Check for required-0 or required-1 bits as well as known field value restrictions. Also check the consistency between VMX_*_CTLS and VMX_TRUE_*_CTLS and between CR0/4_FIXED0 and CR0/4_FIXED1. Signed-off-by: Jan Kiszka --- x86/vmx.c | 74 ++- x86/vmx.h | 5 +++-- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/x86/vmx.c b/x86/vmx.c index f01e443..5c3498a 100644 --- a/x86/vmx.c +++ b/x86/vmx.c @@ -661,6 +661,77 @@ static void test_vmptrst(void) report("test vmptrst", (!ret) && (vmcs1 == vmcs2)); } +struct vmx_ctl_msr { + const char *name; + u32 index, true_index; + u32 default1; +} vmx_ctl_msr[] = { + { "MSR_IA32_VMX_PINBASED_CTLS", MSR_IA32_VMX_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PIN, 0x16 }, + { "MSR_IA32_VMX_PROCBASED_CTLS", MSR_IA32_VMX_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_PROC, 0x401e172 }, + { "MSR_IA32_VMX_PROCBASED_CTLS2", MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_PROCBASED_CTLS2, 0 }, + { "MSR_IA32_VMX_EXIT_CTLS", MSR_IA32_VMX_EXIT_CTLS, + MSR_IA32_VMX_TRUE_EXIT, 0x36dff }, + { "MSR_IA32_VMX_ENTRY_CTLS", MSR_IA32_VMX_ENTRY_CTLS, + MSR_IA32_VMX_TRUE_ENTRY, 0x11ff }, +}; + +static void test_vmx_caps(void) +{ + u64 val, default1, fixed0, fixed1; + union vmx_ctrl_msr ctrl, true_ctrl; + unsigned int n; + bool ok; + + printf("\nTest suite: VMX capability reporting\n"); + + report("MSR_IA32_VMX_BASIC", + (basic.revision & (1ul << 31)) == 0 && + basic.size > 0 && basic.size <= 4096 && + (basic.type == 0 || basic.type == 6) && + basic.reserved1 == 0 && basic.reserved2 == 0); + + val = rdmsr(MSR_IA32_VMX_MISC); + report("MSR_IA32_VMX_MISC", + (!(ctrl_cpu_rev[1].clr & CPU_URG) || val & (1ul << 5)) && + ((val >> 16) & 0x1ff) <= 256 && + (val & 0xc0007e00) == 0); + + for (n = 0; n < ARRAY_SIZE(vmx_ctl_msr); n++) { + ctrl.val = rdmsr(vmx_ctl_msr[n].index); + default1 = vmx_ctl_msr[n].default1; + ok = (ctrl.set & default1) == default1 && + ((ctrl.set ^ ctrl.clr) & ~ctrl.clr) == 0; + if (ok && basic.ctrl) { + true_ctrl.val = rdmsr(vmx_ctl_msr[n].true_index); + 
ok = ctrl.clr == true_ctrl.clr && + ((ctrl.set ^ true_ctrl.set) & ~default1) == 0; + } + report(vmx_ctl_msr[n].name, ok); + } + + fixed0 = rdmsr(MSR_IA32_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_IA32_VMX_CR0_FIXED1); + report("MSR_IA32_VMX_IA32_VMX_CR0_FIXED0/1", + ((fixed0 ^ fixed1) & ~fixed1) == 0); + + fixed0 = rdmsr(MSR_IA32_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_IA32_VMX_CR4_FIXED1); + report("MSR_IA32_VMX_IA32_VMX_CR4_FIXED0/1", + ((fixed0 ^ fixed1) & ~fixed1) == 0); + + val = rdmsr(MSR_IA32_VMX_VMCS_ENUM); + report("MSR_IA32_VMX_VMCS_ENUM", + (val & 0x3e) >= 0x2a && + (val & 0xfc01Ull) == 0); + + val = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP); + report("MSR_IA32_VMX_EPT_VPID_CAP", + (val & 0xf07ef9eebebeUll) == 0); +} + /* This function can only be called in guest */ static void __attribute__((__used__)) hypercall(u32 hypercall_no) { @@ -803,7 +874,7 @@ static int test_run(struct vmx_test *test) regs = test->guest_regs; vmcs_write(GUEST_RFLAGS, regs.rflags | 0x2); launched = 0; - printf("\nTest suite : %s\n", test->name); + printf("\nTest suite: %s\n", test->name); vmx_run(); if (vmx_off()) { printf("%s : vmxoff failed.\n", __func__); @@ -842,6 +913,7 @@ int main(void) goto exit; } test_vmxoff(); + test_vmx_caps(); while (vmx_tests[++i].name != NULL) if (test_run(&vmx_tests[i])) diff --git a/x86/vmx.h b/x86/vmx.h index 00f2842..87457b1 100644 --- a/x86/vmx.h +++ b/x86/vmx.h @@ -46,12 +46,13 @@ union vmx_basic { struct { u32 revision; u32 size:13, - : 3, + reserved1: 3, width:1, dual:1, type:4, insouts:1, - ctrl:1; + ctrl:1, + reserved2:8; }; }; -- 1.8.1.1.298.ge7eed54 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 3/6] VMX: Test both interception and execution of instructions
Extend the instruction interception test to also check for interception-free execution. Signed-off-by: Jan Kiszka --- x86/vmx_tests.c | 121 +--- 1 file changed, 72 insertions(+), 49 deletions(-) diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c index 5485e4c..f02de3d 100644 --- a/x86/vmx_tests.c +++ b/x86/vmx_tests.c @@ -807,7 +807,6 @@ static int iobmp_exit_handler() #define INSN_CPU0 0 #define INSN_CPU1 1 #define INSN_ALWAYS_TRAP 2 -#define INSN_NEVER_TRAP3 #define FIELD_EXIT_QUAL(1 << 1) #define FIELD_INSN_INFO(1 << 2) @@ -818,7 +817,7 @@ asm( "insn_mwait: mwait;ret\n\t" "insn_rdpmc: rdpmc;ret\n\t" "insn_rdtsc: rdtsc;ret\n\t" - "insn_cr3_load: mov %rax,%cr3;ret\n\t" + "insn_cr3_load: mov cr3,%rax; mov %rax,%cr3;ret\n\t" "insn_cr3_store: mov %cr3,%rax;ret\n\t" #ifdef __x86_64__ "insn_cr8_load: mov %rax,%cr8;ret\n\t" @@ -848,6 +847,7 @@ extern void insn_cpuid(); extern void insn_invd(); u32 cur_insn; +u64 cr3; struct insn_table { const char *name; @@ -901,55 +901,56 @@ static struct insn_table insn_table[] = { static int insn_intercept_init() { - u32 ctrl_cpu[2]; + u32 ctrl_cpu; - ctrl_cpu[0] = vmcs_read(CPU_EXEC_CTRL0); - ctrl_cpu[0] |= CPU_HLT | CPU_INVLPG | CPU_MWAIT | CPU_RDPMC | CPU_RDTSC | - CPU_CR3_LOAD | CPU_CR3_STORE | -#ifdef __x86_64__ - CPU_CR8_LOAD | CPU_CR8_STORE | -#endif - CPU_MONITOR | CPU_PAUSE | CPU_SECONDARY; - ctrl_cpu[0] &= ctrl_cpu_rev[0].clr; - vmcs_write(CPU_EXEC_CTRL0, ctrl_cpu[0]); - ctrl_cpu[1] = vmcs_read(CPU_EXEC_CTRL1); - ctrl_cpu[1] |= CPU_WBINVD | CPU_RDRAND; - ctrl_cpu[1] &= ctrl_cpu_rev[1].clr; - vmcs_write(CPU_EXEC_CTRL1, ctrl_cpu[1]); + ctrl_cpu = ctrl_cpu_rev[0].set | CPU_SECONDARY; + ctrl_cpu &= ctrl_cpu_rev[0].clr; + vmcs_write(CPU_EXEC_CTRL0, ctrl_cpu); + vmcs_write(CPU_EXEC_CTRL1, ctrl_cpu_rev[1].set); + cr3 = read_cr3(); return VMX_TEST_START; } static void insn_intercept_main() { - cur_insn = 0; - while(insn_table[cur_insn].name != NULL) { - vmx_set_test_stage(cur_insn); - if ((insn_table[cur_insn].type == INSN_CPU0 - && 
!(ctrl_cpu_rev[0].clr & insn_table[cur_insn].flag)) - || (insn_table[cur_insn].type == INSN_CPU1 - && !(ctrl_cpu_rev[1].clr & insn_table[cur_insn].flag))) { - printf("\tCPU_CTRL1.CPU_%s is not supported.\n", - insn_table[cur_insn].name); + char msg[80]; + + for (cur_insn = 0; insn_table[cur_insn].name != NULL; cur_insn++) { + vmx_set_test_stage(cur_insn * 2); + if ((insn_table[cur_insn].type == INSN_CPU0 && +!(ctrl_cpu_rev[0].clr & insn_table[cur_insn].flag)) || + (insn_table[cur_insn].type == INSN_CPU1 && +!(ctrl_cpu_rev[1].clr & insn_table[cur_insn].flag))) { + printf("\tCPU_CTRL%d.CPU_%s is not supported.\n", + insn_table[cur_insn].type - INSN_CPU0, + insn_table[cur_insn].name); continue; } + + if ((insn_table[cur_insn].type == INSN_CPU0 && +!(ctrl_cpu_rev[0].set & insn_table[cur_insn].flag)) || + (insn_table[cur_insn].type == INSN_CPU1 && +!(ctrl_cpu_rev[1].set & insn_table[cur_insn].flag))) { + /* skip hlt, it stalls the guest and is tested below */ + if (insn_table[cur_insn].insn_func != insn_hlt) + insn_table[cur_insn].insn_func(); + snprintf(msg, sizeof(msg), "execute %s", +insn_table[cur_insn].name); + report(msg, vmx_get_test_stage() == cur_insn * 2); + } else if (insn_table[cur_insn].type != INSN_ALWAYS_TRAP) + printf("\tCPU_CTRL%d.CPU_%s always traps.\n", + insn_table[cur_insn].type - INSN_CPU0, + insn_table[cur_insn].name); + + vmcall(); + insn_table[cur_insn].insn_func(); - switch (insn_table[cur_insn].type) { - case INSN_CPU0: - case INSN_CPU1: - case INSN_ALWAYS_TRAP: - if (vmx_get_test_stage() != cur_insn + 1) - report(insn_table[cur_insn].name, 0); - else - report(insn_table[cur_insn].name, 1); - break; - case INSN_NEVER_TRAP: - if (vmx_get_test_stage
[PATCH v2 6/6] VMX: Test behavior on set and cleared save/load debug controls
This particularly checks the case when debug controls are not to be loaded/saved on host-guest transitions. We have to fake results related to IA32_DEBUGCTL as support for this MSR is missing KVM. The test already contains all bits required once KVM adds support. Signed-off-by: Jan Kiszka --- x86/vmx.h | 2 + x86/vmx_tests.c | 112 2 files changed, 114 insertions(+) diff --git a/x86/vmx.h b/x86/vmx.h index 87457b1..022f1df 100644 --- a/x86/vmx.h +++ b/x86/vmx.h @@ -305,6 +305,7 @@ enum Reason { #define X86_EFLAGS_ZF 0x0040 /* Zero Flag */ enum Ctrl_exi { + EXI_SAVE_DBGCTLS= 1UL << 2, EXI_HOST_64 = 1UL << 9, EXI_LOAD_PERF = 1UL << 12, EXI_INTA= 1UL << 15, @@ -316,6 +317,7 @@ enum Ctrl_exi { }; enum Ctrl_ent { + ENT_LOAD_DBGCTLS= 1UL << 2, ENT_GUEST_64= 1UL << 9, ENT_LOAD_PAT= 1UL << 14, ENT_LOAD_EFER = 1UL << 15, diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c index f02de3d..a97b6d6 100644 --- a/x86/vmx_tests.c +++ b/x86/vmx_tests.c @@ -1398,6 +1398,116 @@ static int interrupt_exit_handler(void) return VMX_TEST_VMEXIT; } +static int dbgctls_init(struct vmcs *vmcs) +{ + u64 dr7 = 0x402; + u64 zero = 0; + + msr_bmp_init(); + asm volatile( + "mov %0,%%dr0\n\t" + "mov %0,%%dr1\n\t" + "mov %0,%%dr2\n\t" + "mov %1,%%dr7\n\t" + : : "r" (zero), "r" (dr7)); + wrmsr(MSR_IA32_DEBUGCTLMSR, 0x1); + vmcs_write(GUEST_DR7, 0x404); + vmcs_write(GUEST_DEBUGCTL, 0x2); + + vmcs_write(ENT_CONTROLS, vmcs_read(ENT_CONTROLS) | ENT_LOAD_DBGCTLS); + vmcs_write(EXI_CONTROLS, vmcs_read(EXI_CONTROLS) | EXI_SAVE_DBGCTLS); + + return VMX_TEST_START; +} + +static void dbgctls_main(void) +{ + u64 dr7, debugctl; + + asm volatile("mov %%dr7,%0" : "=r" (dr7)); + debugctl = rdmsr(MSR_IA32_DEBUGCTLMSR); + /* Commented out: KVM does not support DEBUGCTL so far */ + report("Load debug controls", dr7 == 0x404 /* && debugctl == 0x2 */); + + dr7 = 0x408; + asm volatile("mov %0,%%dr7" : : "r" (dr7)); + wrmsr(MSR_IA32_DEBUGCTLMSR, 0x3); + + vmx_set_test_stage(0); + vmcall(); + report("Save debug controls", 
vmx_get_test_stage() == 1); + + if (ctrl_enter_rev.set & ENT_LOAD_DBGCTLS || + ctrl_exit_rev.set & EXI_SAVE_DBGCTLS) { + printf("\tDebug controls are always loaded/saved\n"); + return; + } + vmx_set_test_stage(2); + vmcall(); + + asm volatile("mov %%dr7,%0" : "=r" (dr7)); + debugctl = rdmsr(MSR_IA32_DEBUGCTLMSR); + /* Commented out: KVM does not support DEBUGCTL so far */ + report("Guest=host debug controls", dr7 == 0x402 /* && debugctl == 0x1 */); + + dr7 = 0x408; + asm volatile("mov %0,%%dr7" : : "r" (dr7)); + wrmsr(MSR_IA32_DEBUGCTLMSR, 0x3); + + vmx_set_test_stage(3); + vmcall(); + report("Don't save debug controls", vmx_get_test_stage() == 4); +} + +static int dbgctls_exit_handler(void) +{ + unsigned int reason = vmcs_read(EXI_REASON) & 0xff; + u32 insn_len = vmcs_read(EXI_INST_LEN); + u64 guest_rip = vmcs_read(GUEST_RIP); + u64 dr7, debugctl; + + asm volatile("mov %%dr7,%0" : "=r" (dr7)); + debugctl = rdmsr(MSR_IA32_DEBUGCTLMSR); + + switch (reason) { + case VMX_VMCALL: + switch (vmx_get_test_stage()) { + case 0: + if (dr7 == 0x400 && debugctl == 0 && + vmcs_read(GUEST_DR7) == 0x408 /* && + Commented out: KVM does not support DEBUGCTL so far + vmcs_read(GUEST_DEBUGCTL) == 0x3 */) + vmx_inc_test_stage(); + break; + case 2: + dr7 = 0x402; + asm volatile("mov %0,%%dr7" : : "r" (dr7)); + wrmsr(MSR_IA32_DEBUGCTLMSR, 0x1); + vmcs_write(GUEST_DR7, 0x404); + vmcs_write(GUEST_DEBUGCTL, 0x2); + + vmcs_write(ENT_CONTROLS, + vmcs_read(ENT_CONTROLS) & ~ENT_LOAD_DBGCTLS); + vmcs_write(EXI_CONTROLS, + vmcs_read(EXI_CONTROLS) & ~EXI_SAVE_DBGCTLS); + break; + case 3: + if (dr7 == 0x400 && debugctl == 0 && + vmcs_read(GUEST_DR7) == 0x404 /* && + Commented out: KVM does not support DEBUGCTL so far + vmcs_read(GUEST_DEBUGCTL) == 0x2 */) + vmx_inc_test_stage(); + break; +
[PATCH v2 0/6] kvm-unit-tests: more instr. interceptions, debug control migration
Changes in v2 according to review remarks: - refactored get/set_stage interface - unified vmx_ctrl_* unions - used vmx_ctrl_msr in capability test - changed commented-out debugctl tests Jan Kiszka (6): VMX: Add tests for CR3 and CR8 interception VMX: Rework test stage interface VMX: Test both interception and execution of instructions VMX: Unify vmx_ctrl_* unions to vmx_ctrl_msr VMX: Validate capability MSRs VMX: Test behavior on set and cleared save/load debug controls x86/vmx.c | 108 - x86/vmx.h | 44 ++--- x86/vmx_tests.c | 495 3 files changed, 443 insertions(+), 204 deletions(-) -- 1.8.1.1.298.ge7eed54 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 4/6] VMX: Unify vmx_ctrl_* unions to vmx_ctrl_msr
Signed-off-by: Jan Kiszka --- x86/vmx.c | 8 x86/vmx.h | 31 +-- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/x86/vmx.c b/x86/vmx.c index ba6a02b..f01e443 100644 --- a/x86/vmx.c +++ b/x86/vmx.c @@ -49,10 +49,10 @@ bool launched; u64 host_rflags; union vmx_basic basic; -union vmx_ctrl_pin ctrl_pin_rev; -union vmx_ctrl_cpu ctrl_cpu_rev[2]; -union vmx_ctrl_exit ctrl_exit_rev; -union vmx_ctrl_ent ctrl_enter_rev; +union vmx_ctrl_msr ctrl_pin_rev; +union vmx_ctrl_msr ctrl_cpu_rev[2]; +union vmx_ctrl_msr ctrl_exit_rev; +union vmx_ctrl_msr ctrl_enter_rev; union vmx_ept_vpid ept_vpid; extern struct descriptor_table_ptr gdt64_desc; diff --git a/x86/vmx.h b/x86/vmx.h index 1a8ae4c..00f2842 100644 --- a/x86/vmx.h +++ b/x86/vmx.h @@ -55,28 +55,7 @@ union vmx_basic { }; }; -union vmx_ctrl_pin { - u64 val; - struct { - u32 set, clr; - }; -}; - -union vmx_ctrl_cpu { - u64 val; - struct { - u32 set, clr; - }; -}; - -union vmx_ctrl_exit { - u64 val; - struct { - u32 set, clr; - }; -}; - -union vmx_ctrl_ent { +union vmx_ctrl_msr { u64 val; struct { u32 set, clr; @@ -508,10 +487,10 @@ enum Ctrl1 { extern struct regs regs; extern union vmx_basic basic; -extern union vmx_ctrl_pin ctrl_pin_rev; -extern union vmx_ctrl_cpu ctrl_cpu_rev[2]; -extern union vmx_ctrl_exit ctrl_exit_rev; -extern union vmx_ctrl_ent ctrl_enter_rev; +extern union vmx_ctrl_msr ctrl_pin_rev; +extern union vmx_ctrl_msr ctrl_cpu_rev[2]; +extern union vmx_ctrl_msr ctrl_exit_rev; +extern union vmx_ctrl_msr ctrl_enter_rev; extern union vmx_ept_vpid ept_vpid; void vmx_set_test_stage(u32 s); -- 1.8.1.1.298.ge7eed54 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 2/6] VMX: Rework test stage interface
Consistently access the stage only via the helper functions. To enforce this, move them from vmx_tests.c to vmx.c. At this chance, introduce a stage incrementation helper. Signed-off-by: Jan Kiszka --- x86/vmx.c | 26 ++ x86/vmx.h | 4 + x86/vmx_tests.c | 250 +++- 3 files changed, 151 insertions(+), 129 deletions(-) diff --git a/x86/vmx.c b/x86/vmx.c index 1182eef..ba6a02b 100644 --- a/x86/vmx.c +++ b/x86/vmx.c @@ -62,6 +62,32 @@ extern void *vmx_return; extern void *entry_sysenter; extern void *guest_entry; +static volatile u32 stage; + +void vmx_set_test_stage(u32 s) +{ + barrier(); + stage = s; + barrier(); +} + +u32 vmx_get_test_stage(void) +{ + u32 s; + + barrier(); + s = stage; + barrier(); + return s; +} + +void vmx_inc_test_stage(void) +{ + barrier(); + stage++; + barrier(); +} + static int make_vmcs_current(struct vmcs *vmcs) { bool ret; diff --git a/x86/vmx.h b/x86/vmx.h index 69a5385..1a8ae4c 100644 --- a/x86/vmx.h +++ b/x86/vmx.h @@ -514,6 +514,10 @@ extern union vmx_ctrl_exit ctrl_exit_rev; extern union vmx_ctrl_ent ctrl_enter_rev; extern union vmx_ept_vpid ept_vpid; +void vmx_set_test_stage(u32 s); +u32 vmx_get_test_stage(void); +void vmx_inc_test_stage(void); + static inline int vmcs_clear(struct vmcs *vmcs) { bool ret; diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c index 149a591..5485e4c 100644 --- a/x86/vmx_tests.c +++ b/x86/vmx_tests.c @@ -14,7 +14,6 @@ u64 ia32_pat; u64 ia32_efer; -volatile u32 stage; void *io_bitmap_a, *io_bitmap_b; u16 ioport; @@ -27,23 +26,6 @@ static inline void vmcall() asm volatile("vmcall"); } -static inline void set_stage(u32 s) -{ - barrier(); - stage = s; - barrier(); -} - -static inline u32 get_stage() -{ - u32 s; - - barrier(); - s = stage; - barrier(); - return s; -} - void basic_guest_main() { } @@ -122,23 +104,23 @@ void preemption_timer_main() { tsc_val = rdtsc(); if (ctrl_exit_rev.clr & EXI_SAVE_PREEMPT) { - set_stage(0); + vmx_set_test_stage(0); vmcall(); - if (get_stage() == 1) + if (vmx_get_test_stage() == 1) 
vmcall(); } - set_stage(1); - while (get_stage() == 1) { + vmx_set_test_stage(1); + while (vmx_get_test_stage() == 1) { if (((rdtsc() - tsc_val) >> preempt_scale) > 10 * preempt_val) { - set_stage(2); + vmx_set_test_stage(2); vmcall(); } } tsc_val = rdtsc(); asm volatile ("hlt"); vmcall(); - set_stage(5); + vmx_set_test_stage(5); vmcall(); } @@ -155,13 +137,13 @@ int preemption_timer_exit_handler() insn_len = vmcs_read(EXI_INST_LEN); switch (reason) { case VMX_PREEMPT: - switch (get_stage()) { + switch (vmx_get_test_stage()) { case 1: case 2: report("busy-wait for preemption timer", ((rdtsc() - tsc_val) >> preempt_scale) >= preempt_val); - set_stage(3); + vmx_set_test_stage(3); vmcs_write(PREEMPT_TIMER_VALUE, preempt_val); return VMX_TEST_RESUME; case 3: @@ -170,7 +152,7 @@ int preemption_timer_exit_handler() report("preemption timer during hlt", ((rdtsc() - tsc_val) >> preempt_scale) >= preempt_val && guest_halted); - set_stage(4); + vmx_set_test_stage(4); vmcs_write(PIN_CONTROLS, vmcs_read(PIN_CONTROLS) & ~PIN_PREEMPT); vmcs_write(GUEST_ACTV_STATE, ACTV_ACTIVE); @@ -187,11 +169,11 @@ int preemption_timer_exit_handler() break; case VMX_VMCALL: vmcs_write(GUEST_RIP, guest_rip + insn_len); - switch (get_stage()) { + switch (vmx_get_test_stage()) { case 0: report("Keep preemption value", vmcs_read(PREEMPT_TIMER_VALUE) == preempt_val); - set_stage(1); + vmx_set_test_stage(1); vmcs_write(PREEMPT_TIMER_VALUE, preempt_val); ctrl_exit = (vmcs_read(EXI_CONTROLS) | EXI_SAVE_PREEMPT) & ctrl_exit_rev.clr; @@ -203,12 +185,12 @@ int preemption_timer_exit_handler() return VMX_TEST_RESUME; case 2:
[PATCH v2 1/6] VMX: Add tests for CR3 and CR8 interception
Need to fix FIELD_* constants for this to make the exit qualification check work. Signed-off-by: Jan Kiszka --- x86/vmx.h | 2 ++ x86/vmx_tests.c | 32 +--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/x86/vmx.h b/x86/vmx.h index 26dd161..69a5385 100644 --- a/x86/vmx.h +++ b/x86/vmx.h @@ -357,6 +357,8 @@ enum Ctrl0 { CPU_RDTSC = 1ul << 12, CPU_CR3_LOAD= 1ul << 15, CPU_CR3_STORE = 1ul << 16, + CPU_CR8_LOAD= 1ul << 19, + CPU_CR8_STORE = 1ul << 20, CPU_TPR_SHADOW = 1ul << 21, CPU_NMI_WINDOW = 1ul << 22, CPU_IO = 1ul << 24, diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c index a40cb18..149a591 100644 --- a/x86/vmx_tests.c +++ b/x86/vmx_tests.c @@ -820,8 +820,8 @@ static int iobmp_exit_handler() #define INSN_ALWAYS_TRAP 2 #define INSN_NEVER_TRAP3 -#define FIELD_EXIT_QUAL0 -#define FIELD_INSN_INFO1 +#define FIELD_EXIT_QUAL(1 << 1) +#define FIELD_INSN_INFO(1 << 2) asm( "insn_hlt: hlt;ret\n\t" @@ -829,6 +829,12 @@ asm( "insn_mwait: mwait;ret\n\t" "insn_rdpmc: rdpmc;ret\n\t" "insn_rdtsc: rdtsc;ret\n\t" + "insn_cr3_load: mov %rax,%cr3;ret\n\t" + "insn_cr3_store: mov %cr3,%rax;ret\n\t" +#ifdef __x86_64__ + "insn_cr8_load: mov %rax,%cr8;ret\n\t" + "insn_cr8_store: mov %cr8,%rax;ret\n\t" +#endif "insn_monitor: monitor;ret\n\t" "insn_pause: pause;ret\n\t" "insn_wbinvd: wbinvd;ret\n\t" @@ -840,6 +846,12 @@ extern void insn_invlpg(); extern void insn_mwait(); extern void insn_rdpmc(); extern void insn_rdtsc(); +extern void insn_cr3_load(); +extern void insn_cr3_store(); +#ifdef __x86_64__ +extern void insn_cr8_load(); +extern void insn_cr8_store(); +#endif extern void insn_monitor(); extern void insn_pause(); extern void insn_wbinvd(); @@ -856,7 +868,7 @@ struct insn_table { u32 reason; ulong exit_qual; u32 insn_info; - // Use FIELD_EXIT_QUAL and FIELD_INSN_INFO to efines + // Use FIELD_EXIT_QUAL and FIELD_INSN_INFO to define // which field need to be tested, reason is always tested u32 test_field; }; @@ -877,6 +889,16 @@ static struct insn_table insn_table[] = { 
{"MWAIT", CPU_MWAIT, insn_mwait, INSN_CPU0, 36, 0, 0, 0}, {"RDPMC", CPU_RDPMC, insn_rdpmc, INSN_CPU0, 15, 0, 0, 0}, {"RDTSC", CPU_RDTSC, insn_rdtsc, INSN_CPU0, 16, 0, 0, 0}, + {"CR3 load", CPU_CR3_LOAD, insn_cr3_load, INSN_CPU0, 28, 0x3, 0, + FIELD_EXIT_QUAL}, + {"CR3 store", CPU_CR3_STORE, insn_cr3_store, INSN_CPU0, 28, 0x13, 0, + FIELD_EXIT_QUAL}, +#ifdef __x86_64__ + {"CR8 load", CPU_CR8_LOAD, insn_cr8_load, INSN_CPU0, 28, 0x8, 0, + FIELD_EXIT_QUAL}, + {"CR8 store", CPU_CR8_STORE, insn_cr8_store, INSN_CPU0, 28, 0x18, 0, + FIELD_EXIT_QUAL}, +#endif {"MONITOR", CPU_MONITOR, insn_monitor, INSN_CPU0, 39, 0, 0, 0}, {"PAUSE", CPU_PAUSE, insn_pause, INSN_CPU0, 40, 0, 0, 0}, // Flags for Secondary Processor-Based VM-Execution Controls @@ -894,6 +916,10 @@ static int insn_intercept_init() ctrl_cpu[0] = vmcs_read(CPU_EXEC_CTRL0); ctrl_cpu[0] |= CPU_HLT | CPU_INVLPG | CPU_MWAIT | CPU_RDPMC | CPU_RDTSC | + CPU_CR3_LOAD | CPU_CR3_STORE | +#ifdef __x86_64__ + CPU_CR8_LOAD | CPU_CR8_STORE | +#endif CPU_MONITOR | CPU_PAUSE | CPU_SECONDARY; ctrl_cpu[0] &= ctrl_cpu_rev[0].clr; vmcs_write(CPU_EXEC_CTRL0, ctrl_cpu[0]); -- 1.8.1.1.298.ge7eed54 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/6] VMX: Add tests for CR3 and CR8 interception
Il 17/06/2014 09:04, Jan Kiszka ha scritto: -#define FIELD_EXIT_QUAL0 -#define FIELD_INSN_INFO1 +#define FIELD_EXIT_QUAL(1 << 1) +#define FIELD_INSN_INFO(1 << 2) Heh, you probably wanted 1<<0 and 1<<1. I'll fix it up. :) Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/6] VMX: Add tests for CR3 and CR8 interception
On 2014-06-17 09:41, Paolo Bonzini wrote: > Il 17/06/2014 09:04, Jan Kiszka ha scritto: >> >> -#define FIELD_EXIT_QUAL0 >> -#define FIELD_INSN_INFO1 >> +#define FIELD_EXIT_QUAL(1 << 1) >> +#define FIELD_INSN_INFO(1 << 2) > > Heh, you probably wanted 1<<0 and 1<<1. I'll fix it up. :) Yeah... thanks in advance. Jan -- Siemens AG, Corporate Technology, CT RTC ITP SES-DE Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/6] kvm_stat: Rework platform detection
The current platform detection is a little bit messy. We look for lines in /proc/cpuinfo starting with 'flags' OR 'vendor-id', and scan both for values we know will only occur in one or the other. We also keep scanning once we've found a value, which could be a feature, but isn't in this case. We'd also like to add another platform, powerpc, which will just make it worse. So clean it up in preparation. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 44 +--- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index fe2eae3..98c81a8 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -169,27 +169,41 @@ generic_exit_reasons = { 23: 'EPR', } -vendor_exit_reasons = { +x86_exit_reasons = { 'vmx': vmx_exit_reasons, 'svm': svm_exit_reasons, -'IBM/S390': generic_exit_reasons, } -syscall_numbers = { -'IBM/S390': 331, -} - -sc_perf_evt_open = 298 - +sc_perf_evt_open = None exit_reasons = None -for line in file('/proc/cpuinfo').readlines(): -if line.startswith('flags') or line.startswith('vendor_id'): -for flag in line.split(): -if flag in vendor_exit_reasons: -exit_reasons = vendor_exit_reasons[flag] -if flag in syscall_numbers: -sc_perf_evt_open = syscall_numbers[flag] +def x86_init(flag): +globals().update({ +'sc_perf_evt_open' : 298, +'exit_reasons' : x86_exit_reasons[flag], +}) + +def s390_init(): +globals().update({ +'sc_perf_evt_open' : 331, +'exit_reasons' : generic_exit_reasons, +}) + +def detect_platform(): +for line in file('/proc/cpuinfo').readlines(): +if line.startswith('flags'): +for flag in line.split(): +if flag in x86_exit_reasons: +x86_init(flag) +return +elif line.startswith('vendor_id'): +for flag in line.split(): +if flag == 'IBM/S390': +s390_init() +return + +detect_platform() + filters = { 'kvm_exit': ('exit_reason', exit_reasons) } -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo 
info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/6] kvm_stat: Abstract ioctl numbers
Unfortunately ioctl numbers are platform specific, so abstract them out of the code so they can be overridden. As it happens x86 and s390 share the same values, so nothing needs to change yet. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 2468a22..2e0f5ed 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -180,6 +180,12 @@ filters = { 'kvm_userspace_exit': ('reason', generic_exit_reasons) } +ioctl_numbers = { +'SET_FILTER' : 0x40082406, +'ENABLE' : 0x2400, +'DISABLE': 0x2401, +} + def x86_init(flag): globals().update({ 'sc_perf_evt_open' : 298, @@ -304,14 +310,14 @@ class Event(object): raise Exception('perf_event_open failed') if filter: import fcntl -fcntl.ioctl(fd, 0x40082406, filter) +fcntl.ioctl(fd, ioctl_numbers['SET_FILTER'], filter) self.fd = fd def enable(self): import fcntl -fcntl.ioctl(self.fd, 0x2400, 0) +fcntl.ioctl(self.fd, ioctl_numbers['ENABLE'], 0) def disable(self): import fcntl -fcntl.ioctl(self.fd, 0x2401, 0) +fcntl.ioctl(self.fd, ioctl_numbers['DISABLE'], 0) class TracepointProvider(object): def __init__(self): -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/6] kvm_stat: Fix tracepoint filter definition for s390
Although we have the exit_reasons defined for s390, as far as I can tell they never take effect. That is because there is no 'kvm_exit' tracepoint defined for s390. What is defined, for all platforms, is 'kvm_userspace_exit'. That tracepoint uses the generic_exit_reason, but the filter parameter is 'reason'. So invert the way we setup filters, define it by default for the generic tracepoint 'kvm_userspace_exit', and let x86 override it. Doing it this way will also work for powerpc when we add it. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 98c81a8..2468a22 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -175,18 +175,22 @@ x86_exit_reasons = { } sc_perf_evt_open = None -exit_reasons = None + +filters = { +'kvm_userspace_exit': ('reason', generic_exit_reasons) +} def x86_init(flag): globals().update({ 'sc_perf_evt_open' : 298, -'exit_reasons' : x86_exit_reasons[flag], +'filters' : { +'kvm_exit': ('exit_reason', x86_exit_reasons[flag]) +}, }) def s390_init(): globals().update({ 'sc_perf_evt_open' : 331, -'exit_reasons' : generic_exit_reasons, }) def detect_platform(): @@ -204,10 +208,6 @@ def detect_platform(): detect_platform() -filters = { -'kvm_exit': ('exit_reason', exit_reasons) -} - def invert(d): return dict((x[1], x[0]) for x in d.iteritems()) -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/6] kvm_stat: Only consider online cpus
In kvm_stat we grovel through /sys to find out how many cpus are in the system. However if a cpu is offline it will still be present in /sys, and the perf_event_open() will fail. Modify the logic to only return online cpus. We need to be careful on systems which don't support cpu hotplug, the online file will not be present at all. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 24 ++-- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index d7e97e7..2a788bc 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -311,18 +311,30 @@ class TracepointProvider(object): self.select(fields) def fields(self): return self._fields + +def _online_cpus(self): +l = [] +pattern = r'cpu([0-9]+)' +basedir = '/sys/devices/system/cpu' +for entry in os.listdir(basedir): +match = re.match(pattern, entry) +if not match: +continue +path = os.path.join(basedir, entry, 'online') +if os.path.exists(path) and open(path).read().strip() != '1': +continue +l.append(int(match.group(1))) +return l + def _setup(self, _fields): self._fields = _fields -cpure = r'cpu([0-9]+)' -self.cpus = [int(re.match(cpure, x).group(1)) - for x in os.listdir('/sys/devices/system/cpu') - if re.match(cpure, x)] +cpus = self._online_cpus() import resource -nfiles = len(self.cpus) * 1000 +nfiles = len(cpus) * 1000 resource.setrlimit(resource.RLIMIT_NOFILE, (nfiles, nfiles)) events = [] self.group_leaders = [] -for cpu in self.cpus: +for cpu in cpus: group = Group(cpu) for name in _fields: tracepoint = name -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/6] kvm_stat: Fix the non-x86 exit reasons
In kvm_stat we have a dictionary of exit reasons for s390. Firstly these are not s390 specific, they are the generic exit reasons. So rename the dictionary to reflect that. Secondly, the values are defined using hex, but in the kernel header they are decimal. That means values above 9 in kvm_stat are incorrect. As far as I can tell this does not matter in practice because s390 does not define a kvm_exit trace point. While we're there, fix the whitespace to match the rest of the file. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 49 +++-- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 2a788bc..fe2eae3 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -141,33 +141,38 @@ svm_exit_reasons = { 0x400: 'NPF', } -s390_exit_reasons = { - 0x000: 'UNKNOWN', - 0x001: 'EXCEPTION', - 0x002: 'IO', - 0x003: 'HYPERCALL', - 0x004: 'DEBUG', - 0x005: 'HLT', - 0x006: 'MMIO', - 0x007: 'IRQ_WINDOW_OPEN', - 0x008: 'SHUTDOWN', - 0x009: 'FAIL_ENTRY', - 0x010: 'INTR', - 0x011: 'SET_TPR', - 0x012: 'TPR_ACCESS', - 0x013: 'S390_SIEIC', - 0x014: 'S390_RESET', - 0x015: 'DCR', - 0x016: 'NMI', - 0x017: 'INTERNAL_ERROR', - 0x018: 'OSI', - 0x019: 'PAPR_HCALL', +# From include/uapi/linux/kvm.h, KVM_EXIT_xxx +generic_exit_reasons = { + 0: 'UNKNOWN', + 1: 'EXCEPTION', + 2: 'IO', + 3: 'HYPERCALL', + 4: 'DEBUG', + 5: 'HLT', + 6: 'MMIO', + 7: 'IRQ_WINDOW_OPEN', + 8: 'SHUTDOWN', + 9: 'FAIL_ENTRY', +10: 'INTR', +11: 'SET_TPR', +12: 'TPR_ACCESS', +13: 'S390_SIEIC', +14: 'S390_RESET', +15: 'DCR', +16: 'NMI', +17: 'INTERNAL_ERROR', +18: 'OSI', +19: 'PAPR_HCALL', +20: 'S390_UCONTROL', +21: 'WATCHDOG', +22: 'S390_TSCH', +23: 'EPR', } vendor_exit_reasons = { 'vmx': vmx_exit_reasons, 'svm': svm_exit_reasons, -'IBM/S390': s390_exit_reasons, +'IBM/S390': generic_exit_reasons, } syscall_numbers = { -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org 
More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/6] kvm_stat: Add powerpc support
Add support for powerpc platforms. We use uname -m, which allows us to detect ppc, ppc64 and ppc64le/el. Signed-off-by: Michael Ellerman --- scripts/kvm/kvm_stat | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 2e0f5ed..db867fe 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -12,7 +12,7 @@ # the COPYING file in the top-level directory. import curses -import sys, os, time, optparse +import sys, os, time, optparse, ctypes class DebugfsProvider(object): def __init__(self): @@ -199,7 +199,21 @@ def s390_init(): 'sc_perf_evt_open' : 331, }) +def ppc_init(): +globals().update({ +'sc_perf_evt_open' : 319, +'ioctl_numbers' : { +'SET_FILTER' : 0x80002406 | (ctypes.sizeof(ctypes.c_char_p) << 16), +'ENABLE' : 0x20002400, +'DISABLE': 0x20002401, +} +}) + def detect_platform(): +if os.uname()[4].startswith('ppc'): +ppc_init() +return + for line in file('/proc/cpuinfo').readlines(): if line.startswith('flags'): for flag in line.split(): @@ -220,7 +234,7 @@ def invert(d): for f in filters: filters[f] = (filters[f][0], invert(filters[f][1])) -import ctypes, struct, array +import struct, array libc = ctypes.CDLL('libc.so.6') syscall = libc.syscall -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 5/6] VMX: Validate capability MSRs
Il 17/06/2014 09:04, Jan Kiszka ha scritto: + default1 = vmx_ctl_msr[n].default1; + ok = (ctrl.set & default1) == default1 && + ((ctrl.set ^ ctrl.clr) & ~ctrl.clr) == 0; Thanks, now I can understand what's going on. :) It can still be simplified though. This is just (ctrl.set & ~ctrl.clr) == 0: (a ^ b) & ~b == 0 -> (a & ~b) ^ (b & ~b) == 0 -> a & ~b == 0 This expresses just as well the concept that set=1, clr=0 is an impossible combination. Also, it's a bit easier to read if the second line is written as a separate statement: ok = ok && (ctrl.set & ~ctrl.clr) == 0; + if (ok && basic.ctrl) { + true_ctrl.val = rdmsr(vmx_ctl_msr[n].true_index); + ok = ctrl.clr == true_ctrl.clr && + ((ctrl.set ^ true_ctrl.set) & ~default1) == 0; This is ((ctrl.set ^ true_ctrl.set) & ~default1 == 0 -> ((ctrl.set & ~default1) ^ (true_ctrl.set & ~default1) == 0 -> ((ctrl.set & ~default1) == (true_ctrl.set & ~default1) A bit longer, but clearer: the difference between ctrl.set and true_ctrl.set is only that true_ctrl.set can clear some default-1 bits. Or you can simplify it further: -> (ctrl.set | default1) == (true_ctrl.set | default1) -> ctrl.set == (true_ctrl.set | default1) Also clearer: the difference between ctrl.set and true_ctrl.set is only that default-1 bits must be 1 in ctrl.set. Pick the one you prefer. Again, using a separate statement makes it easier to read in my opinion; in fact I would also write both statements as ok = ok && ... even though it's redundant for the clr test. Can you submit v3 of this patch only? Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 6/6] kvm_stat: Add powerpc support
On 17.06.14 09:54, Michael Ellerman wrote: Add support for powerpc platforms. We use uname -m, which allows us to detect ppc, ppc64 and ppc64le/el. Signed-off-by: Michael Ellerman Could you please add support for PR KVM tracepoints along the way? There we do know the exit reason for every single guest <-> host transition. I would like to move to a similar model with HV in the future, so we can hopefully just reuse this by then. Alex --- scripts/kvm/kvm_stat | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 2e0f5ed..db867fe 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -12,7 +12,7 @@ # the COPYING file in the top-level directory. import curses -import sys, os, time, optparse +import sys, os, time, optparse, ctypes class DebugfsProvider(object): def __init__(self): @@ -199,7 +199,21 @@ def s390_init(): 'sc_perf_evt_open' : 331, }) +def ppc_init(): +globals().update({ +'sc_perf_evt_open' : 319, +'ioctl_numbers' : { +'SET_FILTER' : 0x80002406 | (ctypes.sizeof(ctypes.c_char_p) << 16), +'ENABLE' : 0x20002400, +'DISABLE': 0x20002401, +} +}) + def detect_platform(): +if os.uname()[4].startswith('ppc'): +ppc_init() +return + for line in file('/proc/cpuinfo').readlines(): if line.startswith('flags'): for flag in line.split(): @@ -220,7 +234,7 @@ def invert(d): for f in filters: filters[f] = (filters[f][0], invert(filters[f][1])) -import ctypes, struct, array +import struct, array libc = ctypes.CDLL('libc.so.6') syscall = libc.syscall -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/6] PPC: Add asm helpers for BE 32bit load/store
On 17.06.14 02:51, Paul Mackerras wrote: On Wed, Jun 11, 2014 at 12:33:46PM +0200, Alexander Graf wrote: >From assembly code we might not only have to explicitly BE access 64bit values, but sometimes also 32bit ones. Add helpers that allow for easy use of lwzx/stwx in their respective byte-reverse or native form. Signed-off-by: Alexander Graf CC: Benjamin Herrenschmidt --- arch/powerpc/include/asm/asm-compat.h | 4 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h index 4b237aa..21164ff 100644 --- a/arch/powerpc/include/asm/asm-compat.h +++ b/arch/powerpc/include/asm/asm-compat.h @@ -34,10 +34,14 @@ #define PPC_MIN_STKFRM112 #ifdef __BIG_ENDIAN__ +#define LWZX_BEstringify_in_c(lwzx) #define LDX_BEstringify_in_c(ldx) +#define STDX_BEstringify_in_c(stwx) This should be STWX_BE, shouldn't it? And there I thought I would get away without anyone noticing it :). Yes, of course. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/6] KVM: PPC: Book3S HV: Access XICS in BE
On 17.06.14 03:02, Paul Mackerras wrote: On Wed, Jun 11, 2014 at 12:33:50PM +0200, Alexander Graf wrote: On the exit path from the guest we check what type of interrupt we received if we received one. This means we're doing hardware access to the XICS interrupt controller. However, when running on a little endian system, this access is byte reversed. So let's make sure to swizzle the bytes back again and virtually make XICS accesses big endian. ... @@ -2241,7 +2253,8 @@ kvmppc_read_intr: 42: /* It's not an IPI and it's for the host, stash it in the PACA * before exit, it will be picked up by the host ICP driver */ - stw r0, HSTATE_SAVED_XIRR(r13) + li r4, HSTATE_SAVED_XIRR + STWX_BE r0, r13, r4 This is a paca field, not something mandated by PAPR or shared with the guest, so why do we need to keep it BE? If you do make it BE, don't you also need to fix kvmppc_get_xics_latch()? Yikes. Yes. Thanks a lot for the catch! Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 14.06.14 23:08, Madhavan Srinivasan wrote: This patch adds kernel side support for software breakpoint. Design is that, by using an illegal instruction, we trap to hypervisor via Emulation Assistance interrupt, where we check for the illegal instruction and accordingly we return to Host or Guest. Patch mandates use of "abs" instruction (primary opcode 31 and extended opcode 360) as sw breakpoint instruction. Based on PowerISA v2.01, ABS instruction has been dropped from the architecture and treated an illegal instruction. Signed-off-by: Madhavan Srinivasan --- arch/powerpc/kvm/book3s.c| 3 ++- arch/powerpc/kvm/book3s_hv.c | 23 +++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index c254c27..b40fe5d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,7 +789,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - return -EINVAL; + vcpu->guest_debug = dbg->control; + return 0; } void kvmppc_decrementer_func(unsigned long data) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7a12edb..688421d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,14 @@ /* Used as a "null" value for timebase values */ #define TB_NIL(~(u64)0) +/* + * SW_BRK_DBG_INT is debug Instruction for supporting Software Breakpoint. + * Instruction mnemonic is ABS, primary opcode is 31 and extended opcode is 360. + * Based on PowerISA v2.01, ABS instruction has been dropped from the architecture + * and treated an illegal instruction. + */ +#define SW_BRK_DBG_INT 0x7c0002d0 The instruction we use to trap needs to get exposed to user space via a ONE_REG property. Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. 
+ static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -721,12 +729,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* * This occurs if the guest executes an illegal instruction. -* We just generate a program interrupt to the guest, since -* we don't emulate any guest instructions at this stage. +* To support software breakpoint, we check for the sw breakpoint +* instruction to return back to host, if not we just generate a +* program interrupt to the guest. */ case BOOK3S_INTERRUPT_H_EMUL_ASSIST: - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - r = RESUME_GUEST; + if (vcpu->arch.last_inst == SW_BRK_DBG_INT) { Don't access last_inst directly. Instead use the provided helpers. + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = vcpu->arch.pc; + r = RESUME_HOST; + } else { + kvmppc_core_queue_program(vcpu, 0x8); magic numbers + r = RESUME_GUEST; + } break; /* * This occurs if the guest (kernel or userspace), does something that Please enable PR KVM as well while you're at it. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On vcpu schedule, the condition checked for tlb pollution is too loose. The tlb entries of a vcpu become polluted (vs stale) only when a different vcpu within the same logical partition runs in-between. Optimize the tlb invalidation condition taking into account the logical partition id. With the new invalidation condition, a guest shows 4% performance improvement on P5020DS while running a memory stress application with the cpu oversubscribed, the other guest running a cpu intensive workload. Guest - old invalidation condition real 3.89 user 3.87 sys 0.01 Guest - enhanced invalidation condition real 3.75 user 3.73 sys 0.01 Host real 3.70 user 1.85 sys 0.00 The memory stress application accesses 4KB pages backed by 75% of available TLB0 entries: char foo[ENTRIES][4096] __attribute__ ((aligned (4096))); int main() { char bar; int i, j; for (i = 0; i < ITERATIONS; i++) for (j = 0; j < ENTRIES; j++) bar = foo[j][0]; return 0; } Signed-off-by: Mihai Caraman Cc: Scott Wood --- v2: - improve patch name and description - add performance results arch/powerpc/kvm/e500mc.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 17e4562..d3b814b0 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -111,10 +111,12 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) } static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); +static DEFINE_PER_CPU(int, last_lpid_on_cpu); static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + bool update_last = false, inval_tlb = false; kvmppc_booke_vcpu_load(vcpu, cpu); @@ -140,12 +142,24 @@ static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_GDEAR, vcpu->arch.shared->dar); mtspr(SPRN_GESR, vcpu->arch.shared->esr); - if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || - __get_cpu_var(last_vcpu_on_cpu) != vcpu) { - 
kvmppc_e500_tlbil_all(vcpu_e500); + if (vcpu->arch.oldpir != mfspr(SPRN_PIR)) { + /* stale tlb entries */ + inval_tlb = update_last = true; + } else if (__get_cpu_var(last_vcpu_on_cpu) != vcpu) { + update_last = true; + /* polluted tlb entries */ + inval_tlb = __get_cpu_var(last_lpid_on_cpu) == + vcpu->kvm->arch.lpid; + } + + if (update_last) { __get_cpu_var(last_vcpu_on_cpu) = vcpu; + __get_cpu_var(last_lpid_on_cpu) = vcpu->kvm->arch.lpid; } + if (inval_tlb) + kvmppc_e500_tlbil_all(vcpu_e500); + kvmppc_load_guest_fp(vcpu); } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Why I advise against using ivshmem
Hello all, On 06/17/2014 04:54 AM, Stefan Hajnoczi wrote: ivshmem has a performance disadvantage for guest-to-host communication. Since the shared memory is exposed as PCI BARs, the guest has to memcpy into the shared memory. vhost-user can access guest memory directly and avoid the copy inside the guest. Actually, you can avoid this memory copy using frameworks like DPDK. Unless someone steps up and maintains ivshmem, I think it should be deprecated and dropped from QEMU. Then I can maintain ivshmem for QEMU. If this is ok, I will send a patch for MAINTAINERS file. -- David Marchand -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: market share??
On Thu, Jun 12, 2014 at 06:48:14PM +0200, urgrue wrote: > Does anyone have ANY idea where I could find out about market share and > adoption rates of linux hypervisors (kvm vs xen vs vmware, mainly)? > I've not found anything despite extensive searching, which is really kind of > problematic when you're trying to convince the bosses... A few years ago IDC published this analysis of KVM. A lot has changed since then but it's fun to look back and read: http://www-03.ibm.com/systems/resources/systems_virtualization_idc__kvmforservervirtualization.pdf For example, it says KVM has opportunities in the cloud. Today KVM is the default and best supported OpenStack hypervisor. I would focus on more than just KVM (the hypervisor). Consider what sort of virtualization solution you need. Are you doing server consolidation inside your organization? Are you doing virtual server hosting? Are you doing virtual desktop infrastructure? Then look at the management tools and ecosystem for that problem space. It doesn't matter how good the hypervisor is if your management tooling is a pain to use or doesn't support the storage or networking architecture you use. Next, evaluate it. KVM is free, try it out, prove it works. At that point you'll know which choice is right for you. Stefan pgpuijseQchYk.pgp Description: PGP signature
Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule
On 13.06.14 21:42, Scott Wood wrote: On Fri, 2014-06-13 at 16:55 +0200, Alexander Graf wrote: On 13.06.14 16:43, mihai.cara...@freescale.com wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, June 12, 2014 8:05 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org; Wood Scott-B07421 Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule On 06/12/2014 04:00 PM, Mihai Caraman wrote: On vcpu schedule, the condition checked for tlb pollution is too tight. The tlb entries of one vcpu are polluted when a different vcpu from the same partition runs in-between. Relax the current tlb invalidation condition taking into account the lpid. Can you quantify the performance improvement from this? We've had bugs in this area before, so let's make sure it's worth it before making this more complicated. Signed-off-by: Mihai Caraman freescale.com> Your mailer is broken? :) This really should be an @. I think this should work. Scott, please ack. Alex, you were right. I screwed up the patch description by inverting relax and tight terms :) It should have been more like this: KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule On vcpu schedule, the condition checked for tlb pollution is too loose. The tlb entries of a vcpu are polluted (vs stale) only when a different vcpu within the same logical partition runs in-between. Optimize the tlb invalidation condition taking into account the lpid. Can't we give every vcpu its own lpid? Or don't we trap on global invalidates? That would significantly increase the odds of exhausting LPIDs, especially on large chips like t4240 with similarly large VMs. If we were to do that, the LPIDs would need to be dynamically assigned (like PIDs), and should probably be a separate numberspace per physical core. True, I didn't realize we only have so few of them. 
It would however save us from most flushing as long as we have spare LPIDs available :). Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: Book3s PR: Disable AIL mode with OPAL
On 12.06.14 05:56, Paul Mackerras wrote: On Tue, Jun 10, 2014 at 07:23:00PM +0200, Alexander Graf wrote: When we're using PR KVM we must not allow the CPU to take interrupts in virtual mode, as the SLB does not contain host kernel mappings when running inside the guest context. To make sure we get good performance for non-KVM tasks but still properly functioning PR KVM, let's just disable AIL whenever a vcpu is scheduled in. This patch fixes running PR KVM on POWER8 bare metal for me. We already handle this for the situation where we're running under a hypervisor with the calls to pSeries_disable_reloc_on_exc() and pSeries_enable_reloc_on_exc() in kvmppc_core_init_vm_pr() and kvmppc_core_destroy_vm_pr() respectively. The obvious approach to fixing this problem would be to generalize those calls, perhaps via a ppc_md callback, to work on the powernv platform too. If you don't want to do that, for instance because those calls are defined to operate across the whole machine rather than a single CPU thread, and you prefer to affect just the one thread we're running on, then I think you need to explain that in the commit message. It's what I've done at first, yes. Unfortunately the pSeries call is system global, while we need to do the AIL switching per cpu on bare metal. Once you start considering CPU hotplug and how that affects the secondary bringup paths you start wondering whether it's worth the hassle :). This way you can have a single guest running which only slows down (disables AIL) on its own vcpu, while all the others still enjoy the benefits of AIL. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: > > Also, why don't we use twi always or something else that actually is > defined as illegal instruction? I would like to see this shared with > book3s_32 PR. twi will be directed to the guest on HV no ? We want a real illegal because those go to the host (for potential emulation by the HV). I'm trying to see if I can get the architect to set one in stone in a future proof way. Cheers, Ben. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 17.06.14 11:22, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. twi will be directed to the guest on HV no ? We want a real illegal because those go to the host (for potential emulation by the HV). Ah, good point. I guess we need different one for PR and HV then to ensure compatibility with older ISAs on PR. Alex I'm trying to see if I can get the architect to set one in stone in a future proof way. Cheers, Ben. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tue, 2014-06-17 at 11:25 +0200, Alexander Graf wrote: > On 17.06.14 11:22, Benjamin Herrenschmidt wrote: > > On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: > >> Also, why don't we use twi always or something else that actually is > >> defined as illegal instruction? I would like to see this shared with > >> book3s_32 PR. > > twi will be directed to the guest on HV no ? We want a real illegal > > because those go to the host (for potential emulation by the HV). > > Ah, good point. I guess we need different one for PR and HV then to > ensure compatibility with older ISAs on PR. Well, we also need to be careful with what happens if a PR guest puts that instruction in, do that stop its HV guest/host ? What if it's done in userspace ? Do that stop the kernel ? :-) Maddy, I haven't checked, does your patch ensure that we only ever stop if the instruction is at a recorded bkpt address ? It still means that a userspace process can practically DOS its kernel by issuing a lot of these causing a crapload of exits. Cheers, Ben. > Alex > > > I'm > > trying to see if I can get the architect to set one in stone in a future > > proof way. > > > > Cheers, > > Ben. > > > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 17.06.14 11:32, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 11:25 +0200, Alexander Graf wrote: On 17.06.14 11:22, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. twi will be directed to the guest on HV no ? We want a real illegal because those go to the host (for potential emulation by the HV). Ah, good point. I guess we need different one for PR and HV then to ensure compatibility with older ISAs on PR. Well, we also need to be careful with what happens if a PR guest puts that instruction in, do that stop its HV guest/host ? What if it's done in userspace ? Do that stop the kernel ? :-) The way SW breakpointing is handled is that when we see one, it gets deflected into user space. User space then has an array of breakpoints it configured itself. If the breakpoint is part of that list, it consumes it. If not, it injects a debug interrupt (program in this case) into the guest. That way we can overlay that one instruction with as many layers as we like :). We only get a performance hit on execution of that instruction. Maddy, I haven't checked, does your patch ensure that we only ever stop if the instruction is at a recorded bkpt address ? It still means that a userspace process can practically DOS its kernel by issuing a lot of these causing a crapload of exits. Only user space knows about its breakpoint addresses, so we have to deflect. However since time still ticks on, we only increase jitter of the guest. The process would still get scheduled away after the same amount of real time, no? Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Why I advise against using ivshmem
Il 17/06/2014 11:03, David Marchand ha scritto: Unless someone steps up and maintains ivshmem, I think it should be deprecated and dropped from QEMU. Then I can maintain ivshmem for QEMU. If this is ok, I will send a patch for MAINTAINERS file. Typically, adding yourself to maintainers is done only after having proved your ability to be a maintainer. :) So, let's stop talking and go back to code! You can start doing what was suggested elsewhere in the thread: get the server and uio driver merged into the QEMU tree, document the protocol in docs/specs/ivshmem_device_spec.txt, and start fixing bugs such as the ones that Markus reported. Since ivshmem is basically KVM-only (it has a soft dependency on ioeventfd), CC the patches to kvm@vger.kernel.org and I'll merge them via the KVM tree for now. I'll (more than) gladly give maintainership away in due time. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] KVM: PPC: Book3s PR: Disable AIL mode with OPAL
When we're using PR KVM we must not allow the CPU to take interrupts in virtual mode, as the SLB does not contain host kernel mappings when running inside the guest context. To make sure we get good performance for non-KVM tasks but still properly functioning PR KVM, let's just disable AIL whenever a vcpu is scheduled in. This is fundamentally different from how we deal with AIL on pSeries type machines where we disable AIL for the whole machine as soon as a single KVM VM is up. The reason for that is easy - on pSeries we do not have control over per-cpu configuration of AIL. We also don't want to mess with CPU hotplug races and AIL configuration, so setting it per CPU is easier and more flexible. This patch fixes running PR KVM on POWER8 bare metal for me. Signed-off-by: Alexander Graf --- v1 -> v2: - Extend patch description diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 3da412e..8ea7da4 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -71,6 +71,12 @@ static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) svcpu->in_use = 0; svcpu_put(svcpu); #endif + + /* Disable AIL if supported */ + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_207S)) + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_AIL); + vcpu->cpu = smp_processor_id(); #ifdef CONFIG_PPC_BOOK3S_32 current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu; @@ -91,6 +97,12 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); + + /* Enable AIL if supported */ + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_207S)) + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_AIL_3); + vcpu->cpu = -1; } -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] KVM: PPC: Book3s PR: Disable AIL mode with OPAL
On Tue, Jun 17, 2014 at 12:03:58PM +0200, Alexander Graf wrote: > When we're using PR KVM we must not allow the CPU to take interrupts > in virtual mode, as the SLB does not contain host kernel mappings > when running inside the guest context. > > To make sure we get good performance for non-KVM tasks but still > properly functioning PR KVM, let's just disable AIL whenever a vcpu > is scheduled in. > > This is fundamentally different from how we deal with AIL on pSeries > type machines where we disable AIL for the whole machine as soon as > a single KVM VM is up. > > The reason for that is easy - on pSeries we do not have control over > per-cpu configuration of AIL. We also don't want to mess with CPU hotplug > races and AIL configuration, so setting it per CPU is easier and more > flexible. > > This patch fixes running PR KVM on POWER8 bare metal for me. > > Signed-off-by: Alexander Graf Acked-by: Paul Mackerras -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/6] KVM: PPC: Book3S HV: Access XICS in BE
On 17.06.14 10:37, Alexander Graf wrote: On 17.06.14 03:02, Paul Mackerras wrote: On Wed, Jun 11, 2014 at 12:33:50PM +0200, Alexander Graf wrote: On the exit path from the guest we check what type of interrupt we received if we received one. This means we're doing hardware access to the XICS interrupt controller. However, when running on a little endian system, this access is byte reversed. So let's make sure to swizzle the bytes back again and virtuall make XICS accesses big endian. ... @@ -2241,7 +2253,8 @@ kvmppc_read_intr: 42:/* It's not an IPI and it's for the host, stash it in the PACA * before exit, it will be picked up by the host ICP driver */ -stwr0, HSTATE_SAVED_XIRR(r13) +lir4, HSTATE_SAVED_XIRR +STWX_BEr0, r13, r4 This is a paca field, not something mandated by PAPR or shared with the guest, so why do we need to keep it BE? If you do make it BE, don't you also need to fix kvmppc_get_xics_latch()? Yikes. Yes. Thanks a lot for the catch! Eh, no. What we do is we read (good on BE, byte reversed) into r0. Then we swab32() from r0 to r3 on LE, mr from r0 to r3 on BE. r3 gets truncated along the way. The reason we maintain r0 as wrong-endian is that we write it back using the cache inhibited stwcix instruction: stwcix r0, r6, r7 /* EOI it */ So during the lifetime of r0 as XIRR it's always byte-reversed on LE. That's why we store it using STWX_BE into hstate, because that's the time when we actually swab32() it for further interpretation. Alternatively I could clobber a different register and maintain the byte swapped variant in there if you prefer. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Divide error in kvm_unlock_kick()
I see kernel 3.15 is now out, so I retested with 3.15 guest and host. I'm still getting exactly the same guest kernel panic: a divide error in kvm_unlock_kick with -cpu host, but not with -cpu qemu64: divide error: [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 781 Comm: mkdir Not tainted 3.15.0-guest #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Bochs 01/01/2011 task: 88007cbf6180 ti: 88088000 task.ti: 88088000 RIP: 0010:[] [] kvm_unlock_kick+0x63/0x6b RSP: :88007fc83d38 EFLAGS: 00010046 RAX: 0005 RBX: RCX: 0002 RDX: 0002 RSI: 88007fd11d80 RDI: 81994840 RBP: 88007fd11d80 R08: R09: 81994840 R10: 88007c480c88 R11: 0005 R12: cec0 R13: 88007d38332a R14: 0002 R15: 88007d382d00 FS: 7fdabf7fd700() GS:88007fc8() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7fd0643f6509 CR3: 7c028000 CR4: 000406e0 Stack: 00011d80 0002 88007fd11d80 8156f83f 810dba53 0046 88007fd0 88007d3bbe70 81845da8 0003 Call Trace: [] ? _raw_spin_unlock+0x32/0x55 [] ? try_to_wake_up+0x1ed/0x20f [] ? autoremove_wake_function+0x9/0x2a [] ? __wake_up_common+0x47/0x73 [] ? __wake_up+0x33/0x44 [] ? irq_work_run+0x72/0x8f [] ? smp_irq_work_interrupt+0x26/0x2b [] ? irq_work_interrupt+0x6d/0x80 [] ? try_to_wake_up+0x1fe/0x20f [] ? native_apic_msr_read+0x6/0x4e [] ? _raw_spin_unlock_irqrestore+0x3d/0x65 [] ? rcu_process_callbacks+0x15e/0x47d [] ? execute_in_process_context+0x55/0x55 [] ? __do_softirq+0xe0/0x1e6 [] ? irq_exit+0x3c/0x81 [] ? smp_apic_timer_interrupt+0x3b/0x46 [] ? 
apic_timer_interrupt+0x6d/0x80 Code: 0c c5 c0 b8 87 81 49 8d 04 0c 48 8b 30 48 39 ee 75 ca 8a 40 08 38 d8 75 c3 48 c7 c0 22 b0 00 00 31 db 0f b7 0c 08 b8 05 00 00 00 <0f> 01 c1 5b 5d 41 5c c3 4c 8d 54 24 08 48 83 e4 f0 b9 0a 00 00 RIP [] kvm_unlock_kick+0x63/0x6b RSP ---[ end trace 949b1bf47cc57d09 ]--- Kernel panic - not syncing: Fatal exception in interrupt Shutting down cpus with NMI Kernel Offset: 0x0 from 0x8100 (relocation range: 0x8000-0x9fff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt I'm at a complete loss as to what to do next to debug this. Any help would be extremely gratefully received! I've put 3.15 host and guest configs here: http://cdw.me.uk/tmp/3.15-guest-config.txt http://cdw.me.uk/tmp/3.15-host-config.txt dmesg just after boot here: http://cdw.me.uk/tmp/3.15-guest-dmesg.txt http://cdw.me.uk/tmp/3.15-host-dmesg.txt and /proc/cpuinfo from both host and guest here: http://cdw.me.uk/tmp/3.15-guest-cpuinfo.txt http://cdw.me.uk/tmp/3.15-host-cpuinfo.txt The qemu command line was qemu-system-x86 -enable-kvm -cpu host -machine q35 -m 2048 -name omega \ -smp sockets=1,cores=4 -pidfile /run/omega.pid -runas nobody \ -serial stdio -vga none -vnc none -kernel /boot/vmlinuz-guest \ -append "console=ttyS0 root=/dev/vda" \ -drive file=/dev/guest/omega,cache=none,format=raw,if=virtio \ -device virtio-rng-pci \ -device virtio-net-pci,netdev=nic,mac=02:14:72:3c:69:54 \ -netdev tap,id=nic,fd=3,vhost=on 3<>/dev/tapNNN but removing the -machine q35 and -device virtio-rng-pci doesn't affect the crash. Dropping to -smp 1, running with -cpu qemu64, or compiling the guest kernel without paravirtualised spinlock support does remove the panic, albeit at the cost of performance. Best wishes, Chris. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tuesday 17 June 2014 02:24 PM, Alexander Graf wrote: > > On 14.06.14 23:08, Madhavan Srinivasan wrote: >> This patch adds kernel side support for software breakpoint. >> Design is that, by using an illegal instruction, we trap to hypervisor >> via Emulation Assistance interrupt, where we check for the illegal >> instruction >> and accordingly we return to Host or Guest. Patch mandates use of >> "abs" instruction >> (primary opcode 31 and extended opcode 360) as sw breakpoint instruction. >> Based on PowerISA v2.01, ABS instruction has been dropped from the >> architecture >> and treated an illegal instruction. >> >> Signed-off-by: Madhavan Srinivasan >> --- >> arch/powerpc/kvm/book3s.c| 3 ++- >> arch/powerpc/kvm/book3s_hv.c | 23 +++ >> 2 files changed, 21 insertions(+), 5 deletions(-) >> >> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c >> index c254c27..b40fe5d 100644 >> --- a/arch/powerpc/kvm/book3s.c >> +++ b/arch/powerpc/kvm/book3s.c >> @@ -789,7 +789,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu >> *vcpu, >> int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, >> struct kvm_guest_debug *dbg) >> { >> -return -EINVAL; >> +vcpu->guest_debug = dbg->control; >> +return 0; >> } >> void kvmppc_decrementer_func(unsigned long data) >> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c >> index 7a12edb..688421d 100644 >> --- a/arch/powerpc/kvm/book3s_hv.c >> +++ b/arch/powerpc/kvm/book3s_hv.c >> @@ -67,6 +67,14 @@ >> /* Used as a "null" value for timebase values */ >> #define TB_NIL(~(u64)0) >> +/* >> + * SW_BRK_DBG_INT is debug Instruction for supporting Software >> Breakpoint. >> + * Instruction mnemonic is ABS, primary opcode is 31 and extended >> opcode is 360. >> + * Based on PowerISA v2.01, ABS instruction has been dropped from the >> architecture >> + * and treated an illegal instruction. 
>> + */ >> +#define SW_BRK_DBG_INT 0x7c0002d0 > > The instruction we use to trap needs to get exposed to user space via a > ONE_REG property. > Yes. I got to know about that from Bharat (patchset "ppc debug: Add debug stub support"). I will change it. > Also, why don't we use twi always or something else that actually is > defined as illegal instruction? I would like to see this shared with > book3s_32 PR. > >> + >> static void kvmppc_end_cede(struct kvm_vcpu *vcpu); >> static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); >> @@ -721,12 +729,19 @@ static int kvmppc_handle_exit_hv(struct >> kvm_run *run, struct kvm_vcpu *vcpu, >> break; >> /* >>* This occurs if the guest executes an illegal instruction. >> - * We just generate a program interrupt to the guest, since >> - * we don't emulate any guest instructions at this stage. >> + * To support software breakpoint, we check for the sw breakpoint >> + * instruction to return back to host, if not we just generate a >> + * program interrupt to the guest. >>*/ >> case BOOK3S_INTERRUPT_H_EMUL_ASSIST: >> -kvmppc_core_queue_program(vcpu, SRR1_PROGILL); >> -r = RESUME_GUEST; >> +if (vcpu->arch.last_inst == SW_BRK_DBG_INT) { > > Don't access last_inst directly. Instead use the provided helpers. > Ok. Will look and replace it. >> +run->exit_reason = KVM_EXIT_DEBUG; >> +run->debug.arch.address = vcpu->arch.pc; >> +r = RESUME_HOST; >> +} else { >> +kvmppc_core_queue_program(vcpu, 0x8); > > magic numbers ^ I did not understand this? >> +r = RESUME_GUEST; >> +} >> break; >> /* >>* This occurs if the guest (kernel or userspace), does >> something that > > Please enable PR KVM as well while you're at it. > My bad, I did not try the PR KVM. I will try it out. > > Alex > Thanks for review Regards Maddy -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 17.06.14 13:07, Madhavan Srinivasan wrote: On Tuesday 17 June 2014 02:24 PM, Alexander Graf wrote: On 14.06.14 23:08, Madhavan Srinivasan wrote: This patch adds kernel side support for software breakpoint. Design is that, by using an illegal instruction, we trap to hypervisor via Emulation Assistance interrupt, where we check for the illegal instruction and accordingly we return to Host or Guest. Patch mandates use of "abs" instruction (primary opcode 31 and extended opcode 360) as sw breakpoint instruction. Based on PowerISA v2.01, ABS instruction has been dropped from the architecture and treated an illegal instruction. Signed-off-by: Madhavan Srinivasan --- arch/powerpc/kvm/book3s.c| 3 ++- arch/powerpc/kvm/book3s_hv.c | 23 +++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index c254c27..b40fe5d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,7 +789,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { -return -EINVAL; +vcpu->guest_debug = dbg->control; +return 0; } void kvmppc_decrementer_func(unsigned long data) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7a12edb..688421d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,14 @@ /* Used as a "null" value for timebase values */ #define TB_NIL(~(u64)0) +/* + * SW_BRK_DBG_INT is debug Instruction for supporting Software Breakpoint. + * Instruction mnemonic is ABS, primary opcode is 31 and extended opcode is 360. + * Based on PowerISA v2.01, ABS instruction has been dropped from the architecture + * and treated an illegal instruction. + */ +#define SW_BRK_DBG_INT 0x7c0002d0 The instruction we use to trap needs to get exposed to user space via a ONE_REG property. Yes. 
I got to know about that from Bharat (patchset "ppc debug: Add debug stub support"). I will change it. Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -721,12 +729,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* * This occurs if the guest executes an illegal instruction. - * We just generate a program interrupt to the guest, since - * we don't emulate any guest instructions at this stage. + * To support software breakpoint, we check for the sw breakpoint + * instruction to return back to host, if not we just generate a + * program interrupt to the guest. */ case BOOK3S_INTERRUPT_H_EMUL_ASSIST: -kvmppc_core_queue_program(vcpu, SRR1_PROGILL); -r = RESUME_GUEST; +if (vcpu->arch.last_inst == SW_BRK_DBG_INT) { Don't access last_inst directly. Instead use the provided helpers. Ok. Will look and replace it. +run->exit_reason = KVM_EXIT_DEBUG; +run->debug.arch.address = vcpu->arch.pc; +r = RESUME_HOST; +} else { +kvmppc_core_queue_program(vcpu, 0x8); magic numbers ^ I did not understand this? You're replacing the readable SRR1_PROGILL with the unreadable 0x8. That's bad. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tuesday 17 June 2014 03:02 PM, Benjamin Herrenschmidt wrote: > On Tue, 2014-06-17 at 11:25 +0200, Alexander Graf wrote: >> On 17.06.14 11:22, Benjamin Herrenschmidt wrote: >>> On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. >>> twi will be directed to the guest on HV no ? We want a real illegal >>> because those go to the host (for potential emulation by the HV). >> >> Ah, good point. I guess we need different one for PR and HV then to >> ensure compatibility with older ISAs on PR. > > Well, we also need to be careful with what happens if a PR guest puts > that instruction in, do that stop its HV guest/host ? > Damn, my mail client is messed up. did not see the mail till now. I havent tried this incase of PR guest kernel. I will need to try this before commenting. > What if it's done in userspace ? Do that stop the kernel ? :-) > Basically flow is that, when we see this instruction, we return to host, and host checks for address in the SW array and if not it returns to kernel. > Maddy, I haven't checked, does your patch ensure that we only ever stop > if the instruction is at a recorded bkpt address ? It still means that a > userspace process can practically DOS its kernel by issuing a lot of > these causing a crapload of exits. > This is valid, userspace can create a mess, need to handle this, meaning incase if we dont find a valid SW breakpoint for this address in the HOST, we need to route it to guest and kill it at app. Regards Maddy > Cheers, > Ben. > >> Alex >> >>> I'm >>> trying to see if I can get the architect to set one in stone in a future >>> proof way. >>> >>> Cheers, >>> Ben. 
>>> >> >> -- >> To unsubscribe from this list: send the line "unsubscribe kvm" in >> the body of a message to majord...@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tuesday 17 June 2014 04:38 PM, Alexander Graf wrote: > > On 17.06.14 13:07, Madhavan Srinivasan wrote: >> On Tuesday 17 June 2014 02:24 PM, Alexander Graf wrote: >>> On 14.06.14 23:08, Madhavan Srinivasan wrote: This patch adds kernel side support for software breakpoint. Design is that, by using an illegal instruction, we trap to hypervisor via Emulation Assistance interrupt, where we check for the illegal instruction and accordingly we return to Host or Guest. Patch mandates use of "abs" instruction (primary opcode 31 and extended opcode 360) as sw breakpoint instruction. Based on PowerISA v2.01, ABS instruction has been dropped from the architecture and treated an illegal instruction. Signed-off-by: Madhavan Srinivasan --- arch/powerpc/kvm/book3s.c| 3 ++- arch/powerpc/kvm/book3s_hv.c | 23 +++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index c254c27..b40fe5d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,7 +789,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { -return -EINVAL; +vcpu->guest_debug = dbg->control; +return 0; } void kvmppc_decrementer_func(unsigned long data) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7a12edb..688421d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,14 @@ /* Used as a "null" value for timebase values */ #define TB_NIL(~(u64)0) +/* + * SW_BRK_DBG_INT is debug Instruction for supporting Software Breakpoint. + * Instruction mnemonic is ABS, primary opcode is 31 and extended opcode is 360. + * Based on PowerISA v2.01, ABS instruction has been dropped from the architecture + * and treated an illegal instruction. + */ +#define SW_BRK_DBG_INT 0x7c0002d0 >>> The instruction we use to trap needs to get exposed to user space via a >>> ONE_REG property. 
>>> >> Yes. I got to know about that from Bharat (patchset "ppc debug: Add >> debug stub support"). I will change it. >> >>> Also, why don't we use twi always or something else that actually is >>> defined as illegal instruction? I would like to see this shared with >>> book3s_32 PR. >>> + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -721,12 +729,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* * This occurs if the guest executes an illegal instruction. - * We just generate a program interrupt to the guest, since - * we don't emulate any guest instructions at this stage. + * To support software breakpoint, we check for the sw breakpoint + * instruction to return back to host, if not we just generate a + * program interrupt to the guest. */ case BOOK3S_INTERRUPT_H_EMUL_ASSIST: -kvmppc_core_queue_program(vcpu, SRR1_PROGILL); -r = RESUME_GUEST; +if (vcpu->arch.last_inst == SW_BRK_DBG_INT) { >>> Don't access last_inst directly. Instead use the provided helpers. >>> >> Ok. Will look and replace it. >> +run->exit_reason = KVM_EXIT_DEBUG; +run->debug.arch.address = vcpu->arch.pc; +r = RESUME_HOST; +} else { +kvmppc_core_queue_program(vcpu, 0x8); >>> magic numbers >> ^ >> I did not understand this? > > You're replacing the readable SRR1_PROGILL with the unreadable 0x8. > That's bad. > Oops. My bad. Will undo that. I guess I messed up when was re basing it. > > Alex > Thanks for review Regards Maddy -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On Tuesday 17 June 2014 03:13 PM, Alexander Graf wrote: > > On 17.06.14 11:32, Benjamin Herrenschmidt wrote: >> On Tue, 2014-06-17 at 11:25 +0200, Alexander Graf wrote: >>> On 17.06.14 11:22, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: > Also, why don't we use twi always or something else that actually is > defined as illegal instruction? I would like to see this shared with > book3s_32 PR. twi will be directed to the guest on HV no ? We want a real illegal because those go to the host (for potential emulation by the HV). >>> Ah, good point. I guess we need different one for PR and HV then to >>> ensure compatibility with older ISAs on PR. >> Well, we also need to be careful with what happens if a PR guest puts >> that instruction in, do that stop its HV guest/host ? >> >> What if it's done in userspace ? Do that stop the kernel ? :-) > > The way SW breakpointing is handled is that when we see one, it gets > deflected into user space. User space then has an array of breakpoints > it configured itself. If the breakpoint is part of that list, it > consumes it. If not, it injects a debug interrupt (program in this case) > into the guest. > > That way we can overlay that one instruction with as many layers as we > like :). We only get a performance hit on execution of that instruction. > >> Maddy, I haven't checked, does your patch ensure that we only ever stop >> if the instruction is at a recorded bkpt address ? It still means that a >> userspace process can practically DOS its kernel by issuing a lot of >> these causing a crapload of exits. > > Only user space knows about its breakpoint addresses, so we have to > deflect. However since time still ticks on, we only increase jitter of > the guest. The process would still get scheduled away after the same ^^^ Where is this taken care. I am still trying to understand. Kindly can you explain or point to the code. Will help. > amount of real time, no? 
> > > Alex > Thanks for review. Regards Maddy -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
KVM call for 2014-06-24
Hi Please, send any topic that you are interested in covering. * Machine as QOM (Marcel) (repeat) - Solve the 'sensible' issues. - Choose as first tasks the ones that will be more useful. - Select solutions agreed by everyone. Thanks, Juan. Call details: 15:00 CEST 13:00 UTC 09:00 EDT Every two weeks If you need phone number details, contact me privately. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 17.06.14 13:20, Madhavan Srinivasan wrote: On Tuesday 17 June 2014 03:13 PM, Alexander Graf wrote: On 17.06.14 11:32, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 11:25 +0200, Alexander Graf wrote: On 17.06.14 11:22, Benjamin Herrenschmidt wrote: On Tue, 2014-06-17 at 10:54 +0200, Alexander Graf wrote: Also, why don't we use twi always or something else that actually is defined as illegal instruction? I would like to see this shared with book3s_32 PR. twi will be directed to the guest on HV no ? We want a real illegal because those go to the host (for potential emulation by the HV). Ah, good point. I guess we need different one for PR and HV then to ensure compatibility with older ISAs on PR. Well, we also need to be careful with what happens if a PR guest puts that instruction in, do that stop its HV guest/host ? What if it's done in userspace ? Do that stop the kernel ? :-) The way SW breakpointing is handled is that when we see one, it gets deflected into user space. User space then has an array of breakpoints it configured itself. If the breakpoint is part of that list, it consumes it. If not, it injects a debug interrupt (program in this case) into the guest. That way we can overlay that one instruction with as many layers as we like :). We only get a performance hit on execution of that instruction. Maddy, I haven't checked, does your patch ensure that we only ever stop if the instruction is at a recorded bkpt address ? It still means that a userspace process can practically DOS its kernel by issuing a lot of these causing a crapload of exits. Only user space knows about its breakpoint addresses, so we have to deflect. However since time still ticks on, we only increase jitter of the guest. The process would still get scheduled away after the same ^^^ Where is this taken care. I am still trying to understand. Kindly can you explain or point to the code. Will help. We tell the guest via VPA about its steal time which includes QEMU time. 
Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Coupling between KVM_IRQFD and KVM_SET_GSI_ROUTING?
Hello, I have a question related to the relationship between the KVM_IRQFD and KVM_SET_GSI_ROUTING ioctls. When reading the KVM API documentation I do not see that there is any dependency between KVM_IRQFD and KVM_SET_GSI_ROUTING. According to the text it seems only the gsi field is used and interpreted as the irqchip pin. However irqchip.c kvm_set_irq code relies on an existing and not dummy routing table. My question is: does anyone agree on the fact the user-side must set a consistent routing table using KVM_SET_GSI_ROUTING before using KVM_IRQFD? The other alternative would have been to build a default identity GSI routing table in the kernel (gsi = irqchip.pin). If so, shouldn't we clarify the KVM API documentation? Thank you in advance Best Regards Eric -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] ARM: KVM: add irqfd and irq routing support
On 06/06/2014 02:16 PM, Christoffer Dall wrote: > On Mon, Jun 02, 2014 at 02:54:12PM +0100, Marc Zyngier wrote: >> Hi Eric, >> >> On Mon, Jun 02 2014 at 8:29:56 am BST, Eric Auger >> wrote: >>> This patch enables irqfd and irq routing on ARM. >>> >>> It turns on CONFIG_HAVE_KVM_EVENTFD and CONFIG_HAVE_KVM_IRQ_ROUTING >>> >>> irqfd framework enables to assign physical IRQs to guests. >>> >>> 1) user-side uses KVM_IRQFD VM ioctl to pass KVM a kvm_irqfd struct that >>> associates a VM, an eventfd, an IRQ number (aka. the GSI). When an actor >>> signals the eventfd (typically a VFIO platform driver), the irqfd subsystem >>> injects the specified IRQ into the VM (the "GSI" takes the semantic of a >>> virtual IRQ for that guest). >> >> Just so I can understand how this works: Who EOIs (handles) the physical >> interrupt? If it is the VFIO driver, then I don't see how you prevent >> the interrupt from firing again immediately (unless this is an edge >> interrupt?). >> >>> 2) the other use case is user-side does 1) and uses KVM_SET_GSI_ROUTING >>> VM ioctl to create an association between a VM, a physical IRQ (aka GSI) and >>> a virtual IRQ (aka irchip.pin). This creates a so-called GSI routing entry. >>> When someone triggers the eventfd, irqfd handles it but uses the specified >>> routing and eventually injects irqchip.pin virtual IRQ into the guest. In >>> that >>> context the GSI takes the semantic of a physical IRQ while the irqchip.pin >>> takes the semantic of a virtual IRQ. >>> >>> in 1) routing is used by irqfd but an identity routing is created by default >>> making the gsi = irqchip.pin. Note on ARM there is a single interrupt >>> controller kind, the GIC. >>> >>> GSI routing mostly is implemented in generic irqchip.c. >>> The tiny ARM specific part is directly implemented in the virtual interrupt >>> controller (vgic.c) as it is done for powerpc for instance. This option was >>> prefered compared to implementing other #ifdef in irq_comm.c (x86 and ia64). 
>>> Hence irq_comm.c is not used at all. >>> >>> Routing currently is not used for anything else than irqfd IRQ injection. >>> Only >>> SPI can be injected. This means the vgic is not totally hidden behind the >>> irqchip. There are separate discussions on PPI/SGI routing. >>> >>> Only level sensitive IRQs are supported (with a registered resampler). As a >>> reminder the resampler is a second eventfd called by irqfd framework when >>> the >>> virtual IRQ is completed by the guest. This eventfd is supposed to be >>> handled >>> on user-side >>> >>> MSI routing is not supported yet. >>> >>> This work was tested with Calxeda Midway xgmac main interrupt (with and >>> without >>> explicit user routing) with qemu-system-arm and QEMU VFIO platform device. >>> >>> changes v1 -> v2: >>> 2 fixes: >>> - v1 assumed gsi/irqchip.pin was already incremented by >>> VGIC_NR_PRIVATE_IRQS. >>> This is now vgic_set_assigned_irq that increments it before injection. >>> - v2 now handles the case where a pending assigned irq is cleared through >>> MMIO access. The irq is properly acked allowing the resamplefd handler >>> to possibly unmask the physical IRQ. >>> >>> Signed-off-by: Eric Auger >>> >>> Conflicts: >>> Documentation/virtual/kvm/api.txt >>> arch/arm/kvm/Kconfig >>> >>> Conflicts: >>> Documentation/virtual/kvm/api.txt >>> --- >>> Documentation/virtual/kvm/api.txt | 4 +- >>> arch/arm/include/uapi/asm/kvm.h | 8 +++ >>> arch/arm/kvm/Kconfig | 2 + >>> arch/arm/kvm/Makefile | 1 + >>> arch/arm/kvm/irq.h| 25 +++ >>> virt/kvm/arm/vgic.c | 141 >>> -- >>> 6 files changed, 174 insertions(+), 7 deletions(-) >>> create mode 100644 arch/arm/kvm/irq.h >>> >>> diff --git a/Documentation/virtual/kvm/api.txt >>> b/Documentation/virtual/kvm/api.txt >>> index b4f5365..b376334 100644 >>> --- a/Documentation/virtual/kvm/api.txt >>> +++ b/Documentation/virtual/kvm/api.txt >>> @@ -1339,7 +1339,7 @@ KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or >>> guest IRQ is allowed. 
>>> 4.52 KVM_SET_GSI_ROUTING >>> >>> Capability: KVM_CAP_IRQ_ROUTING >>> -Architectures: x86 ia64 s390 >>> +Architectures: x86 ia64 s390 arm >>> Type: vm ioctl >>> Parameters: struct kvm_irq_routing (in) >>> Returns: 0 on success, -1 on error >>> @@ -2126,7 +2126,7 @@ into the hash PTE second double word). >>> 4.75 KVM_IRQFD >>> >>> Capability: KVM_CAP_IRQFD >>> -Architectures: x86 s390 >>> +Architectures: x86 s390 arm >>> Type: vm ioctl >>> Parameters: struct kvm_irqfd (in) >>> Returns: 0 on success, -1 on error >>> diff --git a/arch/arm/include/uapi/asm/kvm.h >>> b/arch/arm/include/uapi/asm/kvm.h >>> index ef0c878..89b864d 100644 >>> --- a/arch/arm/include/uapi/asm/kvm.h >>> +++ b/arch/arm/include/uapi/asm/kvm.h >>> @@ -192,6 +192,14 @@ struct kvm_arch_memory_slot { >>> /* Highest supported SPI, from VGIC_NR_IRQS */ >>> #define KVM_ARM_IRQ_GIC_MAX
Re: [PATCH] kvm-unit-tests/x86: fix unittests.cfg typo
On Thu, Apr 17, 2014 at 04:17:42PM +0200, Andrew Jones wrote: > Signed-off-by: Andrew Jones > --- > x86/unittests.cfg | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/x86/unittests.cfg b/x86/unittests.cfg > index 7930c026a38d6..d78fe0eafe2b6 100644 > --- a/x86/unittests.cfg > +++ b/x86/unittests.cfg > @@ -156,5 +156,5 @@ extra_params = -cpu host,+vmx > arch = x86_64 > > [debug] > -file = debug.flag > +file = debug.flat > arch = x86_64 > -- > 1.8.1.4 > ping? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule
> -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Tuesday, June 17, 2014 12:09 PM > To: Wood Scott-B07421 > Cc: Caraman Mihai Claudiu-B02008; kvm-...@vger.kernel.org; > kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org > Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition > on vcpu schedule > > > On 13.06.14 21:42, Scott Wood wrote: > > On Fri, 2014-06-13 at 16:55 +0200, Alexander Graf wrote: > >> On 13.06.14 16:43, mihai.cara...@freescale.com wrote: > -Original Message- > From: Alexander Graf [mailto:ag...@suse.de] > Sent: Thursday, June 12, 2014 8:05 PM > To: Caraman Mihai Claudiu-B02008 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org; Wood Scott-B07421 > Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation > condition > on vcpu schedule > > On 06/12/2014 04:00 PM, Mihai Caraman wrote: > > On vcpu schedule, the condition checked for tlb pollution is too > tight. > > The tlb entries of one vcpu are polluted when a different vcpu from > the > > same partition runs in-between. Relax the current tlb invalidation > > condition taking into account the lpid. > > Can you quantify the performance improvement from this? We've had bugs > > in this area before, so let's make sure it's worth it before making > this > > more complicated. > > > > Signed-off-by: Mihai Caraman freescale.com> > Your mailer is broken? :) > This really should be an @. > > I think this should work. Scott, please ack. > >>> Alex, you were right. I screwed up the patch description by inverting > relax > >>> and tight terms :) It should have been more like this: > >>> > >>> KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule > >>> > >>> On vcpu schedule, the condition checked for tlb pollution is too > loose. > >>> The tlb entries of a vcpu are polluted (vs stale) only when a > different vcpu > >>> within the same logical partition runs in-between. 
Optimize the tlb > invalidation > >>> condition taking into account the lpid. > >> Can't we give every vcpu its own lpid? Or don't we trap on global > >> invalidates? > > That would significantly increase the odds of exhausting LPIDs, > > especially on large chips like t4240 with similarly large VMs. If we > > were to do that, the LPIDs would need to be dynamically assigned (like > > PIDs), and should probably be a separate numberspace per physical core. > > True, I didn't realize we only have so few of them. It would however > save us from most flushing as long as we have spare LPIDs available :). Yes, we had this proposal on the table for e6500 multithreaded core. This core lacks tlb write conditional instruction, so an OS needs to use locks to protect itself against concurrent tlb writes executed from sibling threads. When we expose hw threads as single-threaded vcpus (useful when the user opts not to pin vcpus), the guest can no longer protect itself optimally (it can protect tlb writes across all threads but this is not acceptable). So instead, we found a solution at hypervisor level by assigning different logical partition ids to guest's vcpus running simultaneously on sibling hw threads. Currently in FSL SDK we allocate two lpids to each guest. I am also a proponent of using all LPID space (63 values) per (multi-threaded) physical core, which will lead to fewer invalidates on vcpu schedule and will accommodate the solution described above. -Mike
Re: KVM call for 2014-06-24
Juan Quintela wrote: > Hi I sent the first mail to the wrong address (please notice that qemu-devel was missing). Fixed in this resend. Sorry for the inconvenience. Later, Juan. > Please, send any topic that you are interested in covering. > > * Machine as QOM (Marcel) (repeat) > - Solve the 'sensible' issues. > - Choose as first tasks the ones that will be more useful. > - Select solutions agreed by everyone. > > > Thanks, Juan. > > Call details: > > 15:00 CEST > 13:00 UTC > 09:00 EDT > > Every two weeks > > If you need phone number details, contact me privately. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/6] KVM: PPC: Book3S HV: Access XICS in BE
On Tue, Jun 17, 2014 at 12:22:32PM +0200, Alexander Graf wrote: > > Eh, no. What we do is we read (good on BE, byte reversed) into r0. Then we > swab32() from r0 to r3 on LE, mr from r0 to r3 on BE. > > r3 gets truncated along the way. > > The reason we maintain r0 as wrong-endian is that we write it back using the > cache inhibited stwcix instruction: > > >stwcix r0, r6, r7 /* EOI it */ > > So during the lifetime of r0 as XIRR it's always byte-reversed on LE. That's > why we store it using STWX_BE into hstate, because that's the time when we > actually swab32() it for further interpretation. So the STWX_BE is more like a be32_to_cpu than a cpu_to_be32, which is what the name STWX_BE would suggest. Sounds like it at least deserves a comment, or (as you suggest) rearrange the register usage so a normal store works. Paul. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/6] KVM: PPC: Book3S HV: Access XICS in BE
On 17.06.14 14:13, Paul Mackerras wrote: On Tue, Jun 17, 2014 at 12:22:32PM +0200, Alexander Graf wrote: Eh, no. What we do is we read (good on BE, byte reversed) into r0. Then we swab32() from r0 to r3 on LE, mr from r0 to r3 on BE. r3 gets truncated along the way. The reason we maintain r0 as wrong-endian is that we write it back using the cache inhibited stwcix instruction: stwcix r0, r6, r7 /* EOI it */ So during the lifetime of r0 as XIRR it's always byte-reversed on LE. That's why we store it using STWX_BE into hstate, because that's the time when we actually swab32() it for further interpretation. So the STWX_BE is more like a be32_to_cpu than a cpu_to_be32, which is what the name STWX_BE would suggest. Sounds like it at least deserves a comment, or (as you suggest) rearrange the register usage so a normal store works. Yes, I have this now: From a94a66437ec714ec5650f6d8fec050a33e4477ca Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 11 Jun 2014 10:37:52 +0200 Subject: [PATCH] KVM: PPC: Book3S HV: Access XICS in BE On the exit path from the guest we check what type of interrupt we received if we received one. This means we're doing hardware access to the XICS interrupt controller. However, when running on a little endian system, this access is byte reversed. So let's make sure to swizzle the bytes back again and virtually make XICS accesses big endian. Signed-off-by: Alexander Graf --- v1 -> v2: - Make code easier to follow diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 1a2f471..9829e18 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -36,6 +36,13 @@ #define NAPPING_CEDE1 #define NAPPING_NOVCPU2 +.macro bswap32 regd, regs +srwi\regd,\regs,24 +rlwimi\regd,\regs,24,16,23 +rlwimi\regd,\regs,8,8,15 +rlwimi\regd,\regs,24,0,7 +.endm + /* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. 
@@ -2325,7 +2332,12 @@ kvmppc_read_intr: cmpdir6, 0 beq-1f lwzcixr0, r6, r7 -rlwinm.r3, r0, 0, 0xff +#ifdef __LITTLE_ENDIAN__ +bswap32r4, r0 +#else +mrr4, r0 +#endif +rlwinm.r3, r4, 0, 0xff sync beq1f/* if nothing pending in the ICP */ @@ -2360,7 +2372,7 @@ kvmppc_read_intr: 42:/* It's not an IPI and it's for the host, stash it in the PACA * before exit, it will be picked up by the host ICP driver */ -stwr0, HSTATE_SAVED_XIRR(r13) +stwr4, HSTATE_SAVED_XIRR(r13) lir3, 1 b1b Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH][RFC] kvm-unit-tests: report: add report_skip
Add report_skip(), which is report(), but with another condition allowing it to output PASS/FAIL/SKIP, rather than only PASS/FAIL. This allows report output to stay more consistent between systems/configurations that may or may not support all tests. Currently I only have a downstream use case for this, but maybe report output sequences of the following form report(msg1, cond1) -> FAIL report(msg2, cond1 && cond2) -> FAIL report(msg3, cond1 && cond3) -> FAIL would look better as report(msg1, cond1) -> FAIL report_skip(msg2, !cond1, cond1 && cond2) -> SKIP report_skip(msg3, !cond1, cond1 && cond3) -> SKIP Signed-off-by: Andrew Jones --- lib/libcflat.h | 1 + lib/report.c | 34 ++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/lib/libcflat.h b/lib/libcflat.h index f734fdee2df18..01f2769ad1773 100644 --- a/lib/libcflat.h +++ b/lib/libcflat.h @@ -61,5 +61,6 @@ extern long atol(const char *ptr); #define NULL ((void *)0UL) void report(const char *msg_fmt, bool pass, ...); +void report_skip(const char *msg_fmt, bool can_skip, bool pass, ...); int report_summary(void); #endif diff --git a/lib/report.c b/lib/report.c index ff562a13c541e..77db22b6c2d8d 100644 --- a/lib/report.c +++ b/lib/report.c @@ -5,32 +5,50 @@ * * Authors: * Jan Kiszka + * Andrew Jones * * This work is licensed under the terms of the GNU LGPL, version 2. */ #include "libcflat.h" -static unsigned int tests, failures; +static unsigned int tests, failures, skipped; -void report(const char *msg_fmt, bool pass, ...) +void va_report_skip(const char *msg_fmt, bool can_skip, bool pass, va_list va) { + char *fail = can_skip ? "SKIP" : "FAIL"; char buf[2000]; - va_list va; tests++; - printf("%s: ", pass ? "PASS" : "FAIL"); - va_start(va, pass); + printf("%s: ", pass ? "PASS" : fail); vsnprintf(buf, sizeof(buf), msg_fmt, va); - va_end(va); puts(buf); puts("\n"); - if (!pass) + if (!pass && can_skip) + skipped++; + else if (!pass) failures++; } +void report(const char *msg_fmt, bool pass, ...) 
+{ + va_list va; + va_start(va, pass); + va_report_skip(msg_fmt, false, pass, va); + va_end(va); +} + +void report_skip(const char *msg_fmt, bool can_skip, bool pass, ...) +{ + va_list va; + va_start(va, pass); + va_report_skip(msg_fmt, can_skip, pass, va); + va_end(va); +} + int report_summary(void) { - printf("\nSUMMARY: %d tests, %d failures\n", tests, failures); + printf("\nSUMMARY: %d tests, %d failures, %d skipped\n", + tests, failures, skipped); return failures > 0 ? 1 : 0; } -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] kvm-unit-tests: report: add report_skip
Il 17/06/2014 14:21, Andrew Jones ha scritto: would look better as report(msg1, cond1) -> FAIL report_skip(msg2, !cond1, cond1 && cond2) -> SKIP report_skip(msg3, !cond1, cond1 && cond3) -> SKIP I think a lot of the time there is other code before the report that you want to skip. Is anything more than a "report_skip(msg1);" (that prints a SKIP message) that useful? So you can do if (!(msr & 0x1000)) { report_skip("Frob bit 12 of msr"); return; } Unless what you really want is not a SKIP but an expected failure, then I agree with the idea. Something like report_xfail(msg2, cond1, cond2) would print: PASS if cond1 = false, cond2 = true FAIL if cond1 = false, cond2 = false XPASS if cond1 = true, cond2 = true XFAIL if cond1 = true, cond2 = false An XPASS would ultimately exit with status 1, just like a FAIL. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] kvm-unit-tests: report: add report_skip
On Tue, Jun 17, 2014 at 02:46:30PM +0200, Paolo Bonzini wrote: > Il 17/06/2014 14:21, Andrew Jones ha scritto: > >would look better as > > > >report(msg1, cond1) -> FAIL > >report_skip(msg2, !cond1, cond1 && cond2) -> SKIP > >report_skip(msg3, !cond1, cond1 && cond3) -> SKIP > > I think a lot of the time there is other code before the report that you > want to skip. Is anything more a "report_skip(msg1);" (that print a SKIP > message) that useful? So you can do > > if (!(msr & 0x1000) { > report_skip("Frob bit 12 of msr"); This is enough to output "msg: SKIP" in a format consistent with PASS and FAIL. However, it's not necessarily enough to output the test case too. E.g. we'd need if (cond1) { /* do stuff to prepare for report conditions */ report("msg1", cond2); report("msg2", cond3); } else { report_skip("msg1"); report_skip("msg2"); } It'd be nice to avoid the message redundancy in the cases that we're able to. However, ... > return; > } > > Unless what you really want is not a SKIP but an expected failure, then I ...yes, I was thinking along the lines of an 'expected failure' report, not the above issue, and 'XFAIL' does convey that idea better. > agree with the idea. Something like > > report_xfail(msg2, cond1, cond2) > > would print: > > PASS if cond1 = false, cond2 = true > FAIL if cond1 = false, cond2 = false > XPASS if cond1 = true, cond2 = true > XFAIL if cond1 = true, cond2 = false > > An XPASS would ultimately exit with status 1, just like a FAIL. Yeah, XPASS is good. report_skip would make PASS ambiguous. Not good. I'll drop the idea of outputting SKIP, switch to report_xfail, and send a v2. 
Thanks, drew > > Paolo > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH kvm-unit-tests] report: add report_xfail for expected failures
Add report_xfail(), which is report(), but with another condition allowing it to output PASS/FAIL/XPASS/XFAIL, rather than only PASS/FAIL. This allows report output to stay more consistent between systems/configurations that may or may not support all tests. Signed-off-by: Andrew Jones --- lib/libcflat.h | 1 + lib/report.c | 37 + 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/lib/libcflat.h b/lib/libcflat.h index cb6663d33ef5a..c9577350ec275 100644 --- a/lib/libcflat.h +++ b/lib/libcflat.h @@ -63,5 +63,6 @@ extern long atol(const char *ptr); #define NULL ((void *)0UL) void report(const char *msg_fmt, bool pass, ...); +void report_xfail(const char *msg_fmt, bool xfail, bool pass, ...); int report_summary(void); #endif diff --git a/lib/report.c b/lib/report.c index ff562a13c541e..43d7a65ec8812 100644 --- a/lib/report.c +++ b/lib/report.c @@ -5,32 +5,53 @@ * * Authors: * Jan Kiszka + * Andrew Jones * * This work is licensed under the terms of the GNU LGPL, version 2. */ #include "libcflat.h" -static unsigned int tests, failures; +static unsigned int tests, failures, xfailures; -void report(const char *msg_fmt, bool pass, ...) +void va_report_xfail(const char *msg_fmt, bool xfail, bool cond, va_list va) { + char *pass = xfail ? "XPASS" : "PASS"; + char *fail = xfail ? "XFAIL" : "FAIL"; char buf[2000]; - va_list va; tests++; - printf("%s: ", pass ? "PASS" : "FAIL"); - va_start(va, pass); + printf("%s: ", cond ? pass : fail); vsnprintf(buf, sizeof(buf), msg_fmt, va); - va_end(va); puts(buf); puts("\n"); - if (!pass) + if (xfail && cond) + failures++; + else if (xfail) + xfailures++; + else if (!cond) failures++; } +void report(const char *msg_fmt, bool pass, ...) +{ + va_list va; + va_start(va, pass); + va_report_xfail(msg_fmt, false, pass, va); + va_end(va); +} + +void report_xfail(const char *msg_fmt, bool xfail, bool pass, ...) 
+{ + va_list va; + va_start(va, pass); + va_report_xfail(msg_fmt, xfail, pass, va); + va_end(va); +} + int report_summary(void) { - printf("\nSUMMARY: %d tests, %d failures\n", tests, failures); + printf("\nSUMMARY: %d tests, %d failures, %d xfailures\n", + tests, failures, xfailures); return failures > 0 ? 1 : 0; } -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/kvm: support to handle sw breakpoint
On 17.06.14 13:13, Madhavan Srinivasan wrote: On Tuesday 17 June 2014 04:38 PM, Alexander Graf wrote: On 17.06.14 13:07, Madhavan Srinivasan wrote: On Tuesday 17 June 2014 02:24 PM, Alexander Graf wrote: On 14.06.14 23:08, Madhavan Srinivasan wrote: This patch adds kernel side support for software breakpoint. Design is that, by using an illegal instruction, we trap to hypervisor via Emulation Assistance interrupt, where we check for the illegal instruction and accordingly we return to Host or Guest. Patch mandates use of "abs" instruction (primary opcode 31 and extended opcode 360) as sw breakpoint instruction. Based on PowerISA v2.01, ABS instruction has been dropped from the architecture and treated an illegal instruction. Signed-off-by: Madhavan Srinivasan --- arch/powerpc/kvm/book3s.c| 3 ++- arch/powerpc/kvm/book3s_hv.c | 23 +++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index c254c27..b40fe5d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,7 +789,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { -return -EINVAL; +vcpu->guest_debug = dbg->control; +return 0; } void kvmppc_decrementer_func(unsigned long data) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7a12edb..688421d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,14 @@ /* Used as a "null" value for timebase values */ #define TB_NIL(~(u64)0) +/* + * SW_BRK_DBG_INT is debug Instruction for supporting Software Breakpoint. + * Instruction mnemonic is ABS, primary opcode is 31 and extended opcode is 360. + * Based on PowerISA v2.01, ABS instruction has been dropped from the architecture + * and treated an illegal instruction. 
+ */ +#define SW_BRK_DBG_INT 0x7c0002d0 The instruction we use to trap needs to get exposed to user space via a ONE_REG property. Yes. I got to know about that from Bharat (patchset "ppc debug: Add debug stub support"). I will change it. Also please make sure to pick an instruction that preferably looks identical regardless of guest endianness. Segher suggested 0x0000. Does that trap properly for you? Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On Tue, 2014-06-17 at 12:02 +0300, Mihai Caraman wrote: > On vcpu schedule, the condition checked for tlb pollution is too loose. > The tlb entries of a vcpu become polluted (vs stale) only when a different > vcpu within the same logical partition runs in-between. Optimize the tlb > invalidation condition taking into account the logical partition id. > > With the new invalidation condition, a guest shows 4% performance improvement > on P5020DS while running a memory stress application with the cpu > oversubscribed, > the other guest running a cpu intensive workload. See https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-June/118547.html -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule
On Thu, 2014-06-12 at 19:04 +0200, Alexander Graf wrote: > On 06/12/2014 04:00 PM, Mihai Caraman wrote: > > @@ -140,12 +142,24 @@ static void kvmppc_core_vcpu_load_e500mc(struct > > kvm_vcpu *vcpu, int cpu) > > mtspr(SPRN_GDEAR, vcpu->arch.shared->dar); > > mtspr(SPRN_GESR, vcpu->arch.shared->esr); > > > > - if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || > > - __get_cpu_var(last_vcpu_on_cpu) != vcpu) { > > - kvmppc_e500_tlbil_all(vcpu_e500); > > + if (vcpu->arch.oldpir != mfspr(SPRN_PIR)) { > > + /* tlb entries deprecated */ > > + inval_tlb = update_last = true; > > + } else if (__get_cpu_var(last_vcpu_on_cpu) != vcpu) { > > + update_last = true; > > + /* tlb entries polluted */ > > + inval_tlb = __get_cpu_var(last_lpid_on_cpu) == > > + vcpu->kvm->arch.lpid; > > + } What about the following sequence on one CPU: LPID 1, vcpu A LPID 2, vcpu C LPID 1, vcpu B LPID 2, vcpu C doesn't invalidate LPID 1, vcpu A doesn't invalidate In the last line, vcpu A last ran on this cpu (oldpir matches), but LPID 2 last ran on this cpu (last_lpid_on_cpu does not match) -- but an invalidation has never happened since vcpu B from LPID 1 ran on this cpu. -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] KVM: x86: Wrong emulation on 'xadd X, X'
Paolo Bonzini writes: > Il 16/06/2014 19:38, Bandan Das ha scritto: >> Nadav Amit writes: >> >>> The emulator does not emulate the xadd instruction correctly if the two >>> operands are the same. In this (unlikely) situation the result should be >>> the >>> sum of X and X (2X) when it is currently X. The solution is to first >>> perform >>> writeback to the source, before writing to the destination. The only >>> instruction which should be affected is xadd, as the other instructions that >>> perform writeback to the source use the extended accumlator (e.g., RAX:RDX). >>> >>> Signed-off-by: Nadav Amit >>> --- >>> arch/x86/kvm/emulate.c | 10 +- >>> 1 file changed, 5 insertions(+), 5 deletions(-) >>> >>> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c >>> index f0b0a10..3c8d867 100644 >>> --- a/arch/x86/kvm/emulate.c >>> +++ b/arch/x86/kvm/emulate.c >>> @@ -4711,17 +4711,17 @@ special_insn: >>> goto done; >>> >>> writeback: >>> - if (!(ctxt->d & NoWrite)) { >>> - rc = writeback(ctxt, &ctxt->dst); >>> - if (rc != X86EMUL_CONTINUE) >>> - goto done; >>> - } >>> if (ctxt->d & SrcWrite) { >>> BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == >>> OP_MEM_STR); >> While we are here, I think we should replace this BUG_ON with a warning >> and return X86EMUL_UNHANDLEABLE if the condition is true. > > Sure, please post a patch and I'll apply it right away. Well, I meant it more as a review and "suggested changes" to this patchset Nadav posted, but yeah, if you prefer, I can post a change myself. I will make a pass through other uses of BUG() in the code too. > Paolo > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: PPC: Book3S PR: Handle hyp doorbell exits
If we're running PR KVM in HV mode, we may get hypervisor doorbell interrupts. Handle those the same way we treat normal doorbells. Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_pr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 8ea7da4..3b82e86 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -988,6 +988,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_DECREMENTER: case BOOK3S_INTERRUPT_HV_DECREMENTER: case BOOK3S_INTERRUPT_DOORBELL: + case BOOK3S_INTERRUPT_H_DOORBELL: vcpu->stat.dec_exits++; r = RESUME_GUEST; break; -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/4] KVM: PPC: Book3S HV: Fix ABIv2 indirect branch issue
On 12.06.14 10:16, Anton Blanchard wrote: To establish addressability quickly, ABIv2 requires the target address of the function being called to be in r12. Signed-off-by: Anton Blanchard Thanks, applied to kvm-ppc-queue. Alex --- Index: b/arch/powerpc/kvm/book3s_hv_rmhandlers.S === --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1920,8 +1920,8 @@ hcall_try_real_mode: lwaxr3,r3,r4 cmpwi r3,0 beq guest_exit_cont - add r3,r3,r4 - mtctr r3 + add r12,r3,r4 + mtctr r12 mr r3,r9 /* get vcpu pointer */ ld r4,VCPU_GPR(R4)(r9) bctrl -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/4] KVM: PPC: Assembly functions exported to modules need _GLOBAL_TOC()
On 12.06.14 10:16, Anton Blanchard wrote: Both kvmppc_hv_entry_trampoline and kvmppc_entry_trampoline are assembly functions that are exported to modules and also require a valid r2. As such we need to use _GLOBAL_TOC so we provide a global entry point that establishes the TOC (r2). Signed-off-by: Anton Blanchard Thanks, applied to kvm-ppc-queue. I've not applied patches 1 and 2 for now, as they break BE module support. Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: PPC: Book3S PR: Fix ABIv2 on LE
We switched to ABIv2 on Little Endian systems now which gets rid of the dotted function names. Branch to the actual functions when we see such a system. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index e2c29e3..d044b8b 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -25,7 +25,11 @@ #include #if defined(CONFIG_PPC_BOOK3S_64) +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else #define FUNC(name) GLUE(.,name) +#endif #define GET_SHADOW_VCPU(reg)addi reg, r13, PACA_SVCPU #elif defined(CONFIG_PPC_BOOK3S_32) diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 9eec675..cdcbb63 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -36,7 +36,11 @@ #if defined(CONFIG_PPC_BOOK3S_64) +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else #define FUNC(name) GLUE(.,name) +#endif #elif defined(CONFIG_PPC_BOOK3S_32) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/7] KVM: PPC: Book3S HV: Make HTAB code LE host aware
When running on an LE host all data structures are kept in little endian byte order. However, the HTAB still needs to be maintained in big endian. So every time we access any HTAB we need to make sure we do so in the right byte order. Fix up all accesses to manually byte swap. Signed-off-by: Alexander Graf --- v1 -> v2: - Add __be32 hints - Fix H_REMOVE --- arch/powerpc/include/asm/kvm_book3s.h| 4 +- arch/powerpc/include/asm/kvm_book3s_64.h | 15 +++- arch/powerpc/kvm/book3s_64_mmu_hv.c | 128 ++- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 146 ++- 4 files changed, 164 insertions(+), 129 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index a20cc0b..028f4b1 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -161,9 +161,9 @@ extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing, bool *writable); extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode); -extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, +extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index); -void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, +void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index); extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, unsigned long *nb_ret); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index c7871f3..e504f88 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -59,20 +59,29 @@ extern unsigned long kvm_rma_pages; /* These bits are reserved in the guest view of the HPTE */ #define HPTE_GR_RESERVED HPTE_GR_MODIFIED -static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) +static inline long try_lock_hpte(__be64 *hpte, unsigned long bits) { 
unsigned long tmp, old; + __be64 be_lockbit, be_bits; + + /* +* We load/store in native endian, but the HTAB is in big endian. If +* we byte swap all data we apply on the PTE we're implicitly correct +* again. +*/ + be_lockbit = cpu_to_be64(HPTE_V_HVLOCK); + be_bits = cpu_to_be64(bits); asm volatile(" ldarx %0,0,%2\n" " and.%1,%0,%3\n" " bne 2f\n" -" ori %0,%0,%4\n" +" or %0,%0,%4\n" " stdcx. %0,0,%2\n" " beq+2f\n" " mr %1,%3\n" "2:isync" : "=&r" (tmp), "=&r" (old) -: "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) +: "r" (hpte), "r" (be_bits), "r" (be_lockbit) : "cc", "memory"); return old == 0; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8056107..2d154d9 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -450,7 +450,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned long slb_v; unsigned long pp, key; unsigned long v, gr; - unsigned long *hptep; + __be64 *hptep; int index; int virtmode = vcpu->arch.shregs.msr & (data ? 
MSR_DR : MSR_IR); @@ -473,13 +473,13 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, preempt_enable(); return -ENOENT; } - hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); - v = hptep[0] & ~HPTE_V_HVLOCK; + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; gr = kvm->arch.revmap[index].guest_rpte; /* Unlock the HPTE */ asm volatile("lwsync" : : : "memory"); - hptep[0] = v; + hptep[0] = cpu_to_be64(v); preempt_enable(); gpte->eaddr = eaddr; @@ -583,7 +583,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long *hptep, hpte[3], r; + unsigned long hpte[3], r; + __be64 *hptep; unsigned long mmu_seq, psize, pte_size; unsigned long gpa_base, gfn_base; unsigned long gpa, gfn, hva, pfn; @@ -606,16 +607,16 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (ea != vcpu->arch.pgfault_addr) return RESUME_GUEST;
[PATCH 3/7] KVM: PPC: Book3S HV: Access guest VPA in BE
There are a few shared data structures between the host and the guest. Most of them get registered through the VPA interface. These data structures are defined to always be in big endian byte order, so let's make sure we always access them in big endian. Signed-off-by: Alexander Graf --- v1 -> v2: - Add __be hints - Fix dtl_idx --- arch/powerpc/kvm/book3s_hv.c | 22 +++--- arch/powerpc/kvm/book3s_hv_ras.c | 6 +++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1562acf..f7bcc72 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -270,7 +270,7 @@ struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) { vpa->__old_status |= LPPACA_OLD_SHARED_PROC; - vpa->yield_count = 1; + vpa->yield_count = cpu_to_be32(1); } static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, @@ -293,8 +293,8 @@ static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, struct reg_vpa { u32 dummy; union { - u16 hword; - u32 word; + __be16 hword; + __be32 word; } length; }; @@ -333,9 +333,9 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, if (va == NULL) return H_PARAMETER; if (subfunc == H_VPA_REG_VPA) - len = ((struct reg_vpa *)va)->length.hword; + len = be16_to_cpu(((struct reg_vpa *)va)->length.hword); else - len = ((struct reg_vpa *)va)->length.word; + len = be32_to_cpu(((struct reg_vpa *)va)->length.word); kvmppc_unpin_guest_page(kvm, va, vpa, false); /* Check length */ @@ -540,18 +540,18 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, return; memset(dt, 0, sizeof(struct dtl_entry)); dt->dispatch_reason = 7; - dt->processor_id = vc->pcpu + vcpu->arch.ptid; - dt->timebase = now + vc->tb_offset; - dt->enqueue_to_dispatch_time = stolen; - dt->srr0 = kvmppc_get_pc(vcpu); - dt->srr1 = vcpu->arch.shregs.msr; + dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid); + 
dt->timebase = cpu_to_be64(now + vc->tb_offset); + dt->enqueue_to_dispatch_time = cpu_to_be32(stolen); + dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu)); + dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr); ++dt; if (dt == vcpu->arch.dtl.pinned_end) dt = vcpu->arch.dtl.pinned_addr; vcpu->arch.dtl_ptr = dt; /* order writing *dt vs. writing vpa->dtl_idx */ smp_wmb(); - vpa->dtl_idx = ++vcpu->arch.dtl_index; + vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index); vcpu->arch.dtl.dirty = true; } diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 768a9f9..6a3ba8a 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -45,14 +45,14 @@ static void reload_slb(struct kvm_vcpu *vcpu) return; /* Sanity check */ - n = min_t(u32, slb->persistent, SLB_MIN_SIZE); + n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE); if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end) return; /* Load up the SLB from that */ for (i = 0; i < n; ++i) { - unsigned long rb = slb->save_area[i].esid; - unsigned long rs = slb->save_area[i].vsid; + unsigned long rb = be64_to_cpu(slb->save_area[i].esid); + unsigned long rs = be64_to_cpu(slb->save_area[i].vsid); rb = (rb & ~0xFFFul) | i; /* insert entry number */ asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/7] KVM: PPC: Book3S HV: Enable for little endian hosts
Now that we've fixed all the issues that HV KVM code had on little endian hosts, we can enable it in the kernel configuration for users to play with. Signed-off-by: Alexander Graf --- arch/powerpc/kvm/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index d6a53b9..8aeeda1 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -75,7 +75,6 @@ config KVM_BOOK3S_64 config KVM_BOOK3S_64_HV tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 - depends on !CPU_LITTLE_ENDIAN select KVM_BOOK3S_HV_POSSIBLE select MMU_NOTIFIER select CMA -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/7] KVM: PPC: Book3S HV: Access host lppaca and shadow slb in BE
Some data structures are always stored in big endian. Among those are the LPPACA fields as well as the shadow slb. These structures might be shared with a hypervisor. So whenever we access those fields, make sure we do so in big endian byte order. Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7e7fa01..75c7e22 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -32,10 +32,6 @@ #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) -#ifdef __LITTLE_ENDIAN__ -#error Need to fix lppaca and SLB shadow accesses in little endian mode -#endif - /* Values in HSTATE_NAPPING(r13) */ #define NAPPING_CEDE 1 #define NAPPING_NOVCPU 2 @@ -595,9 +591,10 @@ kvmppc_got_guest: ld r3, VCPU_VPA(r4) cmpdi r3, 0 beq 25f - lwz r5, LPPACA_YIELDCOUNT(r3) + li r6, LPPACA_YIELDCOUNT + LWZX_BE r5, r3, r6 addir5, r5, 1 - stw r5, LPPACA_YIELDCOUNT(r3) + STWX_BE r5, r3, r6 li r6, 1 stb r6, VCPU_VPA_DIRTY(r4) 25: @@ -1442,9 +1439,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM) ld r8, VCPU_VPA(r9)/* do they have a VPA? */ cmpdi r8, 0 beq 25f - lwz r3, LPPACA_YIELDCOUNT(r8) + li r4, LPPACA_YIELDCOUNT + LWZX_BE r3, r8, r4 addir3, r3, 1 - stw r3, LPPACA_YIELDCOUNT(r8) + STWX_BE r3, r8, r4 li r3, 1 stb r3, VCPU_VPA_DIRTY(r9) 25: @@ -1757,8 +1755,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 33:ld r8,PACA_SLBSHADOWPTR(r13) .rept SLB_NUM_BOLTED - ld r5,SLBSHADOW_SAVEAREA(r8) - ld r6,SLBSHADOW_SAVEAREA+8(r8) + li r3, SLBSHADOW_SAVEAREA + LDX_BE r5, r8, r3 + addir3, r3, 8 + LDX_BE r6, r8, r3 andis. r7,r5,SLB_ESID_V@h beq 1f slbmte r6,r5 -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/7] KVM: PPC: Book3S HV: Access XICS in BE
On the exit path from the guest we check what type of interrupt we received if we received one. This means we're doing hardware access to the XICS interrupt controller. However, when running on a little endian system, this access is byte reversed. So let's make sure to swizzle the bytes back again and virtually make XICS accesses big endian. Signed-off-by: Alexander Graf --- v1 -> v2: - Make code easier to follow and use hardware for bswap --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 18 ++ 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 75c7e22..1a71f60 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2325,7 +2325,18 @@ kvmppc_read_intr: cmpdi r6, 0 beq-1f lwzcix r0, r6, r7 - rlwinm. r3, r0, 0, 0xff + /* +* Save XIRR for later. Since we get in in reverse endian on LE +* systems, save it byte reversed and fetch it back in host endian. +*/ + li r3, HSTATE_SAVED_XIRR + STWX_BE r0, r3, r13 +#ifdef __LITTLE_ENDIAN__ + lwz r3, HSTATE_SAVED_XIRR(r13) +#else + mr r3, r0 +#endif + rlwinm. r3, r3, 0, 0xff sync beq 1f /* if nothing pending in the ICP */ @@ -2357,10 +2368,9 @@ kvmppc_read_intr: li r3, -1 1: blr -42:/* It's not an IPI and it's for the host, stash it in the PACA -* before exit, it will be picked up by the host ICP driver +42:/* It's not an IPI and it's for the host. We saved a copy of XIRR in +* the PACA earlier, it will be picked up by the host ICP driver */ - stw r0, HSTATE_SAVED_XIRR(r13) li r3, 1 b 1b -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/7] PPC: Add asm helpers for BE 32bit load/store
>From assembly code we might not only have to explicitly BE access 64bit values, but sometimes also 32bit ones. Add helpers that allow for easy use of lwzx/stwx in their respective byte-reverse or native form. Signed-off-by: Alexander Graf CC: Benjamin Herrenschmidt --- v1 -> v2: - fix typo in STWX_BE --- arch/powerpc/include/asm/asm-compat.h | 4 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h index 4b237aa..21be8ae 100644 --- a/arch/powerpc/include/asm/asm-compat.h +++ b/arch/powerpc/include/asm/asm-compat.h @@ -34,10 +34,14 @@ #define PPC_MIN_STKFRM 112 #ifdef __BIG_ENDIAN__ +#define LWZX_BEstringify_in_c(lwzx) #define LDX_BE stringify_in_c(ldx) +#define STWX_BEstringify_in_c(stwx) #define STDX_BEstringify_in_c(stdx) #else +#define LWZX_BEstringify_in_c(lwbrx) #define LDX_BE stringify_in_c(ldbrx) +#define STWX_BEstringify_in_c(stwbrx) #define STDX_BEstringify_in_c(stdbrx) #endif -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/7] KVM: PPC: Book3S HV: Enable on little endian hosts
So far we've been able to successfully run HV KVM on big endian hosts, but once you dive into little endian land things start to fall apart. This patch set enables HV KVM for little endian hosts. This should be the final piece left missing to get little endian systems fully on par with big endian ones in the KVM world - modulo bugs. For now guest threading support is still slightly flaky, but I'm sure that's only a minor breakage somewhere that we'll find soon. v1 -> v2: - fix typo in STWX_BE - Add __be hints - Fix H_REMOVE - Fix dtl_idx - Make XICS code easier to follow and use memory for bswap Alex Alexander Graf (7): PPC: Add asm helpers for BE 32bit load/store KVM: PPC: Book3S HV: Make HTAB code LE host aware KVM: PPC: Book3S HV: Access guest VPA in BE KVM: PPC: Book3S HV: Access host lppaca and shadow slb in BE KVM: PPC: Book3S HV: Access XICS in BE KVM: PPC: Book3S HV: Fix ABIv2 on LE KVM: PPC: Book3S HV: Enable for little endian hosts arch/powerpc/include/asm/asm-compat.h| 4 + arch/powerpc/include/asm/kvm_book3s.h| 4 +- arch/powerpc/include/asm/kvm_book3s_64.h | 15 +++- arch/powerpc/kvm/Kconfig | 1 - arch/powerpc/kvm/book3s_64_mmu_hv.c | 128 ++- arch/powerpc/kvm/book3s_hv.c | 22 ++--- arch/powerpc/kvm/book3s_hv_ras.c | 6 +- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 146 ++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 60 - 9 files changed, 220 insertions(+), 166 deletions(-) -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/7] KVM: PPC: Book3S HV: Fix ABIv2 on LE
We use ABIv2 on Little Endian systems which gets rid of the dotted function names. Branch to the actual functions when we see such a system. Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 22 ++ 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 1a71f60..1ff3ebd 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -36,6 +36,12 @@ #define NAPPING_CEDE 1 #define NAPPING_NOVCPU 2 +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else +#define FUNC(name) GLUE(.,name) +#endif + /* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. @@ -668,9 +674,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM) mr r31, r4 addir3, r31, VCPU_FPRS_TM - bl .load_fp_state + bl FUNC(load_fp_state) addir3, r31, VCPU_VRS_TM - bl .load_vr_state + bl FUNC(load_vr_state) mr r4, r31 lwz r7, VCPU_VRSAVE_TM(r4) mtspr SPRN_VRSAVE, r7 @@ -1414,9 +1420,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM) /* Save FP/VSX. 
*/ addir3, r9, VCPU_FPRS_TM - bl .store_fp_state + bl FUNC(store_fp_state) addir3, r9, VCPU_VRS_TM - bl .store_vr_state + bl FUNC(store_vr_state) mfspr r6, SPRN_VRSAVE stw r6, VCPU_VRSAVE_TM(r9) 1: @@ -2405,11 +2411,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) mtmsrd r8 isync addir3,r3,VCPU_FPRS - bl .store_fp_state + bl FUNC(store_fp_state) #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION addir3,r31,VCPU_VRS - bl .store_vr_state + bl FUNC(store_vr_state) END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif mfspr r6,SPRN_VRSAVE @@ -2441,11 +2447,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) mtmsrd r8 isync addir3,r4,VCPU_FPRS - bl .load_fp_state + bl FUNC(load_fp_state) #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION addir3,r31,VCPU_VRS - bl .load_vr_state + bl FUNC(load_vr_state) END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif lwz r7,VCPU_VRSAVE(r31) -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] KVM: x86: Wrong emulation on 'xadd X, X'
Il 17/06/2014 17:35, Bandan Das ha scritto: Well, I meant it more as a review and "suggested changes" to this patchset Nadav posted, but yeah, if you prefer, I can post a change myself. I will make a pass through other uses of BUG() in the code too. I'd prefer that, there's no need to make Nadav do even more work. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Hang on reboot in FreeBSD guest on Linux KVM host
On Jun 17, 2014, at 12:05 AM, Gleb Natapov wrote: > On Tue, Jun 17, 2014 at 06:21:23AM +0200, Paolo Bonzini wrote: >> Il 16/06/2014 18:47, John Nielsen ha scritto: >>> On Jun 16, 2014, at 10:39 AM, Paolo Bonzini wrote: >>> Il 16/06/2014 18:09, John Nielsen ha scritto: >>> The only substantial difference on the hardware side is the CPU. >>> The hosts where the problem occurs use "Intel(R) Xeon(R) CPU >>> E5-2650 v2 @ 2.60GHz", while the hosts that don't show the >>> problem use the prior revision, "Intel(R) Xeon(R) CPU E5-2650 0 @ >>> 2.00GHz". Can you do "grep . /sys/module/kvm_intel/parameters/*" on both hosts please? >>> >>> No differences that I can see. Output below. >> >> Not really: >> >>> Working host: >>> Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz >>> # grep . /sys/module/kvm_intel/parameters/* >>> /sys/module/kvm_intel/parameters/enable_apicv:N >>> >>> Problem host: >>> Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz >>> # grep . /sys/module/kvm_intel/parameters/* >>> /sys/module/kvm_intel/parameters/enable_apicv:Y >> >> So we have a clue. Let me study the code more, I'll try to get back with a >> suggestion. Wow, can't believe I missed that. Good catch! > Does disabling apicv on E5-2650 v2 make reboot problem go away? Yes it does! # modprobe kvm_intel /sys/module/kvm_intel/parameters/enable_apicv:Y # /usr/bin/qemu-system-x86_64 -machine accel=kvm -name bsdtest -m 512 -smp 2,sockets=1,cores=1,threads=2 -drive file=./20140613_FreeBSD_9.2-RELEASE_ufs.qcow2,if=none,id=drive0,format=qcow2 -device virtio-blk-pci,scsi=off,drive=drive0 -vnc 0.0.0.0:0 -net none [problem occurs] # rmmod kvm_intel # modprobe kvm_intel enable_apicv=N /sys/module/kvm_intel/parameters/enable_apicv:N # /usr/bin/qemu-system-x86_64 -machine accel=kvm -name bsdtest -m 512 -smp 2,sockets=1,cores=1,threads=2 -drive file=./20140613_FreeBSD_9.2-RELEASE_ufs.qcow2,if=none,id=drive0,format=qcow2 -device virtio-blk-pci,scsi=off,drive=drive0 -vnc 0.0.0.0:0 -net none [problem does not occur] Thank you. 
This both narrows the problem considerably and provides an acceptable workaround. It would still be nice to see it fixed, of course. Keep me CC'ed as I'm not on the KVM list. JN -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PULL 044/103] Add kvm_eventfds_enabled function
From: Nikolay Nikolaev Add a function to check if the eventfd capability is present in KVM in the host kernel. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Paolo Bonzini --- include/sysemu/kvm.h | 11 +++ kvm-all.c| 4 kvm-stub.c | 1 + 3 files changed, 16 insertions(+) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index e79e92c..c4556ad 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -43,6 +43,7 @@ extern bool kvm_allowed; extern bool kvm_kernel_irqchip; extern bool kvm_async_interrupts_allowed; extern bool kvm_halt_in_kernel_allowed; +extern bool kvm_eventfds_allowed; extern bool kvm_irqfds_allowed; extern bool kvm_msi_via_irqfd_allowed; extern bool kvm_gsi_routing_allowed; @@ -83,6 +84,15 @@ extern bool kvm_readonly_mem_allowed; #define kvm_halt_in_kernel() (kvm_halt_in_kernel_allowed) /** + * kvm_eventfds_enabled: + * + * Returns: true if we can use eventfds to receive notifications + * from a KVM CPU (ie the kernel supports eventds and we are running + * with a configuration where it is meaningful to use them). 
+ */ +#define kvm_eventfds_enabled() (kvm_eventfds_allowed) + +/** * kvm_irqfds_enabled: * * Returns: true if we can use irqfds to inject interrupts into @@ -128,6 +138,7 @@ extern bool kvm_readonly_mem_allowed; #define kvm_irqchip_in_kernel() (false) #define kvm_async_interrupts_enabled() (false) #define kvm_halt_in_kernel() (false) +#define kvm_eventfds_enabled() (false) #define kvm_irqfds_enabled() (false) #define kvm_msi_via_irqfd_enabled() (false) #define kvm_gsi_routing_allowed() (false) diff --git a/kvm-all.c b/kvm-all.c index 4e19eff..92f56d8 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -113,6 +113,7 @@ KVMState *kvm_state; bool kvm_kernel_irqchip; bool kvm_async_interrupts_allowed; bool kvm_halt_in_kernel_allowed; +bool kvm_eventfds_allowed; bool kvm_irqfds_allowed; bool kvm_msi_via_irqfd_allowed; bool kvm_gsi_routing_allowed; @@ -1541,6 +1542,9 @@ int kvm_init(MachineClass *mc) (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); #endif +kvm_eventfds_allowed = +(kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); + ret = kvm_arch_init(s); if (ret < 0) { goto err; diff --git a/kvm-stub.c b/kvm-stub.c index ac33d86..8e7737c 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -22,6 +22,7 @@ KVMState *kvm_state; bool kvm_kernel_irqchip; bool kvm_async_interrupts_allowed; +bool kvm_eventfds_allowed; bool kvm_irqfds_allowed; bool kvm_msi_via_irqfd_allowed; bool kvm_gsi_routing_allowed; -- MST -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH kvm-unit-tests] report: add report_xfail for expected failures
Il 17/06/2014 16:36, Andrew Jones ha scritto: Add report_xfail(), which is report(), but with another condition allowing it to output PASS/FAIL/XPASS/XFAIL, rather than only PASS/FAIL. This allows report output to stay more consistent between systems/configurations that may or may not support all tests. Signed-off-by: Andrew Jones --- lib/libcflat.h | 1 + lib/report.c | 37 + 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/lib/libcflat.h b/lib/libcflat.h index cb6663d33ef5a..c9577350ec275 100644 --- a/lib/libcflat.h +++ b/lib/libcflat.h @@ -63,5 +63,6 @@ extern long atol(const char *ptr); #define NULL ((void *)0UL) void report(const char *msg_fmt, bool pass, ...); +void report_xfail(const char *msg_fmt, bool xfail, bool pass, ...); int report_summary(void); #endif diff --git a/lib/report.c b/lib/report.c index ff562a13c541e..43d7a65ec8812 100644 --- a/lib/report.c +++ b/lib/report.c @@ -5,32 +5,53 @@ * * Authors: * Jan Kiszka + * Andrew Jones * * This work is licensed under the terms of the GNU LGPL, version 2. */ #include "libcflat.h" -static unsigned int tests, failures; +static unsigned int tests, failures, xfailures; -void report(const char *msg_fmt, bool pass, ...) +void va_report_xfail(const char *msg_fmt, bool xfail, bool cond, va_list va) { + char *pass = xfail ? "XPASS" : "PASS"; + char *fail = xfail ? "XFAIL" : "FAIL"; char buf[2000]; - va_list va; tests++; - printf("%s: ", pass ? "PASS" : "FAIL"); - va_start(va, pass); + printf("%s: ", cond ? pass : fail); vsnprintf(buf, sizeof(buf), msg_fmt, va); - va_end(va); puts(buf); puts("\n"); - if (!pass) + if (xfail && cond) + failures++; + else if (xfail) + xfailures++; + else if (!cond) failures++; } +void report(const char *msg_fmt, bool pass, ...) +{ + va_list va; + va_start(va, pass); + va_report_xfail(msg_fmt, false, pass, va); + va_end(va); +} + +void report_xfail(const char *msg_fmt, bool xfail, bool pass, ...) 
+{ + va_list va; + va_start(va, pass); + va_report_xfail(msg_fmt, xfail, pass, va); + va_end(va); +} + int report_summary(void) { - printf("\nSUMMARY: %d tests, %d failures\n", tests, failures); + printf("\nSUMMARY: %d tests, %d failures, %d xfailures\n", + tests, failures, xfailures); return failures > 0 ? 1 : 0; } Looks good, will apply in the next few days. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm-unit-tests/x86: fix unittests.cfg typo
Il 17/04/2014 16:17, Andrew Jones ha scritto: Signed-off-by: Andrew Jones --- x86/unittests.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x86/unittests.cfg b/x86/unittests.cfg index 7930c026a38d6..d78fe0eafe2b6 100644 --- a/x86/unittests.cfg +++ b/x86/unittests.cfg @@ -156,5 +156,5 @@ extra_params = -cpu host,+vmx arch = x86_64 [debug] -file = debug.flag +file = debug.flat arch = x86_64 Applying, thanks. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, June 17, 2014 6:36 PM > To: Caraman Mihai Claudiu-B02008 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org > Subject: Re: [PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation > condition on vcpu schedule > > On Tue, 2014-06-17 at 12:02 +0300, Mihai Caraman wrote: > > On vcpu schedule, the condition checked for tlb pollution is too loose. > > The tlb entries of a vcpu become polluted (vs stale) only when a > different > > vcpu within the same logical partition runs in-between. Optimize the > tlb > > invalidation condition taking into account the logical partition id. > > > > With the new invalidation condition, a guest shows 4% performance > improvement > > on P5020DS while running a memory stress application with the cpu > oversubscribed, > > the other guest running a cpu intensive workload. > > See > https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-June/118547.html Thanks. The original code needs just a simple adjustment to benefit from this optimization, please review v3. - Mike
Re: [PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On Tue, 2014-06-17 at 14:04 -0500, Caraman Mihai Claudiu-B02008 wrote: > > -Original Message- > > From: Wood Scott-B07421 > > Sent: Tuesday, June 17, 2014 6:36 PM > > To: Caraman Mihai Claudiu-B02008 > > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > > d...@lists.ozlabs.org > > Subject: Re: [PATCH v2] KVM: PPC: e500mc: Enhance tlb invalidation > > condition on vcpu schedule > > > > On Tue, 2014-06-17 at 12:02 +0300, Mihai Caraman wrote: > > > On vcpu schedule, the condition checked for tlb pollution is too loose. > > > The tlb entries of a vcpu become polluted (vs stale) only when a > > different > > > vcpu within the same logical partition runs in-between. Optimize the > > tlb > > > invalidation condition taking into account the logical partition id. > > > > > > With the new invalidation condition, a guest shows 4% performance > > improvement > > > on P5020DS while running a memory stress application with the cpu > > oversubscribed, > > > the other guest running a cpu intensive workload. > > > > See > > https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-June/118547.html > > Thanks. The original code needs just a simple adjustment to benefit from > this optimization, please review v3. Where is v3? Or is it forthcoming? -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On vcpu schedule, the condition checked for tlb pollution is too loose. The tlb entries of a vcpu become polluted (vs stale) only when a different vcpu within the same logical partition runs in-between. Optimize the tlb invalidation condition keeping last_vcpu_on_cpu per logical partition id. With the new invalidation condition, a guest shows 4% performance improvement on P5020DS while running a memory stress application with the cpu oversubscribed, the other guest running a cpu intensive workload. Guest - old invalidation condition real 3.89 user 3.87 sys 0.01 Guest - enhanced invalidation condition real 3.75 user 3.73 sys 0.01 Host real 3.70 user 1.85 sys 0.00 The memory stress application accesses 4KB pages backed by 75% of available TLB0 entries: char foo[ENTRIES][4096] __attribute__ ((aligned (4096))); int main() { char bar; int i, j; for (i = 0; i < ITERATIONS; i++) for (j = 0; j < ENTRIES; j++) bar = foo[j][0]; return 0; } Signed-off-by: Mihai Caraman Cc: Scott Wood --- v3: - use existing logic while keeping last_vcpu_per_cpu per lpid v2: - improve patch name and description - add performance results arch/powerpc/kvm/e500mc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 17e4562..95e33e3 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -110,7 +110,7 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) { } -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], last_vcpu_on_cpu); static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) { @@ -141,9 +141,9 @@ static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_GESR, vcpu->arch.shared->esr); if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || - __get_cpu_var(last_vcpu_on_cpu) != vcpu) { + __get_cpu_var(last_vcpu_on_cpu)[vcpu->kvm->arch.lpid] != vcpu) { kvmppc_e500_tlbil_all(vcpu_e500); - 
__get_cpu_var(last_vcpu_on_cpu) = vcpu; + __get_cpu_var(last_vcpu_on_cpu)[vcpu->kvm->arch.lpid] = vcpu; } kvmppc_load_guest_fp(vcpu); -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On Tue, 2014-06-17 at 22:09 +0300, Mihai Caraman wrote: > On vcpu schedule, the condition checked for tlb pollution is too loose. > The tlb entries of a vcpu become polluted (vs stale) only when a different > vcpu within the same logical partition runs in-between. Optimize the tlb > invalidation condition keeping last_vcpu_on_cpu per logical partition id. > > With the new invalidation condition, a guest shows 4% performance improvement > on P5020DS while running a memory stress application with the cpu > oversubscribed, > the other guest running a cpu intensive workload. > > Guest - old invalidation condition > real 3.89 > user 3.87 > sys 0.01 > > Guest - enhanced invalidation condition > real 3.75 > user 3.73 > sys 0.01 > > Host > real 3.70 > user 1.85 > sys 0.00 > > The memory stress application accesses 4KB pages backed by 75% of available > TLB0 entries: > > char foo[ENTRIES][4096] __attribute__ ((aligned (4096))); > > int main() > { > char bar; > int i, j; > > for (i = 0; i < ITERATIONS; i++) > for (j = 0; j < ENTRIES; j++) > bar = foo[j][0]; > > return 0; > } > > Signed-off-by: Mihai Caraman > Cc: Scott Wood > --- > v3: > - use existing logic while keeping last_vcpu_per_cpu per lpid > > v2: > - improve patch name and description > - add performance results > > > arch/powerpc/kvm/e500mc.c | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c > index 17e4562..95e33e3 100644 > --- a/arch/powerpc/kvm/e500mc.c > +++ b/arch/powerpc/kvm/e500mc.c > @@ -110,7 +110,7 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 > old_msr) > { > } > > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], last_vcpu_on_cpu); Hmm, I didn't know you could express types like that. Is this special syntax that only works for typeof? 
No space after * Name should be adjusted to match, something like last_vcpu_of_lpid (with the _on_cpu being implied by the fact that it's PER_CPU). -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
> > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], > last_vcpu_on_cpu); > > Hmm, I didn't know you could express types like that. Is this special > syntax that only works for typeof? Yes, AFAIK. > No space after * Checkpatch complains about the missing space ;) > > Name should be adjusted to match, something like last_vcpu_of_lpid (with > the _on_cpu being implied by the fact that it's PER_CPU). I was thinking to the long name but it was not appealing, I will change it to last_vcpu_of_lpid. -Mike
Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On Tue, 2014-06-17 at 14:42 -0500, Caraman Mihai Claudiu-B02008 wrote: > > > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > > > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], > > last_vcpu_on_cpu); > > > > Hmm, I didn't know you could express types like that. Is this special > > syntax that only works for typeof? > > Yes, AFAIK. > > > No space after * > > Checkpatch complains about the missing space ;) Checkpatch is wrong, which isn't surprising given that this is unusual syntax. We don't normally put a space after * when used to represent a pointer. -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, June 17, 2014 10:48 PM > To: Caraman Mihai Claudiu-B02008 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org > Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation > condition on vcpu schedule > > On Tue, 2014-06-17 at 14:42 -0500, Caraman Mihai Claudiu-B02008 wrote: > > > > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > > > > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], > > > last_vcpu_on_cpu); > > > > > > Hmm, I didn't know you could express types like that. Is this > special > > > syntax that only works for typeof? > > > > Yes, AFAIK. > > > > > No space after * > > > > Checkpatch complains about the missing space ;) > > Checkpatch is wrong, which isn't surprising given that this is unusual > syntax. We don't normally put a space after * when used to represent a > pointer. This is not something new. See [PATCH 04/10] percpu: cleanup percpu array definitions: https://lkml.org/lkml/2009/6/24/26 -Mike -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 01/11] qspinlock: A simple generic 4-byte queue spinlock
> > + new = tail | (val & _Q_LOCKED_MASK); > > + > > + old = atomic_cmpxchg(&lock->val, val, new); > > + if (old == val) > > + break; > > + > > + val = old; > > + } > > + > > + /* > > +* we won the trylock; forget about queueing. > > +*/ > > + if (new == _Q_LOCKED_VAL) > > + goto release; > > + > > + /* > > +* if there was a previous node; link it and wait. > > +*/ > > + if (old & ~_Q_LOCKED_MASK) { > > + prev = decode_tail(old); > > + ACCESS_ONCE(prev->next) = node; > > + > > + arch_mcs_spin_lock_contended(&node->locked); Could you add a comment here: /* We are spinning forever until the previous node updates locked - which it does once the it has updated lock->val with our tail number. */ > > + } > > + > > + /* > > +* we're at the head of the waitqueue, wait for the owner to go away. > > +* > > +* *,x -> *,0 > > +*/ > > + while ((val = atomic_read(&lock->val)) & _Q_LOCKED_MASK) > > + cpu_relax(); > > + > > + /* > > +* claim the lock: > > +* > > +* n,0 -> 0,1 : lock, uncontended > > +* *,0 -> *,1 : lock, contended > > +*/ > > + for (;;) { > > + new = _Q_LOCKED_VAL; > > + if (val != tail) > > + new |= val; > ..snip.. > > Could you help a bit in explaining it in English please? After looking at the assembler code I finally figured out how we can get here. And the 'contended' part threw me off. Somehow I imagined there are two more more CPUs stampeding here and trying to update the lock->val. But in reality the other CPUs are stuck in the arch_mcs_spin_lock_contended spinning on their local value. Perhaps you could add this comment. /* Once queue_spin_unlock is called (which _subtracts_ _Q_LOCKED_VAL from the lock->val and still preserving the tail data), the winner gets to claim the ticket. Since we still need the other CPUs to continue and preserve the strict ordering in which they setup node->next, we: 1) update lock->val to the tail value (so tail CPU and its index) with _Q_LOCKED_VAL. 2). 
Once we are done, we poke the other CPU (the one that linked to us) by writing to node->locked (below) so they can make progress and loop on lock->val changing from _Q_LOCKED_MASK to zero). */ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On Tue, 2014-06-17 at 15:02 -0500, Caraman Mihai Claudiu-B02008 wrote: > > -Original Message- > > From: Wood Scott-B07421 > > Sent: Tuesday, June 17, 2014 10:48 PM > > To: Caraman Mihai Claudiu-B02008 > > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > > d...@lists.ozlabs.org > > Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation > > condition on vcpu schedule > > > > On Tue, 2014-06-17 at 14:42 -0500, Caraman Mihai Claudiu-B02008 wrote: > > > > > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > > > > > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], > > > > last_vcpu_on_cpu); > > > > > > > > Hmm, I didn't know you could express types like that. Is this > > special > > > > syntax that only works for typeof? > > > > > > Yes, AFAIK. > > > > > > > No space after * > > > > > > Checkpatch complains about the missing space ;) > > > > Checkpatch is wrong, which isn't surprising given that this is unusual > > syntax. We don't normally put a space after * when used to represent a > > pointer. > > This is not something new. See [PATCH 04/10] percpu: cleanup percpu array > definitions: > > https://lkml.org/lkml/2009/6/24/26 I didn't say it was new, just unusual, and checkpatch doesn't recognize it. Checkpatch shouldn't be blindly and silently obeyed when it says something strange. -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 01/11] qspinlock: A simple generic 4-byte queue spinlock
> + * The basic principle of a queue-based spinlock can best be understood > + * by studying a classic queue-based spinlock implementation called the > + * MCS lock. The paper below provides a good description for this kind > + * of lock. > + * > + * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf > + * > + * This queue spinlock implementation is based on the MCS lock, however to > make > + * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing > + * API, we must modify it some. > + * > + * In particular; where the traditional MCS lock consists of a tail pointer > + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to > + * unlock the next pending (next->locked), we compress both these: {tail, > + * next->locked} into a single u32 value. > + * > + * Since a spinlock disables recursion of its own context and there is a > limit > + * to the contexts that can nest; namely: task, softirq, hardirq, nmi, we can > + * encode the tail as and index indicating this context and a cpu number. > + * > + * We can further change the first spinner to spin on a bit in the lock word > + * instead of its node; whereby avoiding the need to carry a node from lock > to > + * unlock, and preserving API. You also made changes (compared to the MCS) in that the unlock path is not spinning waiting for the successor and that the job of passing the lock is not done in the unlock path either. Instead all of that is now done in the path of the lock acquirer logic. Could you update the comment to say that please? Thanks. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
> -Original Message- > From: Wood Scott-B07421 > Sent: Tuesday, June 17, 2014 11:05 PM > To: Caraman Mihai Claudiu-B02008 > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > d...@lists.ozlabs.org > Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation > condition on vcpu schedule > > On Tue, 2014-06-17 at 15:02 -0500, Caraman Mihai Claudiu-B02008 wrote: > > > -Original Message- > > > From: Wood Scott-B07421 > > > Sent: Tuesday, June 17, 2014 10:48 PM > > > To: Caraman Mihai Claudiu-B02008 > > > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- > > > d...@lists.ozlabs.org > > > Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation > > > condition on vcpu schedule > > > > > > On Tue, 2014-06-17 at 14:42 -0500, Caraman Mihai Claudiu-B02008 > wrote: > > > > > > -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); > > > > > > +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], > > > > > last_vcpu_on_cpu); > > > > > > > > > > Hmm, I didn't know you could express types like that. Is this > > > special > > > > > syntax that only works for typeof? > > > > > > > > Yes, AFAIK. > > > > > > > > > No space after * > > > > > > > > Checkpatch complains about the missing space ;) > > > > > > Checkpatch is wrong, which isn't surprising given that this is > unusual > > > syntax. We don't normally put a space after * when used to represent > a > > > pointer. > > > > This is not something new. See [PATCH 04/10] percpu: cleanup percpu > array > > definitions: > > > > https://lkml.org/lkml/2009/6/24/26 > > I didn't say it was new, just unusual, and checkpatch doesn't recognize > it. Checkpatch shouldn't be blindly and silently obeyed when it says > something strange. I agree with you about the syntax and I know other cases where checkpatch is a moron. For similar corner cases checkpatch maintainers did not wanted (or found it difficult) to make an exception. I would also like to see Alex opinion on this. -Mike
Re: [PATCH 03/11] qspinlock: Add pending bit
On Sun, Jun 15, 2014 at 02:47:00PM +0200, Peter Zijlstra wrote: > Because the qspinlock needs to touch a second cacheline; add a pending > bit and allow a single in-word spinner before we punt to the second > cacheline. Could you add this in the description please: And by second cacheline we mean the local 'node'. That is the: mcs_nodes[0] and mcs_nodes[idx] Perhaps it might be better then to split this in the header file as this is trying to not be a slowpath code - but rather - a pre-slow-path-lets-try-if-we can do another cmpxchg in case the unlocker has just unlocked itself. So something like: diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index e8a7ae8..29cc9c7 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -75,11 +75,21 @@ extern void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val); */ static __always_inline void queue_spin_lock(struct qspinlock *lock) { - u32 val; + u32 val, new; val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); if (likely(val == 0)) return; + + /* One more attempt - but if we fail mark it as pending. */ + if (val == _Q_LOCKED_VAL) { + new = Q_LOCKED_VAL |_Q_PENDING_VAL; + + old = atomic_cmpxchg(&lock->val, val, new); + if (old == _Q_LOCKED_VAL) /* YEEY! */ + return; + val = old; + } queue_spin_lock_slowpath(lock, val); } and then the slowpath preserves most of the old logic path (with the pending bit stuff)? 
> > Signed-off-by: Peter Zijlstra > --- > include/asm-generic/qspinlock_types.h | 12 ++- > kernel/locking/qspinlock.c| 109 > +++--- > 2 files changed, 97 insertions(+), 24 deletions(-) > > --- a/include/asm-generic/qspinlock_types.h > +++ b/include/asm-generic/qspinlock_types.h > @@ -39,8 +39,9 @@ typedef struct qspinlock { > * Bitfields in the atomic value: > * > * 0- 7: locked byte > - * 8- 9: tail index > - * 10-31: tail cpu (+1) > + * 8: pending > + * 9-10: tail index > + * 11-31: tail cpu (+1) > */ > #define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ > << _Q_ ## type ## _OFFSET) > @@ -48,7 +49,11 @@ typedef struct qspinlock { > #define _Q_LOCKED_BITS 8 > #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) > > -#define _Q_TAIL_IDX_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) > +#define _Q_PENDING_OFFSET(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) > +#define _Q_PENDING_BITS 1 > +#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) > + > +#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) > #define _Q_TAIL_IDX_BITS 2 > #define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) > > @@ -57,5 +62,6 @@ typedef struct qspinlock { > #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) > > #define _Q_LOCKED_VAL(1U << _Q_LOCKED_OFFSET) > +#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) > > #endif /* __ASM_GENERIC_QSPINLOCK_TYPES_H */ > --- a/kernel/locking/qspinlock.c > +++ b/kernel/locking/qspinlock.c > @@ -83,24 +83,28 @@ static inline struct mcs_spinlock *decod > return per_cpu_ptr(&mcs_nodes[idx], cpu); > } > > +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) > + > /** > * queue_spin_lock_slowpath - acquire the queue spinlock > * @lock: Pointer to queue spinlock structure > * @val: Current value of the queue spinlock 32-bit word > * > - * (queue tail, lock bit) > - * > - * fast :slow : > unlock > - *: : > - * uncontended (0,0) --:--> (0,1) :--> > (*,0) > - *: | ^./ : > - *: v \ | : > - * uncontended:(n,x) --+--> (n,0) | : > - * queue: | ^--' | : > - *: v | : > - * 
contended :(*,x) --+--> (*,0) -> (*,1) ---' : > - * queue: ^--' : > + * (queue tail, pending bit, lock bit) > * > + * fast :slow : > unlock > + * : : > + * uncontended (0,0,0) -:--> (0,0,1) --:--> > (*,*,0) > + * : | ^.--. / : > + * : v \ \| : > + * pending :(0,1,1) +--> (0,1,0) \ | : > + * : | ^--' |
Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
On 17.06.14 22:36, mihai.cara...@freescale.com wrote: -Original Message- From: Wood Scott-B07421 Sent: Tuesday, June 17, 2014 11:05 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule On Tue, 2014-06-17 at 15:02 -0500, Caraman Mihai Claudiu-B02008 wrote: -Original Message- From: Wood Scott-B07421 Sent: Tuesday, June 17, 2014 10:48 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH v3] KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule On Tue, 2014-06-17 at 14:42 -0500, Caraman Mihai Claudiu-B02008 wrote: -static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); +static DEFINE_PER_CPU(struct kvm_vcpu * [KVMPPC_NR_LPIDS], last_vcpu_on_cpu); Hmm, I didn't know you could express types like that. Is this special syntax that only works for typeof? Yes, AFAIK. No space after * Checkpatch complains about the missing space ;) Checkpatch is wrong, which isn't surprising given that this is unusual syntax. We don't normally put a space after * when used to represent a pointer. This is not something new. See [PATCH 04/10] percpu: cleanup percpu array definitions: https://lkml.org/lkml/2009/6/24/26 I didn't say it was new, just unusual, and checkpatch doesn't recognize it. Checkpatch shouldn't be blindly and silently obeyed when it says something strange. I agree with you about the syntax and I know other cases where checkpatch is a moron. For similar corner cases checkpatch maintainers did not want (or found it difficult) to make an exception. I would also like to see Alex's opinion on this. I usually like to apply common sense :). 
Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 03/11] qspinlock: Add pending bit
On 06/17/2014 04:36 PM, Konrad Rzeszutek Wilk wrote: On Sun, Jun 15, 2014 at 02:47:00PM +0200, Peter Zijlstra wrote: Because the qspinlock needs to touch a second cacheline; add a pending bit and allow a single in-word spinner before we punt to the second cacheline. Could you add this in the description please: And by second cacheline we mean the local 'node'. That is the: mcs_nodes[0] and mcs_nodes[idx] Perhaps it might be better then to split this in the header file as this is trying to not be a slowpath code - but rather - a pre-slow-path-lets-try-if-we can do another cmpxchg in case the unlocker has just unlocked itself. So something like: diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index e8a7ae8..29cc9c7 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -75,11 +75,21 @@ extern void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val); */ static __always_inline void queue_spin_lock(struct qspinlock *lock) { - u32 val; + u32 val, new; val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); if (likely(val == 0)) return; + + /* One more attempt - but if we fail mark it as pending. */ + if (val == _Q_LOCKED_VAL) { + new = Q_LOCKED_VAL |_Q_PENDING_VAL; + + old = atomic_cmpxchg(&lock->val, val, new); + if (old == _Q_LOCKED_VAL) /* YEEY! */ + return; No, it can leave like that. The unlock path will not clear the pending bit. We are trying to make the fastpath as simple as possible as it may be inlined. The complexity of the queue spinlock is in the slowpath. Moreover, an cmpxchg followed immediately followed by another cmpxchg will just increase the level of memory contention when a lock is fairly contended. The chance of second cmpxchg() succeeding will be pretty low. -Longman -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 04/11] qspinlock: Extract out the exchange of tail code word
On Sun, Jun 15, 2014 at 02:47:01PM +0200, Peter Zijlstra wrote: > From: Waiman Long > > This patch extracts the logic for the exchange of new and previous tail > code words into a new xchg_tail() function which can be optimized in a > later patch. And also adds a third try on acquiring the lock. That I think should be a seperate patch. And instead of saying 'later patch' you should spell out the name of the patch. Especially as this might not be obvious from somebody doing git bisection. > > Signed-off-by: Waiman Long > Signed-off-by: Peter Zijlstra > --- > include/asm-generic/qspinlock_types.h |2 + > kernel/locking/qspinlock.c| 58 > +- > 2 files changed, 38 insertions(+), 22 deletions(-) > > --- a/include/asm-generic/qspinlock_types.h > +++ b/include/asm-generic/qspinlock_types.h > @@ -61,6 +61,8 @@ typedef struct qspinlock { > #define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) > #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) > > +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) > + > #define _Q_LOCKED_VAL(1U << _Q_LOCKED_OFFSET) > #define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) > > --- a/kernel/locking/qspinlock.c > +++ b/kernel/locking/qspinlock.c > @@ -86,6 +86,31 @@ static inline struct mcs_spinlock *decod > #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) > > /** > + * xchg_tail - Put in the new queue tail code word & retrieve previous one > + * @lock : Pointer to queue spinlock structure > + * @tail : The new queue tail code word > + * Return: The previous queue tail code word > + * > + * xchg(lock, tail) > + * > + * p,*,* -> n,*,* ; prev = xchg(lock, node) > + */ > +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) > +{ > + u32 old, new, val = atomic_read(&lock->val); > + > + for (;;) { > + new = (val & _Q_LOCKED_PENDING_MASK) | tail; > + old = atomic_cmpxchg(&lock->val, val, new); > + if (old == val) > + break; > + > + val = old; > + } > + return old; > +} > + > +/** > * queue_spin_lock_slowpath - acquire 
the queue spinlock > * @lock: Pointer to queue spinlock structure > * @val: Current value of the queue spinlock 32-bit word > @@ -182,36 +207,25 @@ void queue_spin_lock_slowpath(struct qsp > node->next = NULL; > > /* > - * we already touched the queueing cacheline; don't bother with pending > - * stuff. > - * > - * trylock || xchg(lock, node) > - * > - * 0,0,0 -> 0,0,1 ; trylock > - * p,y,x -> n,y,x ; prev = xchg(lock, node) > + * We touched a (possibly) cold cacheline in the per-cpu queue node; > + * attempt the trylock once more in the hope someone let go while we > + * weren't watching. >*/ > - for (;;) { > - new = _Q_LOCKED_VAL; > - if (val) > - new = tail | (val & _Q_LOCKED_PENDING_MASK); > - > - old = atomic_cmpxchg(&lock->val, val, new); > - if (old == val) > - break; > - > - val = old; > - } > + if (queue_spin_trylock(lock)) > + goto release; So now are three of them? One in queue_spin_lock, then at the start of this function when checking for the pending bit, and the once more here. And that is because the local cache line might be cold for the 'mcs_index' struct? That all seems to be a bit of experimental. But then we are already in the slowpath so we could as well do: for (i = 0; i < 10; i++) if (queue_spin_trylock(lock)) goto release; And would have the same effect. > > /* > - * we won the trylock; forget about queueing. > + * we already touched the queueing cacheline; don't bother with pending > + * stuff. I guess we could also just erase the pending bit if we wanted too. The optimistic spinning will still hit go to the queue label as lock->val will have the tail value. > + * > + * p,*,* -> n,*,* >*/ > - if (new == _Q_LOCKED_VAL) > - goto release; > + old = xchg_tail(lock, tail); > > /* >* if there was a previous node; link it and wait. 
>*/ > - if (old & ~_Q_LOCKED_PENDING_MASK) { > + if (old & _Q_TAIL_MASK) { > prev = decode_tail(old); > ACCESS_ONCE(prev->next) = node; > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 03/11] qspinlock: Add pending bit
On Tue, Jun 17, 2014 at 04:51:57PM -0400, Waiman Long wrote: > On 06/17/2014 04:36 PM, Konrad Rzeszutek Wilk wrote: > >On Sun, Jun 15, 2014 at 02:47:00PM +0200, Peter Zijlstra wrote: > >>Because the qspinlock needs to touch a second cacheline; add a pending > >>bit and allow a single in-word spinner before we punt to the second > >>cacheline. > >Could you add this in the description please: > > > >And by second cacheline we mean the local 'node'. That is the: > >mcs_nodes[0] and mcs_nodes[idx] > > > >Perhaps it might be better then to split this in the header file > >as this is trying to not be a slowpath code - but rather - a > >pre-slow-path-lets-try-if-we can do another cmpxchg in case > >the unlocker has just unlocked itself. > > > >So something like: > > > >diff --git a/include/asm-generic/qspinlock.h > >b/include/asm-generic/qspinlock.h > >index e8a7ae8..29cc9c7 100644 > >--- a/include/asm-generic/qspinlock.h > >+++ b/include/asm-generic/qspinlock.h > >@@ -75,11 +75,21 @@ extern void queue_spin_lock_slowpath(struct qspinlock > >*lock, u32 val); > > */ > > static __always_inline void queue_spin_lock(struct qspinlock *lock) > > { > >-u32 val; > >+u32 val, new; > > > > val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); > > if (likely(val == 0)) > > return; > >+ > >+/* One more attempt - but if we fail mark it as pending. */ > >+if (val == _Q_LOCKED_VAL) { > >+new = Q_LOCKED_VAL |_Q_PENDING_VAL; > >+ > >+old = atomic_cmpxchg(&lock->val, val, new); > >+if (old == _Q_LOCKED_VAL) /* YEEY! */ > >+return; > > No, it can leave like that. The unlock path will not clear the pending bit. Err, you are right. It needs to go back in the slowpath. > We are trying to make the fastpath as simple as possible as it may be > inlined. The complexity of the queue spinlock is in the slowpath. Sure, but then it shouldn't be called slowpath anymore as it is not slow. 
It is a combination of fast path (the potential chance of grabbing the lock and setting the pending lock) and the real slow path (the queuing). Perhaps it should be called 'queue_spinlock_complex' ? > > Moreover, an cmpxchg followed immediately followed by another cmpxchg will > just increase the level of memory contention when a lock is fairly > contended. The chance of second cmpxchg() succeeding will be pretty low. Then why even do the pending bit - which is what the slowpath does for the first time. And if it grabs it (And sets the pending bit) it immediately exits. Why not percolate that piece of code into this header. And then leave all that slow code (queuing, mcs_lock access, etc) in the slowpath. > > -Longman > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v11 14/16] pvqspinlock: Add qspinlock para-virtualization support
On Sun, Jun 15, 2014 at 03:16:54PM +0200, Peter Zijlstra wrote: > On Thu, Jun 12, 2014 at 04:48:41PM -0400, Waiman Long wrote: > > I don't have a good understanding of the kernel alternatives mechanism. > > I didn't either; I do now, cost me a whole day reading up on > alternative/paravirt code patching. > > See the patches I just send out; I got the 'native' case with paravirt > enabled to be one NOP worse than the native case without paravirt -- for > queue_spin_unlock. > > The lock slowpath is several nops and some pointless movs more expensive. You could use the asm goto which would optimize the fast path to be the 'native' case. That way you wouldn't have the the nops and movs in the path. (And asm goto also uses the alternative_asm macros). -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 03/11] qspinlock: Add pending bit
On Tue, Jun 17, 2014 at 05:07:29PM -0400, Konrad Rzeszutek Wilk wrote: > On Tue, Jun 17, 2014 at 04:51:57PM -0400, Waiman Long wrote: > > On 06/17/2014 04:36 PM, Konrad Rzeszutek Wilk wrote: > > >On Sun, Jun 15, 2014 at 02:47:00PM +0200, Peter Zijlstra wrote: > > >>Because the qspinlock needs to touch a second cacheline; add a pending > > >>bit and allow a single in-word spinner before we punt to the second > > >>cacheline. > > >Could you add this in the description please: > > > > > >And by second cacheline we mean the local 'node'. That is the: > > >mcs_nodes[0] and mcs_nodes[idx] > > > > > >Perhaps it might be better then to split this in the header file > > >as this is trying to not be a slowpath code - but rather - a > > >pre-slow-path-lets-try-if-we can do another cmpxchg in case > > >the unlocker has just unlocked itself. > > > > > >So something like: > > > > > >diff --git a/include/asm-generic/qspinlock.h > > >b/include/asm-generic/qspinlock.h > > >index e8a7ae8..29cc9c7 100644 > > >--- a/include/asm-generic/qspinlock.h > > >+++ b/include/asm-generic/qspinlock.h > > >@@ -75,11 +75,21 @@ extern void queue_spin_lock_slowpath(struct qspinlock > > >*lock, u32 val); > > > */ > > > static __always_inline void queue_spin_lock(struct qspinlock *lock) > > > { > > >- u32 val; > > >+ u32 val, new; > > > > > > val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); > > > if (likely(val == 0)) > > > return; > > >+ > > >+ /* One more attempt - but if we fail mark it as pending. */ > > >+ if (val == _Q_LOCKED_VAL) { > > >+ new = Q_LOCKED_VAL |_Q_PENDING_VAL; > > >+ > > >+ old = atomic_cmpxchg(&lock->val, val, new); > > >+ if (old == _Q_LOCKED_VAL) /* YEEY! */ > > >+ return; > > > > No, it can leave like that. The unlock path will not clear the pending bit. > > Err, you are right. It needs to go back in the slowpath. 
What I should have written is: if (old == 0) /* YEEY */ return; As that would be the same thing as this patch does on the pending bit - that is if we can on the second compare and exchange set the pending bit (and the lock) and the lock has been released - we are good. And it is a quick path. > > > We are trying to make the fastpath as simple as possible as it may be > > inlined. The complexity of the queue spinlock is in the slowpath. > > Sure, but then it shouldn't be called slowpath anymore as it is not > slow. It is a combination of fast path (the potential chance of > grabbing the lock and setting the pending lock) and the real slow > path (the queuing). Perhaps it should be called 'queue_spinlock_complex' ? > I forgot to mention - that was the crux of my comments - just change the slowpath to complex name at that point to better reflect what it does. > > > > Moreover, an cmpxchg followed immediately followed by another cmpxchg will > > just increase the level of memory contention when a lock is fairly > > contended. The chance of second cmpxchg() succeeding will be pretty low. > > Then why even do the pending bit - which is what the slowpath does > for the first time. And if it grabs it (And sets the pending bit) it > immediately exits. Why not perculate that piece of code in-to this header. > > And the leave all that slow code (queing, mcs_lock access, etc) in the > slowpath. > > > > > -Longman > > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 08/12] target-mips: Call kvm_mips_reset_vcpu() from mips_cpu_reset()
When KVM is enabled call kvm_mips_reset_vcpu() from mips_cpu_reset() as done for other targets since commit 50a2c6e55fa2 (kvm: reset state from the CPU's reset method). Signed-off-by: James Hogan Cc: Aurelien Jarno Cc: Paolo Bonzini Cc: Gleb Natapov --- Changes in v5: - New patch, based on commit 50a2c6e55fa2 (kvm: reset state from the CPU's reset method). --- target-mips/cpu.c | 8 1 file changed, 8 insertions(+) diff --git a/target-mips/cpu.c b/target-mips/cpu.c index dd954fc55a10..b3e0e6cce7b6 100644 --- a/target-mips/cpu.c +++ b/target-mips/cpu.c @@ -19,7 +19,9 @@ */ #include "cpu.h" +#include "kvm_mips.h" #include "qemu-common.h" +#include "sysemu/kvm.h" static void mips_cpu_set_pc(CPUState *cs, vaddr value) @@ -87,6 +89,12 @@ static void mips_cpu_reset(CPUState *s) tlb_flush(s, 1); cpu_state_reset(env); + +#ifndef CONFIG_USER_ONLY +if (kvm_enabled()) { +kvm_mips_reset_vcpu(cpu); +} +#endif } static void mips_cpu_realizefn(DeviceState *dev, Error **errp) -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 09/12] hw/mips: In KVM mode, inject IRQ2 (I/O) interrupts via ioctls
From: Sanjay Lal COP0 emulation is in-kernel for KVM, so inject IRQ2 (I/O) interrupts via ioctls. Signed-off-by: Sanjay Lal Signed-off-by: James Hogan Reviewed-by: Aurelien Jarno Reviewed-by: Andreas Färber --- Changes in v5: - Fix typo in subject (s/interupts/interrupts/) Changes in v3: - Pass MIPSCPU to kvm_mips_set_[ipi_]interrupt (Andreas Färber). Changes in v2: - Expand commit message - Remove #ifdef CONFIG_KVM since it's guarded by kvm_enabled() already --- hw/mips/mips_int.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/hw/mips/mips_int.c b/hw/mips/mips_int.c index 7dbd24d3d6e5..d740046ba151 100644 --- a/hw/mips/mips_int.c +++ b/hw/mips/mips_int.c @@ -23,6 +23,8 @@ #include "hw/hw.h" #include "hw/mips/cpudevs.h" #include "cpu.h" +#include "sysemu/kvm.h" +#include "kvm_mips.h" static void cpu_mips_irq_request(void *opaque, int irq, int level) { @@ -35,8 +37,17 @@ static void cpu_mips_irq_request(void *opaque, int irq, int level) if (level) { env->CP0_Cause |= 1 << (irq + CP0Ca_IP); + +if (kvm_enabled() && irq == 2) { +kvm_mips_set_interrupt(cpu, irq, level); +} + } else { env->CP0_Cause &= ~(1 << (irq + CP0Ca_IP)); + +if (kvm_enabled() && irq == 2) { +kvm_mips_set_interrupt(cpu, irq, level); +} } if (env->CP0_Cause & CP0Ca_IP_mask) { -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 06/12] kvm: Allow arch to set sigmask length
MIPS/Linux is unusual in having 128 signals rather than just 64 like most other architectures. This means its sigmask is 16 bytes instead of 8, so allow arches to override the sigmask->len value passed to the KVM_SET_SIGNAL_MASK ioctl in kvm_set_signal_mask() by calling kvm_set_sigmask_len() from kvm_arch_init(). Otherwise default to 8 bytes. Signed-off-by: James Hogan Cc: Aurelien Jarno Cc: Sanjay Lal Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Peter Maydell --- Changes in v3: - Rewrote to allow sigmask length to be set by kvm_arch_init(), so that MIPS can set it to 16 as it has 128 signals. This is better than cluttering kvm-all.c with TARGET_* ifdefs (Peter Maydell). Changes in v2: - Expand commit message - Reword comment --- include/sysemu/kvm.h | 2 ++ kvm-all.c| 11 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index e79e92c50e3f..de4bdaa40e56 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -325,6 +325,8 @@ int kvm_check_extension(KVMState *s, unsigned int extension); uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function, uint32_t index, int reg); +void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len); + #if !defined(CONFIG_USER_ONLY) int kvm_physical_memory_addr_from_host(KVMState *s, void *ram_addr, hwaddr *phys_addr); diff --git a/kvm-all.c b/kvm-all.c index 4e19eff0efee..0d485b644539 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -99,6 +99,7 @@ struct KVMState * they're not. 
Linux, glibc and *BSD all treat ioctl numbers as * unsigned, and treating them as signed here can break things */ unsigned irq_set_ioctl; +unsigned int sigmask_len; #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing *irq_routes; int nr_allocated_irq_routes; @@ -1397,6 +1398,8 @@ int kvm_init(MachineClass *mc) assert(TARGET_PAGE_SIZE <= getpagesize()); page_size_init(); +s->sigmask_len = 8; + #ifdef KVM_CAP_SET_GUEST_DEBUG QTAILQ_INIT(&s->kvm_sw_breakpoints); #endif @@ -1575,6 +1578,11 @@ err: return ret; } +void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) +{ +s->sigmask_len = sigmask_len; +} + static void kvm_handle_io(uint16_t port, void *data, int direction, int size, uint32_t count) { @@ -2095,6 +2103,7 @@ void kvm_remove_all_breakpoints(CPUState *cpu) int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) { +KVMState *s = kvm_state; struct kvm_signal_mask *sigmask; int r; @@ -2104,7 +2113,7 @@ int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset)); -sigmask->len = 8; +sigmask->len = s->sigmask_len; memcpy(sigmask->sigset, sigset, sizeof(*sigset)); r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask); g_free(sigmask); -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 12/12] MAINTAINERS: Add entry for MIPS KVM
Add MAINTAINERS entry for MIPS KVM. Signed-off-by: James Hogan --- Changes in v4: - Add MAINTAINERS entry for MIPS KVM. --- MAINTAINERS | 5 + 1 file changed, 5 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 51a6f51842be..0a637c90c679 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -176,6 +176,11 @@ M: Peter Maydell S: Maintained F: target-arm/kvm.c +MIPS +M: James Hogan +S: Maintained +F: target-mips/kvm.c + PPC M: Alexander Graf S: Maintained -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 03/12] hw/mips: Add API to convert KVM guest KSEG0 <-> GPA
From: Sanjay Lal Add API for converting physical addresses to KVM guest KSEG0 addresses, and fix the existing API for converting KSEG0 addresses to physical addresses to work in the KVM case. Both have the same sized KSEG0, so it's just a case of fixing the mask. In KVM trap and emulate mode both the guest kernel and guest userspace execute in useg: Guest User address space: 0x..0x3fff Guest Kernel Unmapped: 0x4000..0x5fff Guest Kernel Mapped:0x6000..0x7fff Signed-off-by: Sanjay Lal Signed-off-by: James Hogan Cc: Aurelien Jarno --- Changes in v5: - KSEG0 doesn't actually change size, so fix mask in cpu_mips_kseg0_to_phys() instead of having the KVM specific cpu_mips_kvm_um_kseg0_to_phys(). Changes in v2: - Expand commit message - Remove unnecessary include --- hw/mips/addr.c| 7 ++- include/hw/mips/cpudevs.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/mips/addr.c b/hw/mips/addr.c index 99488f1d2a6f..ff3b952600bb 100644 --- a/hw/mips/addr.c +++ b/hw/mips/addr.c @@ -25,10 +25,15 @@ uint64_t cpu_mips_kseg0_to_phys(void *opaque, uint64_t addr) { -return addr & 0x7fffll; +return addr & 0x1fffll; } uint64_t cpu_mips_phys_to_kseg0(void *opaque, uint64_t addr) { return addr | ~0x7fffll; } + +uint64_t cpu_mips_kvm_um_phys_to_kseg0(void *opaque, uint64_t addr) +{ +return addr | 0x4000ll; +} diff --git a/include/hw/mips/cpudevs.h b/include/hw/mips/cpudevs.h index 6bea24bf102d..b2626f2922f4 100644 --- a/include/hw/mips/cpudevs.h +++ b/include/hw/mips/cpudevs.h @@ -5,6 +5,8 @@ /* mips_addr.c */ uint64_t cpu_mips_kseg0_to_phys(void *opaque, uint64_t addr); uint64_t cpu_mips_phys_to_kseg0(void *opaque, uint64_t addr); +uint64_t cpu_mips_kvm_um_phys_to_kseg0(void *opaque, uint64_t addr); + /* mips_int.c */ void cpu_mips_irq_init_cpu(CPUMIPSState *env); -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 02/12] hw/mips/cputimer: Don't start periodic timer in KVM mode
From: Sanjay Lal Compare/Count timer interrupts are handled in-kernel for KVM. Therefore don't bother creating the timer at init time if KVM is enabled. This will conveniently avoid attempts to set the timeout when cpu_mips_store_count() is called at reset with KVM enabled, treating the timer as stopped so that CP0_Count is modified directly. Signed-off-by: Sanjay Lal [james.ho...@imgtec.com: Update after "target-mips: Reset CPU timer consistently" which moves timer start to reset time] Signed-off-by: James Hogan Cc: Aurelien Jarno Cc: Paolo Bonzini --- Changes in v5: - Update commit message and comment in cpu_mips_clock_init() after the patch "target-mips: Reset CPU timer consistently") (Paolo Bonzini). Changes in v2: - Expand commit message - Rebase on v1.7.0 - Wrap comment --- hw/mips/cputimer.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/mips/cputimer.c b/hw/mips/cputimer.c index 6900a745c60c..577c9aeab877 100644 --- a/hw/mips/cputimer.c +++ b/hw/mips/cputimer.c @@ -23,6 +23,7 @@ #include "hw/hw.h" #include "hw/mips/cpudevs.h" #include "qemu/timer.h" +#include "sysemu/kvm.h" #define TIMER_FREQ 100 * 1000 * 1000 @@ -146,5 +147,11 @@ static void mips_timer_cb (void *opaque) void cpu_mips_clock_init (CPUMIPSState *env) { -env->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &mips_timer_cb, env); +/* + * If we're in KVM mode, don't create the periodic timer, that is handled in + * kernel. + */ +if (!kvm_enabled()) { +env->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &mips_timer_cb, env); +} } -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 01/12] target-mips: Reset CPU timer consistently
The MIPS CPU timer (CP0 Count/Compare registers & QEMU timer) is reset at machine initialisation, including starting the timeout. Both registers however are placed before mvp in CPUMIPSState so they will both be zeroed on reset by the memset in mips_cpu_reset() including soon after init. This doesn't take into account that the timer may be running, in which case env->CP0_Count will represent the delta against the VM clock and the timeout will need updating. At init time (cpu_mips_clock_init()), lets only create the timer. Setting Count = 1 and starting the timer (cpu_mips_store_count()) can be done at reset time from cpu_state_reset(), which is after the memset. There is also no need to set CP0_Compare = 0 as that is already handled by the memset. Note that a reset occurs from mips_cpu_realizefn() which is before the machine init callback has had a chance to set up the CPU interrupts and the CPU timer, so env->timer will be NULL. This case is handled explicitly in cpu_mips_store_count(), treating the timer as disabled (which will also be the right thing to do when KVM support is added). Reported-by: Paolo Bonzini Signed-off-by: James Hogan Cc: Aurelien Jarno --- Changes in v5: - New patch (Paolo Bonzini). --- hw/mips/cputimer.c | 9 ++--- target-mips/translate.c | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/hw/mips/cputimer.c b/hw/mips/cputimer.c index c8b4b000cd0e..6900a745c60c 100644 --- a/hw/mips/cputimer.c +++ b/hw/mips/cputimer.c @@ -85,7 +85,12 @@ uint32_t cpu_mips_get_count (CPUMIPSState *env) void cpu_mips_store_count (CPUMIPSState *env, uint32_t count) { -if (env->CP0_Cause & (1 << CP0Ca_DC)) +/* + * This gets called from cpu_state_reset(), potentially before timer init. + * So env->timer may be NULL, which is also the case with KVM enabled so + * treat timer as disabled in that case. 
+ */ +if (env->CP0_Cause & (1 << CP0Ca_DC) || !env->timer) env->CP0_Count = count; else { /* Store new count register */ @@ -142,6 +147,4 @@ static void mips_timer_cb (void *opaque) void cpu_mips_clock_init (CPUMIPSState *env) { env->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &mips_timer_cb, env); -env->CP0_Compare = 0; -cpu_mips_store_count(env, 1); } diff --git a/target-mips/translate.c b/target-mips/translate.c index 2c4c80103d14..b63f30fc3a76 100644 --- a/target-mips/translate.c +++ b/target-mips/translate.c @@ -16091,6 +16091,8 @@ void cpu_state_reset(CPUMIPSState *env) /* Count register increments in debug mode, EJTAG version 1 */ env->CP0_Debug = (1 << CP0DB_CNT) | (0x1 << CP0DB_VER); +cpu_mips_store_count(env, 1); + if (env->CP0_Config3 & (1 << CP0C3_MT)) { int i; -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 07/12] target-mips: kvm: Add main KVM support for MIPS
From: Sanjay Lal Implement the main KVM arch API for MIPS. Signed-off-by: Sanjay Lal Signed-off-by: James Hogan Cc: Aurelien Jarno Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Andreas Färber Cc: Peter Maydell --- Changes in v5: - Rename kvm_arch_reset_vcpu to kvm_mips_reset_vcpu based on commit 50a2c6e55fa2 (kvm: reset state from the CPU's reset method). - Rename kvm_mips_te_{put,get}_cp0_registers() functions to drop the "te_" since they're not really specific to T&E. - Pass level through from kvm_arch_put_registers() to kvm_mips_put_cp0_registers() rather than hard coding it to KVM_PUT_FULL_STATE. - Fix KVM_REG_MIPS_CP0_* definitions to set KVM_REG_MIPS and KVM_REG_SIZE_U32/KVM_REG_SIZE_U64 (using a macro). - Remove unused KVM_REG_MIPS_CP0_* definitions for now. - Correct type of kvm_mips_{get,put}_one_{,ul}reg() reg_id argument to uint64_t. Various high bits must be set to disambiguate the architecture and register size. - Add register accessors for always-64-bit registers (rather than ulong registers). These are needed for virtual KVM registers for controlling the KVM Compare/Count timer. - Simplify register access functions slightly. - Save and restore KVM timer state with the rest of the state, and also when VM clock is started or stopped. When the KVM timer state is restored (or VM clock restarted) it is resumed with the stored count at the monotonic time when the VM clock was last stopped. If the VM clock hasn't been stopped it resumes from the monotonic time when the state was saved (i.e. as if the timer was never stopped). Changes since RFC patch on kernel KVM thread "[PATCH v2 00/23] MIPS: KVM: Fixes and guest timer rewrite"): - Simplified, removing extra state for storing VM time of save/restore, at the cost of losing/gaining time when VM gets stopped and started (Paolo Bonzini). - Save and restore the UserLocal and HWREna CP0 registers. 
- Improve get/put KVM register error handling with DPRINTFs and fall through so that getting/putting of all the registers is attempted even if one of them fails due to being unimplemented in the kernel. Changes in v4: (No functional changes, assembly output unchanged) - Use int32_t instead of int32 (which is for softfloat) in kvm register accessors (Andreas Färber). - Use uint64_t instead of __u64 (which is really just for kernel headers) in the kvm register accessors (Andreas Färber). - Cast pointer to uintptr_t rather than target_ulong in kvm register accessors. - Remove some redundant casts in kvm register accessors. Changes in v3: - s/dprintf/DPRINTF/ (Andreas Färber). - Use "cs" rather than "cpu" or "env" for CPUState variable names (Andreas Färber). - Use CPUMIPSState rather than CPUArchState (Andreas Färber). - Pass MIPSCPU to cpu_mips_io_interrupts_pending() rather than CPUMIPSState (Andreas Färber). - Remove spurious parentheses around cpu_mips_io_interrupts_pending() call (Andreas Färber). - Pass MIPSCPU to kvm_mips_set_[ipi_]interrupt (Andreas Färber). - Make use of error_report (Andreas Färber) and clean up error messages a little to include __func__. - Remove inline kvm_mips_{put,get}_one_[ul]reg() declarations from kvm_mips.h. They're only used in target-mips/kvm.c anyway. - Make kvm_arch_{put,get}_registers static within target-mips/kvm.c and remove from kvm_mips.h. - Set sigmask length to 16 from kvm_arch_init() since MIPS Linux has 128 signals. This is better than cluttering kvm_all.c with TARGET_* ifdefs (Peter Maydell). Changes in v2: - Expand commit message - Checkpatch cleanups. - Some interrupt bug fixes from Yann Le Du - Add get/set register functionality from Yann Le Du - Use new 64 bit compatible ABI from Cavium from Sanjay Lal - Add dummy kvm_arch_init_irq_routing() The common KVM code insists on calling kvm_arch_init_irq_routing() as soon as it sees kernel header support for it (regardless of whether QEMU supports it). 
Provide a dummy function to satisfy this. - Remove request_interrupt_window code (Peter Maydell) --- target-mips/kvm.c | 683 + target-mips/kvm_mips.h | 26 ++ 2 files changed, 709 insertions(+) create mode 100644 target-mips/kvm.c create mode 100644 target-mips/kvm_mips.h diff --git a/target-mips/kvm.c b/target-mips/kvm.c new file mode 100644 index 000000000000..844e5bbe5f92 --- /dev/null +++ b/target-mips/kvm.c @@ -0,0 +1,683 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * KVM/MIPS: MIPS specific KVM APIs + * + * Copyright (C) 2012-2014 Imagination Technologies Ltd. + * Authors: Sanjay Lal +*/ + +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <linux/kvm.h> + +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "qemu/timer.h" +#include "sysemu/sysemu.h"
[PATCH v5 10/12] hw/mips: malta: Add KVM support
In KVM mode the bootrom is loaded and executed from the last 1MB of DRAM. Based on "[PATCH 12/12] KVM/MIPS: General KVM support and support for SMP Guests" by Sanjay Lal . Signed-off-by: James Hogan Reviewed-by: Aurelien Jarno Cc: Peter Maydell Cc: Sanjay Lal --- Changes in v5: - Kseg0 doesn't actually change size, so use cpu_mips_kseg0_to_phys() rather than having the KVM specific cpu_mips_kvm_um_kseg0_to_phys(). Changes in v3: - Remove unnecessary includes, especially linux/kvm.h which isn't a good idea on non-Linux (Peter Maydell). Changes in v2: - Removal of cps / GIC / SMP support - Minimal bootloader modified to execute safely from RAM - Remove "Writing bootloader to final 1MB of RAM" printf --- hw/mips/mips_malta.c | 73 ++-- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/hw/mips/mips_malta.c b/hw/mips/mips_malta.c index f4a7d4712952..8bc5392b4223 100644 --- a/hw/mips/mips_malta.c +++ b/hw/mips/mips_malta.c @@ -51,6 +51,7 @@ #include "sysemu/qtest.h" #include "qemu/error-report.h" #include "hw/empty_slot.h" +#include "sysemu/kvm.h" //#define DEBUG_BOARD_INIT @@ -603,29 +604,31 @@ static void network_init(PCIBus *pci_bus) */ static void write_bootloader (CPUMIPSState *env, uint8_t *base, - int64_t kernel_entry) + int64_t run_addr, int64_t kernel_entry) { uint32_t *p; /* Small bootloader */ p = (uint32_t *)base; -stl_p(p++, 0x0bf00160); /* j 0x1fc00580 */ + +stl_p(p++, 0x08000000 | /* j 0x1fc00580 */ + ((run_addr + 0x580) & 0x0fffffff) >> 2); stl_p(p++, 0x00000000); /* nop */ /* YAMON service vector */ -stl_p(base + 0x500, 0xbfc00580); /* start: */ -stl_p(base + 0x504, 0xbfc0083c); /* print_count: */ -stl_p(base + 0x520, 0xbfc00580); /* start: */ -stl_p(base + 0x52c, 0xbfc00800); /* flush_cache: */ -stl_p(base + 0x534, 0xbfc00808); /* print: */ -stl_p(base + 0x538, 0xbfc00800); /* reg_cpu_isr: */ -stl_p(base + 0x53c, 0xbfc00800); /* unred_cpu_isr: */ -stl_p(base + 0x540, 0xbfc00800); /* reg_ic_isr: */ -stl_p(base + 0x544, 0xbfc00800); /* unred_ic_isr: */ 
-stl_p(base + 0x548, 0xbfc00800); /* reg_esr: */ -stl_p(base + 0x54c, 0xbfc00800); /* unreg_esr: */ -stl_p(base + 0x550, 0xbfc00800); /* getchar: */ -stl_p(base + 0x554, 0xbfc00800); /* syscon_read: */ +stl_p(base + 0x500, run_addr + 0x0580); /* start: */ +stl_p(base + 0x504, run_addr + 0x083c); /* print_count: */ +stl_p(base + 0x520, run_addr + 0x0580); /* start: */ +stl_p(base + 0x52c, run_addr + 0x0800); /* flush_cache: */ +stl_p(base + 0x534, run_addr + 0x0808); /* print: */ +stl_p(base + 0x538, run_addr + 0x0800); /* reg_cpu_isr: */ +stl_p(base + 0x53c, run_addr + 0x0800); /* unred_cpu_isr: */ +stl_p(base + 0x540, run_addr + 0x0800); /* reg_ic_isr: */ +stl_p(base + 0x544, run_addr + 0x0800); /* unred_ic_isr: */ +stl_p(base + 0x548, run_addr + 0x0800); /* reg_esr: */ +stl_p(base + 0x54c, run_addr + 0x0800); /* unreg_esr: */ +stl_p(base + 0x550, run_addr + 0x0800); /* getchar: */ +stl_p(base + 0x554, run_addr + 0x0800); /* syscon_read: */ /* Second part of the bootloader */ @@ -701,7 +704,7 @@ static void write_bootloader (CPUMIPSState *env, uint8_t *base, p = (uint32_t *) (base + 0x800); stl_p(p++, 0x03e00008); /* jr ra */ stl_p(p++, 0x24020000); /* li v0,0 */ - /* 808 YAMON print */ +/* 808 YAMON print */ stl_p(p++, 0x03e06821); /* move t5,ra */ stl_p(p++, 0x00805821); /* move t3,a0 */ stl_p(p++, 0x00a05021); /* move t2,a1 */ @@ -774,6 +777,7 @@ static int64_t load_kernel (void) uint32_t *prom_buf; long prom_size; int prom_index = 0; +uint64_t (*xlate_to_kseg0) (void *opaque, uint64_t addr); #ifdef TARGET_WORDS_BIGENDIAN big_endian = 1; @@ -788,6 +792,11 @@ static int64_t load_kernel (void) loaderparams.kernel_filename); exit(1); } +if (kvm_enabled()) { +xlate_to_kseg0 = cpu_mips_kvm_um_phys_to_kseg0; +} else { +xlate_to_kseg0 = cpu_mips_phys_to_kseg0; +} /* load initrd */ initrd_size = 0; @@ -820,7 +829,7 @@ static int64_t load_kernel (void) prom_set(prom_buf, prom_index++, "%s", loaderparams.kernel_filename); if (initrd_size > 0) { prom_set(prom_buf, prom_index++, 
"rd_start=0x%" PRIx64 " rd_size=%li %s", - cpu_mips_phys_to_kseg0(NULL, initrd_o