On Tuesday 07 October 2014 11:03 AM, Benjamin Herrenschmidt wrote: > On Wed, 2014-10-01 at 13:16 +0530, Shreyas B. Prabhu wrote: >> Winkle causes power to be gated off to the entire chiplet. Hence the >> hypervisor/firmware state in the entire chiplet is lost. >> >> This patch adds necessary infrastructure to support waking up from >> hypervisor state loss. Specifically does following: >> - Before entering winkle, save state of registers that need to be >> restored on wake up (SDR1, HFSCR) > > Add ... to your list, it's not exhaustive, is it ?
I use interrupt stack frame for only SDR1 and HFSCR. The rest of the SPRs are restored via PORE in the next patch. I'll change the comments to better reflect this. > >> - SRR1 bits 46:47 which is used to identify which power saving mode cpu >> woke up from is '11' for both winkle and sleep. Hence introduce a flag >> in PACA to distinguish b/w winkle and sleep. >> >> - Upon waking up, restore all saved registers, recover slb >> >> Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org> >> Cc: Paul Mackerras <pau...@samba.org> >> Cc: Michael Ellerman <m...@ellerman.id.au> >> Cc: linuxppc-...@lists.ozlabs.org >> Suggested-by: Vaidyanathan Srinivasan <sva...@linux.vnet.ibm.com> >> Signed-off-by: Shreyas B. Prabhu <shre...@linux.vnet.ibm.com> >> --- >> arch/powerpc/include/asm/machdep.h | 1 + >> arch/powerpc/include/asm/paca.h | 3 ++ >> arch/powerpc/include/asm/ppc-opcode.h | 2 + >> arch/powerpc/include/asm/processor.h | 2 + >> arch/powerpc/kernel/asm-offsets.c | 1 + >> arch/powerpc/kernel/exceptions-64s.S | 8 ++-- >> arch/powerpc/kernel/idle.c | 11 +++++ >> arch/powerpc/kernel/idle_power7.S | 81 >> +++++++++++++++++++++++++++++++++- >> arch/powerpc/platforms/powernv/setup.c | 24 ++++++++++ >> 9 files changed, 127 insertions(+), 6 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/machdep.h >> b/arch/powerpc/include/asm/machdep.h >> index f37014f..0a3ced9 100644 >> --- a/arch/powerpc/include/asm/machdep.h >> +++ b/arch/powerpc/include/asm/machdep.h >> @@ -301,6 +301,7 @@ struct machdep_calls { >> /* Idle handlers */ >> void (*setup_idle)(void); >> unsigned long (*power7_sleep)(void); >> + unsigned long (*power7_winkle)(void); >> }; > > Why does it need to be ppc_md ? Same comments as for sleep > >> extern void e500_idle(void); >> diff --git a/arch/powerpc/include/asm/paca.h >> b/arch/powerpc/include/asm/paca.h >> index a5139ea..3358f09 100644 >> --- a/arch/powerpc/include/asm/paca.h >> +++ b/arch/powerpc/include/asm/paca.h >> @@ -158,6 +158,9 @@ struct paca_struct { >> * early exception handler for use by high level C handler >> */ >> struct opal_machine_check_event *opal_mc_evt; >> + >> + /* Flag to distinguish b/w sleep and winkle */ >> + u8 offline_state; > > Not fan of the name. I'd rather you call it "wakeup_state_loss" or > something a bit more explicit about what that actually means if it's > going to be a boolean value. Otherwise make it an enumeration of > constants. > Okay. I'll change this. >> #endif >> #ifdef CONFIG_PPC_BOOK3S_64 >> /* Exclusive emergency stack pointer for machine check exception. */ >> diff --git a/arch/powerpc/include/asm/ppc-opcode.h >> b/arch/powerpc/include/asm/ppc-opcode.h >> index 6f85362..5155be7 100644 >> --- a/arch/powerpc/include/asm/ppc-opcode.h >> +++ b/arch/powerpc/include/asm/ppc-opcode.h >> @@ -194,6 +194,7 @@ >> >> #define PPC_INST_NAP 0x4c000364 >> #define PPC_INST_SLEEP 0x4c0003a4 >> +#define PPC_INST_WINKLE 0x4c0003e4 >> >> /* A2 specific instructions */ >> #define PPC_INST_ERATWE 0x7c0001a6 >> @@ -374,6 +375,7 @@ >> >> #define PPC_NAP stringify_in_c(.long PPC_INST_NAP) >> #define PPC_SLEEP stringify_in_c(.long PPC_INST_SLEEP) >> +#define PPC_WINKLE stringify_in_c(.long PPC_INST_WINKLE) >> >> /* BHRB instructions */ >> #define PPC_CLRBHRB stringify_in_c(.long PPC_INST_CLRBHRB) >> diff --git a/arch/powerpc/include/asm/processor.h >> b/arch/powerpc/include/asm/processor.h >> index 41953cd..00e3df9 100644 >> --- a/arch/powerpc/include/asm/processor.h >> +++ b/arch/powerpc/include/asm/processor.h >> @@ -455,6 +455,8 @@ extern void arch_setup_idle(void); >> extern void power7_nap(int check_irq); >> extern unsigned long power7_sleep(void); >> extern unsigned long __power7_sleep(void); >> +extern unsigned long power7_winkle(void); >> +extern unsigned long __power7_winkle(void); >> extern void flush_instruction_cache(void); >> extern void hard_reset_now(void); >> extern void poweroff_now(void); >> diff --git a/arch/powerpc/kernel/asm-offsets.c >> b/arch/powerpc/kernel/asm-offsets.c >> index 9d7dede..ea98817 100644 >> --- a/arch/powerpc/kernel/asm-offsets.c >> +++ b/arch/powerpc/kernel/asm-offsets.c >> @@ -731,6 +731,7 @@ int main(void) >> DEFINE(OPAL_MC_SRR0, offsetof(struct opal_machine_check_event, srr0)); >> DEFINE(OPAL_MC_SRR1, offsetof(struct opal_machine_check_event, srr1)); >> DEFINE(PACA_OPAL_MC_EVT, offsetof(struct paca_struct, opal_mc_evt)); >> + DEFINE(PACAOFFLINESTATE, offsetof(struct paca_struct, offline_state)); >> #endif >> >> return 0; >> diff --git a/arch/powerpc/kernel/exceptions-64s.S >> b/arch/powerpc/kernel/exceptions-64s.S >> index c64f3cc0..261f348 100644 >> --- a/arch/powerpc/kernel/exceptions-64s.S >> +++ b/arch/powerpc/kernel/exceptions-64s.S >> @@ -115,9 +115,7 @@ BEGIN_FTR_SECTION >> #endif >> >> /* Running native on arch 2.06 or later, check if we are >> - * waking up from nap. We only handle no state loss and >> - * supervisor state loss. We do -not- handle hypervisor >> - * state loss at this time. >> + * waking up from power saving mode. >> */ >> mfspr r13,SPRN_SRR1 >> rlwinm. r13,r13,47-31,30,31 >> @@ -133,8 +131,8 @@ BEGIN_FTR_SECTION >> b power7_wakeup_noloss >> 2: b power7_wakeup_loss >> >> - /* Fast Sleep wakeup on PowerNV */ >> -8: b power7_wakeup_tb_loss >> + /* Fast Sleep / Winkle wakeup on PowerNV */ >> +8: b power7_wakeup_hv_state_loss >> >> 9: >> END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) >> diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c >> index 1f268e0..ed46217 100644 >> --- a/arch/powerpc/kernel/idle.c >> +++ b/arch/powerpc/kernel/idle.c >> @@ -98,6 +98,17 @@ unsigned long power7_sleep(void) >> return ret; >> } >> >> +unsigned long power7_winkle(void) >> +{ >> + unsigned long ret; >> + >> + if (ppc_md.power7_winkle) >> + ret = ppc_md.power7_winkle(); >> + else >> + ret = __power7_winkle(); >> + return ret; >> +} >> + >> int powersave_nap; >> >> #ifdef CONFIG_SYSCTL >> diff --git a/arch/powerpc/kernel/idle_power7.S >> b/arch/powerpc/kernel/idle_power7.S >> index c3481c9..87b2556 100644 >> --- a/arch/powerpc/kernel/idle_power7.S >> +++ b/arch/powerpc/kernel/idle_power7.S >> @@ -18,6 +18,13 @@ >> #include <asm/hw_irq.h> >> #include <asm/kvm_book3s_asm.h> >> #include <asm/opal.h> >> +#include <asm/mmu-hash64.h> >> + >> +/* >> + * Use volatile GPRs' space to save essential SPRs before entering winkle >> + */ >> +#define _SDR1 GPR3 >> +#define _TSCR GPR4 >> >> #undef DEBUG >> >> @@ -39,6 +46,7 @@ >> * Pass requested state in r3: >> * 0 - nap >> * 1 - sleep >> + * 2 - winkle >> * >> * To check IRQ_HAPPENED in r4 >> * 0 - don't check >> @@ -109,9 +117,27 @@ _GLOBAL(power7_enter_nap_mode) >> #endif >> cmpwi cr0,r3,1 >> beq 2f >> + cmpwi cr0,r3,2 >> + beq 3f >> IDLE_STATE_ENTER_SEQ(PPC_NAP) >> /* No return */ >> -2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP) >> +2: >> + li r4,1 >> + stb r4,PACAOFFLINESTATE(r13) >> + IDLE_STATE_ENTER_SEQ(PPC_SLEEP) >> + /* No return */ >> + >> +3: >> + mfspr r4,SPRN_SDR1 >> + std r4,_SDR1(r1) >> + >> + mfspr r4,SPRN_TSCR >> + std r4,_TSCR(r1) >> + >> + /* Enter winkle */ >> + li r4,0 >> + stb r4,PACAOFFLINESTATE(r13) >> + IDLE_STATE_ENTER_SEQ(PPC_WINKLE) >> /* No return */ >> >> _GLOBAL(power7_idle) >> @@ -187,6 +213,59 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, >> 66); \ >> 20: nop; >> >> >> +_GLOBAL(__power7_winkle) >> + li r3,2 >> + li r4,1 >> + b power7_powersave_common >> + /* No return */ >> + >> +_GLOBAL(power7_wakeup_hv_state_loss) >> + /* Check paca flag to diffentiate b/w fast sleep and winkle */ >> + lbz r4,PACAOFFLINESTATE(13) >> + cmpwi cr0,r4,0 >> + bne power7_wakeup_tb_loss >> + >> + ld r2,PACATOC(r13); >> + ld r1,PACAR1(r13) >> + >> + bl __restore_cpu_power8 > > So if I understand correctly, you use a per-cpu flag not a per-core flag > which means we will assume a pessimistic case of having to restore stuff > even if the core didn't actually enter winkle (because the last thread > to go down went to sleep). This is sub-optimal. Also see below: > >> + /* Time base re-sync */ >> + li r3,OPAL_RESYNC_TIMEBASE >> + bl opal_call_realmode; > > You will also resync the timebase (and restore all the core shared SPRs) > for each thread. This is problematic, especially with KVM as you could > have a situation where: > > - The first thread comes out and starts diving into KVM > > - The other threads start coming out while the first one is doing the > above. > > Thus the first thread might already be manipulating some core registers > (SDR1 etc...) while the secondaries come back and ... whack it. Worse, > the primary might have applied the TB offset using TBU40 while the > secondaries resync the timebase back to the host value, incurring a > loss of TB for the guest. > Such a race is prevented with kvm_hstate.hwthread_req and kvm_hstate.hwthread_state paca flags. The current flow when a guest is scheduled on a core : -> Primary thread sets kvm_hstate.hwthread_req paca flag for all the secondary threads. -> Waits for all the secondary threads to to change state to !KVM_HWTHREAD_IN_KERNEL -> and later call __kvmppc_vcore_entry which down the line changes SDR1 and other per core registers. Therefore kvm_hstate.hwthread_req is set to 1 for all the threads in the core *before* SDR1 is switched. And when a secondary thread is woken up to execute guest, in 0x100 we check hwthread_req and branch to kvm_start_guest if set. Therefore secondary threads woken up for guest do not execute the power7_wakeup_hv_state_loss and therefore there is no danger of overwriting SDR1 or TBU40. Now lets consider the case where a guest is scheduled on the core and a secondary thread is woken up even though there is no vcpu to run on it. (Say its woken up by a stray IPI). In this case, again in 0x100 we branch to kvm_start_guest, and here when there is no vcpu to run, it executes nap. So again there no danger of overwriting SDR1. >> + /* Restore SLB from PACA */ >> + ld r8,PACA_SLBSHADOWPTR(r13) >> + >> + .rept SLB_NUM_BOLTED >> + li r3, SLBSHADOW_SAVEAREA >> + LDX_BE r5, r8, r3 >> + addi r3, r3, 8 >> + LDX_BE r6, r8, r3 >> + andis. r7,r5,SLB_ESID_V@h >> + beq 1f >> + slbmte r6,r5 >> +1: addi r8,r8,16 >> + .endr >> + >> + ld r4,_SDR1(r1) >> + mtspr SPRN_SDR1,r4 >> + >> + ld r4,_TSCR(r1) >> + mtspr SPRN_TSCR,r4 >> + >> + REST_NVGPRS(r1) >> + REST_GPR(2, r1) >> + ld r3,_CCR(r1) >> + ld r4,_MSR(r1) >> + ld r5,_NIP(r1) >> + addi r1,r1,INT_FRAME_SIZE >> + mtcr r3 >> + mfspr r3,SPRN_SRR1 /* Return SRR1 */ >> + mtspr SPRN_SRR1,r4 >> + mtspr SPRN_SRR0,r5 >> + rfid >> + >> _GLOBAL(power7_wakeup_tb_loss) >> ld r2,PACATOC(r13); >> ld r1,PACAR1(r13) >> diff --git a/arch/powerpc/platforms/powernv/setup.c >> b/arch/powerpc/platforms/powernv/setup.c >> index 9d9a898..f45b52d 100644 >> --- a/arch/powerpc/platforms/powernv/setup.c >> +++ b/arch/powerpc/platforms/powernv/setup.c >> @@ -370,6 +370,29 @@ static unsigned long pnv_power7_sleep(void) >> return srr1; >> } >> >> +/* >> + * We need to keep track of offline cpus also for calling >> + * fastsleep workaround appropriately >> + */ >> +static unsigned long pnv_power7_winkle(void) >> +{ >> + int cpu, primary_thread; >> + unsigned long srr1; >> + >> + cpu = smp_processor_id(); >> + primary_thread = cpu_first_thread_sibling(cpu); >> + >> + if (need_fastsleep_workaround) { >> + pnv_apply_fastsleep_workaround(1, primary_thread); >> + srr1 = __power7_winkle(); >> + pnv_apply_fastsleep_workaround(0, primary_thread); >> + } else { >> + srr1 = __power7_winkle(); >> + } >> + return srr1; >> +} >> + >> + >> static void __init pnv_setup_machdep_opal(void) >> { >> ppc_md.get_boot_time = opal_get_boot_time; >> @@ -384,6 +407,7 @@ static void __init pnv_setup_machdep_opal(void) >> ppc_md.handle_hmi_exception = opal_handle_hmi_exception; >> ppc_md.setup_idle = pnv_setup_idle; >> ppc_md.power7_sleep = pnv_power7_sleep; >> + ppc_md.power7_winkle = pnv_power7_winkle; >> } >> >> #ifdef CONFIG_PPC_POWERNV_RTAS > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/