On Tuesday 07 October 2014 11:03 AM, Benjamin Herrenschmidt wrote:
> On Wed, 2014-10-01 at 13:16 +0530, Shreyas B. Prabhu wrote:
>> Winkle causes power to be gated off to the entire chiplet. Hence the
>> hypervisor/firmware state in the entire chiplet is lost.
>>
>> This patch adds necessary infrastructure to support waking up from
>> hypervisor state loss. Specifically does following:
>> - Before entering winkle, save state of registers that need to be
>>   restored on wake up (SDR1, HFSCR)
> 
>  Add ... to your list, it's not exhaustive, is it ?

I use interrupt stack frame for only SDR1 and HFSCR. The rest of the
SPRs are restored via PORE in the next patch. I'll change the comments
to better reflect this.

> 
>> - SRR1 bits 46:47 which is used to identify which power saving mode cpu
>>   woke up from is '11' for both winkle and sleep. Hence introduce a flag
>>   in PACA to distinguish b/w winkle and sleep.
>>
>> - Upon waking up, restore all saved registers, recover slb
>>
>> Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
>> Cc: Paul Mackerras <pau...@samba.org>
>> Cc: Michael Ellerman <m...@ellerman.id.au>
>> Cc: linuxppc-...@lists.ozlabs.org
>> Suggested-by: Vaidyanathan Srinivasan <sva...@linux.vnet.ibm.com>
>> Signed-off-by: Shreyas B. Prabhu <shre...@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/include/asm/machdep.h     |  1 +
>>  arch/powerpc/include/asm/paca.h        |  3 ++
>>  arch/powerpc/include/asm/ppc-opcode.h  |  2 +
>>  arch/powerpc/include/asm/processor.h   |  2 +
>>  arch/powerpc/kernel/asm-offsets.c      |  1 +
>>  arch/powerpc/kernel/exceptions-64s.S   |  8 ++--
>>  arch/powerpc/kernel/idle.c             | 11 +++++
>>  arch/powerpc/kernel/idle_power7.S      | 81 
>> +++++++++++++++++++++++++++++++++-
>>  arch/powerpc/platforms/powernv/setup.c | 24 ++++++++++
>>  9 files changed, 127 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/machdep.h 
>> b/arch/powerpc/include/asm/machdep.h
>> index f37014f..0a3ced9 100644
>> --- a/arch/powerpc/include/asm/machdep.h
>> +++ b/arch/powerpc/include/asm/machdep.h
>> @@ -301,6 +301,7 @@ struct machdep_calls {
>>      /* Idle handlers */
>>      void            (*setup_idle)(void);
>>      unsigned long   (*power7_sleep)(void);
>> +    unsigned long   (*power7_winkle)(void);
>>  };
> 
> Why does it need to be ppc_md ? Same comments as for sleep
> 
>>  extern void e500_idle(void);
>> diff --git a/arch/powerpc/include/asm/paca.h 
>> b/arch/powerpc/include/asm/paca.h
>> index a5139ea..3358f09 100644
>> --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -158,6 +158,9 @@ struct paca_struct {
>>       * early exception handler for use by high level C handler
>>       */
>>      struct opal_machine_check_event *opal_mc_evt;
>> +
>> +    /* Flag to distinguish b/w sleep and winkle */
>> +    u8 offline_state;
> 
> Not fan of the name. I'd rather you call it "wakeup_state_loss" or
> something a bit more explicit about what that actually means if it's
> going to be a boolean value. Otherwise make it an enumeration of
> constants.
> 
Okay. I'll change this.

>>  #endif
>>  #ifdef CONFIG_PPC_BOOK3S_64
>>      /* Exclusive emergency stack pointer for machine check exception. */
>> diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
>> b/arch/powerpc/include/asm/ppc-opcode.h
>> index 6f85362..5155be7 100644
>> --- a/arch/powerpc/include/asm/ppc-opcode.h
>> +++ b/arch/powerpc/include/asm/ppc-opcode.h
>> @@ -194,6 +194,7 @@
>>  
>>  #define PPC_INST_NAP                        0x4c000364
>>  #define PPC_INST_SLEEP                      0x4c0003a4
>> +#define PPC_INST_WINKLE                     0x4c0003e4
>>  
>>  /* A2 specific instructions */
>>  #define PPC_INST_ERATWE                     0x7c0001a6
>> @@ -374,6 +375,7 @@
>>  
>>  #define PPC_NAP                     stringify_in_c(.long PPC_INST_NAP)
>>  #define PPC_SLEEP           stringify_in_c(.long PPC_INST_SLEEP)
>> +#define PPC_WINKLE          stringify_in_c(.long PPC_INST_WINKLE)
>>  
>>  /* BHRB instructions */
>>  #define PPC_CLRBHRB         stringify_in_c(.long PPC_INST_CLRBHRB)
>> diff --git a/arch/powerpc/include/asm/processor.h 
>> b/arch/powerpc/include/asm/processor.h
>> index 41953cd..00e3df9 100644
>> --- a/arch/powerpc/include/asm/processor.h
>> +++ b/arch/powerpc/include/asm/processor.h
>> @@ -455,6 +455,8 @@ extern void arch_setup_idle(void);
>>  extern void power7_nap(int check_irq);
>>  extern unsigned long power7_sleep(void);
>>  extern unsigned long __power7_sleep(void);
>> +extern unsigned long power7_winkle(void);
>> +extern unsigned long __power7_winkle(void);
>>  extern void flush_instruction_cache(void);
>>  extern void hard_reset_now(void);
>>  extern void poweroff_now(void);
>> diff --git a/arch/powerpc/kernel/asm-offsets.c 
>> b/arch/powerpc/kernel/asm-offsets.c
>> index 9d7dede..ea98817 100644
>> --- a/arch/powerpc/kernel/asm-offsets.c
>> +++ b/arch/powerpc/kernel/asm-offsets.c
>> @@ -731,6 +731,7 @@ int main(void)
>>      DEFINE(OPAL_MC_SRR0, offsetof(struct opal_machine_check_event, srr0));
>>      DEFINE(OPAL_MC_SRR1, offsetof(struct opal_machine_check_event, srr1));
>>      DEFINE(PACA_OPAL_MC_EVT, offsetof(struct paca_struct, opal_mc_evt));
>> +    DEFINE(PACAOFFLINESTATE, offsetof(struct paca_struct, offline_state));
>>  #endif
>>  
>>      return 0;
>> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
>> b/arch/powerpc/kernel/exceptions-64s.S
>> index c64f3cc0..261f348 100644
>> --- a/arch/powerpc/kernel/exceptions-64s.S
>> +++ b/arch/powerpc/kernel/exceptions-64s.S
>> @@ -115,9 +115,7 @@ BEGIN_FTR_SECTION
>>  #endif
>>  
>>      /* Running native on arch 2.06 or later, check if we are
>> -     * waking up from nap. We only handle no state loss and
>> -     * supervisor state loss. We do -not- handle hypervisor
>> -     * state loss at this time.
>> +     * waking up from power saving mode.
>>       */
>>      mfspr   r13,SPRN_SRR1
>>      rlwinm. r13,r13,47-31,30,31
>> @@ -133,8 +131,8 @@ BEGIN_FTR_SECTION
>>      b       power7_wakeup_noloss
>>  2:  b       power7_wakeup_loss
>>  
>> -    /* Fast Sleep wakeup on PowerNV */
>> -8:  b       power7_wakeup_tb_loss
>> +    /* Fast Sleep / Winkle wakeup on PowerNV */
>> +8:  b       power7_wakeup_hv_state_loss
>>  
>>  9:
>>  END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
>> diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
>> index 1f268e0..ed46217 100644
>> --- a/arch/powerpc/kernel/idle.c
>> +++ b/arch/powerpc/kernel/idle.c
>> @@ -98,6 +98,17 @@ unsigned long power7_sleep(void)
>>      return ret;
>>  }
>>  
>> +unsigned long power7_winkle(void)
>> +{
>> +    unsigned long ret;
>> +
>> +    if (ppc_md.power7_winkle)
>> +            ret = ppc_md.power7_winkle();
>> +    else
>> +            ret = __power7_winkle();
>> +    return ret;
>> +}
>> +
>>  int powersave_nap;
>>  
>>  #ifdef CONFIG_SYSCTL
>> diff --git a/arch/powerpc/kernel/idle_power7.S 
>> b/arch/powerpc/kernel/idle_power7.S
>> index c3481c9..87b2556 100644
>> --- a/arch/powerpc/kernel/idle_power7.S
>> +++ b/arch/powerpc/kernel/idle_power7.S
>> @@ -18,6 +18,13 @@
>>  #include <asm/hw_irq.h>
>>  #include <asm/kvm_book3s_asm.h>
>>  #include <asm/opal.h>
>> +#include <asm/mmu-hash64.h>
>> +
>> +/*
>> + * Use volatile GPRs' space to save essential SPRs before entering winkle
>> + */
>> +#define _SDR1       GPR3
>> +#define _TSCR       GPR4
>>  
>>  #undef DEBUG
>>  
>> @@ -39,6 +46,7 @@
>>   * Pass requested state in r3:
>>   *  0 - nap
>>   *  1 - sleep
>> + *  2 - winkle
>>   *
>>   * To check IRQ_HAPPENED in r4
>>   *  0 - don't check
>> @@ -109,9 +117,27 @@ _GLOBAL(power7_enter_nap_mode)
>>  #endif
>>      cmpwi   cr0,r3,1
>>      beq     2f
>> +    cmpwi   cr0,r3,2
>> +    beq     3f
>>      IDLE_STATE_ENTER_SEQ(PPC_NAP)
>>      /* No return */
>> -2:  IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>> +2:
>> +    li      r4,1
>> +    stb     r4,PACAOFFLINESTATE(r13)
>> +    IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>> +    /* No return */
>> +
>> +3:
>> +    mfspr   r4,SPRN_SDR1
>> +    std     r4,_SDR1(r1)
>> +
>> +    mfspr   r4,SPRN_TSCR
>> +    std     r4,_TSCR(r1)
>> +
>> +    /* Enter winkle */
>> +        li      r4,0
>> +        stb     r4,PACAOFFLINESTATE(r13)
>> +    IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
>>      /* No return */
>>  
>>  _GLOBAL(power7_idle)
>> @@ -187,6 +213,59 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 
>> 66);                \
>>  20: nop;
>>  
>>
>> +_GLOBAL(__power7_winkle)
>> +    li      r3,2
>> +    li      r4,1
>> +    b       power7_powersave_common
>> +    /* No return */
>> +
>> +_GLOBAL(power7_wakeup_hv_state_loss)
>> +    /* Check paca flag to diffentiate b/w fast sleep and winkle */
>> +    lbz     r4,PACAOFFLINESTATE(13)
>> +    cmpwi   cr0,r4,0
>> +    bne     power7_wakeup_tb_loss
>> +
>> +    ld      r2,PACATOC(r13);
>> +    ld      r1,PACAR1(r13)
>> +
>> +    bl      __restore_cpu_power8
> 
> So if I understand correctly, you use a per-cpu flag not a per-core flag
> which means we will assume a pessimistic case of having to restore stuff
> even if the core didn't actually enter winkle (because the last thread
> to go down went to sleep). This is sub-optimal. Also see below:
> 
>> +    /* Time base re-sync */
>> +    li      r3,OPAL_RESYNC_TIMEBASE
>> +    bl      opal_call_realmode;
> 
> You will also resync the timebase (and restore all the core shared SPRs)
> for each thread. This is problematic, especially with KVM as you could
> have a situation where:
> 
>  - The first thread comes out and starts diving into KVM
> 
>  - The other threads start coming out while the first one is doing the
> above.
> 
> Thus the first thread might already be manipulating some core registers
> (SDR1 etc...) while the secondaries come back and ... whack it. Worse,
> the primary might have applied the TB offset using TBU40 while the
> secondaries resync the timebase back to the host value, incurring a
> loss of TB for the guest.
>

Such a race is prevented with kvm_hstate.hwthread_req and
kvm_hstate.hwthread_state paca flags.

The current flow when a guest is scheduled on a core :
-> Primary thread sets kvm_hstate.hwthread_req paca flag for all the
secondary threads.
-> Waits for all the secondary threads to change state to
!KVM_HWTHREAD_IN_KERNEL
-> and later call __kvmppc_vcore_entry which down the line changes SDR1
and other per core registers. Therefore kvm_hstate.hwthread_req is set
to 1 for all the threads in the core *before* SDR1 is switched.

And when a secondary thread is woken up to execute guest, in 0x100 we
check hwthread_req and branch to kvm_start_guest if set. Therefore
secondary threads woken up for guest do not execute the
power7_wakeup_hv_state_loss and therefore there is no danger of
overwriting SDR1 or TBU40.

Now let's consider the case where a guest is scheduled on the core and a
secondary thread is woken up even though there is no vcpu to run on it.
(Say it's woken up by a stray IPI). In this case, again in 0x100 we
branch to kvm_start_guest, and here when there is no vcpu to run, it
executes nap. So again there is no danger of overwriting SDR1.

>> +    /* Restore SLB  from PACA */
>> +    ld      r8,PACA_SLBSHADOWPTR(r13)
>> +
>> +    .rept   SLB_NUM_BOLTED
>> +    li      r3, SLBSHADOW_SAVEAREA
>> +    LDX_BE  r5, r8, r3
>> +    addi    r3, r3, 8
>> +    LDX_BE  r6, r8, r3
>> +    andis.  r7,r5,SLB_ESID_V@h
>> +    beq     1f
>> +    slbmte  r6,r5
>> +1:  addi    r8,r8,16
>> +    .endr
>> +
>> +    ld      r4,_SDR1(r1)
>> +    mtspr   SPRN_SDR1,r4
>> +
>> +    ld      r4,_TSCR(r1)
>> +    mtspr   SPRN_TSCR,r4
>> +
>> +    REST_NVGPRS(r1)
>> +    REST_GPR(2, r1)
>> +    ld      r3,_CCR(r1)
>> +    ld      r4,_MSR(r1)
>> +    ld      r5,_NIP(r1)
>> +    addi    r1,r1,INT_FRAME_SIZE
>> +    mtcr    r3
>> +    mfspr   r3,SPRN_SRR1            /* Return SRR1 */
>> +    mtspr   SPRN_SRR1,r4
>> +    mtspr   SPRN_SRR0,r5
>> +    rfid
>> +
>>  _GLOBAL(power7_wakeup_tb_loss)
>>      ld      r2,PACATOC(r13);
>>      ld      r1,PACAR1(r13)
>> diff --git a/arch/powerpc/platforms/powernv/setup.c 
>> b/arch/powerpc/platforms/powernv/setup.c
>> index 9d9a898..f45b52d 100644
>> --- a/arch/powerpc/platforms/powernv/setup.c
>> +++ b/arch/powerpc/platforms/powernv/setup.c
>> @@ -370,6 +370,29 @@ static unsigned long pnv_power7_sleep(void)
>>      return srr1;
>>  }
>>  
>> +/*
>> + * We need to keep track of offline cpus also for calling
>> + * fastsleep workaround appropriately
>> + */
>> +static unsigned long pnv_power7_winkle(void)
>> +{
>> +    int cpu, primary_thread;
>> +    unsigned long srr1;
>> +
>> +    cpu = smp_processor_id();
>> +    primary_thread = cpu_first_thread_sibling(cpu);
>> +
>> +    if (need_fastsleep_workaround) {
>> +            pnv_apply_fastsleep_workaround(1, primary_thread);
>> +            srr1 = __power7_winkle();
>> +            pnv_apply_fastsleep_workaround(0, primary_thread);
>> +    } else {
>> +                    srr1 = __power7_winkle();
>> +    }
>> +    return srr1;
>> +}
>> +
>> +
>>  static void __init pnv_setup_machdep_opal(void)
>>  {
>>      ppc_md.get_boot_time = opal_get_boot_time;
>> @@ -384,6 +407,7 @@ static void __init pnv_setup_machdep_opal(void)
>>      ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
>>      ppc_md.setup_idle = pnv_setup_idle;
>>      ppc_md.power7_sleep = pnv_power7_sleep;
>> +    ppc_md.power7_winkle = pnv_power7_winkle;
>>  }
>>  
>>  #ifdef CONFIG_PPC_POWERNV_RTAS
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to