The first patch is something I had meant to do forever since the introduction of mwait-idle. The 2nd patch addresses a latent problem (becoming an active one with patch 3) in C-state selection when actually entering an idle state. The 3rd patch is my counterproposal to Brian's intended abuse (as I would call it) of the mwait-idle driver. The final two patches are ones I've been carrying for about 5 years now, initially waiting for a response to my proposed edits to Ross'es patches, and eventually sort of forgotten (but re-based as needed).
1: x86/cpuidle: switch to uniform meaning of "max_cstate=" 2: x86/cpuidle: really use C1 for "urgent" CPUs 3: x86/AMD: make C-state handling independent of Dom0 4: x86: allow limiting the max C-state sub-state 5: tools/libxc: allow controlling the max C-state sub-state Due to mail mangling issues I'm attaching all patches here, alongside the individual mails in reply to this one. Jan
x86/cpuidle: switch to uniform meaning of "max_cstate=" While the MWAIT idle driver already takes it to mean an actual C state, the ACPI idle driver so far used it as a list index. The list index, however, is an implementation detail of Xen and affected by firmware settings (i.e. not necessarily uniform for a particular system). While touching this code also avoid invoking menu_get_trace_data() when tracing is not active. For consistency do this also for the MWAIT driver. Note that I'm intentionally not adding any sorting logic to set_cx(): Before and after this patch we assume entries to arrive in order, so this would be an orthogonal change. Take the opportunity and add minimal documentation for the command line option. Signed-off-by: Jan Beulich <jbeul...@suse.com> --- v2: Adjust xenpm output wording a little. Explicitly log "unlimited". --- TBD: I wonder if we really need struct acpi_processor_cx's idx field anymore. It's used in a number of (questionable) places ... --- a/docs/misc/xen-command-line.pandoc +++ b/docs/misc/xen-command-line.pandoc @@ -1376,6 +1376,8 @@ This option is ignored in **pv-shim** mo ### max_cstate (x86) > `= <integer>` +Specify the deepest C-state CPUs are permitted to be placed in. + ### max_gsi_irqs (x86) > `= <integer>` --- a/tools/misc/xenpm.c +++ b/tools/misc/xenpm.c @@ -64,7 +64,7 @@ void show_help(void) " set-sched-smt enable|disable enable/disable scheduler smt power saving\n" " set-vcpu-migration-delay <num> set scheduler vcpu migration delay in us\n" " get-vcpu-migration-delay get scheduler vcpu migration delay\n" - " set-max-cstate <num> set the C-State limitation (<num> >= 0)\n" + " set-max-cstate <num>|'unlimited' set the C-State limitation (<num> >= 0)\n" " start [seconds] start collect Cx/Px statistics,\n" " output after CTRL-C or SIGINT or several seconds.\n" " enable-turbo-mode [cpuid] enable Turbo Mode for processors that support it.\n" @@ -194,7 +194,11 @@ static int show_max_cstate(xc_interface if ( (ret = xc_get_cpuidle_max_cstate(xc_handle, &value)) ) return ret; - printf("Max possible C-state: C%d\n\n", value); + if ( value < XEN_SYSCTL_CX_UNLIMITED ) + printf("Max possible C-state: C%"PRIu32"\n\n", value); + else + printf("All C-states allowed\n\n"); + return 0; } @@ -1117,18 +1121,24 @@ void get_vcpu_migration_delay_func(int a void set_max_cstate_func(int argc, char *argv[]) { int value; + char buf[12]; - if ( argc != 1 || sscanf(argv[0], "%d", &value) != 1 || value < 0 ) + if ( argc != 1 || + (sscanf(argv[0], "%d", &value) == 1 + ? value < 0 + : (value = XEN_SYSCTL_CX_UNLIMITED, strcmp(argv[0], "unlimited"))) ) { - fprintf(stderr, "Missing or invalid argument(s)\n"); + fprintf(stderr, "Missing, excess, or invalid argument(s)\n"); exit(EINVAL); } + snprintf(buf, ARRAY_SIZE(buf), "C%d", value); + if ( !xc_set_cpuidle_max_cstate(xc_handle, (uint32_t)value) ) - printf("set max_cstate to C%d succeeded\n", value); + printf("max C-state set to %s\n", value >= 0 ? buf : argv[0]); else - fprintf(stderr, "set max_cstate to C%d failed (%d - %s)\n", - value, errno, strerror(errno)); + fprintf(stderr, "Failed to set max C-state to %s (%d - %s)\n", + value >= 0 ? buf : argv[0], errno, strerror(errno)); } void enable_turbo_mode(int argc, char *argv[]) --- a/xen/arch/x86/acpi/cpu_idle.c +++ b/xen/arch/x86/acpi/cpu_idle.c @@ -103,7 +103,7 @@ bool lapic_timer_init(void) } void (*__read_mostly pm_idle_save)(void); -unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER - 1; +unsigned int max_cstate __read_mostly = UINT_MAX; integer_param("max_cstate", max_cstate); static bool __read_mostly local_apic_timer_c2_ok; boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok); @@ -344,7 +344,10 @@ static void dump_cx(unsigned char key) unsigned int cpu; printk("'%c' pressed -> printing ACPI Cx structures\n", key); - printk("max cstate: C%u\n", max_cstate); + if ( max_cstate < UINT_MAX ) + printk("max state: C%u\n", max_cstate); + else + printk("max state: unlimited\n"); for_each_present_cpu ( cpu ) { struct acpi_processor_power *power = processor_powers[cpu]; @@ -582,13 +585,19 @@ static void acpi_processor_idle(void) if ( max_cstate > 0 && power && !sched_has_urgent_vcpu() && (next_state = cpuidle_current_governor->select(power)) > 0 ) { - cx = &power->states[next_state]; - if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check && - acpi_idle_bm_check() ) - cx = power->safe_state; - if ( cx->idx > max_cstate ) - cx = &power->states[max_cstate]; - menu_get_trace_data(&exp, &pred); + do { + cx = &power->states[next_state]; + } while ( cx->type > max_cstate && --next_state ); + if ( next_state ) + { + if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check && + acpi_idle_bm_check() ) + cx = power->safe_state; + if ( tb_init_done ) + menu_get_trace_data(&exp, &pred); + } + else + cx = NULL; } if ( !cx ) { @@ -1396,12 +1405,12 @@ int pmstat_reset_cx_stat(uint32_t cpuid) void cpuidle_disable_deep_cstate(void) { - if ( max_cstate > 1 ) + if ( max_cstate > ACPI_STATE_C1 ) { if ( local_apic_timer_c2_ok ) - max_cstate = 2; + max_cstate = ACPI_STATE_C2; else - max_cstate = 1; + max_cstate = ACPI_STATE_C1; } hpet_disable_legacy_broadcast(); @@ -1409,7 +1418,8 @@ void cpuidle_disable_deep_cstate(void) bool cpuidle_using_deep_cstate(void) { - return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? 2 : 1); + return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2 + : ACPI_STATE_C1); } static int cpu_callback( --- a/xen/arch/x86/cpu/mwait-idle.c +++ b/xen/arch/x86/cpu/mwait-idle.c @@ -731,7 +731,8 @@ static void mwait_idle(void) } while (cx->type > max_cstate && --next_state); if (!next_state) cx = NULL; - menu_get_trace_data(&exp, &pred); + else if (tb_init_done) + menu_get_trace_data(&exp, &pred); } if (!cx) { if (pm_idle_save) --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -2268,7 +2268,7 @@ static void dump_softtsc(unsigned char k else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) ) { printk("TSC has constant rate, "); - if (max_cstate <= 2 && tsc_max_warp == 0) + if ( max_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 ) printk("no deep Cstates, passed warp test, deemed reliable, "); else printk("deep Cstates possible, so not reliable, "); --- a/xen/drivers/acpi/pmstat.c +++ b/xen/drivers/acpi/pmstat.c @@ -451,6 +451,7 @@ int do_pm_op(struct xen_sysctl_pm_op *op case XEN_SYSCTL_pm_op_get_max_cstate: { + BUILD_BUG_ON(XEN_SYSCTL_CX_UNLIMITED != UINT_MAX); op->u.get_max_cstate = acpi_get_cstate_limit(); break; } --- a/xen/include/public/sysctl.h +++ b/xen/include/public/sysctl.h @@ -370,6 +370,7 @@ struct xen_sysctl_pm_op { struct xen_set_cpufreq_para set_para; uint64_aligned_t get_avgfreq; uint32_t set_sched_opt_smt; +#define XEN_SYSCTL_CX_UNLIMITED 0xffffffff uint32_t get_max_cstate; uint32_t set_max_cstate; } u;
x86/cpuidle: really use C1 for "urgent" CPUs For one on recent AMD CPUs entering C1 (if available at all) requires use of MWAIT, while HLT (i.e. default_idle()) would put the processor into as deep as CC6. And then even on other vendors' CPUs we should avoid entering default_idle() when the intended state can be reached by using the active idle driver's facilities. Signed-off-by: Jan Beulich <jbeul...@suse.com> Reviewed-by: Andrew Cooper <andrew.coop...@citrix.com> --- a/xen/arch/x86/acpi/cpu_idle.c +++ b/xen/arch/x86/acpi/cpu_idle.c @@ -582,12 +582,15 @@ static void acpi_processor_idle(void) u32 exp = 0, pred = 0; u32 irq_traced[4] = { 0 }; - if ( max_cstate > 0 && power && !sched_has_urgent_vcpu() && + if ( max_cstate > 0 && power && (next_state = cpuidle_current_governor->select(power)) > 0 ) { + unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1 + : max_cstate; + do { cx = &power->states[next_state]; - } while ( cx->type > max_cstate && --next_state ); + } while ( cx->type > max_state && --next_state ); if ( next_state ) { if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check && --- a/xen/arch/x86/cpu/mwait-idle.c +++ b/xen/arch/x86/cpu/mwait-idle.c @@ -724,11 +724,14 @@ static void mwait_idle(void) u64 before, after; u32 exp = 0, pred = 0, irq_traced[4] = { 0 }; - if (max_cstate > 0 && power && !sched_has_urgent_vcpu() && + if (max_cstate > 0 && power && (next_state = cpuidle_current_governor->select(power)) > 0) { + unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1 + : max_cstate; + do { cx = &power->states[next_state]; - } while (cx->type > max_cstate && --next_state); + } while (cx->type > max_state && --next_state); if (!next_state) cx = NULL; else if (tb_init_done)
x86/AMD: make C-state handling independent of Dom0 At least for more recent CPUs, following what BKDG / PPR suggest for the BIOS to surface via ACPI we can make ourselves independent of Dom0 uploading respective data. Signed-off-by: Jan Beulich <jbeul...@suse.com> --- v2: Handle Hygon Fam18. Set local_apic_timer_c2_ok (for Fam17 and Hygon Fam18 only for now). --- TBD: We may want to verify that HLT indeed is configured to enter CC6. TBD: Brian's series specifies .target_residency as 1000 for CC6; may want to do so here as well. Question then is whether this value is also suitable for the older families. TBD: I guess we could extend this to families older then Fam15 as well. --- a/xen/arch/x86/acpi/cpu_idle.c +++ b/xen/arch/x86/acpi/cpu_idle.c @@ -110,6 +110,8 @@ boolean_param("lapic_timer_c2_ok", local struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS]; +static int8_t __read_mostly vendor_override; + struct hw_residencies { uint64_t mc0; @@ -1214,6 +1216,9 @@ long set_cx_pminfo(uint32_t acpi_id, str if ( pm_idle_save && pm_idle != acpi_processor_idle ) return 0; + if ( vendor_override > 0 ) + return 0; + print_cx_pminfo(acpi_id, power); cpu_id = get_cpu_id(acpi_id); @@ -1286,6 +1291,103 @@ long set_cx_pminfo(uint32_t acpi_id, str return 0; } +static void amd_cpuidle_init(struct acpi_processor_power *power) +{ + unsigned int i, nr = 0; + const struct cpuinfo_x86 *c = ¤t_cpu_data; + const unsigned int ecx_req = CPUID5_ECX_EXTENSIONS_SUPPORTED | + CPUID5_ECX_INTERRUPT_BREAK; + const struct acpi_processor_cx *cx = NULL; + static const struct acpi_processor_cx fam17[] = { + { + .type = ACPI_STATE_C1, + .entry_method = ACPI_CSTATE_EM_FFH, + .address = 0, + .latency = 1, + }, + { + .type = ACPI_STATE_C2, + .entry_method = ACPI_CSTATE_EM_HALT, + .latency = 400, + }, + }; + + if ( pm_idle_save && pm_idle != acpi_processor_idle ) + return; + + if ( vendor_override < 0 ) + return; + + switch ( c->x86 ) + { + case 0x18: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_HYGON ) + { + default: + vendor_override = -1; + return; + } + /* fall through */ + case 0x17: + if ( cpu_has_monitor && c->cpuid_level >= CPUID_MWAIT_LEAF && + (cpuid_ecx(CPUID_MWAIT_LEAF) & ecx_req) == ecx_req ) + { + cx = fam17; + nr = ARRAY_SIZE(fam17); + local_apic_timer_c2_ok = true; + break; + } + /* fall through */ + case 0x15: + case 0x16: + cx = &fam17[1]; + nr = ARRAY_SIZE(fam17) - 1; + break; + } + + power->flags.has_cst = true; + + for ( i = 0; i < nr; ++i ) + { + if ( cx[i].type > max_cstate ) + break; + power->states[i + 1] = cx[i]; + power->states[i + 1].idx = i + 1; + power->states[i + 1].target_residency = cx[i].latency * latency_factor; + } + + if ( i ) + { + power->count = i + 1; + power->safe_state = &power->states[i]; + + if ( !vendor_override ) + { + if ( !boot_cpu_has(X86_FEATURE_ARAT) ) + hpet_broadcast_init(); + + if ( !lapic_timer_init() ) + { + vendor_override = -1; + cpuidle_init_cpu(power->cpu); + return; + } + + if ( !pm_idle_save ) + { + pm_idle_save = pm_idle; + pm_idle = acpi_processor_idle; + } + + dead_idle = acpi_dead_idle; + + vendor_override = 1; + } + } + else + vendor_override = -1; +} + uint32_t pmstat_get_cx_nr(uint32_t cpuid) { return processor_powers[cpuid] ? processor_powers[cpuid]->count : 0; @@ -1432,8 +1534,8 @@ static int cpu_callback( int rc = 0; /* - * Only hook on CPU_UP_PREPARE because a dead cpu may utilize the info - * to enter deep C-state. + * Only hook on CPU_UP_PREPARE / CPU_ONLINE because a dead cpu may utilize + * the info to enter deep C-state. */ switch ( action ) { @@ -1442,6 +1544,13 @@ static int cpu_callback( if ( !rc && cpuidle_current_governor->enable ) rc = cpuidle_current_governor->enable(processor_powers[cpu]); break; + + case CPU_ONLINE: + if ( (boot_cpu_data.x86_vendor & + (X86_VENDOR_AMD | X86_VENDOR_HYGON)) && + processor_powers[cpu] ) + amd_cpuidle_init(processor_powers[cpu]); + break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
x86: allow limiting the max C-state sub-state Allow limiting the max C-state sub-state by appending to the max_cstate command-line parameter. E.g. max_cstate=1,0 The limit only applies to the highest legal C-state. For example: max_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E max_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not C2 max_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not C3 max_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not C3 Signed-off-by: Ross Lagerwall <ross.lagerw...@citrix.com> Signed-off-by: Jan Beulich <jbeul...@suse.com> --- v2: Explicitly log "unlimited". Pass NULL in the 2nd simple_strtoul() invocation. --- a/docs/misc/xen-command-line.pandoc +++ b/docs/misc/xen-command-line.pandoc @@ -1374,9 +1374,11 @@ Specify the maximum number of CPUs that This option is ignored in **pv-shim** mode. ### max_cstate (x86) -> `= <integer>` +> `= <integer>[,<integer>]` -Specify the deepest C-state CPUs are permitted to be placed in. +Specify the deepest C-state CPUs are permitted to be placed in, and +optionally the maximum sub C-state to be used used. The latter only applies +to the highest permitted C-state. ### max_gsi_irqs (x86) > `= <integer>` --- a/xen/arch/x86/acpi/cpu_idle.c +++ b/xen/arch/x86/acpi/cpu_idle.c @@ -104,7 +104,17 @@ bool lapic_timer_init(void) void (*__read_mostly pm_idle_save)(void); unsigned int max_cstate __read_mostly = UINT_MAX; -integer_param("max_cstate", max_cstate); +unsigned int max_csubstate __read_mostly = UINT_MAX; + +static int __init parse_cstate(const char *s) +{ + max_cstate = simple_strtoul(s, &s, 0); + if ( *s == ',' ) + max_csubstate = simple_strtoul(s + 1, NULL, 0); + return 0; +} +custom_param("max_cstate", parse_cstate); + static bool __read_mostly local_apic_timer_c2_ok; boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok); @@ -347,7 +357,13 @@ static void dump_cx(unsigned char key) printk("'%c' pressed -> printing ACPI Cx structures\n", key); if ( max_cstate < UINT_MAX ) + { printk("max state: C%u\n", max_cstate); + if ( max_csubstate < UINT_MAX ) + printk("max sub-state: %u\n", max_csubstate); + else + printk("max sub-state: unlimited\n"); + } else printk("max state: unlimited\n"); for_each_present_cpu ( cpu ) @@ -592,7 +608,13 @@ static void acpi_processor_idle(void) do { cx = &power->states[next_state]; - } while ( cx->type > max_state && --next_state ); + } while ( (cx->type > max_state || + cx->entry_method == ACPI_CSTATE_EM_NONE || + (cx->entry_method == ACPI_CSTATE_EM_FFH && + cx->type == max_cstate && + (cx->address & MWAIT_SUBSTATE_MASK) > max_csubstate)) && + --next_state ); + cx = &power->states[next_state]; if ( next_state ) { if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check && --- a/xen/arch/x86/cpu/mwait-idle.c +++ b/xen/arch/x86/cpu/mwait-idle.c @@ -731,7 +731,9 @@ static void mwait_idle(void) do { cx = &power->states[next_state]; - } while (cx->type > max_state && --next_state); + } while ((cx->type > max_state || (cx->type == max_cstate && + MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) && + --next_state); if (!next_state) cx = NULL; else if (tb_init_done) --- a/xen/include/xen/acpi.h +++ b/xen/include/xen/acpi.h @@ -141,13 +141,21 @@ void acpi_unregister_gsi (u32 gsi); #ifdef CONFIG_ACPI_CSTATE /* - * Set highest legal C-state - * 0: C0 okay, but not C1 - * 1: C1 okay, but not C2 - * 2: C2 okay, but not C3 etc. + * max_cstate sets the highest legal C-state. + * max_cstate = 0: C0 okay, but not C1 + * max_cstate = 1: C1 okay, but not C2 + * max_cstate = 2: C2 okay, but not C3 etc. + + * max_csubstate sets the highest legal C-state sub-state. Only applies to the + * highest legal C-state. + * max_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E + * max_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not C2 + * max_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not C3 + * max_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not C3 */ extern unsigned int max_cstate; +extern unsigned int max_csubstate; static inline unsigned int acpi_get_cstate_limit(void) {
tools/libxc: allow controlling the max C-state sub-state Signed-off-by: Ross Lagerwall <ross.lagerw...@citrix.com> Make handling in do_pm_op() more homogeneous: Before interpreting op->cpuid as such, handle all operations not acting on a particular CPU. Also expose the setting via xenpm. Signed-off-by: Jan Beulich <jbeul...@suse.com> Acked-by: Wei Liu <wei.l...@citrix.com> --- v2: Adjust xenpm's usage message. Also adjust its output wording a little. --- a/tools/libxc/xc_pm.c +++ b/tools/libxc/xc_pm.c @@ -367,7 +367,7 @@ int xc_set_sched_opt_smt(xc_interface *x return rc; } -int xc_get_cpuidle_max_cstate(xc_interface *xch, uint32_t *value) +static int get_max_cstate(xc_interface *xch, uint32_t *value, uint32_t type) { int rc; DECLARE_SYSCTL; @@ -379,7 +379,7 @@ int xc_get_cpuidle_max_cstate(xc_interfa } sysctl.cmd = XEN_SYSCTL_pm_op; sysctl.u.pm_op.cmd = XEN_SYSCTL_pm_op_get_max_cstate; - sysctl.u.pm_op.cpuid = 0; + sysctl.u.pm_op.cpuid = type; sysctl.u.pm_op.u.get_max_cstate = 0; rc = do_sysctl(xch, &sysctl); *value = sysctl.u.pm_op.u.get_max_cstate; @@ -387,7 +387,17 @@ int xc_get_cpuidle_max_cstate(xc_interfa return rc; } -int xc_set_cpuidle_max_cstate(xc_interface *xch, uint32_t value) +int xc_get_cpuidle_max_cstate(xc_interface *xch, uint32_t *value) +{ + return get_max_cstate(xch, value, 0); +} + +int xc_get_cpuidle_max_csubstate(xc_interface *xch, uint32_t *value) +{ + return get_max_cstate(xch, value, 1); +} + +static int set_max_cstate(xc_interface *xch, uint32_t value, uint32_t type) { DECLARE_SYSCTL; @@ -398,12 +408,22 @@ int xc_set_cpuidle_max_cstate(xc_interfa } sysctl.cmd = XEN_SYSCTL_pm_op; sysctl.u.pm_op.cmd = XEN_SYSCTL_pm_op_set_max_cstate; - sysctl.u.pm_op.cpuid = 0; + sysctl.u.pm_op.cpuid = type; sysctl.u.pm_op.u.set_max_cstate = value; return do_sysctl(xch, &sysctl); } +int xc_set_cpuidle_max_cstate(xc_interface *xch, uint32_t value) +{ + return set_max_cstate(xch, value, 0); +} + +int xc_set_cpuidle_max_csubstate(xc_interface *xch, uint32_t value) +{ + return set_max_cstate(xch, value, 1); +} + int xc_enable_turbo(xc_interface *xch, int cpuid) { DECLARE_SYSCTL; --- a/tools/libxc/include/xenctrl.h +++ b/tools/libxc/include/xenctrl.h @@ -1906,6 +1906,9 @@ int xc_set_sched_opt_smt(xc_interface *x int xc_get_cpuidle_max_cstate(xc_interface *xch, uint32_t *value); int xc_set_cpuidle_max_cstate(xc_interface *xch, uint32_t value); +int xc_get_cpuidle_max_csubstate(xc_interface *xch, uint32_t *value); +int xc_set_cpuidle_max_csubstate(xc_interface *xch, uint32_t value); + int xc_enable_turbo(xc_interface *xch, int cpuid); int xc_disable_turbo(xc_interface *xch, int cpuid); --- a/tools/misc/xenpm.c +++ b/tools/misc/xenpm.c @@ -64,7 +64,9 @@ void show_help(void) " set-sched-smt enable|disable enable/disable scheduler smt power saving\n" " set-vcpu-migration-delay <num> set scheduler vcpu migration delay in us\n" " get-vcpu-migration-delay get scheduler vcpu migration delay\n" - " set-max-cstate <num>|'unlimited' set the C-State limitation (<num> >= 0)\n" + " set-max-cstate <num>|'unlimited'[,<num2>|'unlimited']\n" + " set the C-State limitation (<num> >= 0) and\n" + " optionally the C-sub-state limitation (<num2> >= 0)\n" " start [seconds] start collect Cx/Px statistics,\n" " output after CTRL-C or SIGINT or several seconds.\n" " enable-turbo-mode [cpuid] enable Turbo Mode for processors that support it.\n" @@ -195,7 +197,15 @@ static int show_max_cstate(xc_interface return ret; if ( value < XEN_SYSCTL_CX_UNLIMITED ) - printf("Max possible C-state: C%"PRIu32"\n\n", value); + { + printf("Max possible C-state: C%"PRIu32"\n", value); + if ( (ret = xc_get_cpuidle_max_csubstate(xc_handle, &value)) ) + return ret; + if ( value < XEN_SYSCTL_CX_UNLIMITED ) + printf("Max possible substate: %"PRIu32"\n\n", value); + else + puts(""); + } else printf("All C-states allowed\n\n"); @@ -1120,13 +1130,17 @@ void get_vcpu_migration_delay_func(int a void set_max_cstate_func(int argc, char *argv[]) { - int value; + int value, subval = XEN_SYSCTL_CX_UNLIMITED; char buf[12]; - if ( argc != 1 || + if ( argc < 1 || argc > 2 || (sscanf(argv[0], "%d", &value) == 1 ? value < 0 - : (value = XEN_SYSCTL_CX_UNLIMITED, strcmp(argv[0], "unlimited"))) ) + : (value = XEN_SYSCTL_CX_UNLIMITED, strcmp(argv[0], "unlimited"))) || + (argc == 2 && + (sscanf(argv[1], "%d", &subval) == 1 + ? subval < 0 + : (subval = XEN_SYSCTL_CX_UNLIMITED, strcmp(argv[1], "unlimited")))) ) { fprintf(stderr, "Missing, excess, or invalid argument(s)\n"); exit(EINVAL); @@ -1137,8 +1151,23 @@ void set_max_cstate_func(int argc, char if ( !xc_set_cpuidle_max_cstate(xc_handle, (uint32_t)value) ) printf("max C-state set to %s\n", value >= 0 ? buf : argv[0]); else + { fprintf(stderr, "Failed to set max C-state to %s (%d - %s)\n", value >= 0 ? buf : argv[0], errno, strerror(errno)); + return; + } + + if ( value != XEN_SYSCTL_CX_UNLIMITED ) + { + snprintf(buf, ARRAY_SIZE(buf), "%d", subval); + + if ( !xc_set_cpuidle_max_csubstate(xc_handle, (uint32_t)subval) ) + printf("max C-substate set to %s succeeded\n", + subval >= 0 ? buf : "unlimited"); + else + fprintf(stderr, "Failed to set max C-substate to %s (%d - %s)\n", + subval >= 0 ? buf : "unlimited", errno, strerror(errno)); + } } void enable_turbo_mode(int argc, char *argv[]) --- a/xen/drivers/acpi/pmstat.c +++ b/xen/drivers/acpi/pmstat.c @@ -398,7 +398,40 @@ int do_pm_op(struct xen_sysctl_pm_op *op int ret = 0; const struct processor_pminfo *pmpt; - if ( !op || op->cpuid >= nr_cpu_ids || !cpu_online(op->cpuid) ) + switch ( op->cmd ) + { + case XEN_SYSCTL_pm_op_set_sched_opt_smt: + { + uint32_t saved_value = sched_smt_power_savings; + + if ( op->cpuid != 0 ) + return -EINVAL; + sched_smt_power_savings = !!op->u.set_sched_opt_smt; + op->u.set_sched_opt_smt = saved_value; + return 0; + } + + case XEN_SYSCTL_pm_op_get_max_cstate: + BUILD_BUG_ON(XEN_SYSCTL_CX_UNLIMITED != UINT_MAX); + if ( op->cpuid == 0 ) + op->u.get_max_cstate = acpi_get_cstate_limit(); + else if ( op->cpuid == 1 ) + op->u.get_max_cstate = acpi_get_csubstate_limit(); + else + ret = -EINVAL; + return ret; + + case XEN_SYSCTL_pm_op_set_max_cstate: + if ( op->cpuid == 0 ) + acpi_set_cstate_limit(op->u.set_max_cstate); + else if ( op->cpuid == 1 ) + acpi_set_csubstate_limit(op->u.set_max_cstate); + else + ret = -EINVAL; + return ret; + } + + if ( op->cpuid >= nr_cpu_ids || !cpu_online(op->cpuid) ) return -EINVAL; pmpt = processor_pminfo[op->cpuid]; @@ -438,30 +471,6 @@ int do_pm_op(struct xen_sysctl_pm_op *op break; } - case XEN_SYSCTL_pm_op_set_sched_opt_smt: - { - uint32_t saved_value; - - saved_value = sched_smt_power_savings; - sched_smt_power_savings = !!op->u.set_sched_opt_smt; - op->u.set_sched_opt_smt = saved_value; - - break; - } - - case XEN_SYSCTL_pm_op_get_max_cstate: - { - BUILD_BUG_ON(XEN_SYSCTL_CX_UNLIMITED != UINT_MAX); - op->u.get_max_cstate = acpi_get_cstate_limit(); - break; - } - - case XEN_SYSCTL_pm_op_set_max_cstate: - { - acpi_set_cstate_limit(op->u.set_max_cstate); - break; - } - case XEN_SYSCTL_pm_op_enable_turbo: { ret = cpufreq_update_turbo(op->cpuid, CPUFREQ_TURBO_ENABLED); --- a/xen/include/public/sysctl.h +++ b/xen/include/public/sysctl.h @@ -350,7 +350,11 @@ struct xen_sysctl_pm_op { /* set/reset scheduler power saving option */ #define XEN_SYSCTL_pm_op_set_sched_opt_smt 0x21 - /* cpuidle max_cstate access command */ + /* + * cpuidle max C-state and max C-sub-state access command: + * Set cpuid to 0 for max C-state. + * Set cpuid to 1 for max C-sub-state. + */ #define XEN_SYSCTL_pm_op_get_max_cstate 0x22 #define XEN_SYSCTL_pm_op_set_max_cstate 0x23 --- a/xen/include/xen/acpi.h +++ b/xen/include/xen/acpi.h @@ -166,9 +166,22 @@ static inline void acpi_set_cstate_limit max_cstate = new_limit; return; } + +static inline unsigned int acpi_get_csubstate_limit(void) +{ + return max_csubstate; +} + +static inline void acpi_set_csubstate_limit(unsigned int new_limit) +{ + max_csubstate = new_limit; +} + #else static inline unsigned int acpi_get_cstate_limit(void) { return 0; } static inline void acpi_set_cstate_limit(unsigned int new_limit) { return; } +static inline unsigned int acpi_get_csubstate_limit(void) { return 0; } +static inline void acpi_set_csubstate_limit(unsigned int new_limit) { return; } #endif #ifdef XEN_GUEST_HANDLE_PARAM
_______________________________________________ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel