[PATCH 1/2] powerpc/powernv: Add definition of OPAL_MSG_OCC message type
Add OPAL_MSG_OCC message definition to opal_message_type to notify OCC events like reset, load and throttled. Host performance can be affected when OCC is reset or OCC throttles the max Pstate. We can register to opal_message_notifier to receive OPAL_MSG_OCC type of message. The reset and load OCC events are notified to kernel when FSP sends OCC_RESET and OCC_LOAD commands. Both reset and load messages are sent to kernel on successful completion of reset and load operation respectively. The throttle OCC event indicates that the Pmax of the chip is reduced. The chip_id and throttle reason for reducing Pmax is also queued along with the message. Additional opal message type OPAL_MSG_PRD is added to maintain compatibility between opal and kernel definition of opal_message_type. Signed-off-by: Shilpasri G Bhat --- arch/powerpc/include/asm/opal-api.h | 8 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 0321a90..50053b7 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -352,6 +352,14 @@ enum opal_msg_type { OPAL_MSG_SHUTDOWN, /* params[0] = 1 reboot, 0 shutdown */ OPAL_MSG_HMI_EVT, OPAL_MSG_DPO, + OPAL_MSG_PRD, + OPAL_MSG_OCC, /* +* params[0] = 0 reset, +* 1 load, +* 2 throttle +* params[1] = chip_id +* params[2] = throttle_status +*/ OPAL_MSG_TYPE_MAX, }; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/2] cpufreq: powernv: Register for OCC related opal_message notification
OCC is an On-Chip-Controller which takes care of power and thermal safety of the chip. During runtime due to power failure or overtemperature the OCC may throttle the frequencies of the CPUs to remain within the power budget. We want the cpufreq driver to be aware of such situations to be able to report it to the user. We register to opal_message_notifier to receive OCC messages from opal. powernv_cpufreq_throttle_check() reports any frequency throttling and this patch will report the reason or event that caused throttling. We can be throttled if OCC is reset or OCC limits Pmax due to power or thermal reasons. We are also notified of unthrottling after an OCC reset or if OCC restores Pmax on the chip. Signed-off-by: Shilpasri G Bhat CC: "Rafael J. Wysocki" CC: Viresh Kumar CC: linux...@vger.kernel.org --- drivers/cpufreq/powernv-cpufreq.c | 70 ++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index ebef0d8..5718765 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -32,6 +32,7 @@ #include #include #include /* Required for cpu_sibling_mask() in UP configs */ +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) @@ -40,7 +41,7 @@ #define PMSR_LP(x) ((x >> 48) & 0xFF) static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; -static bool rebooting, throttled; +static bool rebooting, throttled, occ_reset; /* * Note: The set of pstates consists of contiguous integers, the @@ -395,6 +396,72 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { .notifier_call = powernv_cpufreq_reboot_notifier, }; +static char throttle_reason[6][50] = { "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "OverCurrent", + "OCC Reset" +}; + +static int powernv_cpufreq_occ_msg(struct notifier_block *nb, + unsigned long msg_type, void *msg) +{ + struct opal_msg *occ_msg = msg; + 
uint64_t token; + uint64_t chip_id, reason; + + if (msg_type != OPAL_MSG_OCC) + return 0; + token = be64_to_cpu(occ_msg->params[0]); + switch (token) { + case 0: + occ_reset = true; + /* +* powernv_cpufreq_throttle_check() is called in +* target() callback which can detect the throttle state +* for governors like ondemand. +* But static governors will not call target() often thus +* report throttling here. +*/ + if (!throttled) { + throttled = true; + pr_crit("CPU Frequency is throttled\n"); + } + pr_info("OCC in Reset\n"); + break; + case 1: + pr_info("OCC is Loaded\n"); + break; + case 2: + chip_id = be64_to_cpu(occ_msg->params[1]); + reason = be64_to_cpu(occ_msg->params[2]); + if (occ_reset) { + occ_reset = false; + throttled = false; + pr_info("OCC is Active\n"); + /* Sanity check for static governors */ + powernv_cpufreq_throttle_check(smp_processor_id()); + } else if (reason) { + throttled = true; + pr_info("Pmax reduced due to %s on chip %x\n", + throttle_reason[reason], (int)chip_id); + } else { + throttled = false; + pr_info("%s on chip %x\n", + throttle_reason[reason], (int)chip_id); + } + break; + } + return 0; +} + +static struct notifier_block powernv_cpufreq_opal_nb = { + .notifier_call = powernv_cpufreq_occ_msg, + .next = NULL, + .priority = 0, +}; + static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy) { struct powernv_smp_call_data freq_data; @@ -430,6 +497,7 @@ static int __init powernv_cpufreq_init(void) } register_reboot_notifier(&powernv_cpufreq_reboot_nb); + opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); return cpufreq_register_driver(&powernv_cpufreq_driver); } module_init(powernv_cpufreq_init); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/2] cpufreq: powernv: Register for OCC related opal_message notification
Hi Viresh, On 04/27/2015 10:02 AM, Viresh Kumar wrote: > On 22 April 2015 at 22:34, Shilpasri G Bhat > wrote: >> diff --git a/drivers/cpufreq/powernv-cpufreq.c >> b/drivers/cpufreq/powernv-cpufreq.c > >> +static char throttle_reason[6][50] = { "No throttling", > > Don't need to mention 6 here. > > And the max length you need right now is 27, so maybe s/50/30 ? > > Also, start 'No Throttling' in a new line, like below. Will do. > >> + "Power Cap", >> + "Processor Over Temperature", >> + "Power Supply Failure", >> + "OverCurrent", > > s/OverCurrent/Over Current/ ? Okay. > >> + "OCC Reset" >> +}; >> + >> +static int powernv_cpufreq_occ_msg(struct notifier_block *nb, >> + unsigned long msg_type, void *msg) >> +{ >> + struct opal_msg *occ_msg = msg; >> + uint64_t token; >> + uint64_t chip_id, reason; >> + >> + if (msg_type != OPAL_MSG_OCC) >> + return 0; > > Blank line here. Okay > >> + token = be64_to_cpu(occ_msg->params[0]); > > Here as well.. > >> + switch (token) { >> + case 0: >> + occ_reset = true; >> + /* >> +* powernv_cpufreq_throttle_check() is called in >> +* target() callback which can detect the throttle state >> +* for governors like ondemand. >> +* But static governors will not call target() often thus >> +* report throttling here. >> +*/ > > Now, do I understand correctly that this notifier will be called as > soon as we switch throttling state ? > > If yes, then do we still need the throttle_check() routine you added > earlier ? Maybe not. We cannot remove throttle_check() routine for the following reasons: 1) To report old firmware bugs which do not restore frequency control to host after an OCC reset. 2) In BMC based boxes if OCC crashes currently firmware will not send 'reset' and 'load' messages, in such cases throttle_check() will be sufficient to monitor a throttled state caused by 'reset'. 3) Throttle reporting in old firmwares which do not have this notification. 
> >> + if (!throttled) { >> + throttled = true; >> + pr_crit("CPU Frequency is throttled\n"); >> + } >> + pr_info("OCC in Reset\n"); >> + break; >> + case 1: >> + pr_info("OCC is Loaded\n"); >> + break; >> + case 2: >> + chip_id = be64_to_cpu(occ_msg->params[1]); >> + reason = be64_to_cpu(occ_msg->params[2]); > > Blank line here. Okay > >> + if (occ_reset) { >> + occ_reset = false; >> + throttled = false; >> + pr_info("OCC is Active\n"); >> + /* Sanity check for static governors */ >> + powernv_cpufreq_throttle_check(smp_processor_id()); >> + } else if (reason) { >> + throttled = true; >> + pr_info("Pmax reduced due to %s on chip %x\n", >> + throttle_reason[reason], >> (int)chip_id); >> + } else { >> + throttled = false; >> + pr_info("%s on chip %x\n", >> + throttle_reason[reason], >> (int)chip_id); >> + } > > Run checkpatch with --strict option, and you will see some warnings. Okay will do. Thanks and Regards, Shilpa ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/2] cpufreq: powernv: Register for OCC related opal_message notification
Hi Preeti, On 04/23/2015 05:28 PM, Preeti U Murthy wrote: > Hi Shilpa, > > On 04/22/2015 10:34 PM, Shilpasri G Bhat wrote: >> OCC is an On-Chip-Controller which takes care of power and thermal >> safety of the chip. During runtime due to power failure or >> overtemperature the OCC may throttle the frequencies of the CPUs to >> remain within the power budget. >> >> We want the cpufreq driver to be aware of such situations to be able >> to report it to the user. We register to opal_message_notifier to >> receive OCC messages from opal. >> >> powernv_cpufreq_throttle_check() reports any frequency throttling and >> this patch will report the reason or event that caused throttling. We >> can be throttled if OCC is reset or OCC limits Pmax due to power or >> thermal reasons. We are also notified of unthrottling after an OCC >> reset or if OCC restores Pmax on the chip. >> >> Signed-off-by: Shilpasri G Bhat >> CC: "Rafael J. Wysocki" >> CC: Viresh Kumar >> CC: linux...@vger.kernel.org >> --- >> drivers/cpufreq/powernv-cpufreq.c | 70 >> ++- >> 1 file changed, 69 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/cpufreq/powernv-cpufreq.c >> b/drivers/cpufreq/powernv-cpufreq.c >> index ebef0d8..5718765 100644 >> --- a/drivers/cpufreq/powernv-cpufreq.c >> +++ b/drivers/cpufreq/powernv-cpufreq.c >> @@ -32,6 +32,7 @@ >> #include >> #include >> #include /* Required for cpu_sibling_mask() in UP configs */ >> +#include >> >> #define POWERNV_MAX_PSTATES 256 >> #define PMSR_PSAFE_ENABLE (1UL << 30) >> @@ -40,7 +41,7 @@ >> #define PMSR_LP(x) ((x >> 48) & 0xFF) >> >> static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; >> -static bool rebooting, throttled; >> +static bool rebooting, throttled, occ_reset; >> >> /* >> * Note: The set of pstates consists of contiguous integers, the >> @@ -395,6 +396,72 @@ static struct notifier_block powernv_cpufreq_reboot_nb >> = { >> .notifier_call = powernv_cpufreq_reboot_notifier, >> }; >> >> +static char 
throttle_reason[6][50] = { "No throttling", >> +"Power Cap", >> +"Processor Over Temperature", >> +"Power Supply Failure", >> +"OverCurrent", >> +"OCC Reset" >> + }; >> + >> +static int powernv_cpufreq_occ_msg(struct notifier_block *nb, >> +unsigned long msg_type, void *msg) >> +{ >> +struct opal_msg *occ_msg = msg; >> +uint64_t token; >> +uint64_t chip_id, reason; >> + >> +if (msg_type != OPAL_MSG_OCC) >> +return 0; >> +token = be64_to_cpu(occ_msg->params[0]); >> +switch (token) { >> +case 0: >> +occ_reset = true; >> +/* >> + * powernv_cpufreq_throttle_check() is called in >> + * target() callback which can detect the throttle state >> + * for governors like ondemand. >> + * But static governors will not call target() often thus >> + * report throttling here. >> + */ >> +if (!throttled) { >> +throttled = true; >> +pr_crit("CPU Frequency is throttled\n"); >> +} >> +pr_info("OCC in Reset\n"); >> +break; >> +case 1: >> +pr_info("OCC is Loaded\n"); >> +break; >> +case 2: > > You may want to replace the numbers with macros. Like > OCC_RESET,OCC_LOAD, OCC_THROTTLE for better readability. Okay will do. > >> +chip_id = be64_to_cpu(occ_msg->params[1]); >> +reason = be64_to_cpu(occ_msg->params[2]); >> +if (occ_reset) { >> +occ_reset = false; >> +throttled = false; >> +pr_info("OCC is Active\n"); >> +/* Sanity check for static governors */ >> +powernv_cpufreq_throttle_check(smp_processor_id()); >> +} else if (reason) { >> +throttled = true; >> +pr_info("Pmax reduced due to %s on chip %x\n", >> +throttle_reason[reason], (int)chip_id); >> +} else { >> +throttled = false; >> +pr_info("%s on chip %x\n", >> +throttle_reason[reason], (int)chip_id); > > Don't you need a powernv_cpufreq_throttle_check() here? Or is it ok to > rely on the OCC notification for unthrottle ? Yes we need to check. Fixing this in v2. Thanks and Regards, Shilpa ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 0/2] powernv: cpufreq: Report frequency throttle by OCC
This patchset intends to add a frequency throttle reporting mechanism to the powernv-cpufreq driver when OCC throttles the frequency. OCC is an On-Chip-Controller which takes care of the power and thermal safety of the chip. The CPU frequency can be throttled during an OCC reset or when OCC tries to limit the max allowed frequency. The patchset will report such conditions so as to keep the user informed about the reason for the drop in performance of workloads when frequency is throttled. Shilpasri G Bhat (2): powerpc/powernv: Add definition of OPAL_MSG_OCC message type cpufreq: powernv: Register for OCC related opal_message notification arch/powerpc/include/asm/opal-api.h | 8 ++ drivers/cpufreq/powernv-cpufreq.c | 181 +--- 2 files changed, 174 insertions(+), 15 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 1/2] powerpc/powernv: Add definition of OPAL_MSG_OCC message type
Add OPAL_MSG_OCC message definition to opal_message_type to receive OCC events like reset, load and throttled. Host performance can be affected when OCC is reset or OCC throttles the max Pstate. We can register to opal_message_notifier to receive OPAL_MSG_OCC type of message and report it to the userspace so as to keep the user informed about the reason for a performance drop in workloads. The reset and load OCC events are notified to kernel when FSP sends OCC_RESET and OCC_LOAD commands. Both reset and load messages are sent to kernel on successful completion of reset and load operation respectively. The throttle OCC event indicates that the Pmax of the chip is reduced. The chip_id and throttle reason for reducing Pmax is also queued along with the message. Additional opal message type OPAL_MSG_PRD is added to maintain compatibility between opal and kernel definition of opal_message_type. Signed-off-by: Shilpasri G Bhat Reviewed-by: Preeti U Murthy --- Changes from v1: - Update the commit changelog arch/powerpc/include/asm/opal-api.h | 8 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 0321a90..50053b7 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -352,6 +352,14 @@ enum opal_msg_type { OPAL_MSG_SHUTDOWN, /* params[0] = 1 reboot, 0 shutdown */ OPAL_MSG_HMI_EVT, OPAL_MSG_DPO, + OPAL_MSG_PRD, + OPAL_MSG_OCC, /* +* params[0] = 0 reset, +* 1 load, +* 2 throttle +* params[1] = chip_id +* params[2] = throttle_status +*/ OPAL_MSG_TYPE_MAX, }; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 2/2] cpufreq: powernv: Register for OCC related opal_message notification
OCC is an On-Chip-Controller which takes care of power and thermal safety of the chip. During runtime due to power failure or overtemperature the OCC may throttle the frequencies of the CPUs to remain within the power budget. We want the cpufreq driver to be aware of such situations to be able to report it to the user. We register to opal_message_notifier to receive OCC messages from opal. powernv_cpufreq_throttle_check() reports any frequency throttling and this patch will report the reason or event that caused throttling. We can be throttled if OCC is reset or OCC limits Pmax due to power or thermal reasons. We are also notified of unthrottling after an OCC reset or if OCC restores Pmax on the chip. Signed-off-by: Shilpasri G Bhat CC: "Rafael J. Wysocki" CC: Viresh Kumar CC: Preeti U Murthy CC: linux...@vger.kernel.org --- Changes from v1: - Add macros to define OCC_RESET, OCC_LOAD and OCC_THROTTLE - Define a structure to store chip id, chip mask which has bits set for cpus present in the chip, throttled state and a work_struct. - Modify powernv_cpufreq_throttle_check() to be called via smp_call() - On Pmax throttling/unthrottling update 'chip.throttled' and not the global 'throttled' as Pmax capping is local to the chip. - Remove the condition which checks if local pstate is less than Pmin while checking for Psafe frequency. When OCC becomes active after reset we update 'thottled' to false and when the cpufreq governor initiates a pstate change, the local pstate will be in Psafe and we will be reporting a false positive when we are not throttled. - Schedule a kworker on receiving throttling/unthrottling OCC message for that chip and schedule on all chips after receiving active. - After an OCC reset all the cpus will be in Psafe frequency. So call target() and restore the frequency to policy->cur after OCC_ACTIVE and Pmax unthrottling - Taken care of Viresh and Preeti's comments. 
drivers/cpufreq/powernv-cpufreq.c | 181 ++ 1 file changed, 166 insertions(+), 15 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index ebef0d8..b356c9d 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -27,20 +27,33 @@ #include #include #include +#include #include #include #include #include /* Required for cpu_sibling_mask() in UP configs */ +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) #define PMSR_SPR_EM_DISABLE(1UL << 31) #define PMSR_MAX(x)((x >> 32) & 0xFF) -#define PMSR_LP(x) ((x >> 48) & 0xFF) +#define OCC_RESET 0 +#define OCC_LOAD 1 +#define OCC_THROTTLE 2 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; -static bool rebooting, throttled; +static bool rebooting, throttled, occ_reset; + +static struct chip { + int id; + bool throttled; + cpumask_t mask; + struct work_struct throttle; +} *chips; + +static int nr_chips; /* * Note: The set of pstates consists of contiguous integers, the @@ -298,28 +311,33 @@ static inline unsigned int get_nominal_index(void) return powernv_pstate_info.max - powernv_pstate_info.nominal; } -static void powernv_cpufreq_throttle_check(unsigned int cpu) +static void powernv_cpufreq_throttle_check(void *data) { + unsigned int cpu = smp_processor_id(); unsigned long pmsr; - int pmsr_pmax, pmsr_lp; + int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); + for (i = 0; i < nr_chips; i++) + if (chips[i].id == cpu_to_chip_id(cpu)) + break; + /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { - throttled = true; - pr_info("CPU %d Pmax is reduced to %d\n", cpu, pmsr_pmax); - pr_info("Max allowed Pstate is capped\n"); + if (chips[i].throttled) + goto next; + chips[i].throttled = true; + pr_info("CPU %d on chip %d Pmax is reduced to %d\n", cpu, + chips[i].id, pmsr_pmax); + } else { + chips[i].throttled = false; } - /* -* Check for Psafe by reading 
LocalPstate -* or check if Psafe_mode_active is set in PMSR. -*/ - pmsr_lp = (s8)PMSR_LP(pmsr); - if ((pmsr_lp < powernv_pstate_info.min) || - (pmsr & PMSR_PSAFE_ENABLE)) { + /* Check if Psafe_mode_active is set in PMSR. */ +next: + if (pmsr & PMSR_PSAFE_ENABLE) { throttled = true; pr_info("Pstate set to safe frequency\n"); } @@ -350,
Re: [PATCH v2 2/2] cpufreq: powernv: Register for OCC related opal_message notification
Hi Viresh, On 04/28/2015 12:18 PM, Viresh Kumar wrote: > On 28 April 2015 at 11:53, Shilpasri G Bhat > wrote: > >> Changes from v1: >> - Add macros to define OCC_RESET, OCC_LOAD and OCC_THROTTLE >> - Define a structure to store chip id, chip mask which has bits set >> for cpus present in the chip, throttled state and a work_struct. >> - Modify powernv_cpufreq_throttle_check() to be called via smp_call() > > Why ? I might have missed it but there should be some reasoning behind > what you are changing. My bad, I haven't added an explicit comment stating the reason behind this change. I modified the definition of *throttle_check() to match the function definition to be called via smp_call() instead of adding an additional wrapper around *throttle_check(). OCC is a chip entity and any local throttle state changes should be associated with the cpus belonging to that chip. The *throttle_check() will read the core register PMSR to verify throttling. All the cores in a chip will have the same throttled state as they are managed by the same OCC in that chip. smp_call() is required to ensure *throttle_check() is called on a cpu belonging to the chip for which we have received throttled/unthrottled notification. We could be handling throttled/unthrottled notification of 'chip1' in 'chip2' so do an smp_call() on 'chip1'. We are irq-disabled in powernv_cpufreq_occ_msg(), the notification handler; hence the use of a kworker to do an smp_call and restore policy->cur. OCC_RESET is a global event: it affects the frequency of all chips. Pmax capping is a local event: it affects the frequency of a single chip. > >> - On Pmax throttling/unthrottling update 'chip.throttled' and not the >> global 'throttled' as Pmax capping is local to the chip. >> - Remove the condition which checks if local pstate is less than Pmin >> while checking for Psafe frequency. 
When OCC becomes active after >> reset we update 'thottled' to false and when the cpufreq governor >> initiates a pstate change, the local pstate will be in Psafe and we >> will be reporting a false positive when we are not throttled. >> - Schedule a kworker on receiving throttling/unthrottling OCC message >> for that chip and schedule on all chips after receiving active. >> - After an OCC reset all the cpus will be in Psafe frequency. So call >> target() and restore the frequency to policy->cur after OCC_ACTIVE >> and Pmax unthrottling >> - Taken care of Viresh and Preeti's comments. > > That's a lot. I am not an expert here and so really can't comment on > the internals of ppc. But, is it patch solving a single problem ? I don't > know, I somehow got the impression that it can be split into multiple > (smaller & review-able) patches. Only if it makes sense. Your call. All the changes introduced in this patch is centered around opal_message notification handler powernv_cpufreq_occ_msg(). I can split it into multiple patches but it all will be relevant only to solve the above problem. > >> diff --git a/drivers/cpufreq/powernv-cpufreq.c >> b/drivers/cpufreq/powernv-cpufreq.c > >> +void powernv_cpufreq_work_fn(struct work_struct *work) >> +{ >> + struct chip *c = container_of(work, struct chip, throttle); >> + unsigned int cpu; >> + >> + smp_call_function_any(&c->mask, >> + powernv_cpufreq_throttle_check, NULL, 0); >> + >> + for_each_cpu(cpu, &c->mask) { > > for_each_online_cpu ? I want to iterate on all the cpus in a chip stored in 'struct chip.mask'. If you were intending me to avoid 'if(!cpu_online(cpu))' then will the following do: for_each_cpu_and(cpu, &c->mask, cpu_online_mask) > >> + int index; >> + struct cpufreq_frequency_table *freq_table; >> + struct cpufreq_policy cpu_policy; > > Name it policy. Okay. > >> + >> + if (!cpu_online(cpu)) >> + continue; > > And you can kill this.. 
> >> + cpufreq_get_policy(&cpu_policy, cpu); >> + freq_table = cpufreq_frequency_get_table(cpu_policy.cpu); > > Just do, policy->freq_table. Okay. > > >> +static int powernv_cpufreq_occ_msg(struct notifier_block *nb, >> + unsigned long msg_type, void *msg) >> +{ > >> + if (reason && reason <= 5) >> + pr_info("OCC: Chip %d Pmax reduced due to %s\n", >> + (int)chip_id, throttle_reason[reason]); >> + else
Re: [PATCH v2 2/2] cpufreq: powernv: Register for OCC related opal_message notification
On 04/28/2015 02:23 PM, Viresh Kumar wrote: > On 28 April 2015 at 13:48, Shilpasri G Bhat > wrote: >> My bad I haven't added explicit comment to state reason behind this change. >> >> I modified the definition of *throttle_check() to match the function >> definition >> to be called via smp_call() instead of adding an additional wrapper around >> *throttle_check(). >> >> OCC is a chip entity and any local throttle state changes should be >> associated >> to cpus belonging to that chip. The *throttle_check() will read the core >> register PMSR to verify throttling. All the cores in a chip will have the >> same >> throttled state as they are managed by a the same OCC in that chip. >> >> smp_call() is required to ensure *throttle_check() is called on a cpu >> belonging >> to the chip for which we have received throttled/unthrottled notification. We >> could be handling throttled/unthrottled notification of 'chip1' in 'chip2' >> so do >> an smp_call() on 'chip1'. > > Okay. Lets talk about the code that is already present in mainline. Isn't that > suffering from this issue ? If yes, then you need to bugfix that separately. Nope. The upstream code does not have this issue as it does not have checks to detect unthrottling state. The unthrottling i.e, 'throttled=false' is being handled only in this patchset. Yes this can be fixed separately. > >> We are irq_disabled in powernv_cpufreq_occ_msg() the notification handler. >> Thus the use of kworker to do an smp_call and restore policy->cur. >> >> OCC_RESET is global event it affects frequency of all chips. Pmax capping is >> local event, it affects the frequency of a chip. >> > >>> That's a lot. I am not an expert here and so really can't comment on >>> the internals of ppc. But, is it patch solving a single problem ? I don't >>> know, I somehow got the impression that it can be split into multiple >>> (smaller & review-able) patches. Only if it makes sense. Your call. 
>> >> All the changes introduced in this patch is centered around opal_message >> notification handler powernv_cpufreq_occ_msg(). I can split it into multiple >> patches but it all will be relevant only to solve the above problem. > > And that's what I meant here. Yes, this all is solving a central problem, but > a patch must be divided into separate, independently working, entities. > Yup agree. Will do. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 0/6] powernv: cpufreq: Report frequency throttle by OCC
This patchset intends to add a frequency throttle reporting mechanism to the powernv-cpufreq driver when OCC throttles the frequency. OCC is an On-Chip-Controller which takes care of the power and thermal safety of the chip. The CPU frequency can be throttled during an OCC reset or when OCC tries to limit the max allowed frequency. The patchset will report such conditions so as to keep the user informed about the reason for the drop in performance of workloads when frequency is throttled. Changes from v2: - Split into multiple patches - Semantic fixes Shilpasri G Bhat (6): cpufreq: powernv: Handle throttling due to Pmax capping at chip level powerpc/powernv: Add definition of OPAL_MSG_OCC message type cpufreq: powernv: Register for OCC related opal_message notification cpufreq: powernv: Call throttle_check() on receiving OCC_THROTTLE cpufreq: powernv: Report Psafe only if PMSR.psafe_mode_active bit is set cpufreq: powernv: Restore cpu frequency to policy->cur on unthrottling arch/powerpc/include/asm/opal-api.h | 8 ++ drivers/cpufreq/powernv-cpufreq.c | 199 +--- 2 files changed, 192 insertions(+), 15 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 1/6] cpufreq: powernv: Handle throttling due to Pmax capping at chip level
The On-Chip-Controller(OCC) can throttle cpu frequency by reducing the max allowed frequency for that chip if the chip exceeds its power or temperature limits. As Pmax capping is a chip level condition report this throttling behavior at chip level and also do not set the global 'throttled' on Pmax capping instead set the per-chip throttled variable. Report unthrottling if Pmax is restored after throttling. This patch adds a structure to store chip id and throttled state of the chip. Signed-off-by: Shilpasri G Bhat --- drivers/cpufreq/powernv-cpufreq.c | 59 --- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index ebef0d8..d0c18c9 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,13 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled; +static struct chip { + unsigned int id; + bool throttled; +} *chips; + +static int nr_chips; + /* * Note: The set of pstates consists of contiguous integers, the * smallest of which is indicated by powernv_pstate_info.min, the @@ -301,22 +309,33 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(unsigned int cpu) { unsigned long pmsr; - int pmsr_pmax, pmsr_lp; + int pmsr_pmax, pmsr_lp, i; pmsr = get_pmspr(SPRN_PMSR); + for (i = 0; i < nr_chips; i++) + if (chips[i].id == cpu_to_chip_id(cpu)) + break; + /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { - throttled = true; - pr_info("CPU %d Pmax is reduced to %d\n", cpu, pmsr_pmax); - pr_info("Max allowed Pstate is capped\n"); + if (chips[i].throttled) + goto next; + chips[i].throttled = true; + pr_info("CPU %d on Chip %u has Pmax reduced to %d\n", cpu, + chips[i].id, pmsr_pmax); + } else if (chips[i].throttled) { + 
chips[i].throttled = false; + pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, + chips[i].id, pmsr_pmax); } /* * Check for Psafe by reading LocalPstate * or check if Psafe_mode_active is set in PMSR. */ +next: pmsr_lp = (s8)PMSR_LP(pmsr); if ((pmsr_lp < powernv_pstate_info.min) || (pmsr & PMSR_PSAFE_ENABLE)) { @@ -414,6 +433,33 @@ static struct cpufreq_driver powernv_cpufreq_driver = { .attr = powernv_cpu_freq_attr, }; +static int init_chip_info(void) +{ + unsigned int chip[256]; + unsigned int cpu, i; + unsigned int prev_chip_id = UINT_MAX; + + for_each_possible_cpu(cpu) { + unsigned int id = cpu_to_chip_id(cpu); + + if (prev_chip_id != id) { + prev_chip_id = id; + chip[nr_chips++] = id; + } + } + + chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); + if (!chips) + return -ENOMEM; + + for (i = 0; i < nr_chips; i++) { + chips[i].id = chip[i]; + chips[i].throttled = false; + } + + return 0; +} + static int __init powernv_cpufreq_init(void) { int rc = 0; @@ -429,6 +475,11 @@ static int __init powernv_cpufreq_init(void) return rc; } + /* Populate chip info */ + rc = init_chip_info(); + if (rc) + return rc; + register_reboot_notifier(&powernv_cpufreq_reboot_nb); return cpufreq_register_driver(&powernv_cpufreq_driver); } -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 2/6] powerpc/powernv: Add definition of OPAL_MSG_OCC message type
Add OPAL_MSG_OCC message definition to opal_message_type to receive OCC events like reset, load and throttled. Host performance can be affected when OCC is reset or OCC throttles the max Pstate. We can register to opal_message_notifier to receive OPAL_MSG_OCC type of message and report it to the userspace so as to keep the user informed about the reason for a performance drop in workloads. The reset and load OCC events are notified to kernel when FSP sends OCC_RESET and OCC_LOAD commands. Both reset and load messages are sent to kernel on successful completion of reset and load operation respectively. The throttle OCC event indicates that the Pmax of the chip is reduced. The chip_id and throttle reason for reducing Pmax is also queued along with the message. Additional opal message type OPAL_MSG_PRD is added to maintain compatibility between opal and kernel definition of opal_message_type. Signed-off-by: Shilpasri G Bhat Reviewed-by: Preeti U Murthy --- No change from V2 Change from v1: - Update the commit changelog arch/powerpc/include/asm/opal-api.h | 8 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 0321a90..50053b7 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -352,6 +352,14 @@ enum opal_msg_type { OPAL_MSG_SHUTDOWN, /* params[0] = 1 reboot, 0 shutdown */ OPAL_MSG_HMI_EVT, OPAL_MSG_DPO, + OPAL_MSG_PRD, + OPAL_MSG_OCC, /* +* params[0] = 0 reset, +* 1 load, +* 2 throttle +* params[1] = chip_id +* params[2] = throttle_status +*/ OPAL_MSG_TYPE_MAX, }; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 4/6] cpufreq: powernv: Call throttle_check() on receiving OCC_THROTTLE
Re-evaluate the chip's throttled state on receiving OCC_THROTTLE notification by executing *throttle_check() on any one of the cpu on the chip. This is a sanity check to verify if we were indeed throttled/unthrottled after receiving OCC_THROTTLE notification. We cannot call *throttle_check() directly from the notification handler because we could be handling chip1's notification in chip2. So initiate an smp_call to execute *throttle_check(). We are irq-disabled in the notification handler, so use a worker thread to smp_call throttle_check() on any of the cpu in the chipmask. Signed-off-by: Shilpasri G Bhat --- drivers/cpufreq/powernv-cpufreq.c | 28 ++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 9268424..9618813 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -50,6 +50,8 @@ static bool rebooting, throttled, occ_reset; static struct chip { unsigned int id; bool throttled; + cpumask_t mask; + struct work_struct throttle; } *chips; static int nr_chips; @@ -310,8 +312,9 @@ static inline unsigned int get_nominal_index(void) return powernv_pstate_info.max - powernv_pstate_info.nominal; } -static void powernv_cpufreq_throttle_check(unsigned int cpu) +static void powernv_cpufreq_throttle_check(void *data) { + unsigned int cpu = smp_processor_id(); unsigned long pmsr; int pmsr_pmax, pmsr_lp, i; @@ -373,7 +376,7 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, return 0; if (!throttled) - powernv_cpufreq_throttle_check(smp_processor_id()); + powernv_cpufreq_throttle_check(NULL); freq_data.pstate_id = powernv_freqs[new_index].driver_data; @@ -418,6 +421,14 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { .notifier_call = powernv_cpufreq_reboot_notifier, }; +void powernv_cpufreq_work_fn(struct work_struct *work) +{ + struct chip *chip = container_of(work, struct chip, throttle); + + 
smp_call_function_any(&chip->mask, + powernv_cpufreq_throttle_check, NULL, 0); +} + static char throttle_reason[][30] = { "No throttling", "Power Cap", @@ -433,6 +444,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, struct opal_msg *occ_msg = msg; uint64_t token; uint64_t chip_id, reason; + int i; if (msg_type != OPAL_MSG_OCC) return 0; @@ -466,6 +478,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, occ_reset = false; throttled = false; pr_info("OCC: Active\n"); + + for (i = 0; i < nr_chips; i++) + schedule_work(&chips[i].throttle); + return 0; } @@ -476,6 +492,12 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, else if (!reason) pr_info("OCC: Chip %u %s\n", (unsigned int)chip_id, throttle_reason[reason]); + else + return 0; + + for (i = 0; i < nr_chips; i++) + if (chips[i].id == chip_id) + schedule_work(&chips[i].throttle); } return 0; } @@ -527,6 +549,8 @@ static int init_chip_info(void) for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; chips[i].throttled = false; + cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); + INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); } return 0; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 3/6] cpufreq: powernv: Register for OCC related opal_message notification
OCC is an On-Chip-Controller which takes care of power and thermal safety of the chip. During runtime due to power failure or overtemperature the OCC may throttle the frequencies of the CPUs to remain within the power budget. We want the cpufreq driver to be aware of such situations to be able to report the reason to the user. We register to opal_message_notifier to receive OCC messages from opal. powernv_cpufreq_throttle_check() reports any frequency throttling and this patch will report the reason or event that caused throttling. We can be throttled if OCC is reset or OCC limits Pmax due to power or thermal reasons. We are also notified of unthrottling after an OCC reset or if OCC restores Pmax on the chip. Signed-off-by: Shilpasri G Bhat --- Changes from v2: - Patch split into multiple patches. - This patch contains only the opal_message notification handler Changes from v1: - Add macros to define OCC_RESET, OCC_LOAD and OCC_THROTTLE - Define a structure to store chip id, chip mask which has bits set for cpus present in the chip, throttled state and a work_struct. - Modify powernv_cpufreq_throttle_check() to be called via smp_call() - On Pmax throttling/unthrottling update 'chip.throttled' and not the global 'throttled' as Pmax capping is local to the chip. - Remove the condition which checks if local pstate is less than Pmin while checking for Psafe frequency. When OCC becomes active after reset we update 'throttled' to false and when the cpufreq governor initiates a pstate change, the local pstate will be in Psafe and we will be reporting a false positive when we are not throttled. - Schedule a kworker on receiving throttling/unthrottling OCC message for that chip and schedule on all chips after receiving active. - After an OCC reset all the cpus will be in Psafe frequency. So call target() and restore the frequency to policy->cur after OCC_ACTIVE and Pmax unthrottling - Taken care of Viresh and Preeti's comments. 
drivers/cpufreq/powernv-cpufreq.c | 75 ++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index d0c18c9..9268424 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -33,15 +33,19 @@ #include #include #include /* Required for cpu_sibling_mask() in UP configs */ +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) #define PMSR_SPR_EM_DISABLE(1UL << 31) #define PMSR_MAX(x)((x >> 32) & 0xFF) #define PMSR_LP(x) ((x >> 48) & 0xFF) +#define OCC_RESET 0 +#define OCC_LOAD 1 +#define OCC_THROTTLE 2 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; -static bool rebooting, throttled; +static bool rebooting, throttled, occ_reset; static struct chip { unsigned int id; @@ -414,6 +418,74 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { .notifier_call = powernv_cpufreq_reboot_notifier, }; +static char throttle_reason[][30] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + +static int powernv_cpufreq_occ_msg(struct notifier_block *nb, + unsigned long msg_type, void *msg) +{ + struct opal_msg *occ_msg = msg; + uint64_t token; + uint64_t chip_id, reason; + + if (msg_type != OPAL_MSG_OCC) + return 0; + + token = be64_to_cpu(occ_msg->params[0]); + + switch (token) { + case OCC_RESET: + occ_reset = true; + /* +* powernv_cpufreq_throttle_check() is called in +* target() callback which can detect the throttle state +* for governors like ondemand. +* But static governors will not call target() often thus +* report throttling here. 
+*/ + if (!throttled) { + throttled = true; + pr_crit("CPU Frequency is throttled\n"); + } + pr_info("OCC: Reset\n"); + break; + case OCC_LOAD: + pr_info("OCC: Loaded\n"); + break; + case OCC_THROTTLE: + chip_id = be64_to_cpu(occ_msg->params[1]); + reason = be64_to_cpu(occ_msg->params[2]); + + if (occ_reset) {
[PATCH v3 5/6] cpufreq: powernv: Report Psafe only if PMSR.psafe_mode_active bit is set
On a reset cycle of OCC, although the system retires from safe frequency state the local pstate is not restored to Pmin or last requested pstate. Now if the cpufreq governor initiates a pstate change, the local pstate will be in Psafe and we will be reporting a false positive when we are not throttled. So in powernv_cpufreq_throttle_check() remove the condition which checks if local pstate is less than Pmin while checking for Psafe frequency. If the cpus are forced to Psafe then PMSR.psafe_mode_active bit will be set. So, when OCCs become active this bit will be cleared. Let us just rely on this bit for reporting throttling. Signed-off-by: Shilpasri G Bhat --- drivers/cpufreq/powernv-cpufreq.c | 12 +++- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 9618813..0a59d5b 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -39,7 +39,6 @@ #define PMSR_PSAFE_ENABLE (1UL << 30) #define PMSR_SPR_EM_DISABLE(1UL << 31) #define PMSR_MAX(x)((x >> 32) & 0xFF) -#define PMSR_LP(x) ((x >> 48) & 0xFF) #define OCC_RESET 0 #define OCC_LOAD 1 #define OCC_THROTTLE 2 @@ -316,7 +315,7 @@ static void powernv_cpufreq_throttle_check(void *data) { unsigned int cpu = smp_processor_id(); unsigned long pmsr; - int pmsr_pmax, pmsr_lp, i; + int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); @@ -338,14 +337,9 @@ static void powernv_cpufreq_throttle_check(void *data) chips[i].id, pmsr_pmax); } - /* -* Check for Psafe by reading LocalPstate -* or check if Psafe_mode_active is set in PMSR. -*/ + /* Check if Psafe_mode_active is set in PMSR. */ next: - pmsr_lp = (s8)PMSR_LP(pmsr); - if ((pmsr_lp < powernv_pstate_info.min) || - (pmsr & PMSR_PSAFE_ENABLE)) { + if (pmsr & PMSR_PSAFE_ENABLE) { throttled = true; pr_info("Pstate set to safe frequency\n"); } -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 6/6] cpufreq: powernv: Restore cpu frequency to policy->cur on unthrottling
If frequency is throttled due to OCC reset then cpus will be in Psafe frequency, so restore the frequency on all cpus to policy->cur when OCCs are active again. And if frequency is throttled due to Pmax capping then restore the frequency of all the cpus in the chip on unthrottling. Signed-off-by: Shilpasri G Bhat --- drivers/cpufreq/powernv-cpufreq.c | 31 +-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 0a59d5b..b2915bc 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -51,6 +51,7 @@ static struct chip { bool throttled; cpumask_t mask; struct work_struct throttle; + bool restore; } *chips; static int nr_chips; @@ -418,9 +419,29 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { void powernv_cpufreq_work_fn(struct work_struct *work) { struct chip *chip = container_of(work, struct chip, throttle); + unsigned int cpu; + cpumask_var_t mask; smp_call_function_any(&chip->mask, powernv_cpufreq_throttle_check, NULL, 0); + + if (!chip->restore) + return; + + chip->restore = false; + cpumask_copy(mask, &chip->mask); + for_each_cpu_and(cpu, mask, cpu_online_mask) { + int index, tcpu; + struct cpufreq_policy policy; + + cpufreq_get_policy(&policy, cpu); + cpufreq_frequency_table_target(&policy, policy.freq_table, + policy.cur, + CPUFREQ_RELATION_C, &index); + powernv_cpufreq_target_index(&policy, index); + for_each_cpu(tcpu, policy.cpus) + cpumask_clear_cpu(tcpu, mask); + } } static char throttle_reason[][30] = { @@ -473,8 +494,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, throttled = false; pr_info("OCC: Active\n"); - for (i = 0; i < nr_chips; i++) + for (i = 0; i < nr_chips; i++) { + chips[i].restore = true; schedule_work(&chips[i].throttle); + } return 0; } @@ -490,8 +513,11 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, return 0; for (i = 0; i < nr_chips; i++) - if (chips[i].id == chip_id) + 
if (chips[i].id == chip_id) { + if (!reason) + chips[i].restore = true; schedule_work(&chips[i].throttle); + } } return 0; } @@ -545,6 +571,7 @@ static int init_chip_info(void) chips[i].throttled = false; cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); + chips[i].restore = false; } return 0; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 1/6] cpufreq: powernv: Handle throttling due to Pmax capping at chip level
Hi Preeti, On 05/05/2015 09:21 AM, Preeti U Murthy wrote: > Hi Shilpa, > > On 05/04/2015 02:24 PM, Shilpasri G Bhat wrote: >> The On-Chip-Controller(OCC) can throttle cpu frequency by reducing the >> max allowed frequency for that chip if the chip exceeds its power or >> temperature limits. As Pmax capping is a chip level condition report >> this throttling behavior at chip level and also do not set the global >> 'throttled' on Pmax capping instead set the per-chip throttled >> variable. Report unthrottling if Pmax is restored after throttling. >> >> This patch adds a structure to store chip id and throttled state of >> the chip. >> >> Signed-off-by: Shilpasri G Bhat >> --- >> drivers/cpufreq/powernv-cpufreq.c | 59 >> --- >> 1 file changed, 55 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/cpufreq/powernv-cpufreq.c >> b/drivers/cpufreq/powernv-cpufreq.c >> index ebef0d8..d0c18c9 100644 >> --- a/drivers/cpufreq/powernv-cpufreq.c >> +++ b/drivers/cpufreq/powernv-cpufreq.c >> @@ -27,6 +27,7 @@ >> #include >> #include >> #include >> +#include >> >> #include >> #include >> @@ -42,6 +43,13 @@ >> static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; >> static bool rebooting, throttled; >> >> +static struct chip { >> +unsigned int id; >> +bool throttled; >> +} *chips; >> + >> +static int nr_chips; >> + >> /* >> * Note: The set of pstates consists of contiguous integers, the >> * smallest of which is indicated by powernv_pstate_info.min, the >> @@ -301,22 +309,33 @@ static inline unsigned int get_nominal_index(void) >> static void powernv_cpufreq_throttle_check(unsigned int cpu) >> { >> unsigned long pmsr; >> -int pmsr_pmax, pmsr_lp; >> +int pmsr_pmax, pmsr_lp, i; >> >> pmsr = get_pmspr(SPRN_PMSR); >> >> +for (i = 0; i < nr_chips; i++) >> +if (chips[i].id == cpu_to_chip_id(cpu)) >> +break; >> + >> /* Check for Pmax Capping */ >> pmsr_pmax = (s8)PMSR_MAX(pmsr); >> if (pmsr_pmax != powernv_pstate_info.max) { >> -throttled = true; >> 
-pr_info("CPU %d Pmax is reduced to %d\n", cpu, pmsr_pmax); >> -pr_info("Max allowed Pstate is capped\n"); >> +if (chips[i].throttled) >> +goto next; >> +chips[i].throttled = true; >> +pr_info("CPU %d on Chip %u has Pmax reduced to %d\n", cpu, >> +chips[i].id, pmsr_pmax); >> +} else if (chips[i].throttled) { >> +chips[i].throttled = false; > > Is this check on pmax sufficient to indicate that the chip is unthrottled ? Unthrottling due to Pmax uncapping here is specific to a chip. So it is sufficient to decide throttling/unthrottling when OCC is active for that chip. > >> +pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, >> +chips[i].id, pmsr_pmax); >> } >> >> /* >> * Check for Psafe by reading LocalPstate >> * or check if Psafe_mode_active is set in PMSR. >> */ >> +next: >> pmsr_lp = (s8)PMSR_LP(pmsr); >> if ((pmsr_lp < powernv_pstate_info.min) || >> (pmsr & PMSR_PSAFE_ENABLE)) { >> @@ -414,6 +433,33 @@ static struct cpufreq_driver powernv_cpufreq_driver = { >> .attr = powernv_cpu_freq_attr, > > What about the situation where although occ is active, this particular > chip has been throttled and we end up repeatedly reporting "pstate set > to safe" and "frequency control disabled from OS" ? Should we not have a > check on (chips[i].throttled) before reporting an anomaly for these two > scenarios as well just like you have for pmsr_pmax ? We will not have "Psafe" and "frequency control disabled" repeatedly printed because of global variable 'throttled', which is set to true on passing any of these two conditions. It is quite unlikely behavior to have only one chip in "Psafe" or "frequency control disabled" state. These two conditions are most likely to happen during an OCC reset cycle which will occur across all chips. Thanks and Regards, Shilpa ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 4/6] cpufreq: powernv: Call throttle_check() on receiving OCC_THROTTLE
Hi Preeti, On 05/05/2015 09:30 AM, Preeti U Murthy wrote: > Hi Shilpa, > > On 05/04/2015 02:24 PM, Shilpasri G Bhat wrote: >> Re-evaluate the chip's throttled state on recieving OCC_THROTTLE >> notification by executing *throttle_check() on any one of the cpu on >> the chip. This is a sanity check to verify if we were indeed >> throttled/unthrottled after receiving OCC_THROTTLE notification. >> >> We cannot call *throttle_check() directly from the notification >> handler because we could be handling chip1's notification in chip2. So >> initiate an smp_call to execute *throttle_check(). We are irq-disabled >> in the notification handler, so use a worker thread to smp_call >> throttle_check() on any of the cpu in the chipmask. > > I see that the first patch takes care of reporting *per-chip* throttling > for pmax capping condition. But where are we taking care of reporting > "pstate set to safe" and "freq control disabled" scenarios per-chip ? > IMO let us not have "psafe" and "freq control disabled" states managed per-chip. Because when the above two conditions occur it is likely to happen across all chips during an OCC reset cycle. So I am setting 'throttled' to false on OCC_ACTIVE and re-verifying if it actually is the case by invoking *throttle_check(). 
>> >> Signed-off-by: Shilpasri G Bhat >> --- >> drivers/cpufreq/powernv-cpufreq.c | 28 ++-- >> 1 file changed, 26 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/cpufreq/powernv-cpufreq.c >> b/drivers/cpufreq/powernv-cpufreq.c >> index 9268424..9618813 100644 >> --- a/drivers/cpufreq/powernv-cpufreq.c >> +++ b/drivers/cpufreq/powernv-cpufreq.c >> @@ -50,6 +50,8 @@ static bool rebooting, throttled, occ_reset; >> static struct chip { >> unsigned int id; >> bool throttled; >> +cpumask_t mask; >> +struct work_struct throttle; >> } *chips; >> >> static int nr_chips; >> @@ -310,8 +312,9 @@ static inline unsigned int get_nominal_index(void) >> return powernv_pstate_info.max - powernv_pstate_info.nominal; >> } >> >> -static void powernv_cpufreq_throttle_check(unsigned int cpu) >> +static void powernv_cpufreq_throttle_check(void *data) >> { >> +unsigned int cpu = smp_processor_id(); >> unsigned long pmsr; >> int pmsr_pmax, pmsr_lp, i; >> >> @@ -373,7 +376,7 @@ static int powernv_cpufreq_target_index(struct >> cpufreq_policy *policy, >> return 0; >> >> if (!throttled) >> -powernv_cpufreq_throttle_check(smp_processor_id()); >> +powernv_cpufreq_throttle_check(NULL); >> >> freq_data.pstate_id = powernv_freqs[new_index].driver_data; >> >> @@ -418,6 +421,14 @@ static struct notifier_block powernv_cpufreq_reboot_nb >> = { >> .notifier_call = powernv_cpufreq_reboot_notifier, >> }; >> >> +void powernv_cpufreq_work_fn(struct work_struct *work) >> +{ >> +struct chip *chip = container_of(work, struct chip, throttle); >> + >> +smp_call_function_any(&chip->mask, >> + powernv_cpufreq_throttle_check, NULL, 0); >> +} >> + >> static char throttle_reason[][30] = { >> "No throttling", >> "Power Cap", >> @@ -433,6 +444,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block >> *nb, >> struct opal_msg *occ_msg = msg; >> uint64_t token; >> uint64_t chip_id, reason; >> +int i; >> >> if (msg_type != OPAL_MSG_OCC) >> return 0; >> @@ -466,6 +478,10 @@ static int 
powernv_cpufreq_occ_msg(struct >> notifier_block *nb, >> occ_reset = false; >> throttled = false; >> pr_info("OCC: Active\n"); >> + >> +for (i = 0; i < nr_chips; i++) >> +schedule_work(&chips[i].throttle); >> + >> return 0; >> } >> >> @@ -476,6 +492,12 @@ static int powernv_cpufreq_occ_msg(struct >> notifier_block *nb, >> else if (!reason) >> pr_info("OCC: Chip %u %s\n", (unsigned int)chip_id, >> throttle_reason[reason]); >> +else >> +return 0; > > Why the else section ? The code can never reach here, can
Re: [PATCH v3 1/6] cpufreq: powernv: Handle throttling due to Pmax capping at chip level
On 05/05/2015 02:08 PM, Preeti U Murthy wrote: > On 05/05/2015 11:36 AM, Shilpasri G Bhat wrote: >> Hi Preeti, >> >> On 05/05/2015 09:21 AM, Preeti U Murthy wrote: >>> Hi Shilpa, >>> >>> On 05/04/2015 02:24 PM, Shilpasri G Bhat wrote: >>>> The On-Chip-Controller(OCC) can throttle cpu frequency by reducing the >>>> max allowed frequency for that chip if the chip exceeds its power or >>>> temperature limits. As Pmax capping is a chip level condition report >>>> this throttling behavior at chip level and also do not set the global >>>> 'throttled' on Pmax capping instead set the per-chip throttled >>>> variable. Report unthrottling if Pmax is restored after throttling. >>>> >>>> This patch adds a structure to store chip id and throttled state of >>>> the chip. >>>> >>>> Signed-off-by: Shilpasri G Bhat >>>> --- >>>> drivers/cpufreq/powernv-cpufreq.c | 59 >>>> --- >>>> 1 file changed, 55 insertions(+), 4 deletions(-) >>>> >>>> diff --git a/drivers/cpufreq/powernv-cpufreq.c >>>> b/drivers/cpufreq/powernv-cpufreq.c >>>> index ebef0d8..d0c18c9 100644 >>>> --- a/drivers/cpufreq/powernv-cpufreq.c >>>> +++ b/drivers/cpufreq/powernv-cpufreq.c >>>> @@ -27,6 +27,7 @@ >>>> #include >>>> #include >>>> #include >>>> +#include >>>> >>>> #include >>>> #include >>>> @@ -42,6 +43,13 @@ >>>> static struct cpufreq_frequency_table >>>> powernv_freqs[POWERNV_MAX_PSTATES+1]; >>>> static bool rebooting, throttled; >>>> >>>> +static struct chip { >>>> + unsigned int id; >>>> + bool throttled; >>>> +} *chips; >>>> + >>>> +static int nr_chips; >>>> + >>>> /* >>>> * Note: The set of pstates consists of contiguous integers, the >>>> * smallest of which is indicated by powernv_pstate_info.min, the >>>> @@ -301,22 +309,33 @@ static inline unsigned int get_nominal_index(void) >>>> static void powernv_cpufreq_throttle_check(unsigned int cpu) >>>> { >>>>unsigned long pmsr; >>>> - int pmsr_pmax, pmsr_lp; >>>> + int pmsr_pmax, pmsr_lp, i; >>>> >>>>pmsr = get_pmspr(SPRN_PMSR); >>>> >>>> + for (i = 0; i < 
nr_chips; i++) >>>> + if (chips[i].id == cpu_to_chip_id(cpu)) >>>> + break; >>>> + >>>>/* Check for Pmax Capping */ >>>>pmsr_pmax = (s8)PMSR_MAX(pmsr); >>>>if (pmsr_pmax != powernv_pstate_info.max) { >>>> - throttled = true; >>>> - pr_info("CPU %d Pmax is reduced to %d\n", cpu, pmsr_pmax); >>>> - pr_info("Max allowed Pstate is capped\n"); >>>> + if (chips[i].throttled) >>>> + goto next; >>>> + chips[i].throttled = true; >>>> + pr_info("CPU %d on Chip %u has Pmax reduced to %d\n", cpu, >>>> + chips[i].id, pmsr_pmax); >>>> + } else if (chips[i].throttled) { >>>> + chips[i].throttled = false; >>> >>> Is this check on pmax sufficient to indicate that the chip is unthrottled ? >> >> Unthrottling due to Pmax uncapping here is specific to a chip. So it is >> sufficient to decide throttling/unthrottling when OCC is active for that >> chip. > > Ok then we can perhaps exit after detecting unthrottling here. This won't work for older firmwares which do not clear "Frequency control enabled bit" on OCC reset cycle. So let us check for remaining two conditions on unthrottling as well. >> >>> >>>> + pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, >>>> + chips[i].id, pmsr_pmax); >>>>} >>>> >>>>/* >>>> * Check for Psafe by reading LocalPstate >>>> * or check if Psafe_mode_active is set in PMSR. >>>> */ >>>> +next: >>>>pmsr_lp = (s8)PMSR_LP(pmsr); >>>>if ((pmsr_lp < powernv_pstate_info.min) || >>>>(pmsr & PMSR_PSAFE_ENABLE)) { >>>> @@ -414,6 +433,33 @@ static struct cpufreq_driver powernv_cpu
[PATCH] cpuidle: powernv/pseries: Decrease the snooze residency
The idle cpus which stay in snooze for a long period can degrade the performance of the sibling cpus. If the cpu stays in snooze for more than target residency of the next available idle state, then exit from snooze. This gives a chance to the cpuidle governor to re-evaluate the last idle state of the cpu to promote it to deeper idle states. Signed-off-by: Shilpasri G Bhat --- drivers/cpuidle/cpuidle-powernv.c | 12 drivers/cpuidle/cpuidle-pseries.c | 11 +++ 2 files changed, 23 insertions(+) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 5937207..1e3ef5e 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -29,18 +29,25 @@ struct cpuidle_driver powernv_idle_driver = { static int max_idle_state; static struct cpuidle_state *cpuidle_state_table; +static u64 snooze_timeout; +static bool snooze_timeout_en; static int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { + u64 snooze_exit_time; + local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); while (!need_resched()) { HMT_low(); HMT_very_low(); + if (snooze_timeout_en && get_tb() > snooze_exit_time) + break; } HMT_medium(); @@ -252,6 +259,11 @@ static int powernv_idle_probe(void) cpuidle_state_table = powernv_states; /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); + if (max_idle_state > 1) { + snooze_timeout_en = true; + snooze_timeout = powernv_states[1].target_residency * +tb_ticks_per_usec; + } } else return -ENODEV; diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index bb9e2b6..07135e0 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -27,6 +27,8 @@ struct cpuidle_driver pseries_idle_driver = { static int max_idle_state; static struct cpuidle_state *cpuidle_state_table; +static u64 snooze_timeout; +static bool 
snooze_timeout_en; static inline void idle_loop_prolog(unsigned long *in_purr) { @@ -58,14 +60,18 @@ static int snooze_loop(struct cpuidle_device *dev, int index) { unsigned long in_purr; + u64 snooze_exit_time; idle_loop_prolog(&in_purr); local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); + snooze_exit_time = get_tb() + snooze_timeout; while (!need_resched()) { HMT_low(); HMT_very_low(); + if (snooze_timeout_en && get_tb() > snooze_exit_time) + break; } HMT_medium(); @@ -244,6 +250,11 @@ static int pseries_idle_probe(void) } else return -ENODEV; + if (max_idle_state > 1) { + snooze_timeout_en = true; + snooze_timeout = cpuidle_state_table[1].target_residency * +tb_ticks_per_usec; + } return 0; } -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] cpufreq: powernv: Set the cpus to nominal frequency during reboot/kexec
This patch ensures the cpus to kexec/reboot at nominal frequency. Nominal frequency is the highest cpu frequency on PowerPC at which the cores can run without getting throttled. If the host kernel had set the cpus to a low pstate and then it kexecs/reboots to a cpufreq disabled kernel it would cause the target kernel to perform poorly. It will also increase the boot up time of the target kernel. So set the cpus to high pstate, in this case to nominal frequency before rebooting to avoid such scenarios. The reboot notifier will set the cpus to nominal frequency. Changes v1->v2: Invoke .target() driver callback to set the cpus to nominal frequency in reboot notifier, instead of calling cpufreq_suspend() as suggested by Viresh Kumar. Modified the commit message. Signed-off-by: Shilpasri G Bhat Reviewed-by: Preeti U Murthy --- drivers/cpufreq/powernv-cpufreq.c | 35 +++ 1 file changed, 35 insertions(+) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 379c083..ba27c49 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ #define POWERNV_MAX_PSTATES256 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; +static bool rebooting; /* * Note: The set of pstates consists of contiguous integers, the @@ -284,6 +286,15 @@ static void set_pstate(void *freq_data) } /* + * get_nominal_index: Returns the index corresponding to the nominal + * pstate in the cpufreq table + */ +static inline unsigned int get_nominal_index(void) +{ + return powernv_pstate_info.max - powernv_pstate_info.nominal; +} + +/* * powernv_cpufreq_target_index: Sets the frequency corresponding to * the cpufreq table entry indexed by new_index on the cpus in the * mask policy->cpus @@ -293,6 +304,9 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, { struct powernv_smp_call_data freq_data; + if 
(unlikely(rebooting) && new_index != get_nominal_index()) + return -EBUSY; + freq_data.pstate_id = powernv_freqs[new_index].driver_data; /* @@ -317,6 +331,25 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) return cpufreq_table_validate_and_show(policy, powernv_freqs); } +static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb, + unsigned long action, void *unused) +{ + int cpu; + struct cpufreq_policy cpu_policy; + + rebooting = true; + for_each_online_cpu(cpu) { + cpufreq_get_policy(&cpu_policy, cpu); + powernv_cpufreq_target_index(&cpu_policy, get_nominal_index()); + } + + return NOTIFY_DONE; +} + +static struct notifier_block powernv_cpufreq_reboot_nb = { + .notifier_call = powernv_cpufreq_reboot_notifier, +}; + static struct cpufreq_driver powernv_cpufreq_driver = { .name = "powernv-cpufreq", .flags = CPUFREQ_CONST_LOOPS, @@ -342,12 +375,14 @@ static int __init powernv_cpufreq_init(void) return rc; } + register_reboot_notifier(&powernv_cpufreq_reboot_nb); return cpufreq_register_driver(&powernv_cpufreq_driver); } module_init(powernv_cpufreq_init); static void __exit powernv_cpufreq_exit(void) { + unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3] cpufreq: powernv: Set the cpus to nominal frequency during reboot/kexec
This patch ensures that the cpus kexec/reboot at nominal frequency. Nominal frequency is the highest cpu frequency on PowerPC at which the cores can run without getting throttled. If the host kernel had set the cpus to a low pstate and then it kexecs/reboots to a cpufreq disabled kernel it would cause the target kernel to perform poorly. It will also increase the boot up time of the target kernel. So set the cpus to high pstate, in this case to nominal frequency before rebooting to avoid such scenarios. The reboot notifier will set the cpus to nominal frequency. Signed-off-by: Shilpasri G Bhat Suggested-by: Viresh Kumar Reviewed-by: Preeti U Murthy --- Changes v2->v3: We return EBUSY when cpufreq governor tries to change the frequency after rebooting is set to true. This results in console being flooded with error messages indicating failed attempts to change the frequency. So instead of returning EBUSY we return 0 to stop the governor from changing the frequency without alerting a failure to do the same on reboot, as this is not an erroneous condition. Changes v1->v2: Invoke .target() driver callback to set the cpus to nominal frequency in reboot notifier, instead of calling cpufreq_suspend() as suggested by Viresh Kumar. Modified the commit message. 
drivers/cpufreq/powernv-cpufreq.c | 35 +++ 1 file changed, 35 insertions(+) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 379c083..f772a55 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ #define POWERNV_MAX_PSTATES256 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; +static bool rebooting; /* * Note: The set of pstates consists of contiguous integers, the @@ -284,6 +286,15 @@ static void set_pstate(void *freq_data) } /* + * get_nominal_index: Returns the index corresponding to the nominal + * pstate in the cpufreq table + */ +static inline unsigned int get_nominal_index(void) +{ + return powernv_pstate_info.max - powernv_pstate_info.nominal; +} + +/* * powernv_cpufreq_target_index: Sets the frequency corresponding to * the cpufreq table entry indexed by new_index on the cpus in the * mask policy->cpus @@ -293,6 +304,9 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, { struct powernv_smp_call_data freq_data; + if (unlikely(rebooting) && new_index != get_nominal_index()) + return 0; + freq_data.pstate_id = powernv_freqs[new_index].driver_data; /* @@ -317,6 +331,25 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) return cpufreq_table_validate_and_show(policy, powernv_freqs); } +static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb, + unsigned long action, void *unused) +{ + int cpu; + struct cpufreq_policy cpu_policy; + + rebooting = true; + for_each_online_cpu(cpu) { + cpufreq_get_policy(&cpu_policy, cpu); + powernv_cpufreq_target_index(&cpu_policy, get_nominal_index()); + } + + return NOTIFY_DONE; +} + +static struct notifier_block powernv_cpufreq_reboot_nb = { + .notifier_call = powernv_cpufreq_reboot_notifier, +}; + static struct cpufreq_driver powernv_cpufreq_driver = { .name = "powernv-cpufreq", .flags = 
CPUFREQ_CONST_LOOPS, @@ -342,12 +375,14 @@ static int __init powernv_cpufreq_init(void) return rc; } + register_reboot_notifier(&powernv_cpufreq_reboot_nb); return cpufreq_register_driver(&powernv_cpufreq_driver); } module_init(powernv_cpufreq_init); static void __exit powernv_cpufreq_exit(void) { + unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] cpuidle: powernv/pseries: Auto-promotion of snooze to deeper idle state
The idle cpus which stay in snooze for a long period can degrade the performance of the sibling cpus. If the cpu stays in snooze for more than the target residency of the next available idle state, then exit from snooze. This gives a chance to the cpuidle governor to re-evaluate the last idle state of the cpu to promote it to deeper idle states. Signed-off-by: Shilpasri G Bhat --- Changes from v1: -Modified commit message drivers/cpuidle/cpuidle-powernv.c | 12 drivers/cpuidle/cpuidle-pseries.c | 11 +++ 2 files changed, 23 insertions(+) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 5937207..1e3ef5e 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -29,18 +29,25 @@ struct cpuidle_driver powernv_idle_driver = { static int max_idle_state; static struct cpuidle_state *cpuidle_state_table; +static u64 snooze_timeout; +static bool snooze_timeout_en; static int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { + u64 snooze_exit_time; + local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); while (!need_resched()) { HMT_low(); HMT_very_low(); + if (snooze_timeout_en && get_tb() > snooze_exit_time) + break; } HMT_medium(); @@ -252,6 +259,11 @@ static int powernv_idle_probe(void) cpuidle_state_table = powernv_states; /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); + if (max_idle_state > 1) { + snooze_timeout_en = true; + snooze_timeout = powernv_states[1].target_residency * +tb_ticks_per_usec; + } } else return -ENODEV; diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index bb9e2b6..07135e0 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -27,6 +27,8 @@ struct cpuidle_driver pseries_idle_driver = { static int max_idle_state; static struct cpuidle_state *cpuidle_state_table; +static 
u64 snooze_timeout; +static bool snooze_timeout_en; static inline void idle_loop_prolog(unsigned long *in_purr) { @@ -58,14 +60,18 @@ static int snooze_loop(struct cpuidle_device *dev, int index) { unsigned long in_purr; + u64 snooze_exit_time; idle_loop_prolog(&in_purr); local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); + snooze_exit_time = get_tb() + snooze_timeout; while (!need_resched()) { HMT_low(); HMT_very_low(); + if (snooze_timeout_en && get_tb() > snooze_exit_time) + break; } HMT_medium(); @@ -244,6 +250,11 @@ static int pseries_idle_probe(void) } else return -ENODEV; + if (max_idle_state > 1) { + snooze_timeout_en = true; + snooze_timeout = cpuidle_state_table[1].target_residency * +tb_ticks_per_usec; + } return 0; } -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 0/6] powernv: cpufreq: Report frequency throttle by OCC
This patchset intends to add frequency throttle reporting mechanism to powernv-cpufreq driver when OCC throttles the frequency. OCC is an On-Chip-Controller which takes care of the power and thermal safety of the chip. The CPU frequency can be throttled during an OCC reset or when OCC tries to limit the max allowed frequency. The patchset will report such conditions so as to keep the user informed about reason for the drop in performance of workloads when frequency is throttled. Changes from v3: - Rebased on top of 4.2-rc1 - Minor changes in patch 2,3,4,6 this does not change the functionality of the code - 594fcb9ec9e powerpc/powernv: Expose OPAL APIs required by PRD interface , this patch fixes the build error due to which this series was initially dropped ERROR: ".opal_message_notifier_register" drivers/cpufreq/powernv-cpufreq.ko] undefined! Changes from v2: - Split into multiple patches - Semantic fixes Shilpasri G Bhat (6): cpufreq: powernv: Handle throttling due to Pmax capping at chip level powerpc/powernv: Add definition of OPAL_MSG_OCC message type cpufreq: powernv: Register for OCC related opal_message notification cpufreq: powernv: Call throttle_check() on receiving OCC_THROTTLE cpufreq: powernv: Report Psafe only if PMSR.psafe_mode_active bit is set cpufreq: powernv: Restore cpu frequency to policy->cur on unthrottling arch/powerpc/include/asm/opal-api.h | 12 +++ drivers/cpufreq/powernv-cpufreq.c | 195 +--- 2 files changed, 192 insertions(+), 15 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 1/6] cpufreq: powernv: Handle throttling due to Pmax capping at chip level
The On-Chip-Controller(OCC) can throttle cpu frequency by reducing the max allowed frequency for that chip if the chip exceeds its power or temperature limits. As Pmax capping is a chip level condition report this throttling behavior at chip level and also do not set the global 'throttled' on Pmax capping instead set the per-chip throttled variable. Report unthrottling if Pmax is restored after throttling. This patch adds a structure to store chip id and throttled state of the chip. Signed-off-by: Shilpasri G Bhat Reviewed-by: Preeti U Murthy --- No change from v3 drivers/cpufreq/powernv-cpufreq.c | 59 --- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index ebef0d8..d0c18c9 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,13 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled; +static struct chip { + unsigned int id; + bool throttled; +} *chips; + +static int nr_chips; + /* * Note: The set of pstates consists of contiguous integers, the * smallest of which is indicated by powernv_pstate_info.min, the @@ -301,22 +309,33 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(unsigned int cpu) { unsigned long pmsr; - int pmsr_pmax, pmsr_lp; + int pmsr_pmax, pmsr_lp, i; pmsr = get_pmspr(SPRN_PMSR); + for (i = 0; i < nr_chips; i++) + if (chips[i].id == cpu_to_chip_id(cpu)) + break; + /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { - throttled = true; - pr_info("CPU %d Pmax is reduced to %d\n", cpu, pmsr_pmax); - pr_info("Max allowed Pstate is capped\n"); + if (chips[i].throttled) + goto next; + chips[i].throttled = true; + pr_info("CPU %d on Chip %u has Pmax reduced to %d\n", cpu, + chips[i].id, pmsr_pmax); 
+ } else if (chips[i].throttled) { + chips[i].throttled = false; + pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, + chips[i].id, pmsr_pmax); } /* * Check for Psafe by reading LocalPstate * or check if Psafe_mode_active is set in PMSR. */ +next: pmsr_lp = (s8)PMSR_LP(pmsr); if ((pmsr_lp < powernv_pstate_info.min) || (pmsr & PMSR_PSAFE_ENABLE)) { @@ -414,6 +433,33 @@ static struct cpufreq_driver powernv_cpufreq_driver = { .attr = powernv_cpu_freq_attr, }; +static int init_chip_info(void) +{ + unsigned int chip[256]; + unsigned int cpu, i; + unsigned int prev_chip_id = UINT_MAX; + + for_each_possible_cpu(cpu) { + unsigned int id = cpu_to_chip_id(cpu); + + if (prev_chip_id != id) { + prev_chip_id = id; + chip[nr_chips++] = id; + } + } + + chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); + if (!chips) + return -ENOMEM; + + for (i = 0; i < nr_chips; i++) { + chips[i].id = chip[i]; + chips[i].throttled = false; + } + + return 0; +} + static int __init powernv_cpufreq_init(void) { int rc = 0; @@ -429,6 +475,11 @@ static int __init powernv_cpufreq_init(void) return rc; } + /* Populate chip info */ + rc = init_chip_info(); + if (rc) + return rc; + register_reboot_notifier(&powernv_cpufreq_reboot_nb); return cpufreq_register_driver(&powernv_cpufreq_driver); } -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 2/6] powerpc/powernv: Add definition of OPAL_MSG_OCC message type
Add OPAL_MSG_OCC message definition to opal_message_type to receive OCC events like reset, load and throttled. Host performance can be affected when OCC is reset or OCC throttles the max Pstate. We can register to opal_message_notifier to receive OPAL_MSG_OCC type of message and report it to the userspace so as to keep the user informed about the reason for a performance drop in workloads. The reset and load OCC events are notified to kernel when FSP sends OCC_RESET and OCC_LOAD commands. Both reset and load messages are sent to kernel on successful completion of reset and load operation respectively. The throttle OCC event indicates that the Pmax of the chip is reduced. The chip_id and throttle reason for reducing Pmax is also queued along with the message. CC: Stewart Smith Signed-off-by: Shilpasri G Bhat --- Changes from v3: - '0d7cd8550d3 powerpc/powernv: Add opal-prd channel' this patch adds the definition of OPAL_MSG_PRD, so remove it and update the changelog. - Move the definitions of OCC_RESET, OCC_LOAD and OCC_THROTTLE from drivers/cpufreq/powernv-cpufreq.c to arch/powerpc/include/asm/opal-api.h - Define OCC_MAX_THROTTLE_STATUS - Add a wrapper structure 'opal_occ_msg' to copy 'struct opal_msg.params[0..2]' This structure will define the parameters received from firmware to maintain compatibility for any future additions. 
No change from v2 Change from v1: - Update the commit changelog arch/powerpc/include/asm/opal-api.h | 12 1 file changed, 12 insertions(+) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index e9e4c52..64dc9f5 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -361,6 +361,7 @@ enum opal_msg_type { OPAL_MSG_HMI_EVT, OPAL_MSG_DPO, OPAL_MSG_PRD, + OPAL_MSG_OCC, OPAL_MSG_TYPE_MAX, }; @@ -700,6 +701,17 @@ struct opal_prd_msg_header { struct opal_prd_msg; +#define OCC_RESET 0 +#define OCC_LOAD1 +#define OCC_THROTTLE2 +#define OCC_MAX_THROTTLE_STATUS 5 + +struct opal_occ_msg { + __be64 type; + __be64 chip; + __be64 throttle_status; +}; + /* * SG entries * -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 3/6] cpufreq: powernv: Register for OCC related opal_message notification
OCC is an On-Chip-Controller which takes care of power and thermal safety of the chip. During runtime due to power failure or overtemperature the OCC may throttle the frequencies of the CPUs to remain within the power budget. We want the cpufreq driver to be aware of such situations to be able to report the reason to the user. We register to opal_message_notifier to receive OCC messages from opal. powernv_cpufreq_throttle_check() reports any frequency throttling and this patch will report the reason or event that caused throttling. We can be throttled if OCC is reset or OCC limits Pmax due to power or thermal reasons. We are also notified of unthrottling after an OCC reset or if OCC restores Pmax on the chip. Signed-off-by: Shilpasri G Bhat --- Changes from v3: - Move the macro definitions of OCC_RESET, OCC_LOAD, OCC_THROTTLE to arch/powerpc/include/asm/opal-api.h - Use 'struct opal_occ_msg' to copy the 'opal_msg->params[]' and refer the members of this structure in the code; Replace 'chip_id', 'token' and 'reason' with omsg.chip, omsg.type, omsg.throttle_status - Use OCC_MAX_THROTTLE_STATUS instead of the magic number. - Add opal_message_notifier_unregister() Changes from v2: - Patch split in to multiple patches. - This patch contains only the opal_message notification handler Changes from v1: - Add macros to define OCC_RESET, OCC_LOAD and OCC_THROTTLE - Define a structure to store chip id, chip mask which has bits set for cpus present in the chip, throttled state and a work_struct. - Modify powernv_cpufreq_throttle_check() to be called via smp_call() - On Pmax throttling/unthrottling update 'chip.throttled' and not the global 'throttled' as Pmax capping is local to the chip. - Remove the condition which checks if local pstate is less than Pmin while checking for Psafe frequency. 
When OCC becomes active after reset we update 'throttled' to false and when the cpufreq governor initiates a pstate change, the local pstate will be in Psafe and we will be reporting a false positive when we are not throttled. - Schedule a kworker on receiving throttling/unthrottling OCC message for that chip and schedule on all chips after receiving active. - After an OCC reset all the cpus will be in Psafe frequency. So call target() and restore the frequency to policy->cur after OCC_ACTIVE and Pmax unthrottling - Taken care of Viresh and Preeti's comments. drivers/cpufreq/powernv-cpufreq.c | 71 ++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index d0c18c9..1f59958 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -33,6 +33,7 @@ #include #include #include /* Required for cpu_sibling_mask() in UP configs */ +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) @@ -41,7 +42,7 @@ #define PMSR_LP(x) ((x >> 48) & 0xFF) static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; -static bool rebooting, throttled; +static bool rebooting, throttled, occ_reset; static struct chip { unsigned int id; @@ -414,6 +415,71 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { .notifier_call = powernv_cpufreq_reboot_notifier, }; +static char throttle_reason[][30] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + +static int powernv_cpufreq_occ_msg(struct notifier_block *nb, + unsigned long msg_type, void *_msg) +{ + struct opal_msg *msg = _msg; + struct opal_occ_msg omsg; + + if (msg_type != OPAL_MSG_OCC) + return 0; + + memcpy(&omsg, msg->params, sizeof(omsg)); + + switch (omsg.type) { + case OCC_RESET: + occ_reset = true; + /* +* powernv_cpufreq_throttle_check() is called in +* target() callback which can detect the 
throttle state +* for governors like ondemand. +* But static governors will not call target() often thus +* report throttling here. +*/ + if (!throttled) { + throttled = true; + pr_crit("CPU Frequency is throttled\n"); + } + pr_info("OCC: Reset\n"); + break; + case OCC_LOAD: +
[PATCH v4 4/6] cpufreq: powernv: Call throttle_check() on receiving OCC_THROTTLE
Re-evaluate the chip's throttled state on receiving OCC_THROTTLE notification by executing *throttle_check() on any one of the cpus on the chip. This is a sanity check to verify if we were indeed throttled/unthrottled after receiving OCC_THROTTLE notification. We cannot call *throttle_check() directly from the notification handler because we could be handling chip1's notification in chip2. So initiate an smp_call to execute *throttle_check(). We are irq-disabled in the notification handler, so use a worker thread to smp_call throttle_check() on any of the cpus in the chipmask. Signed-off-by: Shilpasri G Bhat --- Changes from v3: - Refer to the members of 'struct opal_occ_msg' in the patch. Replace 'chip_id' with 'omsg.chip' drivers/cpufreq/powernv-cpufreq.c | 28 ++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 1f59958..f2da30a 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -47,6 +47,8 @@ static bool rebooting, throttled, occ_reset; static struct chip { unsigned int id; bool throttled; + cpumask_t mask; + struct work_struct throttle; } *chips; static int nr_chips; @@ -307,8 +309,9 @@ static inline unsigned int get_nominal_index(void) return powernv_pstate_info.max - powernv_pstate_info.nominal; } -static void powernv_cpufreq_throttle_check(unsigned int cpu) +static void powernv_cpufreq_throttle_check(void *data) { + unsigned int cpu = smp_processor_id(); unsigned long pmsr; int pmsr_pmax, pmsr_lp, i; @@ -370,7 +373,7 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, return 0; if (!throttled) - powernv_cpufreq_throttle_check(smp_processor_id()); + powernv_cpufreq_throttle_check(NULL); freq_data.pstate_id = powernv_freqs[new_index].driver_data; @@ -415,6 +418,14 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { .notifier_call = powernv_cpufreq_reboot_notifier, }; +void powernv_cpufreq_work_fn(struct 
work_struct *work) +{ + struct chip *chip = container_of(work, struct chip, throttle); + + smp_call_function_any(&chip->mask, + powernv_cpufreq_throttle_check, NULL, 0); +} + static char throttle_reason[][30] = { "No throttling", "Power Cap", @@ -429,6 +440,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, { struct opal_msg *msg = _msg; struct opal_occ_msg omsg; + int i; if (msg_type != OPAL_MSG_OCC) return 0; @@ -459,6 +471,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, occ_reset = false; throttled = false; pr_info("OCC: Active\n"); + + for (i = 0; i < nr_chips; i++) + schedule_work(&chips[i].throttle); + return 0; } @@ -470,6 +486,12 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, else if (!omsg.throttle_status) pr_info("OCC: Chip %u %s\n", (unsigned int)omsg.chip, throttle_reason[omsg.throttle_status]); + else + return 0; + + for (i = 0; i < nr_chips; i++) + if (chips[i].id == omsg.chip) + schedule_work(&chips[i].throttle); } return 0; } @@ -521,6 +543,8 @@ static int init_chip_info(void) for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; chips[i].throttled = false; + cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); + INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); } return 0; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 6/6] cpufreq: powernv: Restore cpu frequency to policy->cur on unthrottling
If frequency is throttled due to OCC reset then cpus will be in Psafe frequency, so restore the frequency on all cpus to policy->cur when OCCs are active again. And if frequency is throttled due to Pmax capping then restore the frequency of all the cpus in the chip on unthrottling. Signed-off-by: Shilpasri G Bhat --- Changes from v3: - Refer to the members of 'struct opal_occ_msg' in the patch. Replace 'reason' with 'omsg.throttle_status' drivers/cpufreq/powernv-cpufreq.c | 31 +-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index d6d7e68..824141a 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -48,6 +48,7 @@ static struct chip { bool throttled; cpumask_t mask; struct work_struct throttle; + bool restore; } *chips; static int nr_chips; @@ -415,9 +416,29 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { void powernv_cpufreq_work_fn(struct work_struct *work) { struct chip *chip = container_of(work, struct chip, throttle); + unsigned int cpu; + cpumask_var_t mask; smp_call_function_any(&chip->mask, powernv_cpufreq_throttle_check, NULL, 0); + + if (!chip->restore) + return; + + chip->restore = false; + cpumask_copy(mask, &chip->mask); + for_each_cpu_and(cpu, mask, cpu_online_mask) { + int index, tcpu; + struct cpufreq_policy policy; + + cpufreq_get_policy(&policy, cpu); + cpufreq_frequency_table_target(&policy, policy.freq_table, + policy.cur, + CPUFREQ_RELATION_C, &index); + powernv_cpufreq_target_index(&policy, index); + for_each_cpu(tcpu, policy.cpus) + cpumask_clear_cpu(tcpu, mask); + } } static char throttle_reason[][30] = { @@ -466,8 +487,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, throttled = false; pr_info("OCC: Active\n"); - for (i = 0; i < nr_chips; i++) + for (i = 0; i < nr_chips; i++) { + chips[i].restore = true; schedule_work(&chips[i].throttle); + } return 0; } @@ -484,8 +507,11 @@ static int 
powernv_cpufreq_occ_msg(struct notifier_block *nb, return 0; for (i = 0; i < nr_chips; i++) - if (chips[i].id == omsg.chip) + if (chips[i].id == omsg.chip) { + if (!omsg.throttle_status) + chips[i].restore = true; schedule_work(&chips[i].throttle); + } } return 0; } @@ -539,6 +565,7 @@ static int init_chip_info(void) chips[i].throttled = false; cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); + chips[i].restore = false; } return 0; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 5/6] cpufreq: powernv: Report Psafe only if PMSR.psafe_mode_active bit is set
On a reset cycle of OCC, although the system retires from safe frequency state the local pstate is not restored to Pmin or last requested pstate. Now if the cpufreq governor initiates a pstate change, the local pstate will be in Psafe and we will be reporting a false positive when we are not throttled. So in powernv_cpufreq_throttle_check() remove the condition which checks if local pstate is less than Pmin while checking for Psafe frequency. If the cpus are forced to Psafe then PMSR.psafe_mode_active bit will be set. So, when OCCs become active this bit will be cleared. Let us just rely on this bit for reporting throttling. Signed-off-by: Shilpasri G Bhat Reviewed-by: Preeti U Murthy --- No changes from v3 drivers/cpufreq/powernv-cpufreq.c | 12 +++- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index f2da30a..d6d7e68 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -39,7 +39,6 @@ #define PMSR_PSAFE_ENABLE (1UL << 30) #define PMSR_SPR_EM_DISABLE(1UL << 31) #define PMSR_MAX(x)((x >> 32) & 0xFF) -#define PMSR_LP(x) ((x >> 48) & 0xFF) static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; @@ -313,7 +312,7 @@ static void powernv_cpufreq_throttle_check(void *data) { unsigned int cpu = smp_processor_id(); unsigned long pmsr; - int pmsr_pmax, pmsr_lp, i; + int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); @@ -335,14 +334,9 @@ static void powernv_cpufreq_throttle_check(void *data) chips[i].id, pmsr_pmax); } - /* -* Check for Psafe by reading LocalPstate -* or check if Psafe_mode_active is set in PMSR. -*/ + /* Check if Psafe_mode_active is set in PMSR. 
*/ next: - pmsr_lp = (s8)PMSR_LP(pmsr); - if ((pmsr_lp < powernv_pstate_info.min) || - (pmsr & PMSR_PSAFE_ENABLE)) { + if (pmsr & PMSR_PSAFE_ENABLE) { throttled = true; pr_info("Pstate set to safe frequency\n"); } -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 3/6] cpufreq: powernv: Register for OCC related opal_message notification
Hi Joel, On 07/15/2015 11:47 AM, Joel Stanley wrote: > Hello, > > On Mon, 2015-07-13 at 19:39 +0530, Shilpasri G Bhat wrote: >> diff --git a/drivers/cpufreq/powernv-cpufreq.c >> b/drivers/cpufreq/powernv-cpufreq.c >> index d0c18c9..1f59958 100644 >> --- a/drivers/cpufreq/powernv-cpufreq.c >> +++ b/drivers/cpufreq/powernv-cpufreq.c >> @@ -414,6 +415,71 @@ static struct notifier_block powernv_cpufreq_reboot_nb >> = { >> .notifier_call = powernv_cpufreq_reboot_notifier, >> }; >> >> +static char throttle_reason[][30] = { >> +"No throttling", >> +"Power Cap", >> +"Processor Over Temperature", >> +"Power Supply Failure", >> +"Over Current", >> +"OCC Reset" >> + }; >> + >> +static int powernv_cpufreq_occ_msg(struct notifier_block *nb, >> + unsigned long msg_type, void *_msg) >> +{ >> +struct opal_msg *msg = _msg; >> +struct opal_occ_msg omsg; >> + >> +if (msg_type != OPAL_MSG_OCC) >> +return 0; >> + >> +memcpy(&omsg, msg->params, sizeof(omsg)); > > You need to ensure the of the members of struct opal_occ_msg are in the > correct byte order when copying them over. > > Have you tested this code with in a little endian configuration? Ah yes, this won't work in LE. I tested the below diff in both BE/LE configurations on a Power8 box which has FSP. - memcpy(&omsg, msg->params, sizeof(omsg)); + omsg.type = be64_to_cpu(msg->params[0]); + omsg.chip = be64_to_cpu(msg->params[1]); + omsg.throttle_status = be64_to_cpu(msg->params[2]); > > Do the messages you're sending make sense for a system that has a BMC > instead of a FSP? For a system with BMC, only OCC_THROTTLE will be received by the host. The remaining two (OCC_RESET and OCC_LOAD) are sent only in FSP based systems. OCC_THROTTLE is sent by opal which polls on the throttle_status byte in the OPAL-OCC shared memory region. > > Cheers, > > Joel > Thanks and Regards, Shilpa ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 3/6] cpufreq: powernv: Register for OCC related opal_message notification
OCC is an On-Chip-Controller which takes care of power and thermal safety of the chip. During runtime due to power failure or overtemperature the OCC may throttle the frequencies of the CPUs to remain within the power budget. We want the cpufreq driver to be aware of such situations to be able to report the reason to the user. We register to opal_message_notifier to receive OCC messages from opal. powernv_cpufreq_throttle_check() reports any frequency throttling and this patch will report the reason or event that caused throttling. We can be throttled if OCC is reset or OCC limits Pmax due to power or thermal reasons. We are also notified of unthrottling after an OCC reset or if OCC restores Pmax on the chip. Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar --- Changes from v4: - Replace memcpy() with be64_to_cpu() to copy the msg->params[] Changes from v3: - Move the macro definitions of OCC_RESET, OCC_LOAD, OCC_THROTTLE to arch/powerpc/include/asm/opal-api.h - Use 'struct opal_occ_msg' to copy the 'opal_msg->params[]' and refer the members of this structure in the code; Replace 'chip_id', 'token' and 'reason' with omsg.chip, omsg.type, omsg.throttle_status - Use OCC_MAX_THROTTLE_STATUS instead of the magic number. - Add opal_message_notifier_unregister() Changes from v2: - Patch split in to multiple patches. - This patch contains only the opal_message notification handler Changes from v1: - Add macros to define OCC_RESET, OCC_LOAD and OCC_THROTTLE - Define a structure to store chip id, chip mask which has bits set for cpus present in the chip, throttled state and a work_struct. - Modify powernv_cpufreq_throttle_check() to be called via smp_call() - On Pmax throttling/unthrottling update 'chip.throttled' and not the global 'throttled' as Pmax capping is local to the chip. - Remove the condition which checks if local pstate is less than Pmin while checking for Psafe frequency. 
When OCC becomes active after reset we update 'throttled' to false and when the cpufreq governor initiates a pstate change, the local pstate will be in Psafe and we will be reporting a false positive when we are not throttled. - Schedule a kworker on receiving throttling/unthrottling OCC message for that chip and schedule on all chips after receiving active. - After an OCC reset all the cpus will be in Psafe frequency. So call target() and restore the frequency to policy->cur after OCC_ACTIVE and Pmax unthrottling - Taken care of Viresh and Preeti's comments. drivers/cpufreq/powernv-cpufreq.c | 74 ++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index d0c18c9..a634199 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -33,6 +33,7 @@ #include #include #include /* Required for cpu_sibling_mask() in UP configs */ +#include #define POWERNV_MAX_PSTATES256 #define PMSR_PSAFE_ENABLE (1UL << 30) @@ -41,7 +42,7 @@ #define PMSR_LP(x) ((x >> 48) & 0xFF) static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; -static bool rebooting, throttled; +static bool rebooting, throttled, occ_reset; static struct chip { unsigned int id; @@ -414,6 +415,74 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { .notifier_call = powernv_cpufreq_reboot_notifier, }; +static char throttle_reason[][30] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + +static int powernv_cpufreq_occ_msg(struct notifier_block *nb, + unsigned long msg_type, void *_msg) +{ + struct opal_msg *msg = _msg; + struct opal_occ_msg omsg; + + if (msg_type != OPAL_MSG_OCC) + return 0; + + omsg.type = be64_to_cpu(msg->params[0]); + + switch (omsg.type) { + case OCC_RESET: + occ_reset = true; + /* +* powernv_cpufreq_throttle_check() is called in +* target() callback which can detect the throttle 
state +* for governors like ondemand. +* But static governors will not call target() often thus +* report throttling here. +*/ + if (!throttled) { + throttled = true; + pr_crit("CPU Frequency is throttled\n"); +
[PATCH v5 4/6] cpufreq: powernv: Call throttle_check() on receiving OCC_THROTTLE
Re-evaluate the chip's throttled state on receiving OCC_THROTTLE notification by executing *throttle_check() on any one of the cpus on the chip. This is a sanity check to verify if we were indeed throttled/unthrottled after receiving OCC_THROTTLE notification. We cannot call *throttle_check() directly from the notification handler because we could be handling chip1's notification in chip2. So initiate an smp_call to execute *throttle_check(). We are irq-disabled in the notification handler, so use a worker thread to smp_call throttle_check() on any of the cpus in the chipmask. Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar --- No changes from v4 Changes from v3: - Refer to the members of 'struct opal_occ_msg' in the patch. Replace 'chip_id' with 'omsg.chip' drivers/cpufreq/powernv-cpufreq.c | 28 ++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index a634199..22f33ff 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -47,6 +47,8 @@ static bool rebooting, throttled, occ_reset; static struct chip { unsigned int id; bool throttled; + cpumask_t mask; + struct work_struct throttle; } *chips; static int nr_chips; @@ -307,8 +309,9 @@ static inline unsigned int get_nominal_index(void) return powernv_pstate_info.max - powernv_pstate_info.nominal; } -static void powernv_cpufreq_throttle_check(unsigned int cpu) +static void powernv_cpufreq_throttle_check(void *data) { + unsigned int cpu = smp_processor_id(); unsigned long pmsr; int pmsr_pmax, pmsr_lp, i; @@ -370,7 +373,7 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, return 0; if (!throttled) - powernv_cpufreq_throttle_check(smp_processor_id()); + powernv_cpufreq_throttle_check(NULL); freq_data.pstate_id = powernv_freqs[new_index].driver_data; @@ -415,6 +418,14 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { .notifier_call = 
powernv_cpufreq_reboot_notifier, }; +void powernv_cpufreq_work_fn(struct work_struct *work) +{ + struct chip *chip = container_of(work, struct chip, throttle); + + smp_call_function_any(&chip->mask, + powernv_cpufreq_throttle_check, NULL, 0); +} + static char throttle_reason[][30] = { "No throttling", "Power Cap", @@ -429,6 +440,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, { struct opal_msg *msg = _msg; struct opal_occ_msg omsg; + int i; if (msg_type != OPAL_MSG_OCC) return 0; @@ -462,6 +474,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, occ_reset = false; throttled = false; pr_info("OCC: Active\n"); + + for (i = 0; i < nr_chips; i++) + schedule_work(&chips[i].throttle); + return 0; } @@ -473,6 +489,12 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, else if (!omsg.throttle_status) pr_info("OCC: Chip %u %s\n", (unsigned int)omsg.chip, throttle_reason[omsg.throttle_status]); + else + return 0; + + for (i = 0; i < nr_chips; i++) + if (chips[i].id == omsg.chip) + schedule_work(&chips[i].throttle); } return 0; } @@ -524,6 +546,8 @@ static int init_chip_info(void) for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; chips[i].throttled = false; + cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); + INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); } return 0; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 5/6] cpufreq: powernv: Report Psafe only if PMSR.psafe_mode_active bit is set
On a reset cycle of OCC, although the system retires from safe frequency state the local pstate is not restored to Pmin or last requested pstate. Now if the cpufreq governor initiates a pstate change, the local pstate will be in Psafe and we will be reporting a false positive when we are not throttled. So in powernv_cpufreq_throttle_check() remove the condition which checks if local pstate is less than Pmin while checking for Psafe frequency. If the cpus are forced to Psafe then PMSR.psafe_mode_active bit will be set. So, when OCCs become active this bit will be cleared. Let us just rely on this bit for reporting throttling. Signed-off-by: Shilpasri G Bhat Reviewed-by: Preeti U Murthy Acked-by: Viresh Kumar --- No changes from v4 drivers/cpufreq/powernv-cpufreq.c | 12 +++- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 22f33ff..90b4293 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -39,7 +39,6 @@ #define PMSR_PSAFE_ENABLE (1UL << 30) #define PMSR_SPR_EM_DISABLE(1UL << 31) #define PMSR_MAX(x)((x >> 32) & 0xFF) -#define PMSR_LP(x) ((x >> 48) & 0xFF) static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; @@ -313,7 +312,7 @@ static void powernv_cpufreq_throttle_check(void *data) { unsigned int cpu = smp_processor_id(); unsigned long pmsr; - int pmsr_pmax, pmsr_lp, i; + int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); @@ -335,14 +334,9 @@ static void powernv_cpufreq_throttle_check(void *data) chips[i].id, pmsr_pmax); } - /* -* Check for Psafe by reading LocalPstate -* or check if Psafe_mode_active is set in PMSR. -*/ + /* Check if Psafe_mode_active is set in PMSR. 
*/ next: - pmsr_lp = (s8)PMSR_LP(pmsr); - if ((pmsr_lp < powernv_pstate_info.min) || - (pmsr & PMSR_PSAFE_ENABLE)) { + if (pmsr & PMSR_PSAFE_ENABLE) { throttled = true; pr_info("Pstate set to safe frequency\n"); } -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 1/6] cpufreq: powernv: Handle throttling due to Pmax capping at chip level
The On-Chip-Controller(OCC) can throttle cpu frequency by reducing the max allowed frequency for that chip if the chip exceeds its power or temperature limits. As Pmax capping is a chip level condition report this throttling behavior at chip level and also do not set the global 'throttled' on Pmax capping instead set the per-chip throttled variable. Report unthrottling if Pmax is restored after throttling. This patch adds a structure to store chip id and throttled state of the chip. Signed-off-by: Shilpasri G Bhat Reviewed-by: Preeti U Murthy Acked-by: Viresh Kumar --- No change from v4 drivers/cpufreq/powernv-cpufreq.c | 59 --- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index ebef0d8..d0c18c9 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,13 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled; +static struct chip { + unsigned int id; + bool throttled; +} *chips; + +static int nr_chips; + /* * Note: The set of pstates consists of contiguous integers, the * smallest of which is indicated by powernv_pstate_info.min, the @@ -301,22 +309,33 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(unsigned int cpu) { unsigned long pmsr; - int pmsr_pmax, pmsr_lp; + int pmsr_pmax, pmsr_lp, i; pmsr = get_pmspr(SPRN_PMSR); + for (i = 0; i < nr_chips; i++) + if (chips[i].id == cpu_to_chip_id(cpu)) + break; + /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { - throttled = true; - pr_info("CPU %d Pmax is reduced to %d\n", cpu, pmsr_pmax); - pr_info("Max allowed Pstate is capped\n"); + if (chips[i].throttled) + goto next; + chips[i].throttled = true; + pr_info("CPU %d on Chip %u has Pmax reduced to %d\n", cpu, + 
chips[i].id, pmsr_pmax); + } else if (chips[i].throttled) { + chips[i].throttled = false; + pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, + chips[i].id, pmsr_pmax); } /* * Check for Psafe by reading LocalPstate * or check if Psafe_mode_active is set in PMSR. */ +next: pmsr_lp = (s8)PMSR_LP(pmsr); if ((pmsr_lp < powernv_pstate_info.min) || (pmsr & PMSR_PSAFE_ENABLE)) { @@ -414,6 +433,33 @@ static struct cpufreq_driver powernv_cpufreq_driver = { .attr = powernv_cpu_freq_attr, }; +static int init_chip_info(void) +{ + unsigned int chip[256]; + unsigned int cpu, i; + unsigned int prev_chip_id = UINT_MAX; + + for_each_possible_cpu(cpu) { + unsigned int id = cpu_to_chip_id(cpu); + + if (prev_chip_id != id) { + prev_chip_id = id; + chip[nr_chips++] = id; + } + } + + chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); + if (!chips) + return -ENOMEM; + + for (i = 0; i < nr_chips; i++) { + chips[i].id = chip[i]; + chips[i].throttled = false; + } + + return 0; +} + static int __init powernv_cpufreq_init(void) { int rc = 0; @@ -429,6 +475,11 @@ static int __init powernv_cpufreq_init(void) return rc; } + /* Populate chip info */ + rc = init_chip_info(); + if (rc) + return rc; + register_reboot_notifier(&powernv_cpufreq_reboot_nb); return cpufreq_register_driver(&powernv_cpufreq_driver); } -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 2/6] powerpc/powernv: Add definition of OPAL_MSG_OCC message type
Add OPAL_MSG_OCC message definition to opal_message_type to receive OCC events like reset, load and throttled. Host performance can be affected when OCC is reset or OCC throttles the max Pstate. We can register to opal_message_notifier to receive OPAL_MSG_OCC type of message and report it to the userspace so as to keep the user informed about the reason for a performance drop in workloads. The reset and load OCC events are notified to kernel when FSP sends OCC_RESET and OCC_LOAD commands. Both reset and load messages are sent to kernel on successful completion of reset and load operation respectively. The throttle OCC event indicates that the Pmax of the chip is reduced. The chip_id and throttle reason for reducing Pmax is also queued along with the message. CC: Stewart Smith Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar --- No change from v4 Changes from v3: - '0d7cd8550d3 powerpc/powernv: Add opal-prd channel' this patch adds the definition of OPAL_MSG_PRD, so remove it and update the changelog. - Move the definitions of OCC_RESET, OCC_LOAD and OCC_THROTTLE from drivers/cpufreq/powernv-cpufreq.c to arch/powerpc/include/asm/opal-api.h - Define OCC_MAX_THROTTLE_STATUS - Add a wrapper structure 'opal_occ_msg' to copy 'struct opal_msg.params[0..2]' This structure will define the parameters received from firmware to maintain compatibility for any future additions. 
No change from v2 Change from v1: - Update the commit changelog arch/powerpc/include/asm/opal-api.h | 12 1 file changed, 12 insertions(+) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index e9e4c52..64dc9f5 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -361,6 +361,7 @@ enum opal_msg_type { OPAL_MSG_HMI_EVT, OPAL_MSG_DPO, OPAL_MSG_PRD, + OPAL_MSG_OCC, OPAL_MSG_TYPE_MAX, }; @@ -700,6 +701,17 @@ struct opal_prd_msg_header { struct opal_prd_msg; +#define OCC_RESET 0 +#define OCC_LOAD1 +#define OCC_THROTTLE2 +#define OCC_MAX_THROTTLE_STATUS 5 + +struct opal_occ_msg { + __be64 type; + __be64 chip; + __be64 throttle_status; +}; + /* * SG entries * -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 0/6] powernv: cpufreq: Report frequency throttle by OCC
This patchset intends to add a frequency throttle reporting mechanism to the powernv-cpufreq driver when OCC throttles the frequency. OCC is an On-Chip-Controller which takes care of the power and thermal safety of the chip. The CPU frequency can be throttled during an OCC reset or when OCC tries to limit the max allowed frequency. The patchset will report such conditions so as to keep the user informed about the reason for the drop in performance of workloads when frequency is throttled. Changes from v4: - Taken care of Joel Stanley's comment, modification in patch[3]. This replaces memcpy() with be64_to_cpu() and no change in functionality of the patch Changes from v3: - Rebased on top of 4.2-rc1 - Minor changes in patch 2,3,4,6 this does not change the functionality of the code - 594fcb9ec9e powerpc/powernv: Expose OPAL APIs required by PRD interface; this patch fixes the build error due to which this series was initially dropped ERROR: ".opal_message_notifier_register" drivers/cpufreq/powernv-cpufreq.ko] undefined! Changes from v2: - Split into multiple patches - Semantic fixes Shilpasri G Bhat (6): cpufreq: powernv: Handle throttling due to Pmax capping at chip level powerpc/powernv: Add definition of OPAL_MSG_OCC message type cpufreq: powernv: Register for OCC related opal_message notification cpufreq: powernv: Call throttle_check() on receiving OCC_THROTTLE cpufreq: powernv: Report Psafe only if PMSR.psafe_mode_active bit is set cpufreq: powernv: Restore cpu frequency to policy->cur on unthrottling arch/powerpc/include/asm/opal-api.h | 12 +++ drivers/cpufreq/powernv-cpufreq.c | 198 +--- 2 files changed, 195 insertions(+), 15 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 6/6] cpufreq: powernv: Restore cpu frequency to policy->cur on unthrottling
If frequency is throttled due to OCC reset then cpus will be in Psafe frequency, so restore the frequency on all cpus to policy->cur when OCCs are active again. And if frequency is throttled due to Pmax capping then restore the frequency of all the cpus in the chip on unthrottling. Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar --- No changes from v4 Changes from v3: - Refer to the members of 'struct opal_occ_msg' in the patch. Replace 'reason' with 'omsg.throttle_status' drivers/cpufreq/powernv-cpufreq.c | 31 +-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 90b4293..546e056 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -48,6 +48,7 @@ static struct chip { bool throttled; cpumask_t mask; struct work_struct throttle; + bool restore; } *chips; static int nr_chips; @@ -415,9 +416,29 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { void powernv_cpufreq_work_fn(struct work_struct *work) { struct chip *chip = container_of(work, struct chip, throttle); + unsigned int cpu; + cpumask_var_t mask; smp_call_function_any(&chip->mask, powernv_cpufreq_throttle_check, NULL, 0); + + if (!chip->restore) + return; + + chip->restore = false; + cpumask_copy(mask, &chip->mask); + for_each_cpu_and(cpu, mask, cpu_online_mask) { + int index, tcpu; + struct cpufreq_policy policy; + + cpufreq_get_policy(&policy, cpu); + cpufreq_frequency_table_target(&policy, policy.freq_table, + policy.cur, + CPUFREQ_RELATION_C, &index); + powernv_cpufreq_target_index(&policy, index); + for_each_cpu(tcpu, policy.cpus) + cpumask_clear_cpu(tcpu, mask); + } } static char throttle_reason[][30] = { @@ -469,8 +490,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, throttled = false; pr_info("OCC: Active\n"); - for (i = 0; i < nr_chips; i++) + for (i = 0; i < nr_chips; i++) { + chips[i].restore = true; schedule_work(&chips[i].throttle); + } 
return 0; } @@ -487,8 +510,11 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, return 0; for (i = 0; i < nr_chips; i++) - if (chips[i].id == omsg.chip) + if (chips[i].id == omsg.chip) { + if (!omsg.throttle_status) + chips[i].restore = true; schedule_work(&chips[i].throttle); + } } return 0; } @@ -542,6 +568,7 @@ static int init_chip_info(void) chips[i].throttled = false; cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); + chips[i].restore = false; } return 0; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v5 2/6] powerpc/powernv: Add definition of OPAL_MSG_OCC message type
Hi Stewart, On 08/10/2015 05:53 AM, Stewart Smith wrote: > Shilpasri G Bhat writes: >> Add OPAL_MSG_OCC message definition to opal_message_type to receive >> OCC events like reset, load and throttled. Host performance can be >> affected when OCC is reset or OCC throttles the max Pstate. >> We can register to opal_message_notifier to receive OPAL_MSG_OCC type >> of message and report it to the userspace so as to keep the user >> informed about the reason for a performance drop in workloads. >> >> The reset and load OCC events are notified to kernel when FSP sends >> OCC_RESET and OCC_LOAD commands. Both reset and load messages are >> sent to kernel on successful completion of reset and load operation >> respectively. > > How is this done on OpenPower systems? Explanation involving just what > OPAL does is likely better, rather than explaining in context of FSP, > which Linux has no real knowledge of (OPAL provides all abstraction of > it). > In OpenPower systems, opal will only send OCC throttled event. OCC reset and load messages are not sent to kernel. How about the following git log message? Add OPAL_MSG_OCC message definition to opal_message_type to receive OCC events like reset, load and throttled. Host performance can be affected when OCC is reset or OCC throttles the max Pstate. We can register to opal_message_notifier to receive OPAL_MSG_OCC type of message and report it to the userspace so as to keep the user informed about the reason for a performance drop in workloads. Opal will send reset and load events to kernel on successful completion of reset and load operation of OCC. During this duration the cpu frequency will be throttled until OCC is started. Opal will send a throttle message during the OCC reset-cycle to indicate that OCC is active. Opal will send throttle message to kernel when OCC is active to indicate that the Pmax of the chip is reduced. The chip_id and throttle reason for reducing Pmax is queued along with the message. 
Thanks and Regards, Shilpa ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v5 3/6] cpufreq: powernv: Register for OCC related opal_message notification
On 08/10/2015 07:11 AM, Stewart Smith wrote: > Shilpasri G Bhat writes: >> diff --git a/drivers/cpufreq/powernv-cpufreq.c >> b/drivers/cpufreq/powernv-cpufreq.c >> index d0c18c9..a634199 100644 >> --- a/drivers/cpufreq/powernv-cpufreq.c >> +++ b/drivers/cpufreq/powernv-cpufreq.c >> @@ -33,6 +33,7 @@ >> #include >> #include >> #include /* Required for cpu_sibling_mask() in UP configs */ >> +#include >> >> #define POWERNV_MAX_PSTATES 256 >> #define PMSR_PSAFE_ENABLE (1UL << 30) >> @@ -41,7 +42,7 @@ >> #define PMSR_LP(x) ((x >> 48) & 0xFF) >> >> static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; >> -static bool rebooting, throttled; >> +static bool rebooting, throttled, occ_reset; >> >> static struct chip { >> unsigned int id; >> @@ -414,6 +415,74 @@ static struct notifier_block powernv_cpufreq_reboot_nb >> = { >> .notifier_call = powernv_cpufreq_reboot_notifier, >> }; >> >> +static char throttle_reason[][30] = { >> +"No throttling", >> +"Power Cap", >> +"Processor Over Temperature", >> +"Power Supply Failure", >> +"Over Current", >> +"OCC Reset" >> + }; >> + >> +static int powernv_cpufreq_occ_msg(struct notifier_block *nb, >> + unsigned long msg_type, void *_msg) >> +{ >> +struct opal_msg *msg = _msg; >> +struct opal_occ_msg omsg; >> + >> +if (msg_type != OPAL_MSG_OCC) >> +return 0; >> + >> +omsg.type = be64_to_cpu(msg->params[0]); >> + >> +switch (omsg.type) { >> +case OCC_RESET: >> +occ_reset = true; >> +/* >> + * powernv_cpufreq_throttle_check() is called in >> + * target() callback which can detect the throttle state >> + * for governors like ondemand. >> + * But static governors will not call target() often thus >> + * report throttling here. 
>> + */ >> +if (!throttled) { >> +throttled = true; >> +pr_crit("CPU Frequency is throttled\n"); >> +} >> +pr_info("OCC: Reset\n"); >> +break; >> +case OCC_LOAD: >> +pr_info("OCC: Loaded\n"); >> +break; > > I wonder if we could have the log messages be a bit clearer here, odds > are, unless you're one of the people reading this code, you have no idea > what an OCC is or what on earth "OCC: Loaded" means and why this > *doesn't* mean that your CPUs are no longer throttled so that your > computer doesn't catch fire/break/add 1+1 and get 4. > > Also, do we export this information via sysfs somewhere? It would seem > to want to go along with other cpufreq/cpu info there. No we don't export the throttling status of the cpu via sysfs. Since the throttling state is common across the chip, the per_cpu export will be redundant. Did you mean something like one of the below: 1)/sys/devices/system/cpu/cpufreq/chipN_throttle 2)/sys/devices/system/cpu/cpuN/cpufreq/throttle > > It feels like we could do much better at informing users as to what is > going on maybe something like: > > "OCC (On Chip Controller - enforces hard thermal/power limits) Resetting: CPU > frequency throttled for duration" > "OCC Loading, CPU frequency throttled until OCC started" > "OCC Active, CPU frequency no longer throttled" > Okay will change the messages. Thanks and Regards, Shilpa ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] cpufreq: powernv: Register the driver with reboot notifier
This patch ensures that the cpus kexec/reboot at nominal frequency. Nominal frequency is the highest cpu frequency on PowerPC at which the cores can run without getting throttled. If the host kernel had set the cpus to a low pstate and then it kexecs/reboots to a cpufreq disabled kernel it would cause the target kernel to perform poorly. It will also increase the boot up time of the target kernel. So set the cpus to high pstate, in this case to nominal frequency before rebooting to avoid such scenarios. The reboot notifier will suspend the cpufreq governor and enable nominal frequency to be set during a reboot/kexec similar to the suspend operation. Signed-off-by: Shilpasri G Bhat Reviewed-by: Preeti U Murthy --- drivers/cpufreq/powernv-cpufreq.c | 16 1 file changed, 16 insertions(+) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 379c083..e9f3d3a 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -314,9 +315,21 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) for (i = 0; i < threads_per_core; i++) cpumask_set_cpu(base + i, policy->cpus); + policy->suspend_freq = pstate_id_to_freq(powernv_pstate_info.nominal); return cpufreq_table_validate_and_show(policy, powernv_freqs); } +static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb, + unsigned long action, void *unused) +{ + cpufreq_suspend(); + return NOTIFY_DONE; +} + +static struct notifier_block powernv_cpufreq_reboot_nb = { + .notifier_call = powernv_cpufreq_reboot_notifier, +}; + static struct cpufreq_driver powernv_cpufreq_driver = { .name = "powernv-cpufreq", .flags = CPUFREQ_CONST_LOOPS, @@ -325,6 +338,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = { .target_index = powernv_cpufreq_target_index, .get= powernv_cpufreq_get, .attr = powernv_cpu_freq_attr, + .suspend= cpufreq_generic_suspend, }; static int __init 
powernv_cpufreq_init(void) @@ -342,12 +356,14 @@ static int __init powernv_cpufreq_init(void) return rc; } + register_reboot_notifier(&powernv_cpufreq_reboot_nb); return cpufreq_register_driver(&powernv_cpufreq_driver); } module_init(powernv_cpufreq_init); static void __exit powernv_cpufreq_exit(void) { + unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] cpufreq: powernv: Register the driver with reboot notifier
On 08/18/2014 01:16 PM, Viresh Kumar wrote: On 14 August 2014 16:49, Shilpasri G Bhat wrote: This patch ensures the cpus to kexec/reboot at nominal frequency. Nominal frequency is the highest cpu frequency on PowerPC at which the cores can run without getting throttled. If the host kernel had set the cpus to a low pstate and then it kexecs/reboots to a cpufreq disabled kernel it would cause the target kernel to perform poorly. It will also increase the boot up time of the target kernel. So set the cpus to high pstate, in this case to nominal frequency before rebooting to avoid such scenarios. The reboot notifier will suspend the cpufreq governor and enable nominal frequency to be set during a reboot/kexec similar to the suspend operartion. Signed-off-by: Shilpasri G Bhat Reviewed-by: Preeti U Murthy --- drivers/cpufreq/powernv-cpufreq.c | 16 1 file changed, 16 insertions(+) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 379c083..e9f3d3a 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -314,9 +315,21 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) for (i = 0; i < threads_per_core; i++) cpumask_set_cpu(base + i, policy->cpus); + policy->suspend_freq = pstate_id_to_freq(powernv_pstate_info.nominal); return cpufreq_table_validate_and_show(policy, powernv_freqs); } +static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb, + unsigned long action, void *unused) +{ + cpufreq_suspend(); + return NOTIFY_DONE; +} + +static struct notifier_block powernv_cpufreq_reboot_nb = { + .notifier_call = powernv_cpufreq_reboot_notifier, +}; + static struct cpufreq_driver powernv_cpufreq_driver = { .name = "powernv-cpufreq", .flags = CPUFREQ_CONST_LOOPS, @@ -325,6 +338,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = { .target_index = powernv_cpufreq_target_index, .get= powernv_cpufreq_get, 
.attr = powernv_cpu_freq_attr, + .suspend= cpufreq_generic_suspend, I couldn't understand why you have added a notifier here. This callback by itself should be enough. Isn't it? And then you have called cpufreq_suspend(), which is absolutely wrong, from that notifier.. Hi Viresh, The intention here is to stop the cpufreq governor and then to set the cpus to nominal frequency so as to ensure that the frequency won't be changed later. The .suspend callback of the driver is not called during reboot/kexec. So we need an explicit reboot notifier to call cpufreq_suspend() to satisfy the requirement. Thanks and Regards, Shilpa ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v10 0/2] cpufreq: powernv: Export throttle stat attributes and a bug fix
Patch[1] solves a bug in the module{init/exit} path and Patch[2] exports the throttle stats for the chip. This patchset is on top of linux-pm/linux-next. Changes from v9: - Patch[1/2] is newly added to correctly handle error path in powernv_cpufreq_init() and unregistration path in powernv_cpufreq_exit() as suggested by Viresh. - Patch[2/2] is rebased on top of Patch[1/2]. Shilpasri G Bhat (2): cpufreq: powernv: Fix bugs in powernv_cpufreq_{init/exit} cpufreq: powernv: Add sysfs attributes to show throttle stats Documentation/ABI/testing/sysfs-devices-system-cpu | 69 + drivers/cpufreq/powernv-cpufreq.c | 162 ++--- 2 files changed, 212 insertions(+), 19 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v10 1/2] cpufreq: powernv: Fix bugs in powernv_cpufreq_{init/exit}
Unregister the notifiers if cpufreq_register_driver() fails in powernv_cpufreq_init(). Re-arrange the unregistration and cleanup routines in powernv_cpufreq_exit() to free all the resources after the driver has been unregistered. Signed-off-by: Shilpasri G Bhat --- drivers/cpufreq/powernv-cpufreq.c | 40 --- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 1bbc10a..50bf120 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -595,6 +595,19 @@ out: return ret; } +static inline void clean_chip_info(void) +{ + kfree(chips); + kfree(core_to_chip_map); +} + +static inline void unregister_all_notifiers(void) +{ + opal_message_notifier_unregister(OPAL_MSG_OCC, +&powernv_cpufreq_opal_nb); + unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); +} + static int __init powernv_cpufreq_init(void) { int rc = 0; @@ -605,30 +618,35 @@ static int __init powernv_cpufreq_init(void) /* Discover pstates from device tree and init */ rc = init_powernv_pstates(); - if (rc) { - pr_info("powernv-cpufreq disabled. System does not support PState control\n"); - return rc; - } + if (rc) + goto out; /* Populate chip info */ rc = init_chip_info(); if (rc) - return rc; + goto out; register_reboot_notifier(&powernv_cpufreq_reboot_nb); opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); - return cpufreq_register_driver(&powernv_cpufreq_driver); + + rc = cpufreq_register_driver(&powernv_cpufreq_driver); + if (!rc) + return 0; + + pr_info("Failed to register the cpufreq driver (%d)\n", rc); + unregister_all_notifiers(); + clean_chip_info(); +out: + pr_info("Platform driver disabled. 
System does not support PState control\n"); + return rc; } module_init(powernv_cpufreq_init); static void __exit powernv_cpufreq_exit(void) { - unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); - opal_message_notifier_unregister(OPAL_MSG_OCC, -&powernv_cpufreq_opal_nb); - kfree(chips); - kfree(core_to_chip_map); cpufreq_unregister_driver(&powernv_cpufreq_driver); + unregister_all_notifiers(); + clean_chip_info(); } module_exit(powernv_cpufreq_exit); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v10 2/2] cpufreq: powernv: Add sysfs attributes to show throttle stats
Create sysfs attributes to export throttle information in /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat 2)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat 3)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle 4)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap 5)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp 6)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault 7)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent 8)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset Detailed explanation of each attribute is added to Documentation/ABI/testing/sysfs-devices-system-cpu CC: linux-...@vger.kernel.org Signed-off-by: Shilpasri G Bhat --- Changes from v9: - Modified documentation. - s/throttle_nominal/throttle_sub_turbo Changes from v8: - Moved the sysfs attributes from cpu/cpufreq/chipX to cpuX/cpufreq/throttle_stats - Adhering to one-value-per-file, replace throttle_table with multiple sysfs files. - Using CPUFREQ_POLICY_NOTIFIER to add/remove attribute_group. 
Documentation/ABI/testing/sysfs-devices-system-cpu | 69 +++ drivers/cpufreq/powernv-cpufreq.c | 127 +++-- 2 files changed, 187 insertions(+), 9 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index b683e8e..eaa2c87 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -271,3 +271,72 @@ Description: Parameters for the CPU cache attributes - WriteBack: data is written only to the cache line and the modified cache line is written to main memory only when it is replaced + +What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset +Date: Feb 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: POWERNV CPUFreq driver's frequency throttle stats directory and + attributes + + 'cpuX/cpufreq/throttle_stats' directory contains the CPU frequency + throttle stat attributes for the chip. The throttle stats of a cpu + is common across all the cpus belonging to a chip. Below are the + throttle attributes exported in the 'throttle_stats' directory: + + - turbo_stat : This file gives the total number of times the max + frequency is throttled to lower frequency in turbo (at and above + nominal frequency) range of frequencies. 
+ + - sub_turbo_stat : This file gives the total number of times the + max frequency is throttled to lower frequency in sub-turbo(below + nominal frequency) range of frequencies. + + - unthrottle : This file gives the total number of times the max + frequency is unthrottled after being throttled. + + - powercap : This file gives the total number of times the max + frequency is throttled due to 'Power Capping'. + + - overtemp : This file gives the total number of times the max + frequency is throttled due to 'CPU Over Temperature'. + + - supply_fault : This file gives the total number of times the + max frequency is throttled due to 'Power Supply Failure'. + + - overcurrent : This file gives the total number of times the + max frequency is throttled due to 'Overcurrent'. + + - occ_reset : This file gives the total number of times the max + frequency is throttled due to 'OCC Reset'. + + The sysfs attributes representing different throttle reasons like + powercap, overtemp, supply_fault, overcurrent and occ_reset map to + the reasons provided by OCC firmware for throttling the frequency. + +What: /sys/devices/system/cpu/cpufreq/policyX/throttle_stats + /sys/devices/system/cpu/cpu
[PATCH] cpufreq: powernv: Define per_cpu chip pointer to optimize hot-path
From: Michael Neuling "cpufreq: powernv: Remove cpu_to_chip_id() from hot-path" introduced 'core_to_chip_map' array to cache the chip-id of all cores. Replace this with per_cpu variable that stores the pointer to the chip-array. This removes the linear lookup and provides a neater and simpler solution. Signed-off-by: Michael Neuling Tested-by: Shilpasri G Bhat --- - Rebased the patch on top of linux-pm/linux-next - nr_chips is defined static, so it will be initialized to zero - Moved the initialization of the per_cpu variable after 'chips' is allocated - Removed 'core_to_chip_map' drivers/cpufreq/powernv-cpufreq.c | 50 +-- 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 50bf120..a00bcc2 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -44,7 +44,6 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; -static unsigned int *core_to_chip_map; static const char * const throttle_reason[] = { "No throttling", @@ -65,6 +64,7 @@ static struct chip { } *chips; static int nr_chips; +static DEFINE_PER_CPU(struct chip *, chip_info); /* * Note: The set of pstates consists of contiguous integers, the @@ -324,34 +324,31 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { + struct chip *chip; unsigned int cpu = smp_processor_id(); - unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)]; unsigned long pmsr; - int pmsr_pmax, i; + int pmsr_pmax; pmsr = get_pmspr(SPRN_PMSR); - - for (i = 0; i < nr_chips; i++) - if (chips[i].id == chip_id) - break; + chip = this_cpu_read(chip_info); /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { - if (chips[i].throttled) + if (chip->throttled) goto next; - chips[i].throttled = true; + chip->throttled = true; if (pmsr_pmax < 
powernv_pstate_info.nominal) pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", -cpu, chips[i].id, pmsr_pmax, +cpu, chip->id, pmsr_pmax, powernv_pstate_info.nominal); - trace_powernv_throttle(chips[i].id, - throttle_reason[chips[i].throttle_reason], + trace_powernv_throttle(chip->id, + throttle_reason[chip->throttle_reason], pmsr_pmax); - } else if (chips[i].throttled) { - chips[i].throttled = false; - trace_powernv_throttle(chips[i].id, - throttle_reason[chips[i].throttle_reason], + } else if (chip->throttled) { + chip->throttled = false; + trace_powernv_throttle(chip->id, + throttle_reason[chip->throttle_reason], pmsr_pmax); } @@ -558,47 +555,34 @@ static int init_chip_info(void) unsigned int chip[256]; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; - cpumask_t cpu_mask; - int ret = -ENOMEM; - - core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int), - GFP_KERNEL); - if (!core_to_chip_map) - goto out; - cpumask_copy(&cpu_mask, cpu_possible_mask); - for_each_cpu(cpu, &cpu_mask) { + for_each_possible_cpu(cpu) { unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; chip[nr_chips++] = id; } - core_to_chip_map[cpu_core_index_of_thread(cpu)] = id; - cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu)); } chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) - goto free_chip_map; + return -ENOMEM; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); + for_each_cpu(cpu, &chips[i].mask) + per_cpu(chip_info, cpu) = &chips[i]; } return 0; -free_chip_ma
[PATCH RESEND] cpufreq: powernv: Define per_cpu chip pointer to optimize hot-path
From: Michael Neuling "96c4726f01cd cpufreq: powernv: Remove cpu_to_chip_id() from hot-path" introduced 'core_to_chip_map' array to cache the chip-id of all cores. Replace this with per_cpu variable that stores the pointer to the chip-array. This removes the linear lookup and provides a neater and simpler solution. Signed-off-by: Michael Neuling Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar --- - Included S-o-b and ACK tags - Added the commit id to the commit log drivers/cpufreq/powernv-cpufreq.c | 50 +-- 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 50bf120..a00bcc2 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -44,7 +44,6 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; -static unsigned int *core_to_chip_map; static const char * const throttle_reason[] = { "No throttling", @@ -65,6 +64,7 @@ static struct chip { } *chips; static int nr_chips; +static DEFINE_PER_CPU(struct chip *, chip_info); /* * Note: The set of pstates consists of contiguous integers, the @@ -324,34 +324,31 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { + struct chip *chip; unsigned int cpu = smp_processor_id(); - unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)]; unsigned long pmsr; - int pmsr_pmax, i; + int pmsr_pmax; pmsr = get_pmspr(SPRN_PMSR); - - for (i = 0; i < nr_chips; i++) - if (chips[i].id == chip_id) - break; + chip = this_cpu_read(chip_info); /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { - if (chips[i].throttled) + if (chip->throttled) goto next; - chips[i].throttled = true; + chip->throttled = true; if (pmsr_pmax < powernv_pstate_info.nominal) pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < 
%d)\n", -cpu, chips[i].id, pmsr_pmax, +cpu, chip->id, pmsr_pmax, powernv_pstate_info.nominal); - trace_powernv_throttle(chips[i].id, - throttle_reason[chips[i].throttle_reason], + trace_powernv_throttle(chip->id, + throttle_reason[chip->throttle_reason], pmsr_pmax); - } else if (chips[i].throttled) { - chips[i].throttled = false; - trace_powernv_throttle(chips[i].id, - throttle_reason[chips[i].throttle_reason], + } else if (chip->throttled) { + chip->throttled = false; + trace_powernv_throttle(chip->id, + throttle_reason[chip->throttle_reason], pmsr_pmax); } @@ -558,47 +555,34 @@ static int init_chip_info(void) unsigned int chip[256]; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; - cpumask_t cpu_mask; - int ret = -ENOMEM; - - core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int), - GFP_KERNEL); - if (!core_to_chip_map) - goto out; - cpumask_copy(&cpu_mask, cpu_possible_mask); - for_each_cpu(cpu, &cpu_mask) { + for_each_possible_cpu(cpu) { unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; chip[nr_chips++] = id; } - core_to_chip_map[cpu_core_index_of_thread(cpu)] = id; - cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu)); } chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) - goto free_chip_map; + return -ENOMEM; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); + for_each_cpu(cpu, &chips[i].mask) + per_cpu(chip_info, cpu) = &chips[i]; } return 0; -free_chip_map: - kfree(core_to_chip_map); -out: - return ret; } static inline void clean_chip_info(void) { k
[PATCH v11] cpufreq: powernv: Add sysfs attributes to show throttle stats
Create sysfs attributes to export throttle information in the /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat 2)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat 3)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle 4)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap 5)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp 6)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault 7)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent 8)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset A detailed explanation of each attribute is added to Documentation/ABI/testing/sysfs-devices-system-cpu CC: linux-...@vger.kernel.org Signed-off-by: Shilpasri G Bhat --- Changes from v10: - Removed policy_notifiers to use driver->init() instead to create sysfs - sysfs attributes are removed by kobject_put(policy->kobj) - Rebased on top of http://lkml.iu.edu/hypermail/linux/kernel/1603.2/02268.html Changes from v9: - Modified documentation. - s/throttle_nominal/throttle_sub_turbo Changes from v8: - Moved the sysfs attributes from cpu/cpufreq/chipX to cpuX/cpufreq/throttle_stats - Adhering to one-value-per-file, replace throttle_table with multiple sysfs files. - Using CPUFREQ_POLICY_NOTIFIER to add/remove attribute_group. 
Documentation/ABI/testing/sysfs-devices-system-cpu | 69 +++ drivers/cpufreq/powernv-cpufreq.c | 80 +- 2 files changed, 146 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index b683e8e..1650133 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -271,3 +271,72 @@ Description: Parameters for the CPU cache attributes - WriteBack: data is written only to the cache line and the modified cache line is written to main memory only when it is replaced + +What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset +Date: March 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: POWERNV CPUFreq driver's frequency throttle stats directory and + attributes + + 'cpuX/cpufreq/throttle_stats' directory contains the CPU frequency + throttle stat attributes for the chip. The throttle stats of a cpu + is common across all the cpus belonging to a chip. Below are the + throttle attributes exported in the 'throttle_stats' directory: + + - turbo_stat : This file gives the total number of times the max + frequency is throttled to lower frequency in turbo (at and above + nominal frequency) range of frequencies. 
+ + - sub_turbo_stat : This file gives the total number of times the + max frequency is throttled to lower frequency in sub-turbo(below + nominal frequency) range of frequencies. + + - unthrottle : This file gives the total number of times the max + frequency is unthrottled after being throttled. + + - powercap : This file gives the total number of times the max + frequency is throttled due to 'Power Capping'. + + - overtemp : This file gives the total number of times the max + frequency is throttled due to 'CPU Over Temperature'. + + - supply_fault : This file gives the total number of times the + max frequency is throttled due to 'Power Supply Failure'. + + - overcurrent : This file gives the total number of times the + max frequency is throttled due to 'Overcurrent'. + + - occ_reset : This file gives the total number of times the max + frequency is throttled due to 'OCC Reset'. + + The sysfs attributes representing different throttle reasons like + powercap, over
[PATCH v12] cpufreq: powernv: Add sysfs attributes to show throttle stats
Create sysfs attributes to export throttle information in the /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat 2)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat 3)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle 4)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap 5)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp 6)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault 7)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent 8)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset A detailed explanation of each attribute is added to Documentation/ABI/testing/sysfs-devices-system-cpu CC: linux-...@vger.kernel.org Signed-off-by: Shilpasri G Bhat --- Changes from v11: - Removed '*create_throttle_sysfs' - policy->driver_data is used instead to maintain the flag for one-time creation of throttle sysfs files. Changes from v10: - Removed policy_notifiers to use driver->init() instead to create sysfs - sysfs attributes are removed by kobject_put(policy->kobj) - Rebased on top of http://lkml.iu.edu/hypermail/linux/kernel/1603.2/02268.html Changes from v9: - Modified documentation. - s/throttle_nominal/throttle_sub_turbo Changes from v8: - Moved the sysfs attributes from cpu/cpufreq/chipX to cpuX/cpufreq/throttle_stats - Adhering to one-value-per-file, replace throttle_table with multiple sysfs files. - Using CPUFREQ_POLICY_NOTIFIER to add/remove attribute_group. 
Documentation/ABI/testing/sysfs-devices-system-cpu | 69 drivers/cpufreq/powernv-cpufreq.c | 74 +- 2 files changed, 140 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index b683e8e..1650133 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -271,3 +271,72 @@ Description: Parameters for the CPU cache attributes - WriteBack: data is written only to the cache line and the modified cache line is written to main memory only when it is replaced + +What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset +Date: March 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: POWERNV CPUFreq driver's frequency throttle stats directory and + attributes + + 'cpuX/cpufreq/throttle_stats' directory contains the CPU frequency + throttle stat attributes for the chip. The throttle stats of a cpu + is common across all the cpus belonging to a chip. Below are the + throttle attributes exported in the 'throttle_stats' directory: + + - turbo_stat : This file gives the total number of times the max + frequency is throttled to lower frequency in turbo (at and above + nominal frequency) range of frequencies. 
+ + - sub_turbo_stat : This file gives the total number of times the + max frequency is throttled to lower frequency in sub-turbo(below + nominal frequency) range of frequencies. + + - unthrottle : This file gives the total number of times the max + frequency is unthrottled after being throttled. + + - powercap : This file gives the total number of times the max + frequency is throttled due to 'Power Capping'. + + - overtemp : This file gives the total number of times the max + frequency is throttled due to 'CPU Over Temperature'. + + - supply_fault : This file gives the total number of times the + max frequency is throttled due to 'Power Supply Failure'. + + - overcurrent : This file gives the total number of times the + max frequency is throttled due to 'Overcurrent'. + + - occ_reset : This file gives the total number of times the max +
[PATCH v13] cpufreq: powernv: Add sysfs attributes to show throttle stats
Create sysfs attributes to export throttle information in the /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat 2)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat 3)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle 4)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap 5)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp 6)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault 7)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent 8)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset A detailed explanation of each attribute is added to Documentation/ABI/testing/sysfs-devices-system-cpu CC: linux-...@vger.kernel.org Signed-off-by: Shilpasri G Bhat --- Changes from v12: - Removed (void *) typecast - Move the definition of ret inside the 'if' block Changes from v11: - Removed '*create_throttle_sysfs' - policy->driver_data is used instead to maintain the flag for one-time creation of throttle sysfs files. Changes from v10: - Removed policy_notifiers to use driver->init() instead to create sysfs - sysfs attributes are removed by kobject_put(policy->kobj) - Rebased on top of http://lkml.iu.edu/hypermail/linux/kernel/1603.2/02268.html Changes from v9: - Modified documentation. - s/throttle_nominal/throttle_sub_turbo Changes from v8: - Moved the sysfs attributes from cpu/cpufreq/chipX to cpuX/cpufreq/throttle_stats - Adhering to one-value-per-file, replace throttle_table with multiple sysfs files. - Using CPUFREQ_POLICY_NOTIFIER to add/remove attribute_group. 
Documentation/ABI/testing/sysfs-devices-system-cpu | 69 drivers/cpufreq/powernv-cpufreq.c | 74 +- 2 files changed, 141 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index b683e8e..1650133 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -271,3 +271,72 @@ Description: Parameters for the CPU cache attributes - WriteBack: data is written only to the cache line and the modified cache line is written to main memory only when it is replaced + +What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset +Date: March 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: POWERNV CPUFreq driver's frequency throttle stats directory and + attributes + + 'cpuX/cpufreq/throttle_stats' directory contains the CPU frequency + throttle stat attributes for the chip. The throttle stats of a cpu + is common across all the cpus belonging to a chip. Below are the + throttle attributes exported in the 'throttle_stats' directory: + + - turbo_stat : This file gives the total number of times the max + frequency is throttled to lower frequency in turbo (at and above + nominal frequency) range of frequencies. 
+ + - sub_turbo_stat : This file gives the total number of times the + max frequency is throttled to lower frequency in sub-turbo(below + nominal frequency) range of frequencies. + + - unthrottle : This file gives the total number of times the max + frequency is unthrottled after being throttled. + + - powercap : This file gives the total number of times the max + frequency is throttled due to 'Power Capping'. + + - overtemp : This file gives the total number of times the max + frequency is throttled due to 'CPU Over Temperature'. + + - supply_fault : This file gives the total number of times the + max frequency is throttled due to 'Power Supply Failure'. + + - overcurrent : This file gives the total number of times the + max frequency is throttled due to 'Overcu
[PATCH] cpufreq: powernv: Fixes initialization of chip and chip mask
commit 735366fc4077 ("cpufreq: powernv: Call throttle_check() on receiving OCC_THROTTLE") used cpumask_of_node() as the chip mask. But this mask contains only online cpus. This breaks a setup where cpufreq is initialized with few offline cores and made online later. So this patch fixes this bug by scanning all the possible cpus and sets the cpu in the chip mask. It also fixes the chip discovery with non-contiguous cpu mask. This patch creates a list of chips 'powernv_chip_list' to replace the chip array for cleaner initialization. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy --- drivers/cpufreq/powernv-cpufreq.c | 80 +++ 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 39ac78c..0581a59 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -64,7 +64,9 @@ enum throttle_reason_type { OCC_MAX_REASON }; -static struct chip { +static LIST_HEAD(powernv_chip_list); + +struct chip { unsigned int id; bool throttled; bool restore; @@ -74,9 +76,9 @@ static struct chip { int throttle_turbo; int throttle_sub_turbo; int reason[OCC_MAX_REASON]; -} *chips; + struct list_head list; +}; -static int nr_chips; static DEFINE_PER_CPU(struct chip *, chip_info); /* @@ -528,12 +530,22 @@ out: put_online_cpus(); } +static inline struct chip *find_chip(unsigned int id) +{ + struct chip *chip; + + list_for_each_entry(chip, &powernv_chip_list, list) + if (chip->id == id) + return chip; + return NULL; +} + static int powernv_cpufreq_occ_msg(struct notifier_block *nb, unsigned long msg_type, void *_msg) { struct opal_msg *msg = _msg; struct opal_occ_msg omsg; - int i; + struct chip *chip; if (msg_type != OPAL_MSG_OCC) return 0; @@ -569,28 +581,27 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, throttled = false; pr_info("OCC Active, CPU frequency is no longer throttled\n"); - for (i = 0; i < nr_chips; i++) { - chips[i].restore = true; - 
schedule_work(&chips[i].throttle); + list_for_each_entry(chip, &powernv_chip_list, list) { + chip->restore = true; + schedule_work(&chip->throttle); } return 0; } - for (i = 0; i < nr_chips; i++) - if (chips[i].id == omsg.chip) - break; - + chip = find_chip(omsg.chip); + if (!chip) + return -EINVAL; if (omsg.throttle_status >= 0 && omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) { - chips[i].throttle_reason = omsg.throttle_status; - chips[i].reason[omsg.throttle_status]++; + chip->throttle_reason = omsg.throttle_status; + chip->reason[omsg.throttle_status]++; } if (!omsg.throttle_status) - chips[i].restore = true; + chip->restore = true; - schedule_work(&chips[i].throttle); + schedule_work(&chip->throttle); } return 0; } @@ -622,37 +633,42 @@ static struct cpufreq_driver powernv_cpufreq_driver = { static int init_chip_info(void) { - unsigned int chip[256]; - unsigned int cpu, i; + unsigned int cpu; unsigned int prev_chip_id = UINT_MAX; + struct chip *chip = NULL; for_each_possible_cpu(cpu) { unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; - chip[nr_chips++] = id; + chip = find_chip(id); } + if (!chip) { + chip = kzalloc(sizeof(struct chip), GFP_KERNEL); + if (!chip) + goto out; + chip->id = id; + INIT_WORK(&chip->throttle, powernv_cpufreq_work_fn); + INIT_LIST_HEAD(&chip->list); + list_add(&chip->list, &powernv_chip_list); + } + cpumask_set_cpu(cpu, &chip->mask); + per_cpu(chip_info, cpu) = chip; } - - chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL); - if (!chips) -
[PATCH] cpufreq: powernv: Increase the verbosity of OCC console messages
Modify the OCC reset/load/active event message to make it clearer for the user to understand the event and effect of the event. Suggested-by: Stewart Smith Signed-off-by: Shilpasri G Bhat --- This patch is based on top of linux-next/master drivers/cpufreq/powernv-cpufreq.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 546e056..64994e1 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -465,6 +465,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, switch (omsg.type) { case OCC_RESET: occ_reset = true; + pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n"); /* * powernv_cpufreq_throttle_check() is called in * target() callback which can detect the throttle state @@ -474,12 +475,12 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, */ if (!throttled) { throttled = true; - pr_crit("CPU Frequency is throttled\n"); + pr_crit("CPU frequency is throttled for duration\n"); } - pr_info("OCC: Reset\n"); + break; case OCC_LOAD: - pr_info("OCC: Loaded\n"); + pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n"); break; case OCC_THROTTLE: omsg.chip = be64_to_cpu(msg->params[1]); @@ -488,7 +489,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, if (occ_reset) { occ_reset = false; throttled = false; - pr_info("OCC: Active\n"); + pr_info("OCC Active, CPU frequency is no longer throttled\n"); for (i = 0; i < nr_chips; i++) { chips[i].restore = true; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] cpufreq: powernv: Export frequency throttle state of the chip through sysfs
Create a sysfs 'throttle' attribute per chip (per NUMA node) to reflect the throttle state of the chip. Userspace programs can poll on this attribute to keep an eye on the throttle state. Currently we print a log message to notify the user of a throttling event. Performance-sensitive applications can monitor the throttle state using this attribute. The following file is created in sysfs: /sys/devices/system/node/nodeN/throttle The 'throttle' attribute has the following values: 0 : frequency is unthrottled 1 : frequency is throttled Suggested-by: Stewart Smith Signed-off-by: Shilpasri G Bhat --- This patch is based on top of linux-next/master drivers/cpufreq/powernv-cpufreq.c | 21 + 1 file changed, 21 insertions(+) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 64994e1..aed6c34 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -413,6 +415,23 @@ static struct notifier_block powernv_cpufreq_reboot_nb = { .notifier_call = powernv_cpufreq_reboot_notifier, }; +static ssize_t throttle_show(struct device *dev, +struct device_attribute *attr, char *buf) +{ + int i; + + for (i = 0; i < nr_chips; i++) + if (chips[i].id == dev->id) { + smp_call_function_any(&chips[i].mask, + powernv_cpufreq_throttle_check, + NULL, 1); + return sprintf(buf, "%d\n", throttled || + chips[i].throttled); + } +} + +static DEVICE_ATTR(throttle, 0400, throttle_show, NULL); + void powernv_cpufreq_work_fn(struct work_struct *work) { struct chip *chip = container_of(work, struct chip, throttle); @@ -570,6 +589,8 @@ static int init_chip_info(void) cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); chips[i].restore = false; + device_create_file(&node_devices[chip[i]]->dev, + &dev_attr_throttle); } return 0; -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org 
https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] cpufreq: powernv: Export frequency throttle state of the chip through sysfs
On 08/27/2015 03:01 PM, Michael Ellerman wrote: > On Thu, 2015-08-27 at 14:43 +0530, Shilpasri G Bhat wrote: >> Create a sysfs 'throttle' attribute per-chip(per-numa-node) to reflect >> the throttle state of the chip. The userspace programs can poll on >> this attribute to keep an eye on the throttle state. Currently we >> print a log message to notify the user of throttling event. The >> performance-sensitive applications can monitor the throttle state >> using this attribute. > > Performance sensitive applications can *poll* on a sysfs file, which does a > loop over all chips and potentially spams the console with pr_crit() messages > ? > > That does not sound like a recipe for success. Okay. > > What problem are we actually trying to solve here? How do we export the throttle information to user space such that it is consumable by applications (for example, to discard a benchmark run if the frequency was throttled)? We already print the throttle information to the console. Can we extend this to notify/broadcast it as an event to the applications? Thanks and Regards, Shilpa ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] cpufreq : powernv: Report Pmax throttling if capped below nominal frequency
Log a 'critical' message if the max frequency is reduced below the nominal frequency. We already log an 'info' message if the max frequency is capped below the turbo frequency. The CPU should guarantee at least the nominal frequency, but not the turbo frequency, in all system configurations and environments. So report the Pmax throttling with 'critical' severity when Pmax dips below the nominal frequency. Signed-off-by: Shilpasri G Bhat --- drivers/cpufreq/powernv-cpufreq.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 64994e1..2d9ed42 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -327,8 +327,13 @@ static void powernv_cpufreq_throttle_check(void *data) if (chips[i].throttled) goto next; chips[i].throttled = true; - pr_info("CPU %d on Chip %u has Pmax reduced to %d\n", cpu, - chips[i].id, pmsr_pmax); + if (pmsr_pmax < powernv_pstate_info.nominal) + pr_crit("CPU %d on Chip %u has Pmax reduced to %d\n", + cpu, chips[i].id, pmsr_pmax); + else + pr_info("CPU %d on Chip %u has Pmax reduced to %d\n", + cpu, chips[i].id, pmsr_pmax); + } else if (chips[i].throttled) { chips[i].throttled = false; pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] cpufreq : powernv: Report Pmax throttling if capped below nominal frequency
Log a 'critical' message if the max frequency is reduced below the nominal frequency. We already log an 'info' message if the max frequency is capped below the turbo frequency. The CPU should guarantee at least the nominal frequency, but not the turbo frequency, in all system configurations and environments. So report the Pmax throttling with 'critical' severity when Pmax dips below the nominal frequency. Signed-off-by: Shilpasri G Bhat --- Changes from v1: - Modified the printk messages as per Viresh's suggestion. drivers/cpufreq/powernv-cpufreq.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 64994e1..cb50138 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -327,8 +327,14 @@ static void powernv_cpufreq_throttle_check(void *data) if (chips[i].throttled) goto next; chips[i].throttled = true; - pr_info("CPU %d on Chip %u has Pmax reduced to %d\n", cpu, - chips[i].id, pmsr_pmax); + if (pmsr_pmax < powernv_pstate_info.nominal) + pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", + cpu, chips[i].id, pmsr_pmax, + powernv_pstate_info.nominal); + else + pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", + cpu, chips[i].id, pmsr_pmax, + powernv_pstate_info.max); } else if (chips[i].throttled) { chips[i].throttled = false; pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] cpufreq: powernv: Redesign the presentation of throttle notification
Replace the throttling event console messages with the perf trace event "power:powernv_throttle" and throttle counter stats which are exported in sysfs. The newly added sysfs files are as follows: 1)/sys/devices/system/node/node0/throttle_frequencies This gives the throttle stats for each of the available frequencies. The throttle stat of a frequency is the total number of times the max frequency was reduced to that frequency. # cat /sys/devices/system/node/node0/throttle_frequencies 4023000 0 3990000 0 3956000 1 3923000 0 3890000 0 3857000 2 3823000 0 3790000 0 3757000 2 3724000 1 3690000 1 ... 2)/sys/devices/system/node/node0/throttle_reasons This gives the stats for each of the supported throttle reasons, i.e. the total number of times the frequency was throttled due to each of the reasons. # cat /sys/devices/system/node/node0/throttle_reasons No throttling 7 Power Cap 0 Processor Over Temperature 7 Power Supply Failure 0 Over Current 0 OCC Reset 0 3)/sys/devices/system/node/node0/throttle_stat This gives the total number of throttle events that occurred in the turbo range of frequencies and in the non-turbo (below nominal) range of frequencies. 
# cat /sys/devices/system/node/node0/throttle_stat Turbo 7 Nominal 0 Signed-off-by: Shilpasri G Bhat --- drivers/cpufreq/powernv-cpufreq.c | 186 +- include/trace/events/power.h | 22 + 2 files changed, 166 insertions(+), 42 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index cb50138..bdde9d6 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,9 @@ #include #include #include +#include +#include +#include #include #include @@ -43,12 +46,27 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; +static char throttle_reason[][30] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + static struct chip { unsigned int id; bool throttled; cpumask_t mask; struct work_struct throttle; bool restore; + /* Pmax throttle stats */ + int throt_reason; + int throt_turbo; + int throt_nominal; + int reason[OCC_MAX_THROTTLE_STATUS + 1]; + int *pstate_stat; } *chips; static int nr_chips; @@ -309,40 +327,54 @@ static inline unsigned int get_nominal_index(void) return powernv_pstate_info.max - powernv_pstate_info.nominal; } -static void powernv_cpufreq_throttle_check(void *data) +static void powernv_cpufreq_read_pmax(void *data) { unsigned int cpu = smp_processor_id(); unsigned long pmsr; - int pmsr_pmax, i; - - pmsr = get_pmspr(SPRN_PMSR); + int pmsr_pmax, index, i; for (i = 0; i < nr_chips; i++) if (chips[i].id == cpu_to_chip_id(cpu)) break; - /* Check for Pmax Capping */ + pmsr = get_pmspr(SPRN_PMSR); pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { if (chips[i].throttled) - goto next; + return; chips[i].throttled = true; - if (pmsr_pmax < powernv_pstate_info.nominal) - pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", + if (pmsr_pmax < powernv_pstate_info.nominal) { + pr_warn("CPU %d 
on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", cpu, chips[i].id, pmsr_pmax, powernv_pstate_info.nominal); - else - pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.max); + chips[i].throt_nominal++; + } else { + chips[i].throt_turbo++; + } + index = powernv_pstate_info.max - pmsr_pmax; + if (index >= 0 && index < powernv_pstate_info.nr_pstates) + chips[i].pstate_stat[index]++; + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throt_reason], + pmsr_pmax);
[PATCH v2 0/2] cpufreq: powernv: Redesign the presentation of throttle notification
In POWER8, OCC(On-Chip-Controller) can throttle the frequency of the CPU when the chip crosses its thermal and power limits. Currently, powernv-cpufreq driver detects and reports this event as a console message. Some boxes may not sustain the max turbo frequency in all conditions and can be throttled frequently. This can lead to the flooding of console with throttle messages. So this patchset aims to redesign the presentation of this event via sysfs counters and tracepoints. This patchset will add a perf trace point "power:powernv_throttle" and sysfs throttle counter stats in /sys/devices/system/cpu/cpufreq/chipN. Shilpasri G Bhat (2): cpufreq: powernv/tracing: Add powernv_throttle tracepoint cpufreq: powernv: Redesign the presentation of throttle notification drivers/cpufreq/powernv-cpufreq.c | 247 +++--- include/trace/events/power.h | 22 kernel/trace/power-traces.c | 1 + 3 files changed, 227 insertions(+), 43 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 1/2] cpufreq: powernv/tracing: Add powernv_throttle tracepoint
This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat CC: Ingo Molnar CC: Steven Rostedt --- Changes from v1: - Export the tracepoint include/trace/events/power.h | 22 ++ kernel/trace/power-traces.c | 1 + 2 files changed, 23 insertions(+) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244e..19e5030 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle, TP_ARGS(state, cpu_id) ); +TRACE_EVENT(powernv_throttle, + + TP_PROTO(int chip_id, const char *reason, int pmax), + + TP_ARGS(chip_id, reason, pmax), + + TP_STRUCT__entry( + __field(int, chip_id) + __string(reason, reason) + __field(int, pmax) + ), + + TP_fast_assign( + __entry->chip_id = chip_id; + __assign_str(reason, reason); + __entry->pmax = pmax; + ), + + TP_printk("Chip %d Pmax %d %s", __entry->chip_id, + __entry->pmax, __get_str(reason)) +); + TRACE_EVENT(pstate_sample, TP_PROTO(u32 core_busy, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a..81b8745 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 2/2] cpufreq: powernv: Redesign the presentation of throttle notification
Replace the throttling event console messages with the perf trace point "power:powernv_throttle" and throttle counter stats which are exported in sysfs in /sys/devices/system/cpu/cpufreq/chipN. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies This gives the throttle stats for each of the available frequencies. The throttle stat of a frequency is the total number of times the max frequency is reduced to that frequency. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies 4023000 0 3990000 0 3956000 1 3923000 0 3890000 0 3857000 2 3823000 0 3790000 0 3757000 2 3724000 1 3690000 1 ... 2)/sys/devices/system/cpu/cpufreq/chip0/throttle_reasons This directory contains throttle reason files. Each file gives the total number of times the max frequency is throttled, except for 'throttle_reset', which gives the total number of times the max frequency is unthrottled after being throttled. # cd /sys/devices/system/cpu/cpufreq/chip0/throttle_reasons # cat cpu_over_temperature 7 # cat occ_reset 0 # cat over_current 0 # cat power_cap 0 # cat power_supply_failure 0 # cat throttle_reset 7 3)/sys/devices/system/cpu/cpufreq/chip0/throttle_stat This gives the total number of events of max frequency throttling to lower frequencies in the turbo range of frequencies and the sub-turbo(at and below nominal) range of frequencies. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_stat turbo 7 sub-turbo 0 Signed-off-by: Shilpasri G Bhat --- Changes from v1: - Added a kobject to struct chip - Grouped the throttle reasons under a separate attribute_group and exported each reason as an individual file. - Moved the sysfs files from /sys/devices/system/node/nodeN to /sys/devices/system/cpu/cpufreq/chipN - As suggested by Paul Clarke replaced 'Nominal' with 'sub-turbo' and char * throttle_reason[][30] by const char * const throttle_reason[]. - Modified the commit message. 
drivers/cpufreq/powernv-cpufreq.c | 247 +++--- 1 file changed, 204 insertions(+), 43 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index cb50138..00caef1 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -43,12 +44,37 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; +static const char * const throttle_reason[] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + +enum throt_reason_type { + NO_THROTTLE = 0, + POWERCAP, + CPU_OVERTEMP, + POWER_SUPPLY_FAILURE, + OVERCURRENT, + OCC_RESET_THROTTLE +}; + static struct chip { unsigned int id; bool throttled; cpumask_t mask; struct work_struct throttle; bool restore; + /* Pmax throttle stats */ + int throt_reason; + int throt_turbo; + int throt_nominal; + int reason[OCC_MAX_THROTTLE_STATUS + 1]; + int *pstate_stat; + struct kobject *kobj; } *chips; static int nr_chips; @@ -309,11 +335,11 @@ static inline unsigned int get_nominal_index(void) return powernv_pstate_info.max - powernv_pstate_info.nominal; } -static void powernv_cpufreq_throttle_check(void *data) +static void powernv_cpufreq_read_pmax(void *data) { unsigned int cpu = smp_processor_id(); unsigned long pmsr; - int pmsr_pmax, i; + int pmsr_pmax, i, index; pmsr = get_pmspr(SPRN_PMSR); @@ -321,28 +347,43 @@ static void powernv_cpufreq_throttle_check(void *data) if (chips[i].id == cpu_to_chip_id(cpu)) break; - /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { if (chips[i].throttled) - goto next; + return; chips[i].throttled = true; - if (pmsr_pmax < powernv_pstate_info.nominal) - pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, + if 
(pmsr_pmax < powernv_pstate_info.nominal) { + pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", +cpu, chips[i].id, pmsr_pmax, powernv_pstate_info.nominal); - else - pr_info(
Re: [PATCH] cpufreq: powernv: Redesign the presentation of throttle notification
Hi, On 12/15/2015 02:59 AM, Paul Clarke wrote: > On 12/13/2015 12:17 PM, Shilpasri G Bhat wrote: >> Replace the throttling event console messages to perf trace event >> "power:powernv_throttle" and throttle counter stats which are >> exported in sysfs. The newly added sysfs files are as follows: >> >> 1)/sys/devices/system/node/node0/throttle_frequencies >>This gives the throttle stats for each of the available frequencies. >>The throttle stat of a frequency is the total number of times the max >>frequency was reduced to that frequency. >># cat /sys/devices/system/node/node0/throttle_frequencies >>4023000 0 >>399 0 >>3956000 1 >>3923000 0 >>389 0 >>3857000 2 >>3823000 0 >>379 0 >>3757000 2 >>3724000 1 >>369 1 >>... > > Is this data useful? It seems like "elapsed time" at each frequency might be > more useful, if any. > Yes elapsed time is more useful data here. But the concern here is with the accuracy of measurement/observation of elapsed time by the kernel. OCC can throttle/unthrottle the frequency at the granularity of 250us. Although OCC updates the throttle status to HOMER region immediately there may be a delay in propagating this message by the opal-poller to the driver. So instead we might want OCC to give us the throttled elapsed time stat for each frequency and opal-poller/driver can take the snapshot of this info every n seconds. >> 2)/sys/devices/system/node/node0/throttle_reasons >>This gives the stats for each of the supported throttle reasons. >>This gives the total number of times the frequency was throttled due >>to each of the reasons. >># cat /sys/devices/system/node/node0/throttle_reasons >>No throttling 7 >>Power Cap 0 >>Processor Over Temperature 7 >>Power Supply Failure 0 >>Over Current 0 >>OCC Reset 0 >> >> 3)/sys/devices/system/node/node0/throttle_stat >>This gives the total number of throttle events occurred in turbo >>range of frequencies and non-turbo(below nominal) range of >>frequencies. > > non-turbo should read "at or below nominal". 
Maybe "sub-turbo" is a better > term(?) > >> # cat /sys/devices/system/node/node0/throttle_stat >>Turbo 7 >>Nominal 0 > > Should this read "Non-turbo" or "Sub-turbo" instead of "Nominal", since the > events could well occur when already operating below nominal. > Agree. Applied 'sub-turbo' in v2 >> Signed-off-by: Shilpasri G Bhat >> --- >> drivers/cpufreq/powernv-cpufreq.c | 186 >> +- >> include/trace/events/power.h | 22 + >> 2 files changed, 166 insertions(+), 42 deletions(-) >> >> diff --git a/drivers/cpufreq/powernv-cpufreq.c >> b/drivers/cpufreq/powernv-cpufreq.c >> index cb50138..bdde9d6 100644 >> --- a/drivers/cpufreq/powernv-cpufreq.c >> +++ b/drivers/cpufreq/powernv-cpufreq.c >> @@ -28,6 +28,9 @@ >> #include >> #include >> #include >> +#include >> +#include >> +#include >> >> #include >> #include >> @@ -43,12 +46,27 @@ >> static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; >> static bool rebooting, throttled, occ_reset; >> >> +static char throttle_reason[][30] = { >> +"No throttling", >> +"Power Cap", >> +"Processor Over Temperature", >> +"Power Supply Failure", >> +"Over Current", >> +"OCC Reset" >> + }; > > I'm curious if this would be slightly more efficiently implemented as: > static const char *throttle_reason[] = { ... }; > > Do you need 30 characters per string for a reason? > > Regardless, it should be const. Modified the declaration in v2 version of the patch. > > [...] > -- > PC Thanks and Regards, Shilpa ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 0/2] cpufreq: powernv: Redesign the presentation of throttle notification
In POWER8, OCC(On-Chip-Controller) can throttle the frequency of the CPU when the chip crosses its thermal and power limits. Currently, powernv-cpufreq driver detects and reports this event as a console message. Some boxes may not sustain the max turbo frequency in all conditions and can be throttled frequently. This can lead to the flooding of console with throttle messages. So this patchset aims to redesign the presentation of this event via sysfs counters and tracepoints. This patchset will add a perf trace point "power:powernv_throttle" and sysfs throttle counter stats in /sys/devices/system/cpu/cpufreq/chipN. Changes from v2: - Fixed kbuild test warning. drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Shilpasri G Bhat (2): cpufreq: powernv/tracing: Add powernv_throttle tracepoint cpufreq: powernv: Redesign the presentation of throttle notification drivers/cpufreq/powernv-cpufreq.c | 256 +++--- include/trace/events/power.h | 22 kernel/trace/power-traces.c | 1 + 3 files changed, 236 insertions(+), 43 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v3 2/2] cpufreq: powernv: Redesign the presentation of throttle notification
Replace the throttling event console messages with the perf trace point "power:powernv_throttle" and throttle counter stats which are exported in sysfs in /sys/devices/system/cpu/cpufreq/chipN. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies This gives the throttle stats for each of the available frequencies. The throttle stat of a frequency is the total number of times the max frequency is reduced to that frequency. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies 4023000 0 3990000 0 3956000 1 3923000 0 3890000 0 3857000 2 3823000 0 3790000 0 3757000 2 3724000 1 3690000 1 ... 2)/sys/devices/system/cpu/cpufreq/chip0/throttle_reasons This directory contains throttle reason files. Each file gives the total number of times the max frequency is throttled, except for 'throttle_reset', which gives the total number of times the max frequency is unthrottled after being throttled. # cd /sys/devices/system/cpu/cpufreq/chip0/throttle_reasons # cat cpu_over_temperature 7 # cat occ_reset 0 # cat over_current 0 # cat power_cap 0 # cat power_supply_failure 0 # cat throttle_reset 7 3)/sys/devices/system/cpu/cpufreq/chip0/throttle_stat This gives the total number of events of max frequency throttling to lower frequencies in the turbo range of frequencies and the sub-turbo(at and below nominal) range of frequencies. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_stat turbo 7 sub-turbo 0 Signed-off-by: Shilpasri G Bhat --- Changes from v2: - Fixed kbuild test warning. drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Changes from v1: - Added a kobject to struct chip - Grouped the throttle reasons under a separate attribute_group and exported each reason as an individual file. 
- Moved the sysfs files from /sys/devices/system/node/nodeN to /sys/devices/system/cpu/cpufreq/chipN - As suggested by Paul Clarke replaced 'Nominal' with 'sub-turbo' and char * throttle_reason[][30] by const char * const throttle_reason[]. - Modified the commit message. drivers/cpufreq/powernv-cpufreq.c | 256 +++--- 1 file changed, 213 insertions(+), 43 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index cb50138..5574f06 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -43,12 +44,37 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; +static const char * const throttle_reason[] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + +enum throt_reason_type { + NO_THROTTLE = 0, + POWERCAP, + CPU_OVERTEMP, + POWER_SUPPLY_FAILURE, + OVERCURRENT, + OCC_RESET_THROTTLE +}; + static struct chip { unsigned int id; bool throttled; cpumask_t mask; struct work_struct throttle; bool restore; + /* Pmax throttle stats */ + int throt_reason; + int throt_turbo; + int throt_nominal; + int reason[OCC_MAX_THROTTLE_STATUS + 1]; + int *pstate_stat; + struct kobject *kobj; } *chips; static int nr_chips; @@ -309,11 +335,11 @@ static inline unsigned int get_nominal_index(void) return powernv_pstate_info.max - powernv_pstate_info.nominal; } -static void powernv_cpufreq_throttle_check(void *data) +static void powernv_cpufreq_read_pmax(void *data) { unsigned int cpu = smp_processor_id(); unsigned long pmsr; - int pmsr_pmax, i; + int pmsr_pmax, i, index; pmsr = get_pmspr(SPRN_PMSR); @@ -321,28 +347,43 @@ static void powernv_cpufreq_throttle_check(void *data) if (chips[i].id == cpu_to_chip_id(cpu)) break; - /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != 
powernv_pstate_info.max) { if (chips[i].throttled) - goto next; + return; chips[i].throttled = true; - if (pmsr_pmax < powernv_pstate_info.nominal) - pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, + if (pmsr_pmax < powernv_pstate_info.nominal) { + pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d &
[PATCH v3 1/2] cpufreq: powernv/tracing: Add powernv_throttle tracepoint
This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat CC: Ingo Molnar CC: Steven Rostedt --- No changes from v2. Changes from v1: - Export the tracepoint include/trace/events/power.h | 22 ++ kernel/trace/power-traces.c | 1 + 2 files changed, 23 insertions(+) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244e..19e5030 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle, TP_ARGS(state, cpu_id) ); +TRACE_EVENT(powernv_throttle, + + TP_PROTO(int chip_id, const char *reason, int pmax), + + TP_ARGS(chip_id, reason, pmax), + + TP_STRUCT__entry( + __field(int, chip_id) + __string(reason, reason) + __field(int, pmax) + ), + + TP_fast_assign( + __entry->chip_id = chip_id; + __assign_str(reason, reason); + __entry->pmax = pmax; + ), + + TP_printk("Chip %d Pmax %d %s", __entry->chip_id, + __entry->pmax, __get_str(reason)) +); + TRACE_EVENT(pstate_sample, TP_PROTO(u32 core_busy, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a..81b8745 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 0/4] cpufreq: powernv: Redesign the presentation of throttle notification
In POWER8, OCC(On-Chip-Controller) can throttle the frequency of the CPU when the chip crosses its thermal and power limits. Currently, powernv-cpufreq driver detects and reports this event as a console message. Some machines may not sustain the max turbo frequency in all conditions and can be throttled frequently. This can lead to the flooding of console with throttle messages. So this patchset aims to redesign the presentation of this event via sysfs counters and tracepoints. Patches [2] to [4] will add a perf trace point "power:powernv_throttle" and sysfs throttle counter stats in /sys/devices/system/cpu/cpufreq/chipN. Patch [1] solves a bug in powernv_cpufreq_throttle_check(), which calls into cpu_to_chip_id() in the hot path; cpu_to_chip_id() reads the DT every time to find the chip id. Changes from v3: - Add a fix to replace cpu_to_chip_id() with simpler PIR shift to obtain the chip id. - Break patch2 into two patches separating the tracepoint and sysfs attribute changes. Changes from v2: - Fixed kbuild test warning. drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Shilpasri G Bhat (4): cpufreq: powernv: Remove cpu_to_chip_id() from hot-path cpufreq: powernv/tracing: Add powernv_throttle tracepoint cpufreq: powernv: Add a trace print for the throttle event cpufreq: powernv: Add sysfs attributes to show throttle stats drivers/cpufreq/powernv-cpufreq.c | 279 +++--- include/trace/events/power.h | 22 +++ kernel/trace/power-traces.c | 1 + 3 files changed, 250 insertions(+), 52 deletions(-) -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 1/4] cpufreq: powernv: Remove cpu_to_chip_id() from hot-path
cpu_to_chip_id() does a DT walk through to find out the chip id by taking a contended device tree lock. This adds an unnecessary overhead in a hot-path. So instead of cpu_to_chip_id() use PIR of the cpu to find the chip id. Reported-by: Anton Blanchard Signed-off-by: Shilpasri G Bhat --- drivers/cpufreq/powernv-cpufreq.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index cb50138..597a084 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -39,6 +39,7 @@ #define PMSR_PSAFE_ENABLE (1UL << 30) #define PMSR_SPR_EM_DISABLE(1UL << 31) #define PMSR_MAX(x)((x >> 32) & 0xFF) +#define pir_to_chip_id(pir)(((pir) >> 7) & 0x3f) static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; @@ -312,13 +313,14 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { unsigned int cpu = smp_processor_id(); + unsigned int chip_id = pir_to_chip_id(hard_smp_processor_id()); unsigned long pmsr; int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); for (i = 0; i < nr_chips; i++) - if (chips[i].id == cpu_to_chip_id(cpu)) + if (chips[i].id == chip_id) break; /* Check for Pmax Capping */ @@ -558,7 +560,8 @@ static int init_chip_info(void) unsigned int prev_chip_id = UINT_MAX; for_each_possible_cpu(cpu) { - unsigned int id = cpu_to_chip_id(cpu); + unsigned int id = + pir_to_chip_id(get_hard_smp_processor_id(cpu)); if (prev_chip_id != id) { prev_chip_id = id; -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 2/4] cpufreq: powernv/tracing: Add powernv_throttle tracepoint
This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat CC: Ingo Molnar CC: Steven Rostedt --- No changes from v2 and v3. include/trace/events/power.h | 22 ++ kernel/trace/power-traces.c | 1 + 2 files changed, 23 insertions(+) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244e..19e5030 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle, TP_ARGS(state, cpu_id) ); +TRACE_EVENT(powernv_throttle, + + TP_PROTO(int chip_id, const char *reason, int pmax), + + TP_ARGS(chip_id, reason, pmax), + + TP_STRUCT__entry( + __field(int, chip_id) + __string(reason, reason) + __field(int, pmax) + ), + + TP_fast_assign( + __entry->chip_id = chip_id; + __assign_str(reason, reason); + __entry->pmax = pmax; + ), + + TP_printk("Chip %d Pmax %d %s", __entry->chip_id, + __entry->pmax, __get_str(reason)) +); + TRACE_EVENT(pstate_sample, TP_PROTO(u32 core_busy, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a..81b8745 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 3/4] cpufreq: powernv: Add a trace print for the throttle event
Record the throttle event with a trace print replacing the printk, except for events like throttling below nominal and occ reset event which print a warning message. Signed-off-by: Shilpasri G Bhat --- Changes from v3: - Separate this patch to contain trace_point changes - Move struct chip member 'restore' of type bool above 'mask' to reduce structure padding. No changes from v2. Changes from v1: - As suggested by Paul Clarke replaced char * throttle_reason[][30] by const char * const throttle_reason[]. drivers/cpufreq/powernv-cpufreq.c | 95 --- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 597a084..c98a6e7 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -44,12 +45,22 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; +static const char * const throttle_reason[] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + static struct chip { unsigned int id; bool throttled; + bool restore; + u8 throt_reason; cpumask_t mask; struct work_struct throttle; - bool restore; } *chips; static int nr_chips; @@ -310,41 +321,49 @@ static inline unsigned int get_nominal_index(void) return powernv_pstate_info.max - powernv_pstate_info.nominal; } -static void powernv_cpufreq_throttle_check(void *data) +static void powernv_cpufreq_check_pmax(void) { unsigned int cpu = smp_processor_id(); unsigned int chip_id = pir_to_chip_id(hard_smp_processor_id()); - unsigned long pmsr; int pmsr_pmax, i; - pmsr = get_pmspr(SPRN_PMSR); + pmsr_pmax = (s8)PMSR_MAX(get_pmspr(SPRN_PMSR)); for (i = 0; i < nr_chips; i++) if (chips[i].id == chip_id) break; - /* Check for Pmax Capping */ - pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != 
powernv_pstate_info.max) { if (chips[i].throttled) - goto next; + return; + chips[i].throttled = true; if (pmsr_pmax < powernv_pstate_info.nominal) - pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.nominal); - else - pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.max); + pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", +cpu, chips[i].id, pmsr_pmax, +powernv_pstate_info.nominal); + + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throt_reason], + pmsr_pmax); } else if (chips[i].throttled) { chips[i].throttled = false; - pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, - chips[i].id, pmsr_pmax); + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throt_reason], + pmsr_pmax); } +} + +static void powernv_cpufreq_throttle_check(void *data) +{ + unsigned long pmsr; + + pmsr = get_pmspr(SPRN_PMSR); + + /* Check for Pmax Capping */ + powernv_cpufreq_check_pmax(); /* Check if Psafe_mode_active is set in PMSR. */ -next: if (pmsr & PMSR_PSAFE_ENABLE) { throttled = true; pr_info("Pstate set to safe frequency\n"); @@ -358,7 +377,7 @@ next: if (throttled) { pr_info("PMSR = %16lx\n", pmsr); - pr_crit("CPU Frequency could be throttled\n"); + pr_warn("CPU Frequency could be throttled\n"); } } @@ -449,15 +468,6 @@ void powernv_cpufreq_work_fn(struct work_struct *work) } } -static char throttle_reason[][30] = { - "No throttling", - "Power Cap", - "Processor Over Temperature", -
[PATCH v4 4/4] cpufreq: powernv: Add sysfs attributes to show throttle stats
Create sysfs attributes to export throttle information in /sys/devices/system/cpu/cpufreq/chipN. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies This gives the throttle stats for each of the available frequencies. The throttle stat of a frequency is the total number of times the max frequency is reduced to that frequency. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies 4023000 0 3990000 0 3956000 1 3923000 0 3890000 0 3857000 2 3823000 0 3790000 0 3757000 2 3724000 1 3690000 1 ... 2)/sys/devices/system/cpu/cpufreq/chip0/throttle_reasons This directory contains throttle reason files. Each file gives the total number of times the max frequency is throttled, except for 'throttle_reset', which gives the total number of times the max frequency is unthrottled after being throttled. # cd /sys/devices/system/cpu/cpufreq/chip0/throttle_reasons # cat cpu_over_temperature 7 # cat occ_reset 0 # cat over_current 0 # cat power_cap 0 # cat power_supply_failure 0 # cat throttle_reset 7 3)/sys/devices/system/cpu/cpufreq/chip0/throttle_stat This gives the total number of events of max frequency throttling to lower frequencies in the turbo range of frequencies and the sub-turbo(at and below nominal) range of frequencies. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_stat turbo 7 sub-turbo 0 Signed-off-by: Shilpasri G Bhat --- Changes from v3: - Separate the patch to contain only the throttle sysfs attribute changes. - Add helper inline function get_chip_index() Changes from v2: - Fixed kbuild test warning. drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Changes from v1: - Added a kobject to struct chip - Grouped the throttle reasons under a separate attribute_group and exported each reason as an individual file. 
- Moved the sysfs files from /sys/devices/system/node/nodeN to /sys/devices/system/cpu/cpufreq/chipN - As suggested by Paul Clarke replaced 'Nominal' with 'sub-turbo'. - Modified the commit message. drivers/cpufreq/powernv-cpufreq.c | 177 +- 1 file changed, 173 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index c98a6e7..40ccd9d 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -54,6 +54,16 @@ static const char * const throttle_reason[] = { "OCC Reset" }; +enum throt_reason_type { + NO_THROTTLE = 0, + POWERCAP, + CPU_OVERTEMP, + POWER_SUPPLY_FAILURE, + OVERCURRENT, + OCC_RESET_THROTTLE, + OCC_MAX_REASON +}; + static struct chip { unsigned int id; bool throttled; @@ -61,6 +71,11 @@ static struct chip { u8 throt_reason; cpumask_t mask; struct work_struct throttle; + int throt_turbo; + int throt_nominal; + int reason[OCC_MAX_REASON]; + int *pstate_stat; + struct kobject *kobj; } *chips; static int nr_chips; @@ -195,6 +210,113 @@ static struct freq_attr *powernv_cpu_freq_attr[] = { NULL, }; +static inline int get_chip_index(struct kobject *kobj) +{ + int i, id; + + i = kstrtoint(kobj->name + 4, 0, &id); + if (i) + return i; + + for (i = 0; i < nr_chips; i++) + if (chips[i].id == id) + return i; + return -EINVAL; +} + +static ssize_t throttle_freq_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, count = 0, id; + + id = get_chip_index(kobj); + if (id < 0) + return id; + + for (i = 0; i < powernv_pstate_info.nr_pstates; i++) + count += sprintf(&buf[count], "%d %d\n", + powernv_freqs[i].frequency, + chips[id].pstate_stat[i]); + + return count; +} + +static struct kobj_attribute attr_throttle_frequencies = +__ATTR(throttle_frequencies, 0444, throttle_freq_show, NULL); + +static ssize_t throttle_stat_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int id, count = 0; + + id = get_chip_index(kobj); + if (id < 0) + 
return id; + + count += sprintf(&buf[count], "turbo %d\n", chips[id].throt_turbo); + count += sprintf(&buf[count], "sub-turbo %d\n", + chips[id].throt_nominal); + + return count; +} + +static struct kobj_attribute attr_throttle_stat = +__ATTR(throttle_stat, 0444, throttle_stat_show, NULL); + +#define define_th
[PATCH RESEND v4 0/4] cpufreq: powernv: Redesign the presentation of throttle notification
In POWER8, OCC(On-Chip-Controller) can throttle the frequency of the CPU when the chip crosses its thermal and power limits. Currently, powernv-cpufreq driver detects and reports this event as a console message. Some machines may not sustain the max turbo frequency in all conditions and can be throttled frequently. This can lead to the flooding of console with throttle messages. So this patchset aims to redesign the presentation of this event via sysfs counters and tracepoints. Patches [2] to [4] will add a perf trace point "power:powernv_throttle" and sysfs throttle counter stats in /sys/devices/system/cpu/cpufreq/chipN. Patch [1] solves a bug in powernv_cpufreq_throttle_check(), which calls into cpu_to_chip_id() in the hot path; cpu_to_chip_id() reads the DT every time to find the chip id. Resending the patchset as I had cc'ed sta...@vger.kernel.org in the development cycle and used --in-reply-to to post a new version. Changes from v3: - Add a fix to replace cpu_to_chip_id() with simpler PIR shift to obtain the chip id. - Break patch2 into two patches separating the tracepoint and sysfs attribute changes. Changes from v2: - Fixed kbuild test warning. drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Shilpasri G Bhat (4): cpufreq: powernv: Remove cpu_to_chip_id() from hot-path cpufreq: powernv/tracing: Add powernv_throttle tracepoint cpufreq: powernv: Add a trace print for the throttle event cpufreq: powernv: Add sysfs attributes to show throttle stats drivers/cpufreq/powernv-cpufreq.c | 279 +++--- include/trace/events/power.h | 22 +++ kernel/trace/power-traces.c | 1 + 3 files changed, 250 insertions(+), 52 deletions(-) -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH RESEND v4 1/4] cpufreq: powernv: Remove cpu_to_chip_id() from hot-path
cpu_to_chip_id() does a DT walk through to find out the chip id by taking a contended device tree lock. This adds an unnecessary overhead in a hot-path. So instead of cpu_to_chip_id() use PIR of the cpu to find the chip id. Reported-by: Anton Blanchard Signed-off-by: Shilpasri G Bhat --- drivers/cpufreq/powernv-cpufreq.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index cb50138..597a084 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -39,6 +39,7 @@ #define PMSR_PSAFE_ENABLE (1UL << 30) #define PMSR_SPR_EM_DISABLE(1UL << 31) #define PMSR_MAX(x)((x >> 32) & 0xFF) +#define pir_to_chip_id(pir)(((pir) >> 7) & 0x3f) static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; @@ -312,13 +313,14 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { unsigned int cpu = smp_processor_id(); + unsigned int chip_id = pir_to_chip_id(hard_smp_processor_id()); unsigned long pmsr; int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); for (i = 0; i < nr_chips; i++) - if (chips[i].id == cpu_to_chip_id(cpu)) + if (chips[i].id == chip_id) break; /* Check for Pmax Capping */ @@ -558,7 +560,8 @@ static int init_chip_info(void) unsigned int prev_chip_id = UINT_MAX; for_each_possible_cpu(cpu) { - unsigned int id = cpu_to_chip_id(cpu); + unsigned int id = + pir_to_chip_id(get_hard_smp_processor_id(cpu)); if (prev_chip_id != id) { prev_chip_id = id; -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH RESEND v4 3/4] cpufreq: powernv: Add a trace print for the throttle event
Record the throttle event with a trace print replacing the printk, except for events like throttling below nominal and occ reset event which print a warning message. Signed-off-by: Shilpasri G Bhat --- Changes from v3: - Separate this patch to contain trace_point changes - Move struct chip member 'restore' of type bool above 'mask' to reduce structure padding. No changes from v2. Changes from v1: - As suggested by Paul Clarke replaced char * throttle_reason[][30] by const char * const throttle_reason[]. drivers/cpufreq/powernv-cpufreq.c | 95 --- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 597a084..c98a6e7 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -44,12 +45,22 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; +static const char * const throttle_reason[] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + static struct chip { unsigned int id; bool throttled; + bool restore; + u8 throt_reason; cpumask_t mask; struct work_struct throttle; - bool restore; } *chips; static int nr_chips; @@ -310,41 +321,49 @@ static inline unsigned int get_nominal_index(void) return powernv_pstate_info.max - powernv_pstate_info.nominal; } -static void powernv_cpufreq_throttle_check(void *data) +static void powernv_cpufreq_check_pmax(void) { unsigned int cpu = smp_processor_id(); unsigned int chip_id = pir_to_chip_id(hard_smp_processor_id()); - unsigned long pmsr; int pmsr_pmax, i; - pmsr = get_pmspr(SPRN_PMSR); + pmsr_pmax = (s8)PMSR_MAX(get_pmspr(SPRN_PMSR)); for (i = 0; i < nr_chips; i++) if (chips[i].id == chip_id) break; - /* Check for Pmax Capping */ - pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != 
powernv_pstate_info.max) { if (chips[i].throttled) - goto next; + return; + chips[i].throttled = true; if (pmsr_pmax < powernv_pstate_info.nominal) - pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.nominal); - else - pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.max); + pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", +cpu, chips[i].id, pmsr_pmax, +powernv_pstate_info.nominal); + + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throt_reason], + pmsr_pmax); } else if (chips[i].throttled) { chips[i].throttled = false; - pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, - chips[i].id, pmsr_pmax); + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throt_reason], + pmsr_pmax); } +} + +static void powernv_cpufreq_throttle_check(void *data) +{ + unsigned long pmsr; + + pmsr = get_pmspr(SPRN_PMSR); + + /* Check for Pmax Capping */ + powernv_cpufreq_check_pmax(); /* Check if Psafe_mode_active is set in PMSR. */ -next: if (pmsr & PMSR_PSAFE_ENABLE) { throttled = true; pr_info("Pstate set to safe frequency\n"); @@ -358,7 +377,7 @@ next: if (throttled) { pr_info("PMSR = %16lx\n", pmsr); - pr_crit("CPU Frequency could be throttled\n"); + pr_warn("CPU Frequency could be throttled\n"); } } @@ -449,15 +468,6 @@ void powernv_cpufreq_work_fn(struct work_struct *work) } } -static char throttle_reason[][30] = { - "No throttling", - "Power Cap", - "Processor Over Temperature", -
[PATCH RESEND v4 2/4] cpufreq: powernv/tracing: Add powernv_throttle tracepoint
This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat CC: Ingo Molnar CC: Steven Rostedt --- No changes from v2 and v3. include/trace/events/power.h | 22 ++ kernel/trace/power-traces.c | 1 + 2 files changed, 23 insertions(+) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244e..19e5030 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle, TP_ARGS(state, cpu_id) ); +TRACE_EVENT(powernv_throttle, + + TP_PROTO(int chip_id, const char *reason, int pmax), + + TP_ARGS(chip_id, reason, pmax), + + TP_STRUCT__entry( + __field(int, chip_id) + __string(reason, reason) + __field(int, pmax) + ), + + TP_fast_assign( + __entry->chip_id = chip_id; + __assign_str(reason, reason); + __entry->pmax = pmax; + ), + + TP_printk("Chip %d Pmax %d %s", __entry->chip_id, + __entry->pmax, __get_str(reason)) +); + TRACE_EVENT(pstate_sample, TP_PROTO(u32 core_busy, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a..81b8745 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH RESEND v4 4/4] cpufreq: powernv: Add sysfs attributes to show throttle stats
Create sysfs attributes to export throttle information in /sys/devices/system/cpu/cpufreq/chipN. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies This gives the throttle stats for each of the available frequencies. The throttle stat of a frequency is the total number of times the max frequency is reduced to that frequency. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies 4023000 0 399 0 3956000 1 3923000 0 389 0 3857000 2 3823000 0 379 0 3757000 2 3724000 1 369 1 ... 2)/sys/devices/system/cpu/cpufreq/chip0/throttle_reasons This directory contains throttle reason files. Each file gives the total number of times the max frequency is throttled, except for 'throttle_reset', which gives the total number of times the max frequency is unthrottled after being throttled. # cd /sys/devices/system/cpu/cpufreq/chip0/throttle_reasons # cat cpu_over_temperature 7 # cat occ_reset 0 # cat over_current 0 # cat power_cap 0 # cat power_supply_failure 0 # cat throttle_reset 7 3)/sys/devices/system/cpu/cpufreq/chip0/throttle_stat This gives the total number of events of max frequency throttling to lower frequencies in the turbo range of frequencies and the sub-turbo(at and below nominal) range of frequencies. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_stat turbo 7 sub-turbo 0 Signed-off-by: Shilpasri G Bhat --- Changes from v3: - Separate the patch to contain only the throttle sysfs attribute changes. - Add helper inline function get_chip_index() Changes from v2: - Fixed kbuild test warning. drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Changes from v1: - Added a kobject to struct chip - Grouped the throttle reasons under a separate attribute_group and exported each reason as individual file. 
- Moved the sysfs files from /sys/devices/system/node/nodeN to /sys/devices/system/cpu/cpufreq/chipN - As suggested by Paul Clarke replaced 'Nominal' with 'sub-turbo'. - Modified the commit message. drivers/cpufreq/powernv-cpufreq.c | 177 +- 1 file changed, 173 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index c98a6e7..40ccd9d 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -54,6 +54,16 @@ static const char * const throttle_reason[] = { "OCC Reset" }; +enum throt_reason_type { + NO_THROTTLE = 0, + POWERCAP, + CPU_OVERTEMP, + POWER_SUPPLY_FAILURE, + OVERCURRENT, + OCC_RESET_THROTTLE, + OCC_MAX_REASON +}; + static struct chip { unsigned int id; bool throttled; @@ -61,6 +71,11 @@ static struct chip { u8 throt_reason; cpumask_t mask; struct work_struct throttle; + int throt_turbo; + int throt_nominal; + int reason[OCC_MAX_REASON]; + int *pstate_stat; + struct kobject *kobj; } *chips; static int nr_chips; @@ -195,6 +210,113 @@ static struct freq_attr *powernv_cpu_freq_attr[] = { NULL, }; +static inline int get_chip_index(struct kobject *kobj) +{ + int i, id; + + i = kstrtoint(kobj->name + 4, 0, &id); + if (i) + return i; + + for (i = 0; i < nr_chips; i++) + if (chips[i].id == id) + return i; + return -EINVAL; +} + +static ssize_t throttle_freq_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, count = 0, id; + + id = get_chip_index(kobj); + if (id < 0) + return id; + + for (i = 0; i < powernv_pstate_info.nr_pstates; i++) + count += sprintf(&buf[count], "%d %d\n", + powernv_freqs[i].frequency, + chips[id].pstate_stat[i]); + + return count; +} + +static struct kobj_attribute attr_throttle_frequencies = +__ATTR(throttle_frequencies, 0444, throttle_freq_show, NULL); + +static ssize_t throttle_stat_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int id, count = 0; + + id = get_chip_index(kobj); + if (id < 0) + 
return id; + + count += sprintf(&buf[count], "turbo %d\n", chips[id].throt_turbo); + count += sprintf(&buf[count], "sub-turbo %d\n", + chips[id].throt_nominal); + + return count; +} + +static struct kobj_attribute attr_throttle_stat = +__ATTR(throttle_stat, 0444, throttle_stat_show, NULL); + +#define define_th
[PATCH v5 0/5] cpufreq: powernv: Redesign the presentation of throttle notification and solve bug-fixes in the driver
In POWER8, OCC(On-Chip-Controller) can throttle the frequency of the CPU when the chip crosses its thermal and power limits. Currently, powernv-cpufreq driver detects and reports this event as a console message. Some machines may not sustain the max turbo frequency in all conditions and can be throttled frequently. This can lead to the flooding of console with throttle messages. So this patchset aims to redesign the presentation of this event via sysfs counters and tracepoints. And it also fixes a couple of bugs reported in the driver. - Patch [1] fixes the cpu hot-plug bug in powernv_cpufreq_work_fn(). - Patch [2] solves a bug in powernv_cpufreq_throttle_check(), which calls in to cpu_to_chip_id() in hot path which reads DT every time to find the chip id. - Patches [3] to [5] will add a perf trace point "power:powernv_throttle" and sysfs throttle counter stats in /sys/devices/system/cpu/cpufreq/chipN. Changes from v4: - Fix a hot-plug bug in powernv_cpufreq_work_fn() - Changes wrt Gautham's and Shreyas's comments Changes from v3: - Add a fix to replace cpu_to_chip_id() with simpler PIR shift to obtain the chip id. - Break patch2 in to two patches separating the tracepoint and sysfs attribute changes. Changes from v2: - Fixed kbuild test warning. 
drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Shilpasri G Bhat (5): cpufreq: powernv: Hot-plug safe the kworker thread cpufreq: powernv: Remove cpu_to_chip_id() from hot-path cpufreq: powernv/tracing: Add powernv_throttle tracepoint cpufreq: powernv: Replace pr_info with trace print for throttle event cpufreq: powernv: Add sysfs attributes to show throttle stats drivers/cpufreq/powernv-cpufreq.c | 312 +++--- include/trace/events/power.h | 22 +++ kernel/trace/power-traces.c | 1 + 3 files changed, 280 insertions(+), 55 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 1/5] cpufreq: powernv: Hot-plug safe the kworker thread
In the kworker_thread powernv_cpufreq_work_fn(), we can end up sending an IPI to a cpu going offline. This is a rare corner case which is fixed using {get/put}_online_cpus(). Along with this fix, this patch adds changes to do oneshot cpumask_{clear/and} operation. Suggested-by: Shreyas B Prabhu Suggested-by: Gautham R Shenoy Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy --- drivers/cpufreq/powernv-cpufreq.c | 18 ++ 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 547890f..50a5b21 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -423,18 +423,19 @@ void powernv_cpufreq_work_fn(struct work_struct *work) { struct chip *chip = container_of(work, struct chip, throttle); unsigned int cpu; - cpumask_var_t mask; + cpumask_t mask; - smp_call_function_any(&chip->mask, + get_online_cpus(); + cpumask_and(&mask, &chip->mask, cpu_online_mask); + smp_call_function_any(&mask, powernv_cpufreq_throttle_check, NULL, 0); if (!chip->restore) - return; + goto out; chip->restore = false; - cpumask_copy(mask, &chip->mask); - for_each_cpu_and(cpu, mask, cpu_online_mask) { - int index, tcpu; + for_each_cpu(cpu, &mask) { + int index; struct cpufreq_policy policy; cpufreq_get_policy(&policy, cpu); @@ -442,9 +443,10 @@ void powernv_cpufreq_work_fn(struct work_struct *work) policy.cur, CPUFREQ_RELATION_C, &index); powernv_cpufreq_target_index(&policy, index); - for_each_cpu(tcpu, policy.cpus) - cpumask_clear_cpu(tcpu, mask); + cpumask_andnot(&mask, &mask, policy.cpus); } +out: + put_online_cpus(); } static char throttle_reason[][30] = { -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 2/5] cpufreq: powernv: Remove cpu_to_chip_id() from hot-path
cpu_to_chip_id() does a DT walk through to find out the chip id by taking a contended device tree lock. This adds an unnecessary overhead in a hot path. So instead of calling cpu_to_chip_id() every time, cache the chip ids for all cores in the array 'core_to_chip_map' and use it in the hotpath. Reported-by: Anton Blanchard Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy --- Changes from v4: - Taken care of Shreyas's comments to add a core_to_chip_map array to store the chip id. drivers/cpufreq/powernv-cpufreq.c | 24 +--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 50a5b21..5ea103d 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -42,6 +42,7 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; +static unsigned int *core_to_chip_map; static struct chip { unsigned int id; @@ -312,13 +313,14 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { unsigned int cpu = smp_processor_id(); + unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)]; unsigned long pmsr; int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); for (i = 0; i < nr_chips; i++) - if (chips[i].id == cpu_to_chip_id(cpu)) + if (chips[i].id == chip_id) break; /* Check for Pmax Capping */ @@ -558,19 +560,29 @@ static int init_chip_info(void) unsigned int chip[256]; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; + cpumask_t cpu_mask; + int ret = -ENOMEM; - for_each_possible_cpu(cpu) { + cpumask_copy(&cpu_mask, cpu_possible_mask); + core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int), + GFP_KERNEL); + if (!core_to_chip_map) + goto out; + + for_each_cpu(cpu, &cpu_mask) { unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; chip[nr_chips++] = id; } + 
core_to_chip_map[cpu_core_index_of_thread(cpu)] = id; + cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu)); } chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) - return -ENOMEM; + goto free_chip_map; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; @@ -581,6 +593,10 @@ static int init_chip_info(void) } return 0; +free_chip_map: + kfree(core_to_chip_map); +out: + return ret; } static int __init powernv_cpufreq_init(void) @@ -614,6 +630,8 @@ static void __exit powernv_cpufreq_exit(void) unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); opal_message_notifier_unregister(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); + kfree(chips); + kfree(core_to_chip_map); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 3/5] cpufreq: powernv/tracing: Add powernv_throttle tracepoint
This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy CC: Ingo Molnar CC: Steven Rostedt --- No changes since v2. include/trace/events/power.h | 22 ++ kernel/trace/power-traces.c | 1 + 2 files changed, 23 insertions(+) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244e..19e5030 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle, TP_ARGS(state, cpu_id) ); +TRACE_EVENT(powernv_throttle, + + TP_PROTO(int chip_id, const char *reason, int pmax), + + TP_ARGS(chip_id, reason, pmax), + + TP_STRUCT__entry( + __field(int, chip_id) + __string(reason, reason) + __field(int, pmax) + ), + + TP_fast_assign( + __entry->chip_id = chip_id; + __assign_str(reason, reason); + __entry->pmax = pmax; + ), + + TP_printk("Chip %d Pmax %d %s", __entry->chip_id, + __entry->pmax, __get_str(reason)) +); + TRACE_EVENT(pstate_sample, TP_PROTO(u32 core_busy, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a..81b8745 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v5 4/5] cpufreq: powernv: Replace pr_info with trace print for throttle event
Currently we use printk message to notify the throttle event. But this can flood the console if the cpu is throttled frequently. So replace the printk with the tracepoint to notify the throttle event. And also events like throttle below nominal frequency and OCC_RESET are reduced to pr_warn/pr_warn_once as pointed out by MFG to not mark them as critical messages. This patch adds 'throt_reason' to struct chip to store the throttle reason. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy --- Changes from v4: - Taken care of Gautham's comments to remove the new function powernv_cpufreq_check_pmax() - Modified commit message Changes from v3: - Separate this patch to contain trace_point changes - Move struct chip member 'restore' of type bool above 'mask' to reduce structure padding. No changes from v2. Changes from v1: - As suggested by Paul Clarke replaced char * throttle_reason[][30] by const char * const throttle_reason[]. drivers/cpufreq/powernv-cpufreq.c | 73 ++- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 5ea103d..0ed7d82 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -44,12 +45,22 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; static unsigned int *core_to_chip_map; +static const char * const throttle_reason[] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + static struct chip { unsigned int id; bool throttled; + bool restore; + u8 throt_reason; cpumask_t mask; struct work_struct throttle; - bool restore; } *chips; static int nr_chips; @@ -330,17 +341,17 @@ static void powernv_cpufreq_throttle_check(void *data) goto next; chips[i].throttled = true; if (pmsr_pmax < 
powernv_pstate_info.nominal) - pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.nominal); - else - pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.max); + pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", +cpu, chips[i].id, pmsr_pmax, +powernv_pstate_info.nominal); + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throt_reason], + pmsr_pmax); } else if (chips[i].throttled) { chips[i].throttled = false; - pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, - chips[i].id, pmsr_pmax); + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throt_reason], + pmsr_pmax); } /* Check if Psafe_mode_active is set in PMSR. */ @@ -358,7 +369,7 @@ next: if (throttled) { pr_info("PMSR = %16lx\n", pmsr); - pr_crit("CPU Frequency could be throttled\n"); + pr_warn("CPU Frequency could be throttled\n"); } } @@ -451,15 +462,6 @@ out: put_online_cpus(); } -static char throttle_reason[][30] = { - "No throttling", - "Power Cap", - "Processor Over Temperature", - "Power Supply Failure", - "Over Current", - "OCC Reset" -}; - static int powernv_cpufreq_occ_msg(struct notifier_block *nb, unsigned long msg_type, void *_msg) { @@ -485,7 +487,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, */ if (!throttled) { throttled = true; - pr_crit("CPU frequency is throttled for duration\n"); + pr_warn
[PATCH v5 5/5] cpufreq: powernv: Add sysfs attributes to show throttle stats
Create sysfs attributes to export throttle information in /sys/devices/system/cpu/cpufreq/chipN. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies This gives the throttle stats for each of the available frequencies. The throttle stat of a frequency is the total number of times the max frequency is reduced to that frequency. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies 4023000 0 399 0 3956000 1 3923000 0 389 0 3857000 2 3823000 0 379 0 3757000 2 3724000 1 369 1 ... 2)/sys/devices/system/cpu/cpufreq/chip0/throttle_reasons This directory contains throttle reason files. Each file gives the total number of times the max frequency is throttled, except for 'throttle_reset', which gives the total number of times the max frequency is unthrottled after being throttled. # cd /sys/devices/system/cpu/cpufreq/chip0/throttle_reasons # cat cpu_over_temperature 7 # cat occ_reset 0 # cat over_current 0 # cat power_cap 0 # cat power_supply_failure 0 # cat throttle_reset 7 3)/sys/devices/system/cpu/cpufreq/chip0/throttle_stat This gives the total number of events of max frequency throttling to lower frequencies in the turbo range of frequencies and the sub-turbo(at and below nominal) range of frequencies. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_stat turbo 7 sub-turbo 0 Signed-off-by: Shilpasri G Bhat --- Changes from v4: - Taken care of Gautham's comments to use inline get_chip_index() Changes from v3: - Separate the patch to contain only the throttle sysfs attribute changes. - Add helper inline function get_chip_index() Changes from v2: - Fixed kbuild test warning. drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Changes from v1: - Added a kobject to struct chip - Grouped the throttle reasons under a separate attribute_group and exported each reason as individual file. 
- Moved the sysfs files from /sys/devices/system/node/nodeN to /sys/devices/system/cpu/cpufreq/chipN - As suggested by Paul Clarke replaced 'Nominal' with 'sub-turbo'. - Modified the commit message. drivers/cpufreq/powernv-cpufreq.c | 205 -- 1 file changed, 196 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 0ed7d82..ecda5f7 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -54,6 +54,16 @@ static const char * const throttle_reason[] = { "OCC Reset" }; +enum throt_reason_type { + NO_THROTTLE = 0, + POWERCAP, + CPU_OVERTEMP, + POWER_SUPPLY_FAILURE, + OVERCURRENT, + OCC_RESET_THROTTLE, + OCC_MAX_REASON +}; + static struct chip { unsigned int id; bool throttled; @@ -61,6 +71,11 @@ static struct chip { u8 throt_reason; cpumask_t mask; struct work_struct throttle; + int throt_turbo; + int throt_nominal; + int reason[OCC_MAX_REASON]; + int *pstate_stat; + struct kobject *kobj; } *chips; static int nr_chips; @@ -195,6 +210,128 @@ static struct freq_attr *powernv_cpu_freq_attr[] = { NULL, }; +static inline int get_chip_index(unsigned int id) +{ + int i; + + for (i = 0; i < nr_chips; i++) + if (chips[i].id == id) + return i; + + return -EINVAL; +} + +static ssize_t throttle_freq_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, count = 0, id; + + i = kstrtoint(kobj->name + 4, 0, &id); + if (i) + return i; + + id = get_chip_index(id); + if (id < 0) { + pr_warn_once("%s Matching chip-id not found\n", __func__); + return id; + } + + for (i = 0; i < powernv_pstate_info.nr_pstates; i++) + count += sprintf(&buf[count], "%d %d\n", + powernv_freqs[i].frequency, + chips[id].pstate_stat[i]); + + return count; +} + +static struct kobj_attribute attr_throttle_frequencies = +__ATTR(throttle_frequencies, 0444, throttle_freq_show, NULL); + +static ssize_t throttle_stat_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int ret, 
id, count = 0; + + ret = kstrtoint(kobj->name + 4, 0, &id); + if (ret) + return ret; + + id = get_chip_index(id); + if (id < 0) { + pr_warn_once("%s Matching chip-id not found\n", __func__); + return id; + } + + count
[PATCH v6 0/5] cpufreq: powernv: Redesign the presentation of throttle notification and solve bug-fixes in the driver
In POWER8, OCC(On-Chip-Controller) can throttle the frequency of the CPU when the chip crosses its thermal and power limits. Currently, powernv-cpufreq driver detects and reports this event as a console message. Some machines may not sustain the max turbo frequency in all conditions and can be throttled frequently. This can lead to the flooding of console with throttle messages. So this patchset aims to redesign the presentation of this event via sysfs counters and tracepoints. And it also fixes a couple of bugs reported in the driver. - Patch [1] fixes the cpu hot-plug bug in powernv_cpufreq_work_fn(). - Patch [2] solves a bug in powernv_cpufreq_throttle_check(), which calls in to cpu_to_chip_id() in hot path which reads DT every time to find the chip id. - Patches [3] to [5] will add a perf trace point "power:powernv_throttle" and sysfs throttle counter stats in /sys/devices/system/cpu/cpufreq/chipN. Changes from v5: - Fix kbuild error: drivers/cpufreq/powernv-cpufreq.c:428:2: error: implicit declaration of function 'get_online_cpus' [-Werror=implicit-function-declaration] Changes from v4: - Fix a hot-plug bug in powernv_cpufreq_work_fn() - Changes wrt Gautham's and Shreyas's comments Changes from v3: - Add a fix to replace cpu_to_chip_id() with simpler PIR shift to obtain the chip id. - Break patch2 in to two patches separating the tracepoint and sysfs attribute changes. Changes from v2: - Fixed kbuild test warning. 
drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Shilpasri G Bhat (5): cpufreq: powernv: Hot-plug safe the kworker thread cpufreq: powernv: Remove cpu_to_chip_id() from hot-path cpufreq: powernv/tracing: Add powernv_throttle tracepoint cpufreq: powernv: Replace pr_info with trace print for throttle event cpufreq: powernv: Add sysfs attributes to show throttle stats drivers/cpufreq/powernv-cpufreq.c | 313 +++--- include/trace/events/power.h | 22 +++ kernel/trace/power-traces.c | 1 + 3 files changed, 281 insertions(+), 55 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v6 1/5] cpufreq: powernv: Hot-plug safe the kworker thread
In the kworker_thread powernv_cpufreq_work_fn(), we can end up sending an IPI to a cpu going offline. This is a rare corner case which is fixed using {get/put}_online_cpus(). Along with this fix, this patch adds changes to do oneshot cpumask_{clear/and} operation. Suggested-by: Shreyas B Prabhu Suggested-by: Gautham R Shenoy Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy --- Changes from v5: - Fix the kbuild-error: drivers/cpufreq/powernv-cpufreq.c:428:2: error: implicit declaration of function 'get_online_cpus' [-Werror=implicit-function-declaration] drivers/cpufreq/powernv-cpufreq.c | 19 +++ 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 547890f..140c75f 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -423,18 +424,19 @@ void powernv_cpufreq_work_fn(struct work_struct *work) { struct chip *chip = container_of(work, struct chip, throttle); unsigned int cpu; - cpumask_var_t mask; + cpumask_t mask; - smp_call_function_any(&chip->mask, + get_online_cpus(); + cpumask_and(&mask, &chip->mask, cpu_online_mask); + smp_call_function_any(&mask, powernv_cpufreq_throttle_check, NULL, 0); if (!chip->restore) - return; + goto out; chip->restore = false; - cpumask_copy(mask, &chip->mask); - for_each_cpu_and(cpu, mask, cpu_online_mask) { - int index, tcpu; + for_each_cpu(cpu, &mask) { + int index; struct cpufreq_policy policy; cpufreq_get_policy(&policy, cpu); @@ -442,9 +444,10 @@ void powernv_cpufreq_work_fn(struct work_struct *work) policy.cur, CPUFREQ_RELATION_C, &index); powernv_cpufreq_target_index(&policy, index); - for_each_cpu(tcpu, policy.cpus) - cpumask_clear_cpu(tcpu, mask); + cpumask_andnot(&mask, &mask, policy.cpus); } +out: + put_online_cpus(); } static char throttle_reason[][30] = { -- 1.9.3 ___ Linuxppc-dev mailing list 
Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v6 2/5] cpufreq: powernv: Remove cpu_to_chip_id() from hot-path
cpu_to_chip_id() does a DT walk through to find out the chip id by taking a contended device tree lock. This adds an unnecessary overhead in a hot path. So instead of calling cpu_to_chip_id() everytime cache the chip ids for all cores in the array 'core_to_chip_map' and use it in the hotpath. Reported-by: Anton Blanchard Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy --- No changes from v5. Changes from v4: - Taken care of Shreyas's comments to add a core_to_chip_map array to store the chip id. drivers/cpufreq/powernv-cpufreq.c | 24 +--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 140c75f..6f186dc 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -43,6 +43,7 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; +static unsigned int *core_to_chip_map; static struct chip { unsigned int id; @@ -313,13 +314,14 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { unsigned int cpu = smp_processor_id(); + unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)]; unsigned long pmsr; int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); for (i = 0; i < nr_chips; i++) - if (chips[i].id == cpu_to_chip_id(cpu)) + if (chips[i].id == chip_id) break; /* Check for Pmax Capping */ @@ -559,19 +561,29 @@ static int init_chip_info(void) unsigned int chip[256]; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; + cpumask_t cpu_mask; + int ret = -ENOMEM; - for_each_possible_cpu(cpu) { + cpumask_copy(&cpu_mask, cpu_possible_mask); + core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int), + GFP_KERNEL); + if (!core_to_chip_map) + goto out; + + for_each_cpu(cpu, &cpu_mask) { unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; chip[nr_chips++] = id; } + 
core_to_chip_map[cpu_core_index_of_thread(cpu)] = id; + cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu)); } chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) - return -ENOMEM; + goto free_chip_map; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; @@ -582,6 +594,10 @@ static int init_chip_info(void) } return 0; +free_chip_map: + kfree(core_to_chip_map); +out: + return ret; } static int __init powernv_cpufreq_init(void) @@ -615,6 +631,8 @@ static void __exit powernv_cpufreq_exit(void) unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); opal_message_notifier_unregister(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); + kfree(chips); + kfree(core_to_chip_map); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v6 4/5] cpufreq: powernv: Replace pr_info with trace print for throttle event
Currently we use printk message to notify the throttle event. But this can flood the console if the cpu is throttled frequently. So replace the printk with the tracepoint to notify the throttle event. And also events like throttle below nominal frequency and OCC_RESET are reduced to pr_warn/pr_warn_once as pointed by MFG to not mark them as critical messages. This patch adds 'throt_reason' to struct chip to store the throttle reason. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy --- No changes from v5. Changes from v4: - Taken care of Gautham's comments to remove the new function powernv_cpufreq_check_pmax() - Modified commit message Changes from v3: - Separate this patch to contain trace_point changes - Move struct chip member 'restore' of type bool above 'mask' to reduce structure padding. No changes from v2. Changes from v1: - As suggested by Paul Clarke replaced char * throttle_reason[][30] by const char * const throttle_reason[]. drivers/cpufreq/powernv-cpufreq.c | 73 ++- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 6f186dc..2d09274 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -45,12 +46,22 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; static unsigned int *core_to_chip_map; +static const char * const throttle_reason[] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + static struct chip { unsigned int id; bool throttled; + bool restore; + u8 throt_reason; cpumask_t mask; struct work_struct throttle; - bool restore; } *chips; static int nr_chips; @@ -331,17 +342,17 @@ static void powernv_cpufreq_throttle_check(void *data) goto next; chips[i].throttled = true; if (pmsr_pmax < 
powernv_pstate_info.nominal) - pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.nominal); - else - pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.max); + pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", +cpu, chips[i].id, pmsr_pmax, +powernv_pstate_info.nominal); + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throt_reason], + pmsr_pmax); } else if (chips[i].throttled) { chips[i].throttled = false; - pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, - chips[i].id, pmsr_pmax); + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throt_reason], + pmsr_pmax); } /* Check if Psafe_mode_active is set in PMSR. */ @@ -359,7 +370,7 @@ next: if (throttled) { pr_info("PMSR = %16lx\n", pmsr); - pr_crit("CPU Frequency could be throttled\n"); + pr_warn("CPU Frequency could be throttled\n"); } } @@ -452,15 +463,6 @@ out: put_online_cpus(); } -static char throttle_reason[][30] = { - "No throttling", - "Power Cap", - "Processor Over Temperature", - "Power Supply Failure", - "Over Current", - "OCC Reset" -}; - static int powernv_cpufreq_occ_msg(struct notifier_block *nb, unsigned long msg_type, void *_msg) { @@ -486,7 +488,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, */ if (!throttled) { throttled = true; - pr_crit("CPU frequency is throttled for duration\n
[PATCH v6 3/5] cpufreq: powernv/tracing: Add powernv_throttle tracepoint
This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy CC: Ingo Molnar CC: Steven Rostedt --- No changes since v2. include/trace/events/power.h | 22 ++ kernel/trace/power-traces.c | 1 + 2 files changed, 23 insertions(+) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244e..19e5030 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle, TP_ARGS(state, cpu_id) ); +TRACE_EVENT(powernv_throttle, + + TP_PROTO(int chip_id, const char *reason, int pmax), + + TP_ARGS(chip_id, reason, pmax), + + TP_STRUCT__entry( + __field(int, chip_id) + __string(reason, reason) + __field(int, pmax) + ), + + TP_fast_assign( + __entry->chip_id = chip_id; + __assign_str(reason, reason); + __entry->pmax = pmax; + ), + + TP_printk("Chip %d Pmax %d %s", __entry->chip_id, + __entry->pmax, __get_str(reason)) +); + TRACE_EVENT(pstate_sample, TP_PROTO(u32 core_busy, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a..81b8745 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v6 5/5] cpufreq: powernv: Add sysfs attributes to show throttle stats
Create sysfs attributes to export throttle information in /sys/devices/system/cpu/cpufreq/chipN. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies This gives the throttle stats for each of the available frequencies. The throttle stat of a frequency is the total number of times the max frequency is reduced to that frequency. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies 4023000 0 3990000 0 3956000 1 3923000 0 3890000 0 3857000 2 3823000 0 3790000 0 3757000 2 3724000 1 3690000 1 ... 2)/sys/devices/system/cpu/cpufreq/chip0/throttle_reasons This directory contains throttle reason files. Each file gives the total number of times the max frequency is throttled, except for 'throttle_reset', which gives the total number of times the max frequency is unthrottled after being throttled. # cd /sys/devices/system/cpu/cpufreq/chip0/throttle_reasons # cat cpu_over_temperature 7 # cat occ_reset 0 # cat over_current 0 # cat power_cap 0 # cat power_supply_failure 0 # cat throttle_reset 7 3)/sys/devices/system/cpu/cpufreq/chip0/throttle_stat This gives the total number of events of max frequency throttling to lower frequencies in the turbo range of frequencies and the sub-turbo(at and below nominal) range of frequencies. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_stat turbo 7 sub-turbo 0 Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy --- No changes from v5. Changes from v4: - Taken care of Gautham's comments to use inline get_chip_index() Changes from v3: - Separate the patch to contain only the throttle sysfs attribute changes. - Add helper inline function get_chip_index() Changes from v2: - Fixed kbuild test warning. 
drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Changes from v1: - Added a kobject to struct chip - Grouped the throttle reasons under a separate attribute_group and exported each reason as individual file. - Moved the sysfs files from /sys/devices/system/node/nodeN to /sys/devices/system/cpu/cpufreq/chipN - As suggested by Paul Clarke replaced 'Nominal' with 'sub-turbo'. drivers/cpufreq/powernv-cpufreq.c | 205 -- 1 file changed, 196 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 2d09274..7d65c82 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -55,6 +55,16 @@ static const char * const throttle_reason[] = { "OCC Reset" }; +enum throt_reason_type { + NO_THROTTLE = 0, + POWERCAP, + CPU_OVERTEMP, + POWER_SUPPLY_FAILURE, + OVERCURRENT, + OCC_RESET_THROTTLE, + OCC_MAX_REASON +}; + static struct chip { unsigned int id; bool throttled; @@ -62,6 +72,11 @@ static struct chip { u8 throt_reason; cpumask_t mask; struct work_struct throttle; + int throt_turbo; + int throt_nominal; + int reason[OCC_MAX_REASON]; + int *pstate_stat; + struct kobject *kobj; } *chips; static int nr_chips; @@ -196,6 +211,128 @@ static struct freq_attr *powernv_cpu_freq_attr[] = { NULL, }; +static inline int get_chip_index(unsigned int id) +{ + int i; + + for (i = 0; i < nr_chips; i++) + if (chips[i].id == id) + return i; + + return -EINVAL; +} + +static ssize_t throttle_freq_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, count = 0, id; + + i = kstrtoint(kobj->name + 4, 0, &id); + if (i) + return i; + + id = get_chip_index(id); + if (id < 0) { + pr_warn_once("%s Matching chip-id not found\n", __func__); + return id; + } + + for (i = 0; i < powernv_pstate_info.nr_pstates; i++) + count += sprintf(&buf[count], "%d %d\n", + powernv_freqs[i].frequency, + 
chips[id].pstate_stat[i]); + + return count; +} + +static struct kobj_attribute attr_throttle_frequencies = +__ATTR(throttle_frequencies, 0444, throttle_freq_show, NULL); + +static ssize_t throttle_stat_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int ret, id, count = 0; + + ret = kstrtoint(kobj->name + 4, 0, &id); + if (ret) + return ret; + + id = get_chip_index(id); + if (id < 0) { + pr_warn_once("%s Matching chip-id not found\n", __func__); + return id; + } + + count
Re: [PATCH v6 5/5] cpufreq: powernv: Add sysfs attributes to show throttle stats
Hi, On 01/23/2016 02:10 PM, Balbir Singh wrote: > On Fri, 22 Jan 2016 12:49:05 +0530 > Shilpasri G Bhat wrote: > >> Create sysfs attributes to export throttle information in >> /sys/devices/system/cpu/cpufreq/chipN. The newly added sysfs files are as >> follows: >> >> 1)/sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies >> This gives the throttle stats for each of the available frequencies. >> The throttle stat of a frequency is the total number of times the max >> frequency is reduced to that frequency. >> # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies >> 4023000 0 >> 399 0 >> 3956000 1 >> 3923000 0 >> 389 0 >> 3857000 2 >> 3823000 0 >> 379 0 >> 3757000 2 >> 3724000 1 >> 369 1 >> ... >> >> 2)/sys/devices/system/cpu/cpufreq/chip0/throttle_reasons >> This directory contains throttle reason files. Each file gives the >> total number of times the max frequency is throttled, except for >> 'throttle_reset', which gives the total number of times the max > > Is reset a good name? Ideally a reset, reset's stats. Is unthrottle_count a better name? > >> frequency is unthrottled after being throttled. >> # cd /sys/devices/system/cpu/cpufreq/chip0/throttle_reasons >> # cat cpu_over_temperature >> 7 >> # cat occ_reset >> 0 >> # cat over_current >> 0 >> # cat power_cap >> 0 >> # cat power_supply_failure >> 0 >> # cat throttle_reset >> 7 >> >> 3)/sys/devices/system/cpu/cpufreq/chip0/throttle_stat >> This gives the total number of events of max frequency throttling to >> lower frequencies in the turbo range of frequencies and the sub-turbo(at >> and below nominal) range of frequencies. >> # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_stat >> turbo 7 >> sub-turbo 0 >> >> Signed-off-by: Shilpasri G Bhat >> Reviewed-by: Gautham R. Shenoy > snip > >> >> +enum throt_reason_type { >> +NO_THROTTLE = 0, > NO is not throttled or number_throttle? It stands for not throttled. 
> >> +POWERCAP, >> +CPU_OVERTEMP, >> +POWER_SUPPLY_FAILURE, >> +OVERCURRENT, >> +OCC_RESET_THROTTLE, >> +OCC_MAX_REASON >> +}; >> + >> static struct chip { >> unsigned int id; >> bool throttled; >> @@ -62,6 +72,11 @@ static struct chip { >> u8 throt_reason; >> cpumask_t mask; >> struct work_struct throttle; >> +int throt_turbo; > > The enum uses _THROTTLE so why can't the struct member > be throttle_nominal? throttle_turbo? Okay will change the struct members to throttle_* > >> +int throt_nominal; >> +int reason[OCC_MAX_REASON]; >> +int *pstate_stat; >> +struct kobject *kobj; >> } *chips; >> >> static int nr_chips; >> @@ -196,6 +211,128 @@ static struct freq_attr *powernv_cpu_freq_attr[] = { >> NULL, >> }; >> >> +static inline int get_chip_index(unsigned int id) >> +{ >> +int i; >> + >> +for (i = 0; i < nr_chips; i++) >> +if (chips[i].id == id) >> +return i; >> + >> +return -EINVAL; >> +} >> + >> +static ssize_t throttle_freq_show(struct kobject *kobj, >> + struct kobj_attribute *attr, char *buf) >> +{ >> +int i, count = 0, id; >> + >> +i = kstrtoint(kobj->name + 4, 0, &id); > > Why the +4 magic, make it more readable? Okay. Will do. > >> +if (i) >> +return i; >> + >> +id = get_chip_index(id); >> +if (id < 0) { >> +pr_warn_once("%s Matching chip-id not found\n", __func__); > > The pr_warn_once should also print which chip-id was not found, please > add that to the print Okay will do. 
> >> +return id; >> +} >> + >> +for (i = 0; i < powernv_pstate_info.nr_pstates; i++) >> +count += sprintf(&buf[count], "%d %d\n", >> + powernv_freqs[i].frequency, >> + chips[id].pstate_stat[i]); >> + >> +return count; >> +} >> + >> +static struct kobj_attribute attr_throttle_frequencies = >> +__ATTR(throttle_frequencies, 0444, throttle_freq_show, NULL); >> + >> +static ssize_t throttle_stat_show(struct kobject *kobj, >> + struct kobj_attribute *attr, char *buf) >> +{ >> +int ret, id, count = 0; >> + >> +ret = kstrtoint(kobj->name + 4, 0, &id); >> +if (ret) >> +return ret; >> + >> +id = get_chip_index(id); >> +if (id < 0) { >> +pr_warn_once("%s Matching chip-id not found\n", __func__); >> +return id; >> +} >> + > > The above 9 lines look like common code, you can easily collapse it > instead of repeating it okay will do. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v7 0/6] cpufreq: powernv: Redesign the presentation of throttle notification and solve bug-fixes in the driver
In POWER8, the OCC (On-Chip-Controller) can throttle the frequency of the CPU when the chip crosses its thermal and power limits. Currently, the powernv-cpufreq driver detects and reports this event as a console message. Some machines may not sustain the max turbo frequency in all conditions and can be throttled frequently. This can lead to the flooding of the console with throttle messages. So this patchset aims to redesign the presentation of this event via sysfs counters and tracepoints. It also fixes a couple of bugs reported in the driver. - Patch [1] fixes a memory leak bug - Patch [2] fixes the cpu hot-plug bug in powernv_cpufreq_work_fn(). - Patch [3] solves a bug in powernv_cpufreq_throttle_check(), which calls into cpu_to_chip_id() in the hot path, which reads the DT every time to find the chip id. - Patches [4] to [6] will add a perf trace point "power:powernv_throttle" and sysfs throttle counter stats in /sys/devices/system/cpu/cpufreq/chipN. Changes from v6: - Changes wrt comments from Balbir Singh and Viresh Kumar. Changes from v5: - Fix kbuild error: drivers/cpufreq/powernv-cpufreq.c:428:2: error: implicit declaration of function 'get_online_cpus' [-Werror=implicit-function-declaration] Changes from v4: - Fix a hot-plug bug in powernv_cpufreq_work_fn() - Changes wrt Gautham's and Shreyas's comments Changes from v3: - Add a fix to replace cpu_to_chip_id() with simpler PIR shift to obtain the chip id. - Break patch2 into two patches separating the tracepoint and sysfs attribute changes. Changes from v2: - Fixed kbuild test warning. 
drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Shilpasri G Bhat (6): cpufreq: powernv: Free 'chips' on module exit cpufreq: powernv: Hot-plug safe the kworker thread cpufreq: powernv: Remove cpu_to_chip_id() from hot-path cpufreq: powernv/tracing: Add powernv_throttle tracepoint cpufreq: powernv: Replace pr_info with trace print for throttle event cpufreq: powernv: Add sysfs attributes to show throttle stats Documentation/ABI/testing/sysfs-devices-system-cpu | 45 +++ drivers/cpufreq/powernv-cpufreq.c | 313 + include/trace/events/power.h | 22 ++ kernel/trace/power-traces.c| 1 + 4 files changed, 326 insertions(+), 55 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v7 1/6] cpufreq: powernv: Free 'chips' on module exit
This will free the dynamically allocated memory of 'chips' on module exit. Signed-off-by: Shilpasri G Bhat --- drivers/cpufreq/powernv-cpufreq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 547890f..53f980b 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -612,6 +612,7 @@ static void __exit powernv_cpufreq_exit(void) unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); opal_message_notifier_unregister(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); + kfree(chips); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v7 2/6] cpufreq: powernv: Hot-plug safe the kworker thread
In the kworker_thread powernv_cpufreq_work_fn(), we can end up sending an IPI to a cpu going offline. This is a rare corner case which is fixed using {get/put}_online_cpus(). Along with this fix, this patch adds changes to do oneshot cpumask_{clear/and} operation. Suggested-by: Shreyas B Prabhu Suggested-by: Gautham R Shenoy Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar --- No changes from v6. Changes from v5: - Fix the kbuild-error: drivers/cpufreq/powernv-cpufreq.c:428:2: error: implicit declaration of function 'get_online_cpus' [-Werror=implicit-function-declaration] drivers/cpufreq/powernv-cpufreq.c | 19 +++ 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 53f980b..a271b0f 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -423,18 +424,19 @@ void powernv_cpufreq_work_fn(struct work_struct *work) { struct chip *chip = container_of(work, struct chip, throttle); unsigned int cpu; - cpumask_var_t mask; + cpumask_t mask; - smp_call_function_any(&chip->mask, + get_online_cpus(); + cpumask_and(&mask, &chip->mask, cpu_online_mask); + smp_call_function_any(&mask, powernv_cpufreq_throttle_check, NULL, 0); if (!chip->restore) - return; + goto out; chip->restore = false; - cpumask_copy(mask, &chip->mask); - for_each_cpu_and(cpu, mask, cpu_online_mask) { - int index, tcpu; + for_each_cpu(cpu, &mask) { + int index; struct cpufreq_policy policy; cpufreq_get_policy(&policy, cpu); @@ -442,9 +444,10 @@ void powernv_cpufreq_work_fn(struct work_struct *work) policy.cur, CPUFREQ_RELATION_C, &index); powernv_cpufreq_target_index(&policy, index); - for_each_cpu(tcpu, policy.cpus) - cpumask_clear_cpu(tcpu, mask); + cpumask_andnot(&mask, &mask, policy.cpus); } +out: + put_online_cpus(); } static char throttle_reason[][30] = { -- 1.9.3 ___ 
Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v7 3/6] cpufreq: powernv: Remove cpu_to_chip_id() from hot-path
cpu_to_chip_id() does a DT walk through to find out the chip id by taking a contended device tree lock. This adds an unnecessary overhead in a hot path. So instead of calling cpu_to_chip_id() everytime cache the chip ids for all cores in the array 'core_to_chip_map' and use it in the hotpath. Reported-by: Anton Blanchard Signed-off-by: Shilpasri G Bhat --- Changes from v6: - Minor changes to move the code 'cpumask_copy()' after 'core_to_chip_map' is allocated. - Move 'kfree(chips)' to a separate patch. No changes from v5. Changes from v4: - Taken care of Shreyas's comments to add a core_to_chip_map array to store the chip id. drivers/cpufreq/powernv-cpufreq.c | 23 --- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index a271b0f..c670314 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -43,6 +43,7 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; +static unsigned int *core_to_chip_map; static struct chip { unsigned int id; @@ -313,13 +314,14 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { unsigned int cpu = smp_processor_id(); + unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)]; unsigned long pmsr; int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); for (i = 0; i < nr_chips; i++) - if (chips[i].id == cpu_to_chip_id(cpu)) + if (chips[i].id == chip_id) break; /* Check for Pmax Capping */ @@ -559,19 +561,29 @@ static int init_chip_info(void) unsigned int chip[256]; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; + cpumask_t cpu_mask; + int ret = -ENOMEM; - for_each_possible_cpu(cpu) { + core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int), + GFP_KERNEL); + if (!core_to_chip_map) + goto out; + + cpumask_copy(&cpu_mask, cpu_possible_mask); + for_each_cpu(cpu, &cpu_mask) { 
unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; chip[nr_chips++] = id; } + core_to_chip_map[cpu_core_index_of_thread(cpu)] = id; + cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu)); } chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) - return -ENOMEM; + goto free_chip_map; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; @@ -582,6 +594,10 @@ static int init_chip_info(void) } return 0; +free_chip_map: + kfree(core_to_chip_map); +out: + return ret; } static int __init powernv_cpufreq_init(void) @@ -616,6 +632,7 @@ static void __exit powernv_cpufreq_exit(void) opal_message_notifier_unregister(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); kfree(chips); + kfree(core_to_chip_map); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v7 4/6] cpufreq: powernv/tracing: Add powernv_throttle tracepoint
This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy CC: Ingo Molnar CC: Steven Rostedt --- No changes since v2. include/trace/events/power.h | 22 ++ kernel/trace/power-traces.c | 1 + 2 files changed, 23 insertions(+) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244e..19e5030 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle, TP_ARGS(state, cpu_id) ); +TRACE_EVENT(powernv_throttle, + + TP_PROTO(int chip_id, const char *reason, int pmax), + + TP_ARGS(chip_id, reason, pmax), + + TP_STRUCT__entry( + __field(int, chip_id) + __string(reason, reason) + __field(int, pmax) + ), + + TP_fast_assign( + __entry->chip_id = chip_id; + __assign_str(reason, reason); + __entry->pmax = pmax; + ), + + TP_printk("Chip %d Pmax %d %s", __entry->chip_id, + __entry->pmax, __get_str(reason)) +); + TRACE_EVENT(pstate_sample, TP_PROTO(u32 core_busy, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a..81b8745 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v7 5/6] cpufreq: powernv: Replace pr_info with trace print for throttle event
Currently we use printk message to notify the throttle event. But this can flood the console if the cpu is throttled frequently. So replace the printk with the tracepoint to notify the throttle event. And also events like throttle below nominal frequency and OCC_RESET are reduced to pr_warn/pr_warn_once as pointed out by MFG to not mark them as critical messages. This patch adds 'throttle_reason' to struct chip to store the throttle reason. Signed-off-by: Shilpasri G Bhat --- Changes from v6: - Rename struct chip member 'throt_reason' to 'throttle_reason' No changes from v5. Changes from v4: - Taken care of Gautham's comments to remove the new function powernv_cpufreq_check_pmax() - Modified commit message Changes from v3: - Separate this patch to contain trace_point changes - Move struct chip member 'restore' of type bool above 'mask' to reduce structure padding. No changes from v2. Changes from v1: - As suggested by Paul Clarke replaced char * throttle_reason[][30] by const char * const throttle_reason[]. 
drivers/cpufreq/powernv-cpufreq.c | 73 ++- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index c670314..1bbc10a 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -45,12 +46,22 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; static unsigned int *core_to_chip_map; +static const char * const throttle_reason[] = { + "No throttling", + "Power Cap", + "Processor Over Temperature", + "Power Supply Failure", + "Over Current", + "OCC Reset" +}; + static struct chip { unsigned int id; bool throttled; + bool restore; + u8 throttle_reason; cpumask_t mask; struct work_struct throttle; - bool restore; } *chips; static int nr_chips; @@ -331,17 +342,17 @@ static void powernv_cpufreq_throttle_check(void *data) goto next; chips[i].throttled = true; if (pmsr_pmax < powernv_pstate_info.nominal) - pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.nominal); - else - pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, - powernv_pstate_info.max); + pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", +cpu, chips[i].id, pmsr_pmax, +powernv_pstate_info.nominal); + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throttle_reason], + pmsr_pmax); } else if (chips[i].throttled) { chips[i].throttled = false; - pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, - chips[i].id, pmsr_pmax); + trace_powernv_throttle(chips[i].id, + throttle_reason[chips[i].throttle_reason], + pmsr_pmax); } /* Check if Psafe_mode_active is set in PMSR. 
*/ @@ -359,7 +370,7 @@ next: if (throttled) { pr_info("PMSR = %16lx\n", pmsr); - pr_crit("CPU Frequency could be throttled\n"); + pr_warn("CPU Frequency could be throttled\n"); } } @@ -452,15 +463,6 @@ out: put_online_cpus(); } -static char throttle_reason[][30] = { - "No throttling", - "Power Cap", - "Processor Over Temperature", - "Power Supply Failure", - "Over Current", - "OCC Reset" -}; - static int powernv_cpufreq_occ_msg(struct notifier_block *nb, unsigned long msg_type, void *_msg) { @@ -486,7 +488,7 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, */ if (!throttled) { throttled = true; - p
[PATCH v7 6/6] cpufreq: powernv: Add sysfs attributes to show throttle stats
Create sysfs attributes to export throttle information in /sys/devices/system/cpu/cpufreq/chipN. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies This gives the throttle stats for each of the available frequencies. The throttle stat of a frequency is the total number of times the max frequency is reduced to that frequency. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_frequencies 4023000 0 3990000 0 3956000 1 3923000 0 3890000 0 3857000 2 3823000 0 3790000 0 3757000 2 3724000 1 3690000 1 ... 2)/sys/devices/system/cpu/cpufreq/chip0/throttle_reasons This directory contains throttle reason files. Each file gives the total number of times the max frequency is throttled, except for 'unthrottle_count', which gives the total number of times the max frequency is unthrottled after being throttled. # cd /sys/devices/system/cpu/cpufreq/chip0/throttle_reasons # cat cpu_over_temperature 7 # cat occ_reset 0 # cat over_current 0 # cat power_cap 0 # cat power_supply_failure 0 # cat unthrottle_count 7 3)/sys/devices/system/cpu/cpufreq/chip0/throttle_stat This gives the total number of events of max frequency throttling to lower frequencies in the turbo range of frequencies and the sub-turbo(at and below nominal) range of frequencies. # cat /sys/devices/system/cpu/cpufreq/chip0/throttle_stat turbo 7 sub-turbo 0 Signed-off-by: Shilpasri G Bhat Cc: linux-...@vger.kernel.org --- Changes from v6: - Rename struct chip members 'throt_{nominal/turbo}' to throttle_* - Rename sysfs throttle_reason attribute 'throttle_reset' to 'unthrottle_count' - Add sysfs attribute details in Documentation/ABI/testing/sysfs-devices-system-cpu - Add helper routine get_chip_index_from_kobj() for throttle sysfs attribute show() to get chip index from kobject. - Add the chip id in the pr_warn_once No changes from v5. 
Changes from v4: - Taken care of Gautham's comments to use inline get_chip_index() Changes from v3: - Separate the patch to contain only the throttle sysfs attribute changes. - Add helper inline function get_chip_index() Changes from v2: - Fixed kbuild test warning. drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Changes from v1: - Added a kobject to struct chip - Grouped the throttle reasons under a separate attribute_group and exported each reason as individual file. - Moved the sysfs files from /sys/devices/system/node/nodeN to /sys/devices/system/cpu/cpufreq/chipN - As suggested by Paul Clarke replaced 'Nominal' with 'sub-turbo'. Documentation/ABI/testing/sysfs-devices-system-cpu | 45 + drivers/cpufreq/powernv-cpufreq.c | 205 - 2 files changed, 241 insertions(+), 9 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index b683e8e..dea4620 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -271,3 +271,48 @@ Description: Parameters for the CPU cache attributes - WriteBack: data is written only to the cache line and the modified cache line is written to main memory only when it is replaced + +What: /sys/devices/system/cpu/cpufreq/chip*/throttle_stats +Date: Jan 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: CPU Frequency throttle stat for the chip + + This attribute gives the total number of events of max-frequency + throttling to any lower frequency in the turbo(above nominal) and + the sub-turbo (at and below nominal) range of frequencies. 
+ + +What: /sys/devices/system/cpu/cpufreq/chip*/throttle_frequencies +Date: Jan 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: CPU Frequency throttle stat for all available frequencies in the chip + + This attribute gives the throttle stats for each of the available + frequencies. The throttle stat of a frequency is the total + number of times the max frequency is reduced to that frequency. + +What: /sys/devices/system/cpu/cpufreq/chip*/throttle_reasons/ +Date: Jan 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: CPU Frequency throttle reason stat for the chip + + This directory contains throttle reason files. Each file gives + the total number of times the max frequency is throttled, except + for
Re: [PATCH v7 6/6] cpufreq: powernv: Add sysfs attributes to show throttle stats
Hi Viresh, On 01/28/2016 02:10 PM, Viresh Kumar wrote: > On 28-01-16, 12:55, Shilpasri G Bhat wrote: >> diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu >> b/Documentation/ABI/testing/sysfs-devices-system-cpu >> index b683e8e..dea4620 100644 >> --- a/Documentation/ABI/testing/sysfs-devices-system-cpu >> +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu >> @@ -271,3 +271,48 @@ Description:Parameters for the CPU cache attributes >> - WriteBack: data is written only to the cache line and >> the modified cache line is written to main >> memory only when it is replaced >> + >> +What: /sys/devices/system/cpu/cpufreq/chip*/throttle_stats > > What about the chip directory ? Shouldn't that be documented? And > shouldn't that mention that this is just for powerpc ? > > And before that, I don't think that you are doing this properly. I am > sorry that I never came to a point where I could review it, and you > continued with it, version after version. > > But, I really have strong objections to the way this is done. And you > are making things more complex then they are. > > So, these stats are per-policy, right ? First of all sorry about the version log. No these stats are not per-policy. They are per-chip. The throttle event is common for all cores in the chip. > > Then why aren't they added on the policy->kobj instead, just like > cpufreq-stats? And maybe inside cpufreq-stats folder only? > > That will solve many complexities you have in place here and will look > sane as well. > > Right now, you have stats as two places, cpu/cpufreq/chip/ and > cpu/cpuX/cpufreq/stats/, which doesn't look wise and adds to > confusion. > > What do you say? > Yes agree that it will be much cleaner with policy->kobj. But using policy->kobj will result in multiple copies of the throttle-chip stats exported for each policy in the chip. 
And moving it to cpu/cpuX/cpufreq/stats/ will add a dependency on CONFIG_CPU_FREQ_STAT. We want throttle attributes to be either in cpu/cpufreq or cpu/cpuX/cpufreq. If multiple copies are not an issue, then I will move it to cpu/cpuX/cpufreq. Thanks and Regards, Shilpa ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v7 6/6] cpufreq: powernv: Add sysfs attributes to show throttle stats
On 01/28/2016 03:11 PM, Viresh Kumar wrote: > On 28-01-16, 15:06, Shilpasri G Bhat wrote: >> No these stats are not per-policy. They are per-chip. The throttle event is >> common for all cores in the chip. > > How do you define a chip? And how is it different then the group of > CPUs represented by the policy ? > A chip is a group of policies. Hmm, yes, I see your point. We anyway maintain frequency stats, which are per-policy. We might as well have throttle stats exported per-policy, pointing to the per-chip data. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v8 0/6] cpufreq: powernv: Redesign the presentation of throttle notification and solve bug-fixes in the driver
In POWER8, the OCC (On-Chip-Controller) can throttle the frequency of the CPU when the chip crosses its thermal and power limits. Currently, the powernv-cpufreq driver detects and reports this event as a console message. Some machines may not sustain the max turbo frequency in all conditions and can be throttled frequently. This can lead to the flooding of the console with throttle messages. So this patchset aims to redesign the presentation of this event via sysfs counters and tracepoints. It also fixes a couple of bugs reported in the driver. - Patch [1] fixes a memory leak bug - Patch [2] fixes the cpu hot-plug bug in powernv_cpufreq_work_fn(). - Patch [3] solves a bug in powernv_cpufreq_throttle_check(), which calls into cpu_to_chip_id() in the hot path, which reads the DT every time to find the chip id. - Patches [4] to [6] will add a perf trace point "power:powernv_throttle" and sysfs throttle counter stats in /sys/devices/system/cpu/cpufreq/chipN. Changes from v7: - Changes in patch [6] involve adding a table to represent the throttle stats in frequency X reason layout. Detailed version log in the patch. Changes from v6: - Changes wrt comments from Balbir Singh and Viresh Kumar. Details in the version log of the patches. Changes from v5: - Fix kbuild error: drivers/cpufreq/powernv-cpufreq.c:428:2: error: implicit declaration of function 'get_online_cpus' [-Werror=implicit-function-declaration] Changes from v4: - Fix a hot-plug bug in powernv_cpufreq_work_fn() - Changes wrt Gautham's and Shreyas's comments Changes from v3: - Add a fix to replace cpu_to_chip_id() with simpler PIR shift to obtain the chip id. - Break patch2 into two patches separating the tracepoint and sysfs attribute changes. Changes from v2: - Fixed kbuild test warning. 
drivers/cpufreq/powernv-cpufreq.c:609:2: warning: ignoring return value of 'kstrtoint', declared with attribute warn_unused_result [-Wunused-result] Shilpasri G Bhat (6): cpufreq: powernv: Free 'chips' on module exit cpufreq: powernv: Hot-plug safe the kworker thread cpufreq: powernv: Remove cpu_to_chip_id() from hot-path cpufreq: powernv/tracing: Add powernv_throttle tracepoint cpufreq: powernv: Replace pr_info with trace print for throttle event cpufreq: powernv: Add sysfs attributes to show throttle stats Documentation/ABI/testing/sysfs-devices-system-cpu | 66 + drivers/cpufreq/powernv-cpufreq.c | 303 + include/trace/events/power.h | 22 ++ kernel/trace/power-traces.c| 1 + 4 files changed, 337 insertions(+), 55 deletions(-) -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v8 1/6] cpufreq: powernv: Free 'chips' on module exit
This will free the dynamically allocated memory of 'chips' on module exit. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar --- Changes from v7: - Minor typo fix in the commit message drivers/cpufreq/powernv-cpufreq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 547890f..53f980b 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -612,6 +612,7 @@ static void __exit powernv_cpufreq_exit(void) unregister_reboot_notifier(&powernv_cpufreq_reboot_nb); opal_message_notifier_unregister(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); + kfree(chips); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v8 2/6] cpufreq: powernv: Hot-plug safe the kworker thread
In the kworker_thread powernv_cpufreq_work_fn(), we can end up sending an IPI to a cpu going offline. This is a rare corner case which is fixed using {get/put}_online_cpus(). Along with this fix, this patch adds changes to do oneshot cpumask_{clear/and} operation. Suggested-by: Shreyas B Prabhu Suggested-by: Gautham R Shenoy Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar --- No changes from v7. drivers/cpufreq/powernv-cpufreq.c | 19 +++ 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 53f980b..a271b0f 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -423,18 +424,19 @@ void powernv_cpufreq_work_fn(struct work_struct *work) { struct chip *chip = container_of(work, struct chip, throttle); unsigned int cpu; - cpumask_var_t mask; + cpumask_t mask; - smp_call_function_any(&chip->mask, + get_online_cpus(); + cpumask_and(&mask, &chip->mask, cpu_online_mask); + smp_call_function_any(&mask, powernv_cpufreq_throttle_check, NULL, 0); if (!chip->restore) - return; + goto out; chip->restore = false; - cpumask_copy(mask, &chip->mask); - for_each_cpu_and(cpu, mask, cpu_online_mask) { - int index, tcpu; + for_each_cpu(cpu, &mask) { + int index; struct cpufreq_policy policy; cpufreq_get_policy(&policy, cpu); @@ -442,9 +444,10 @@ void powernv_cpufreq_work_fn(struct work_struct *work) policy.cur, CPUFREQ_RELATION_C, &index); powernv_cpufreq_target_index(&policy, index); - for_each_cpu(tcpu, policy.cpus) - cpumask_clear_cpu(tcpu, mask); + cpumask_andnot(&mask, &mask, policy.cpus); } +out: + put_online_cpus(); } static char throttle_reason[][30] = { -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v8 3/6] cpufreq: powernv: Remove cpu_to_chip_id() from hot-path
cpu_to_chip_id() does a DT walk through to find out the chip id by taking a contended device tree lock. This adds an unnecessary overhead in a hot path. So instead of calling cpu_to_chip_id() everytime cache the chip ids for all cores in the array 'core_to_chip_map' and use it in the hotpath. Reported-by: Anton Blanchard Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar --- No changes from v7. drivers/cpufreq/powernv-cpufreq.c | 23 --- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index a271b0f..c670314 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -43,6 +43,7 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; +static unsigned int *core_to_chip_map; static struct chip { unsigned int id; @@ -313,13 +314,14 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { unsigned int cpu = smp_processor_id(); + unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)]; unsigned long pmsr; int pmsr_pmax, i; pmsr = get_pmspr(SPRN_PMSR); for (i = 0; i < nr_chips; i++) - if (chips[i].id == cpu_to_chip_id(cpu)) + if (chips[i].id == chip_id) break; /* Check for Pmax Capping */ @@ -559,19 +561,29 @@ static int init_chip_info(void) unsigned int chip[256]; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; + cpumask_t cpu_mask; + int ret = -ENOMEM; - for_each_possible_cpu(cpu) { + core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int), + GFP_KERNEL); + if (!core_to_chip_map) + goto out; + + cpumask_copy(&cpu_mask, cpu_possible_mask); + for_each_cpu(cpu, &cpu_mask) { unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; chip[nr_chips++] = id; } + core_to_chip_map[cpu_core_index_of_thread(cpu)] = id; + cpumask_andnot(&cpu_mask, &cpu_mask, 
cpu_sibling_mask(cpu)); } chips = kmalloc_array(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) - return -ENOMEM; + goto free_chip_map; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; @@ -582,6 +594,10 @@ static int init_chip_info(void) } return 0; +free_chip_map: + kfree(core_to_chip_map); +out: + return ret; } static int __init powernv_cpufreq_init(void) @@ -616,6 +632,7 @@ static void __exit powernv_cpufreq_exit(void) opal_message_notifier_unregister(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); kfree(chips); + kfree(core_to_chip_map); cpufreq_unregister_driver(&powernv_cpufreq_driver); } module_exit(powernv_cpufreq_exit); -- 1.9.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev