[PATCH v4 0/5] powerpc/perf: IMC trace-mode support

2019-04-15 Thread Anju T Sudhakar
   
  0 : Enable/Disable 
  1 : 0 -> Accumulation Mode 
  1 -> Trace Mode
  2:3   : Reserved   
  4-6   : PB scope   
  7 : Reserved   
  8:50  : Counter Address
  51:63 : Reserved 

--

PMI interrupt handling is avoided, since IMC trace mode snapshots the
program counter and updates the memory. This also provides a way for
the operating system to do instruction sampling in real time without
PMI (Performance Monitoring Interrupts) processing overhead. 


Performance data using 'perf top' with and without trace-imc event:

PMI interrupts count when `perf top` command is executed without trace-imc 
event.

# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803803804804804 
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   


`perf top` with trace-imc (executed right after 'perf top' without trace-imc 
event):

# perf top -e trace_imc/trace_cycles/  
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt   
11.22%  [kernel]  [k] rcu_idle_enter   
10.25%  [kernel]  [k] find_next_bit
 7.91%  [kernel]  [k] do_idle  
 7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
 5.20%  [kernel]  [k] tick_nohz_idle_stop_tick 
 [---]  

# cat /proc/interrupts (a snippet from the output) 

9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803804804804804
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   

The PMI interrupts count remains the same.


Changelog:
--
From v3 -> v4:

* trace_imc_refc is introduced. So that even if, core-imc
is disabled, trace-imc can be used.

* trace_imc_pmu_sched_task is removed and opal start/stop
is invoked in trace_imc_event_add/del function.

 
Suggestions/comments are welcome.

Anju T Sudhakar (4):
  powerpc/include: Add data structures and macros for IMC trace mode
  powerpc/perf: Rearrange setting of ldbar for thread-imc
  powerpc/perf: Trace imc events detection and cpuhotplug
  powerpc/perf: Trace imc PMU functions

Madhavan Srinivasan (1):
  powerpc/perf: Add privileged access check for thread_imc

 arch/powerpc/include/asm/imc-pmu.h|  39 +++
 arch/powerpc/include/asm/opal-api.h   |   1 +
 arch/powerpc/perf/imc-pmu.c   | 318 +-
 arch/powerpc/platforms/powernv/opal-imc.c |   3 +
 include/linux/cpuhotplug.h|   1 +
 5 files changed, 351 insertions(+), 11 deletions(-)

-- 
2.17.2



[PATCH v4 1/5] powerpc/include: Add data structures and macros for IMC trace mode

2019-04-15 Thread Anju T Sudhakar
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode
and data structure to hold the trace-imc record data.
Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since
there is a new switch case added in the opal-calls for IMC.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h  | 39 +
 arch/powerpc/include/asm/opal-api.h |  1 +
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 69f516ecb2fd..7c2ef0e42661 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -33,6 +33,7 @@
  */
 #define THREAD_IMC_LDBAR_MASK   0x0003e000ULL
 #define THREAD_IMC_ENABLE   0x8000ULL
+#define TRACE_IMC_ENABLE   0x4000ULL
 
 /*
  * For debugfs interface for imc-mode and imc-command
@@ -59,6 +60,34 @@ struct imc_events {
char *scale;
 };
 
+/*
+ * Trace IMC hardware updates a 64bytes record on
+ * Core Performance Monitoring Counter (CPMC)
+ * overflow. Here is the layout for the trace imc record
+ *
+ * DW 0 : Timebase
+ * DW 1 : Program Counter
+ * DW 2 : PIDR information
+ * DW 3 : CPMC1
+ * DW 4 : CPMC2
+ * DW 5 : CPMC3
+ * Dw 6 : CPMC4
+ * DW 7 : Timebase
+ * .
+ *
+ * The following is the data structure to hold trace imc data.
+ */
+struct trace_imc_data {
+   u64 tb1;
+   u64 ip;
+   u64 val;
+   u64 cpmc1;
+   u64 cpmc2;
+   u64 cpmc3;
+   u64 cpmc4;
+   u64 tb2;
+};
+
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
@@ -68,6 +97,13 @@ struct imc_events {
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xULL
 
+/*
+ * Macro to mask bits 0:21 of first double word(which is the timebase) to
+ * compare with 8th double word (timebase) of trace imc record data.
+ */
+#define IMC_TRACE_RECORD_TB1_MASK  0x3ffULL
+
+
 /*
  * Device tree parser code detects IMC pmu support and
  * registers new IMC pmus. This structure will hold the
@@ -113,6 +149,7 @@ struct imc_pmu_ref {
 
 enum {
IMC_TYPE_THREAD = 0x1,
+   IMC_TYPE_TRACE  = 0x2,
IMC_TYPE_CORE   = 0x4,
IMC_TYPE_CHIP   = 0x10,
 };
@@ -123,6 +160,8 @@ enum {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_THREAD  3
+/* For trace-imc the domain is still thread but it operates in trace-mode */
+#define IMC_DOMAIN_TRACE   4
 
 extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..a4130b21b159 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1118,6 +1118,7 @@ enum {
 enum {
OPAL_IMC_COUNTERS_NEST = 1,
OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_TRACE = 3,
 };
 
 
-- 
2.17.2



[PATCH v4 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc

2019-04-15 Thread Anju T Sudhakar
LDBAR holds the memory address allocated for each cpu. For thread-imc
the mode bit (i.e bit 1) of LDBAR is set to accumulation.
Currently, ldbar is loaded with per cpu memory address and mode set to
accumulation at boot time.

To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to
accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc
to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del().

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index b1c37cc3fa98..51f1d3eaaa6d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -788,8 +788,11 @@ static int core_imc_event_init(struct perf_event *event)
 }
 
 /*
- * Allocates a page of memory for each of the online cpus, and write the
- * physical base address of that page to the LDBAR for that cpu.
+ * Allocates a page of memory for each of the online cpus, and load
+ * LDBAR with 0.
+ * The physical base address of the page allocated for a cpu will be
+ * written to the LDBAR for that cpu, when the thread-imc event
+ * is added.
  *
  * LDBAR Register Layout:
  *
@@ -807,7 +810,7 @@ static int core_imc_event_init(struct perf_event *event)
  */
 static int thread_imc_mem_alloc(int cpu_id, int size)
 {
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
+   u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
int nid = cpu_to_node(cpu_id);
 
if (!local_mem) {
@@ -824,9 +827,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}
 
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-
-   mtspr(SPRN_LDBAR, ldbar_value);
+   mtspr(SPRN_LDBAR, 0);
return 0;
 }
 
@@ -977,6 +978,7 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
 {
int core_id;
struct imc_pmu_ref *ref;
+   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());
 
if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -985,6 +987,9 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
return -EINVAL;
 
core_id = smp_processor_id() / threads_per_core;
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+
/*
 * imc pmus are enabled only when it is used.
 * See if this is triggered for the first time.
@@ -1016,11 +1021,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;
struct imc_pmu_ref *ref;
 
-   /*
-* Take a snapshot and calculate the delta and update
-* the event counter values.
-*/
-   imc_event_update(event);
+   mtspr(SPRN_LDBAR, 0);
 
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
@@ -1039,6 +1040,11 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+   /*
+* Take a snapshot and calculate the delta and update
+* the event counter values.
+*/
+   imc_event_update(event);
 }
 
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
-- 
2.17.2



[PATCH v4 3/5] powerpc/perf: Add privileged access check for thread_imc

2019-04-15 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Add code to restrict user access to thread_imc pmu since
some events report privilege level information.

Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support')
Signed-off-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 51f1d3eaaa6d..7fe258e17dfe 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -859,6 +859,9 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
/* Sampling not supported */
if (event->hw.sample_period)
return -EINVAL;
-- 
2.17.2



[PATCH v4 4/5] powerpc/perf: Trace imc events detection and cpuhotplug

2019-04-15 Thread Anju T Sudhakar
Patch detects trace-imc events, does memory initializations for each online
cpu, and registers cpuhotplug call-backs.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c   | 104 ++
 arch/powerpc/platforms/powernv/opal-imc.c |   3 +
 include/linux/cpuhotplug.h|   1 +
 3 files changed, 108 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 7fe258e17dfe..3fe0222885bc 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -43,6 +43,11 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem);
 static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
+/* Trace IMC data structures */
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
+static struct imc_pmu_ref *trace_imc_refc;
+static int trace_imc_mem_size;
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1050,6 +1055,59 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
imc_event_update(event);
 }
 
+/*
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
+ */
+static int trace_imc_mem_alloc(int cpu_id, int size)
+{
+   u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
+   int phys_id = cpu_to_node(cpu_id), rc = 0;
+   int core_id = (cpu_id / threads_per_core);
+
+   if (!local_mem) {
+   local_mem = page_address(alloc_pages_node(phys_id,
+   GFP_KERNEL | __GFP_ZERO | 
__GFP_THISNODE |
+   __GFP_NOWARN, get_order(size)));
+   if (!local_mem)
+   return -ENOMEM;
+   per_cpu(trace_imc_mem, cpu_id) = local_mem;
+
+   /* Initialise the counters for trace mode */
+   rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void 
*)local_mem),
+   get_hard_smp_processor_id(cpu_id));
+   if (rc) {
+   pr_info("IMC:opal init failed for trace imc\n");
+   return rc;
+   }
+   }
+
+   /* Init the mutex, if not already */
+   trace_imc_refc[core_id].id = core_id;
+   mutex_init(&trace_imc_refc[core_id].lock);
+
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
+{
+   return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+}
+
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int trace_imc_cpu_init(void)
+{
+   return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ "perf/powerpc/imc_trace:online",
+ ppc_trace_imc_cpu_online,
+ ppc_trace_imc_cpu_offline);
+}
+
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
 static int update_pmu_ops(struct imc_pmu *pmu)
 {
@@ -1172,6 +1230,18 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+static void cleanup_all_trace_imc_memory(void)
+{
+   int i, order = get_order(trace_imc_mem_size);
+
+   for_each_online_cpu(i) {
+   if (per_cpu(trace_imc_mem, i))
+   free_pages((u64)per_cpu(trace_imc_mem, i), order);
+
+   }
+   kfree(trace_imc_refc);
+}
+
 /* Function to free the attr_groups which are dynamically allocated */
 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
 {
@@ -1213,6 +1283,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
+
+   if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
+   cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
+   cleanup_all_trace_imc_memory();
+   }
 }
 
 /*
@@ -1295,6 +1370,27 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
 
thread_imc_pmu = pmu_ptr;
break;
+   case IMC_DOMAIN_TRACE:
+   /* Update the pmu name */
+   pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
+   if (!pmu_ptr->pmu.name)
+   return -ENOMEM;
+
+   nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
+   trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
+   GFP_KERNEL);
+   if (!trace_imc_refc)
+   return -ENOMEM;
+
+   trace_imc_mem_size = pmu_ptr->counter_mem_size;
+   for_each_online_cpu(cpu) {
+   res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+   if

[PATCH v4 5/5] powerpc/perf: Trace imc PMU functions

2019-04-15 Thread Anju T Sudhakar
Add PMU functions to support trace-imc.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 183 
 1 file changed, 183 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 3fe0222885bc..3f433cc96b18 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1108,6 +1108,182 @@ static int trace_imc_cpu_init(void)
  ppc_trace_imc_cpu_offline);
 }
 
+static u64 get_trace_imc_event_base_addr(void)
+{
+   return (u64)per_cpu(trace_imc_mem, smp_processor_id());
+}
+
+/*
+ * Function to parse trace-imc data obtained
+ * and to prepare the perf sample.
+ */
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
+   struct perf_sample_data *data,
+   u64 *prev_tb,
+   struct perf_event_header *header,
+   struct perf_event *event)
+{
+   /* Sanity checks for a valid record */
+   if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
+   *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
+   else
+   return -EINVAL;
+
+   if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
+be64_to_cpu(READ_ONCE(mem->tb2)))
+   return -EINVAL;
+
+   /* Prepare perf sample */
+   data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
+   data->period = event->hw.last_period;
+
+   header->type = PERF_RECORD_SAMPLE;
+   header->size = sizeof(*header) + event->header_size;
+   header->misc = 0;
+
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+
+   perf_event_header__init_id(header, data, event);
+
+   return 0;
+}
+
+static void dump_trace_imc_data(struct perf_event *event)
+{
+   struct trace_imc_data *mem;
+   int i, ret;
+   u64 prev_tb = 0;
+
+   mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
+   for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
+   i++, mem++) {
+   struct perf_sample_data data;
+   struct perf_event_header header;
+
+   ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, 
event);
+   if (ret) /* Exit, if not a valid record */
+   break;
+   else {
+   /* If this is a valid record, create the sample */
+   struct perf_output_handle handle;
+
+   if (perf_output_begin(&handle, event, header.size))
+   return;
+
+   perf_output_sample(&handle, &header, &data, event);
+   perf_output_end(&handle);
+   }
+   }
+}
+
+static int trace_imc_event_add(struct perf_event *event, int flags)
+{
+   int core_id = smp_processor_id() / threads_per_core;
+   struct imc_pmu_ref *ref = NULL;
+   u64 local_mem, ldbar_value;
+
+   /* Set trace-imc bit in ldbar and load ldbar with per-thread memory 
address */
+   local_mem = get_trace_imc_event_base_addr();
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
TRACE_IMC_ENABLE;
+
+   if (core_imc_refc)
+   ref = &core_imc_refc[core_id];
+   if (!ref) {
+   /* If core-imc is not enabled, use trace-imc reference count */
+   if (trace_imc_refc)
+   ref = &trace_imc_refc[core_id];
+   if (!ref)
+   return -EINVAL;
+   }
+   mtspr(SPRN_LDBAR, ldbar_value);
+   mutex_lock(&ref->lock);
+   if (ref->refc == 0) {
+   if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
+   get_hard_smp_processor_id(smp_processor_id( 
{
+   mutex_unlock(&ref->lock);
+   pr_err("trace-imc: Unable to start the counters for 
core %d\n", core_id);
+   mtspr(SPRN_LDBAR, 0);
+   return -EINVAL;
+   }
+   }
+   ++ref->refc;
+   mutex_unlock(&ref->lock);
+
+   return 0;
+}
+
+static void trace_imc_event_read(struct perf_event *event)
+{
+   return;
+}
+
+static void trace_imc_event_stop(struct perf_event *event, int flags)
+{
+   u64 local_mem = get_trace_imc_event_base_addr();
+   dump_trace_imc_data(event);
+   memset((void *)local_mem, 0, sizeof(u64));
+}
+
+static void trace_imc_event_start(struct perf_event *event, int flags)
+{
+   return;
+}
+
+static void trace_imc_event_del(struct perf_event *event, int flags)
+{
+   int core_id = smp_proc

Re: [PATCH v4 0/5] powerpc/perf: IMC trace-mode support

2019-04-16 Thread Anju T Sudhakar

Hi,

Kindly ignore this series, since patch 5/5 in this series doesn't 
incorporate the event-format change


that I've done in v4 of this series.


Apologies for the inconvenience. I will post the updated v5 soon.


Thanks,

Anju

On 4/15/19 3:41 PM, Anju T Sudhakar wrote:

IMC (In-Memory collection counters) is a hardware monitoring facility
that collects large number of hardware performance events.
POWER9 supports two modes for IMC, which are the Accumulation mode and
Trace mode. In Accumulation mode, event counts are accumulated in system
memory. The hypervisor then reads the posted counts periodically or when
requested. In IMC Trace mode, the 64 bit trace scom value is initialized
with the event information. The CPMC*SEL and CPMC_LOAD in the trace scom, 
specifies
the event to be monitored and the sampling duration. On each overflow in the
CPMC*SEL, hardware snapshots the program counter along with event counts
and writes into memory pointed by LDBAR. LDBAR has bits to indicate whether
hardware is configured for accumulation or trace mode.
Currently the event monitored for trace-mode is fixed as cycle.

Trace-IMC Implementation:
--
To enable trace-imc, we need to

* Add trace node in the DTS file for power9, so that the new trace node can
be discovered by the kernel.

Information included in the DTS file are as follows, (a snippet from
the ima-catalog)

TRACE_IMC: trace-events {
  #address-cells = <0x1>;
  #size-cells = <0x1>;
  event at 1020 {
 event-name = "cycles" ;
 reg = <0x1020 0x8>;
 desc = "Reference cycles" ;
  };
  };
  trace@0 {
 compatible = "ibm,imc-counters";
 events-prefix = "trace_";
 reg = <0x0 0x8>;
 events = < &TRACE_IMC >;
 type = <0x2>;
 size = <0x4>;
  };

OP-BUILD changes needed to include the "trace node" is already pulled in
to the ima-catalog repo.

https://github.com/open-power/op-build/commit/d3e75dc26d1283d7d5eb444bff1ec9e40d5dfc07

* Enhance the opal_imc_counters_* calls to support this new trace mode
in imc. Add support to initialize the trace-mode scom.

TRACE_IMC_SCOM bit representation:

0:1 : SAMPSEL
2:33: CPMC_LOAD
34:40   : CPMC1SEL
41:47   : CPMC2SEL
48:50   : BUFFERSIZE
51:63   : RESERVED

CPMC_LOAD contains the sampling duration. SAMPSEL and CPMC*SEL determine
the event to count. BUFFERSIZE indicates the memory range. On each overflow,
hardware snapshots the program counter along with event counts, updates the
memory, and reloads the CPMC_LOAD value for the next sampling duration.
IMC hardware does not support exceptions, so it quietly wraps around if
memory buffer reaches the end.

OPAL support for IMC trace mode is already upstream.

* Set LDBAR spr to enable imc-trace mode.
  
   LDBAR Layout:
  
   0 : Enable/Disable

   1 : 0 -> Accumulation Mode
   1 -> Trace Mode
   2:3   : Reserved
   4-6   : PB scope
   7 : Reserved
   8:50  : Counter Address
   51:63 : Reserved

--

PMI interrupt handling is avoided, since IMC trace mode snapshots the
program counter and updates the memory. This also provides a way for
the operating system to do instruction sampling in real time without
PMI (Performance Monitoring Interrupts) processing overhead. 


Performance data using 'perf top' with and without trace-imc event:

PMI interrupts count when `perf top` command is executed without trace-imc 
event.

# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306
804804804804804804804
804804   1961   1602804804   1258
[-]
803803803803803803803
803803803803   

[PATCH v4 0/5] powerpc/perf: IMC trace-mode support

2019-04-16 Thread Anju T Sudhakar
   
  0 : Enable/Disable 
  1 : 0 -> Accumulation Mode 
  1 -> Trace Mode
  2:3   : Reserved   
  4-6   : PB scope   
  7 : Reserved   
  8:50  : Counter Address
  51:63 : Reserved 

--

PMI interrupt handling is avoided, since IMC trace mode snapshots the
program counter and updates the memory. This also provides a way for
the operating system to do instruction sampling in real time without
PMI (Performance Monitoring Interrupts) processing overhead. 


Performance data using 'perf top' with and without trace-imc event:

PMI interrupts count when `perf top` command is executed without trace-imc 
event.

# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803803804804804 
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   


`perf top` with trace-imc (executed right after 'perf top' without trace-imc 
event):

# perf top -e trace_imc/trace_cycles/  
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt   
11.22%  [kernel]  [k] rcu_idle_enter   
10.25%  [kernel]  [k] find_next_bit
 7.91%  [kernel]  [k] do_idle  
 7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
 5.20%  [kernel]  [k] tick_nohz_idle_stop_tick 
 [---]  

# cat /proc/interrupts (a snippet from the output) 

9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803804804804804
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   

The PMI interrupts count remains the same.


Changelog:
--
From v3 -> v4:

* trace_imc_refc is introduced. So that even if, core-imc
is disabled, trace-imc can be used.

* trace_imc_pmu_sched_task is removed and opal start/stop
is invoked in trace_imc_event_add/del function.

 
Suggestions/comments are welcome.

Anju T Sudhakar (4):   
  powerpc/include: Add data structures and macros for IMC trace mode   
  powerpc/perf: Rearrange setting of ldbar for thread-imc  
  powerpc/perf: Trace imc events detection and cpuhotplug  
  powerpc/perf: Trace imc PMU functions
   
Madhavan Srinivasan (1):   
  powerpc/perf: Add privileged access check for thread_imc 
 

[PATCH v4 1/5] powerpc/include: Add data structures and macros for IMC trace mode

2019-04-16 Thread Anju T Sudhakar
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode
and data structure to hold the trace-imc record data.
Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since
there is a new switch case added in the opal-calls for IMC.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h  | 39 +
 arch/powerpc/include/asm/opal-api.h |  1 +
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 69f516ecb2fd..7c2ef0e42661 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -33,6 +33,7 @@
  */
 #define THREAD_IMC_LDBAR_MASK   0x0003e000ULL
 #define THREAD_IMC_ENABLE   0x8000ULL
+#define TRACE_IMC_ENABLE   0x4000ULL
 
 /*
  * For debugfs interface for imc-mode and imc-command
@@ -59,6 +60,34 @@ struct imc_events {
char *scale;
 };
 
+/*
+ * Trace IMC hardware updates a 64bytes record on
+ * Core Performance Monitoring Counter (CPMC)
+ * overflow. Here is the layout for the trace imc record
+ *
+ * DW 0 : Timebase
+ * DW 1 : Program Counter
+ * DW 2 : PIDR information
+ * DW 3 : CPMC1
+ * DW 4 : CPMC2
+ * DW 5 : CPMC3
+ * Dw 6 : CPMC4
+ * DW 7 : Timebase
+ * .
+ *
+ * The following is the data structure to hold trace imc data.
+ */
+struct trace_imc_data {
+   u64 tb1;
+   u64 ip;
+   u64 val;
+   u64 cpmc1;
+   u64 cpmc2;
+   u64 cpmc3;
+   u64 cpmc4;
+   u64 tb2;
+};
+
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
@@ -68,6 +97,13 @@ struct imc_events {
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xULL
 
+/*
+ * Macro to mask bits 0:21 of first double word(which is the timebase) to
+ * compare with 8th double word (timebase) of trace imc record data.
+ */
+#define IMC_TRACE_RECORD_TB1_MASK  0x3ffULL
+
+
 /*
  * Device tree parser code detects IMC pmu support and
  * registers new IMC pmus. This structure will hold the
@@ -113,6 +149,7 @@ struct imc_pmu_ref {
 
 enum {
IMC_TYPE_THREAD = 0x1,
+   IMC_TYPE_TRACE  = 0x2,
IMC_TYPE_CORE   = 0x4,
IMC_TYPE_CHIP   = 0x10,
 };
@@ -123,6 +160,8 @@ enum {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_THREAD  3
+/* For trace-imc the domain is still thread but it operates in trace-mode */
+#define IMC_DOMAIN_TRACE   4
 
 extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..a4130b21b159 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1118,6 +1118,7 @@ enum {
 enum {
OPAL_IMC_COUNTERS_NEST = 1,
OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_TRACE = 3,
 };
 
 
-- 
2.17.2



[PATCH v4 3/5] powerpc/perf: Add privileged access check for thread_imc

2019-04-16 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Add code to restrict user access to thread_imc pmu since
some events report privilege level information.

Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support')
Signed-off-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 51f1d3eaaa6d..7fe258e17dfe 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -859,6 +859,9 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
/* Sampling not supported */
if (event->hw.sample_period)
return -EINVAL;
-- 
2.17.2



[PATCH v4 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc

2019-04-16 Thread Anju T Sudhakar
LDBAR holds the memory address allocated for each cpu. For thread-imc
the mode bit (i.e bit 1) of LDBAR is set to accumulation.
Currently, ldbar is loaded with per cpu memory address and mode set to
accumulation at boot time.

To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to
accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc
to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del().

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index b1c37cc3fa98..51f1d3eaaa6d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -788,8 +788,11 @@ static int core_imc_event_init(struct perf_event *event)
 }
 
 /*
- * Allocates a page of memory for each of the online cpus, and write the
- * physical base address of that page to the LDBAR for that cpu.
+ * Allocates a page of memory for each of the online cpus, and load
+ * LDBAR with 0.
+ * The physical base address of the page allocated for a cpu will be
+ * written to the LDBAR for that cpu, when the thread-imc event
+ * is added.
  *
  * LDBAR Register Layout:
  *
@@ -807,7 +810,7 @@ static int core_imc_event_init(struct perf_event *event)
  */
 static int thread_imc_mem_alloc(int cpu_id, int size)
 {
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
+   u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
int nid = cpu_to_node(cpu_id);
 
if (!local_mem) {
@@ -824,9 +827,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}
 
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-
-   mtspr(SPRN_LDBAR, ldbar_value);
+   mtspr(SPRN_LDBAR, 0);
return 0;
 }
 
@@ -977,6 +978,7 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
 {
int core_id;
struct imc_pmu_ref *ref;
+   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());
 
if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -985,6 +987,9 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
return -EINVAL;
 
core_id = smp_processor_id() / threads_per_core;
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+
/*
 * imc pmus are enabled only when it is used.
 * See if this is triggered for the first time.
@@ -1016,11 +1021,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;
struct imc_pmu_ref *ref;
 
-   /*
-* Take a snapshot and calculate the delta and update
-* the event counter values.
-*/
-   imc_event_update(event);
+   mtspr(SPRN_LDBAR, 0);
 
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
@@ -1039,6 +1040,11 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+   /*
+* Take a snapshot and calculate the delta and update
+* the event counter values.
+*/
+   imc_event_update(event);
 }
 
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
-- 
2.17.2



[PATCH v4 4/5] powerpc/perf: Trace imc events detection and cpuhotplug

2019-04-16 Thread Anju T Sudhakar
Patch detects trace-imc events, does memory initializations for each online
cpu, and registers cpuhotplug call-backs.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c   | 104 ++
 arch/powerpc/platforms/powernv/opal-imc.c |   3 +
 include/linux/cpuhotplug.h|   1 +
 3 files changed, 108 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 7fe258e17dfe..3fe0222885bc 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -43,6 +43,11 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem);
 static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
+/* Trace IMC data structures */
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
+static struct imc_pmu_ref *trace_imc_refc;
+static int trace_imc_mem_size;
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1050,6 +1055,59 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
imc_event_update(event);
 }
 
+/*
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
+ */
+static int trace_imc_mem_alloc(int cpu_id, int size)
+{
+   u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
+   int phys_id = cpu_to_node(cpu_id), rc = 0;
+   int core_id = (cpu_id / threads_per_core);
+
+   if (!local_mem) {
+   local_mem = page_address(alloc_pages_node(phys_id,
+   GFP_KERNEL | __GFP_ZERO | 
__GFP_THISNODE |
+   __GFP_NOWARN, get_order(size)));
+   if (!local_mem)
+   return -ENOMEM;
+   per_cpu(trace_imc_mem, cpu_id) = local_mem;
+
+   /* Initialise the counters for trace mode */
+   rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void 
*)local_mem),
+   get_hard_smp_processor_id(cpu_id));
+   if (rc) {
+   pr_info("IMC:opal init failed for trace imc\n");
+   return rc;
+   }
+   }
+
+   /* Init the mutex, if not already */
+   trace_imc_refc[core_id].id = core_id;
+   mutex_init(&trace_imc_refc[core_id].lock);
+
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
+{
+   return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+}
+
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int trace_imc_cpu_init(void)
+{
+   return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ "perf/powerpc/imc_trace:online",
+ ppc_trace_imc_cpu_online,
+ ppc_trace_imc_cpu_offline);
+}
+
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
 static int update_pmu_ops(struct imc_pmu *pmu)
 {
@@ -1172,6 +1230,18 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+static void cleanup_all_trace_imc_memory(void)
+{
+   int i, order = get_order(trace_imc_mem_size);
+
+   for_each_online_cpu(i) {
+   if (per_cpu(trace_imc_mem, i))
+   free_pages((u64)per_cpu(trace_imc_mem, i), order);
+
+   }
+   kfree(trace_imc_refc);
+}
+
 /* Function to free the attr_groups which are dynamically allocated */
 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
 {
@@ -1213,6 +1283,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
+
+   if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
+   cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
+   cleanup_all_trace_imc_memory();
+   }
 }
 
 /*
@@ -1295,6 +1370,27 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
 
thread_imc_pmu = pmu_ptr;
break;
+   case IMC_DOMAIN_TRACE:
+   /* Update the pmu name */
+   pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
+   if (!pmu_ptr->pmu.name)
+   return -ENOMEM;
+
+   nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
+   trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
+   GFP_KERNEL);
+   if (!trace_imc_refc)
+   return -ENOMEM;
+
+   trace_imc_mem_size = pmu_ptr->counter_mem_size;
+   for_each_online_cpu(cpu) {
+   res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+   if

[PATCH v4 5/5] powerpc/perf: Trace imc PMU functions

2019-04-16 Thread Anju T Sudhakar
Add PMU functions to support trace-imc.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 205 +++-
 1 file changed, 204 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 3fe0222885bc..cc9724561bf2 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -53,7 +53,7 @@ static struct imc_pmu *imc_event_to_pmu(struct perf_event 
*event)
return container_of(event->pmu, struct imc_pmu, pmu);
 }
 
-PMU_FORMAT_ATTR(event, "config:0-40");
+PMU_FORMAT_ATTR(event, "config:0-61");
 PMU_FORMAT_ATTR(offset, "config:0-31");
 PMU_FORMAT_ATTR(rvalue, "config:32");
 PMU_FORMAT_ATTR(mode, "config:33-40");
@@ -70,6 +70,25 @@ static struct attribute_group imc_format_group = {
.attrs = imc_format_attrs,
 };
 
+/* Format attribute for imc trace-mode */
+PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19");
+PMU_FORMAT_ATTR(cpmc_event, "config:20-27");
+PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29");
+PMU_FORMAT_ATTR(cpmc_load, "config:30-61");
+static struct attribute *trace_imc_format_attrs[] = {
+   &format_attr_event.attr,
+   &format_attr_cpmc_reserved.attr,
+   &format_attr_cpmc_event.attr,
+   &format_attr_cpmc_samplesel.attr,
+   &format_attr_cpmc_load.attr,
+   NULL,
+};
+
+static struct attribute_group trace_imc_format_group = {
+.name = "format",
+.attrs = trace_imc_format_attrs,
+};
+
 /* Get the cpumask printed to a buffer "buf" */
 static ssize_t imc_pmu_cpumask_get_attr(struct device *dev,
struct device_attribute *attr,
@@ -1108,6 +1127,182 @@ static int trace_imc_cpu_init(void)
  ppc_trace_imc_cpu_offline);
 }
 
+static u64 get_trace_imc_event_base_addr(void)
+{
+   return (u64)per_cpu(trace_imc_mem, smp_processor_id());
+}
+
+/*
+ * Function to parse trace-imc data obtained
+ * and to prepare the perf sample.
+ */
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
+   struct perf_sample_data *data,
+   u64 *prev_tb,
+   struct perf_event_header *header,
+   struct perf_event *event)
+{
+   /* Sanity checks for a valid record */
+   if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
+   *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
+   else
+   return -EINVAL;
+
+   if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
+be64_to_cpu(READ_ONCE(mem->tb2)))
+   return -EINVAL;
+
+   /* Prepare perf sample */
+   data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
+   data->period = event->hw.last_period;
+
+   header->type = PERF_RECORD_SAMPLE;
+   header->size = sizeof(*header) + event->header_size;
+   header->misc = 0;
+
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+
+   perf_event_header__init_id(header, data, event);
+
+   return 0;
+}
+
+static void dump_trace_imc_data(struct perf_event *event)
+{
+   struct trace_imc_data *mem;
+   int i, ret;
+   u64 prev_tb = 0;
+
+   mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
+   for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
+   i++, mem++) {
+   struct perf_sample_data data;
+   struct perf_event_header header;
+
+   ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, 
event);
+   if (ret) /* Exit, if not a valid record */
+   break;
+   else {
+   /* If this is a valid record, create the sample */
+   struct perf_output_handle handle;
+
+   if (perf_output_begin(&handle, event, header.size))
+   return;
+
+   perf_output_sample(&handle, &header, &data, event);
+   perf_output_end(&handle);
+   }
+   }
+}
+
+static int trace_imc_event_add(struct perf_event *event, int flags)
+{
+   int core_id = smp_processor_id() / threads_per_core;
+   struct imc_pmu_ref *ref = NULL;
+   u64 local_mem, ldbar_value;
+
+   /* Set trace-imc bit in ldbar and load ldbar with per-thread memory 
address */
+   local_mem = get_trace_imc_event_base_addr();
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
TRACE_IMC_ENABLE;
+
+   if (core_imc_refc)
+   ref = &core_imc_ref

Re: [PATCH v4 0/5] powerpc/perf: IMC trace-mode support

2019-04-16 Thread Anju T Sudhakar



On 4/16/19 3:14 PM, Anju T Sudhakar wrote:

Hi,

Kindly ignore this series, since patch 5/5 in this series doesn't 
incorporate the event-format change


that I've done in v4 of this series.


Apologies for the inconvenience. I will post the updated v5 soon.



s/v5/v4



Thanks,

Anju

On 4/15/19 3:41 PM, Anju T Sudhakar wrote:

IMC (In-Memory collection counters) is a hardware monitoring facility
that collects large number of hardware performance events.
POWER9 support two modes for IMC which are the Accumulation mode and
Trace mode. In Accumulation mode, event counts are accumulated in system
Memory. Hypervisor then reads the posted counts periodically or when
requested. In IMC Trace mode, the 64 bit trace scom value is initialized
with the event information. The CPMC*SEL and CPMC_LOAD in the trace 
scom, specifies
the event to be monitored and the sampling duration. On each overflow 
in the

CPMC*SEL, hardware snapshots the program counter along with event counts
and writes into memory pointed by LDBAR. LDBAR has bits to indicate 
whether

hardware is configured for accumulation or trace mode.
Currently the event monitored for trace-mode is fixed as cycle.

Trace-IMC Implementation:
--
To enable trace-imc, we need to

* Add trace node in the DTS file for power9, so that the new trace 
node can

be discovered by the kernel.

Information included in the DTS file are as follows, (a snippet from
the ima-catalog)

TRACE_IMC: trace-events {
  #address-cells = <0x1>;
  #size-cells = <0x1>;
  event at 1020 {
 event-name = "cycles" ;
 reg = <0x1020 0x8>;
 desc = "Reference cycles" ;
  };
  };
  trace@0 {
 compatible = "ibm,imc-counters";
 events-prefix = "trace_";
 reg = <0x0 0x8>;
 events = < &TRACE_IMC >;
 type = <0x2>;
 size = <0x4>;
  };

OP-BUILD changes needed to include the "trace node" is already pulled in
to the ima-catalog repo.

ps://github.com/open-power/op-build/commit/d3e75dc26d1283d7d5eb444bff1ec9e40d5dfc07 



* Enhance the opal_imc_counters_* calls to support this new trace mode
in imc. Add support to initialize the trace-mode scom.

TRACE_IMC_SCOM bit representation:

0:1 : SAMPSEL
2:33    : CPMC_LOAD
34:40   : CPMC1SEL
41:47   : CPMC2SEL
48:50   : BUFFERSIZE
51:63   : RESERVED

CPMC_LOAD contains the sampling duration. SAMPSEL and CPMC*SEL determine
the event to count. BUFFERSIZE indicates the memory range. On each
overflow,
hardware snapshots program counter along with event counts and update 
the

memory and reloads the CMPC_LOAD value for the next sampling duration.
IMC hardware does not support exceptions, so it quietly wraps around if
memory buffer reaches the end.

OPAL support for IMC trace mode is already upstream.

* Set LDBAR spr to enable imc-trace mode.
   LDBAR Layout:
   0 : Enable/Disable
   1 : 0 -> Accumulation Mode
   1 -> Trace Mode
   2:3   : Reserved
   4-6   : PB scope
   7 : Reserved
   8:50  : Counter Address
   51:63 : Reserved

--

PMI interrupt handling is avoided, since IMC trace mode snapshots the
program counter and updates the memory. This also provides a way for
the operating system to do instruction sampling in real time without
PMI (Performance Monitoring Interrupts) processing overhead.
Performance data using 'perf top' with and without trace-imc event:

PMI interrupts count when `perf top` command is executed without 
trace-imc event.


# cat /proc/interrupts  (a snippet from the output)
9944  1072    804    804   1644    804 1306
804    804    804    804    804 804    804
804    804   1961   1602    804    804 1258
[-]
803    803    803    803    803 803    803
803    803    803    803    804 804    804
804    804    804    804    804 804    803
803    803    803    803    803 1306    803
803   Performance monitoring interrupts


`perf top` with trace-imc (executed right after 'perf top' without 
trace-imc event):


# perf top -e trace_imc/trace_cycles/
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt
11.22%  [kernel]  [k] rcu_idle_enter
10.25%  [kernel]  [k] find_next_bit
  7.91%  [kernel]  [k] do_idle
  7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
  5.20%  [kernel]  [k] tick_nohz_idle_stop_tick
  [---]

# cat /proc/interrupts (a snippet from the output)

9944  1072    804    804   1644    804 1306
804    804    804    804    804 804    804
804    804   1961   1602    804    804 1258
[---

[PATCH v3 1/2] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.

2020-02-27 Thread Anju T Sudhakar
IMC(In-memory Collection Counters) does performance monitoring in
two different modes, i.e accumulation mode(core-imc and thread-imc events),
and trace mode(trace-imc events). A cpu thread can either be in
accumulation-mode or trace-mode at a time and this is done via the LDBAR
register in POWER architecture. The current design does not address the
races between thread-imc and trace-imc events.

Patch implements a global id and lock to avoid the races between
core, trace and thread imc events. With this global id-lock
implementation, the system can either run core, thread or trace imc
events at a time. i.e. to run any core-imc events, thread/trace imc events
should not be enabled/monitored.

Signed-off-by: Anju T Sudhakar 
---
Changes from v2->v3:

- Addressed the off-line comments from Michael Ellerman
- Optimized the *_event_init code path for trace, core and thread imc
- Handled the global refc in cpuhotplug scenario
- Re-order the patch series
- Removed the selftest patches and will send as a follow up patch

Changes from v1 -> v2:

- Added self test patches to the series.
---
 arch/powerpc/perf/imc-pmu.c | 165 ++--
 1 file changed, 141 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..a366e2ec0351 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
 static struct imc_pmu_ref *trace_imc_refc;
 static int trace_imc_mem_size;

+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+   .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+   .id = 0,
+   .refc = 0,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -698,6 +708,13 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return -EINVAL;

ref->refc = 0;
+   /*
+* Reduce the global reference count, if this is the
+* last cpu in this core and core-imc event running
+* in this cpu.
+*/
+   if (imc_global_refc.id == IMC_DOMAIN_CORE)
+   imc_global_refc.refc--;
}
return 0;
 }
@@ -710,6 +727,23 @@ static int core_imc_pmu_cpumask_init(void)
 ppc_core_imc_cpu_offline);
 }

+static void reset_global_refc(struct perf_event *event)
+{
+   mutex_lock(&imc_global_refc.lock);
+   imc_global_refc.refc--;
+
+   /*
+* If no other thread is running any
+* event for this domain(thread/core/trace),
+* set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+}
+
 static void core_imc_counters_release(struct perf_event *event)
 {
int rc, core_id;
@@ -759,6 +793,8 @@ static void core_imc_counters_release(struct perf_event 
*event)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+
+   reset_global_refc(event);
 }

 static int core_imc_event_init(struct perf_event *event)
@@ -819,6 +855,29 @@ static int core_imc_event_init(struct perf_event *event)
++ref->refc;
mutex_unlock(&ref->lock);

+   /*
+* Since the system can run either in accumulation or trace-mode
+* of IMC at a time, core-imc events are allowed only if no other
+* trace/thread imc events are enabled/monitored.
+*
+* Take the global lock, and check the refc.id
+* to know whether any other trace/thread imc
+* events are running.
+*/
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
+   /*
+* No other trace/thread imc events are running in
+* the system, so set the refc.id to core-imc.
+*/
+   imc_global_refc.id = IMC_DOMAIN_CORE;
+   imc_global_refc.refc++;
+   } else {
+   mutex_unlock(&imc_global_refc.lock);
+   return -EBUSY;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+
event->hw.event_base = (u64)pcmi->vbase + (config & 
IMC_EVENT_OFFSET_MASK);
event->destroy = core_imc_counters_release;
return 0;
@@ -877,7 +936,20 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)

 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
 {
-   mtspr(SPRN_LDBAR, 0);
+   /*
+* Set the bit 0 of LDBAR to zero.
+*
+* If bit 0 of

[PATCH v3 2/2] powerpc/powernv: Re-enable imc trace-mode in kernel

2020-02-27 Thread Anju T Sudhakar
commit 249fad734a25 ("powerpc/perf: Disable trace_imc pmu")
disables IMC (In-Memory Collection) trace-mode in the kernel, since frequent
mode switching between accumulation mode and trace mode via the SPR LDBAR
in the hardware can trigger a checkstop (system crash).

Patch to re-enable imc-trace mode in kernel.

The previous patch(1/2) in this series will address the mode switching issue
by implementing a global lock, and will restrict the usage of
accumulation and trace-mode at a time.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 000b350d4060..3b4518f4b643 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -278,14 +278,7 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
-   /*
-* FIXME. Using trace_imc events to monitor application
-* or KVM thread performance can cause a checkstop
-* (system crash).
-* Disable it for now.
-*/
-   pr_info_once("IMC: disabling trace_imc PMU\n");
-   domain = -1;
+   domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
-- 
2.20.1



[PATCH v4 1/2] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.

2020-03-12 Thread Anju T Sudhakar
IMC(In-memory Collection Counters) does performance monitoring in
two different modes, i.e accumulation mode(core-imc and thread-imc events),
and trace mode(trace-imc events). A cpu thread can either be in
accumulation-mode or trace-mode at a time and this is done via the LDBAR
register in POWER architecture. The current design does not address the
races between thread-imc and trace-imc events.

Patch implements a global id and lock to avoid the races between
core, trace and thread imc events. With this global id-lock
implementation, the system can either run core, thread or trace imc
events at a time. i.e. to run any core-imc events, thread/trace imc events
should not be enabled/monitored.

Signed-off-by: Anju T Sudhakar 
---
Changes from v3->v4:

- Added mutex lock for thread, core and trace imc cpu offline path.

Changes from v2->v3:

- Addressed the off-line comments from Michael Ellerman
- Optimized the *_event_init code path for trace, core and thread imc
- Handled the global refc in cpuhotplug scenario
- Re-order the patch series
- Removed the selftest patches and will send as a follow up patch

Changes from v1 -> v2:

- Added self test patches to the series.

---
 arch/powerpc/perf/imc-pmu.c | 173 +++-
 1 file changed, 149 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..eb82dda884e5 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
 static struct imc_pmu_ref *trace_imc_refc;
 static int trace_imc_mem_size;
 
+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+   .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+   .id = 0,
+   .refc = 0,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -698,6 +708,16 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return -EINVAL;
 
ref->refc = 0;
+   /*
+* Reduce the global reference count, if this is the
+* last cpu in this core and core-imc event running
+* in this cpu.
+*/
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_CORE)
+   imc_global_refc.refc--;
+
+   mutex_unlock(&imc_global_refc.lock);
}
return 0;
 }
@@ -710,6 +730,23 @@ static int core_imc_pmu_cpumask_init(void)
 ppc_core_imc_cpu_offline);
 }
 
+static void reset_global_refc(struct perf_event *event)
+{
+   mutex_lock(&imc_global_refc.lock);
+   imc_global_refc.refc--;
+
+   /*
+* If no other thread is running any
+* event for this domain(thread/core/trace),
+* set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+}
+
 static void core_imc_counters_release(struct perf_event *event)
 {
int rc, core_id;
@@ -759,6 +796,8 @@ static void core_imc_counters_release(struct perf_event 
*event)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+
+   reset_global_refc(event);
 }
 
 static int core_imc_event_init(struct perf_event *event)
@@ -819,6 +858,29 @@ static int core_imc_event_init(struct perf_event *event)
++ref->refc;
mutex_unlock(&ref->lock);
 
+   /*
+* Since the system can run either in accumulation or trace-mode
+* of IMC at a time, core-imc events are allowed only if no other
+* trace/thread imc events are enabled/monitored.
+*
+* Take the global lock, and check the refc.id
+* to know whether any other trace/thread imc
+* events are running.
+*/
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
+   /*
+* No other trace/thread imc events are running in
+* the system, so set the refc.id to core-imc.
+*/
+   imc_global_refc.id = IMC_DOMAIN_CORE;
+   imc_global_refc.refc++;
+   } else {
+   mutex_unlock(&imc_global_refc.lock);
+   return -EBUSY;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+
event->hw.event_base = (u64)pcmi->vbase + (config & 
IMC_EVENT_OFFSET_MASK);
event->destroy = core_imc_counters_release;
return 0;
@@ -877,7 +939,23 @@ static int ppc_t

[PATCH v4 2/2] powerpc/powernv: Re-enable imc trace-mode in kernel

2020-03-12 Thread Anju T Sudhakar
commit 249fad734a25 ("powerpc/perf: Disable trace_imc pmu")
disables IMC (In-Memory Collection) trace-mode in the kernel, since frequent
mode switching between accumulation mode and trace mode via the SPR LDBAR
in the hardware can trigger a checkstop (system crash).

Patch to re-enable imc-trace mode in kernel.

The previous patch(1/2) in this series will address the mode switching issue
by implementing a global lock, and will restrict the usage of
accumulation and trace-mode at a time.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 968b9a4d1cd9..7824cc364bc4 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -268,14 +268,7 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
-   /*
-* FIXME. Using trace_imc events to monitor application
-* or KVM thread performance can cause a checkstop
-* (system crash).
-* Disable it for now.
-*/
-   pr_info_once("IMC: disabling trace_imc PMU\n");
-   domain = -1;
+   domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
-- 
2.20.1



[PATCH v3] platforms/powernv: Avoid re-registration of imc debugfs directory

2019-11-26 Thread Anju T Sudhakar
The export_imc_mode_and_cmd() function, which creates the debugfs interface for
imc-mode and imc-command, is invoked when each nest pmu unit is
registered.
When the first nest pmu unit is registered, export_imc_mode_and_cmd()
creates 'imc' directory under `/debug/powerpc/`. In the subsequent
invocations debugfs_create_dir() function returns, since the directory
already exists.

The recent commit  (debugfs: make error message a bit more
verbose), throws a warning if we try to invoke `debugfs_create_dir()`
with an already existing directory name.

Address this warning by making the debugfs directory registration
in the opal_imc_counters_probe() function, i.e invoke
export_imc_mode_and_cmd() function from the probe function.

Signed-off-by: Anju T Sudhakar 
---
Changes from v2 -> v3:

* Invoke export_imc_mode_and_cmd(), which does the imc debugfs
  directory registration and deletion, from the probe function.
* Change the return type of imc_pmu_create() to get the
  control block address for nest units in the probe
  function
* Remove unnecessary comments

---
 arch/powerpc/platforms/powernv/opal-imc.c | 39 +--
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index e04b206..3b4518f 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -59,10 +59,6 @@ static void export_imc_mode_and_cmd(struct device_node *node,
 
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
 
-   /*
-* Return here, either because 'imc' directory already exists,
-* Or failed to create a new one.
-*/
if (!imc_debugfs_parent)
return;
 
@@ -135,7 +131,6 @@ static int imc_get_mem_addr_nest(struct device_node *node,
}
 
pmu_ptr->imc_counter_mmaped = true;
-   export_imc_mode_and_cmd(node, pmu_ptr);
kfree(base_addr_arr);
kfree(chipid_arr);
return 0;
@@ -151,7 +146,7 @@ static int imc_get_mem_addr_nest(struct device_node *node,
  * and domain as the inputs.
  * Allocates memory for the struct imc_pmu, sets up its domain, size and 
offsets
  */
-static int imc_pmu_create(struct device_node *parent, int pmu_index, int 
domain)
+static struct imc_pmu *imc_pmu_create(struct device_node *parent, int 
pmu_index, int domain)
 {
int ret = 0;
struct imc_pmu *pmu_ptr;
@@ -159,27 +154,23 @@ static int imc_pmu_create(struct device_node *parent, int 
pmu_index, int domain)
 
/* Return for unknown domain */
if (domain < 0)
-   return -EINVAL;
+   return NULL;
 
/* memory for pmu */
pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL);
if (!pmu_ptr)
-   return -ENOMEM;
+   return NULL;
 
/* Set the domain */
pmu_ptr->domain = domain;
 
ret = of_property_read_u32(parent, "size", &pmu_ptr->counter_mem_size);
-   if (ret) {
-   ret = -EINVAL;
+   if (ret)
goto free_pmu;
-   }
 
if (!of_property_read_u32(parent, "offset", &offset)) {
-   if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) {
-   ret = -EINVAL;
+   if (imc_get_mem_addr_nest(parent, pmu_ptr, offset))
goto free_pmu;
-   }
}
 
/* Function to register IMC pmu */
@@ -190,14 +181,14 @@ static int imc_pmu_create(struct device_node *parent, int 
pmu_index, int domain)
if (pmu_ptr->domain == IMC_DOMAIN_NEST)
kfree(pmu_ptr->mem_info);
kfree(pmu_ptr);
-   return ret;
+   return NULL;
}
 
-   return 0;
+   return pmu_ptr;
 
 free_pmu:
kfree(pmu_ptr);
-   return ret;
+   return NULL;
 }
 
 static void disable_nest_pmu_counters(void)
@@ -254,6 +245,7 @@ int get_max_nest_dev(void)
 static int opal_imc_counters_probe(struct platform_device *pdev)
 {
struct device_node *imc_dev = pdev->dev.of_node;
+   struct imc_pmu *pmu;
int pmu_count = 0, domain;
bool core_imc_reg = false, thread_imc_reg = false;
u32 type;
@@ -269,6 +261,7 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
}
 
for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) {
+   pmu = NULL;
if (of_property_read_u32(imc_dev, "type", &type)) {
pr_warn("IMC Device without type property\n");
continue;
@@ -293,9 +286,13 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
break;
}
 
-   if (!imc_pmu_create(imc_dev, pmu_count, domain)) {
-   

[PATCH 1/2] tools/perf: set no_auxtrace for powerpc

2020-04-28 Thread Anju T Sudhakar
x86/perf_regs.h is included by util/intel-pt.c, which will get compiled
when building perf on powerpc. Since x86/perf_regs.h has
`PERF_EXTENDED_REG_MASK` defined, defining `PERF_EXTENDED_REG_MASK` for
powerpc to add support for perf extended regs will result in perf build
error on powerpc.

Currently the powerpc architecture does not support auxtrace. So as
a workaround for this issue, set NO_AUXTRACE for powerpc.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
index e58d00d62f02..9ebb5f513605 100644
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -3,6 +3,7 @@ ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 endif
 
+NO_AUXTRACE := 1
 HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
 PERF_HAVE_JITDUMP := 1
-- 
2.20.1



[PATCH 0/2] powerpc/perf: Add support for perf extended regs in powerpc

2020-04-28 Thread Anju T Sudhakar
Patch set to add support for perf extended register capability in
powerpc. The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to
indicate the PMU which support extended registers. The generic code
define the mask of extended registers as 0 for non supported architectures.

patch 2/2 defines this PERF_PMU_CAP_EXTENDED_REGS mask to output the
values of mmcr0,mmcr1,mmcr2 for POWER9.
   
x86/perf_regs.h is included by util/intel-pt.c, which will get compiled
when building perf on powerpc. Since x86/perf_regs.h has
`PERF_EXTENDED_REG_MASK` defined, defining `PERF_EXTENDED_REG_MASK` for
powerpc to add support for perf extended regs will result in a perf build
error on powerpc. Currently the powerpc architecture does not support
auxtrace. So as a workaround for this issue, patch 1/2 sets
NO_AUXTRACE for powerpc. (Any other solutions are welcome.)

Patch 2/2 also add extended regs to sample_reg_mask in the tool side to use
with `-I?` option.

Anju T Sudhakar (2):
  tools/perf: set no_auxtrace for powerpc
  powerpc/perf: Add support for outputting extended regs in perf
intr_regs

 arch/powerpc/include/asm/perf_event_server.h  |  5 +++
 arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++-
 arch/powerpc/perf/core-book3s.c   |  1 +
 arch/powerpc/perf/perf_regs.c | 29 ++--
 arch/powerpc/perf/power9-pmu.c|  1 +
 .../arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++-
 tools/perf/arch/powerpc/Makefile  |  1 +
 tools/perf/arch/powerpc/include/perf_regs.h   |  6 +++-
 tools/perf/arch/powerpc/util/perf_regs.c  | 33 +++
 9 files changed, 96 insertions(+), 6 deletions(-)

-- 
2.20.1



[PATCH 2/2] powerpc/perf: Add support for outputting extended regs in perf intr_regs

2020-04-28 Thread Anju T Sudhakar
The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the
PMU which support extended registers. The generic code define the mask
of extended registers as 0 for non supported architectures.

Add support for extended registers in POWER9 architecture. For POWER9,
the extended registers are mmcr0, mmcr1 and mmcr2.

REG_RESERVED mask is redefined to accommodate the extended registers.

With patch:


# perf record -I?
available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14
r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26 r27 r28 r29 r30 r31 nip
msr orig_r3 ctr link xer ccr softe trap dar dsisr sier mmcra mmcr0
mmcr1 mmcr2

# perf record -I ls
# perf script -D

PERF_RECORD_SAMPLE(IP, 0x1): 9019/9019: 0 period: 1 addr: 0
... intr regs: mask 0x ABI 64-bit
 r00xc011b12c
 r10xc03f9a98b930
 r20xc1a32100
 r30xc03f8fe9a800
 r40xc03fd181
 r50x3e32557150
 r60xc03f9a98b908
 r70xffc1cdae06ac
 r80x818
[.]
 r31   0xc03ffd047230
 nip   0xc011b2c0
 msr   0x90009033
 orig_r3 0xc011b21c
 ctr   0xc0119380
 link  0xc011b12c
 xer   0x0
 ccr   0x2800
 softe 0x1
 trap  0xf00
 dar   0x0
 dsisr 0x800
 sier  0x0
 mmcra 0x800
 mmcr0 0x82008090
 mmcr1 0x1e00
 mmcr2 0x0
 ... thread: perf:9019

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/include/asm/perf_event_server.h  |  5 +++
 arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++-
 arch/powerpc/perf/core-book3s.c   |  1 +
 arch/powerpc/perf/perf_regs.c | 29 ++--
 arch/powerpc/perf/power9-pmu.c|  1 +
 .../arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++-
 tools/perf/arch/powerpc/include/perf_regs.h   |  6 +++-
 tools/perf/arch/powerpc/util/perf_regs.c  | 33 +++
 8 files changed, 95 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_event_server.h 
b/arch/powerpc/include/asm/perf_event_server.h
index 3e9703f44c7c..1d15953bd99e 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -55,6 +55,11 @@ struct power_pmu {
int *blacklist_ev;
/* BHRB entries in the PMU */
int bhrb_nr;
+   /*
+* set this flag with `PERF_PMU_CAP_EXTENDED_REGS` if
+* the pmu supports extended perf regs capability
+*/
+   int capabilities;
 };
 
 /*
diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h 
b/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064dd8dc..604b831378fe 100644
--- a/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,17 @@ enum perf_event_powerpc_regs {
PERF_REG_POWERPC_DSISR,
PERF_REG_POWERPC_SIER,
PERF_REG_POWERPC_MMCRA,
-   PERF_REG_POWERPC_MAX,
+   /* Extended registers */
+   PERF_REG_POWERPC_MMCR0,
+   PERF_REG_POWERPC_MMCR1,
+   PERF_REG_POWERPC_MMCR2,
+   PERF_REG_EXTENDED_MAX,
+   /* Max regs without the extended regs */
+   PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
+
+#define PERF_REG_PMU_MASK  ((1ULL << PERF_REG_POWERPC_MAX) - 1)
+#define PERF_REG_EXTENDED_MASK  (((1ULL << (PERF_REG_EXTENDED_MAX))\
+   - 1) - PERF_REG_PMU_MASK)
+
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 3dcfecf858f3..f56b77800a7b 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2276,6 +2276,7 @@ int register_power_pmu(struct power_pmu *pmu)
 
power_pmu.attr_groups = ppmu->attr_groups;
 
+   power_pmu.capabilities |= (ppmu->capabilities & 
PERF_PMU_CAP_EXTENDED_REGS);
 #ifdef MSR_HV
/*
 * Use FCHV to ignore kernel events if MSR.HV is set.
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index a213a0aa5d25..57aa02568caf 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -15,7 +15,8 @@
 
 #define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
 
-#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1))
+#define REG_RESERVED (~(PERF_REG_EXTENDED_MASK) &  \
+   (~((1ULL << PERF_REG_POWERPC_MAX) - 1)))
 
 static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
PT_REGS_OFFSET(PERF_REG_POWERPC_R0,  gpr[0]),
@@ -69,10 +70,22 @@ static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
PT_REGS_OFFSET(PERF_REG_POWERPC_MMCRA, dsisr),
 };
 
+/* Function to return the extended register values */
+static u64 get_ext_regs_value(int idx)
+{
+   switch (idx) {
+   case PERF_REG_POWERPC_MMCR0:
+

Re: [bug report] powerpc/perf: Add nest IMC PMU support

2018-10-24 Thread Anju T Sudhakar

Hi,


On 10/18/18 3:03 PM, Dan Carpenter wrote:

Hello Anju T Sudhakar,

The patch 885dcd709ba9: "powerpc/perf: Add nest IMC PMU support" from
Jul 19, 2017, leads to the following static checker warning:

arch/powerpc/perf/imc-pmu.c:506 nest_imc_event_init()
warn: 'pcni' can't be NULL.


Unfortunately this warning didn't appear when I checked with smatch.

Could you please provide the steps to reproduce this?

This is the commit id with which I build smatch: commit 
79fe36620a7a3a45d1a51d62238da250fb8db920


But anyway I am looking into the code part. Thanks for mentioning this.

I will update soon.


Thanks,

Anju





[PATCH] powerpc/perf: Fix loop exit condition in nest_imc_event_init

2018-11-27 Thread Anju T Sudhakar
The data structure (i.e struct imc_mem_info) to hold the memory address
information for nest imc units is allocated based on the number of nodes
in the system.

nest_imc_event_init() traverse this struct array to calculate the memory
base address for the event-cpu. If we fail to find a match for the event
cpu's chip-id in imc_mem_info struct array, then the do-while loop will
iterate until we crash.

Fix this by changing the loop exit condition based on the number of
nodes in the system.

Reported-by: Dan Carpenter  
Fixes: 885dcd709ba91 ( powerpc/perf: Add nest IMC PMU support)
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 78514170cf71..e9dc771f3e3d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -459,7 +459,7 @@ static void nest_imc_counters_release(struct perf_event 
*event)
 
 static int nest_imc_event_init(struct perf_event *event)
 {
-   int chip_id, rc, node_id;
+   int chip_id, rc, node_id, nr_chips = num_possible_nodes();
u32 l_config, config = event->attr.config;
struct imc_mem_info *pcni;
struct imc_pmu *pmu;
@@ -508,7 +508,7 @@ static int nest_imc_event_init(struct perf_event *event)
break;
}
pcni++;
-   } while (pcni);
+   } while (--nr_chips);
 
if (!flag)
return -ENODEV;
-- 
2.17.1



[PATCH] powerpc/perf: Return accordingly on invalid chip-id in

2018-11-27 Thread Anju T Sudhakar
Nest hardware counter memory resides in a per-chip reserve-memory.
During nest_imc_event_init(), chip-id of the event-cpu is considered to
calculate the base memory address for that cpu. Return a proper error
condition if the chip_id calculated is invalid.

Reported-by: Dan Carpenter 
Fixes: 885dcd709ba91 ("powerpc/perf: Add nest IMC PMU support")
Reviewed-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 6954636b16d1..78514170cf71 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -496,6 +496,11 @@ static int nest_imc_event_init(struct perf_event *event)
 * Get the base memory addresss for this cpu.
 */
chip_id = cpu_to_chip_id(event->cpu);
+
+   /* Return, if chip_id is not valid */
+   if (chip_id < 0)
+   return -ENODEV;
+
pcni = pmu->mem_info;
do {
if (pcni->id == chip_id) {
-- 
2.17.1



[PATCH 2/4] powerpc/perf: Rearrange setting of ldbar for thread-imc

2018-11-28 Thread Anju T Sudhakar
LDBAR holds the memory address allocated for each cpu. For thread-imc
the mode bit (i.e bit 1) of LDBAR is set to accumulation.
Currently, ldbar is loaded with per cpu memory address and mode set to
accumulation at boot time.

To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to
accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc
to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del().

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index f292a3f284f1..3bef46f8417d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -806,8 +806,11 @@ static int core_imc_event_init(struct perf_event *event)
 }
 
 /*
- * Allocates a page of memory for each of the online cpus, and write the
- * physical base address of that page to the LDBAR for that cpu.
+ * Allocates a page of memory for each of the online cpus, and load
+ * LDBAR with 0.
+ * The physical base address of the page allocated for a cpu will be
+ * written to the LDBAR for that cpu, when the thread-imc event
+ * is added.
  *
  * LDBAR Register Layout:
  *
@@ -825,7 +828,7 @@ static int core_imc_event_init(struct perf_event *event)
  */
 static int thread_imc_mem_alloc(int cpu_id, int size)
 {
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
+   u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
int nid = cpu_to_node(cpu_id);
 
if (!local_mem) {
@@ -842,9 +845,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}
 
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-
-   mtspr(SPRN_LDBAR, ldbar_value);
+   mtspr(SPRN_LDBAR, 0);
return 0;
 }
 
@@ -995,6 +996,7 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
 {
int core_id;
struct imc_pmu_ref *ref;
+   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());
 
if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -1003,6 +1005,9 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
return -EINVAL;
 
core_id = smp_processor_id() / threads_per_core;
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+
/*
 * imc pmus are enabled only when it is used.
 * See if this is triggered for the first time.
@@ -1034,11 +1039,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;
struct imc_pmu_ref *ref;
 
-   /*
-* Take a snapshot and calculate the delta and update
-* the event counter values.
-*/
-   imc_event_update(event);
+   mtspr(SPRN_LDBAR, 0);
 
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
@@ -1057,6 +1058,11 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+   /*
+* Take a snapshot and calculate the delta and update
+* the event counter values.
+*/
+   imc_event_update(event);
 }
 
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
-- 
2.17.1



[PATCH 1/4] powerpc/include: Add data structures and macros for IMC trace mode

2018-11-28 Thread Anju T Sudhakar
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode
and data structure to hold the trace-imc record data.
Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since
there is a new switch case added in the opal-calls for IMC.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/include/asm/imc-pmu.h  | 39 +
 arch/powerpc/include/asm/opal-api.h |  1 +
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 69f516ecb2fd..7c2ef0e42661 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -33,6 +33,7 @@
  */
 #define THREAD_IMC_LDBAR_MASK   0x0003e000ULL
 #define THREAD_IMC_ENABLE   0x8000ULL
+#define TRACE_IMC_ENABLE   0x4000ULL
 
 /*
  * For debugfs interface for imc-mode and imc-command
@@ -59,6 +60,34 @@ struct imc_events {
char *scale;
 };
 
+/*
+ * Trace IMC hardware updates a 64bytes record on
+ * Core Performance Monitoring Counter (CPMC)
+ * overflow. Here is the layout for the trace imc record
+ *
+ * DW 0 : Timebase
+ * DW 1 : Program Counter
+ * DW 2 : PIDR information
+ * DW 3 : CPMC1
+ * DW 4 : CPMC2
+ * DW 5 : CPMC3
+ * Dw 6 : CPMC4
+ * DW 7 : Timebase
+ * .
+ *
+ * The following is the data structure to hold trace imc data.
+ */
+struct trace_imc_data {
+   u64 tb1;
+   u64 ip;
+   u64 val;
+   u64 cpmc1;
+   u64 cpmc2;
+   u64 cpmc3;
+   u64 cpmc4;
+   u64 tb2;
+};
+
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
@@ -68,6 +97,13 @@ struct imc_events {
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xULL
 
+/*
+ * Macro to mask bits 0:21 of first double word(which is the timebase) to
+ * compare with 8th double word (timebase) of trace imc record data.
+ */
+#define IMC_TRACE_RECORD_TB1_MASK  0x3ffULL
+
+
 /*
  * Device tree parser code detects IMC pmu support and
  * registers new IMC pmus. This structure will hold the
@@ -113,6 +149,7 @@ struct imc_pmu_ref {
 
 enum {
IMC_TYPE_THREAD = 0x1,
+   IMC_TYPE_TRACE  = 0x2,
IMC_TYPE_CORE   = 0x4,
IMC_TYPE_CHIP   = 0x10,
 };
@@ -123,6 +160,8 @@ enum {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_THREAD  3
+/* For trace-imc the domain is still thread but it operates in trace-mode */
+#define IMC_DOMAIN_TRACE   4
 
 extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..a4130b21b159 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1118,6 +1118,7 @@ enum {
 enum {
OPAL_IMC_COUNTERS_NEST = 1,
OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_TRACE = 3,
 };
 
 
-- 
2.17.1



[PATCH 3/4] powerpc/perf: Trace imc events detection and cpuhotplug

2018-11-28 Thread Anju T Sudhakar
Patch detects trace-imc events, does memory initializations for each online
cpu, and registers cpuhotplug call-backs.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c   | 91 +++
 arch/powerpc/platforms/powernv/opal-imc.c |  3 +
 include/linux/cpuhotplug.h|  1 +
 3 files changed, 95 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 3bef46f8417d..d9ffe7f03f1e 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -43,6 +43,10 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem);
 static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
+/* Trace IMC data structures */
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
+static int trace_imc_mem_size;
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1065,6 +1069,54 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
imc_event_update(event);
 }
 
+/*
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
+ */
+static int trace_imc_mem_alloc(int cpu_id, int size)
+{
+   u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
+   int phys_id = cpu_to_node(cpu_id), rc = 0;
+
+   if (!local_mem) {
+   local_mem = page_address(alloc_pages_node(phys_id,
+   GFP_KERNEL | __GFP_ZERO | 
__GFP_THISNODE |
+   __GFP_NOWARN, get_order(size)));
+   if (!local_mem)
+   return -ENOMEM;
+   per_cpu(trace_imc_mem, cpu_id) = local_mem;
+
+   /* Initialise the counters for trace mode */
+   rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void 
*)local_mem),
+   get_hard_smp_processor_id(cpu_id));
+   if (rc) {
+   pr_info("IMC:opal init failed for trace imc\n");
+   return rc;
+   }
+   }
+
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
+{
+   return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+}
+
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int trace_imc_cpu_init(void)
+{
+   return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ "perf/powerpc/imc_trace:online",
+ ppc_trace_imc_cpu_online,
+ ppc_trace_imc_cpu_offline);
+}
+
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
 static int update_pmu_ops(struct imc_pmu *pmu)
 {
@@ -1186,6 +1238,17 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+static void cleanup_all_trace_imc_memory(void)
+{
+   int i, order = get_order(trace_imc_mem_size);
+
+   for_each_online_cpu(i) {
+   if (per_cpu(trace_imc_mem, i))
+   free_pages((u64)per_cpu(trace_imc_mem, i), order);
+
+   }
+}
+
 /* Function to free the attr_groups which are dynamically allocated */
 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
 {
@@ -1227,6 +1290,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
+
+   if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
+   cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
+   cleanup_all_trace_imc_memory();
+   }
 }
 
 /*
@@ -1309,6 +1377,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
 
thread_imc_pmu = pmu_ptr;
break;
+   case IMC_DOMAIN_TRACE:
+   /* Update the pmu name */
+   pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
+   if (!pmu_ptr->pmu.name)
+   return -ENOMEM;
+
+   trace_imc_mem_size = pmu_ptr->counter_mem_size;
+   for_each_online_cpu(cpu) {
+   res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+   if (res) {
+   cleanup_all_trace_imc_memory();
+   goto err;
+   }
+   }
+   break;
default:
return -EINVAL;
}
@@ -1381,6 +1464,14 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
goto err_free_mem;
}
 
+   break;
+   case IMC_DOMAIN_TRACE:
+   ret = trace_imc_cpu_init();
+   if (ret) {
+   cleanup_all_trace_imc_memory();
+   goto err_fre

[PATCH 4/4] powerpc/perf: Trace imc PMU functions

2018-11-28 Thread Anju T Sudhakar
Add PMU functions to support trace-imc.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 172 
 1 file changed, 172 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index d9ffe7f03f1e..18af7c3e2345 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1117,6 +1117,170 @@ static int trace_imc_cpu_init(void)
  ppc_trace_imc_cpu_offline);
 }
 
+static u64 get_trace_imc_event_base_addr(void)
+{
+   return (u64)per_cpu(trace_imc_mem, smp_processor_id());
+}
+
+/*
+ * Function to parse trace-imc data obtained
+ * and to prepare the perf sample.
+ */
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
+   struct perf_sample_data *data,
+   u64 *prev_tb,
+   struct perf_event_header *header,
+   struct perf_event *event)
+{
+   /* Sanity checks for a valid record */
+   if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
+   *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
+   else
+   return -EINVAL;
+
+   if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
+be64_to_cpu(READ_ONCE(mem->tb2)))
+   return -EINVAL;
+
+   /* Prepare perf sample */
+   data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
+   data->period = event->hw.last_period;
+
+   header->type = PERF_RECORD_SAMPLE;
+   header->size = sizeof(*header) + event->header_size;
+   header->misc = 0;
+
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+
+   perf_event_header__init_id(header, data, event);
+
+   return 0;
+}
+
+static void dump_trace_imc_data(struct perf_event *event)
+{
+   struct trace_imc_data *mem;
+   int i, ret;
+   u64 prev_tb = 0;
+
+   mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
+   for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
+   i++, mem++) {
+   struct perf_sample_data data;
+   struct perf_event_header header;
+
+   ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, 
event);
+   if (ret) /* Exit, if not a valid record */
+   break;
+   else {
+   /* If this is a valid record, create the sample */
+   struct perf_output_handle handle;
+
+   if (perf_output_begin(&handle, event, header.size))
+   return;
+
+   perf_output_sample(&handle, &header, &data, event);
+   perf_output_end(&handle);
+   }
+   }
+}
+
+static int trace_imc_event_add(struct perf_event *event, int flags)
+{
+   /* Enable the sched_task to start the engine */
+   perf_sched_cb_inc(event->ctx->pmu);
+   return 0;
+}
+
+static void trace_imc_event_read(struct perf_event *event)
+{
+   dump_trace_imc_data(event);
+}
+
+static void trace_imc_event_stop(struct perf_event *event, int flags)
+{
+   trace_imc_event_read(event);
+}
+
+static void trace_imc_event_start(struct perf_event *event, int flags)
+{
+   return;
+}
+
+static void trace_imc_event_del(struct perf_event *event, int flags)
+{
+   perf_sched_cb_dec(event->ctx->pmu);
+}
+
+void trace_imc_pmu_sched_task(struct perf_event_context *ctx,
+   bool sched_in)
+{
+   int core_id = smp_processor_id() / threads_per_core;
+   struct imc_pmu_ref *ref;
+   u64 local_mem, ldbar_value;
+
+   /* Set trace-imc bit in ldbar and load ldbar with per-thread memory 
address */
+   local_mem = get_trace_imc_event_base_addr();
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
TRACE_IMC_ENABLE;
+
+   ref = &core_imc_refc[core_id];
+   if (!ref)
+   return;
+
+   if (sched_in) {
+   mtspr(SPRN_LDBAR, ldbar_value);
+   mutex_lock(&ref->lock);
+   if (ref->refc == 0) {
+   if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
+   
get_hard_smp_processor_id(smp_processor_id( {
+   mutex_unlock(&ref->lock);
+   pr_err("trace-imc: Unable to start the counters 
for core %d\n", core_id);
+   mtspr(SPRN_LDBAR, 0);
+   return;
+   }
+   }
+   ++ref->refc;
+   mutex_unlock(&ref->lock);
+   } else

[PATCH 0/4] powerpc/perf: IMC trace-mode support

2018-11-28 Thread Anju T Sudhakar
0 : Enable/Disable
1 : 0 -> Accumulation Mode
1 -> Trace Mode
2:3   : Reserved
4-6   : PB scope
7 : Reserved
8:50  : Counter Address
51:63 : Reserved

   

Key benefit of imc trace-mode is, each sample record contains the address
pointer along with other information. So that, we can profile the IP
without interrupting the application.

Performance data using 'perf top' with and without trace-imc event:

When the application is monitored with trace-imc event, we don't take any   
PMI interrupts.

PMI interrupts count when `perf top` command is executed without trace-imc event.

# perf top  
12.53%  [kernel]   [k] arch_cpu_idle   
11.32%  [kernel]   [k] rcu_idle_enter  
10.76%  [kernel]   [k] __next_timer_interrupt  
 9.49%  [kernel]   [k] find_next_bit   
 8.06%  [kernel]   [k] rcu_dynticks_eqs_exit   
 7.82%  [kernel]   [k] do_idle 
 5.71%  [kernel]   [k] tick_nohz_idle_stop_tic 
 [---]  
# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803803804804804 
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   


`perf top` with trace-imc (right after 'perf top' without trace-imc event):

# perf top -e trace_imc/trace_cycles/  
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt   
11.22%  [kernel]  [k] rcu_idle_enter   
10.25%  [kernel]  [k] find_next_bit
 7.91%  [kernel]  [k] do_idle  
 7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
 5.20%  [kernel]  [k] tick_nohz_idle_stop_tick 
 [---]  

# cat /proc/interrupts (a snippet from the output) 

9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803804804804804
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   
    
The PMI interrupts count remains the same.

Anju T Sudhakar (4):
  powerpc/include: Add data structures and macros for IMC trace mode
  powerpc/perf: Rearrange setting of ldbar for thread-imc
  powerpc/perf: Trace imc events detection and cpuhotplug
  powerpc/perf: Trace imc PMU functions

 arch/powerpc/include/asm/imc-pmu.h|  39 +++
 arch/powerpc/include/asm/opal-api.h   |   1 +
 arch/powerpc/perf/imc-pmu.c   | 291 +-
 arch/powerpc/platforms/powernv/opal-imc.c |   3 +
 include/linux/cpuhotplug.h|   1 +
 5 files changed, 324 insertions(+), 11 deletions(-)

-- 
2.17.1



[PATCH v2 0/5] powerpc/perf: IMC trace-mode support

2018-12-14 Thread Anju T Sudhakar
ists.ozlabs.org/pipermail/skiboot/2018-December/012883.html

* Set LDBAR spr to enable imc-trace mode.

LDBAR Layout:

0 : Enable/Disable
1 : 0 -> Accumulation Mode
1 -> Trace Mode
2:3   : Reserved
4-6   : PB scope
7 : Reserved
8:50  : Counter Address
51:63 : Reserved

   

Key benefit of imc trace-mode is, each sample record contains the address
pointer along with other information. So that, we can profile the IP
without interrupting the application.

Performance data using 'perf top' with and without trace-imc event:

When the application is monitored with trace-imc event, we don't take any   
PMI interrupts.

PMI interrupts count when `perf top` command is executed without trace-imc event.

# perf top  
12.53%  [kernel]   [k] arch_cpu_idle   
11.32%  [kernel]   [k] rcu_idle_enter  
10.76%  [kernel]   [k] __next_timer_interrupt  
 9.49%  [kernel]   [k] find_next_bit   
 8.06%  [kernel]   [k] rcu_dynticks_eqs_exit   
 7.82%  [kernel]   [k] do_idle 
 5.71%  [kernel]   [k] tick_nohz_idle_stop_tic 
 [---]  
# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803803804804804 
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   


`perf top` with trace-imc (right after 'perf top' without trace-imc event):

# perf top -e trace_imc/trace_cycles/  
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt   
11.22%  [kernel]  [k] rcu_idle_enter   
10.25%  [kernel]  [k] find_next_bit
 7.91%  [kernel]  [k] do_idle  
 7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
 5.20%  [kernel]  [k] tick_nohz_idle_stop_tick 
 [---]  

# cat /proc/interrupts (a snippet from the output) 

9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803804804804804
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   

The PMI interrupts count remains the same.  

Changelog:

>From v1 -> v2
--

* Added privileged access check for thread-imc and trace-imc

Suggestions/comments are welcome.

Anju T Sudhakar (4):
  powerpc/include: Add data structures and macros for IMC trace mode
  powerpc/perf: Rearrange setting of ldbar for thread-imc
  powerpc/perf: Trace imc events detection and cpuhotplug
  powerpc/perf: Trace imc PMU functions

Madhavan Srinivasan (1):
  powerpc/perf: Add privileged access che

[PATCH v2 1/5] powerpc/include: Add data structures and macros for IMC trace mode

2018-12-14 Thread Anju T Sudhakar
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode
and data structure to hold the trace-imc record data.
Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since
there is a new switch case added in the opal-calls for IMC.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/include/asm/imc-pmu.h  | 39 +
 arch/powerpc/include/asm/opal-api.h |  1 +
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 69f516ecb2fd..7c2ef0e42661 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -33,6 +33,7 @@
  */
 #define THREAD_IMC_LDBAR_MASK   0x0003e000ULL
 #define THREAD_IMC_ENABLE   0x8000ULL
+#define TRACE_IMC_ENABLE   0x4000ULL
 
 /*
  * For debugfs interface for imc-mode and imc-command
@@ -59,6 +60,34 @@ struct imc_events {
char *scale;
 };
 
+/*
+ * Trace IMC hardware updates a 64bytes record on
+ * Core Performance Monitoring Counter (CPMC)
+ * overflow. Here is the layout for the trace imc record
+ *
+ * DW 0 : Timebase
+ * DW 1 : Program Counter
+ * DW 2 : PIDR information
+ * DW 3 : CPMC1
+ * DW 4 : CPMC2
+ * DW 5 : CPMC3
+ * Dw 6 : CPMC4
+ * DW 7 : Timebase
+ * .
+ *
+ * The following is the data structure to hold trace imc data.
+ */
+struct trace_imc_data {
+   u64 tb1;
+   u64 ip;
+   u64 val;
+   u64 cpmc1;
+   u64 cpmc2;
+   u64 cpmc3;
+   u64 cpmc4;
+   u64 tb2;
+};
+
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
@@ -68,6 +97,13 @@ struct imc_events {
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xULL
 
+/*
+ * Macro to mask bits 0:21 of first double word(which is the timebase) to
+ * compare with 8th double word (timebase) of trace imc record data.
+ */
+#define IMC_TRACE_RECORD_TB1_MASK  0x3ffULL
+
+
 /*
  * Device tree parser code detects IMC pmu support and
  * registers new IMC pmus. This structure will hold the
@@ -113,6 +149,7 @@ struct imc_pmu_ref {
 
 enum {
IMC_TYPE_THREAD = 0x1,
+   IMC_TYPE_TRACE  = 0x2,
IMC_TYPE_CORE   = 0x4,
IMC_TYPE_CHIP   = 0x10,
 };
@@ -123,6 +160,8 @@ enum {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_THREAD  3
+/* For trace-imc the domain is still thread but it operates in trace-mode */
+#define IMC_DOMAIN_TRACE   4
 
 extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..a4130b21b159 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1118,6 +1118,7 @@ enum {
 enum {
OPAL_IMC_COUNTERS_NEST = 1,
OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_TRACE = 3,
 };
 
 
-- 
2.17.1



[PATCH v2 4/5] powerpc/perf: Trace imc events detection and cpuhotplug

2018-12-14 Thread Anju T Sudhakar
Patch detects trace-imc events, does memory initializations for each online
cpu, and registers cpuhotplug call-backs.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c   | 91 +++
 arch/powerpc/platforms/powernv/opal-imc.c |  3 +
 include/linux/cpuhotplug.h|  1 +
 3 files changed, 95 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 5ca80545a849..1f09265c8fb0 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -43,6 +43,10 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem);
 static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
+/* Trace IMC data structures */
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
+static int trace_imc_mem_size;
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1068,6 +1072,54 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
imc_event_update(event);
 }
 
+/*
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
+ */
+static int trace_imc_mem_alloc(int cpu_id, int size)
+{
+   u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
+   int phys_id = cpu_to_node(cpu_id), rc = 0;
+
+   if (!local_mem) {
+   local_mem = page_address(alloc_pages_node(phys_id,
+   GFP_KERNEL | __GFP_ZERO | 
__GFP_THISNODE |
+   __GFP_NOWARN, get_order(size)));
+   if (!local_mem)
+   return -ENOMEM;
+   per_cpu(trace_imc_mem, cpu_id) = local_mem;
+
+   /* Initialise the counters for trace mode */
+   rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void 
*)local_mem),
+   get_hard_smp_processor_id(cpu_id));
+   if (rc) {
+   pr_info("IMC:opal init failed for trace imc\n");
+   return rc;
+   }
+   }
+
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
+{
+   return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+}
+
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int trace_imc_cpu_init(void)
+{
+   return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ "perf/powerpc/imc_trace:online",
+ ppc_trace_imc_cpu_online,
+ ppc_trace_imc_cpu_offline);
+}
+
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
 static int update_pmu_ops(struct imc_pmu *pmu)
 {
@@ -1189,6 +1241,17 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+static void cleanup_all_trace_imc_memory(void)
+{
+   int i, order = get_order(trace_imc_mem_size);
+
+   for_each_online_cpu(i) {
+   if (per_cpu(trace_imc_mem, i))
+   free_pages((u64)per_cpu(trace_imc_mem, i), order);
+
+   }
+}
+
 /* Function to free the attr_groups which are dynamically allocated */
 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
 {
@@ -1230,6 +1293,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
+
+   if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
+   cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
+   cleanup_all_trace_imc_memory();
+   }
 }
 
 /*
@@ -1312,6 +1380,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
 
thread_imc_pmu = pmu_ptr;
break;
+   case IMC_DOMAIN_TRACE:
+   /* Update the pmu name */
+   pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
+   if (!pmu_ptr->pmu.name)
+   return -ENOMEM;
+
+   trace_imc_mem_size = pmu_ptr->counter_mem_size;
+   for_each_online_cpu(cpu) {
+   res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+   if (res) {
+   cleanup_all_trace_imc_memory();
+   goto err;
+   }
+   }
+   break;
default:
return -EINVAL;
}
@@ -1384,6 +1467,14 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
goto err_free_mem;
}
 
+   break;
+   case IMC_DOMAIN_TRACE:
+   ret = trace_imc_cpu_init();
+   if (ret) {
+   cleanup_all_trace_imc_memory();
+   goto err_fre

[PATCH v2 3/5] powerpc/perf: Add privileged access check for thread_imc

2018-12-14 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Add code to restrict user access to the thread_imc pmu, since
some events report privilege level information.

Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support')
Signed-off-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 3bef46f8417d..5ca80545a849 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -877,6 +877,9 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
/* Sampling not supported */
if (event->hw.sample_period)
return -EINVAL;
-- 
2.17.1



[PATCH v2 5/5] powerpc/perf: Trace imc PMU functions

2018-12-14 Thread Anju T Sudhakar
Add PMU functions to support trace-imc.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 175 
 1 file changed, 175 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 1f09265c8fb0..32ff0e449fca 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1120,6 +1120,173 @@ static int trace_imc_cpu_init(void)
  ppc_trace_imc_cpu_offline);
 }
 
+static u64 get_trace_imc_event_base_addr(void)
+{
+   return (u64)per_cpu(trace_imc_mem, smp_processor_id());
+}
+
+/*
+ * Function to parse trace-imc data obtained
+ * and to prepare the perf sample.
+ */
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
+   struct perf_sample_data *data,
+   u64 *prev_tb,
+   struct perf_event_header *header,
+   struct perf_event *event)
+{
+   /* Sanity checks for a valid record */
+   if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
+   *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
+   else
+   return -EINVAL;
+
+   if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
+be64_to_cpu(READ_ONCE(mem->tb2)))
+   return -EINVAL;
+
+   /* Prepare perf sample */
+   data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
+   data->period = event->hw.last_period;
+
+   header->type = PERF_RECORD_SAMPLE;
+   header->size = sizeof(*header) + event->header_size;
+   header->misc = 0;
+
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+
+   perf_event_header__init_id(header, data, event);
+
+   return 0;
+}
+
+static void dump_trace_imc_data(struct perf_event *event)
+{
+   struct trace_imc_data *mem;
+   int i, ret;
+   u64 prev_tb = 0;
+
+   mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
+   for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
+   i++, mem++) {
+   struct perf_sample_data data;
+   struct perf_event_header header;
+
+   ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, 
event);
+   if (ret) /* Exit, if not a valid record */
+   break;
+   else {
+   /* If this is a valid record, create the sample */
+   struct perf_output_handle handle;
+
+   if (perf_output_begin(&handle, event, header.size))
+   return;
+
+   perf_output_sample(&handle, &header, &data, event);
+   perf_output_end(&handle);
+   }
+   }
+}
+
+static int trace_imc_event_add(struct perf_event *event, int flags)
+{
+   /* Enable the sched_task to start the engine */
+   perf_sched_cb_inc(event->ctx->pmu);
+   return 0;
+}
+
+static void trace_imc_event_read(struct perf_event *event)
+{
+   dump_trace_imc_data(event);
+}
+
+static void trace_imc_event_stop(struct perf_event *event, int flags)
+{
+   trace_imc_event_read(event);
+}
+
+static void trace_imc_event_start(struct perf_event *event, int flags)
+{
+   return;
+}
+
+static void trace_imc_event_del(struct perf_event *event, int flags)
+{
+   perf_sched_cb_dec(event->ctx->pmu);
+}
+
+void trace_imc_pmu_sched_task(struct perf_event_context *ctx,
+   bool sched_in)
+{
+   int core_id = smp_processor_id() / threads_per_core;
+   struct imc_pmu_ref *ref;
+   u64 local_mem, ldbar_value;
+
+   /* Set trace-imc bit in ldbar and load ldbar with per-thread memory 
address */
+   local_mem = get_trace_imc_event_base_addr();
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
TRACE_IMC_ENABLE;
+
+   ref = &core_imc_refc[core_id];
+   if (!ref)
+   return;
+
+   if (sched_in) {
+   mtspr(SPRN_LDBAR, ldbar_value);
+   mutex_lock(&ref->lock);
+   if (ref->refc == 0) {
+   if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
+   
get_hard_smp_processor_id(smp_processor_id( {
+   mutex_unlock(&ref->lock);
+   pr_err("trace-imc: Unable to start the counters 
for core %d\n", core_id);
+   mtspr(SPRN_LDBAR, 0);
+   return;
+   }
+   }
+   ++ref->refc;
+   mutex_unlock(&ref->lock);
+   } else

[PATCH v2 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc

2018-12-14 Thread Anju T Sudhakar
LDBAR holds the memory address allocated for each cpu. For thread-imc
the mode bit (i.e bit 1) of LDBAR is set to accumulation.
Currently, ldbar is loaded with per cpu memory address and mode set to
accumulation at boot time.

To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to
accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc
to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del().

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index f292a3f284f1..3bef46f8417d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -806,8 +806,11 @@ static int core_imc_event_init(struct perf_event *event)
 }
 
 /*
- * Allocates a page of memory for each of the online cpus, and write the
- * physical base address of that page to the LDBAR for that cpu.
+ * Allocates a page of memory for each of the online cpus, and load
+ * LDBAR with 0.
+ * The physical base address of the page allocated for a cpu will be
+ * written to the LDBAR for that cpu, when the thread-imc event
+ * is added.
  *
  * LDBAR Register Layout:
  *
@@ -825,7 +828,7 @@ static int core_imc_event_init(struct perf_event *event)
  */
 static int thread_imc_mem_alloc(int cpu_id, int size)
 {
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
+   u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
int nid = cpu_to_node(cpu_id);
 
if (!local_mem) {
@@ -842,9 +845,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}
 
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-
-   mtspr(SPRN_LDBAR, ldbar_value);
+   mtspr(SPRN_LDBAR, 0);
return 0;
 }
 
@@ -995,6 +996,7 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
 {
int core_id;
struct imc_pmu_ref *ref;
+   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());
 
if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -1003,6 +1005,9 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
return -EINVAL;
 
core_id = smp_processor_id() / threads_per_core;
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+
/*
 * imc pmus are enabled only when it is used.
 * See if this is triggered for the first time.
@@ -1034,11 +1039,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;
struct imc_pmu_ref *ref;
 
-   /*
-* Take a snapshot and calculate the delta and update
-* the event counter values.
-*/
-   imc_event_update(event);
+   mtspr(SPRN_LDBAR, 0);
 
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
@@ -1057,6 +1058,11 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+   /*
+* Take a snapshot and calculate the delta and update
+* the event counter values.
+*/
+   imc_event_update(event);
 }
 
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
-- 
2.17.1



[PATCH] powerpc/imc: Dont create debugfs files for cpu-less nodes

2019-06-28 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Commit <684d984038aa> ('powerpc/powernv: Add debugfs interface for imc-mode
and imc') added debugfs interface for the nest imc pmu devices to support
changing of different ucode modes. Primarily adding this capability for
debug. But when doing so, the code did not consider the case of cpu-less
nodes. So reading the _cmd_ or _mode_ file of a cpu-less node
will trigger this crash.

[ 1139.415461][ T5301] Faulting instruction address: 0xc00d0d58
[ 1139.415492][ T5301] Oops: Kernel access of bad area, sig: 11 [#1]
[ 1139.415509][ T5301] LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=256
DEBUG_PAGEALLOC NUMA PowerNV
[ 1139.415542][ T5301] Modules linked in: i2c_opal i2c_core ip_tables x_tables
xfs sd_mod bnx2x mdio ahci libahci tg3 libphy libata firmware_class dm_mirror
dm_region_hash dm_log dm_mod
[ 1139.415595][ T5301] CPU: 67 PID: 5301 Comm: cat Not tainted 5.2.0-rc6-next-
20190627+ #19
[ 1139.415634][ T5301] NIP:  c00d0d58 LR: c049aa18 
CTR:c00d0d50
[ 1139.415675][ T5301] REGS: c00020194548f9e0 TRAP: 0300   Not tainted  
(5.2.0-rc6-next-20190627+)
[ 1139.415705][ T5301] MSR:  90009033   
CR:28022822  XER: 
[ 1139.415777][ T5301] CFAR: c049aa14 DAR: 0003fc08 
DSISR:4000 IRQMASK: 0
[ 1139.415777][ T5301] GPR00: c049aa18 c00020194548fc70 
c16f8b03fc08
[ 1139.415777][ T5301] GPR04: c00020194548fcd0  
14884e7300011eaa
[ 1139.415777][ T5301] GPR08: 7eea5a52 c00d0d50 

[ 1139.415777][ T5301] GPR12: c00d0d50 c000201fff7f8c00 

[ 1139.415777][ T5301] GPR16: 000d 7fffeb0c3368 

[ 1139.415777][ T5301] GPR20:   
0002
[ 1139.415777][ T5301] GPR24:   
000200010ec9
[ 1139.415777][ T5301] GPR28: c00020194548fdf0 c00020049a584ef8 
c00020049a584ea8
[ 1139.416116][ T5301] NIP [c00d0d58] imc_mem_get+0x8/0x20
[ 1139.416143][ T5301] LR [c049aa18] simple_attr_read+0x118/0x170
[ 1139.416158][ T5301] Call Trace:
[ 1139.416182][ T5301] [c00020194548fc70] 
[c049a970]simple_attr_read+0x70/0x170 (unreliable)
[ 1139.416255][ T5301] [c00020194548fd10] 
[c054385c]debugfs_attr_read+0x6c/0xb0
[ 1139.416305][ T5301] [c00020194548fd60] [c0454c1c]__vfs_read+0x3c/0x70
[ 1139.416363][ T5301] [c00020194548fd80] [c0454d0c] vfs_read+0xbc/0x1a0
[ 1139.416392][ T5301] [c00020194548fdd0] [c045519c]ksys_read+0x7c/0x140
[ 1139.416434][ T5301] [c00020194548fe20] 
[c000b108]system_call+0x5c/0x70
[ 1139.416473][ T5301] Instruction dump:
[ 1139.416511][ T5301] 4e800020 6000 7c0802a6 6000 7c801d28 3860 
4e800020 6000
[ 1139.416572][ T5301] 6000 6000 7c0802a6 6000 <7d201c28> 3860 
f924 4e800020
[ 1139.416636][ T5301] ---[ end trace c44d1fb4ace04784 ]---
[ 1139.520686][ T5301]
[ 1140.520820][ T5301] Kernel panic - not syncing: Fatal exception

Patch adds a check to avoid creation of these files to cpu-less nodes.

Fixes: 684d984038aa ('powerpc/powernv: Add debugfs interface for imc-mode and 
imc')
Reported-by: Qian Cai 
Signed-off-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 186109bdd41b..12c8964a2f9c 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -56,6 +57,7 @@ static void export_imc_mode_and_cmd(struct device_node *node,
int chip = 0, nid;
char mode[16], cmd[16];
u32 cb_offset;
+   const struct cpumask *l_cpumask;
 
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
 
@@ -70,6 +72,14 @@ static void export_imc_mode_and_cmd(struct device_node *node,
cb_offset = IMC_CNTL_BLK_OFFSET;
 
for_each_node(nid) {
+   /*
+* Since these are related to nest pmu,
+* create only if the node has any cpu in it.
+*/
+   l_cpumask = cpumask_of_node(nid);
+   if (cpumask_empty(l_cpumask))
+   continue;
+
loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset;
imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
sprintf(mode, "imc_mode_%d", nid);
-- 
2.20.1



Re: power9 NUMA crash while reading debugfs imc_cmd

2019-06-28 Thread Anju T Sudhakar



On 6/28/19 9:04 AM, Qian Cai wrote:



On Jun 27, 2019, at 11:12 PM, Michael Ellerman  wrote:

Qian Cai  writes:

Read of debugfs imc_cmd file for a memory-less node will trigger a crash below
on this power9 machine which has the following NUMA layout.

What type of machine is it?

description: PowerNV
product: 8335-GTH (ibm,witherspoon)
vendor: IBM
width: 64 bits
capabilities: smp powernv opal



Hi Qian Cai,

Could you please try with this patch: 
https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-June/192803.html


and see if the issue is resolved?


Thanks,

Anju




[PATCH v2 1/3] tools/perf: Move kvm-stat header file from conditional inclusion to common include section

2019-07-18 Thread Anju T Sudhakar
Move the kvm-stat header file to the common include section, and place the
definitions in the header file under the conditional inclusion
`#ifdef HAVE_KVM_STAT_SUPPORT`.

This helps to define other perf kvm related function prototypes in
kvm-stat header file, which may not need kvm-stat support.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/builtin-kvm.c   | 2 +-
 tools/perf/util/kvm-stat.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index b33c83489120..5d2b34d290a3 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -19,6 +19,7 @@
 #include "util/top.h"
 #include "util/data.h"
 #include "util/ordered-events.h"
+#include "util/kvm-stat.h"
 
 #include 
 #ifdef HAVE_TIMERFD_SUPPORT
@@ -55,7 +56,6 @@ static const char *get_filename_for_perf_kvm(void)
 }
 
 #ifdef HAVE_KVM_STAT_SUPPORT
-#include "util/kvm-stat.h"
 
 void exit_event_get_key(struct perf_evsel *evsel,
struct perf_sample *sample,
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 1403dec189b4..b3b2670e1a2b 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -2,6 +2,8 @@
 #ifndef __PERF_KVM_STAT_H
 #define __PERF_KVM_STAT_H
 
+#ifdef HAVE_KVM_STAT_SUPPORT
+
 #include "../perf.h"
 #include "tool.h"
 #include "stat.h"
@@ -144,5 +146,6 @@ extern const int decode_str_len;
 extern const char *kvm_exit_reason;
 extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
+#endif /* HAVE_KVM_STAT_SUPPORT */
 
 #endif /* __PERF_KVM_STAT_H */
-- 
2.20.1



[PATCH v2 2/3] tools/perf: Add arch neutral function to choose event for perf kvm record

2019-07-18 Thread Anju T Sudhakar
'perf kvm record' uses 'cycles'(if the user did not specify any event) as
the default event to profile the guest.
This will not provide any proper samples from the guest in the case of the
powerpc architecture, since in powerpc the PMUs are controlled by
the guest rather than the host.

Patch adds a function to pick an arch specific event for 'perf kvm record',
instead of selecting 'cycles' as a default event for all architectures.

For powerpc this function checks for any user specified event, and if there
isn't any it returns invalid instead of proceeding with 'cycles' event.

Signed-off-by: Anju T Sudhakar 
---

Changes from v1->v2
* Cross-build issue for aarch64, reported by Ravi is fixed.
---

 tools/perf/arch/powerpc/util/kvm-stat.c | 37 +
 tools/perf/builtin-kvm.c| 12 +++-
 tools/perf/util/kvm-stat.h  |  1 +
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c 
b/tools/perf/arch/powerpc/util/kvm-stat.c
index f9db341c47b6..c55e7405940e 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -8,6 +8,7 @@
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
+#include 
 
 #define NR_TPS 4
 
@@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char 
*cpuid __maybe_unused)
 
return ret;
 }
+
+/*
+ * Incase of powerpc architecture, pmu registers are programmable
+ * by guest kernel. So monitoring guest via host may not provide
+ * valid samples. It is better to fail the "perf kvm record"
+ * with default "cycles" event to monitor guest in powerpc.
+ *
+ * Function to parse the arguments and return appropriate values.
+ */
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+   const char **tmp;
+   bool event = false;
+   int i, j = *argc;
+
+   const struct option event_options[] = {
+   OPT_BOOLEAN('e', "event", &event, NULL),
+   OPT_END()
+   };
+
+   tmp = calloc(j + 1, sizeof(char *));
+   if (!tmp)
+   return -EINVAL;
+
+   for (i = 0; i < j; i++)
+   tmp[i] = argv[i];
+
+   parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
+   if (!event) {
+   free(tmp);
+   return -EINVAL;
+   }
+
+   free(tmp);
+   return 0;
+}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 5d2b34d290a3..d03750da051b 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int 
argc, const char **argv)
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
+   const char **argv __maybe_unused)
+{
+   return 0;
+}
+
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
-   int rec_argc, i = 0, j;
+   int rec_argc, i = 0, j, ret;
const char **rec_argv;
 
+   ret = kvm_add_default_arch_event(&argc, argv);
+   if (ret)
+   return -EINVAL;
+
rec_argc = argc + 2;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
rec_argv[i++] = strdup("record");
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index b3b2670e1a2b..81a5bf4fbc71 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -148,4 +148,5 @@ extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
-- 
2.20.1



[PATCH v2 3/3] tools/perf: Set 'trace_cycles' as defaultevent for perf kvm record in powerpc

2019-07-18 Thread Anju T Sudhakar
Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record'
in powerpc.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c 
b/tools/perf/arch/powerpc/util/kvm-stat.c
index c55e7405940e..0a06626fb18a 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char 
*cpuid __maybe_unused)
 /*
  * Incase of powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
- * valid samples. It is better to fail the "perf kvm record"
- * with default "cycles" event to monitor guest in powerpc.
+ * valid samples with default 'cycles' event. It is better to use
+ * 'trace_imc/trace_cycles' event for guest profiling, since it
+ * can track the guest instruction pointer in the trace-record.
  *
  * Function to parse the arguments and return appropriate values.
  */
@@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char 
**argv)
 
parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
if (!event) {
-   free(tmp);
-   return -EINVAL;
+   if (pmu_have_event("trace_imc", "trace_cycles")) {
+   argv[j++] = strdup("-e");
+   argv[j++] = strdup("trace_imc/trace_cycles/");
+   *argc += 2;
+   } else {
+   free(tmp);
+   return -EINVAL;
+   }
}
 
free(tmp);
-- 
2.20.1



Re: [PATCH v2] powerpc/imc: Dont create debugfs files for cpu-less nodes

2019-07-23 Thread Anju T Sudhakar

Hi Qian,

On 7/16/19 12:11 AM, Qian Cai wrote:

On Thu, 2019-07-11 at 14:53 +1000, Michael Ellerman wrote:

Hi Maddy,

Madhavan Srinivasan  writes:

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c
b/arch/powerpc/platforms/powernv/opal-imc.c
index 186109bdd41b..e04b20625cb9 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node
*node,
    if (of_property_read_u32(node, "cb_offset", &cb_offset))
    cb_offset = IMC_CNTL_BLK_OFFSET;
  
-	for_each_node(nid) {

-   loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset;
+   while (ptr->vbase != NULL) {

This means you'll bail out as soon as you find a node with no vbase, but
it's possible we could have a CPU-less node intermingled with other
nodes.

So I think you want to keep the for loop, but continue if you see a NULL
vbase?

Not sure if this will also takes care of some of those messages during the boot
on today's linux-next even without this patch.


[   18.077780][T1] debugfs: Directory 'imc' with parent 'powerpc' already
present!




This is introduced by a recent commit: c33d442328f55 (debugfs: make 
error message a bit more verbose).


So basically, the debugfs imc_* file is created per node, and is created 
by the first nest unit which is


being registered. For the subsequent nest units, debugfs_create_dir() 
will just return since the imc_* file already


exist.

The commit "c33d442328f55 (debugfs: make error message a bit more 
verbose)", prints


a message if the debugfs file already exists in debugfs_create_dir(). 
That is why we are encountering these


messages now.


This patch (i.e, powerpc/imc: Dont create debugfs files for cpu-less 
nodes) will address the initial issue, i.e


"numa crash while reading imc_* debugfs files for cpu less nodes", and 
will not address these debugfs messages.



But yeah this is a good catch. We can have some checks to avoid these 
debugfs messages.



Hi Michael,

Do we need to have a separate patch to address these debugfs messages, 
or can we address the same


in the next version of this patch itself?


Thanks,

Anju






Re: [RFC PATCH 3/4] powerpc/perf: fix imc allocation failure

2019-07-23 Thread Anju T Sudhakar



On 7/22/19 11:16 PM, Nicholas Piggin wrote:

alloc_pages_node return value should be tested before applying
page_address.

Cc: Anju T Sudhakar 
Cc: Madhavan Srinivasan 
Signed-off-by: Nicholas Piggin 
---


Tested-by: Anju T Sudhakar 



[PATCH v2] powerpc/perf: Fix loop exit condition in nest_imc_event_init

2018-12-17 Thread Anju T Sudhakar
The data structure (i.e struct imc_mem_info) to hold the memory address
information for nest imc units is allocated based on the number of nodes
in the system.

nest_imc_event_init() traverse this struct array to calculate the memory
base address for the event-cpu. If we fail to find a match for the event
cpu's chip-id in imc_mem_info struct array, then the do-while loop will
iterate until we crash.

Fix this by changing the loop exit condition based on the number of
non-zero vbase elements in the array, since the allocation is done for
nr_chips + 1.

Reported-by: Dan Carpenter  
Fixes: 885dcd709ba91 ( powerpc/perf: Add nest IMC PMU support)
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c   | 2 +-
 arch/powerpc/platforms/powernv/opal-imc.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 4f34c75..d1009fe 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -508,7 +508,7 @@ static int nest_imc_event_init(struct perf_event *event)
break;
}
pcni++;
-   } while (pcni);
+   } while (pcni->vbase != 0);
 
if (!flag)
return -ENODEV;
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 58a0794..3d27f02 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -127,7 +127,7 @@ static int imc_get_mem_addr_nest(struct device_node *node,
nr_chips))
goto error;
 
-   pmu_ptr->mem_info = kcalloc(nr_chips, sizeof(*pmu_ptr->mem_info),
+   pmu_ptr->mem_info = kcalloc(nr_chips + 1, sizeof(*pmu_ptr->mem_info),
GFP_KERNEL);
if (!pmu_ptr->mem_info)
goto error;
-- 
1.8.3.1



[PATCH v3 1/5] powerpc/include: Add data structures and macros for IMC trace mode

2019-02-06 Thread Anju T Sudhakar
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode
and data structure to hold the trace-imc record data.
Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since
there is a new switch case added in the opal-calls for IMC.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h  | 39 +
 arch/powerpc/include/asm/opal-api.h |  1 +
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 69f516ecb2fd..7c2ef0e42661 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -33,6 +33,7 @@
  */
 #define THREAD_IMC_LDBAR_MASK   0x0003e000ULL
 #define THREAD_IMC_ENABLE   0x8000ULL
+#define TRACE_IMC_ENABLE   0x4000ULL
 
 /*
  * For debugfs interface for imc-mode and imc-command
@@ -59,6 +60,34 @@ struct imc_events {
char *scale;
 };
 
+/*
+ * Trace IMC hardware updates a 64bytes record on
+ * Core Performance Monitoring Counter (CPMC)
+ * overflow. Here is the layout for the trace imc record
+ *
+ * DW 0 : Timebase
+ * DW 1 : Program Counter
+ * DW 2 : PIDR information
+ * DW 3 : CPMC1
+ * DW 4 : CPMC2
+ * DW 5 : CPMC3
+ * Dw 6 : CPMC4
+ * DW 7 : Timebase
+ * .
+ *
+ * The following is the data structure to hold trace imc data.
+ */
+struct trace_imc_data {
+   u64 tb1;
+   u64 ip;
+   u64 val;
+   u64 cpmc1;
+   u64 cpmc2;
+   u64 cpmc3;
+   u64 cpmc4;
+   u64 tb2;
+};
+
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
@@ -68,6 +97,13 @@ struct imc_events {
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xULL
 
+/*
+ * Macro to mask bits 0:21 of first double word(which is the timebase) to
+ * compare with 8th double word (timebase) of trace imc record data.
+ */
+#define IMC_TRACE_RECORD_TB1_MASK  0x3ffULL
+
+
 /*
  * Device tree parser code detects IMC pmu support and
  * registers new IMC pmus. This structure will hold the
@@ -113,6 +149,7 @@ struct imc_pmu_ref {
 
 enum {
IMC_TYPE_THREAD = 0x1,
+   IMC_TYPE_TRACE  = 0x2,
IMC_TYPE_CORE   = 0x4,
IMC_TYPE_CHIP   = 0x10,
 };
@@ -123,6 +160,8 @@ enum {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_THREAD  3
+/* For trace-imc the domain is still thread but it operates in trace-mode */
+#define IMC_DOMAIN_TRACE   4
 
 extern int init_imc_pmu(struct device_node *parent,
struct imc_pmu *pmu_ptr, int pmu_id);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b239ea..a4130b21b159 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1118,6 +1118,7 @@ enum {
 enum {
OPAL_IMC_COUNTERS_NEST = 1,
OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_TRACE = 3,
 };
 
 
-- 
2.17.1



[PATCH v3 0/5] powerpc/perf: IMC trace-mode support

2019-02-06 Thread Anju T Sudhakar
ists.ozlabs.org/pipermail/skiboot/2018-December/012883.html

* Set LDBAR spr to enable imc-trace mode.

LDBAR Layout:

0 : Enable/Disable
1 : 0 -> Accumulation Mode
1 -> Trace Mode
2:3   : Reserved
4-6   : PB scope
7 : Reserved
8:50  : Counter Address
51:63 : Reserved

   

The key benefit of imc trace-mode is that each sample record contains the
instruction address pointer along with other information, so we can profile
the IP without interrupting the application.

Performance data using 'perf top' with and without trace-imc event:

When the application is monitored with trace-imc event, we dont take any   
PMI interrupts.

PMI interrupt count when the `perf top` command is executed without a trace-imc event.

# perf top  
12.53%  [kernel]   [k] arch_cpu_idle   
11.32%  [kernel]   [k] rcu_idle_enter  
10.76%  [kernel]   [k] __next_timer_interrupt  
 9.49%  [kernel]   [k] find_next_bit   
 8.06%  [kernel]   [k] rcu_dynticks_eqs_exit   
 7.82%  [kernel]   [k] do_idle 
 5.71%  [kernel]   [k] tick_nohz_idle_stop_tic 
 [---]  
# cat /proc/interrupts  (a snippet from the output)
9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803803804804804 
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   


`perf top` with trace-imc (right after 'perf top' without trace-imc event):

# perf top -e trace_imc/trace_cycles/  
12.50%  [kernel]  [k] arch_cpu_idle
11.81%  [kernel]  [k] __next_timer_interrupt   
11.22%  [kernel]  [k] rcu_idle_enter   
10.25%  [kernel]  [k] find_next_bit
 7.91%  [kernel]  [k] do_idle  
 7.69%  [kernel]  [k] rcu_dynticks_eqs_exit
 5.20%  [kernel]  [k] tick_nohz_idle_stop_tick 
 [---]  

# cat /proc/interrupts (a snippet from the output) 

9944  1072804804   1644804   1306  
804804804804804804804  
804804   1961   1602804804   1258  
[-]
803803803803803803803  
803803803804804804804
804804804804804804803 
803803803803803   1306803 
803   Performance monitoring interrupts   

The PMI interrupts count remains the same.  

Changelog:

>From v2 -> v3
--

* Redefined the event format for trace-imc.

Suggestions/comments are welcome.


Anju T Sudhakar (4):
  powerpc/include: Add data structures and macros for IMC trace mode
  powerpc/perf: Rearrange setting of ldbar for thread-imc
  powerpc/perf: Trace imc events detection and cpuhotplug
  powerpc/perf: Trace imc PMU functions

Madhavan Srinivasan (1):
  powerpc/perf: Add privileged access check for thread_imc

[PATCH v3 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc

2019-02-06 Thread Anju T Sudhakar
LDBAR holds the memory address allocated for each cpu. For thread-imc
the mode bit (i.e bit 1) of LDBAR is set to accumulation.
Currently, ldbar is loaded with per cpu memory address and mode set to
accumulation at boot time.

To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to
accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc
to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del().

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index f292a3f284f1..3bef46f8417d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -806,8 +806,11 @@ static int core_imc_event_init(struct perf_event *event)
 }
 
 /*
- * Allocates a page of memory for each of the online cpus, and write the
- * physical base address of that page to the LDBAR for that cpu.
+ * Allocates a page of memory for each of the online cpus, and load
+ * LDBAR with 0.
+ * The physical base address of the page allocated for a cpu will be
+ * written to the LDBAR for that cpu, when the thread-imc event
+ * is added.
  *
  * LDBAR Register Layout:
  *
@@ -825,7 +828,7 @@ static int core_imc_event_init(struct perf_event *event)
  */
 static int thread_imc_mem_alloc(int cpu_id, int size)
 {
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
+   u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
int nid = cpu_to_node(cpu_id);
 
if (!local_mem) {
@@ -842,9 +845,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}
 
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-
-   mtspr(SPRN_LDBAR, ldbar_value);
+   mtspr(SPRN_LDBAR, 0);
return 0;
 }
 
@@ -995,6 +996,7 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
 {
int core_id;
struct imc_pmu_ref *ref;
+   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());
 
if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -1003,6 +1005,9 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
return -EINVAL;
 
core_id = smp_processor_id() / threads_per_core;
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+
/*
 * imc pmus are enabled only when it is used.
 * See if this is triggered for the first time.
@@ -1034,11 +1039,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;
struct imc_pmu_ref *ref;
 
-   /*
-* Take a snapshot and calculate the delta and update
-* the event counter values.
-*/
-   imc_event_update(event);
+   mtspr(SPRN_LDBAR, 0);
 
core_id = smp_processor_id() / threads_per_core;
ref = &core_imc_refc[core_id];
@@ -1057,6 +1058,11 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+   /*
+* Take a snapshot and calculate the delta and update
+* the event counter values.
+*/
+   imc_event_update(event);
 }
 
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
-- 
2.17.1



[PATCH v3 3/5] powerpc/perf: Add privileged access check for thread_imc

2019-02-06 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Add code to restrict user access to thread_imc pmu since
some event report privilege level information.

Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support')
Signed-off-by: Madhavan Srinivasan 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 3bef46f8417d..5ca80545a849 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -877,6 +877,9 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
/* Sampling not supported */
if (event->hw.sample_period)
return -EINVAL;
-- 
2.17.1



[PATCH v3 4/5] powerpc/perf: Trace imc events detection and cpuhotplug

2019-02-06 Thread Anju T Sudhakar
Patch detects trace-imc events, does memory initializations for each online
cpu, and registers cpuhotplug call-backs.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c   | 91 +++
 arch/powerpc/platforms/powernv/opal-imc.c |  3 +
 include/linux/cpuhotplug.h|  1 +
 3 files changed, 95 insertions(+)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 5ca80545a849..1f09265c8fb0 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -43,6 +43,10 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem);
 static struct imc_pmu *thread_imc_pmu;
 static int thread_imc_mem_size;
 
+/* Trace IMC data structures */
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
+static int trace_imc_mem_size;
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1068,6 +1072,54 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
imc_event_update(event);
 }
 
+/*
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
+ */
+static int trace_imc_mem_alloc(int cpu_id, int size)
+{
+   u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
+   int phys_id = cpu_to_node(cpu_id), rc = 0;
+
+   if (!local_mem) {
+   local_mem = page_address(alloc_pages_node(phys_id,
+   GFP_KERNEL | __GFP_ZERO | 
__GFP_THISNODE |
+   __GFP_NOWARN, get_order(size)));
+   if (!local_mem)
+   return -ENOMEM;
+   per_cpu(trace_imc_mem, cpu_id) = local_mem;
+
+   /* Initialise the counters for trace mode */
+   rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void 
*)local_mem),
+   get_hard_smp_processor_id(cpu_id));
+   if (rc) {
+   pr_info("IMC:opal init failed for trace imc\n");
+   return rc;
+   }
+   }
+
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
+{
+   return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+}
+
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+}
+
+static int trace_imc_cpu_init(void)
+{
+   return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ "perf/powerpc/imc_trace:online",
+ ppc_trace_imc_cpu_online,
+ ppc_trace_imc_cpu_offline);
+}
+
 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
 static int update_pmu_ops(struct imc_pmu *pmu)
 {
@@ -1189,6 +1241,17 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+static void cleanup_all_trace_imc_memory(void)
+{
+   int i, order = get_order(trace_imc_mem_size);
+
+   for_each_online_cpu(i) {
+   if (per_cpu(trace_imc_mem, i))
+   free_pages((u64)per_cpu(trace_imc_mem, i), order);
+
+   }
+}
+
 /* Function to free the attr_groups which are dynamically allocated */
 static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
 {
@@ -1230,6 +1293,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
cleanup_all_thread_imc_memory();
}
+
+   if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
+   cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
+   cleanup_all_trace_imc_memory();
+   }
 }
 
 /*
@@ -1312,6 +1380,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct 
device_node *parent,
 
thread_imc_pmu = pmu_ptr;
break;
+   case IMC_DOMAIN_TRACE:
+   /* Update the pmu name */
+   pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
+   if (!pmu_ptr->pmu.name)
+   return -ENOMEM;
+
+   trace_imc_mem_size = pmu_ptr->counter_mem_size;
+   for_each_online_cpu(cpu) {
+   res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
+   if (res) {
+   cleanup_all_trace_imc_memory();
+   goto err;
+   }
+   }
+   break;
default:
return -EINVAL;
}
@@ -1384,6 +1467,14 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
goto err_free_mem;
}
 
+   break;
+   case IMC_DOMAIN_TRACE:
+   ret = trace_imc_cpu_init();
+   if (ret) {
+   cleanup_

[PATCH v3 5/5] powerpc/perf: Trace imc PMU functions

2019-02-06 Thread Anju T Sudhakar
Add PMU functions to support trace-imc and define the format for
trace-imc events.

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 197 +++-
 1 file changed, 196 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 1f09265c8fb0..0f1a30f11f6a 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -52,7 +52,7 @@ static struct imc_pmu *imc_event_to_pmu(struct perf_event 
*event)
return container_of(event->pmu, struct imc_pmu, pmu);
 }
 
-PMU_FORMAT_ATTR(event, "config:0-40");
+PMU_FORMAT_ATTR(event, "config:0-61");
 PMU_FORMAT_ATTR(offset, "config:0-31");
 PMU_FORMAT_ATTR(rvalue, "config:32");
 PMU_FORMAT_ATTR(mode, "config:33-40");
@@ -69,6 +69,25 @@ static struct attribute_group imc_format_group = {
.attrs = imc_format_attrs,
 };
 
+/* Format attribute for imc trace-mode */
+PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19");
+PMU_FORMAT_ATTR(cpmc_event, "config:20-27");
+PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29");
+PMU_FORMAT_ATTR(cpmc_load, "config:30-61");
+static struct attribute *trace_imc_format_attrs[] = {
+   &format_attr_event.attr,
+   &format_attr_cpmc_reserved.attr,
+   &format_attr_cpmc_event.attr,
+   &format_attr_cpmc_samplesel.attr,
+   &format_attr_cpmc_load.attr,
+   NULL,
+};
+
+static struct attribute_group trace_imc_format_group = {
+   .name = "format",
+   .attrs = trace_imc_format_attrs,
+};
+
 /* Get the cpumask printed to a buffer "buf" */
 static ssize_t imc_pmu_cpumask_get_attr(struct device *dev,
struct device_attribute *attr,
@@ -1120,6 +1139,173 @@ static int trace_imc_cpu_init(void)
  ppc_trace_imc_cpu_offline);
 }
 
+static u64 get_trace_imc_event_base_addr(void)
+{
+   return (u64)per_cpu(trace_imc_mem, smp_processor_id());
+}
+
+/*
+ * Function to parse trace-imc data obtained
+ * and to prepare the perf sample.
+ */
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
+   struct perf_sample_data *data,
+   u64 *prev_tb,
+   struct perf_event_header *header,
+   struct perf_event *event)
+{
+   /* Sanity checks for a valid record */
+   if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
+   *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
+   else
+   return -EINVAL;
+
+   if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
+be64_to_cpu(READ_ONCE(mem->tb2)))
+   return -EINVAL;
+
+   /* Prepare perf sample */
+   data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
+   data->period = event->hw.last_period;
+
+   header->type = PERF_RECORD_SAMPLE;
+   header->size = sizeof(*header) + event->header_size;
+   header->misc = 0;
+
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+
+   perf_event_header__init_id(header, data, event);
+
+   return 0;
+}
+
+static void dump_trace_imc_data(struct perf_event *event)
+{
+   struct trace_imc_data *mem;
+   int i, ret;
+   u64 prev_tb = 0;
+
+   mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
+   for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
+   i++, mem++) {
+   struct perf_sample_data data;
+   struct perf_event_header header;
+
+   ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, 
event);
+   if (ret) /* Exit, if not a valid record */
+   break;
+   else {
+   /* If this is a valid record, create the sample */
+   struct perf_output_handle handle;
+
+   if (perf_output_begin(&handle, event, header.size))
+   return;
+
+   perf_output_sample(&handle, &header, &data, event);
+   perf_output_end(&handle);
+   }
+   }
+}
+
+static int trace_imc_event_add(struct perf_event *event, int flags)
+{
+   /* Enable the sched_task to start the engine */
+   perf_sched_cb_inc(event->ctx->pmu);
+   return 0;
+}
+
+static void trace_imc_event_read(struct perf_event *event)
+{
+   dump_trace_imc_data(event);
+}
+
+static void trace_imc_event_stop(struct perf_event *event, int flags)
+{
+   trace_imc_event_read(event);
+}
+
+static void trace_imc_event_start(struct perf_e

[PATCH 1/2] powerpc/powernv: Re-enable imc trace-mode in kernel

2020-01-20 Thread Anju T Sudhakar
commit <249fad734a25> "powerpc/perf: Disable trace_imc pmu"
disables IMC(In-Memory Collection) trace-mode in kernel, since frequent
mode switching between accumulation mode and trace mode via the spr LDBAR
in the hardware can trigger a checkstop(system crash).

Patch to re-enable imc-trace mode in kernel.

The following patch in this series will address the mode switching issue
by implementing a global lock, and will restrict the usage of
accumulation and trace-mode at a time.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 000b350d4060..3b4518f4b643 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -278,14 +278,7 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
-   /*
-* FIXME. Using trace_imc events to monitor application
-* or KVM thread performance can cause a checkstop
-* (system crash).
-* Disable it for now.
-*/
-   pr_info_once("IMC: disabling trace_imc PMU\n");
-   domain = -1;
+   domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
-- 
2.20.1



[PATCH 2/2] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.

2020-01-20 Thread Anju T Sudhakar
IMC(In-memory Collection Counters) does performance monitoring in
two different modes, i.e accumulation mode(core-imc and thread-imc events),
and trace mode(trace-imc events). A cpu thread can either be in
accumulation-mode or trace-mode at a time and this is done via the LDBAR
register in POWER architecture. The current design does not address the
races between thread-imc and trace-imc events.

Patch implements a global id and lock to avoid the races between
core, trace and thread imc events. With this global id-lock
implementation, the system can either run core, thread or trace imc
events at a time. i.e. to run any core-imc events, thread/trace imc events
should not be enabled/monitored.
 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 177 +++-
 1 file changed, 153 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..2e220f199530 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
 static struct imc_pmu_ref *trace_imc_refc;
 static int trace_imc_mem_size;
 
+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+   .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+   .id = 0,
+   .refc = 0,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -759,6 +769,20 @@ static void core_imc_counters_release(struct perf_event 
*event)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_CORE) {
+   imc_global_refc.refc--;
+   /*
+* If no other thread is running any core-imc
+* event, set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   }
+   mutex_unlock(&imc_global_refc.lock);
 }
 
 static int core_imc_event_init(struct perf_event *event)
@@ -779,6 +803,22 @@ static int core_imc_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
 
+   /*
+* Take the global lock, and make sure
+* no other thread is running any trace OR thread imc event
+*/
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == 0) {
+   imc_global_refc.id = IMC_DOMAIN_CORE;
+   imc_global_refc.refc++;
+   } else if (imc_global_refc.id == IMC_DOMAIN_CORE) {
+   imc_global_refc.refc++;
+   } else {
+   mutex_unlock(&imc_global_refc.lock);
+   return -EBUSY;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+
event->hw.idx = -1;
pmu = imc_event_to_pmu(event);
 
@@ -877,7 +917,16 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)
 
 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
 {
-   mtspr(SPRN_LDBAR, 0);
+   /*
+* Toggle the bit 0 of LDBAR.
+*
+* If bit 0 of LDBAR is unset, it will stop posting
+* the counetr data to memory.
+* For thread-imc, bit 0 of LDBAR will be set to 1 in the
+* event_add function. So toggle this bit here, to stop the updates
+* to memory in the cpu_offline path.
+*/
+   mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) ^ (1UL << 63)));
return 0;
 }
 
@@ -889,6 +938,24 @@ static int thread_imc_cpu_init(void)
  ppc_thread_imc_cpu_offline);
 }
 
+static void thread_imc_counters_release(struct perf_event *event)
+{
+
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_THREAD) {
+   imc_global_refc.refc--;
+   /*
+* If no other thread is running any thread-imc
+* event, set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   }
+   mutex_unlock(&imc_global_refc.lock);
+}
+
 static int thread_imc_event_init(struct perf_event *event)
 {
u32 config = event->attr.config;
@@ -905,6 +972,27 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->hw.sample_period)
return -EINVAL;
 
+   mutex_lock(&imc_global_refc.lock);
+   /*
+* Check if any other thread is running
+* core-engine, if not set the global id to
+* thread-imc.
+*/
+   if (imc_global_refc.id == 0) {
+   imc_global_refc.id = IMC_

[PATCH v2 0/5] Re-enable IMC trace-mode

2020-01-21 Thread Anju T Sudhakar
commit <249fad734a25> "powerpc/perf: Disable trace_imc pmu"
disables IMC(In-Memory Collection) trace-mode in kernel, since frequent   
mode switching between accumulation mode and trace mode via the spr LDBAR  
in the hardware can trigger a checkstop(system crash).

This patch series re-enables IMC trace mode and fixes the mode switching
issue by global lock mechanism.

Patch 3/5,4/5 and 5/5 provides a selftest to verify the global-lock
mechanism.

Changes from v1 -> v2:
-
- Added self test patches to the series.

Anju T Sudhakar (2):
  powerpc/powernv: Re-enable imc trace-mode in kernel
  powerpc/perf: Implement a global lock to avoid races between trace,
core and thread imc events.

Madhavan Srinivasan (3):
  powerpc/perf: Add an interface sub-folder to imc pmu
  selftest/powerpc/pmc: Support to include interface test for Memory
Counter PMUs
  selftest/powerpc/pmu: Testcase for imc global lock mechanism


 arch/powerpc/include/asm/imc-pmu.h|  11 +-
 arch/powerpc/perf/imc-pmu.c   | 196 +++---
 arch/powerpc/platforms/powernv/opal-imc.c |   9 +-
 tools/testing/selftests/powerpc/pmu/Makefile  |   7 +-
 .../powerpc/pmu/mem_counters/Makefile |  21 ++
 .../pmu/mem_counters/imc_global_lock_test.c   |  68 ++
 .../powerpc/pmu/mem_counters/mem_counters.c   |  99 +
 .../powerpc/pmu/mem_counters/mem_counters.h   |  36 
 8 files changed, 408 insertions(+), 39 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/Makefile
 create mode 100644 
tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
 create mode 100644 
tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c
 create mode 100644 
tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.h

-- 
2.18.1



[PATCH v2 1/5] powerpc/powernv: Re-enable imc trace-mode in kernel

2020-01-21 Thread Anju T Sudhakar
commit <249fad734a25> "powerpc/perf: Disable trace_imc pmu"
disables IMC(In-Memory Collection) trace-mode in kernel, since frequent
mode switching between accumulation mode and trace mode via the spr LDBAR
in the hardware can trigger a checkstop(system crash).

Patch to re-enable imc-trace mode in kernel.

The following patch in this series will address the mode switching issue
by implementing a global lock, and will restrict the usage of
accumulation and trace-mode at a time.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 000b350d4060..3b4518f4b643 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -278,14 +278,7 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
-   /*
-* FIXME. Using trace_imc events to monitor application
-* or KVM thread performance can cause a checkstop
-* (system crash).
-* Disable it for now.
-*/
-   pr_info_once("IMC: disabling trace_imc PMU\n");
-   domain = -1;
+   domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
-- 
2.20.1



[PATCH v2 4/5] selftest/powerpc/pmc: Support to include interface test for Memory Counter PMUs

2020-01-21 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Patch to add support to include interface tests for
memory counter PMUs as part of selftest.
These PMUs are primarily used to understand socket/chip/core
resource usage. In the PowerNV environment, the perf interface
registered to access these counters are called "In Memory Collection"
(IMC) and in PowerVM, the perf interface registered to access
these counters are called "hv_24x7".

New folder "mem_counters" added under selftest/powerpc/pmu.
This will include interface tests for both "imc" and "hv_24x7"
pmus. Patch adds the base/common functions needed.
To make blame easier, a place-holder test function added to
this patch. Subsequent patch will fill in the actual test
content.

Signed-off-by: Madhavan Srinivasan 
---
 tools/testing/selftests/powerpc/pmu/Makefile  |  7 +-
 .../powerpc/pmu/mem_counters/Makefile | 21 
 .../pmu/mem_counters/imc_global_lock_test.c   | 21 
 .../powerpc/pmu/mem_counters/mem_counters.c   | 99 +++
 .../powerpc/pmu/mem_counters/mem_counters.h   | 36 +++
 5 files changed, 182 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/Makefile
 create mode 100644 
tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
 create mode 100644 
tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c
 create mode 100644 
tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.h

diff --git a/tools/testing/selftests/powerpc/pmu/Makefile 
b/tools/testing/selftests/powerpc/pmu/Makefile
index 19046db995fe..e352eceac0a9 100644
--- a/tools/testing/selftests/powerpc/pmu/Makefile
+++ b/tools/testing/selftests/powerpc/pmu/Makefile
@@ -8,7 +8,7 @@ EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c
 top_srcdir = ../../../../..
 include ../../lib.mk
 
-all: $(TEST_GEN_PROGS) ebb
+all: $(TEST_GEN_PROGS) ebb mem_counters
 
 $(TEST_GEN_PROGS): $(EXTRA_SOURCES)
 
@@ -43,4 +43,7 @@ clean:
 ebb:
TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; 
$(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all
 
-.PHONY: all run_tests clean ebb
+mem_counters:
+   TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; 
$(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all
+
+.PHONY: all run_tests clean ebb mem_counters
diff --git a/tools/testing/selftests/powerpc/pmu/mem_counters/Makefile 
b/tools/testing/selftests/powerpc/pmu/mem_counters/Makefile
new file mode 100644
index ..f39ebe30ab70
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/mem_counters/Makefile
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+include ../../../../../../scripts/Kbuild.include
+
+noarg:
+   $(MAKE) -C ../../
+
+CFLAGS += -m64
+
+# Toolchains may build PIE by default which breaks the assembly
+no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \
+$(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o 
"$$TMP", -no-pie)
+
+LDFLAGS += $(no-pie-option)
+
+TEST_GEN_PROGS := imc_global_lock_test
+
+top_srcdir = ../../../../../..
+include ../../../lib.mk
+
+$(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c 
./mem_counters.c \
+   imc_global_lock_test.c
diff --git 
a/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c 
b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
new file mode 100644
index ..ea687ffc1990
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020, Madhavan Srinivasan, IBM Corp.
+ */
+
+#include "mem_counters.h"
+
+static int testcase(void)
+{
+   return 0;
+}
+
+static int imc_global_lock_test(void)
+{
+   return eat_cpu(testcase);
+}
+
+int main(void)
+{
+   return test_harness(imc_global_lock_test, "imc_global_lock_test");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c 
b/tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c
new file mode 100644
index ..b0ee1319f018
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020, Madhavan Srinivasan, IBM Corp.
+ */
+
+#include "mem_counters.h"
+
+/*
+ * mem_counters.c will contain common/basic functions
+ * to support testcases for both In Memory Collection (IMC)
+ * and hv_24x7 counters.
+ */
+
+
+/*
+ * Since device type enum starts with 1,
+ * have the first entry in the array as a placeholder.
+ */
+const char mem_counters_dev_path[][30] = {
+   "",
+   "/sys/devices/thread_imc",
+   "/sys/devices/trace_imc",
+   "/sys/devices/core_imc",
+   "/sys/devices/hv_24x7",
+   "",
+};
+
+const char mem_counters_dev_type_path[][35] = {
+   "",
+   "/sys/devices/thread_imc/type",
+   "/sys/devices/trace_imc/type",
+   "/sys/devices/core_imc/t

[PATCH v2 5/5] selftest/powerpc/pmu: Testcase for imc global lock mechanism

2020-01-21 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Signed-off-by: Madhavan Srinivasan 
---
 .../pmu/mem_counters/imc_global_lock_test.c   | 49 ++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git 
a/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c 
b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
index ea687ffc1990..f643dba8ecc0 100644
--- a/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
+++ b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c
@@ -5,9 +5,56 @@
 
 #include "mem_counters.h"
 
+static  bool check_imc_interface_glob_lck(void)
+{
+   if (!access("/sys/devices/thread_imc/interface/glob_lck", F_OK))
+   return true;
+
+   return false;
+}
+
 static int testcase(void)
 {
-   return 0;
+   struct event events[2];
+
+   if (!check_imc_interface_glob_lck()) {
+   printf("Test not supported\n");
+   return MAGIC_SKIP_RETURN_VALUE;
+   }
+
+   if (!is_mem_counters_device_enabled(CORE) || 
!is_mem_counters_device_enabled(THREAD)) {
+   printf("%s: IMC device not found. So exiting the test\n", 
__FUNCTION__);
+   return -1;
+   }
+
+   if (setup_mem_counters_event(THREAD, &events[0], 0xe0, 
"thread_imc/cycles")) {
+   printf("%s setup_mem_counters_event for thread_imc failed\n", 
__FUNCTION__);
+   return -1;
+   }
+
+   if (setup_mem_counters_event(CORE, &events[1], 0xe0, 
"core_imc/cycles")) {
+   printf("%s setup_mem_counters_event for core_imc failed\n", 
__FUNCTION__);
+   return -1;
+   }
+
+   if (event_open(&events[0])) {
+   perror("thread_imc: perf_event_open");
+   return -1;
+   }
+
+   /*
+* If we have the Global lock patchset applied to kernel
+* event_open for events[1] should fail with resource busy
+*/
+   if (event_open_with_cpu(&events[1], 0)) {
+   /*
+* Check for the errno to certify the test result
+*/
+   if (errno == 16) // Resource busy (EBUSY)
+   return 0;
+   }
+
+   return -1;
 }
 
 static int imc_global_lock_test(void)
-- 
2.20.1



[PATCH v2 2/5] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.

2020-01-21 Thread Anju T Sudhakar
IMC(In-memory Collection Counters) does performance monitoring in
two different modes, i.e accumulation mode(core-imc and thread-imc events),
and trace mode(trace-imc events). A cpu thread can either be in
accumulation-mode or trace-mode at a time and this is done via the LDBAR
register in POWER architecture. The current design does not address the
races between thread-imc and trace-imc events.

Patch implements a global id and lock to avoid the races between
core, trace and thread imc events. With this global id-lock
implementation, the system can either run core, thread or trace imc
events at a time. i.e. to run any core-imc events, thread/trace imc events
should not be enabled/monitored.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 177 +++-
 1 file changed, 153 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..2e220f199530 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
 static struct imc_pmu_ref *trace_imc_refc;
 static int trace_imc_mem_size;
 
+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+   .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+   .id = 0,
+   .refc = 0,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -759,6 +769,20 @@ static void core_imc_counters_release(struct perf_event 
*event)
ref->refc = 0;
}
mutex_unlock(&ref->lock);
+
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_CORE) {
+   imc_global_refc.refc--;
+   /*
+* If no other thread is running any core-imc
+* event, set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   }
+   mutex_unlock(&imc_global_refc.lock);
 }
 
 static int core_imc_event_init(struct perf_event *event)
@@ -779,6 +803,22 @@ static int core_imc_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
 
+   /*
+* Take the global lock, and make sure
+* no other thread is running any trace OR thread imc event
+*/
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == 0) {
+   imc_global_refc.id = IMC_DOMAIN_CORE;
+   imc_global_refc.refc++;
+   } else if (imc_global_refc.id == IMC_DOMAIN_CORE) {
+   imc_global_refc.refc++;
+   } else {
+   mutex_unlock(&imc_global_refc.lock);
+   return -EBUSY;
+   }
+   mutex_unlock(&imc_global_refc.lock);
+
event->hw.idx = -1;
pmu = imc_event_to_pmu(event);
 
@@ -877,7 +917,16 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)
 
 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
 {
-   mtspr(SPRN_LDBAR, 0);
+   /*
+* Toggle the bit 0 of LDBAR.
+*
+* If bit 0 of LDBAR is unset, it will stop posting
+* the counetr data to memory.
+* For thread-imc, bit 0 of LDBAR will be set to 1 in the
+* event_add function. So toggle this bit here, to stop the updates
+* to memory in the cpu_offline path.
+*/
+   mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) ^ (1UL << 63)));
return 0;
 }
 
@@ -889,6 +938,24 @@ static int thread_imc_cpu_init(void)
  ppc_thread_imc_cpu_offline);
 }
 
+static void thread_imc_counters_release(struct perf_event *event)
+{
+
+   mutex_lock(&imc_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_THREAD) {
+   imc_global_refc.refc--;
+   /*
+* If no other thread is running any thread-imc
+* event, set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   }
+   mutex_unlock(&imc_global_refc.lock);
+}
+
 static int thread_imc_event_init(struct perf_event *event)
 {
u32 config = event->attr.config;
@@ -905,6 +972,27 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->hw.sample_period)
return -EINVAL;
 
+   mutex_lock(&imc_global_refc.lock);
+   /*
+* Check if any other thread is running
+* core-engine, if not set the global id to
+* thread-imc.
+*/
+   if (imc_global_refc.id == 0) {
+   imc_global_refc.id = IMC_

[PATCH v2 3/5] powerpc/perf: Add an interface sub-folder to imc pmu

2020-01-21 Thread Anju T Sudhakar
From: Madhavan Srinivasan 

Patch adds an interface attribute folder to imc pmu.
This is intended to include pmu interface capabilities
which will be useful to userspace, such as selftest
testcases. Patch adds a "glob_lck" file to notify
userspace of the global lock mechanism added to imc devices
like core, thread and trace.

"glob_lck" will be used by selftest file to execute
interface test for the global lock mechanism.

Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h | 11 ++-
 arch/powerpc/perf/imc-pmu.c| 19 +++
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 4da4fcba0684..1b2c33c30e7c 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -87,8 +87,9 @@ struct trace_imc_data {
 /* Event attribute array index */
 #define IMC_FORMAT_ATTR0
 #define IMC_EVENT_ATTR 1
-#define IMC_CPUMASK_ATTR   2
-#define IMC_NULL_ATTR  3
+#define IMC_INTERFACE_ATTR 2
+#define IMC_CPUMASK_ATTR   3
+#define IMC_NULL_ATTR  4
 
 /* PMU Format attribute macros */
 #define IMC_EVENT_OFFSET_MASK  0xULL
@@ -114,10 +115,10 @@ struct imc_pmu {
/*
 * Attribute groups for the PMU. Slot 0 used for
 * format attribute, slot 1 used for cpusmask attribute,
-* slot 2 used for event attribute. Slot 3 keep as
-* NULL.
+* slot 2 used for event attribute. Slot 3 used for interface
+* attribute and Slot 4 is NULL.
 */
-   const struct attribute_group *attr_groups[4];
+   const struct attribute_group *attr_groups[5];
u32 counter_mem_size;
int domain;
/*
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 2e220f199530..3f49664f29f1 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -54,6 +54,24 @@ static struct imc_pmu_ref imc_global_refc = {
.refc = 0,
 };
 
+static ssize_t glob_lck_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   return sprintf(buf, "%d\n", 1);
+}
+
+static DEVICE_ATTR_RO(glob_lck);
+
+static struct attribute *imc_interface_attrs[] = {
+   &dev_attr_glob_lck.attr,
+   NULL,
+};
+
+static struct attribute_group imc_interface_group = {
+   .name = "interface",
+   .attrs = imc_interface_attrs,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -1462,6 +1480,7 @@ static int update_pmu_ops(struct imc_pmu *pmu)
pmu->pmu.attr_groups = pmu->attr_groups;
pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
pmu->attr_groups[IMC_FORMAT_ATTR] = &imc_format_group;
+   pmu->attr_groups[IMC_INTERFACE_ATTR] = &imc_interface_group;
 
switch (pmu->domain) {
case IMC_DOMAIN_NEST:
-- 
2.20.1



Re: [PATCH v5 07/10] powerpc/perf: open access for CAP_PERFMON privileged process

2020-01-22 Thread Anju T Sudhakar



On 1/20/20 5:00 PM, Alexey Budankov wrote:

Open access to monitoring for CAP_PERFMON privileged processes.
For backward compatibility reasons access to the monitoring remains
open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage
for secure monitoring is discouraged with respect to CAP_PERFMON
capability. Providing the access under CAP_PERFMON capability singly,
without the rest of CAP_SYS_ADMIN credentials, excludes chances to
misuse the credentials and makes the operations more secure.

Signed-off-by: Alexey Budankov
---


Acked-by: Anju T Sudhakar



[PATCH] platform/powernv: Avoid re-registration of imc debugfs directory

2019-08-20 Thread Anju T Sudhakar
export_imc_mode_and_cmd() function which creates the debugfs interface for
imc-mode and imc-command, is invoked when each nest pmu unit is
registered.
When the first nest pmu unit is registered, export_imc_mode_and_cmd()
creates 'imc' directory under `/debug/powerpc`. In the subsequent
invocations debugfs_create_dir() function returns, since the directory
already exists.

The recent commit  (debugfs: make error message a bit more
verbose), throws a warning if we try to invoke `debugfs_create_dir()`
with an already existing directory name.

Address this warning by lookup for an existing 'imc' directory,
and do not invoke debugfs_create_dir(), if the debugfs interface for
imc already exists.

This patch is based on:
   https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-July/192979.html

Signed-off-by: Anju T Sudhakar 
Tested-by: Nageswara R Sastry 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index e04b20625cb9..fc2f0e60a44d 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -55,14 +55,19 @@ static void export_imc_mode_and_cmd(struct device_node 
*node,
static u64 loc, *imc_mode_addr, *imc_cmd_addr;
char mode[16], cmd[16];
u32 cb_offset;
+   struct dentry *dir = NULL;
struct imc_mem_info *ptr = pmu_ptr->mem_info;
 
+
+   /* Return, if 'imc' interface already exists */
+   dir = debugfs_lookup("imc", powerpc_debugfs_root);
+   if (dir) {
+   dput(dir);
+   return;
+   }
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
 
-   /*
-* Return here, either because 'imc' directory already exists,
-* Or failed to create a new one.
-*/
+   /* Return here, if failed to create the directory */
if (!imc_debugfs_parent)
return;
 
-- 
2.20.1



Re: [PATCH] platform/powernv: Avoid re-registration of imc debugfs directory

2019-08-20 Thread Anju T Sudhakar

Hi,

On 8/21/19 10:16 AM, Oliver O'Halloran wrote:

On Wed, Aug 21, 2019 at 2:10 PM Anju T Sudhakar  wrote:

export_imc_mode_and_cmd() function which creates the debugfs interface for
imc-mode and imc-command, is invoked when each nest pmu units is
registered.
When the first nest pmu unit is registered, export_imc_mode_and_cmd()
creates 'imc' directory under `/debug/powerpc`. In the subsequent
invocations debugfs_create_dir() function returns, since the directory
already exists.

The recent commit  (debugfs: make error message a bit more
verbose), throws a warning if we try to invoke `debugfs_create_dir()`
with an already existing directory name.

Address this warning by lookup for an existing 'imc' directory,
and do not invoke debugfs_create_dir(), if the debugfs interface for
imc already exists.

This patch is based on:
https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-July/192979.html

Signed-off-by: Anju T Sudhakar 
Tested-by: Nageswara R Sastry 
---
  arch/powerpc/platforms/powernv/opal-imc.c | 13 +
  1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index e04b20625cb9..fc2f0e60a44d 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -55,14 +55,19 @@ static void export_imc_mode_and_cmd(struct device_node 
*node,
 static u64 loc, *imc_mode_addr, *imc_cmd_addr;
 char mode[16], cmd[16];
 u32 cb_offset;
+   struct dentry *dir = NULL;
 struct imc_mem_info *ptr = pmu_ptr->mem_info;

+
+   /* Return, if 'imc' interface already exists */
+   dir = debugfs_lookup("imc", powerpc_debugfs_root);
+   if (dir) {
+   dput(dir);
+   return;
+   }
 imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);

Is there a reason why we create the debugfs directory here and not in
opal_imc_counters_probe()? There's logic to remove the debugfs
directory in _probe() already so it seems like a more natural place to
it.


Good point. But we can only create the parent directory,

i.e 'imc' directory in `_probe()` function and the entries can be

created only here. The reason is, this debugfs entries are only for

IMC nest units. So, to get the imc mode and command values from

the nest memory region we need the relevant offsets from the control 
block structure.


Since imc_get_mem_addr_nest() function reads this address

for each chip, we invoke the function to create the debugfs

entries after this values are populated(i.e export_imc_mode_and_cmd() in 
invoked by


imc_get_mem_addr_nest()).

Also, if we create the parent directory in `_probe()` function,

we need to track whether the entries(i.e imc_cmd and imc_mode) are 
created or not.



Regards,

Anju



[PATCH v2] platform/powernv: Avoid re-registration of imc debugfs directory

2019-09-03 Thread Anju T Sudhakar
export_imc_mode_and_cmd() function which creates the debugfs interface for
imc-mode and imc-command, is invoked when each nest pmu unit is
registered.
When the first nest pmu unit is registered, export_imc_mode_and_cmd()
creates 'imc' directory under `/debug/powerpc/`. In the subsequent
invocations debugfs_create_dir() function returns, since the directory
already exists.

The recent commit  (debugfs: make error message a bit more
verbose), throws a warning if we try to invoke `debugfs_create_dir()`
with an already existing directory name.

Address this warning by searching for an existing 'imc' directory,
and do not invoke debugfs_create_dir(), if the debugfs interface for
imc already exists.

This patch is based on:
https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-August/195898.html

Signed-off-by: Anju T Sudhakar 
Tested-by: Nageswara R Sastry 
---
Changes from v1 -> v2

* Minor changes in the commit message.
---
 arch/powerpc/platforms/powernv/opal-imc.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index e04b20625cb9..fc2f0e60a44d 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -55,14 +55,19 @@ static void export_imc_mode_and_cmd(struct device_node 
*node,
static u64 loc, *imc_mode_addr, *imc_cmd_addr;
char mode[16], cmd[16];
u32 cb_offset;
+   struct dentry *dir = NULL;
struct imc_mem_info *ptr = pmu_ptr->mem_info;
 
+
+   /* Return, if 'imc' interface already exists */
+   dir = debugfs_lookup("imc", powerpc_debugfs_root);
+   if (dir) {
+   dput(dir);
+   return;
+   }
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
 
-   /*
-* Return here, either because 'imc' directory already exists,
-* Or failed to create a new one.
-*/
+   /* Return here, if failed to create the directory */
if (!imc_debugfs_parent)
return;
 
-- 
2.20.1



[PATCH] powerpc/perf: Add kernel support for new MSR[HV PR] bits in trace-imc.

2020-07-02 Thread Anju T Sudhakar
IMC trace-mode record has MSR[HV PR] bits added in the third DW. 
These bits can be used to set the cpumode for the instruction pointer
captured in each sample.
  
   
Add support in kernel to use these bits to set the cpumode for 
each sample.   
   
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/include/asm/imc-pmu.h |  5 +
 arch/powerpc/perf/imc-pmu.c| 29 -
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 4da4fcba0684..4f897993b710 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -99,6 +99,11 @@ struct trace_imc_data {
  */
 #define IMC_TRACE_RECORD_TB1_MASK  0x3ffULL
 
+/*
+ * Bit 0:1 in third DW of IMC trace record
+ * specifies the MSR[HV PR] values.
+ */
+#define IMC_TRACE_RECORD_VAL_HVPR(x)   ((x) >> 62)
 
 /*
  * Device tree parser code detects IMC pmu support and
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..310922fed9eb 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1178,11 +1178,30 @@ static int trace_imc_prepare_sample(struct 
trace_imc_data *mem,
header->size = sizeof(*header) + event->header_size;
header->misc = 0;
 
-   if (is_kernel_addr(data->ip))
-   header->misc |= PERF_RECORD_MISC_KERNEL;
-   else
-   header->misc |= PERF_RECORD_MISC_USER;
-
+   if (cpu_has_feature(CPU_FTRS_POWER9)) {
+   if (is_kernel_addr(data->ip))
+   header->misc |= PERF_RECORD_MISC_KERNEL;
+   else
+   header->misc |= PERF_RECORD_MISC_USER;
+   } else {
+   switch (IMC_TRACE_RECORD_VAL_HVPR(mem->val)) {
+   case 0:/* when MSR HV and PR not set in the trace-record */
+   header->misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+   break;
+   case 1: /* MSR HV is 0 and PR is 1 */
+   header->misc |= PERF_RECORD_MISC_GUEST_USER;
+   break;
+   case 2: /* MSR Hv is 1 and PR is 0 */
+   header->misc |= PERF_RECORD_MISC_HYPERVISOR;
+   break;
+   case 3: /* MSR HV is 1 and PR is 1 */
+   header->misc |= PERF_RECORD_MISC_USER;
+   break;
+   default:
+   pr_info("IMC: Unable to set the flag based on MSR 
bits\n");
+   break;
+   }
+   }
perf_event_header__init_id(header, data, event);
 
return 0;
-- 
2.25.4



[PATCH] powerpc/imc: Add documentation for IMC and trace-mode

2019-05-10 Thread Anju T Sudhakar
Documentation for IMC(In-Memory Collection Counters) infrastructure
and trace-mode of IMC.

Signed-off-by: Anju T Sudhakar 
---
 Documentation/powerpc/imc.txt | 195 ++
 1 file changed, 195 insertions(+)
 create mode 100644 Documentation/powerpc/imc.txt

diff --git a/Documentation/powerpc/imc.txt b/Documentation/powerpc/imc.txt
new file mode 100644
index ..9c32e059f3be
--- /dev/null
+++ b/Documentation/powerpc/imc.txt
@@ -0,0 +1,195 @@
+   ===
+   IMC (In-Memory Collection Counters)
+   ===
+   Date created: 10 May 2019
+
+Table of Contents:
+--
+   - Basic overview
+   - IMC example Usage
+   - IMC Trace Mode
+   - LDBAR Register Layout
+   - TRACE_IMC_SCOM bit representation
+   - Trace IMC example usage
+   - Benefits of using IMC trace-mode
+
+
+Basic overview
+==
+
+IMC (In-Memory collection counters) is a hardware monitoring facility
+that collects large number of hardware performance events at Nest level
+(these are on-chip but off-core), Core level and Thread level.
+
+The Nest PMU counters are handled by a Nest IMC microcode which runs
+in the OCC (On-Chip Controller) complex. The microcode collects the
+counter data and moves the nest IMC counter data to memory.
+
+The Core and Thread IMC PMU counters are handled in the core. Core
+level PMU counters give us the IMC counters' data per core and thread
+level PMU counters give us the IMC counters' data per CPU thread.
+
+OPAL obtains the IMC PMU and supported events information from the
+IMC Catalog and passes on to the kernel via the device tree. The event's
+information contains :
+ - Event name
+ - Event Offset
+ - Event description
+and, maybe :
+ - Event scale
+ - Event unit
+
+Some PMUs may have a common scale and unit values for all their
+supported events. For those cases, the scale and unit properties for
+those events must be inherited from the PMU.
+
+The event offset in the memory is where the counter data gets
+accumulated.
+
+IMC catalog is available at:
+   https://github.com/open-power/ima-catalog
+
+The kernel discovers the IMC counters information in the device tree
+at the "imc-counters" device node which has a compatible field
+"ibm,opal-in-memory-counters". From the device tree, the kernel parses
+the PMUs and their event's information and register the PMU and it
+attributes in the kernel.
+
+IMC example usage
+=
+
+# perf list
+
+  [...]
+  nest_mcs01/PM_MCS01_64B_RD_DISP_PORT01/[Kernel PMU event]
+  nest_mcs01/PM_MCS01_64B_RD_DISP_PORT23/[Kernel PMU event]
+
+  [...]
+  core_imc/CPM_0THRD_NON_IDLE_PCYC/  [Kernel PMU event]
+  core_imc/CPM_1THRD_NON_IDLE_INST/  [Kernel PMU event]
+
+  [...]
+  thread_imc/CPM_0THRD_NON_IDLE_PCYC/[Kernel PMU event]
+  thread_imc/CPM_1THRD_NON_IDLE_INST/[Kernel PMU event]
+
+To see per chip data for nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0/ :
+ # ./perf stat -e "nest_mcs01/PM_MCS01_64B_WR_DISP_PORT01/" -a --per-socket
+
+To see non-idle instructions for core 0 :
+ # ./perf stat -e "core_imc/CPM_NON_IDLE_INST/" -C 0 -I 1000
+
+To see non-idle instructions for a "make" :
+ # ./perf stat -e "thread_imc/CPM_NON_IDLE_PCYC/" make
+
+
+IMC Trace-mode
+===
+
+POWER9 supports two modes for IMC which are the Accumulation mode and
+Trace mode. In Accumulation mode, event counts are accumulated in system
+memory. The hypervisor then reads the posted counts periodically or when
+requested. In IMC Trace mode, the 64 bit trace scom value is initialized
+with the event information. The CPMC*SEL and CPMC_LOAD in the trace scom,
+specifies the event to be monitored and the sampling duration. On each
+overflow in the CPMC*SEL, hardware snapshots the program counter along
+with event counts and writes into memory pointed by LDBAR.
+
+LDBAR is a 64 bit special purpose per thread register, it has bits to
+indicate whether hardware is configured for accumulation or trace mode.
+
+* LDBAR Register Layout:
+   0 : Enable/Disable
+   1 : 0 -> Accumulation Mode
+   1 -> Trace Mode
+   2:3   : Reserved
+   4-6   : PB scope
+   7 : Reserved
+   8:50  : Counter Address
+   51:63 : Reserved
+
+* TRACE_IMC_SCOM bit representation:
+
+   0:1 : SAMPSEL
+   2:33: CPMC_LOAD
+   34:40   : CPMC1SEL
+   41:47   : CPMC2SEL
+   48:50   : BUFFERSIZE
+   51:63   : RESERVED
+
+CPMC_LOAD contains the sampling duration. SAMPSEL and CPMC*SEL determine
+the event to count. BUFFERSIZE indicates the memory range. On each overflow,
+hardware snapshots the program counter along with event counts, updates the
+memory, and reloads the CPMC_LOAD 

[PATCH] powerpc/powernv: Return for invalid IMC domain

2019-05-20 Thread Anju T Sudhakar
Currently init_imc_pmu() can fail either because
an IMC unit with an invalid domain (i.e. an IMC node not
supported by the kernel) attempted a pmu-registration,
or because something went wrong while registering a valid IMC unit.
In both the cases kernel provides a 'Registration failed'
error message.

Example:
Log message, when trace-imc node is not supported by the kernel, and the
skiboot supports trace-imc node.

So for kernel, trace-imc node is now an unknown domain.

[1.731870] nest_phb5_imc performance monitor hardware support registered
[1.731944] nest_powerbus0_imc performance monitor hardware support 
registered
[1.734458] thread_imc performance monitor hardware support registered
[1.734460] IMC Unknown Device type
[1.734462] IMC PMU (null) Register failed
[1.734558] nest_xlink0_imc performance monitor hardware support registered
[1.734614] nest_xlink1_imc performance monitor hardware support registered
[1.734670] nest_xlink2_imc performance monitor hardware support registered
[1.747043] Initialise system trusted keyrings
[1.747054] Key type blacklist registered


To avoid ambiguity on the error message, return for invalid domain
before attempting a pmu registration. 

Fixes: 8f95faaac56c1 (`powerpc/powernv: Detect and create IMC device`)
Reported-by: Pavaman Subramaniyam 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 58a0794..4e8b0e1 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -161,6 +161,10 @@ static int imc_pmu_create(struct device_node *parent, int 
pmu_index, int domain)
struct imc_pmu *pmu_ptr;
u32 offset;
 
+   /* Return for unknown domain */
+   if (domain < 0)
+   return -EINVAL;
+
/* memory for pmu */
pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL);
if (!pmu_ptr)
-- 
1.8.3.1



[PATCH] powerpc/perf: Use cpumask_last() to determine the

2019-05-20 Thread Anju T Sudhakar
Nest and core imc(In-memory Collection counters) assigns a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip(for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core(for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core are offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which go offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.

Example:
In a system which has 2 sockets, with
NUMA node0 CPU(s): 0-87
NUMA node8 CPU(s): 88-175

Time taken to offline cpu 88-175:
real2m56.099s
user0m0.191s
sys 0m0.000s

Use cpumask_last() to choose the target cpu, when the designated cpu
goes offline, so the migration will happen only when the last_cpu in the
mask goes offline. This way the time taken to offline all cpus in a
chip/core can be reduced.

With the patch, 

Time taken  to offline cpu 88-175:
real0m12.207s
user0m0.171s
sys 0m0.000s

cpumask_last() is a better way to find the target cpu, since in most of the
cases cpuhotplug is performed in an increasing order(even in ppc64_cpu).

cpumask_any_but() can still be used to check the possibility of other
online cpus from the same chip/core if the last cpu in the mask goes
offline.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 31fa753..fbfd6e7 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
 */
nid = cpu_to_node(cpu);
l_cpumask = cpumask_of_node(nid);
-   target = cpumask_any_but(l_cpumask, cpu);
+   target = cpumask_last(l_cpumask);
+
+   /*
+* If this(target) is the last cpu in the cpumask for this chip,
+* check for any possible online cpu in the chip.
+*/
+   if (unlikely(target == cpu))
+   target = cpumask_any_but(l_cpumask, cpu);
 
/*
 * Update the cpumask with the target cpu and
@@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return 0;
 
/* Find any online cpu in that core except the current "cpu" */
-   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
+   ncpu = cpumask_last(cpu_sibling_mask(cpu));
+
+   if (unlikely(ncpu == cpu))
+   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
 
if (ncpu >= 0 && ncpu < nr_cpu_ids) {
cpumask_set_cpu(ncpu, &core_imc_cpumask);
-- 
1.8.3.1



Re: [PATCH] powerpc/perf: Use cpumask_last() to determine the

2019-05-20 Thread Anju T Sudhakar

Hi,

Somehow the subject of this patch didn't appear completely here.

The Subject of this patch is as follows,

`Subject [PATCH] powerpc/perf: Use cpumask_last() to determine the 
designated cpu for nest/core units.`


Thanks,

Anju


On 5/20/19 2:35 PM, Anju T Sudhakar wrote:

Nest and core imc(In-memory Collection counters) assigns a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip(for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core(for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core is offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which goes offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.





Re: [PATCH] powerpc/powernv: Return for invalid IMC domain

2019-05-22 Thread Anju T Sudhakar

Hi,

On 5/21/19 5:18 PM, Michael Ellerman wrote:

Anju T Sudhakar  writes:

Currently init_imc_pmu() can be failed either because
an IMC unit with invalid domain(i.e an IMC node not
supported by the kernel) is attempted a pmu-registration
or something went wrong while registering a valid IMC unit.
In both the cases kernel provides a 'Registration failed'
error message.

Example:
Log message, when trace-imc node is not supported by the kernel, and the
skiboot supports trace-imc node.

So for kernel, trace-imc node is now an unknown domain.

[1.731870] nest_phb5_imc performance monitor hardware support registered
[1.731944] nest_powerbus0_imc performance monitor hardware support 
registered
[1.734458] thread_imc performance monitor hardware support registered
[1.734460] IMC Unknown Device type
[1.734462] IMC PMU (null) Register failed
[1.734558] nest_xlink0_imc performance monitor hardware support registered
[1.734614] nest_xlink1_imc performance monitor hardware support registered
[1.734670] nest_xlink2_imc performance monitor hardware support registered
[1.747043] Initialise system trusted keyrings
[1.747054] Key type blacklist registered


To avoid ambiguity on the error message, return for invalid domain
before attempting a pmu registration.

What do we print once the patch is applied?



Once the patch is applied, we return for invalid domains. so we will 
only have


`/IMC Unknown Device type/` message printed for *unknown domains*.

And `/IMC PMU (null) Register failed/` message will appear only if the

registration fails for a *known domain*.



Thanks,

Anju




[PATCH 1/2] tools/perf: Add arch neutral function to choose event for perf kvm record

2019-05-24 Thread Anju T Sudhakar
'perf kvm record' uses 'cycles'(if the user did not specify any event) as
the default event to profile the guest.
This will not provide any proper samples from the guest in case of
the powerpc architecture, since in powerpc the PMUs are controlled by
the guest rather than the host.

Patch adds a function to pick an arch specific event for 'perf kvm record',
instead of selecting 'cycles' as a default event for all architectures.

For powerpc this function checks for any user specified event, and if there
isn't any it returns invalid instead of proceeding with 'cycles' event.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 37 +
 tools/perf/builtin-kvm.c| 12 +++-
 tools/perf/util/kvm-stat.h  |  2 +-
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c 
b/tools/perf/arch/powerpc/util/kvm-stat.c
index f9db341c47b6..66f8fe500945 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -8,6 +8,7 @@
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
+#include 
 
 #define NR_TPS 4
 
@@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char 
*cpuid __maybe_unused)
 
return ret;
 }
+
+/*
+ * Incase of powerpc architecture, pmu registers are programmable
+ * by guest kernel. So monitoring guest via host may not provide
+ * valid samples. It is better to fail the "perf kvm record"
+ * with default "cycles" event to monitor guest in powerpc.
+ *
+ * Function to parse the arguments and return appropriate values.
+ */
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+   const char **tmp;
+   bool event = false;
+   int i, j = *argc;
+
+   const struct option event_options[] = {
+   OPT_BOOLEAN('e', "event", &event, NULL),
+   OPT_END()
+   };
+
+   tmp = calloc(j + 1, sizeof(char *));
+   if (!tmp)
+   return -EINVAL;
+
+   for (i = 0; i < j; i++)
+   tmp[i] = argv[i];
+
+   parse_options(j, tmp, event_options, NULL, 0);
+   if (!event) {
+   free(tmp);
+   return -EINVAL;
+   }
+
+   free(tmp);
+   return 0;
+}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index dbb6f737a3e2..fe33b3ec55c9 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int 
argc, const char **argv)
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
+   const char **argv __maybe_unused)
+{
+   return 0;
+}
+
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
-   int rec_argc, i = 0, j;
+   int rec_argc, i = 0, j, ret;
const char **rec_argv;
 
+   ret = kvm_add_default_arch_event(&argc, argv);
+   if (ret)
+   return -EINVAL;
+
rec_argc = argc + 2;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
rec_argv[i++] = strdup("record");
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 1403dec189b4..da38b56c46cb 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -144,5 +144,5 @@ extern const int decode_str_len;
 extern const char *kvm_exit_reason;
 extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
-
+extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
-- 
2.17.2



[PATCH 2/2] tools/perf: Set 'trace_cycles' as defaultevent for perf kvm record in powerpc

2019-05-24 Thread Anju T Sudhakar
Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record'
in powerpc.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c 
b/tools/perf/arch/powerpc/util/kvm-stat.c
index 66f8fe500945..b552884263df 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char 
*cpuid __maybe_unused)
 /*
  * Incase of powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
- * valid samples. It is better to fail the "perf kvm record"
- * with default "cycles" event to monitor guest in powerpc.
+ * valid samples with default 'cycles' event. It is better to use
+ * 'trace_imc/trace_cycles' event for guest profiling, since it
+ * can track the guest instruction pointer in the trace-record.
  *
  * Function to parse the arguments and return appropriate values.
  */
@@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char 
**argv)
 
parse_options(j, tmp, event_options, NULL, 0);
if (!event) {
-   free(tmp);
-   return -EINVAL;
+   if (pmu_have_event("trace_imc", "trace_cycles")) {
+   argv[j++] = strdup("-e");
+   argv[j++] = strdup("trace_imc/trace_cycles/");
+   *argc += 2;
+   } else {
+   free(tmp);
+   return -EINVAL;
+   }
}
 
free(tmp);
-- 
2.17.2



[PATCH v2] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units.

2019-06-09 Thread Anju T Sudhakar
Nest and core imc(In-memory Collection counters) assigns a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip(for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core(for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core are offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which go offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.

Example:
In a system which has 2 sockets, with
NUMA node0 CPU(s): 0-87
NUMA node8 CPU(s): 88-175

Time taken to offline cpu 88-175:
real2m56.099s
user0m0.191s
sys 0m0.000s

Use cpumask_last() to choose the target cpu, when the designated cpu
goes offline, so the migration will happen only when the last_cpu in the
mask goes offline. This way the time taken to offline all cpus in a
chip/core can be reduced.

With the patch, 

Time taken  to offline cpu 88-175:
real0m12.207s
user0m0.171s
sys 0m0.000s


Offlining all cpus in reverse order is also taken care because,
cpumask_any_but() is used to find the designated cpu if the last cpu in
the mask goes offline. Since cpumask_any_but() always return the first
cpu in the mask, that becomes the designated cpu and migration will happen
only when the first_cpu in the mask goes offline.

Example:
With the patch,

Time taken to offline cpu from 175-88:
real0m9.330s
user0m0.110s
sys 0m0.000s

Signed-off-by: Anju T Sudhakar 
Reviewed-by: Madhavan Srinivasan 
---

Changes from v1:
Modified the commit log with more info.
---

 arch/powerpc/perf/imc-pmu.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 31fa753..fbfd6e7 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
 */
nid = cpu_to_node(cpu);
l_cpumask = cpumask_of_node(nid);
-   target = cpumask_any_but(l_cpumask, cpu);
+   target = cpumask_last(l_cpumask);
+
+   /*
+* If this(target) is the last cpu in the cpumask for this chip,
+* check for any possible online cpu in the chip.
+*/
+   if (unlikely(target == cpu))
+   target = cpumask_any_but(l_cpumask, cpu);
 
/*
 * Update the cpumask with the target cpu and
@@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return 0;
 
/* Find any online cpu in that core except the current "cpu" */
-   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
+   ncpu = cpumask_last(cpu_sibling_mask(cpu));
+
+   if (unlikely(ncpu == cpu))
+   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
 
if (ncpu >= 0 && ncpu < nr_cpu_ids) {
cpumask_set_cpu(ncpu, &core_imc_cpumask);
-- 
1.8.3.1



[PATCH RESEND 1/2] tools/perf: Add arch neutral function to choose event for perf kvm record

2019-06-09 Thread Anju T Sudhakar
'perf kvm record' uses 'cycles'(if the user did not specify any event) as
the default event to profile the guest.
This will not provide any proper samples from the guest in case of the
powerpc architecture, since in powerpc the PMUs are controlled by
the guest rather than the host.

Patch adds a function to pick an arch specific event for 'perf kvm record',
instead of selecting 'cycles' as a default event for all architectures.

For powerpc this function checks for any user specified event, and if there
isn't any it returns invalid instead of proceeding with 'cycles' event.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 37 +
 tools/perf/builtin-kvm.c| 12 +++-
 tools/perf/util/kvm-stat.h  |  2 +-
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c 
b/tools/perf/arch/powerpc/util/kvm-stat.c
index f9db341c47b6..66f8fe500945 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -8,6 +8,7 @@
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
+#include 
 
 #define NR_TPS 4
 
@@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char 
*cpuid __maybe_unused)
 
return ret;
 }
+
+/*
+ * In case of the powerpc architecture, pmu registers are programmable
+ * by guest kernel. So monitoring guest via host may not provide
+ * valid samples. It is better to fail the "perf kvm record"
+ * with default "cycles" event to monitor guest in powerpc.
+ *
+ * Function to parse the arguments and return appropriate values.
+ */
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+   const char **tmp;
+   bool event = false;
+   int i, j = *argc;
+
+   const struct option event_options[] = {
+   OPT_BOOLEAN('e', "event", &event, NULL),
+   OPT_END()
+   };
+
+   tmp = calloc(j + 1, sizeof(char *));
+   if (!tmp)
+   return -EINVAL;
+
+   for (i = 0; i < j; i++)
+   tmp[i] = argv[i];
+
+   parse_options(j, tmp, event_options, NULL, 0);
+   if (!event) {
+   free(tmp);
+   return -EINVAL;
+   }
+
+   free(tmp);
+   return 0;
+}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index dbb6f737a3e2..fe33b3ec55c9 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int 
argc, const char **argv)
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
+   const char **argv __maybe_unused)
+{
+   return 0;
+}
+
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
-   int rec_argc, i = 0, j;
+   int rec_argc, i = 0, j, ret;
const char **rec_argv;
 
+   ret = kvm_add_default_arch_event(&argc, argv);
+   if (ret)
+   return -EINVAL;
+
rec_argc = argc + 2;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
rec_argv[i++] = strdup("record");
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 1403dec189b4..da38b56c46cb 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -144,5 +144,5 @@ extern const int decode_str_len;
 extern const char *kvm_exit_reason;
 extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
-
+extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
-- 
2.17.2



[PATCH RESEND 2/2] tools/perf: Set 'trace_cycles' as defaultevent for perf kvm record in powerpc

2019-06-09 Thread Anju T Sudhakar
Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record'
in powerpc.

Signed-off-by: Anju T Sudhakar 
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c 
b/tools/perf/arch/powerpc/util/kvm-stat.c
index 66f8fe500945..b552884263df 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char 
*cpuid __maybe_unused)
 /*
  * Incase of powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
- * valid samples. It is better to fail the "perf kvm record"
- * with default "cycles" event to monitor guest in powerpc.
+ * valid samples with default 'cycles' event. It is better to use
+ * 'trace_imc/trace_cycles' event for guest profiling, since it
+ * can track the guest instruction pointer in the trace-record.
  *
  * Function to parse the arguments and return appropriate values.
  */
@@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char 
**argv)
 
parse_options(j, tmp, event_options, NULL, 0);
if (!event) {
-   free(tmp);
-   return -EINVAL;
+   if (pmu_have_event("trace_imc", "trace_cycles")) {
+   argv[j++] = strdup("-e");
+   argv[j++] = strdup("trace_imc/trace_cycles/");
+   *argc += 2;
+   } else {
+   free(tmp);
+   return -EINVAL;
+   }
}
 
free(tmp);
-- 
2.17.2



Re: [PATCH v2] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units.

2019-06-11 Thread Anju T Sudhakar

Hi Leonardo,

On 6/11/19 12:17 AM, Leonardo Bras wrote:

On Mon, 2019-06-10 at 12:02 +0530, Anju T Sudhakar wrote:

Nest and core imc(In-memory Collection counters) assigns a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip(for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core(for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core is offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which goes offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.

Seems like a very interesting work.
Out of curiosity, have you used 'chcpu -d' to create your benchmark?


Here I did not use chcpu to disable the cpu.

I used a script which will offline cpus 88-175 by echoing  `0` to

/sys/devices/system/cpu/cpu*/online.


Regards,

Anju




Re: [PATCH v5 06/13] powerpc/perf: IMC pmu cpumask and cpu hotplug support

2017-03-27 Thread Anju T Sudhakar

Hi Gautham,


Thank you for reviewing the patch.


On Thursday 23 March 2017 05:22 PM, Gautham R Shenoy wrote:

Hi Hemant, Maddy,

On Thu, Mar 16, 2017 at 01:05:00PM +0530, Madhavan Srinivasan wrote:

From: Hemant Kumar 

Adds cpumask attribute to be used by each IMC pmu. Only one cpu (any
online CPU) from each chip for nest PMUs is designated to read counters.

On CPU hotplug, dying CPU is checked to see whether it is one of the
designated cpus, if yes, next online cpu from the same chip (for nest
units) is designated as new cpu to read counters. For this purpose, we
introduce a new state : CPUHP_AP_PERF_POWERPC_NEST_ONLINE.

Cc: Gautham R. Shenoy 
Cc: Balbir Singh 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Anton Blanchard 
Cc: Sukadev Bhattiprolu 
Cc: Michael Neuling 
Cc: Stewart Smith 
Cc: Daniel Axtens 
Cc: Stephane Eranian 
Cc: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
  arch/powerpc/include/asm/opal-api.h|   3 +-
  arch/powerpc/include/asm/opal.h|   3 +
  arch/powerpc/perf/imc-pmu.c| 163 -
  arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
  include/linux/cpuhotplug.h |   1 +
  5 files changed, 169 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index a0aa285869b5..e1c3d4837857 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -168,7 +168,8 @@
  #define OPAL_INT_SET_MFRR 125
  #define OPAL_PCI_TCE_KILL 126
  #define OPAL_NMMU_SET_PTCR127
-#define OPAL_LAST  127
+#define OPAL_NEST_IMC_COUNTERS_CONTROL 145
+#define OPAL_LAST  145

  /* Device tree flags */

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 1ff03a6da76e..d93d08204243 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -227,6 +227,9 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t 
kill_type,
  uint64_t dma_addr, uint32_t npages);
  int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr);

+int64_t opal_nest_imc_counters_control(uint64_t mode, uint64_t value1,
+   uint64_t value2, uint64_t value3);
+
  /* Internal functions */
  extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
   int depth, void *data);
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index f6f1ef9f56af..e46ff6d2a584 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -16,6 +16,7 @@
+static int ppc_nest_imc_cpu_online(unsigned int cpu)
+{

I take it that 'cpu' is coming online.


+   int nid, fcpu, ncpu;
+   struct cpumask *l_cpumask, tmp_mask;
+
+   /* Find the cpumask of this node */
+   nid = cpu_to_node(cpu);
+   l_cpumask = cpumask_of_node(nid);
+
+   /*
+* If any of the cpu from this node is already present in the mask,
+* just return, if not, then set this cpu in the mask.
+*/
+   if (!cpumask_and(&tmp_mask, l_cpumask, &nest_imc_cpumask)) {

In this case, none of the cpus in the node are in the mask. So we set
and this cpu in the imc cpumask and return.


+   cpumask_set_cpu(cpu, &nest_imc_cpumask);
+   return 0;
+   }

But this case implies that there is already a CPU from the node which
is in the imc_cpumask. As per the comment above, we are supposed to
just return. So why are we doing the following ?

Either the comment above is incorrect or I am missing something here.


+
+   fcpu = cpumask_first(l_cpumask);
+   ncpu = cpumask_next(cpu, l_cpumask);
+   if (cpu == fcpu) {
+   if (cpumask_test_and_clear_cpu(ncpu, &nest_imc_cpumask)) {
+   cpumask_set_cpu(cpu, &nest_imc_cpumask);
+   nest_change_cpu_context(ncpu, cpu);
+   }
+   }

It seems that we want to set only the smallest online cpu in the node
in the nest_imc_cpumask. So, if the newly onlined cpu is the smallest,
we replace the previous representative with cpu.


Yes. you are right. Here we are designating the smallest online cpu in 
the node in the

nest_imc_mask. The comment above is only for the 'if' code block.



So, the comment above needs to be fixed.


Will update the comment to avoid confusion. :-)


Thanks,

Anju

+
+   return 0;
+}
+
+static int ppc_nest_imc_cpu_offline(unsigned int cpu)
+{
+   int nid, target = -1;
+   struct cpumask *l_cpumask;
+
+   /*
+* Check in the designated list for this cpu. Dont bother
+* if not one of them.
+*/
+   if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask))
+   return 0;
+
+

Re: [PATCH 12/13] powerpc/perf: Thread imc cpuhotplug support

2017-03-27 Thread Anju T Sudhakar



On Thursday 23 March 2017 10:45 PM, Gautham R Shenoy wrote:

Hi Maddy, Anju,

On Thu, Mar 16, 2017 at 01:05:06PM +0530, Madhavan Srinivasan wrote:

From: Anju T Sudhakar 

This patch adds support for thread IMC on cpuhotplug.

When a cpu goes offline, the LDBAR for that cpu is disabled, and when it comes
back online the previous ldbar value is written back to the LDBAR for that cpu.

To register the hotplug functions for thread_imc, a new state
CPUHP_AP_PERF_POWERPC_THREADIMC_ONLINE is added to the list of existing
states.

Cc: Gautham R. Shenoy 
Cc: Balbir Singh 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Anton Blanchard 
Cc: Sukadev Bhattiprolu 
Cc: Michael Neuling 
Cc: Stewart Smith 
Cc: Daniel Axtens 
Cc: Stephane Eranian 
Signed-off-by: Anju T Sudhakar 
Signed-off-by: Madhavan Srinivasan 
---
  arch/powerpc/perf/imc-pmu.c | 33 -
  include/linux/cpuhotplug.h  |  1 +
  2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 6802960db51c..2ff39fe2a5ce 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -687,6 +687,16 @@ static void cleanup_all_thread_imc_memory(void)
on_each_cpu(cleanup_thread_imc_memory, NULL, 1);
  }

+static void thread_imc_update_ldbar(unsigned int cpu_id)
+{
+   u64 ldbar_addr, ldbar_value;
+
+   ldbar_addr = (u64)virt_to_phys((void *)per_cpu_add[cpu_id]);
+   ldbar_value = (ldbar_addr & (u64)THREAD_IMC_LDBAR_MASK) |
+   (u64)THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+}
+
  /*
   * Allocates a page of memory for each of the online cpus, and, writes the
   * physical base address of that page to the LDBAR for that cpu. This starts
@@ -694,20 +704,33 @@ static void cleanup_all_thread_imc_memory(void)
   */
  static void thread_imc_mem_alloc(void *dummy)
  {
-   u64 ldbar_addr, ldbar_value;
int cpu_id = smp_processor_id();

per_cpu_add[cpu_id] = (u64)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
0);
-   ldbar_addr = (u64)virt_to_phys((void *)per_cpu_add[cpu_id]);
-   ldbar_value = (ldbar_addr & (u64)THREAD_IMC_LDBAR_MASK) |
-   (u64)THREAD_IMC_ENABLE;
-   mtspr(SPRN_LDBAR, ldbar_value);
+   thread_imc_update_ldbar(cpu_id);
+}
+
+static int ppc_thread_imc_cpu_online(unsigned int cpu)
+{
+   thread_imc_update_ldbar(cpu);
+   return 0;
+
+}
+
+static int ppc_thread_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
  }

This patch looks ok to me.

So it appears that in case of a full-core deep stop entry/exit you
will need to save/restore LDBAR as well. But I will take it up for the
next set of stop cleanups.

For this patch,
Reviewed-by: Gautham R. Shenoy 


Thank you for  reviewing the patch Gautham.

-Anju




  void thread_imc_cpu_init(void)
  {
on_each_cpu(thread_imc_mem_alloc, NULL, 1);
+   cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_THREADIMC_ONLINE,
+ "POWER_THREAD_IMC_ONLINE",
+  ppc_thread_imc_cpu_online,
+  ppc_thread_imc_cpu_offline);
  }

  static void thread_imc_ldbar_disable(void *dummy)
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index abde85d9511a..724df46b2c3c 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -138,6 +138,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_ARM_L2X0_ONLINE,
CPUHP_AP_PERF_POWERPC_NEST_ONLINE,
CPUHP_AP_PERF_POWERPC_COREIMC_ONLINE,
+   CPUHP_AP_PERF_POWERPC_THREADIMC_ONLINE,
CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
--
2.7.4


--
Thanks and Regards
gautham.




Re: [PATCH v6 03/11] powerpc/powernv: Detect supported IMC units and its events

2017-04-06 Thread Anju T Sudhakar

Hi Stewart,


Thanks for the review.


On Thursday 06 April 2017 02:07 PM, Stewart Smith wrote:

Madhavan Srinivasan  writes:

--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -33,6 +33,388 @@



+static void imc_pmu_setup(struct device_node *parent)
+{
+   struct device_node *child;
+   int pmu_count = 0, rc = 0;
+   const struct property *pp;
+
+   if (!parent)
+   return;
+
+   /* Setup all the IMC pmus */
+   for_each_child_of_node(parent, child) {
+   pp = of_get_property(child, "compatible", NULL);
+   if (pp) {
+   /*
+* If there is a node with a "compatible" field,
+* that's a PMU node
+*/
+   rc = imc_pmu_create(child, pmu_count);
+   if (rc)
+   return;
+   pmu_count++;
+   }
+   }
+}

This doesn't strike me as the right kind of structure, the presence of a
compatible property really just says "hey, there's this device and it's
compatible with these ways of accessing it".

I'm guessing the idea behind having imc-nest-offset/size in a top level
node is because it's common to everything under it and the aim is to not
blow up the device tree to be enormous.

So why not go after each ibm,imc-counters-nest compatible node under the
top level ibm,opal-in-memory-counters node? (i'm not convinced that
having ibm,ibmc-counters-nest versus ibm,imc-counters-core and
ibm,imc-counters-thread as I see in the dts is correct though, as
they're all accessed exactly the same way?)



The idea here is, we have one directory which contains common events 
information for nest(same incase of core and thread), and one directory 
for each nest(/core/thread) pmu.
So while parsing we need to make sure that the node which we are parsing 
is the pmu node, not the node which contains the common event 
information. We use the "compatible" property here for that purpose. 
Because we don't have a compatible property for the node which contains 
events info.





Regards,
Anju



Re: [PATCH v6 03/11] powerpc/powernv: Detect supported IMC units and its events

2017-04-17 Thread Anju T Sudhakar

Hi Michael,


On Thursday 13 April 2017 05:13 PM, Michael Ellerman wrote:

Anju T Sudhakar  writes:

On Thursday 06 April 2017 02:07 PM, Stewart Smith wrote:

Madhavan Srinivasan  writes:

--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -33,6 +33,388 @@



+static void imc_pmu_setup(struct device_node *parent)
+{
+   struct device_node *child;
+   int pmu_count = 0, rc = 0;
+   const struct property *pp;
+
+   if (!parent)
+   return;
+
+   /* Setup all the IMC pmus */
+   for_each_child_of_node(parent, child) {
+   pp = of_get_property(child, "compatible", NULL);
+   if (pp) {
+   /*
+* If there is a node with a "compatible" field,
+* that's a PMU node
+*/
+   rc = imc_pmu_create(child, pmu_count);
+   if (rc)
+   return;
+   pmu_count++;
+   }
+   }
+}

This doesn't strike me as the right kind of structure, the presence of a
compatible property really just says "hey, there's this device and it's
compatible with these ways of accessing it".

I'm guessing the idea behind having imc-nest-offset/size in a top level
node is because it's common to everything under it and the aim is to not
blow up the device tree to be enormous.

So why not go after each ibm,imc-counters-nest compatible node under the
top level ibm,opal-in-memory-counters node? (i'm not convinced that
having ibm,ibmc-counters-nest versus ibm,imc-counters-core and
ibm,imc-counters-thread as I see in the dts is correct though, as
they're all accessed exactly the same way?)

The idea here is, we have one directory which contains common events
information for nest(same incase of core and thread), and one directory
for each nest(/core/thread) pmu.
So while parsing we need to make sure that the node which we are parsing
is the pmu node, not the node which contains the common event
information. We use the "compatible" property here for that purpose.
Because we don't have a compatible property for the node which contains
events info.

That's a really bad hack.

You can use the compatible property to detect the node you're looking
for, but you need to look at the *value* of the property and check it's
what you expect. Just checking that it's there is fragile.

cheers





ok. I will rework this code.



Thanks,
Anju



[PATCH v7 00/10] IMC Instrumentation Support

2017-04-18 Thread Anju T Sudhakar
y one 
SMT thread is executing non-idle code" ;
};
[...]
   core {
compatible = "ibm,imc-counters-core";
events-prefix = "CPM_";
unit = "";
scale = "";
reg = <0x0 0x8>;
events = < &CORE_EVENTS >;
};

thread {
compatible = "ibm,imc-counters-core";
events-prefix = "CPM_";
unit = "";
scale = "";
reg = <0x0 0x8>;
events = < &CORE_EVENTS >;
};
};

>From the device tree, the kernel parses the PMUs and their events'
information.

After parsing the IMC PMUs and their events, the PMUs and their
attributes are registered in the kernel.

This patchset (patches 9 and 10) configure the thread level IMC PMUs
to count for tasks, which give us the thread level metric values per
task.

Example Usage :
 # perf list

  [...]
  nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0/   [Kernel PMU event]
  nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0_LAST_SAMPLE/ [Kernel PMU event]
  [...]
  core_imc/CPM_NON_IDLE_INST/[Kernel PMU event]
  core_imc/CPM_NON_IDLE_PCYC/[Kernel PMU event]
  [...]
  thread_imc/CPM_NON_IDLE_INST/  [Kernel PMU event]
  thread_imc/CPM_NON_IDLE_PCYC/  [Kernel PMU event]

To see per chip data for nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0/ :
 # perf stat -e "nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0/" -a --per-socket

To see non-idle instructions for core 0 :
 # ./perf stat -e "core_imc/CPM_NON_IDLE_INST/" -C 0 -I 1000

To see non-idle instructions for a "make" :
 # ./perf stat -e "thread_imc/CPM_NON_IDLE_PCYC/" make

Comments/feedback/suggestions are welcome.


TODO:
1)Add a sysfs interface to disable the Core imc (both for ldbar and pdbar)


Changelog:

v6 -> v7:
 - Updated the commit message and code comments.
 - Changed the counter init code to disable the
   nest/core counters by default and enable only 
   when it is used.
 - Updated the pmu-setup code to register the
   PMUs which doesn't have events.
 - replaced imc_event_info_val() to imc_event_prop_update()
 - Updated the imc_pmu_setup() code, by checking for the "value"
   of compatible property instead of merely checking for compatible.
 - removed imc_get_domain().
 - init_imc_pmu() and imc_pmu_setup() are made  __init.
 - update_max_val() is invoked immediately after updating the offset value.
v5 -> v6:
 - merged few patches for the readability and code flow
 - Updated the commit message and code comments.
 - Added kdump check.
 - updated cpuhotplug code and added checks for perf migration context
 - Added READ_ONCE() when reading the counter data.
 - replaced of_property_read_u32() with of_get_address() for "reg" property read
 - replaced UNKNOWN_DOMAIN with IMC_DOMAIN_UNKNOWN
 v4 -> v5:
 - Updated opal call numbers
 - Added a patch to disable Core-IMC device using shutdown callback
 - Added patch to support cpuhotplug for thread-imc
 - Added patch to disable and enable core imc engine in cpuhot plug path
 v3 -> v4 :
 - Changed the events parser code to discover the PMU and events because
   of the changed format of the IMC DTS file (Patch 3).
 - Implemented the two TODOs to include core and thread IMC support with
   this patchset (Patches 7 through 10).
 - Changed the CPU hotplug code of Nest IMC PMUs to include a new state
   CPUHP_AP_PERF_POWERPC_NEST_ONLINE (Patch 6).
 v2 -> v3 :
 - Changed all references for IMA (In-Memory Accumulation) to IMC (In-Memory
   Collection).
 v1 -> v2 :
 - Account for the cases where a PMU can have a common scale and unit
   values for all its supported events (Patch 3/6).
 - Fixed a Build error (for maple_defconfig) by enabling imc_pmu.o
   only for CONFIG_PPC_POWERNV=y (Patch 4/6)
 - Read from the "event-name" property instead of "name" for an event
   node (Patch 3/6).

Anju T Sudhakar (6):
  powerpc/powernv: Autoload IMC device driver module
  powerpc/powernv: Detect supported IMC units and its events
  powerpc/perf: IMC pmu cpumask and cpuhotplug support
  powerpc/powernv: Thread IMC events detection
  powerpc/perf: Thread imc cpuhotplug support
  powerpc/perf: Thread IMC PMU functions

Hemant Kumar (4):
  powerpc/powernv: Data structure and macros definitions for IMC
  powerpc/perf: Add generic IMC pmu group and event functions
  powerpc/powernv: Core IMC events detection
  powerpc/perf: PMU functions for Core IMC and hotplugging


 arch/powerpc/include/asm/imc-pmu.h |  118 +++
 arch/powerpc/include/asm/opal-api.h|   21 +-
 arch/powerpc/include/asm/opal.h|   14 +
 arch/powerpc/perf/Makefile |3 +
 arch/powerpc/perf/i

[PATCH v7 01/10] powerpc/powernv: Data structure and macros definitions for IMC

2017-04-18 Thread Anju T Sudhakar
From: Hemant Kumar 

Create a new header file to add the data structures and
macros needed for In-Memory Collection (IMC) counter support.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h | 95 ++
 1 file changed, 95 insertions(+)
 create mode 100644 arch/powerpc/include/asm/imc-pmu.h

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
new file mode 100644
index 000..d0193c8
--- /dev/null
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -0,0 +1,95 @@
+#ifndef PPC_POWERNV_IMC_PMU_DEF_H
+#define PPC_POWERNV_IMC_PMU_DEF_H
+
+/*
+ * IMC Nest Performance Monitor counter support.
+ *
+ * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
+ *   (C) 2017 Anju T Sudhakar, IBM Corporation.
+ *   (C) 2017 Hemant K Shaw, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * For static allocation of some of the structures.
+ */
+#define IMC_MAX_CHIPS  32
+#define IMC_MAX_PMUS   32
+
+/*
+ * This macro is used for memory buffer allocation of
+ * event names and event string
+ */
+#define IMC_MAX_NAME_VAL_LEN   96
+
+/*
+ * Currently Microcode supports a max of 256KB of counter memory
+ * in the reserved memory region. Max pages to mmap (considering 4K PAGESIZE).
+ */
+#define IMC_NEST_MAX_PAGES 64
+
+/*
+ * Compatibility macros for IMC devices
+ */
+#define IMC_DTB_COMPAT "ibm,opal-in-memory-counters"
+#define IMC_DTB_NEST_COMPAT"ibm,imc-counters-nest"
+
+/*
+ * Structure to hold per chip specific memory address
+ * information for nest pmus. Nest Counter data are exported
+ * in per-chip reserved memory region by the PORE Engine.
+ */
+struct perchip_nest_info {
+   u32 chip_id;
+   u64 pbase;
+   u64 vbase[IMC_NEST_MAX_PAGES];
+   u64 size;
+};
+
+/*
+ * Place holder for nest pmu events and values.
+ */
+struct imc_events {
+   char *ev_name;
+   char *ev_value;
+};
+
+#define IMC_FORMAT_ATTR0
+#define IMC_CPUMASK_ATTR   1
+#define IMC_EVENT_ATTR 2
+#define IMC_NULL_ATTR  3
+
+/*
+ * Device tree parser code detects IMC pmu support and
+ * registers new IMC pmus. This structure will
+ * hold the pmu functions and attrs for each imc pmu and
+ * will be referenced at the time of pmu registration.
+ */
+struct imc_pmu {
+   struct pmu pmu;
+   int domain;
+   /*
+* Attribute groups for the PMU. Slot 0 used for
+* format attribute, slot 1 used for cpusmask attribute,
+* slot 2 used for event attribute. Slot 3 keep as
+* NULL.
+*/
+   const struct attribute_group *attr_groups[4];
+};
+
+/*
+ * Domains for IMC PMUs
+ */
+#define IMC_DOMAIN_NEST1
+#define IMC_DOMAIN_UNKNOWN -1
+
+#endif /* PPC_POWERNV_IMC_PMU_DEF_H */
-- 
2.7.4



[PATCH v7 05/10] powerpc/perf: IMC pmu cpumask and cpuhotplug support

2017-04-18 Thread Anju T Sudhakar
Adds cpumask attribute to be used by each IMC pmu. Only one cpu (any
online CPU) from each chip for nest PMUs is designated to read counters.

On CPU hotplug, dying CPU is checked to see whether it is one of the
designated cpus, if yes, next online cpu from the same chip (for nest
units) is designated as new cpu to read counters. For this purpose, we
introduce a new state : CPUHP_AP_PERF_POWERPC_NEST_ONLINE.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h |   4 +
 arch/powerpc/include/asm/opal-api.h|  13 +-
 arch/powerpc/include/asm/opal.h|  12 ++
 arch/powerpc/perf/imc-pmu.c| 250 -
 arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
 include/linux/cpuhotplug.h |   1 +
 6 files changed, 275 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 6bbe184..1478d0f 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -92,6 +92,10 @@ struct imc_pmu {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_UNKNOWN -1
 
+#define IMC_COUNTER_ENABLE 1
+#define IMC_COUNTER_DISABLE0
+
+
 extern struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
 extern struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
 extern int __init init_imc_pmu(struct imc_events *events,int idx, struct 
imc_pmu *pmu_ptr);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index a0aa285..23fc51e9 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -168,7 +168,8 @@
 #define OPAL_INT_SET_MFRR  125
 #define OPAL_PCI_TCE_KILL  126
 #define OPAL_NMMU_SET_PTCR 127
-#define OPAL_LAST  127
+#define OPAL_NEST_IMC_COUNTERS_CONTROL 149
+#define OPAL_LAST  149
 
 /* Device tree flags */
 
@@ -928,6 +929,16 @@ enum {
OPAL_PCI_TCE_KILL_ALL,
 };
 
+/* Operation argument to OPAL_NEST_IMC_COUNTERS_CONTROL */
+enum {
+   OPAL_NEST_IMC_PRODUCTION_MODE = 1,
+};
+
+enum {
+   OPAL_NEST_IMC_STOP,
+   OPAL_NEST_IMC_START,
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 1ff03a6..ffa4293 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -227,6 +227,18 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t 
kill_type,
  uint64_t dma_addr, uint32_t npages);
 int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr);
 
+/*
+ * OPAL_NEST_IMC_COUNTERS_CONTROL:
+ * mode -- Target mode for the microcode to operation, currently only
+ *"PRODUCTION_MODE" is supported, but in-plan to support other modes
+ *like "DEBUG" mode specific to nest units.
+ *
+ * value1, value2, value3 -- Based on mode parameter, input values for these
+ *  will differ. These are documented in detail in OPAL-API docs.
+ */
+int64_t opal_nest_imc_counters_control(uint64_t mode, uint64_t value1,
+   uint64_t value2, uint64_t value3);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
   int depth, void *data);
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 0dbab77..b86ef86 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -18,6 +18,11 @@
 
 struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
 struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
+static cpumask_t nest_imc_cpumask;
+
+static atomic_t nest_events;
+/* Used to avoid races in calling enable/disable nest-pmu units*/
+static DEFINE_MUTEX(imc_nest_reserve);
 
 /* Needed for sanity check */
 extern u64 nest_max_offset;
@@ -33,6 +38,161 @@ static struct attribute_group imc_format_group = {
.attrs = imc_format_attrs,
 };
 
+/* Get the cpumask printed to a buffer "buf" */
+static ssize_t imc_pmu_cpumask_get_attr(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   cpumask_t *active_mask;
+
+   active_mask = &nest_imc_cpumask;
+   return cpumap_print_to_pagebuf(true, buf, active_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, imc_pmu_cpumask_get_attr, NULL);
+
+static struct attribute *imc_pmu_cpumask_attrs[] = {
+   &dev_attr_cpumask.attr,
+   NULL,
+};
+
+static struct attribute_group imc_pmu_cpumask_attr_group = {
+   .attrs = imc_pmu_cpumask_attrs,
+};
+
+/*
+ * nest_init : Initializes the nest imc engine for the current chip.
+ * by default the nest engine is disabled.
+ */
+

[PATCH v7 8/10] powerpc/powernv: Thread IMC events detection

2017-04-18 Thread Anju T Sudhakar
Patch adds support for detection of thread IMC events. It adds a new
domain IMC_DOMAIN_THREAD and it is determined with the help of the
compatibility string "ibm,imc-counters-thread" based on the IMC device
tree.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h|  2 ++
 arch/powerpc/perf/imc-pmu.c   |  1 +
 arch/powerpc/platforms/powernv/opal-imc.c | 18 +-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index bf5fb7c..6260e61 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -49,6 +49,7 @@
 #define IMC_DTB_COMPAT "ibm,opal-in-memory-counters"
 #define IMC_DTB_NEST_COMPAT"ibm,imc-counters-nest"
 #define IMC_DTB_CORE_COMPAT"ibm,imc-counters-core"
+#define IMC_DTB_THREAD_COMPAT  "ibm,imc-counters-thread"
 
 /*
  * Structure to hold per chip specific memory address
@@ -98,6 +99,7 @@ struct imc_pmu {
  */
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
+#define IMC_DOMAIN_THREAD  3
 #define IMC_DOMAIN_UNKNOWN -1
 
 #define IMC_COUNTER_ENABLE 1
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index aa9e8ba..ac69d81 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -41,6 +41,7 @@ struct imc_pmu *core_imc_pmu;
 /* Needed for sanity check */
 extern u64 nest_max_offset;
 extern u64 core_max_offset;
+extern u64 thread_max_offset;
 
 PMU_FORMAT_ATTR(event, "config:0-20");
 static struct attribute *imc_format_attrs[] = {
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 23507d7..940f6b9 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -35,6 +35,7 @@
 
 u64 nest_max_offset;
 u64 core_max_offset;
+u64 thread_max_offset;
 
 static int imc_event_prop_update(char *name, struct imc_events *events)
 {
@@ -119,6 +120,10 @@ static void update_max_value(u32 value, int pmu_domain)
if (core_max_offset < value)
core_max_offset = value;
break;
+   case IMC_DOMAIN_THREAD:
+   if (thread_max_offset < value)
+   thread_max_offset = value;
+   break;
default:
/* Unknown domain, return */
return;
@@ -362,7 +367,7 @@ static struct imc_events *imc_events_setup(struct 
device_node *parent,
 /*
  * imc_pmu_create : Takes the parent device which is the pmu unit and a
  *  pmu_index as the inputs.
- * Allocates memory for the pmu, sets up its domain (NEST/CORE), and
+ * Allocates memory for the pmu, sets up its domain (NEST/CORE/THREAD), and
  * calls imc_events_setup() to allocate memory for the events supported
  * by this pmu. Assigns a name for the pmu. Calls imc_events_node_parser()
  * to setup the individual events.
@@ -483,6 +488,17 @@ static void __init imc_pmu_setup(struct device_node 
*parent)
return;
pmu_count++;
}
+   /*
+* Loop through the imc-counters tree for each compatible
+* "ibm,imc-counters-thread", and update "struct imc_pmu".
+*/
+   for_each_compatible_node(child, NULL, IMC_DTB_THREAD_COMPAT) {
+   domain = IMC_DOMAIN_THREAD;
+   rc = imc_pmu_create(child, pmu_count, domain);
+   if (rc)
+   return;
+   pmu_count++;
+   }
 }
 
 static int opal_imc_counters_probe(struct platform_device *pdev)
-- 
2.7.4



[PATCH v7 9/10] powerpc/perf: Thread IMC PMU functions

2017-04-18 Thread Anju T Sudhakar
This patch adds the PMU functions required for event initialization,
read, update, add, del etc. for thread IMC PMU. Thread IMC PMUs are used
for per-task monitoring. 

For each CPU, a page of memory is allocated and is kept static i.e.,
these pages will exist till the machine shuts down. The base address of
this page is assigned to the ldbar of that cpu. As soon as we do that,
the thread IMC counters start running for that cpu and the data of these
counters are assigned to the page allocated. But we use this for
per-task monitoring. Whenever we start monitoring a task, the event is
added onto the task. At that point, we read the initial value of the
event. Whenever, we stop monitoring the task, the final value is taken
and the difference is the event data.

Now, a task can move to a different cpu. Suppose a task X is moving from
cpu A to cpu B. When the task is scheduled out of A, we get an
event_del for A, and hence, the event data is updated. And, we stop
updating the X's event data. As soon as X moves on to B, event_add is
called for B, and we again update the event_data. And this is how it
keeps on updating the event data even when the task is scheduled on to
different cpus.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h|   5 +
 arch/powerpc/perf/imc-pmu.c   | 201 ++
 arch/powerpc/platforms/powernv/opal-imc.c |   3 +
 3 files changed, 209 insertions(+)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 6260e61..cc04712 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -42,6 +42,7 @@
  * IMC Core engine expects 8K bytes of memory for counter collection.
  */
 #define IMC_CORE_COUNTER_MEM   8192
+#define IMC_THREAD_COUNTER_MEM 8192
 
 /*
  *Compatbility macros for IMC devices
@@ -51,6 +52,9 @@
 #define IMC_DTB_CORE_COMPAT"ibm,imc-counters-core"
 #define IMC_DTB_THREAD_COMPAT  "ibm,imc-counters-thread"
 
+#define THREAD_IMC_LDBAR_MASK   0x0003e000
+#define THREAD_IMC_ENABLE   0x8000
+
 /*
  * Structure to hold per chip specific memory address
  * information for nest pmus. Nest Counter data are exported
@@ -110,4 +114,5 @@ extern struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
 extern struct imc_pmu *core_imc_pmu;
 extern int __init init_imc_pmu(struct imc_events *events,int idx, struct 
imc_pmu *pmu_ptr);
 void core_imc_disable(void);
+void thread_imc_disable(void);
 #endif /* PPC_POWERNV_IMC_PMU_DEF_H */
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index ac69d81..b055748 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -38,6 +38,9 @@ static u64 per_core_pdbar_add[IMC_MAX_CHIPS][IMC_MAX_CORES];
 static cpumask_t core_imc_cpumask;
 struct imc_pmu *core_imc_pmu;
 
+/* Maintains base address for all the cpus */
+static u64 per_cpu_add[NR_CPUS];
+
 /* Needed for sanity check */
 extern u64 nest_max_offset;
 extern u64 core_max_offset;
@@ -482,6 +485,56 @@ static int core_imc_event_init(struct perf_event *event)
return 0;
 }
 
+static int thread_imc_event_init(struct perf_event *event)
+{
+   struct task_struct *target;
+
+   if (event->attr.type != event->pmu->type)
+   return -ENOENT;
+
+   /* Sampling not supported */
+   if (event->hw.sample_period)
+   return -EINVAL;
+
+   event->hw.idx = -1;
+
+   /* Sanity check for config (event offset) */
+   if (event->attr.config > thread_max_offset)
+   return -EINVAL;
+
+   target = event->hw.target;
+
+   if (!target)
+   return -EINVAL;
+
+   event->pmu->task_ctx_nr = perf_sw_context;
+   return 0;
+}
+
+static void thread_imc_read_counter(struct perf_event *event)
+{
+   u64 *addr, data;
+   int cpu_id = smp_processor_id();
+
+   addr = (u64 *)(per_cpu_add[cpu_id] + event->attr.config);
+   data = __be64_to_cpu(READ_ONCE(*addr));
+   local64_set(&event->hw.prev_count, data);
+}
+
+static void thread_imc_perf_event_update(struct perf_event *event)
+{
+   u64 counter_prev, counter_new, final_count, *addr;
+   int cpu_id = smp_processor_id();
+
+   addr = (u64 *)(per_cpu_add[cpu_id] + event->attr.config);
+   counter_prev = local64_read(&event->hw.prev_count);
+   counter_new = __be64_to_cpu(READ_ONCE(*addr));
+   final_count = counter_new - counter_prev;
+
+   local64_set(&event->hw.prev_count, counter_new);
+   local64_add(final_count, &event->count);
+}
+
 static void imc_read_counter(struct perf_event *event)
 {
u64 *addr, data;
@@ -723,6 +776,84 @@ static int core_imc_event_add(struct perf_event *event, 
int flags)
 }
 
 
+static void thread_imc_event_start

[PATCH v7 03/10] powerpc/powernv: Detect supported IMC units and its events

2017-04-18 Thread Anju T Sudhakar
Parse device tree to detect IMC units. Traverse through each IMC unit
node to find supported events and corresponding unit/scale files (if any).

Here is the DTS file for reference:


https://github.com/open-power/ima-catalog/blob/master/81E00612.4E0100.dts

The device tree for IMC counters starts at the node "imc-counters".
This node contains all the IMC PMU nodes and event nodes
for these IMC PMUs. The PMU nodes have an "events" property which has a
phandle value for the actual events node. The events are separated from
the PMU nodes to abstract out the common events. For example, PMU node
"mcs0", "mcs1" etc. will contain a pointer to "nest-mcs-events" since,
the events are common between these PMUs. These events have a different
prefix based on their relation to different PMUs, and hence, the PMU
nodes themselves contain an "events-prefix" property. The value for this
property concatenated to the event name, forms the actual event
name. Also, the PMU nodes have a "reg" field as the base offset for the events
which belong to this PMU. This "reg" field is added to event's "reg" field 
in the "events" node, which gives us the location of the counter data. Kernel
code uses this offset as event configuration value.

Device tree parser code also looks for scale/unit property in the event
node and passes on the value as an event attr for perf interface to use
in the post processing by the perf tool. Some PMUs may have common scale
and unit properties which implies that all events supported by this PMU
inherit the scale and unit properties of the PMU itself. For those
events, we need to set the common unit and scale values.

For failure to initialize any unit or any event, disable that unit and
continue setting up the rest of them.

Signed-off-by: Hemant Kumar 
Signed-off-by: Anju T Sudhakar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 413 ++
 1 file changed, 413 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 3a87000..0ddaf7d 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -33,15 +33,428 @@
 #include 
 #include 
 
+u64 nest_max_offset;
 struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
+struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
+
+static int imc_event_prop_update(char *name, struct imc_events *events)
+{
+   char *buf;
+
+   if (!events || !name)
+   return -EINVAL;
+
+   /* memory for content */
+   buf = kzalloc(IMC_MAX_NAME_VAL_LEN, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   events->ev_name = name;
+   events->ev_value = buf;
+   return 0;
+}
+
+static int imc_event_prop_str(struct property *pp, char *name,
+ struct imc_events *events)
+{
+   int ret;
+
+   ret = imc_event_prop_update(name, events);
+   if (ret)
+   return ret;
+
+   if (!pp->value || (strnlen(pp->value, pp->length) == pp->length) ||
+  (pp->length > IMC_MAX_NAME_VAL_LEN))
+   return -EINVAL;
+   strncpy(events->ev_value, (const char *)pp->value, pp->length);
+
+   return 0;
+}
+
+static int imc_event_prop_val(char *name, u32 val,
+ struct imc_events *events)
+{
+   int ret;
+
+   ret = imc_event_prop_update(name, events);
+   if (ret)
+   return ret;
+   snprintf(events->ev_value, IMC_MAX_NAME_VAL_LEN, "event=0x%x", val);
+
+   return 0;
+}
+
+static int set_event_property(struct property *pp, char *event_prop,
+ struct imc_events *events, char *ev_name)
+{
+   char *buf;
+   int ret;
+
+   buf = kzalloc(IMC_MAX_NAME_VAL_LEN, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   sprintf(buf, "%s.%s", ev_name, event_prop);
+   ret = imc_event_prop_str(pp, buf, events);
+   if (ret) {
+   if (events->ev_name)
+   kfree(events->ev_name);
+   if (events->ev_value)
+   kfree(events->ev_value);
+   }
+   return ret;
+}
+
+/*
+ * Updates the maximum offset for an event in the pmu with domain
+ * "pmu_domain".
+ */
+static void update_max_value(u32 value, int pmu_domain)
+{
+   switch (pmu_domain) {
+   case IMC_DOMAIN_NEST:
+   if (nest_max_offset < value)
+   nest_max_offset = value;
+   break;
+   default:
+   /* Unknown domain, return */
+   return;
+   }
+}
+
+/*
+ * imc_events_node_parser: Parse the event node "dev" and assign the parsed
+ * information to event "e

[PATCH v7 02/10] powerpc/powernv: Autoload IMC device driver module

2017-04-18 Thread Anju T Sudhakar
This patch does three things:
 - Enables "opal.c" to create a platform device for the IMC interface
   according to the appropriate compatibility string.
 - Find the reserved-memory region details from the system device tree
   and get the base address of HOMER (Reserved memory) region address for each 
chip.
 - We also get the Nest PMU counter data offsets (in the HOMER region)
   and their sizes. The offsets for the counters' data are fixed and
   won't change from chip to chip.

The device tree parsing logic is separated from the PMU creation
functions (which is done in subsequent patches).

Patch also adds a CONFIG_HV_PERF_IMC_CTRS for the IMC driver.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/platforms/powernv/Kconfig|  10 +++
 arch/powerpc/platforms/powernv/Makefile   |   1 +
 arch/powerpc/platforms/powernv/opal-imc.c | 140 ++
 arch/powerpc/platforms/powernv/opal.c |  18 
 4 files changed, 169 insertions(+)
 create mode 100644 arch/powerpc/platforms/powernv/opal-imc.c

diff --git a/arch/powerpc/platforms/powernv/Kconfig 
b/arch/powerpc/platforms/powernv/Kconfig
index 3a07e4d..1b90a98 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -27,3 +27,13 @@ config OPAL_PRD
help
  This enables the opal-prd driver, a facility to run processor
  recovery diagnostics on OpenPower machines
+
+config HV_PERF_IMC_CTRS
+   bool "Hypervisor supplied In Memory Collection PMU events (Nest & Core)"
+   default y
+   depends on PERF_EVENTS && PPC_POWERNV
+   help
+ Enable access to hypervisor supplied in-memory collection counters
+ in perf. IMC counters are available from Power9 systems.
+
+  If unsure, select Y.
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index b5d98cb..715e531 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -12,3 +12,4 @@ obj-$(CONFIG_PPC_SCOM)+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)   += opal-memory-errors.o
 obj-$(CONFIG_TRACEPOINTS)  += opal-tracepoints.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
+obj-$(CONFIG_HV_PERF_IMC_CTRS) += opal-imc.o
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
new file mode 100644
index 000..3a87000
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -0,0 +1,140 @@
+/*
+ * OPAL IMC interface detection driver
+ * Supported on POWERNV platform
+ *
+ * Copyright   (C) 2017 Madhavan Srinivasan, IBM Corporation.
+ * (C) 2017 Anju T Sudhakar, IBM Corporation.
+ * (C) 2017 Hemant K Shaw, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
+
+/*
+ * imc_pmu_setup : Setup the IMC PMUs (children of "parent").
+ */
+static void __init imc_pmu_setup(struct device_node *parent)
+{
+   if (!parent)
+   return;
+}
+
+static int opal_imc_counters_probe(struct platform_device *pdev)
+{
+   struct device_node *imc_dev, *dn, *rm_node = NULL;
+   struct perchip_nest_info *pcni;
+   u32 pages, nest_offset, nest_size, chip_id;
+   int i = 0;
+   const __be32 *addrp;
+   u64 reg_addr, reg_size;
+
+   if (!pdev || !pdev->dev.of_node)
+   return -ENODEV;
+
+   /*
+* Check whether this is kdump kernel. If yes, just return.
+*/
+   if (is_kdump_kernel())
+   return -ENODEV;
+
+   imc_dev = pdev->dev.of_node;
+
+   /*
+* Nest counter data are saved in a reserved memory called HOMER.
+* "imc-nest-offset" identifies the counter data location within HOMER.
+* size : size of the entire nest-counters region
+*/
+   if (of_property_read_u32(imc_dev, "imc-nest-offset", &nest_offset))
+   goto err;
+
+   if (of_property_read_u32(imc_dev, "imc-nest-size", &nest_size))
+   goto err;
+
+   /* Sanity check */
+   if ((nest_size/PAGE_SIZE) > IMC_NEST_MAX_PAGES)
+   goto err;
+
+   /* Find the "HOMER region" for each chip 

[PATCH v7 06/10] powerpc/powernv: Core IMC events detection

2017-04-18 Thread Anju T Sudhakar
From: Hemant Kumar 

This patch adds support for detection of core IMC events along with the
Nest IMC events. It adds a new domain IMC_DOMAIN_CORE and it is determined
with the help of the compatibility string "ibm,imc-counters-core" based
on the IMC device tree.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h|  4 +++-
 arch/powerpc/perf/imc-pmu.c   |  3 +++
 arch/powerpc/platforms/powernv/opal-imc.c | 28 +---
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 1478d0f..37fdd79 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -42,6 +42,7 @@
  */
 #define IMC_DTB_COMPAT "ibm,opal-in-memory-counters"
 #define IMC_DTB_NEST_COMPAT"ibm,imc-counters-nest"
+#define IMC_DTB_CORE_COMPAT"ibm,imc-counters-core"
 
 /*
  * Structure to hold per chip specific memory address
@@ -90,13 +91,14 @@ struct imc_pmu {
  * Domains for IMC PMUs
  */
 #define IMC_DOMAIN_NEST1
+#define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_UNKNOWN -1
 
 #define IMC_COUNTER_ENABLE 1
 #define IMC_COUNTER_DISABLE0
 
-
 extern struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
 extern struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
+extern struct imc_pmu *core_imc_pmu;
 extern int __init init_imc_pmu(struct imc_events *events,int idx, struct 
imc_pmu *pmu_ptr);
 #endif /* PPC_POWERNV_IMC_PMU_DEF_H */
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index b86ef86..6fdac40 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -24,8 +24,11 @@ static atomic_t nest_events;
 /* Used to avoid races in calling enable/disable nest-pmu units*/
 static DEFINE_MUTEX(imc_nest_reserve);
 
+struct imc_pmu *core_imc_pmu;
+
 /* Needed for sanity check */
 extern u64 nest_max_offset;
+extern u64 core_max_offset;
 
 PMU_FORMAT_ATTR(event, "config:0-20");
 static struct attribute *imc_format_attrs[] = {
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 61f6d67..d712ef3 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -34,6 +34,7 @@
 #include 
 
 u64 nest_max_offset;
+u64 core_max_offset;
 
 static int imc_event_prop_update(char *name, struct imc_events *events)
 {
@@ -114,6 +115,10 @@ static void update_max_value(u32 value, int pmu_domain)
if (nest_max_offset < value)
nest_max_offset = value;
break;
+   case IMC_DOMAIN_CORE:
+   if (core_max_offset < value)
+   core_max_offset = value;
+   break;
default:
/* Unknown domain, return */
return;
@@ -357,7 +362,7 @@ static struct imc_events *imc_events_setup(struct 
device_node *parent,
 /*
  * imc_pmu_create : Takes the parent device which is the pmu unit and a
  *  pmu_index as the inputs.
- * Allocates memory for the pmu, sets up its domain (NEST), and
+ * Allocates memory for the pmu, sets up its domain (NEST/CORE), and
  * calls imc_events_setup() to allocate memory for the events supported
  * by this pmu. Assigns a name for the pmu. Calls imc_events_node_parser()
  * to setup the individual events.
@@ -386,7 +391,10 @@ static int imc_pmu_create(struct device_node *parent, int 
pmu_index, int domain)
goto free_pmu;
 
/* Needed for hotplug/migration */
-   per_nest_pmu_arr[pmu_index] = pmu_ptr;
+   if (pmu_ptr->domain == IMC_DOMAIN_CORE)
+   core_imc_pmu = pmu_ptr;
+   else if (pmu_ptr->domain == IMC_DOMAIN_NEST)
+   per_nest_pmu_arr[pmu_index] = pmu_ptr;
 
pp = of_find_property(parent, "name", NULL);
if (!pp) {
@@ -407,7 +415,10 @@ static int imc_pmu_create(struct device_node *parent, int 
pmu_index, int domain)
goto free_pmu;
}
/* Save the name to register it later */
-   sprintf(buf, "nest_%s", (char *)pp->value);
+   if (pmu_ptr->domain == IMC_DOMAIN_NEST)
+   sprintf(buf, "nest_%s", (char *)pp->value);
+   else
+   sprintf(buf, "%s_imc", (char *)pp->value);
pmu_ptr->pmu.name = (char *)buf;
 
/*
@@ -461,6 +472,17 @@ static void __init imc_pmu_setup(struct device_node 
*parent)
return;
pmu_count++;
}
+   /*
+* Loop through the imc-counters tree for each compatible
+* "ibm,imc-counters-core", and update "struct imc_pmu".
+*/
+   for_each_compatible_node(child, NULL, IMC_DTB_CORE_

[PATCH v7 10/10] powerpc/perf: Thread imc cpuhotplug support

2017-04-18 Thread Anju T Sudhakar
This patch adds support for thread IMC on cpuhotplug.

When a cpu goes offline, the LDBAR for that cpu is disabled, and when it comes
back online the previous ldbar value is written back to the LDBAR for that cpu.

To register the hotplug functions for thread_imc, a new state
CPUHP_AP_PERF_POWERPC_THREADIMC_ONLINE is added to the list of existing
states.

Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Anju T Sudhakar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 32 +++-
 include/linux/cpuhotplug.h  |  1 +
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 6c7d3ed..daf9151 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -984,6 +984,16 @@ static void cleanup_all_thread_imc_memory(void)
on_each_cpu(cleanup_thread_imc_memory, NULL, 1);
 }
 
+static void thread_imc_update_ldbar(unsigned int cpu_id)
+{
+   u64 ldbar_addr, ldbar_value;
+
+   ldbar_addr = (u64)virt_to_phys((void *)per_cpu_add[cpu_id]);
+   ldbar_value = (ldbar_addr & (u64)THREAD_IMC_LDBAR_MASK) |
+   (u64)THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+}
+
 /*
  * Allocates a page of memory for each of the online cpus, and, writes the
  * physical base address of that page to the LDBAR for that cpu. This starts
@@ -991,21 +1001,33 @@ static void cleanup_all_thread_imc_memory(void)
  */
 static void thread_imc_mem_alloc(void *dummy)
 {
-   u64 ldbar_addr, ldbar_value;
int cpu_id = smp_processor_id();
int phys_id = topology_physical_package_id(smp_processor_id());
 
per_cpu_add[cpu_id] = (u64)alloc_pages_exact_nid(phys_id,
(size_t)IMC_THREAD_COUNTER_MEM, GFP_KERNEL | 
__GFP_ZERO);
-   ldbar_addr = (u64)virt_to_phys((void *)per_cpu_add[cpu_id]);
-   ldbar_value = (ldbar_addr & (u64)THREAD_IMC_LDBAR_MASK) |
-   (u64)THREAD_IMC_ENABLE;
-   mtspr(SPRN_LDBAR, ldbar_value);
+   thread_imc_update_ldbar(cpu_id);
+}
+
+static int ppc_thread_imc_cpu_online(unsigned int cpu)
+{
+   thread_imc_update_ldbar(cpu);
+   return 0;
 }
 
+static int ppc_thread_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+ }
+
 void thread_imc_cpu_init(void)
 {
on_each_cpu(thread_imc_mem_alloc, NULL, 1);
+   cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_THREADIMC_ONLINE,
+   "POWER_THREAD_IMC_ONLINE",
+   ppc_thread_imc_cpu_online,
+   ppc_thread_imc_cpu_offline);
 }
 
 /*
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index e7b7712..bbec927 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -139,6 +139,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
CPUHP_AP_PERF_POWERPC_NEST_ONLINE,
CPUHP_AP_PERF_POWERPC_COREIMC_ONLINE,
+   CPUHP_AP_PERF_POWERPC_THREADIMC_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
CPUHP_AP_ONLINE_DYN,
-- 
2.7.4



[PATCH v7 07/10] powerpc/perf: PMU functions for Core IMC and hotplugging

2017-04-18 Thread Anju T Sudhakar
From: Hemant Kumar 

This patch adds the PMU function to initialize a core IMC event. It also
adds cpumask initialization function for core IMC PMU. For
initialization, a 8KB of memory is allocated per core where the data
for core IMC counters will be accumulated. The base address for this
page is sent to OPAL via an OPAL call which initializes various SCOMs
related to Core IMC initialization. Upon any errors, the pages are
free'ed and core IMC counters are disabled using the same OPAL call.

For CPU hotplugging, a cpumask is initialized which contains an online
CPU from each core. If a cpu goes offline, we check whether that cpu
belongs to the core imc cpumask, if yes, then, we migrate the PMU
context to any other online cpu (if available) in that core. If a cpu
comes back online, then this cpu will be added to the core imc cpumask
only if there was no other cpu from that core in the previous cpumask.

To register the hotplug functions for core_imc, a new state
CPUHP_AP_PERF_POWERPC_COREIMC_ONLINE is added to the list of existing
states.

Patch also adds OPAL device shutdown callback. Needed to disable the
IMC core engine to handle kexec.

Signed-off-by: Hemant Kumar 
Signed-off-by: Anju T Sudhakar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h |   7 +
 arch/powerpc/include/asm/opal-api.h|  10 +-
 arch/powerpc/include/asm/opal.h|   2 +
 arch/powerpc/perf/imc-pmu.c| 381 -
 arch/powerpc/platforms/powernv/opal-imc.c  |   7 +
 arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
 include/linux/cpuhotplug.h |   1 +
 7 files changed, 397 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 37fdd79..bf5fb7c 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -24,6 +24,7 @@
  */
 #define IMC_MAX_CHIPS  32
 #define IMC_MAX_PMUS   32
+#define IMC_MAX_CORES  32
 
 /*
  * This macro is used for memory buffer allocation of
@@ -38,6 +39,11 @@
 #define IMC_NEST_MAX_PAGES 64
 
 /*
+ * IMC Core engine expects 8K bytes of memory for counter collection.
+ */
+#define IMC_CORE_COUNTER_MEM   8192
+
+/*
  *Compatbility macros for IMC devices
  */
 #define IMC_DTB_COMPAT "ibm,opal-in-memory-counters"
@@ -101,4 +107,5 @@ extern struct perchip_nest_info 
nest_perchip_info[IMC_MAX_CHIPS];
 extern struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
 extern struct imc_pmu *core_imc_pmu;
 extern int __init init_imc_pmu(struct imc_events *events,int idx, struct 
imc_pmu *pmu_ptr);
+void core_imc_disable(void);
 #endif /* PPC_POWERNV_IMC_PMU_DEF_H */
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 23fc51e9..971918d 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -169,7 +169,8 @@
 #define OPAL_PCI_TCE_KILL  126
 #define OPAL_NMMU_SET_PTCR 127
 #define OPAL_NEST_IMC_COUNTERS_CONTROL 149
-#define OPAL_LAST  149
+#define OPAL_CORE_IMC_COUNTERS_CONTROL 150
+#define OPAL_LAST  150
 
 /* Device tree flags */
 
@@ -939,6 +940,13 @@ enum {
OPAL_NEST_IMC_START,
 };
 
+/* Operation argument to Core IMC */
+enum {
+   OPAL_CORE_IMC_DISABLE,
+   OPAL_CORE_IMC_ENABLE,
+   OPAL_CORE_IMC_INIT,
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ffa4293..6364458 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -238,6 +238,8 @@ int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr);
  */
 int64_t opal_nest_imc_counters_control(uint64_t mode, uint64_t value1,
uint64_t value2, uint64_t value3);
+int64_t opal_core_imc_counters_control(uint64_t operation, uint64_t addr,
+   uint64_t value2, uint64_t value3);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 6fdac40..e98a715 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1,5 +1,5 @@
 /*
- * Nest Performance Monitor counter support.
+ * IMC Performance Monitor counter support.
  *
  * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
  *   (C) 2017 Anju T Sudhakar, IBM Corporation.
@@ -21,9 +21,21 @@ struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
 static cpumask_t nest_imc_cpumask;
 
 static atomic_t nest_events;
+static atomic_t core_events;
 /* Used to avoid races in calling enable/disable nest-pmu units*/
 static DEFINE_MUTEX(imc_nest_reserve);
+/* Used to avoid races in calling

[PATCH v7 4/10] powerpc/perf: Add generic IMC pmu groupand event functions

2017-04-18 Thread Anju T Sudhakar
From: Hemant Kumar 

Device tree IMC driver code parses the IMC units and their events. It
passes the information to IMC pmu code which is placed in powerpc/perf
as "imc-pmu.c".

Patch adds a set of generic imc pmu related event functions to be
used  by each imc pmu unit. Add code to setup format attribute and to
register imc pmus. Add a event_init function for nest_imc events.

Since, the IMC counters' data are periodically fed to a memory location,
the functions to read/update, start/stop, add/del can be generic and can
be used by all IMC PMU units.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h|   3 +
 arch/powerpc/perf/Makefile|   3 +
 arch/powerpc/perf/imc-pmu.c   | 269 ++
 arch/powerpc/platforms/powernv/opal-imc.c |  10 +-
 4 files changed, 283 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/perf/imc-pmu.c

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index d0193c8..6bbe184 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -92,4 +92,7 @@ struct imc_pmu {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_UNKNOWN -1
 
+extern struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
+extern struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
+extern int __init init_imc_pmu(struct imc_events *events,int idx, struct 
imc_pmu *pmu_ptr);
 #endif /* PPC_POWERNV_IMC_PMU_DEF_H */
diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index 4d606b9..b29d918 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -6,6 +6,9 @@ obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o
 obj64-$(CONFIG_PPC_PERF_CTRS)  += power4-pmu.o ppc970-pmu.o power5-pmu.o \
   power5+-pmu.o power6-pmu.o power7-pmu.o \
   isa207-common.o power8-pmu.o power9-pmu.o
+
+obj-$(CONFIG_HV_PERF_IMC_CTRS) += imc-pmu.o
+
 obj32-$(CONFIG_PPC_PERF_CTRS)  += mpc7450-pmu.o
 
 obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
new file mode 100644
index 000..f09a37a
--- /dev/null
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -0,0 +1,269 @@
+/*
+ * Nest Performance Monitor counter support.
+ *
+ * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
+ *   (C) 2017 Anju T Sudhakar, IBM Corporation.
+ *   (C) 2017 Hemant K Shaw, IBM Corporation.
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
+struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
+
+/* Needed for sanity check */
+extern u64 nest_max_offset;
+
+PMU_FORMAT_ATTR(event, "config:0-20");
+static struct attribute *imc_format_attrs[] = {
+   &format_attr_event.attr,
+   NULL,
+};
+
+static struct attribute_group imc_format_group = {
+   .name = "format",
+   .attrs = imc_format_attrs,
+};
+
+static int nest_imc_event_init(struct perf_event *event)
+{
+   int chip_id;
+   u32 config = event->attr.config;
+   struct perchip_nest_info *pcni;
+
+   if (event->attr.type != event->pmu->type)
+   return -ENOENT;
+
+   /* Sampling not supported */
+   if (event->hw.sample_period)
+   return -EINVAL;
+
+   /* unsupported modes and filters */
+   if (event->attr.exclude_user   ||
+   event->attr.exclude_kernel ||
+   event->attr.exclude_hv ||
+   event->attr.exclude_idle   ||
+   event->attr.exclude_host   ||
+   event->attr.exclude_guest)
+   return -EINVAL;
+
+   if (event->cpu < 0)
+   return -EINVAL;
+
+   /* Sanity check for config (event offset) */
+   if (config > nest_max_offset)
+   return -EINVAL;
+
+   chip_id = topology_physical_package_id(event->cpu);
+   pcni = &nest_perchip_info[chip_id];
+
+   /*
+* Memory for Nest HW counter data could be in multiple pages.
+* Hence check and pick the right event base page for chip with
+* "chip_id" and add "config" to it".
+*/
+   event->hw.event_base = pcni->vbase[config/PAGE_SIZE] +
+   (config & ~PAGE_MASK);
+
+   return 0;
+}
+
+static void imc_read_counter(struct perf_event *event)
+{
+   u64 *addr, data;
+
+   /*
+* In-Memory Collection (IMC) counters are free flowing counters.
+* So we take a snapshot of the counter

[PATCH v8 00/10] IMC Instrumentation Support

2017-05-04 Thread Anju T Sudhakar
g non-idle code" ;
};
[...]
   core {
compatible = "ibm,imc-counters-core";
events-prefix = "CPM_";
unit = "";
scale = "";
reg = <0x0 0x8>;
events = < &CORE_EVENTS >;
};

thread {
compatible = "ibm,imc-counters-core";
events-prefix = "CPM_";
unit = "";
scale = "";
reg = <0x0 0x8>;
events = < &CORE_EVENTS >;
};
};

From the device tree, the kernel parses the PMUs and their events'
information.

After parsing the IMC PMUs and their events, the PMUs and their
attributes are registered in the kernel.

This patchset (patches 9 and 10) configure the thread level IMC PMUs
to count for tasks, which give us the thread level metric values per
task.

Example Usage :
 # perf list

  [...]
  nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0/   [Kernel PMU event]
  nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0_LAST_SAMPLE/ [Kernel PMU event]
  [...]
  core_imc/CPM_NON_IDLE_INST/[Kernel PMU event]
  core_imc/CPM_NON_IDLE_PCYC/[Kernel PMU event]
  [...]
  thread_imc/CPM_NON_IDLE_INST/  [Kernel PMU event]
  thread_imc/CPM_NON_IDLE_PCYC/  [Kernel PMU event]

To see per chip data for nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0/ :
 # perf stat -e "nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0/" -a --per-socket

To see non-idle instructions for core 0 :
 # ./perf stat -e "core_imc/CPM_NON_IDLE_INST/" -C 0 -I 1000

To see non-idle instructions for a "make" :
 # ./perf stat -e "thread_imc/CPM_NON_IDLE_PCYC/" make

Comments/feedback/suggestions are welcome.


TODO:
1)Add a sysfs interface to disable the Core imc (both for ldbar and pdbar)


Changelog:

v7 -> v8:
 - opal-call API for nest and core is changed.
   OPAL_NEST_IMC_COUNTERS_CONTROL and
   OPAL_CORE_IMC_COUNTERS_CONTROL  is replaced with
   OPAL_IMC_COUNTERS_INIT, OPAL_IMC_COUNTERS_START and
   OPAL_IMC_COUNTERS_STOP.
 - thread_imc doesn't have CPUMASK_ATTR, hence added a
   fix in patch 09/10, which will swap the IMC_EVENT_ATTR
   slot with IMC_CPUMASK_ATTR.

v6 -> v7:
 - Updated the commit message and code comments.
 - Changed the counter init code to disable the
   nest/core counters by default and enable only 
   when it is used.
 - Updated the pmu-setup code to register the
   PMUs which doesn't have events.
 - replaced imc_event_info_val() to imc_event_prop_update()
 - Updated the imc_pmu_setup() code, by checking for the "value"
   of compatible property instead of merely checking for compatible.
 - removed imc_get_domain().
 - init_imc_pmu() and imc_pmu_setup() are made  __init.
 - update_max_val() is invoked immediately after updating the offset value.
v5 -> v6:
 - merged few patches for the readability and code flow
 - Updated the commit message and code comments.
 - updated cpuhotplug code and added checks for perf migration context
 - Added READ_ONCE() when reading the counter data.
 - replaced of_property_read_u32() with of_get_address() for "reg" property read
 - replaced UNKNOWN_DOMAIN with IMC_DOMAIN_UNKNOWN
 v4 -> v5:
 - Updated opal call numbers
 - Added a patch to disable Core-IMC device using shutdown callback
 - Added patch to support cpuhotplug for thread-imc
 - Added patch to disable and enable core imc engine in cpuhot plug path
 v3 -> v4 :
 - Changed the events parser code to discover the PMU and events because
   of the changed format of the IMC DTS file (Patch 3).
 - Implemented the two TODOs to include core and thread IMC support with
   this patchset (Patches 7 through 10).
 - Changed the CPU hotplug code of Nest IMC PMUs to include a new state
   CPUHP_AP_PERF_POWERPC_NEST_ONLINE (Patch 6).
 v2 -> v3 :
 - Changed all references for IMA (In-Memory Accumulation) to IMC (In-Memory
   Collection).
 v1 -> v2 :
 - Account for the cases where a PMU can have a common scale and unit
   values for all its supported events (Patch 3/6).
 - Fixed a Build error (for maple_defconfig) by enabling imc_pmu.o
   only for CONFIG_PPC_POWERNV=y (Patch 4/6)
 - Read from the "event-name" property instead of "name" for an event
   node (Patch 3/6).

Anju T Sudhakar (6):
  powerpc/powernv: Autoload IMC device driver module
  powerpc/powernv: Detect supported IMC units and its events
  powerpc/perf: IMC pmu cpumask and cpuhotplug support
  powerpc/powernv: Thread IMC events detection
  powerpc/perf: Thread IMC PMU functions
  powerpc/perf: Thread imc cpuhotplug support

Hemant Kumar (4):
  powerpc/powernv: Data structure and macros definitions for IMC
  powerpc/perf: Add generic IMC pmu group and event funct

[PATCH v8 02/10] powerpc/powernv: Autoload IMC device driver module

2017-05-04 Thread Anju T Sudhakar
This patch does three things :
 - Enables "opal.c" to create a platform device for the IMC interface
   according to the appropriate compatibility string.
 - Find the reserved-memory region details from the system device tree
   and get the base address of HOMER (Reserved memory) region address for each 
chip.
 - We also get the Nest PMU counter data offsets (in the HOMER region)
   and their sizes. The offsets for the counters' data are fixed and
   won't change from chip to chip.

The device tree parsing logic is separated from the PMU creation
functions (which is done in subsequent patches).

Patch also adds a CONFIG_HV_PERF_IMC_CTRS for the IMC driver.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/platforms/powernv/Kconfig|  10 +++
 arch/powerpc/platforms/powernv/Makefile   |   1 +
 arch/powerpc/platforms/powernv/opal-imc.c | 140 ++
 arch/powerpc/platforms/powernv/opal.c |  18 
 4 files changed, 169 insertions(+)
 create mode 100644 arch/powerpc/platforms/powernv/opal-imc.c

diff --git a/arch/powerpc/platforms/powernv/Kconfig 
b/arch/powerpc/platforms/powernv/Kconfig
index 3a07e4d..1b90a98 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -27,3 +27,13 @@ config OPAL_PRD
help
  This enables the opal-prd driver, a facility to run processor
  recovery diagnostics on OpenPower machines
+
+config HV_PERF_IMC_CTRS
+   bool "Hypervisor supplied In Memory Collection PMU events (Nest & Core)"
+   default y
+   depends on PERF_EVENTS && PPC_POWERNV
+   help
+ Enable access to hypervisor supplied in-memory collection counters
+ in perf. IMC counters are available from Power9 systems.
+
+  If unsure, select Y.
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index b5d98cb..715e531 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -12,3 +12,4 @@ obj-$(CONFIG_PPC_SCOM)+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)   += opal-memory-errors.o
 obj-$(CONFIG_TRACEPOINTS)  += opal-tracepoints.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
+obj-$(CONFIG_HV_PERF_IMC_CTRS) += opal-imc.o
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
new file mode 100644
index 000..3a87000
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -0,0 +1,140 @@
+/*
+ * OPAL IMC interface detection driver
+ * Supported on POWERNV platform
+ *
+ * Copyright   (C) 2017 Madhavan Srinivasan, IBM Corporation.
+ * (C) 2017 Anju T Sudhakar, IBM Corporation.
+ * (C) 2017 Hemant K Shaw, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
+
+/*
+ * imc_pmu_setup : Setup the IMC PMUs (children of "parent").
+ */
+static void __init imc_pmu_setup(struct device_node *parent)
+{
+   if (!parent)
+   return;
+}
+
+static int opal_imc_counters_probe(struct platform_device *pdev)
+{
+   struct device_node *imc_dev, *dn, *rm_node = NULL;
+   struct perchip_nest_info *pcni;
+   u32 pages, nest_offset, nest_size, chip_id;
+   int i = 0;
+   const __be32 *addrp;
+   u64 reg_addr, reg_size;
+
+   if (!pdev || !pdev->dev.of_node)
+   return -ENODEV;
+
+   /*
+* Check whether this is kdump kernel. If yes, just return.
+*/
+   if (is_kdump_kernel())
+   return -ENODEV;
+
+   imc_dev = pdev->dev.of_node;
+
+   /*
+* Nest counter data are saved in a reserved memory called HOMER.
+* "imc-nest-offset" identifies the counter data location within HOMER.
+* size : size of the entire nest-counters region
+*/
+   if (of_property_read_u32(imc_dev, "imc-nest-offset", &nest_offset))
+   goto err;
+
+   if (of_property_read_u32(imc_dev, "imc-nest-size", &nest_size))
+   goto err;
+
+   /* Sanity check */
+   if ((nest_size/PAGE_SIZE) > IMC_NEST_MAX_PAGES)
+   goto err;
+
+   /* Find the "HOMER region" for each chip 

[PATCH v8 01/10] powerpc/powernv: Data structure and macros definitions for IMC

2017-05-04 Thread Anju T Sudhakar
From: Hemant Kumar 

Create a new header file to add the data structures and
macros needed for In-Memory Collection (IMC) counter support.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h | 95 ++
 1 file changed, 95 insertions(+)
 create mode 100644 arch/powerpc/include/asm/imc-pmu.h

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
new file mode 100644
index 000..d0193c8
--- /dev/null
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -0,0 +1,95 @@
+#ifndef PPC_POWERNV_IMC_PMU_DEF_H
+#define PPC_POWERNV_IMC_PMU_DEF_H
+
+/*
+ * IMC Nest Performance Monitor counter support.
+ *
+ * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
+ *   (C) 2017 Anju T Sudhakar, IBM Corporation.
+ *   (C) 2017 Hemant K Shaw, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * For static allocation of some of the structures.
+ */
+#define IMC_MAX_CHIPS  32
+#define IMC_MAX_PMUS   32
+
+/*
+ * This macro is used for memory buffer allocation of
+ * event names and event string
+ */
+#define IMC_MAX_NAME_VAL_LEN   96
+
+/*
+ * Currently Microcode supports a max of 256KB of counter memory
+ * in the reserved memory region. Max pages to mmap (considering 4K PAGESIZE).
+ */
+#define IMC_NEST_MAX_PAGES 64
+
+/*
+ *Compatbility macros for IMC devices
+ */
+#define IMC_DTB_COMPAT "ibm,opal-in-memory-counters"
+#define IMC_DTB_NEST_COMPAT"ibm,imc-counters-nest"
+
+/*
+ * Structure to hold per chip specific memory address
+ * information for nest pmus. Nest Counter data are exported
+ * in per-chip reserved memory region by the PORE Engine.
+ */
+struct perchip_nest_info {
+   u32 chip_id;
+   u64 pbase;
+   u64 vbase[IMC_NEST_MAX_PAGES];
+   u64 size;
+};
+
+/*
+ * Place holder for nest pmu events and values.
+ */
+struct imc_events {
+   char *ev_name;
+   char *ev_value;
+};
+
+#define IMC_FORMAT_ATTR0
+#define IMC_CPUMASK_ATTR   1
+#define IMC_EVENT_ATTR 2
+#define IMC_NULL_ATTR  3
+
+/*
+ * Device tree parser code detects IMC pmu support and
+ * registers new IMC pmus. This structure will
+ * hold the pmu functions and attrs for each imc pmu and
+ * will be referenced at the time of pmu registration.
+ */
+struct imc_pmu {
+   struct pmu pmu;
+   int domain;
+   /*
+* Attribute groups for the PMU. Slot 0 used for
+* format attribute, slot 1 used for cpusmask attribute,
+* slot 2 used for event attribute. Slot 3 keep as
+* NULL.
+*/
+   const struct attribute_group *attr_groups[4];
+};
+
+/*
+ * Domains for IMC PMUs
+ */
+#define IMC_DOMAIN_NEST1
+#define IMC_DOMAIN_UNKNOWN -1
+
+#endif /* PPC_POWERNV_IMC_PMU_DEF_H */
-- 
2.7.4



[PATCH v8 03/10] powerpc/powernv: Detect supported IMC units and its events

2017-05-04 Thread Anju T Sudhakar
Parse device tree to detect IMC units. Traverse through each IMC unit
node to find supported events and corresponding unit/scale files (if any).

Here is the DTS file for reference:


https://github.com/open-power/ima-catalog/blob/master/81E00612.4E0100.dts

The device tree for IMC counters starts at the node "imc-counters".
This node contains all the IMC PMU nodes and event nodes
for these IMC PMUs. The PMU nodes have an "events" property which has a
phandle value for the actual events node. The events are separated from
the PMU nodes to abstract out the common events. For example, PMU node
"mcs0", "mcs1" etc. will contain a pointer to "nest-mcs-events" since,
the events are common between these PMUs. These events have a different
prefix based on their relation to different PMUs, and hence, the PMU
nodes themselves contain an "events-prefix" property. The value for this
property concatenated to the event name, forms the actual event
name. Also, the PMU have a "reg" field as the base offset for the events
which belong to this PMU. This "reg" field is added to event's "reg" field
in the "events" node, which gives us the location of the counter data. Kernel
code uses this offset as event configuration value.

Device tree parser code also looks for scale/unit property in the event
node and passes on the value as an event attr for perf interface to use
in the post processing by the perf tool. Some PMUs may have common scale
and unit properties which implies that all events supported by this PMU
inherit the scale and unit properties of the PMU itself. For those
events, we need to set the common unit and scale values.

For failure to initialize any unit or any event, disable that unit and
continue setting up the rest of them.

Signed-off-by: Hemant Kumar 
Signed-off-by: Anju T Sudhakar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 413 ++
 1 file changed, 413 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 3a87000..0ddaf7d 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -33,15 +33,428 @@
 #include 
 #include 
 
+u64 nest_max_offset;
 struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
+struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
+
+static int imc_event_prop_update(char *name, struct imc_events *events)
+{
+   char *buf;
+
+   if (!events || !name)
+   return -EINVAL;
+
+   /* memory for content */
+   buf = kzalloc(IMC_MAX_NAME_VAL_LEN, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   events->ev_name = name;
+   events->ev_value = buf;
+   return 0;
+}
+
+static int imc_event_prop_str(struct property *pp, char *name,
+ struct imc_events *events)
+{
+   int ret;
+
+   ret = imc_event_prop_update(name, events);
+   if (ret)
+   return ret;
+
+   if (!pp->value || (strnlen(pp->value, pp->length) == pp->length) ||
+  (pp->length > IMC_MAX_NAME_VAL_LEN))
+   return -EINVAL;
+   strncpy(events->ev_value, (const char *)pp->value, pp->length);
+
+   return 0;
+}
+
+static int imc_event_prop_val(char *name, u32 val,
+ struct imc_events *events)
+{
+   int ret;
+
+   ret = imc_event_prop_update(name, events);
+   if (ret)
+   return ret;
+   snprintf(events->ev_value, IMC_MAX_NAME_VAL_LEN, "event=0x%x", val);
+
+   return 0;
+}
+
+static int set_event_property(struct property *pp, char *event_prop,
+ struct imc_events *events, char *ev_name)
+{
+   char *buf;
+   int ret;
+
+   buf = kzalloc(IMC_MAX_NAME_VAL_LEN, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   sprintf(buf, "%s.%s", ev_name, event_prop);
+   ret = imc_event_prop_str(pp, buf, events);
+   if (ret) {
+   if (events->ev_name)
+   kfree(events->ev_name);
+   if (events->ev_value)
+   kfree(events->ev_value);
+   }
+   return ret;
+}
+
+/*
+ * Updates the maximum offset for an event in the pmu with domain
+ * "pmu_domain".
+ */
+static void update_max_value(u32 value, int pmu_domain)
+{
+   switch (pmu_domain) {
+   case IMC_DOMAIN_NEST:
+   if (nest_max_offset < value)
+   nest_max_offset = value;
+   break;
+   default:
+   /* Unknown domain, return */
+   return;
+   }
+}
+
+/*
+ * imc_events_node_parser: Parse the event node "dev" and assign the parsed
+ * information to event "e

[PATCH v8 04/10] powerpc/perf: Add generic IMC pmu group and event functions

2017-05-04 Thread Anju T Sudhakar
From: Hemant Kumar 

Device tree IMC driver code parses the IMC units and their events. It
passes the information to IMC pmu code which is placed in powerpc/perf
as "imc-pmu.c".

Patch adds a set of generic imc pmu related event functions to be
used  by each imc pmu unit. Add code to setup format attribute and to
register imc pmus. Add an event_init function for nest_imc events.

Since, the IMC counters' data are periodically fed to a memory location,
the functions to read/update, start/stop, add/del can be generic and can
be used by all IMC PMU units.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h|   3 +
 arch/powerpc/perf/Makefile|   3 +
 arch/powerpc/perf/imc-pmu.c   | 269 ++
 arch/powerpc/platforms/powernv/opal-imc.c |  10 +-
 4 files changed, 283 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/perf/imc-pmu.c

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index d0193c8..6bbe184 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -92,4 +92,7 @@ struct imc_pmu {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_UNKNOWN -1
 
+extern struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
+extern struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
+extern int __init init_imc_pmu(struct imc_events *events,int idx, struct 
imc_pmu *pmu_ptr);
 #endif /* PPC_POWERNV_IMC_PMU_DEF_H */
diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index 4d606b9..b29d918 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -6,6 +6,9 @@ obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o
 obj64-$(CONFIG_PPC_PERF_CTRS)  += power4-pmu.o ppc970-pmu.o power5-pmu.o \
   power5+-pmu.o power6-pmu.o power7-pmu.o \
   isa207-common.o power8-pmu.o power9-pmu.o
+
+obj-$(CONFIG_HV_PERF_IMC_CTRS) += imc-pmu.o
+
 obj32-$(CONFIG_PPC_PERF_CTRS)  += mpc7450-pmu.o
 
 obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
new file mode 100644
index 000..f09a37a
--- /dev/null
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -0,0 +1,269 @@
+/*
+ * Nest Performance Monitor counter support.
+ *
+ * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
+ *   (C) 2017 Anju T Sudhakar, IBM Corporation.
+ *   (C) 2017 Hemant K Shaw, IBM Corporation.
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
+struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
+
+/* Needed for sanity check */
+extern u64 nest_max_offset;
+
+PMU_FORMAT_ATTR(event, "config:0-20");
+static struct attribute *imc_format_attrs[] = {
+   &format_attr_event.attr,
+   NULL,
+};
+
+static struct attribute_group imc_format_group = {
+   .name = "format",
+   .attrs = imc_format_attrs,
+};
+
+static int nest_imc_event_init(struct perf_event *event)
+{
+   int chip_id;
+   u32 config = event->attr.config;
+   struct perchip_nest_info *pcni;
+
+   if (event->attr.type != event->pmu->type)
+   return -ENOENT;
+
+   /* Sampling not supported */
+   if (event->hw.sample_period)
+   return -EINVAL;
+
+   /* unsupported modes and filters */
+   if (event->attr.exclude_user   ||
+   event->attr.exclude_kernel ||
+   event->attr.exclude_hv ||
+   event->attr.exclude_idle   ||
+   event->attr.exclude_host   ||
+   event->attr.exclude_guest)
+   return -EINVAL;
+
+   if (event->cpu < 0)
+   return -EINVAL;
+
+   /* Sanity check for config (event offset) */
+   if (config > nest_max_offset)
+   return -EINVAL;
+
+   chip_id = topology_physical_package_id(event->cpu);
+   pcni = &nest_perchip_info[chip_id];
+
+   /*
+* Memory for Nest HW counter data could be in multiple pages.
+* Hence check and pick the right event base page for chip with
+* "chip_id" and add "config" to it".
+*/
+   event->hw.event_base = pcni->vbase[config/PAGE_SIZE] +
+   (config & ~PAGE_MASK);
+
+   return 0;
+}
+
+static void imc_read_counter(struct perf_event *event)
+{
+   u64 *addr, data;
+
+   /*
+* In-Memory Collection (IMC) counters are free flowing counters.
+* So we take a snapshot of the counter

[PATCH v8 05/10] powerpc/perf: IMC pmu cpumask and cpuhotplug support

2017-05-04 Thread Anju T Sudhakar
Adds cpumask attribute to be used by each IMC pmu. Only one cpu (any
online CPU) from each chip for nest PMUs is designated to read counters.

On CPU hotplug, dying CPU is checked to see whether it is one of the
designated cpus, if yes, next online cpu from the same chip (for nest
units) is designated as new cpu to read counters. For this purpose, we
introduce a new state : CPUHP_AP_PERF_POWERPC_NEST_ONLINE.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h |   4 +
 arch/powerpc/include/asm/opal-api.h|  12 +-
 arch/powerpc/include/asm/opal.h|   4 +
 arch/powerpc/perf/imc-pmu.c| 248 -
 arch/powerpc/platforms/powernv/opal-wrappers.S |   3 +
 include/linux/cpuhotplug.h |   1 +
 6 files changed, 266 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 6bbe184..1478d0f 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -92,6 +92,10 @@ struct imc_pmu {
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_UNKNOWN -1
 
+#define IMC_COUNTER_ENABLE 1
+#define IMC_COUNTER_DISABLE0
+
+
 extern struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
 extern struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
 extern int __init init_imc_pmu(struct imc_events *events,int idx, struct 
imc_pmu *pmu_ptr);
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index a0aa285..ce863d9 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -168,7 +168,10 @@
 #define OPAL_INT_SET_MFRR  125
 #define OPAL_PCI_TCE_KILL  126
 #define OPAL_NMMU_SET_PTCR 127
-#define OPAL_LAST  127
+#define OPAL_IMC_COUNTERS_INIT 149
+#define OPAL_IMC_COUNTERS_START150
+#define OPAL_IMC_COUNTERS_STOP 151
+#define OPAL_LAST  151
 
 /* Device tree flags */
 
@@ -928,6 +931,13 @@ enum {
OPAL_PCI_TCE_KILL_ALL,
 };
 
+/* Argument to OPAL_IMC_COUNTERS_*  */
+enum {
+   OPAL_IMC_COUNTERS_NEST = 1,
+   OPAL_IMC_COUNTERS_CORE = 2,
+   OPAL_IMC_COUNTERS_THREAD = 3,
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 1ff03a6..9c16ec6 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -227,6 +227,10 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t 
kill_type,
  uint64_t dma_addr, uint32_t npages);
 int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr);
 
+int64_t opal_imc_counters_init(uint32_t type, uint64_t address);
+int64_t opal_imc_counters_start(uint32_t type);
+int64_t opal_imc_counters_stop(uint32_t type);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
   int depth, void *data);
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index f09a37a..40792424 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -18,6 +18,11 @@
 
 struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
 struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
+static cpumask_t nest_imc_cpumask;
+
+static atomic_t nest_events;
+/* Used to avoid races in calling enable/disable nest-pmu units*/
+static DEFINE_MUTEX(imc_nest_reserve);
 
 /* Needed for sanity check */
 extern u64 nest_max_offset;
@@ -33,6 +38,160 @@ static struct attribute_group imc_format_group = {
.attrs = imc_format_attrs,
 };
 
+/* Get the cpumask printed to a buffer "buf" */
+static ssize_t imc_pmu_cpumask_get_attr(struct device *dev,
+   struct device_attribute *attr,
+   char *buf)
+{
+   cpumask_t *active_mask;
+
+   active_mask = &nest_imc_cpumask;
+   return cpumap_print_to_pagebuf(true, buf, active_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, imc_pmu_cpumask_get_attr, NULL);
+
+static struct attribute *imc_pmu_cpumask_attrs[] = {
+   &dev_attr_cpumask.attr,
+   NULL,
+};
+
+static struct attribute_group imc_pmu_cpumask_attr_group = {
+   .attrs = imc_pmu_cpumask_attrs,
+};
+
+/*
+ * nest_init : Initializes the nest imc engine for the current chip.
+ * by default the nest engine is disabled.
+ */
+static void nest_init(int *cpu_opal_rc)
+{
+   int rc;
+
+   /*
+* OPAL figures out which CPU to start based on the CPU that is
+* currently running when we call into OPAL
+*/
+   rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST);
+   if (rc)
+   cpu_opal_rc[smp_p

[PATCH v8 08/10] powerpc/powernv: Thread IMC events detection

2017-05-04 Thread Anju T Sudhakar
Patch adds support for detection of thread IMC events. It adds a new
domain IMC_DOMAIN_THREAD and it is determined with the help of the
compatibility string "ibm,imc-counters-thread" based on the IMC device
tree.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h|  2 ++
 arch/powerpc/perf/imc-pmu.c   |  1 +
 arch/powerpc/platforms/powernv/opal-imc.c | 18 +-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index bf5fb7c..6260e61 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -49,6 +49,7 @@
 #define IMC_DTB_COMPAT "ibm,opal-in-memory-counters"
 #define IMC_DTB_NEST_COMPAT"ibm,imc-counters-nest"
 #define IMC_DTB_CORE_COMPAT"ibm,imc-counters-core"
+#define IMC_DTB_THREAD_COMPAT  "ibm,imc-counters-thread"
 
 /*
  * Structure to hold per chip specific memory address
@@ -98,6 +99,7 @@ struct imc_pmu {
  */
 #define IMC_DOMAIN_NEST1
 #define IMC_DOMAIN_CORE2
+#define IMC_DOMAIN_THREAD  3
 #define IMC_DOMAIN_UNKNOWN -1
 
 #define IMC_COUNTER_ENABLE 1
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index fb71825..9767714 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -41,6 +41,7 @@ struct imc_pmu *core_imc_pmu;
 /* Needed for sanity check */
 extern u64 nest_max_offset;
 extern u64 core_max_offset;
+extern u64 thread_max_offset;
 
 PMU_FORMAT_ATTR(event, "config:0-20");
 static struct attribute *imc_format_attrs[] = {
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 23507d7..940f6b9 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -35,6 +35,7 @@
 
 u64 nest_max_offset;
 u64 core_max_offset;
+u64 thread_max_offset;
 
 static int imc_event_prop_update(char *name, struct imc_events *events)
 {
@@ -119,6 +120,10 @@ static void update_max_value(u32 value, int pmu_domain)
if (core_max_offset < value)
core_max_offset = value;
break;
+   case IMC_DOMAIN_THREAD:
+   if (thread_max_offset < value)
+   thread_max_offset = value;
+   break;
default:
/* Unknown domain, return */
return;
@@ -362,7 +367,7 @@ static struct imc_events *imc_events_setup(struct 
device_node *parent,
 /*
  * imc_pmu_create : Takes the parent device which is the pmu unit and a
  *  pmu_index as the inputs.
- * Allocates memory for the pmu, sets up its domain (NEST/CORE), and
+ * Allocates memory for the pmu, sets up its domain (NEST/CORE/THREAD), and
  * calls imc_events_setup() to allocate memory for the events supported
  * by this pmu. Assigns a name for the pmu. Calls imc_events_node_parser()
  * to setup the individual events.
@@ -483,6 +488,17 @@ static void __init imc_pmu_setup(struct device_node 
*parent)
return;
pmu_count++;
}
+   /*
+* Loop through the imc-counters tree for each compatible
+* "ibm,imc-counters-thread", and update "struct imc_pmu".
+*/
+   for_each_compatible_node(child, NULL, IMC_DTB_THREAD_COMPAT) {
+   domain = IMC_DOMAIN_THREAD;
+   rc = imc_pmu_create(child, pmu_count, domain);
+   if (rc)
+   return;
+   pmu_count++;
+   }
 }
 
 static int opal_imc_counters_probe(struct platform_device *pdev)
-- 
2.7.4



[PATCH v8 06/10] powerpc/powernv: Core IMC events detection

2017-05-04 Thread Anju T Sudhakar
From: Hemant Kumar 

This patch adds support for detection of core IMC events along with the
Nest IMC events. It adds a new domain IMC_DOMAIN_CORE and it is determined
with the help of the compatibility string "ibm,imc-counters-core" based
on the IMC device tree.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h|  4 +++-
 arch/powerpc/perf/imc-pmu.c   |  3 +++
 arch/powerpc/platforms/powernv/opal-imc.c | 28 +---
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 1478d0f..37fdd79 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -42,6 +42,7 @@
  */
 #define IMC_DTB_COMPAT "ibm,opal-in-memory-counters"
 #define IMC_DTB_NEST_COMPAT"ibm,imc-counters-nest"
+#define IMC_DTB_CORE_COMPAT"ibm,imc-counters-core"
 
 /*
  * Structure to hold per chip specific memory address
@@ -90,13 +91,14 @@ struct imc_pmu {
  * Domains for IMC PMUs
  */
 #define IMC_DOMAIN_NEST1
+#define IMC_DOMAIN_CORE2
 #define IMC_DOMAIN_UNKNOWN -1
 
 #define IMC_COUNTER_ENABLE 1
 #define IMC_COUNTER_DISABLE0
 
-
 extern struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
 extern struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
+extern struct imc_pmu *core_imc_pmu;
 extern int __init init_imc_pmu(struct imc_events *events,int idx, struct 
imc_pmu *pmu_ptr);
 #endif /* PPC_POWERNV_IMC_PMU_DEF_H */
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 40792424..c132df2 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -24,8 +24,11 @@ static atomic_t nest_events;
 /* Used to avoid races in calling enable/disable nest-pmu units*/
 static DEFINE_MUTEX(imc_nest_reserve);
 
+struct imc_pmu *core_imc_pmu;
+
 /* Needed for sanity check */
 extern u64 nest_max_offset;
+extern u64 core_max_offset;
 
 PMU_FORMAT_ATTR(event, "config:0-20");
 static struct attribute *imc_format_attrs[] = {
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 61f6d67..d712ef3 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -34,6 +34,7 @@
 #include 
 
 u64 nest_max_offset;
+u64 core_max_offset;
 
 static int imc_event_prop_update(char *name, struct imc_events *events)
 {
@@ -114,6 +115,10 @@ static void update_max_value(u32 value, int pmu_domain)
if (nest_max_offset < value)
nest_max_offset = value;
break;
+   case IMC_DOMAIN_CORE:
+   if (core_max_offset < value)
+   core_max_offset = value;
+   break;
default:
/* Unknown domain, return */
return;
@@ -357,7 +362,7 @@ static struct imc_events *imc_events_setup(struct 
device_node *parent,
 /*
  * imc_pmu_create : Takes the parent device which is the pmu unit and a
  *  pmu_index as the inputs.
- * Allocates memory for the pmu, sets up its domain (NEST), and
+ * Allocates memory for the pmu, sets up its domain (NEST/CORE), and
  * calls imc_events_setup() to allocate memory for the events supported
  * by this pmu. Assigns a name for the pmu. Calls imc_events_node_parser()
  * to setup the individual events.
@@ -386,7 +391,10 @@ static int imc_pmu_create(struct device_node *parent, int 
pmu_index, int domain)
goto free_pmu;
 
/* Needed for hotplug/migration */
-   per_nest_pmu_arr[pmu_index] = pmu_ptr;
+   if (pmu_ptr->domain == IMC_DOMAIN_CORE)
+   core_imc_pmu = pmu_ptr;
+   else if (pmu_ptr->domain == IMC_DOMAIN_NEST)
+   per_nest_pmu_arr[pmu_index] = pmu_ptr;
 
pp = of_find_property(parent, "name", NULL);
if (!pp) {
@@ -407,7 +415,10 @@ static int imc_pmu_create(struct device_node *parent, int 
pmu_index, int domain)
goto free_pmu;
}
/* Save the name to register it later */
-   sprintf(buf, "nest_%s", (char *)pp->value);
+   if (pmu_ptr->domain == IMC_DOMAIN_NEST)
+   sprintf(buf, "nest_%s", (char *)pp->value);
+   else
+   sprintf(buf, "%s_imc", (char *)pp->value);
pmu_ptr->pmu.name = (char *)buf;
 
/*
@@ -461,6 +472,17 @@ static void __init imc_pmu_setup(struct device_node 
*parent)
return;
pmu_count++;
}
+   /*
+* Loop through the imc-counters tree for each compatible
+* "ibm,imc-counters-core", and update "struct imc_pmu".
+*/
+   for_each_compatible_node(child, NULL, IMC_DTB_CORE_

[PATCH v8 10/10] powerpc/perf: Thread imc cpuhotplug support

2017-05-04 Thread Anju T Sudhakar
This patch adds support for thread IMC on cpuhotplug.

When a cpu goes offline, the LDBAR for that cpu is disabled, and when it comes
back online the previous ldbar value is written back to the LDBAR for that cpu.

To register the hotplug functions for thread_imc, a new state
CPUHP_AP_PERF_POWERPC_THREADIMC_ONLINE is added to the list of existing
states.

Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Anju T Sudhakar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/imc-pmu.c | 32 +++-
 include/linux/cpuhotplug.h  |  1 +
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cfd112e..f10489f 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -982,6 +982,16 @@ static void cleanup_all_thread_imc_memory(void)
on_each_cpu(cleanup_thread_imc_memory, NULL, 1);
 }
 
+static void thread_imc_update_ldbar(unsigned int cpu_id)
+{
+   u64 ldbar_addr, ldbar_value;
+
+   ldbar_addr = (u64)virt_to_phys((void *)per_cpu_add[cpu_id]);
+   ldbar_value = (ldbar_addr & (u64)THREAD_IMC_LDBAR_MASK) |
+   (u64)THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+}
+
 /*
  * Allocates a page of memory for each of the online cpus, and, writes the
  * physical base address of that page to the LDBAR for that cpu. This starts
@@ -989,21 +999,33 @@ static void cleanup_all_thread_imc_memory(void)
  */
 static void thread_imc_mem_alloc(void *dummy)
 {
-   u64 ldbar_addr, ldbar_value;
int cpu_id = smp_processor_id();
int phys_id = topology_physical_package_id(smp_processor_id());
 
per_cpu_add[cpu_id] = (u64)alloc_pages_exact_nid(phys_id,
(size_t)IMC_THREAD_COUNTER_MEM, GFP_KERNEL | 
__GFP_ZERO);
-   ldbar_addr = (u64)virt_to_phys((void *)per_cpu_add[cpu_id]);
-   ldbar_value = (ldbar_addr & (u64)THREAD_IMC_LDBAR_MASK) |
-   (u64)THREAD_IMC_ENABLE;
-   mtspr(SPRN_LDBAR, ldbar_value);
+   thread_imc_update_ldbar(cpu_id);
+}
+
+static int ppc_thread_imc_cpu_online(unsigned int cpu)
+{
+   thread_imc_update_ldbar(cpu);
+   return 0;
 }
 
+static int ppc_thread_imc_cpu_offline(unsigned int cpu)
+{
+   mtspr(SPRN_LDBAR, 0);
+   return 0;
+ }
+
 void thread_imc_cpu_init(void)
 {
on_each_cpu(thread_imc_mem_alloc, NULL, 1);
+   cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_THREADIMC_ONLINE,
+   "POWER_THREAD_IMC_ONLINE",
+   ppc_thread_imc_cpu_online,
+   ppc_thread_imc_cpu_offline);
 }
 
 /*
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index e7b7712..bbec927 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -139,6 +139,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
CPUHP_AP_PERF_POWERPC_NEST_ONLINE,
CPUHP_AP_PERF_POWERPC_COREIMC_ONLINE,
+   CPUHP_AP_PERF_POWERPC_THREADIMC_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
CPUHP_AP_ONLINE_DYN,
-- 
2.7.4



[PATCH v8 07/10] powerpc/perf: PMU functions for Core IMC and hotplugging

2017-05-04 Thread Anju T Sudhakar
From: Hemant Kumar 

This patch adds the PMU function to initialize a core IMC event. It also
adds cpumask initialization function for core IMC PMU. For
initialization, 8KB of memory is allocated per core where the data
for core IMC counters will be accumulated. The base address for this
page is sent to OPAL via an OPAL call which initializes various SCOMs
related to Core IMC initialization. Upon any errors, the pages are
freed and core IMC counters are disabled using the same OPAL call.

For CPU hotplugging, a cpumask is initialized which contains an online
CPU from each core. If a cpu goes offline, we check whether that cpu
belongs to the core imc cpumask, if yes, then, we migrate the PMU
context to any other online cpu (if available) in that core. If a cpu
comes back online, then this cpu will be added to the core imc cpumask
only if there was no other cpu from that core in the previous cpumask.

To register the hotplug functions for core_imc, a new state
CPUHP_AP_PERF_POWERPC_COREIMC_ONLINE is added to the list of existing
states.

Patch also adds OPAL device shutdown callback. Needed to disable the
IMC core engine to handle kexec.

Signed-off-by: Hemant Kumar 
Signed-off-by: Anju T Sudhakar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h|   7 +
 arch/powerpc/perf/imc-pmu.c   | 380 +-
 arch/powerpc/platforms/powernv/opal-imc.c |   7 +
 include/linux/cpuhotplug.h|   1 +
 4 files changed, 384 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 37fdd79..bf5fb7c 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -24,6 +24,7 @@
  */
 #define IMC_MAX_CHIPS  32
 #define IMC_MAX_PMUS   32
+#define IMC_MAX_CORES  32
 
 /*
  * This macro is used for memory buffer allocation of
@@ -38,6 +39,11 @@
 #define IMC_NEST_MAX_PAGES 64
 
 /*
+ * IMC Core engine expects 8K bytes of memory for counter collection.
+ */
+#define IMC_CORE_COUNTER_MEM   8192
+
+/*
  *Compatbility macros for IMC devices
  */
 #define IMC_DTB_COMPAT "ibm,opal-in-memory-counters"
@@ -101,4 +107,5 @@ extern struct perchip_nest_info 
nest_perchip_info[IMC_MAX_CHIPS];
 extern struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
 extern struct imc_pmu *core_imc_pmu;
 extern int __init init_imc_pmu(struct imc_events *events,int idx, struct 
imc_pmu *pmu_ptr);
+void core_imc_disable(void);
 #endif /* PPC_POWERNV_IMC_PMU_DEF_H */
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index c132df2..fb71825 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1,5 +1,5 @@
 /*
- * Nest Performance Monitor counter support.
+ * IMC Performance Monitor counter support.
  *
  * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
  *   (C) 2017 Anju T Sudhakar, IBM Corporation.
@@ -21,9 +21,21 @@ struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
 static cpumask_t nest_imc_cpumask;
 
 static atomic_t nest_events;
+static atomic_t core_events;
 /* Used to avoid races in calling enable/disable nest-pmu units*/
 static DEFINE_MUTEX(imc_nest_reserve);
+/* Used to avoid races in calling enable/disable core-pmu units */
+static DEFINE_MUTEX(imc_core_reserve);
 
+/*
+ * Maintains base addresses for all the cores.
+ * MAX chip and core are defined as 32. So we
+ * statically allocate 8K for this structure.
+ *
+ * TODO -- Could be made dynamic
+ */
+static u64 per_core_pdbar_add[IMC_MAX_CHIPS][IMC_MAX_CORES];
+static cpumask_t core_imc_cpumask;
 struct imc_pmu *core_imc_pmu;
 
 /* Needed for sanity check */
@@ -46,9 +58,15 @@ static ssize_t imc_pmu_cpumask_get_attr(struct device *dev,
struct device_attribute *attr,
char *buf)
 {
+   struct pmu *pmu = dev_get_drvdata(dev);
cpumask_t *active_mask;
 
-   active_mask = &nest_imc_cpumask;
+   if (!strncmp(pmu->name, "nest_", strlen("nest_")))
+   active_mask = &nest_imc_cpumask;
+   else if (!strncmp(pmu->name, "core_", strlen("core_")))
+   active_mask = &core_imc_cpumask;
+   else
+   return 0;
return cpumap_print_to_pagebuf(true, buf, active_mask);
 }
 
@@ -64,6 +82,100 @@ static struct attribute_group imc_pmu_cpumask_attr_group = {
 };
 
 /*
+ * core_imc_mem_init : Initializes memory for the current core.
+ *
+ * Uses alloc_pages_exact_nid() and uses the returned address as an argument to
+ * an opal call to configure the pdbar. The address sent as an argument is
+ * converted to physical address before the opal call is made. This is the
+ * base address at which the core imc counters are populated.
+ */
+static int __meminit core_imc_mem_init(void

[PATCH v8 09/10] powerpc/perf: Thread IMC PMU functions

2017-05-04 Thread Anju T Sudhakar
This patch adds the PMU functions required for event initialization,
read, update, add, del etc. for thread IMC PMU. Thread IMC PMUs are used
for per-task monitoring. 

For each CPU, a page of memory is allocated and is kept static i.e.,
these pages will exist till the machine shuts down. The base address of
this page is assigned to the ldbar of that cpu. As soon as we do that,
the thread IMC counters start running for that cpu and the data of these
counters are assigned to the page allocated. But we use this for
per-task monitoring. Whenever we start monitoring a task, the event is
added onto the task. At that point, we read the initial value of the
event. Whenever we stop monitoring the task, the final value is taken
and the difference is the event data.

Now, a task can move to a different cpu. Suppose a task X is moving from
cpu A to cpu B. When the task is scheduled out of A, we get an
event_del for A, and hence, the event data is updated. And, we stop
updating the X's event data. As soon as X moves on to B, event_add is
called for B, and we again update the event_data. And this is how it
keeps on updating the event data even when the task is scheduled on to
different cpus.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/imc-pmu.h|   5 +
 arch/powerpc/perf/imc-pmu.c   | 209 +-
 arch/powerpc/platforms/powernv/opal-imc.c |   3 +
 3 files changed, 216 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/imc-pmu.h 
b/arch/powerpc/include/asm/imc-pmu.h
index 6260e61..cc04712 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -42,6 +42,7 @@
  * IMC Core engine expects 8K bytes of memory for counter collection.
  */
 #define IMC_CORE_COUNTER_MEM   8192
+#define IMC_THREAD_COUNTER_MEM 8192
 
 /*
  *Compatbility macros for IMC devices
@@ -51,6 +52,9 @@
 #define IMC_DTB_CORE_COMPAT"ibm,imc-counters-core"
 #define IMC_DTB_THREAD_COMPAT  "ibm,imc-counters-thread"
 
+#define THREAD_IMC_LDBAR_MASK   0x0003e000
+#define THREAD_IMC_ENABLE   0x8000
+
 /*
  * Structure to hold per chip specific memory address
  * information for nest pmus. Nest Counter data are exported
@@ -110,4 +114,5 @@ extern struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS];
 extern struct imc_pmu *core_imc_pmu;
 extern int __init init_imc_pmu(struct imc_events *events,int idx, struct 
imc_pmu *pmu_ptr);
 void core_imc_disable(void);
+void thread_imc_disable(void);
 #endif /* PPC_POWERNV_IMC_PMU_DEF_H */
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 9767714..cfd112e 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -38,6 +38,9 @@ static u64 per_core_pdbar_add[IMC_MAX_CHIPS][IMC_MAX_CORES];
 static cpumask_t core_imc_cpumask;
 struct imc_pmu *core_imc_pmu;
 
+/* Maintains base address for all the cpus */
+static u64 per_cpu_add[NR_CPUS];
+
 /* Needed for sanity check */
 extern u64 nest_max_offset;
 extern u64 core_max_offset;
@@ -480,6 +483,56 @@ static int core_imc_event_init(struct perf_event *event)
return 0;
 }
 
+static int thread_imc_event_init(struct perf_event *event)
+{
+   struct task_struct *target;
+
+   if (event->attr.type != event->pmu->type)
+   return -ENOENT;
+
+   /* Sampling not supported */
+   if (event->hw.sample_period)
+   return -EINVAL;
+
+   event->hw.idx = -1;
+
+   /* Sanity check for config (event offset) */
+   if (event->attr.config > thread_max_offset)
+   return -EINVAL;
+
+   target = event->hw.target;
+
+   if (!target)
+   return -EINVAL;
+
+   event->pmu->task_ctx_nr = perf_sw_context;
+   return 0;
+}
+
+static void thread_imc_read_counter(struct perf_event *event)
+{
+   u64 *addr, data;
+   int cpu_id = smp_processor_id();
+
+   addr = (u64 *)(per_cpu_add[cpu_id] + event->attr.config);
+   data = __be64_to_cpu(READ_ONCE(*addr));
+   local64_set(&event->hw.prev_count, data);
+}
+
+static void thread_imc_perf_event_update(struct perf_event *event)
+{
+   u64 counter_prev, counter_new, final_count, *addr;
+   int cpu_id = smp_processor_id();
+
+   addr = (u64 *)(per_cpu_add[cpu_id] + event->attr.config);
+   counter_prev = local64_read(&event->hw.prev_count);
+   counter_new = __be64_to_cpu(READ_ONCE(*addr));
+   final_count = counter_new - counter_prev;
+
+   local64_set(&event->hw.prev_count, counter_new);
+   local64_add(final_count, &event->count);
+}
+
 static void imc_read_counter(struct perf_event *event)
 {
u64 *addr, data;
@@ -720,6 +773,84 @@ static int core_imc_event_add(struct perf_event *event, 
int flags)
 }
 
 
+

  1   2   3   >