On Thursday 29 June 2017 01:11 AM, Thomas Gleixner wrote:
On Thu, 29 Jun 2017, Anju T Sudhakar wrote:
+static void cleanup_all_core_imc_memory(struct imc_pmu *pmu_ptr)
+{
+       struct imc_mem_info *ptr;
+
+       for (ptr = pmu_ptr->mem_info; ptr; ptr++) {
+               if (ptr->vbase[0])
+                       free_pages((u64)ptr->vbase[0], 0);
+       }
This loop is broken beyond repair. If pmu_ptr->mem_info is not NULL, then
ptr will happily increment to the point where it wraps around to
NULL. Oh well.
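
Bounding the loop by the number of allocated entries would avoid that. A rough
sketch, recomputing nr_cores the same way imc_mem_init() below does:

static void cleanup_all_core_imc_memory(struct imc_pmu *pmu_ptr)
{
	int nr_cores = num_present_cpus() / threads_per_core;
	int i;

	if (!pmu_ptr->mem_info)
		return;

	/* Only nr_cores entries were allocated, so stop there. */
	for (i = 0; i < nr_cores; i++) {
		if (pmu_ptr->mem_info[i].vbase[0])
			free_pages((u64)pmu_ptr->mem_info[i].vbase[0], 0);
	}
	kfree(pmu_ptr->mem_info);
}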

+       kfree(pmu_ptr->mem_info);
+bool is_core_imc_mem_inited(int cpu)
This function is global because?

+{
+       struct imc_mem_info *mem_info;
+       int core_id = (cpu / threads_per_core);
+
+       mem_info = &core_imc_pmu->mem_info[core_id];
+       if ((mem_info->id == core_id) && (mem_info->vbase[0] != NULL))
+               return true;
+
+       return false;
+}
+
+/*
+ * imc_mem_init : Function to support memory allocation for core imc.
+ */
+static int imc_mem_init(struct imc_pmu *pmu_ptr)
+{
The function placement is horrible. This function belongs to the pmu init
stuff and wants to be placed there and not five pages away in the middle of
unrelated functions.

+       int nr_cores;
+
+       if (pmu_ptr->imc_counter_mmaped)
+               return 0;
+       nr_cores = num_present_cpus() / threads_per_core;
+       pmu_ptr->mem_info = kzalloc((sizeof(struct imc_mem_info) * nr_cores), GFP_KERNEL);
+       if (!pmu_ptr->mem_info)
+               return -ENOMEM;
+       return 0;
+}
+static int core_imc_event_init(struct perf_event *event)
+{
+       int core_id, rc;
+       u64 config = event->attr.config;
+       struct imc_mem_info *pcmi;
+       struct imc_pmu *pmu;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /* Sampling not supported */
+       if (event->hw.sample_period)
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest)
+               return -EINVAL;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       event->hw.idx = -1;
+       pmu = imc_event_to_pmu(event);
+
+       /* Sanity check for config (event offset and rvalue) */
+       if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size) ||
+           ((config & IMC_EVENT_RVALUE_MASK) != 0))
+               return -EINVAL;
+
+       if (!is_core_imc_mem_inited(event->cpu))
+               return -ENODEV;
+
+       core_id = event->cpu / threads_per_core;
+       pcmi = &pmu->mem_info[core_id];
+       if ((pcmi->id != core_id) || (!pcmi->vbase[0]))
+               return -ENODEV;
+
+       event->hw.event_base = (u64)pcmi->vbase[0] + (config & IMC_EVENT_OFFSET_MASK);
+       /*
+        * Core pmu units are enabled only when it is used.
+        * See if this is triggered for the first time.
+        * If yes, take the mutex lock and enable the core counters.
+        * If not, just increment the count in core_events.
+        */
+       if (atomic_inc_return(&core_events[core_id]) == 1) {
+               mutex_lock(&imc_core_reserve);
+               rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
+                                            get_hard_smp_processor_id(event->cpu));
+               mutex_unlock(&imc_core_reserve);

The idea is to handle multiple event sessions for a given core, and yes, my
bad, the current implementation is racy/broken. But an alternate approach is
to have a per-core mutex and a per-core refcount to handle this, along these
lines:

event_init path:
per-core mutex lock
  if (per-core[refcount] == 0) {
       rc = opal call to start the engine;
       if (rc indicates failure) {
            per-core mutex unlock;
            log error info;
            return error;
       }
  }
  increment per-core[refcount];
per-core mutex unlock;


event_destroy path:
per-core mutex lock
  decrement per-core[refcount];
  if (per-core[refcount] == 0) {
     rc = opal call to stop the engine;
     if (rc indicates failure) {
          per-core mutex unlock;
          log the failure;
          return error;
     }
  } else if (per-core[refcount] < 0) {
     WARN();
     per-core[refcount] = 0;
  }
per-core mutex unlock;
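
As a rough C sketch of that scheme (core_imc_refc[], imc_core_mutex[] and
MAX_CORES are hypothetical names, each imc_core_mutex[i] would need a
mutex_init() during pmu setup, and opal_imc_counters_stop() is assumed to
exist as the counterpart of the start call used above):

/* Hypothetical per-core state: one mutex and one refcount per core. */
static struct mutex imc_core_mutex[MAX_CORES];
static int core_imc_refc[MAX_CORES];

/* event_init path: start the engine only for the first user of a core. */
static int core_imc_counters_get(int core_id, int cpu)
{
	int rc = 0;

	mutex_lock(&imc_core_mutex[core_id]);
	if (core_imc_refc[core_id] == 0) {
		rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
					     get_hard_smp_processor_id(cpu));
		if (rc) {
			mutex_unlock(&imc_core_mutex[core_id]);
			pr_err("IMC: unable to start the counters for core %d\n",
			       core_id);
			return rc;
		}
	}
	core_imc_refc[core_id]++;
	mutex_unlock(&imc_core_mutex[core_id]);
	return 0;
}

/* event_destroy path: stop the engine when the last user goes away. */
static void core_imc_counters_put(int core_id, int cpu)
{
	mutex_lock(&imc_core_mutex[core_id]);
	core_imc_refc[core_id]--;
	if (core_imc_refc[core_id] == 0) {
		if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
					   get_hard_smp_processor_id(cpu)))
			pr_err("IMC: unable to stop the counters for core %d\n",
			       core_id);
	} else if (core_imc_refc[core_id] < 0) {
		/* Should never happen; warn and reset. */
		WARN_ON(1);
		core_imc_refc[core_id] = 0;
	}
	mutex_unlock(&imc_core_mutex[core_id]);
}

Since the refcount check and the OPAL start/stop calls sit under the same
per-core mutex, the inc/start and dec/stop pairs can no longer interleave the
way the races described below do.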

Kindly let me know your comments on this.

Maddy

The machinery here is racy as hell in several respects.

CPU0                                   CPU1

atomic_inc_ret(core_events[0]) -> 1

preemption()
                                       atomic_inc_ret(core_events[0]) -> 2
                                       return 0;
                                
                                       Uses the event, without counters
                                       being started until the preempted
                                       task comes on the CPU again.

Here is another one:

CPU0                                   CPU1

atomic_dec_ret(core_events[0]) -> 0
                                       atomic_inc_ret(core_events[0]) -> 1
                                       mutex_lock();
mutex_lock()                           start counter();
                                       mutex_unlock()

stop_counter();
mutex_unlock();
                                       Yay, another stale event!

Brilliant stuff that, or maybe not so much.

+               if (rc) {
+                       atomic_dec_return(&core_events[core_id]);
What's the point of using atomic_dec_return here if you ignore the return
value? Not that it matters much as the whole logic is crap anyway.

+                       pr_err("IMC: Unable to start the counters for core %d\n", core_id);
+                       return -ENODEV;
+               }
+       }
@@ -410,22 +658,38 @@ int init_imc_pmu(struct imc_events *events, int idx,
                 struct imc_pmu *pmu_ptr)
  {
        int ret = -ENODEV;
+
Sure ret needs to stay initialized, just in case imc_mem_init() does not
return anything magically, right?

+       ret = imc_mem_init(pmu_ptr);
+       if (ret)
+               goto err_free;
        /* Add cpumask and register for hotplug notification */
-       if (atomic_inc_return(&nest_pmus) == 1) {
-               /*
-                * Nest imc pmu need only one cpu per chip, we initialize the
-                * cpumask for the first nest imc pmu and use the same for the rest.
-                * To handle the cpuhotplug callback unregister, we track
-                * the number of nest pmus registers in "nest_pmus".
-                * "nest_imc_cpumask_initialized" is set to zero during cpuhotplug
-                * callback unregister.
-                */
-               ret = nest_pmu_cpumask_init();
+       switch (pmu_ptr->domain) {
+       case IMC_DOMAIN_NEST:
+               if (atomic_inc_return(&nest_pmus) == 1) {
+                       /*
+                        * Nest imc pmu need only one cpu per chip, we initialize
+                        * the cpumask for the first nest imc pmu and use the
+                        * same for the rest.
+                        * To handle the cpuhotplug callback unregister, we track
+                        * the number of nest pmus registers in "nest_pmus".
+                        * "nest_imc_cpumask_initialized" is set to zero during
+                        * cpuhotplug callback unregister.
+                        */
+                       ret = nest_pmu_cpumask_init();
+                       if (ret)
+                               goto err_free;
+                       mutex_lock(&imc_nest_inited_reserve);
+                       nest_imc_cpumask_initialized = 1;
+                       mutex_unlock(&imc_nest_inited_reserve);
+               }
+               break;
+       case IMC_DOMAIN_CORE:
+               ret = core_imc_pmu_cpumask_init();
                if (ret)
-                       goto err_free;
-               mutex_lock(&imc_nest_inited_reserve);
-               nest_imc_cpumask_initialized = 1;
-               mutex_unlock(&imc_nest_inited_reserve);
+                       return ret;
Oh, so now you replaced the goto with ret. What is actually taking care of
the cleanup if that fails?
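
One way would be to route the failure through the same error path that frees
the imc_mem_init() allocation. A sketch, assuming err_free (together with the
core cleanup added in the next hunk) takes care of pmu_ptr->mem_info:

	case IMC_DOMAIN_CORE:
		ret = core_imc_pmu_cpumask_init();
		if (ret)
			goto err_free;	/* let the common error path clean up */
		break;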

+               break;
+       default:
+               return -1;      /* Unknown domain */
        }
        ret = update_events_in_group(events, idx, pmu_ptr);
        if (ret)
@@ -459,5 +723,10 @@ int init_imc_pmu(struct imc_events *events, int idx,
                        mutex_unlock(&imc_nest_inited_reserve);
                }
        }
+       /* For core_imc, we have allocated memory, we need to free it */
+       if (pmu_ptr->domain == IMC_DOMAIN_CORE) {
+               cleanup_all_core_imc_memory(pmu_ptr);
+               cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE);
Cute. You clean up the memory and then let the hotplug core invoke the
offline callbacks, which then deal with freed memory.
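
Reversing the order would be a minimal fix, as a sketch: remove the hotplug
state first so no offline callback can still run against the per-core memory,
and only then free it.

	if (pmu_ptr->domain == IMC_DOMAIN_CORE) {
		/* Tear down the hotplug callbacks first ... */
		cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE);
		/* ... then free the per-core memory they might reference. */
		cleanup_all_core_imc_memory(pmu_ptr);
	}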

Thanks,

        tglx

