Good patch!

> -----Original Message-----
> From: Kanaka Juvva [mailto:kanaka.d.ju...@linux.intel.com]
> Sent: Friday, June 12, 2015 12:58 AM
> To: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will;
> a...@firstfloor.org; Narayan, Ananth S; Herdrich, Andrew J; 
> t...@linutronix.de;
> x...@kernel.org; linux-kernel@vger.kernel.org; mi...@redhat.com;
> h...@zytor.com; l...@amacapital.net; dvlas...@redhat.com; b...@alien8.de;
> dave.han...@linux.intel.com; peter.p.waskiewicz...@intel.com;
> imamm...@redhat.com; b...@suse.de; ross.zwis...@linux.intel.com;
> jacob.w.s...@gmail.com; Brandewie, Dirk J; Shivappa, Vikas; Verplanke, Edwin;
> Kantecki, Tomasz
> Subject: [PATCH v1 1/2] perf,x86: add Intel Memory Bandwidth Monitoring
> (MBM) PMU
> 
> This patch adds Memory Bandwidth Monitoring events to the exitsing intel_cqm
> pmu in the Linux Kernel.
> 
> Intel MBM builds on Cache Monitoring Technology (CMT) infrastructure to allow
> monitoring of bandwidth from one level of the cache hierarchy to the next - in
> this case focusing on the L3 cache, which is typically backed directly by 
> system
> memory. As a  result of this implementation, memory bandwidth can be
> monitored.
> 
> MBM counters are available in Intel Xeon CPUs. The following events are
> implemented in the kernel and expose two MBM counters via perf_event
> interface:
> 
> - llc_local_bw: bandwidth consumption of all coress on socket
> - llc_total_bw: bandwidth consumption of all cores on socket + bandwidth
>   for QPI accesses
> 
> At present, the MBM events are checked at one second interval provided by the
> HRTIMER of the MBM event. MBM counters can overflow atmost once in a
> second and thus must be read atleast once in a second. Overflow is detected
> and handled. Cumulative average value is calculated for each bandwidth type
> upon reading a new value from the MSR.
> 
> Signed-off-by: Kanaka Juvva <kanaka.d.ju...@linux.intel.com>
> ---
>  arch/x86/include/asm/cpufeature.h          |   2 +
>  arch/x86/kernel/cpu/common.c               |   5 +-
>  arch/x86/kernel/cpu/perf_event_intel_cqm.c | 619
> +++++++++++++++++++++++++++--
>  3 files changed, 585 insertions(+), 41 deletions(-)
> 
> diff --git a/arch/x86/include/asm/cpufeature.h
> b/arch/x86/include/asm/cpufeature.h
> index 3d6606f..26505f6 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -248,6 +248,8 @@
> 
>  /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
>  #define X86_FEATURE_CQM_LLC  (11*32+ 1) /* LLC QoS if 1 */
> +#define X86_FEATURE_CQM_MBM_TOTAL (12*32+1) /* LLC Total MBM
> monitoring
> +*/ #define X86_FEATURE_CQM_MBM_LOCAL (12*32+2) /* LLC Local MBM
> +monitoring  */
> 
>  /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
> #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy
> monitoring if 1 */ diff --git a/arch/x86/kernel/cpu/common.c
> b/arch/x86/kernel/cpu/common.c index a62cf04..d046f46 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -660,7 +660,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
>                       /* QoS sub-leaf, EAX=0Fh, ECX=1 */
>                       cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
>                       c->x86_capability[12] = edx;
> -                     if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
> +                     if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC))
> +                           || ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL))
> +                           ||  (cpu_has(c,
> X86_FEATURE_CQM_MBM_LOCAL)))) {
> +
>                               c->x86_cache_max_rmid = ecx;
>                               c->x86_cache_occ_scale = ebx;
>                       }
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> index 1880761..d210c5c 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> @@ -12,10 +12,20 @@
>  #define MSR_IA32_PQR_ASSOC   0x0c8f
>  #define MSR_IA32_QM_CTR              0x0c8e
>  #define MSR_IA32_QM_EVTSEL   0x0c8d
> +#define MAX_MBM_CNTR            0xffffff
> +#define MBM_SOCKET_MAX          8
> +#define MBM_TIME_DELTA_MAX      1000
> +#define MBM_TIME_DELTA_MIN      1000
> +#define MBM_SCALING_FACTOR      1000
> +#define MBM_SCALING_HALF        (MBM_SCALING_FACTOR/2)
> +#define MBM_FIFO_SIZE_MIN       10
> +#define MBM_FIFO_SIZE_MAX       300
> 
> -static u32 cqm_max_rmid = -1;
> -static unsigned int cqm_l3_scale; /* supposedly cacheline size */
> 
> +static u32 cqm_max_rmid = -1;
> +static unsigned int cqm_l3_scale;/* supposedly cacheline size */ static
> +unsigned mbm_window_size = MBM_FIFO_SIZE_MIN; static bool cqm_llc_occ,
> +cqm_mbm;
>  /**
>   * struct intel_pqr_state - State cache for the PQR MSR
>   * @rmid:            The cached Resource Monitoring ID
> @@ -42,6 +52,34 @@ struct intel_pqr_state {
>   * interrupts disabled, which is sufficient for the protection.
>   */
>  static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
> +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu); static
> +DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
> +
> +/*
> + * mbm pmu is used for storing mbm_local and mbm_total
> + * events
> + */
> +struct mbm_pmu {
> +     spinlock_t       lock;
> +     int              n_active; /* number of active events */
> +     struct list_head active_list;
> +     struct pmu       *pmu; /* pointer to mbm perf_event */
> +     ktime_t          timer_interval; /* in ktime_t unit */
> +     struct hrtimer   hrtimer;
> +};
> +
> +struct sample {
> +     u64 bytes; /* mbm counter value read*/
> +     u64 cum_avg;   /* current running average of bandwidth */
> +     ktime_t prev_time;
> +     u64 index; /* current sample no */
> +     u32 mbmfifo[MBM_FIFO_SIZE_MAX]; /* window of last n bw values */
> +     u32  fifoin; /* mbmfifo in counter for sliding window */
> +     u32  fifoout; /* mbmfifo out counter for sliding window */ };
> +
> +struct sample *mbm_total; /* curent stats for  mbm_total events */
> +struct sample *mbm_local; /* current stats for mbm_local events */
> 
>  /*
>   * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
> @@ -66,6 +104,9 @@ static cpumask_t cqm_cpumask;
>  #define RMID_VAL_UNAVAIL     (1ULL << 62)
> 
>  #define QOS_L3_OCCUP_EVENT_ID        (1 << 0)
> +#define QOS_MBM_TOTAL_EVENT_ID  (1 << 1) #define
> QOS_MBM_LOCAL_EVENT_ID
> +0x3
> +
> 
>  #define QOS_EVENT_MASK       QOS_L3_OCCUP_EVENT_ID
> 
> @@ -90,7 +131,8 @@ static u32 intel_cqm_rotation_rmid;
>   */
>  static inline bool __rmid_valid(u32 rmid)  {
> -     if (!rmid || rmid == INVALID_RMID)
> +     WARN_ON_ONCE(rmid > cqm_max_rmid);
> +     if (!rmid || (rmid == INVALID_RMID) || (rmid > cqm_max_rmid))
>               return false;
> 
>       return true;
> @@ -125,6 +167,7 @@ struct cqm_rmid_entry {
>       enum rmid_recycle_state state;
>       struct list_head list;
>       unsigned long queue_time;
> +     bool config;
>  };
> 
>  /*
> @@ -176,6 +219,17 @@ static inline struct cqm_rmid_entry *__rmid_entry(u32
> rmid)
>       return entry;
>  }
> 
> +static void mbm_reset_stats(u32 rmid)
> +{
> +     u32  vrmid =  topology_physical_package_id(smp_processor_id()) *
> +                                                cqm_max_rmid + rmid;
> +
> +     if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
> +             return;
> +     memset(&mbm_local[vrmid], 0, sizeof(struct sample));
> +     memset(&mbm_total[vrmid], 0, sizeof(struct sample)); }
> +
>  /*
>   * Returns < 0 on fail.
>   *
> @@ -192,6 +246,7 @@ static u32 __get_rmid(void)
> 
>       entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry,
> list);
>       list_del(&entry->list);
> +     mbm_reset_stats(entry->rmid);
> 
>       return entry->rmid;
>  }
> @@ -207,6 +262,7 @@ static void __put_rmid(u32 rmid)
> 
>       entry->queue_time = jiffies;
>       entry->state = RMID_YOUNG;
> +     mbm_reset_stats(rmid);
> 
>       list_add_tail(&entry->list, &cqm_rmid_limbo_lru);  } @@ -232,6 +288,8
> @@ static int intel_cqm_setup_rmid_cache(void)
> 
>               INIT_LIST_HEAD(&entry->list);
>               entry->rmid = r;
> +             entry->config = false;
> +
>               cqm_rmid_ptrs[r] = entry;
> 
>               list_add_tail(&entry->list, &cqm_rmid_free_lru); @@ -254,6
> +312,8 @@ fail:
>               kfree(cqm_rmid_ptrs[r]);
> 
>       kfree(cqm_rmid_ptrs);
> +     kfree(mbm_local);
> +     kfree(mbm_total);
>       return -ENOMEM;
>  }
> 
> @@ -403,9 +463,11 @@ static void __intel_cqm_event_count(void *info);
> static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)  {
>       struct perf_event *event;
> +
>       struct list_head *head = &group->hw.cqm_group_entry;
>       u32 old_rmid = group->hw.cqm_rmid;
> 
> +
>       lockdep_assert_held(&cache_mutex);
> 
>       /*
> @@ -494,6 +556,166 @@ static bool intel_cqm_sched_in_event(u32 rmid)
>       return false;
>  }
> 
> +static u32  mbm_fifo_sum_lastn_out(int rmid, bool is_localbw) {
> +     u32 val = 0, i, j, index;
> +
> +     if (is_localbw) {
> +             if (++mbm_local[rmid].fifoout >=  mbm_window_size)
> +                     mbm_local[rmid].fifoout =  0;
> +             index =  mbm_local[rmid].fifoout;
> +             for (i = 0; i < mbm_window_size - 1; i++) {
> +                     if (index + i >= mbm_window_size)
> +                             j = index + i - mbm_window_size;
> +                     else
> +                             j = index + i;
> +                     val += mbm_local[rmid].mbmfifo[j];
> +             }
> +             return val;
> +     }
> +
> +     if (++mbm_total[rmid].fifoout >=  mbm_window_size)
> +             mbm_total[rmid].fifoout = 0;
> +     index =  mbm_total[rmid].fifoout;
> +     for (i = 0; i < mbm_window_size - 1; i++) {
> +             if (index + i >= mbm_window_size)
> +                     j = index + i - mbm_window_size;
> +             else
> +                     j = index + i;
> +             val += mbm_total[rmid].mbmfifo[j];
> +     }
> +     return val;
> +}
> +
> +static int  mbm_fifo_in(int rmid, u32 val, bool is_localbw) {
> +     if (is_localbw) {
> +             mbm_local[rmid].mbmfifo[mbm_local[rmid].fifoin] = val;
> +             if (++mbm_local[rmid].fifoin >=  mbm_window_size)
> +                     mbm_local[rmid].fifoin =  0;
> +     } else {
> +             mbm_total[rmid].mbmfifo[mbm_total[rmid].fifoin] = val;
> +             if (++mbm_total[rmid].fifoin >=  mbm_window_size)
> +                     mbm_total[rmid].fifoin =  0;
> +     }
> +     return 0;
> +}
> +
> +/*
> + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and
> +reads
> + * its MSR counter. if (MSR current value < MSR previous value) it is
> +an
> + * overflow and overflow is handled. If MSR is read within last 100ms,
> + * then the value is ignored; this will suppress small deltas. We don't
> + * process MBM samples that are within 100ms. Bandwidth is calculated as:
> + * bandwidth = difference of last two msr counter values/time difference.
> + * cum_avg = Running Average bandwidth of last 'n' samples that are
> +processed
> + * Sliding window is used to save the last 'n' samples. Hence,
> + * n = sliding_window_size
> + * cum_avg is scaled down by a  factor MBM_SCALING_FACTOR and rounded
> +to nearest
> + * integer. User interface reads the BW in MB/sec.
> + * Rounding algorithm : (X + 0.5):
> + * where X is scaled BW value. To avoid floating point arithmetic :
> + * BW- unscaled value
> + * (BW + MBM_SCALING_HALF)/MBM_SCALING_FACTOR is computed which
> gives
> +the
> + * scaled bandwidth.
> + */
> +static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local) {
> +     u64 val, tmp, diff_time, cma, bytes, index;
> +     bool overflow = false;
> +     ktime_t cur_time;
> +     u32 tmp32 = rmid;
> +     u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> +                                              cqm_max_rmid + rmid;
> +
> +     rmid = vrmid;
> +     cur_time = ktime_get_real();
> +     if (read_mbm_local) {
> +             cma = mbm_local[rmid].cum_avg;
> +             diff_time = ktime_ms_delta(cur_time,
> +                                mbm_local[rmid].prev_time);
> +             if (diff_time < 100)
> +                     return cma;
> +             mbm_local[rmid].prev_time = ktime_set(0,
> +                             (unsigned long)ktime_to_ns(cur_time));
> +             bytes = mbm_local[rmid].bytes;
> +             index = mbm_local[rmid].index;
> +             rmid = tmp32;
> +             wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID,
> rmid);
> +     } else {
> +             cma = mbm_total[rmid].cum_avg;
> +             diff_time = ktime_ms_delta(cur_time,
> +                                        mbm_total[rmid].prev_time);
> +             if (diff_time < 100)
> +                     return cma;
> +             mbm_total[rmid].prev_time = ktime_set(0,
> +                                     (unsigned long)ktime_to_ns(cur_time));
> +             bytes = mbm_total[rmid].bytes;
> +             index = mbm_total[rmid].index;
> +             rmid = tmp32;
> +             wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID,
> rmid);
> +        }
> +     rdmsrl(MSR_IA32_QM_CTR, val);
> +     if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> +             return val;
> +
> +     tmp = val;
> +     if (val < bytes) /* overflow handle it */ {
> +             val = MAX_MBM_CNTR - bytes + val;
> +             overflow = true;
> +     } else
> +             val = val - bytes;
> +     if (diff_time < MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN)
> +             val =  (val * MBM_TIME_DELTA_MAX) / diff_time;
> +
> +     if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> +             /* First sample, we can't use the time delta */
> +             diff_time = MBM_TIME_DELTA_MAX;
> +
> +     rmid = vrmid;
> +     if ((diff_time <= MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN)
> ||
> +                     overflow) {
> +             int bw, ret;
> +
> +             if (index   & (index < mbm_window_size))
> +                     val = cma * MBM_SCALING_FACTOR  + val / index -
> +                                     cma / index;
> +             val = (val + MBM_SCALING_HALF) / MBM_SCALING_FACTOR;
> +             if (index >= mbm_window_size) {
> +                     /*
> +                      * Compute the sum of recent n-1 samples and slide the
> +                      * window by 1
> +                      */
> +                     ret = mbm_fifo_sum_lastn_out(rmid, read_mbm_local);
> +                     /* recalculate the running average with current bw */
> +                     ret = (ret + val) / mbm_window_size;
> +                     if (ret < 0)
> +                             ret = 0;
> +                     bw = val;
> +                     val = ret;
> +             } else
> +                     bw = val;
> +             /* save the recent bw in fifo */
> +             mbm_fifo_in(rmid, bw, read_mbm_local);
> +
> +             if (read_mbm_local) {
> +                     mbm_local[rmid].index++;
> +                     mbm_local[rmid].cum_avg = val;
> +                     mbm_local[rmid].bytes = tmp;
> +                     mbm_local[rmid].prev_time = ktime_set(0,
> +                                     (unsigned long)ktime_to_ns(cur_time));
> +             } else {
> +                     mbm_total[rmid].index++;
> +                     mbm_total[rmid].cum_avg = val;
> +                     mbm_total[rmid].bytes = tmp;
> +                     mbm_total[rmid].prev_time = ktime_set(0,
> +                                     (unsigned long)ktime_to_ns(cur_time));
> +             }
> +             return val;
> +     }
> +     return  cma;
> +}
> +
>  /*
>   * Initially use this constant for both the limbo queue time and the
>   * rotation timer interval, pmu::hrtimer_interval_ms.
> @@ -568,7 +790,8 @@ static bool intel_cqm_rmid_stabilize(unsigned int
> *available)
>       /*
>        * Test whether an RMID is free for each package.
>        */
> -     on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
> +     if (entry->config)
> +             on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL,
> true);
> 
>       list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
>               /*
> @@ -846,8 +1069,9 @@ static void intel_cqm_setup_event(struct perf_event
> *event,
>                                 struct perf_event **group)
>  {
>       struct perf_event *iter;
> -     bool conflict = false;
> +
>       u32 rmid;
> +     bool conflict = false;
> 
>       list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
>               rmid = iter->hw.cqm_rmid;
> @@ -875,6 +1099,40 @@ static void intel_cqm_setup_event(struct perf_event
> *event,
>       event->hw.cqm_rmid = rmid;
>  }
> 
> +static void intel_cqm_event_update(struct perf_event *event) {
> +     unsigned int rmid;
> +     u64 val = 0;
> +
> +     /*
> +      * Task events are handled by intel_cqm_event_count().
> +      */
> +     if (event->cpu == -1)
> +             return;
> +
> +     rmid = event->hw.cqm_rmid;
> +     if (!__rmid_valid(rmid))
> +             return;
> +
> +     switch (event->attr.config) {
> +     case QOS_MBM_TOTAL_EVENT_ID:
> +             val = __rmid_read_mbm(rmid, false);
> +             break;
> +     case QOS_MBM_LOCAL_EVENT_ID:
> +             val = __rmid_read_mbm(rmid, true);
> +             break;
> +     default:
> +             break;
> +     }
> +     /*
> +      * Ignore this reading on error states and do not update the value.
> +      */
> +     if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> +             return;
> +
> +     local64_set(&event->count, val);
> +}
> +
>  static void intel_cqm_event_read(struct perf_event *event)  {
>       unsigned long flags;
> @@ -887,6 +1145,10 @@ static void intel_cqm_event_read(struct perf_event
> *event)
>       if (event->cpu == -1)
>               return;
> 
> +     if  ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> +          (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> +             intel_cqm_event_update(event);
> +
>       raw_spin_lock_irqsave(&cache_lock, flags);
>       rmid = event->hw.cqm_rmid;
> 
> @@ -906,6 +1168,28 @@ out:
>       raw_spin_unlock_irqrestore(&cache_lock, flags);  }
> 
> +static void __intel_cqm_event_total_bw_count(void *info) {
> +     struct rmid_read *rr = info;
> +     u64 val;
> +
> +     val = __rmid_read_mbm(rr->rmid, false);
> +     if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> +             return;
> +     atomic64_add(val, &rr->value);
> +}
> +
> +static void __intel_cqm_event_local_bw_count(void *info) {
> +     struct rmid_read *rr = info;
> +     u64 val;
> +
> +     val = __rmid_read_mbm(rr->rmid, true);
> +     if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> +             return;
> +     atomic64_add(val, &rr->value);
> +}
> +
>  static void __intel_cqm_event_count(void *info)  {
>       struct rmid_read *rr = info;
> @@ -967,7 +1251,21 @@ static u64 intel_cqm_event_count(struct perf_event
> *event)
>       if (!__rmid_valid(rr.rmid))
>               goto out;
> 
> -     on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
> +     switch (event->attr.config) {
> +     case  QOS_L3_OCCUP_EVENT_ID:
> +             on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_count, &rr, 1);
> +             break;
> +     case  QOS_MBM_TOTAL_EVENT_ID:
> +             on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_total_bw_count,
> +                              &rr, 1);
> +             break;
> +     case  QOS_MBM_LOCAL_EVENT_ID:
> +             on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_local_bw_count,
> +                              &rr, 1);
> +             break;
> +     default:
> +             break;
> +     }
> 
>       raw_spin_lock_irqsave(&cache_lock, flags);
>       if (event->hw.cqm_rmid == rr.rmid)
> @@ -977,6 +1275,39 @@ out:
>       return __perf_event_count(event);
>  }
> 
> +static void mbm_start_hrtimer(struct mbm_pmu *pmu) {
> +     __hrtimer_start_range_ns(&(pmu->hrtimer),
> +                              pmu->timer_interval, 0,
> +                              HRTIMER_MODE_REL_PINNED, 0);
> +}
> +
> +static void mbm_stop_hrtimer(struct mbm_pmu *pmu) {
> +     hrtimer_cancel(&pmu->hrtimer);
> +}
> +
> +static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
> +{
> +     struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
> +     struct perf_event *event;
> +
> +     if (!pmu->n_active)
> +             return HRTIMER_NORESTART;
> +     list_for_each_entry(event, &pmu->active_list, active_entry)
> +             intel_cqm_event_update(event);
> +     hrtimer_forward_now(hrtimer, pmu->timer_interval);
> +     return HRTIMER_RESTART;
> +}
> +
> +static void mbm_hrtimer_init(struct mbm_pmu *pmu) {
> +     struct hrtimer *hr = &pmu->hrtimer;
> +
> +     hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +     hr->function = mbm_hrtimer_handle;
> +}
> +
>  static void intel_cqm_event_start(struct perf_event *event, int mode)  {
>       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); @@ -995,19
> +1326,48 @@ static void intel_cqm_event_start(struct perf_event *event, int
> mode)
>       }
> 
>       state->rmid = rmid;
> +     if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
> +             struct cqm_rmid_entry *entry;
> +
> +             entry = __rmid_entry(rmid);
> +             entry->config = true;
> +     }
>       wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
> +
> +     if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> +          (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))  &&
> (cqm_mbm)) {
> +             int cpu = get_cpu();
> +             struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> +
> +             if (pmu) {
> +                     if (pmu->n_active == 0)
> +                             mbm_hrtimer_init(pmu);
> +                     pmu->n_active++;
> +                     list_add_tail(&event->active_entry,
> +                                   &pmu->active_list);
> +                     if (pmu->n_active == 1)
> +                             mbm_start_hrtimer(pmu);
> +             }
> +             put_cpu();
> +     }
>  }
> 
>  static void intel_cqm_event_stop(struct perf_event *event, int mode)  {
>       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> +     struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
> 
>       if (event->hw.cqm_state & PERF_HES_STOPPED)
>               return;
> 
>       event->hw.cqm_state |= PERF_HES_STOPPED;
> 
> -     intel_cqm_event_read(event);
> +     if (event->attr.config & QOS_L3_OCCUP_EVENT_ID)
> +             intel_cqm_event_read(event);
> +
> +     if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> +         (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> +             intel_cqm_event_update(event);
> 
>       if (!--state->rmid_usecnt) {
>               state->rmid = 0;
> @@ -1015,8 +1375,18 @@ static void intel_cqm_event_stop(struct perf_event
> *event, int mode)
>       } else {
>               WARN_ON_ONCE(!state->rmid);
>       }
> +
> +     if (pmu) {
> +             WARN_ON_ONCE(pmu->n_active <= 0);
> +             pmu->n_active--;
> +             if (pmu->n_active == 0)
> +                     mbm_stop_hrtimer(pmu);
> +             list_del(&event->active_entry);
> +     }
> +
>  }
> 
> +
>  static int intel_cqm_event_add(struct perf_event *event, int mode)  {
>       unsigned long flags;
> @@ -1082,7 +1452,9 @@ static int intel_cqm_event_init(struct perf_event
> *event)
>       if (event->attr.type != intel_cqm_pmu.type)
>               return -ENOENT;
> 
> -     if (event->attr.config & ~QOS_EVENT_MASK)
> +     if (((event->attr.config & ~QOS_EVENT_MASK) &&
> +          (event->attr.config & ~QOS_MBM_TOTAL_EVENT_ID)) &&
> +          (event->attr.config & ~QOS_MBM_LOCAL_EVENT_ID))
>               return -EINVAL;
> 
>       /* unsupported modes and filters */
> @@ -1137,18 +1509,63 @@ EVENT_ATTR_STR(llc_occupancy.unit,
> intel_cqm_llc_unit, "Bytes");  EVENT_ATTR_STR(llc_occupancy.scale,
> intel_cqm_llc_scale, NULL);  EVENT_ATTR_STR(llc_occupancy.snapshot,
> intel_cqm_llc_snapshot, "1");
> 
> -static struct attribute *intel_cqm_events_attr[] = {
> +EVENT_ATTR_STR(llc_total_bw, intel_cqm_llc_total_bw, "event=0x02");
> +EVENT_ATTR_STR(llc_total_bw.per-pkg, intel_cqm_llc_total_bw_pkg, "1");
> +EVENT_ATTR_STR(llc_total_bw.unit, intel_cqm_llc_total_bw_unit,
> +"MB/sec"); EVENT_ATTR_STR(llc_total_bw.scale,
> +intel_cqm_llc_total_bw_scale, NULL);
> +EVENT_ATTR_STR(llc_total_bw.snapshot, intel_cqm_llc_total_bw_snapshot,
> +"1");
> +
> +EVENT_ATTR_STR(llc_local_bw, intel_cqm_llc_local_bw, "event=0x03");
> +EVENT_ATTR_STR(llc_local_bw.per-pkg, intel_cqm_llc_local_bw_pkg, "1");
> +EVENT_ATTR_STR(llc_local_bw.unit, intel_cqm_llc_local_bw_unit,
> +"MB/sec"); EVENT_ATTR_STR(llc_local_bw.scale,
> +intel_cqm_llc_local_bw_scale, NULL);
> +EVENT_ATTR_STR(llc_local_bw.snapshot, intel_cqm_llc_local_bw_snapshot,
> +"1");
> +
> +static struct attribute *intel_cmt_events_attr[] = {
> +     EVENT_PTR(intel_cqm_llc),
> +     EVENT_PTR(intel_cqm_llc_pkg),
> +     EVENT_PTR(intel_cqm_llc_unit),
> +     EVENT_PTR(intel_cqm_llc_scale),
> +     EVENT_PTR(intel_cqm_llc_snapshot),
> +     NULL,
> +};
> +
> +static struct attribute *intel_mbm_events_attr[] = {
> +     EVENT_PTR(intel_cqm_llc_total_bw),
> +     EVENT_PTR(intel_cqm_llc_local_bw),
> +     EVENT_PTR(intel_cqm_llc_total_bw_pkg),
> +     EVENT_PTR(intel_cqm_llc_local_bw_pkg),
> +     EVENT_PTR(intel_cqm_llc_total_bw_unit),
> +     EVENT_PTR(intel_cqm_llc_local_bw_unit),
> +     EVENT_PTR(intel_cqm_llc_total_bw_scale),
> +     EVENT_PTR(intel_cqm_llc_local_bw_scale),
> +     EVENT_PTR(intel_cqm_llc_total_bw_snapshot),
> +     EVENT_PTR(intel_cqm_llc_local_bw_snapshot),
> +     NULL,
> +};
> +
> +static struct attribute *intel_cmt_mbm_events_attr[] = {
>       EVENT_PTR(intel_cqm_llc),
> +     EVENT_PTR(intel_cqm_llc_total_bw),
> +     EVENT_PTR(intel_cqm_llc_local_bw),
>       EVENT_PTR(intel_cqm_llc_pkg),
> +     EVENT_PTR(intel_cqm_llc_total_bw_pkg),
> +     EVENT_PTR(intel_cqm_llc_local_bw_pkg),
>       EVENT_PTR(intel_cqm_llc_unit),
> +     EVENT_PTR(intel_cqm_llc_total_bw_unit),
> +     EVENT_PTR(intel_cqm_llc_local_bw_unit),
>       EVENT_PTR(intel_cqm_llc_scale),
> +     EVENT_PTR(intel_cqm_llc_total_bw_scale),
> +     EVENT_PTR(intel_cqm_llc_local_bw_scale),
>       EVENT_PTR(intel_cqm_llc_snapshot),
> +     EVENT_PTR(intel_cqm_llc_total_bw_snapshot),
> +     EVENT_PTR(intel_cqm_llc_local_bw_snapshot),
>       NULL,
>  };
> 
>  static struct attribute_group intel_cqm_events_group = {
>       .name = "events",
> -     .attrs = intel_cqm_events_attr,
> +     .attrs = NULL,
>  };
> 
>  PMU_FORMAT_ATTR(event, "config:0-7");
> @@ -1176,6 +1593,19 @@ max_recycle_threshold_show(struct device *dev,
> struct device_attribute *attr,  }
> 
>  static ssize_t
> +sliding_window_size_show(struct device *dev, struct device_attribute *attr,
> +                      char *page)
> +{
> +     ssize_t rv;
> +
> +     mutex_lock(&cache_mutex);
> +     rv = snprintf(page, PAGE_SIZE-1, "%u\n", mbm_window_size);
> +     mutex_unlock(&cache_mutex);
> +
> +     return rv;
> +}
> +
> +static ssize_t
>  max_recycle_threshold_store(struct device *dev,
>                           struct device_attribute *attr,
>                           const char *buf, size_t count)
> @@ -1203,10 +1633,35 @@ max_recycle_threshold_store(struct device *dev,
>       return count;
>  }
> 
> +static ssize_t
> +sliding_window_size_store(struct device *dev,
> +                       struct device_attribute *attr,
> +                       const char *buf, size_t count)
> +{
> +     unsigned int bytes;
> +     int ret;
> +
> +     ret = kstrtouint(buf, 0, &bytes);
> +     if (ret)
> +             return ret;
> +
> +     mutex_lock(&cache_mutex);
> +     if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> +             mbm_window_size = bytes;
> +     else
> +             bytes = MBM_FIFO_SIZE_MIN;
> +
> +     mutex_unlock(&cache_mutex);
> +
> +     return count;
> +}
> +
>  static DEVICE_ATTR_RW(max_recycle_threshold);
> +static DEVICE_ATTR_RW(sliding_window_size);
> 
>  static struct attribute *intel_cqm_attrs[] = {
>       &dev_attr_max_recycle_threshold.attr,
> +     &dev_attr_sliding_window_size.attr,
>       NULL,
>  };
> 
> @@ -1241,16 +1696,17 @@ static inline void cqm_pick_event_reader(int cpu)
> 
>       for_each_cpu(i, &cqm_cpumask) {
>               if (phys_id == topology_physical_package_id(i))
> -                     return; /* already got reader for this socket */
> +                     return; /* already got reader for this socket */
>       }
> 
>       cpumask_set_cpu(cpu, &cqm_cpumask);
>  }
> 
> -static void intel_cqm_cpu_prepare(unsigned int cpu)
> +static int intel_cqm_cpu_prepare(unsigned int cpu)
>  {
>       struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
>       struct cpuinfo_x86 *c = &cpu_data(cpu);
> +     struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> 
>       state->rmid = 0;
>       state->closid = 0;
> @@ -1258,12 +1714,27 @@ static void intel_cqm_cpu_prepare(unsigned int
> cpu)
> 
>       WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
>       WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
> +
> +     if ((!pmu) && (cqm_mbm)) {
> +             pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL,
> NUMA_NO_NODE);
> +             if (!pmu)
> +                     return  -ENOMEM;
> +             spin_lock_init(&pmu->lock);
> +             INIT_LIST_HEAD(&pmu->active_list);
> +             pmu->pmu = &intel_cqm_pmu;
> +             pmu->n_active = 0;
> +             pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
> +             per_cpu(mbm_pmu, cpu) = pmu;
> +             per_cpu(mbm_pmu_to_free, cpu) = NULL;
> +     }
> +     return 0;
>  }
> 
>  static void intel_cqm_cpu_exit(unsigned int cpu)  {
>       int phys_id = topology_physical_package_id(cpu);
>       int i;
> +     struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> 
>       /*
>        * Is @cpu a designated cqm reader?
> @@ -1280,6 +1751,13 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
>                       break;
>               }
>       }
> +
> +     /* cancel overflow polling timer for CPU */
> +     if (pmu)
> +             mbm_stop_hrtimer(pmu);
> +     kfree(mbm_local);
> +     kfree(mbm_total);
> +
>  }
> 
>  static int intel_cqm_cpu_notifier(struct notifier_block *nb, @@ -1289,7
> +1767,7 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
> 
>       switch (action & ~CPU_TASKS_FROZEN) {
>       case CPU_UP_PREPARE:
> -             intel_cqm_cpu_prepare(cpu);
> +             return intel_cqm_cpu_prepare(cpu);
>               break;
>       case CPU_DOWN_PREPARE:
>               intel_cqm_cpu_exit(cpu);
> @@ -1305,17 +1783,74 @@ static int intel_cqm_cpu_notifier(struct
> notifier_block *nb,  static const struct x86_cpu_id intel_cqm_match[] = {
>       { .vendor = X86_VENDOR_INTEL, .feature =
> X86_FEATURE_CQM_OCCUP_LLC },
>       {}
> +     }, intel_mbm_match[] = {
> +     { .vendor = X86_VENDOR_INTEL, .feature =
> X86_FEATURE_CQM_MBM_LOCAL },
> +     {}
>  };
> 
>  static int __init intel_cqm_init(void)
>  {
>       char *str, scale[20];
> -     int i, cpu, ret;
> +     int i = 0, cpu, ret;
> 
> -     if (!x86_match_cpu(intel_cqm_match))
> +     if (!x86_match_cpu(intel_cqm_match) &&
> +        (!x86_match_cpu(intel_mbm_match)))
>               return -ENODEV;
> 
>       cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> +     cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> +
> +     if (x86_match_cpu(intel_cqm_match)) {
> +             cqm_llc_occ = true;
> +             intel_cqm_events_group.attrs = intel_cmt_events_attr;
> +     } else
> +             cqm_llc_occ = false;
> +
> +     if (x86_match_cpu(intel_mbm_match)) {
> +             u32 mbm_scale_rounded = 0;
> +
> +             cqm_mbm = true;
> +             cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> +             /*
> +              * MBM counter values are  in bytes. To conver this to MB/sec,
> +              * we  scale the MBM scale factor by 1000. Another 1000 factor
> +              * scaling down is done
> +              * after reading the counter val i.e. in the function
> +              * __rmid_read_mbm()
> +              */
> +             mbm_scale_rounded =  (cqm_l3_scale + 500) / 1000;
> +             cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> +             snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
> +             str = kstrdup(scale, GFP_KERNEL);
> +             if (!str) {
> +                     ret = -ENOMEM;
> +                     goto out;
> +             }
> +
> +             if (cqm_llc_occ)
> +                     intel_cqm_events_group.attrs =
> +                             intel_cmt_mbm_events_attr;
> +             else
> +                     intel_cqm_events_group.attrs =
> intel_mbm_events_attr;
> +
> +             event_attr_intel_cqm_llc_local_bw_scale.event_str
> +                   = event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
> +             mbm_local = kzalloc_node(sizeof(struct sample) *
> +                             (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
> +                                      GFP_KERNEL, NUMA_NO_NODE);
> +             if (!mbm_local) {
> +                     ret = -ENOMEM;
> +                     goto out;
> +             }
> +             mbm_total = kzalloc_node(sizeof(struct sample) *
> +                             (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
> +                                      GFP_KERNEL, NUMA_NO_NODE);
> +             if (!mbm_total) {
> +                     ret = -ENOMEM;
> +                     goto out;
> +             }
> +     } else
> +             cqm_mbm = false;
> 
>       /*
>        * It's possible that not all resources support the same number @@ -
> 1328,44 +1863,48 @@ static int __init intel_cqm_init(void)
>        */
>       cpu_notifier_register_begin();
> 
> -     for_each_online_cpu(cpu) {
> -             struct cpuinfo_x86 *c = &cpu_data(cpu);
> +     if (cqm_llc_occ) {
> +             for_each_online_cpu(cpu) {
> +                     struct cpuinfo_x86 *c = &cpu_data(cpu);
> 
> -             if (c->x86_cache_max_rmid < cqm_max_rmid)
> -                     cqm_max_rmid = c->x86_cache_max_rmid;
> +                     if (c->x86_cache_max_rmid < cqm_max_rmid)
> +                             cqm_max_rmid = c->x86_cache_max_rmid;
> 
> -             if (c->x86_cache_occ_scale != cqm_l3_scale) {
> -                     pr_err("Multiple LLC scale values, disabling\n");
> -                     ret = -EINVAL;
> -                     goto out;
> +                     if (c->x86_cache_occ_scale != cqm_l3_scale) {
> +                             pr_err("Multiple LLC scale values, 
> disabling\n");
> +                             ret = -EINVAL;
> +                             goto out;
> +                     }
>               }
> -     }
> 
> -     /*
> -      * A reasonable upper limit on the max threshold is the number
> -      * of lines tagged per RMID if all RMIDs have the same number of
> -      * lines tagged in the LLC.
> -      *
> -      * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> -      */
> -     __intel_cqm_max_threshold =
> +             /*
> +              * A reasonable upper limit on the max threshold is the number
> +              * of lines tagged per RMID if all RMIDs have the same number
> of
> +              * lines tagged in the LLC.
> +              *
> +              * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> +              */
> +             __intel_cqm_max_threshold =
>               boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
> 
> -     snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> -     str = kstrdup(scale, GFP_KERNEL);
> -     if (!str) {
> -             ret = -ENOMEM;
> -             goto out;
> -     }
> +             snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> +             str = kstrdup(scale, GFP_KERNEL);
> +             if (!str) {
> +                     ret = -ENOMEM;
> +                     goto out;
> +             }
> 
> -     event_attr_intel_cqm_llc_scale.event_str = str;
> +             event_attr_intel_cqm_llc_scale.event_str = str;
> +     }
> 
>       ret = intel_cqm_setup_rmid_cache();
>       if (ret)
>               goto out;
> 
>       for_each_online_cpu(i) {
> -             intel_cqm_cpu_prepare(i);
> +             ret = intel_cqm_cpu_prepare(i);
> +             if (ret)
> +                     goto out;
>               cqm_pick_event_reader(i);
>       }
> 
> --
> 2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to