Well, Ok.

I re-implemented this patch of patch set and also implemented a unit 
test that IMHO mimics the case mentioned above to check if it solves the issue.

The test is a single-threaded application that creates 272 fds for each of
two INSTRUCTIONS_RETIRED events, covering the 272 cores of an Intel Xeon Phi.

The test repeatedly executes 1 million instructions while the events are
enabled and reads the events' counts, TOTAL_ENABLED and TOTAL_RUNNING,
via the read() system call.

The first event is allocated, enabled and disabled at the beginning and
the end of the test execution phase, whereas the second event's allocation and
measurements are included within the measurement interval of the first event.

Below is what I am getting now when running the test on the patched kernel:

EID     CPU     COUNT           T_ENABLED       T_RUNNING        SCALE
--- 0 enabled ---
--- million instructions ---
0       26      1334            1666671         137366            8.24
0       94      138637          1735872         527098           30.37
0       162     874166          1823695         1083785          59.43
--- million instructions ---
0       26      1334            3328832         137366            4.13
0       94      1164146         3318599         2109825          63.58
0       162     874166          3390716         1083785          31.96
--- million instructions ---
0       26      1334            4835955         137366            2.84
0       94      2189671         4820750         3611976          74.93
0       162     874166          4934513         1083785          21.96
--- 1 enabled ---
--- million instructions ---
0       26      1334            22661310        137366            0.61
0       94      3248450         22667165        21458391         94.67
0       162     874166          22742990        1083785           4.77
1       94      1033387         2150307         2150307         100.00
--- million instructions ---
0       26      1334            24878504        137366            0.55
0       94      4289784         24869644        23660870         95.14
0       162     874166          24943564        1083785           4.34
1       94      2074675         4370708         4370708         100.00
--- 1 disabled ---
--- million instructions ---
0       26      1334            27681611        137366            0.50
0       94      5337752         27675968        26467194         95.63
0       162     874166          27749528        1083785           3.91
1       94      2089278         5024218         5024218         100.00
--- 0 disabled ---
--- million instructions ---
0       26      1334            29004868        137366            0.47
0       94      5372734         28964075        27755301         95.83
0       162     874166          29005751        1083785           3.74
1       94      2089278         5024218         5024218         100.00

The output demonstrates that the test thread migrated twice during execution,
thus several fds were employed for measuring the amount of executed instructions.

Also, the output shows that the T_RUNNING values of the events are updated and
maintained so that the sums of the SCALE values for every event stay near 100%
(no multiplexing) after every million-instruction execution.

Unit test code is attached for convenience.

The key thing in the patch is explicit updating of tstamp fields for
INACTIVE events in update_event_times().

Signed-off-by: Alexey Budankov <alexey.budan...@linux.intel.com>
---
kernel/events/core.c | 52 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0a4f619..d195fdc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1391,6 +1391,27 @@ static u64 perf_event_time(struct perf_event *event)
        return ctx ? ctx->time : 0;
 }
 
+void perf_event_tstamp_init(struct perf_event *event)
+{
+       u64 tstamp = perf_event_time(event);
+
+       event->tstamp_enabled = tstamp;
+       event->tstamp_running = tstamp;
+       event->tstamp_stopped = tstamp;
+}
+
+void perf_event_tstamp_update(struct perf_event *event)
+{
+       u64 tstamp, delta;
+
+       tstamp = perf_event_time(event);
+
+       delta = tstamp - event->tstamp_stopped;
+
+       event->tstamp_running += delta;
+       event->tstamp_stopped = tstamp;
+}
+
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
  */
@@ -1405,6 +1426,9 @@ static void update_event_times(struct perf_event *event)
            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
                return;
 
+       if (event->state == PERF_EVENT_STATE_INACTIVE)
+               perf_event_tstamp_update(event);
+
        /*
         * in cgroup mode, time_enabled represents
         * the time the event was enabled AND active
@@ -1430,7 +1454,6 @@ static void update_event_times(struct perf_event *event)
                run_end = perf_event_time(event);
 
        event->total_time_running = run_end - event->tstamp_running;
-
 }
 
 /*
@@ -1954,9 +1977,6 @@ event_sched_out(struct perf_event *event,
                  struct perf_cpu_context *cpuctx,
                  struct perf_event_context *ctx)
 {
-       u64 tstamp = perf_event_time(event);
-       u64 delta;
-
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);
 
@@ -1967,18 +1987,15 @@ event_sched_out(struct perf_event *event,
         * via read() for time_enabled, time_running:
         */
        if (event->state == PERF_EVENT_STATE_INACTIVE &&
-           !event_filter_match(event)) {
-               delta = tstamp - event->tstamp_stopped;
-               event->tstamp_running += delta;
-               event->tstamp_stopped = tstamp;
-       }
+           !event_filter_match(event))
+               perf_event_tstamp_update(event);
 
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;
 
        perf_pmu_disable(event->pmu);
 
-       event->tstamp_stopped = tstamp;
+       event->tstamp_stopped = perf_event_time(event);
        event->pmu->del(event, 0);
        event->oncpu = -1;
        event->state = PERF_EVENT_STATE_INACTIVE;
@@ -2294,7 +2311,6 @@ group_sched_in(struct perf_event *group_event,
 {
        struct perf_event *event, *partial_group = NULL;
        struct pmu *pmu = ctx->pmu;
-       u64 now = ctx->time;
        bool simulate = false;
 
        if (group_event->state == PERF_EVENT_STATE_OFF)
@@ -2340,12 +2356,10 @@ group_sched_in(struct perf_event *group_event,
                if (event == partial_group)
                        simulate = true;
 
-               if (simulate) {
-                       event->tstamp_running += now - event->tstamp_stopped;
-                       event->tstamp_stopped = now;
-               } else {
+               if (simulate)
+                       perf_event_tstamp_update(event);
+               else
                        event_sched_out(event, cpuctx, ctx);
-               }
        }
        event_sched_out(group_event, cpuctx, ctx);
 
@@ -2390,13 +2404,9 @@ static int group_can_go_on(struct perf_event *event,
 static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
 {
-       u64 tstamp = perf_event_time(event);
-
        list_add_event(event, ctx);
        perf_group_attach(event);
-       event->tstamp_enabled = tstamp;
-       event->tstamp_running = tstamp;
-       event->tstamp_stopped = tstamp;
+       perf_event_tstamp_init(event);
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx,
/* check_multiplexing_read.c                                            */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <unistd.h>
#include <errno.h>

#include <sys/mman.h>

#include <sys/ioctl.h>
#include <asm/unistd.h>

#include "perf_event.h"
#include "test_utils.h"
#include "perf_helpers.h"
#include "instructions_testcode.h"

#define NUM_CPUS 272
#define NUM_EVENTS 2

int fd[NUM_EVENTS][NUM_CPUS];
long long events[NUM_EVENTS]={
        PERF_COUNT_HW_CPU_CYCLES,
        PERF_COUNT_HW_INSTRUCTIONS
};

static long long base_results[NUM_CPUS][3];

#define TIME_ENABLED    1
#define TIME_RUNNING    2

static int test_routine(void) {

        int i,result;

        printf("--- million instructions ---\n");

        for(i=0;i<1;i++) {
                result=instructions_million();
        }

        return result;
}

void alloc_events(long long config, int fd[NUM_CPUS])
{
        int ret,i,j;

        struct perf_event_attr pe;

        for(j=0;j<NUM_CPUS;j++) {
                memset(&pe,0,sizeof(struct perf_event_attr));
                pe.type=PERF_TYPE_HARDWARE;
                pe.size=sizeof(struct perf_event_attr);
                pe.config=config;
                pe.disabled=1;
                pe.exclude_kernel=1;
                pe.exclude_hv=1;
                pe.read_format=PERF_FORMAT_TOTAL_TIME_ENABLED |
                                PERF_FORMAT_TOTAL_TIME_RUNNING;

                fd[j]=perf_event_open(&pe,0,j,-1,0);
                if (fd[j]<0) {
                        fprintf(stderr,"Failed adding mpx event 0 %s\n",
                                strerror(errno));
                        return -1;
                }
        }
}

void free_events(int fd[NUM_CPUS])
{
        int j;
        for(j=0;j<NUM_CPUS;j++) {
                close(fd[j]);
        }
}

void enable_events(int fd[NUM_CPUS])
{
        int j,ret;
        for(j=0;j<NUM_CPUS;j++) {
                ret=ioctl(fd[j], PERF_EVENT_IOC_ENABLE,0);
                if (ret<0) {
                        fprintf(stderr,"Error starting event fd[%d]\n",j);
                }
        }
}

void disable_events(int fd[NUM_CPUS])
{
        int j,ret;
        for(j=0;j<NUM_CPUS;j++) {
                ret=ioctl(fd[j], PERF_EVENT_IOC_DISABLE,0);
                if (ret<0) {
                        fprintf(stderr,"Error stopping event fd[%d]\n",j);
                }
        }
}

void read_events(int i, int fd[NUM_CPUS])
{
        int j, ret;
        for(j=0;j<NUM_CPUS;j++) {
                ret=read(fd[j],&base_results[j],3*sizeof(long long));
                if (ret<3*sizeof(long long)) {
                        fprintf(stderr,"Event fd[0][%d] unexpected read size 
%d\n",j,ret);
                        return;
                }
                if (base_results[j][0])
                        printf("%d\t%d\t%lld\t\t\t%lld\t\t\t%lld\t\t\t%.2f\n", 
i, j,
                                base_results[j][0],
                                base_results[j][TIME_ENABLED],
                                base_results[j][TIME_RUNNING],
                                (double)base_results[j][TIME_RUNNING]/
                                (double)base_results[j][TIME_ENABLED] * 100.);
        }
}

int main(int argc, char** argv) {

        int ret,quiet,i,j;

        struct perf_event_attr pe;
        
        char test_string[]="Testing ...";

        printf("\nEID\tCPU\tCOUNT\t\t\tT_ENABLED\t\t\tT_RUNNING\t\t\tSCALE\n");

        alloc_events(events[1], fd[0]);
        enable_events(fd[0]);
        printf("--- 1 enabled\n");

        test_routine();
        read_events(0, fd[0]);

        test_routine();
        read_events(0, fd[0]);

        test_routine();
        read_events(0, fd[0]);

        alloc_events(events[1], fd[1]);
        enable_events(fd[1]);
        printf("--- 1 enabled\n");

        test_routine();
        read_events(0, fd[0]);
        read_events(1, fd[1]);

        test_routine();
        read_events(0, fd[0]);
        read_events(1, fd[1]);

        disable_events(fd[1]);
        printf("--- 1 disabled\n");

        test_routine();
        read_events(0,fd[0]);
        read_events(1,fd[1]);

        disable_events(fd[0]);
        printf("--- 0 disabled\n");
        test_routine();

        read_events(0,fd[0]);
        read_events(1,fd[1]);

        free_events(fd[1]);
        free_events(fd[0]);

        test_pass(test_string);

        return 0;
}

Reply via email to