Ironlake's counters are always enabled; userspace can simply send an MI_REPORT_PERF_COUNT packet to take a snapshot of them. This makes it easy to implement.
The counters are documented in the source code for the intel-gpu-tools intel_perf_counters utility. Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> --- src/mesa/drivers/dri/i965/Makefile.sources | 1 + src/mesa/drivers/dri/i965/brw_context.c | 4 + src/mesa/drivers/dri/i965/brw_context.h | 7 + src/mesa/drivers/dri/i965/brw_defines.h | 7 + .../drivers/dri/i965/brw_performance_monitor.c | 372 +++++++++++++++++++++ src/mesa/drivers/dri/intel/intel_extensions.c | 3 + 6 files changed, 394 insertions(+) create mode 100644 src/mesa/drivers/dri/i965/brw_performance_monitor.c diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index be8d630..a9c2754 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -68,6 +68,7 @@ i965_FILES = \ brw_gs_state.c \ brw_lower_texture_gradients.cpp \ brw_misc_state.c \ + brw_performance_monitor.c \ brw_program.c \ brw_primitive_restart.c \ brw_queryobj.c \ diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index ceaf325..b8bb1b5 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -415,6 +415,10 @@ brwCreateContext(int api, _mesa_initialize_dispatch_tables(ctx); _mesa_initialize_vbo_vtxfmt(ctx); + if (ctx->Extensions.AMD_performance_monitor) { + brw_init_performance_monitors(brw); + } + return true; } diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 114c369..4a203a2 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1060,6 +1060,10 @@ struct brw_context bool begin_emitted; } query; + struct { + uint32_t total_counter_size; + } perfmon; + int num_atoms; const struct brw_tracked_state **atoms; @@ -1212,6 +1216,9 @@ void brw_upload_ubo_surfaces(struct brw_context *brw, struct gl_shader *shader, uint32_t *surf_offsets); +/* 
brw_performance_monitor.c */ +void brw_init_performance_monitors(struct brw_context *brw); + /* gen6_sol.c */ void brw_begin_transform_feedback(struct gl_context *ctx, GLenum mode, diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index a13f9dc..1fea1d8 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1649,6 +1649,13 @@ enum brw_wm_barycentric_interp_mode { #define CMD_MI_FLUSH 0x0200 +#define GEN5_MI_REPORT_PERF_COUNT ((0x26 << 23) | (3 - 2)) +/* DW0 */ +# define GEN5_MI_COUNTER_SET_0 (0 << 6) +# define GEN5_MI_COUNTER_SET_1 (1 << 6) +/* DW1 */ +# define MI_COUNTER_ADDRESS_GTT (1 << 0) +/* DW2: a user-defined report ID (written to the buffer but can be anything) */ /* Bitfields for the URB_WRITE message, DW2 of message header: */ #define URB_WRITE_PRIM_END 0x1 diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c new file mode 100644 index 0000000..b351193 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c @@ -0,0 +1,372 @@ +/* + * Copyright © 2012 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_performance_monitor.c + * + * Implementation of the GL_AMD_performance_monitor extension. + * + * Currently only for Ironlake. + */ + +#include <limits.h> + +#include "main/bitset.h" +#include "main/macros.h" +#include "main/mtypes.h" +#include "main/performance_monitor.h" + +#include "brw_context.h" +#include "brw_defines.h" +#include "intel_batchbuffer.h" + +/** + * i965 representation of a performance monitor object. + */ +struct brw_perf_monitor_object +{ + /** The base class. */ + struct gl_perf_monitor_object base; + + /** + * The GPU-facing BO, holding raw counter data in a hardware specific form. + */ + drm_intel_bo *gpu_facing_bo; +}; + +/** Downcasting convenience macro. 
*/
+static inline struct brw_perf_monitor_object *
+brw_perf_monitor(struct gl_perf_monitor_object *m)
+{
+   /* Plain pointer cast from the base-class pointer to the driver type. */
+   return (struct brw_perf_monitor_object *) m;
+}
+
+/******************************************************************************/
+
+/**
+ * Group information:
+ * @{
+ */
+enum group_id {
+   A_COUNTERS = 0,
+};
+
+/* Indexed by enum group_id.
+ * NOTE(review): the second field is presumably the maximum number of
+ * simultaneously active counters (INT_MAX == no limit) -- confirm against
+ * struct gl_perf_monitor_group.
+ */
+static const struct gl_perf_monitor_group perf_groups[] = {
+   [A_COUNTERS] = { "Aggregating Counters", INT_MAX }
+};
+/** @} */
+
+
+/**
+ * Ironlake counter information:
+ *
+ * Counter IDs are consecutive, starting at GEN5_FIRST_A_COUNTER.
+ * @{
+ */
+enum gen5_counter_id {
+   GEN5_FIRST_A_COUNTER = 0,
+
+   GEN5_CS_STARVED = GEN5_FIRST_A_COUNTER,
+   GEN5_CS_STALLED,
+   GEN5_VF_STARVED,
+   GEN5_VF_STALLED,
+   GEN5_VS_STARVED,
+   GEN5_VS_STALLED,
+   GEN5_GS_STARVED,
+   GEN5_GS_STALLED,
+   GEN5_CL_STARVED,
+   GEN5_CL_STALLED,
+   GEN5_SF_STARVED,
+   GEN5_SF_STALLED,
+   GEN5_WZ_STARVED,
+   GEN5_WZ_STALLED,
+   GEN5_Z_BUFFER_READ_WRITE,
+   GEN5_EU_ACTIVE,
+   GEN5_EU_SUSPENDED,
+   GEN5_THREADS_LOADED,
+   GEN5_FILTERING_ACTIVE,
+   GEN5_PS_EXECUTED,
+   GEN5_SUBSPANS_WRITTEN,
+   GEN5_BYTES_READ_FOR_TEXTURE_READS,
+   GEN5_TEXELS_RETURNED_FROM_SAMPLER,
+   GEN5_POLYGONS_NOT_CULLED,
+   GEN5_MASF_HAS_VALID_MESSAGE,
+   GEN5_WRITES_READS_FROM_RC,
+   GEN5_DP_READS,
+   GEN5_MASF_HAS_VALID_MESSAGE_NOT_CONSUMED_BY_SAMPLER,
+   GEN5_EU_STALLED_FOR_MATH,
+
+   /* Tied to the final enumerator (value 28, same as before) so the range
+    * cannot silently go stale if a counter is appended above.
+    */
+   GEN5_LAST_A_COUNTER = GEN5_EU_STALLED_FOR_MATH,
+};
+
+/* Table-entry helper: every aggregating counter is reported as a 32-bit
+ * unsigned value in the A_COUNTERS group with the full u32 range.
+ */
+#define A_COUNTER(id, name)         \
+   {                                \
+      .ID = id,                     \
+      .Name = name,                 \
+      .GroupID = A_COUNTERS,        \
+      .Type = GL_UNSIGNED_INT,      \
+      .Minimum = { .u32 =  0 },     \
+      .Maximum = { .u32 = ~0 },     \
+   }
+
+static const struct gl_perf_monitor_counter gen5_counters[] = {
+   A_COUNTER(GEN5_CS_STARVED, "cycles the CS unit is starved"),
+   A_COUNTER(GEN5_CS_STALLED, "cycles the CS unit is stalled"),
+   A_COUNTER(GEN5_VF_STARVED, "cycles the VF unit is starved"),
+   A_COUNTER(GEN5_VF_STALLED, "cycles the VF unit is stalled"),
+   A_COUNTER(GEN5_VS_STARVED, "cycles the VS unit is starved"),
+   A_COUNTER(GEN5_VS_STALLED, "cycles the VS unit is stalled"),
+   A_COUNTER(GEN5_GS_STARVED,
"cycles the GS unit is starved"), + A_COUNTER(GEN5_GS_STALLED, "cycles the GS unit is stalled"), + A_COUNTER(GEN5_CL_STARVED, "cycles the CL unit is starved"), + A_COUNTER(GEN5_CL_STALLED, "cycles the CL unit is stalled"), + A_COUNTER(GEN5_SF_STARVED, "cycles the SF unit is starved"), + A_COUNTER(GEN5_SF_STALLED, "cycles the SF unit is stalled"), + A_COUNTER(GEN5_WZ_STARVED, "cycles the WZ unit is starved"), + A_COUNTER(GEN5_WZ_STALLED, "cycles the WZ unit is stalled"), + A_COUNTER(GEN5_Z_BUFFER_READ_WRITE, "Z buffer read/write"), + A_COUNTER(GEN5_EU_ACTIVE, "cycles each EU was active"), + A_COUNTER(GEN5_EU_SUSPENDED, "cycles each EU was suspended"), + A_COUNTER(GEN5_THREADS_LOADED, "cycles threads loaded all EUs"), + A_COUNTER(GEN5_FILTERING_ACTIVE, "cycles filtering active"), + A_COUNTER(GEN5_PS_EXECUTED, "cycles PS threads executed"), + A_COUNTER(GEN5_SUBSPANS_WRITTEN, "subspans written to RC"), + A_COUNTER(GEN5_BYTES_READ_FOR_TEXTURE_READS, "bytes read for texture reads"), + A_COUNTER(GEN5_TEXELS_RETURNED_FROM_SAMPLER, "texels returned from sampler"), + A_COUNTER(GEN5_POLYGONS_NOT_CULLED, "polygons not culled"), + A_COUNTER(GEN5_MASF_HAS_VALID_MESSAGE, "clocks MASF has valid message"), + A_COUNTER(GEN5_WRITES_READS_FROM_RC, "64b writes/reads from RC"), + A_COUNTER(GEN5_DP_READS, "reads on dataport"), + A_COUNTER(GEN5_MASF_HAS_VALID_MESSAGE_NOT_CONSUMED_BY_SAMPLER, "clocks MASF has valid msg not consumed by sampler"), + A_COUNTER(GEN5_EU_STALLED_FOR_MATH, "cycles any EU is stalled for math"), +}; +/** @} */ + +/******************************************************************************/ + +static void +snapshot_aggregating_counters(struct brw_context *brw, + drm_intel_bo *bo, uint32_t offset) +{ + struct intel_context *intel = &brw->intel; + + BEGIN_BATCH(6); + OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0); + OUT_RELOC(bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + offset); + OUT_BATCH(0); + + 
OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1); + OUT_RELOC(bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + offset + 64); + OUT_BATCH(0); + ADVANCE_BATCH(); +} + +static bool +aggregating_counters_needed(struct brw_context *brw, + struct gl_perf_monitor_object *m) +{ + struct intel_context *intel = &brw->intel; + + if (intel->gen == 5) { + return BITSET_TEST_RANGE(m->ActiveCounters, + GEN5_FIRST_A_COUNTER, GEN5_LAST_A_COUNTER); + } + assert(!"Unsupported generation in performance counter code."); + return false; +} + +/******************************************************************************/ + +/** + * Create a new performance monitor object. + */ +static struct gl_perf_monitor_object * +brw_new_perf_monitor() +{ + return calloc(1, sizeof(struct brw_perf_monitor_object)); +} + +/** + * Delete a performance monitor object. + */ +static void +brw_delete_perf_monitor(struct gl_perf_monitor_object *m) +{ + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + + if (monitor->gpu_facing_bo) + drm_intel_bo_unreference(monitor->gpu_facing_bo); + + free(monitor); +} + +/** + * Driver hook for glBeginPerformanceMonitorAMD(). + */ +static void +brw_begin_perf_monitor(struct gl_context *ctx, + struct gl_perf_monitor_object *m) +{ + struct brw_context *brw = brw_context(ctx); + struct intel_context *intel = &brw->intel; + struct brw_perf_monitor_object *monitor = brw_perf_monitor(m); + + m->Active = true; + + /* If the GPU-facing BO already exists, throw it away. It contains + * old results and we're not interested in those any more. 
+    */
+   if (monitor->gpu_facing_bo)
+      drm_intel_bo_unreference(monitor->gpu_facing_bo);
+
+   /* Create a new GPU-facing BO */
+   monitor->gpu_facing_bo =
+      drm_intel_bo_alloc(intel->bufmgr, "performance monitor", 4096, 1);
+
+   /* Allocation can fail.  Without a BO there is nowhere for the GPU to
+    * write a report, so skip the snapshot instead of emitting a relocation
+    * against a NULL buffer.
+    */
+   if (!monitor->gpu_facing_bo)
+      return;
+
+   /* Take a snapshot of all active counters */
+   if (aggregating_counters_needed(brw, m)) {
+      snapshot_aggregating_counters(brw, monitor->gpu_facing_bo, 0);
+   }
+}
+
+/**
+ * Driver hook for glEndPerformanceMonitorAMD().
+ *
+ * Writes the closing snapshot into the monitor's BO at byte offset
+ * brw->perfmon.total_counter_size (the opening snapshot sits at offset 0).
+ */
+static void
+brw_end_perf_monitor(struct gl_context *ctx,
+                     struct gl_perf_monitor_object *m)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+   /* No BO means the begin hook could not allocate one; nothing to do. */
+   if (!monitor->gpu_facing_bo)
+      return;
+
+   if (aggregating_counters_needed(brw, m)) {
+      snapshot_aggregating_counters(brw, monitor->gpu_facing_bo,
+                                    brw->perfmon.total_counter_size);
+   }
+}
+
+/**
+ * Reset a performance monitor, throwing away any results.
+ *
+ * If the monitor is currently active, monitoring is restarted with a
+ * fresh BO and a new opening snapshot.
+ */
+static void
+brw_reset_perf_monitor(struct gl_context *ctx,
+                       struct gl_perf_monitor_object *m)
+{
+   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+   if (monitor->gpu_facing_bo) {
+      drm_intel_bo_unreference(monitor->gpu_facing_bo);
+      monitor->gpu_facing_bo = NULL;
+   }
+
+   if (m->Active) {
+      brw_begin_perf_monitor(ctx, m);
+   }
+}
+
+/**
+ * Is a performance monitor result available?
+ *
+ * True only when the monitor is no longer active, a BO exists, and the BO
+ * is not busy.
+ */
+static GLboolean
+brw_is_perf_monitor_result_available(struct gl_perf_monitor_object *m)
+{
+   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+   return !m->Active && monitor->gpu_facing_bo &&
+      !drm_intel_bo_busy(monitor->gpu_facing_bo);
+}
+
+/**
+ * Get the performance monitor result.
+ */
+static void
+brw_get_perf_monitor_result(struct gl_context *ctx,
+                            struct gl_perf_monitor_object *m,
+                            GLsizei data_size,
+                            GLuint *data,
+                            GLint *bytes_written)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+   /* This hook should only be called when results are available.
*/
+   assert(monitor->gpu_facing_bo != NULL);
+
+   /* Mapping can fail; in that case nothing can be read back, so report an
+    * empty result rather than dereferencing an invalid mapping.
+    */
+   if (drm_intel_bo_map(monitor->gpu_facing_bo, false) != 0) {
+      if (bytes_written)
+         *bytes_written = 0;
+      return;
+   }
+   const uint32_t *gpu_bo = monitor->gpu_facing_bo->virtual;
+
+   /* The BO holds two snapshots: the opening one at byte offset 0 and the
+    * closing one at byte offset brw->perfmon.total_counter_size.  gpu_bo is
+    * indexed in 32-bit words, so convert that byte offset to a word index.
+    */
+   const uint32_t end_snapshot_index =
+      brw->perfmon.total_counter_size / sizeof(uint32_t);
+
+   /* Copy data from the GPU-facing BO to the supplied array.
+    *
+    * The output data format is: <group ID, counter ID, value> for each
+    * active counter.  The API allows counters to appear in any order.
+    */
+   GLsizei offset = 0;
+   for (int i = 0; i < ctx->PerfMonitor.NumCounters; i++) {
+      const struct gl_perf_monitor_counter *c = &ctx->PerfMonitor.Counters[i];
+
+      if (!BITSET_TEST(m->ActiveCounters, i))
+         continue;
+
+      /* data_size is the caller's buffer size in bytes; stop before
+       * writing an entry (group ID + counter ID + value) that won't fit.
+       */
+      const GLsizei value_dwords =
+         _mesa_perf_monitor_counter_size(c) / sizeof(uint32_t);
+      if ((offset + 2 + value_dwords) * (GLsizei) sizeof(uint32_t) > data_size)
+         break;
+
+      data[offset++] = c->GroupID;
+      data[offset++] = c->ID;
+
+      /* Skip REPORT_ID and TIMESTAMP fields (the first three words of a
+       * snapshot).  These are word indices, not byte offsets.
+       */
+      const uint32_t first_index = 3 + c->ID;
+      const uint32_t second_index = end_snapshot_index + first_index;
+
+      /* Won't work for uint64_t values, but we don't have any */
+      data[offset] = gpu_bo[second_index] - gpu_bo[first_index];
+      offset += value_dwords;
+   }
+
+   drm_intel_bo_unmap(monitor->gpu_facing_bo);
+
+   if (bytes_written)
+      *bytes_written = offset * sizeof(uint32_t);
+}
+
+/**
+ * Install the performance monitor driver hooks and, on Ironlake (gen 5),
+ * publish the counter/group tables and the size of one full snapshot.
+ */
+void
+brw_init_performance_monitors(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+
+   ctx->Driver.NewPerfMonitor = brw_new_perf_monitor;
+   ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor;
+   ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor;
+   ctx->Driver.EndPerfMonitor = brw_end_perf_monitor;
+   ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor;
+   ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available;
+   ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result;
+
+   if (intel->gen == 5) {
+      ctx->PerfMonitor.Groups = perf_groups;
+      ctx->PerfMonitor.NumGroups = ARRAY_SIZE(perf_groups);
+
+      ctx->PerfMonitor.Counters = gen5_counters;
+      ctx->PerfMonitor.NumCounters = ARRAY_SIZE(gen5_counters);
+
+      /* Size in bytes: three header words plus one word per counter. */
+      brw->perfmon.total_counter_size =
+         (3 +
ctx->PerfMonitor.NumCounters) * sizeof(uint32_t); + } +} diff --git a/src/mesa/drivers/dri/intel/intel_extensions.c b/src/mesa/drivers/dri/intel/intel_extensions.c index 1ad728a..3ef7fb9 100755 --- a/src/mesa/drivers/dri/intel/intel_extensions.c +++ b/src/mesa/drivers/dri/intel/intel_extensions.c @@ -111,6 +111,9 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_texture_storage_multisample = true; } + if (intel->gen == 5) + ctx->Extensions.AMD_performance_monitor = true; + if (intel->gen >= 5) { ctx->Extensions.ARB_texture_query_lod = true; ctx->Extensions.EXT_timer_query = true; -- 1.8.2.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev