This can be used for two purposes: Using hand-coded shaders to determine per-instruction timings, or figuring out which shader to optimize in a whole application. On the psychonauts trace, we see:
type ID cycles spent % of total ... fs16 543: 339343898369 ( 339.34 Gcycles) 9.2% fs16 521: 532215110990 ( 532.22 Gcycles) 14.4% fs16 524: 1036231987390 (1036.23 Gcycles) 28.0% confirming our previous understanding that fragment shaders are where it's all at. But on GLBenchmark 2.7, we get: fs16 69: 205928219888 ( 205.93 Gcycles) 7.5% fs16 75: 364066413095 ( 364.07 Gcycles) 13.2% vs 87: 1107217698878 (1107.22 Gcycles) 40.3% That's interesting. I should look into that. Note that this doesn't cover the instructions that set up the message to the URB/FB write -- we'd need to convert the MRF usage in these instructions to GRFs so that our offsets/times don't overwrite our shader outputs. --- src/mesa/drivers/dri/i965/brw_context.c | 3 + src/mesa/drivers/dri/i965/brw_context.h | 28 ++++- src/mesa/drivers/dri/i965/brw_defines.h | 20 +++- src/mesa/drivers/dri/i965/brw_eu.h | 6 +- src/mesa/drivers/dri/i965/brw_eu_emit.c | 55 +++++++++- src/mesa/drivers/dri/i965/brw_fs.cpp | 101 +++++++++++++++++ src/mesa/drivers/dri/i965/brw_fs.h | 7 ++ src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 4 + src/mesa/drivers/dri/i965/brw_program.c | 128 ++++++++++++++++++++++ src/mesa/drivers/dri/i965/brw_vec4.cpp | 81 ++++++++++++++ src/mesa/drivers/dri/i965/brw_vec4.h | 7 ++ src/mesa/drivers/dri/i965/brw_vec4_emit.cpp | 4 + src/mesa/drivers/dri/i965/brw_vs_surface_state.c | 10 ++ src/mesa/drivers/dri/i965/brw_vtbl.c | 14 +++ src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 7 ++ src/mesa/drivers/dri/intel/intel_context.c | 6 + src/mesa/drivers/dri/intel/intel_context.h | 1 + 17 files changed, 475 insertions(+), 7 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 4b1b247..5665a3a 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -383,6 +383,9 @@ brwCreateContext(int api, brw_fs_alloc_reg_sets(brw); + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + brw_init_shader_time(brw); + return true; } diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 1abaee3..dc25cab 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -559,14 +559,15 @@ struct brw_vs_prog_data { #define SURF_INDEX_FRAG_CONST_BUFFER (BRW_MAX_DRAW_BUFFERS + 1) #define SURF_INDEX_TEXTURE(t) (BRW_MAX_DRAW_BUFFERS + 2 + (t)) #define SURF_INDEX_WM_UBO(u) (SURF_INDEX_TEXTURE(BRW_MAX_TEX_UNIT) + u) - +#define SURF_INDEX_WM_SHADER_TIME (SURF_INDEX_WM_UBO(12)) /** Maximum size of the binding table. */ -#define BRW_MAX_WM_SURFACES SURF_INDEX_WM_UBO(BRW_MAX_WM_UBOS) +#define BRW_MAX_WM_SURFACES (SURF_INDEX_WM_SHADER_TIME + 1) #define SURF_INDEX_VERT_CONST_BUFFER (0) #define SURF_INDEX_VS_TEXTURE(t) (SURF_INDEX_VERT_CONST_BUFFER + 1 + (t)) #define SURF_INDEX_VS_UBO(u) (SURF_INDEX_VS_TEXTURE(BRW_MAX_TEX_UNIT) + u) -#define BRW_MAX_VS_SURFACES SURF_INDEX_VS_UBO(BRW_MAX_VS_UBOS) +#define SURF_INDEX_VS_SHADER_TIME (SURF_INDEX_VS_UBO(12)) +#define BRW_MAX_VS_SURFACES (SURF_INDEX_VS_SHADER_TIME + 1) #define SURF_INDEX_SOL_BINDING(t) ((t)) #define BRW_MAX_GS_SURFACES SURF_INDEX_SOL_BINDING(BRW_MAX_SOL_BINDINGS) @@ -651,6 +652,13 @@ struct brw_tracked_state { void (*emit)( struct brw_context *brw ); }; +enum shader_time_shader_type { + ST_NONE, + ST_VS, + ST_FS8, + ST_FS16, +}; + /* Flags for brw->state.cache. */ #define CACHE_NEW_BLEND_STATE (1<<BRW_BLEND_STATE) @@ -1089,6 +1097,16 @@ struct brw_context uint32_t num_instances; int basevertex; + + struct { + drm_intel_bo *bo; + struct gl_shader_program **programs; + enum shader_time_shader_type *types; + uint64_t *cumulative; + int num_entries; + int max_entries; + double report_time; + } shader_time; }; /*====================================================================== @@ -1144,7 +1162,9 @@ void brwInitFragProgFuncs( struct dd_function_table *functions ); int brw_get_scratch_size(int size); void brw_get_scratch_bo(struct intel_context *intel, drm_intel_bo **scratch_bo, int size); - +void brw_init_shader_time(struct brw_context *brw); +void brw_collect_and_report_shader_time(struct brw_context *brw); +void brw_destroy_shader_time(struct brw_context *brw); /* brw_urb.c */ diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 6dc4707..b84d8f9 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -665,6 +665,8 @@ enum opcode { SHADER_OPCODE_TXS, FS_OPCODE_TXB, + SHADER_OPCODE_SHADER_TIME_ADD, + FS_OPCODE_DDX, FS_OPCODE_DDY, FS_OPCODE_PIXEL_X, @@ -729,6 +731,8 @@ enum opcode { #define BRW_ARF_CONTROL 0x80 #define BRW_ARF_NOTIFICATION_COUNT 0x90 #define BRW_ARF_IP 0xA0 +#define BRW_ARF_TDR 0xB0 +#define BRW_ARF_TIMESTAMP 0xC0 #define BRW_MRF_COMPR4 (1 << 7) @@ -956,7 +960,21 @@ enum brw_message_target { #define BRW_SCRATCH_SPACE_SIZE_1M 10 #define BRW_SCRATCH_SPACE_SIZE_2M 11 - +#define BRW_AOP_AND 1 +#define BRW_AOP_OR 2 +#define BRW_AOP_XOR 3 +#define BRW_AOP_MOV 4 +#define BRW_AOP_INC 5 +#define BRW_AOP_DEC 6 +#define BRW_AOP_ADD 7 +#define BRW_AOP_SUB 8 +#define BRW_AOP_REVSUB 9 +#define BRW_AOP_IMAX 10 +#define BRW_AOP_IMIN 11 +#define BRW_AOP_UMAX 12 +#define BRW_AOP_UMIN 13 +#define BRW_AOP_CMPWR 14 +#define BRW_AOP_PREDEC 15 #define CMD_URB_FENCE 0x6000 diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index fce4f15..1415431 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -200,7 +200,7 @@ static INLINE struct brw_reg brw_reg( GLuint file, else if (file == BRW_MESSAGE_REGISTER_FILE) assert((nr & ~(1 << 7)) < BRW_MAX_MRF); else if (file == BRW_ARCHITECTURE_REGISTER_FILE) - assert(nr <= BRW_ARF_IP); + assert(nr <= BRW_ARF_TIMESTAMP); reg.type = type; reg.file = file; @@ -1005,6 +1005,10 @@ void brw_oword_block_write_scratch(struct brw_compile *p, int num_regs, GLuint offset); +void brw_shader_time_add(struct brw_compile *p, + int mrf, + uint32_t surf_index); + /* If/else/endif. Works by manipulating the execution flags on each * channel. */ diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index eada9d9..87f1396 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -317,7 +317,8 @@ void brw_set_src1(struct brw_compile *p, { assert(reg.file != BRW_MESSAGE_REGISTER_FILE); - assert(reg.nr < 128); + if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE) + assert(reg.nr < 128); gen7_convert_mrf_to_grf(p, ®); @@ -2430,3 +2431,55 @@ brw_svb_write(struct brw_compile *p, 0, /* end_of_thread */ send_commit_msg); /* send_commit_msg */ } + +/** + * This instruction is generated as a single-channel align1 instruction by + * both the VS and FS stages when using INTEL_DEBUG=shader_time. + * + * We can't use the typed atomic op in the FS because that has the execution + * mask ANDed with the pixel mask, but we just want to write the one dword for + * all the pixels. + * + * We don't use the SIMD4x2 atomic ops in the VS because want to just write + * one u32. So we use the same untyped atomic write message as the pixel + * shader. + * + * The untyped atomic operation requires a BUFFER surface type with RAW + * format, and is only accessible through the legacy DATA_CACHE dataport + * messages. + */ +void brw_shader_time_add(struct brw_compile *p, + int base_mrf, + uint32_t surf_index) +{ + struct intel_context *intel = &p->brw->intel; + assert(intel->gen >= 7); + + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_set_mask_control(p, BRW_MASK_DISABLE); + struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_pop_insn_state(p); + + /* We use brw_vec1_reg and unmasked because we want to increment the given + * offset only once. + */ + brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_NULL, 0)); + brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, + base_mrf, 0)); + + bool header_present = false; + bool eot = false; + uint32_t mlen = 2; /* offset, value */ + uint32_t rlen = 0; + brw_set_message_descriptor(p, send, + GEN7_SFID_DATAPORT_DATA_CACHE, + mlen, rlen, header_present, eot); + + send->bits3.ud |= 6 << 14; /* untyped atomic op */ + send->bits3.ud |= 0 << 13; /* no return data */ + send->bits3.ud |= 1 << 12; /* SIMD8 mode */ + send->bits3.ud |= BRW_AOP_ADD << 8; + send->bits3.ud |= surf_index << 0; +} diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index e143e7a..2908618 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -401,6 +401,99 @@ fs_visitor::type_size(const struct glsl_type *type) } } +fs_reg +fs_visitor::get_timestamp() +{ + assert(intel->gen >= 7); + + fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_TIMESTAMP, + 0), + BRW_REGISTER_TYPE_UD)); + + fs_reg dst = fs_reg(this, glsl_type::uint_type); + + fs_inst *mov = emit(MOV(dst, ts)); + /* We want to read the 3 fields we care about (mostly field 0, but also 2) + * even if it's not enabled in the dispatch. + */ + mov->force_writemask_all = true; + mov->force_uncompressed = true; + + /* The caller wants the low 32 bits of the timestamp. Since it's running + * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, + * which is plenty of time for our purposes. It is identical across the + * EUs, but since it's tracking GPU core speed it will increment at a + * varying rate as render P-states change. + * + * The caller could also check if render P-states have changed (or anything + * else that might disrupt timing) by setting smear to 2 and checking if + * that field is != 0. + */ + dst.smear = 0; + + return dst; +} + +void +fs_visitor::emit_shader_time_begin() +{ + current_annotation = "shader time start"; + shader_start_time = get_timestamp(); +} + +void +fs_visitor::emit_shader_time_end() +{ + current_annotation = "shader time end"; + /* Choose an index in the buffer and set up tracking information for our + * printouts. + */ + int shader_time_index = brw->shader_time.num_entries++; + assert(shader_time_index <= brw->shader_time.max_entries); + if (dispatch_width == 8) { + brw->shader_time.types[shader_time_index] = ST_FS8; + } else { + assert(dispatch_width == 16); + brw->shader_time.types[shader_time_index] = ST_FS16; + } + if (prog) { + _mesa_reference_shader_program(ctx, + &brw->shader_time.programs[shader_time_index], + prog); + } + + /* Take the current timestamp and get the delta. */ + push_force_uncompressed(); + + fs_reg shader_end_time = get_timestamp(); + shader_start_time.negate = true; + fs_reg diff = fs_reg(this, glsl_type::uint_type); + emit(ADD(diff, shader_start_time, shader_end_time)); + + /* If there were no instructions between the two timestamp gets, the diff + * is 2 cycles. Remove that overhead, so I can forget about that when + * trying to determine the time taken for single instructions. + */ + emit(ADD(diff, diff, fs_reg(-2u))); + + int base_mrf = 6; + + fs_reg offset_mrf = fs_reg(MRF, base_mrf); + offset_mrf.type = BRW_REGISTER_TYPE_UD; + emit(MOV(offset_mrf, fs_reg(shader_time_index * 4))); + + fs_reg time_mrf = fs_reg(MRF, base_mrf + 1); + time_mrf.type = BRW_REGISTER_TYPE_UD; + emit(MOV(time_mrf, diff)); + + fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD)); + inst->base_mrf = base_mrf; + inst->mlen = 2; + + pop_force_uncompressed(); +} + void fs_visitor::fail(const char *format, ...) { @@ -513,6 +606,8 @@ fs_visitor::implied_mrf_writes(fs_inst *inst) case SHADER_OPCODE_TXL: case SHADER_OPCODE_TXS: return 1; + case SHADER_OPCODE_SHADER_TIME_ADD: + return 0; case FS_OPCODE_FB_WRITE: return 2; case FS_OPCODE_PULL_CONSTANT_LOAD: @@ -2139,6 +2234,9 @@ fs_visitor::run() if (0) { emit_dummy_fs(); } else { + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_begin(); + calculate_urb_setup(); if (intel->gen < 6) emit_interpolation_setup_gen4(); @@ -2162,6 +2260,9 @@ fs_visitor::run() if (failed) return false; + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_end(); + emit_fb_writes(); split_virtual_grfs(); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index dc85592..87af8f2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -353,6 +353,10 @@ public: void emit_color_write(int target, int index, int first_color_mrf); void emit_fb_writes(); + + void emit_shader_time_begin(); + void emit_shader_time_end(); + bool try_rewrite_rhs_to_dst(ir_assignment *ir, fs_reg dst, fs_reg src, @@ -363,6 +367,8 @@ public: void resolve_ud_negate(fs_reg *reg); void resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg); + fs_reg get_timestamp(); + struct brw_reg interp_reg(int location, int channel); int setup_uniform_values(int loc, const glsl_type *type); void setup_builtin_uniform_values(ir_variable *ir); @@ -424,6 +430,7 @@ public: fs_reg pixel_w; fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; + fs_reg shader_start_time; int grf_used; diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp index 498b720..05b2f16 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp @@ -1029,6 +1029,10 @@ fs_generator::generate_code(exec_list *instructions) generate_mov_dispatch_to_flags(); break; + case SHADER_OPCODE_SHADER_TIME_ADD: + brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME); + break; + default: if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) { _mesa_problem(ctx, "Unsupported opcode `%s' in FS", diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 6bf5a6a..1ad186d 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -189,3 +189,131 @@ void brwInitFragProgFuncs( struct dd_function_table *functions ) functions->LinkShader = brw_link_shader; } + +void +brw_init_shader_time(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + const int max_entries = 4096; + brw->shader_time.bo = drm_intel_bo_alloc(intel->bufmgr, "shader time", + max_entries * 4, 4096); + brw->shader_time.programs = rzalloc_array(brw, struct gl_shader_program *, + max_entries); + brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type, + max_entries); + brw->shader_time.cumulative = rzalloc_array(brw, uint64_t, + max_entries); + brw->shader_time.max_entries = max_entries; +} + +static int +compare_time(const void *a, const void *b) +{ + uint64_t * const *a_val = a; + uint64_t * const *b_val = b; + + /* We don't just subtract because we're turning the value to an int. */ + if (**a_val < **b_val) + return -1; + else if (**a_val == **b_val) + return 0; + else + return 1; +} + +static void +brw_report_shader_time(struct brw_context *brw) +{ + if (!brw->shader_time.bo || !brw->shader_time.num_entries) + return; + + uint64_t *sorted[brw->shader_time.num_entries]; + double total = 0; + for (int i = 0; i < brw->shader_time.num_entries; i++) { + sorted[i] = &brw->shader_time.cumulative[i]; + total += brw->shader_time.cumulative[i]; + } + + if (total == 0) { + printf("No shader time collected yet\n"); + return; + } + + qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time); + + printf("\n"); + printf("type ID cycles spent %% of total\n"); + for (int s = 0; s < brw->shader_time.num_entries; s++) { + /* Work back from the sorted pointers times to a time to print. */ + int i = sorted[s] - brw->shader_time.cumulative; + + int shader_num = -1; + if (brw->shader_time.programs[i]) { + shader_num = brw->shader_time.programs[i]->Name; + } + + switch (brw->shader_time.types[i]) { + case ST_VS: + printf("vs %4d: ", shader_num); + break; + case ST_FS8: + printf("fs8 %4d: ", shader_num); + break; + case ST_FS16: + printf("fs16 %4d: ", shader_num); + break; + default: + printf("other: "); + break; + } + + printf("%16lld (%7.2f Gcycles) %4.1f%%\n", + (long long)brw->shader_time.cumulative[i], + (double)brw->shader_time.cumulative[i] / 1000000000.0, + (double)brw->shader_time.cumulative[i] / total * 100.0); + } +} + +static void +brw_collect_shader_time(struct brw_context *brw) +{ + if (!brw->shader_time.bo) + return; + + /* This probably stalls on the last rendering. We could fix that by + * delaying reading the reports, but it doesn't look like it's a big + * overhead compared to the cost of tracking the time in the first place. + */ + drm_intel_bo_map(brw->shader_time.bo, true); + + uint32_t *times = brw->shader_time.bo->virtual; + + for (int i = 0; i < brw->shader_time.num_entries; i++) { + brw->shader_time.cumulative[i] += times[i]; + } + + /* Zero the BO out to clear it out for our next collection. + */ + memset(times, 0, brw->shader_time.bo->size); + drm_intel_bo_unmap(brw->shader_time.bo); +} + +void +brw_collect_and_report_shader_time(struct brw_context *brw) +{ + brw_collect_shader_time(brw); + + if (brw->shader_time.report_time == 0 || + get_time() - brw->shader_time.report_time >= 1.0) { + brw_report_shader_time(brw); + brw->shader_time.report_time = get_time(); + } +} + +void +brw_destroy_shader_time(struct brw_context *brw) +{ + drm_intel_bo_unreference(brw->shader_time.bo); + brw->shader_time.bo = NULL; +} diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 5200daa..20f924f 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -26,6 +26,7 @@ extern "C" { #include "main/macros.h" +#include "main/shaderobj.h" #include "program/prog_print.h" #include "program/prog_parameter.h" } @@ -248,6 +249,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst) return 2; case VS_OPCODE_SCRATCH_WRITE: return 3; + case SHADER_OPCODE_SHADER_TIME_ADD: + return 0; default: assert(!"not reached"); return inst->mlen; @@ -1039,9 +1042,84 @@ vec4_visitor::setup_payload(void) this->first_non_payload_grf = reg; } +src_reg +vec4_visitor::get_timestamp() +{ + assert(intel->gen >= 7); + + src_reg ts = src_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_TIMESTAMP, + 0), + BRW_REGISTER_TYPE_UD)); + + dst_reg dst = dst_reg(this, glsl_type::uvec4_type); + + vec4_instruction *mov = emit(MOV(dst, ts)); + /* We want to read the 3 fields we care about (mostly field 0, but also 2) + * even if it's not enabled in the dispatch. + */ + mov->force_writemask_all = true; + + return src_reg(dst); +} + +void +vec4_visitor::emit_shader_time_begin() +{ + current_annotation = "shader time start"; + shader_start_time = get_timestamp(); +} + +void +vec4_visitor::emit_shader_time_end() +{ + current_annotation = "shader time end"; + /* Choose an index in the buffer and set up tracking information for our + * printouts. + */ + int shader_time_index = brw->shader_time.num_entries++; + assert(shader_time_index <= brw->shader_time.max_entries); + brw->shader_time.types[shader_time_index] = ST_VS; + if (prog) { + _mesa_reference_shader_program(ctx, + &brw->shader_time.programs[shader_time_index], + prog); + } + + /* Take the current timestamp and get the delta. */ + src_reg shader_end_time = get_timestamp(); + shader_start_time.negate = true; + dst_reg diff = dst_reg(this, glsl_type::uint_type); + emit(ADD(diff, shader_start_time, shader_end_time)); + + /* If there were no instructions between the two timestamp gets, the diff + * is 2 cycles. Remove that overhead, so I can forget about that when + * trying to determine the time taken for single instructions. + */ + emit(ADD(diff, src_reg(diff), src_reg(-2u))); + + int base_mrf = 6; + + dst_reg offset_mrf = dst_reg(MRF, base_mrf); + offset_mrf.type = BRW_REGISTER_TYPE_UD; + emit(MOV(offset_mrf, src_reg(shader_time_index * 4))); + + dst_reg time_mrf = dst_reg(MRF, base_mrf + 1); + time_mrf.type = BRW_REGISTER_TYPE_UD; + emit(MOV(time_mrf, src_reg(diff))); + + vec4_instruction *inst; + inst = emit(SHADER_OPCODE_SHADER_TIME_ADD); + inst->base_mrf = base_mrf; + inst->mlen = 2; +} + bool vec4_visitor::run() { + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_begin(); + emit_attribute_fixups(); /* Generate VS IR for main(). (the visitor only descends into @@ -1057,6 +1135,9 @@ vec4_visitor::run() if (c->key.userclip_active && !c->key.uses_clip_distance) setup_uniform_clipplane_values(); + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + emit_shader_time_end(); + emit_urb_writes(); /* Before any optimization, push array accesses out to scratch diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 8c6f56a..5192561 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -302,6 +302,8 @@ public: int uniform_vector_size[MAX_UNIFORMS]; int uniforms; + src_reg shader_start_time; + struct hash_table *variable_ht; bool run(void); @@ -434,6 +436,9 @@ public: void emit_urb_slot(int mrf, int vert_result); void emit_urb_writes(void); + void emit_shader_time_begin(); + void emit_shader_time_end(); + src_reg get_scratch_offset(vec4_instruction *inst, src_reg *reladdr, int reg_offset); src_reg get_pull_constant_offset(vec4_instruction *inst, @@ -452,6 +457,8 @@ public: bool try_emit_sat(ir_expression *ir); void resolve_ud_negate(src_reg *reg); + src_reg get_timestamp(); + bool process_move_condition(ir_rvalue *ir); void dump_instruction(vec4_instruction *inst); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp index b8a1b59..44d9186 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp @@ -658,6 +658,10 @@ vec4_generator::generate_vs_instruction(vec4_instruction *instruction, generate_pull_constant_load(inst, dst, src[0], src[1]); break; + case SHADER_OPCODE_SHADER_TIME_ADD: + brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_VS_SHADER_TIME); + break; + default: if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) { _mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n", diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c index d70c36e..3985b48 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c @@ -138,9 +138,19 @@ const struct brw_tracked_state brw_vs_ubo_surfaces = { static void brw_vs_upload_binding_table(struct brw_context *brw) { + struct intel_context *intel = &brw->intel; uint32_t *bind; int i; + if (INTEL_DEBUG & DEBUG_SHADER_TIME) { + intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0, + brw->shader_time.bo->size, + &brw->vs.surf_offset[SURF_INDEX_VS_SHADER_TIME]); + + assert(brw->vs.prog_data->num_surfaces <= SURF_INDEX_VS_SHADER_TIME); + brw->vs.prog_data->num_surfaces = SURF_INDEX_VS_SHADER_TIME; + } + /* CACHE_NEW_VS_PROG: Skip making a binding table if we don't use textures or * pull constants. */ diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c index 0da6070..f2f0e7b 100644 --- a/src/mesa/drivers/dri/i965/brw_vtbl.c +++ b/src/mesa/drivers/dri/i965/brw_vtbl.c @@ -43,6 +43,7 @@ #include "intel_fbo.h" #include "brw_context.h" +#include "brw_program.h" #include "brw_defines.h" #include "brw_state.h" #include "brw_draw.h" @@ -69,6 +70,11 @@ static void brw_destroy_context( struct intel_context *intel ) { struct brw_context *brw = brw_context(&intel->ctx); + if (INTEL_DEBUG & DEBUG_SHADER_TIME) { + brw_collect_and_report_shader_time(brw); + brw_destroy_shader_time(brw); + } + brw_destroy_state(brw); brw_draw_destroy( brw ); @@ -201,6 +207,14 @@ static void brw_new_batch( struct intel_context *intel ) * next batch. */ brw->cache.bo_used_by_gpu = true; + + /* We need to periodically reap the shader time results, because rollover + * happens every few seconds. We also want to see results every once in a + * while, because many programs won't cleanly destroy our context, so the + * end-of-run printout may not happen. + */ + if (INTEL_DEBUG & DEBUG_SHADER_TIME) + brw_collect_and_report_shader_time(brw); } static void brw_invalidate_state( struct intel_context *intel, GLuint new_state ) diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index a7866d5..66301a9 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -1405,9 +1405,16 @@ const struct brw_tracked_state brw_wm_ubo_surfaces = { static void brw_upload_wm_binding_table(struct brw_context *brw) { + struct intel_context *intel = &brw->intel; uint32_t *bind; int i; + if (INTEL_DEBUG & DEBUG_SHADER_TIME) { + intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0, + brw->shader_time.bo->size, + &brw->wm.surf_offset[SURF_INDEX_WM_SHADER_TIME]); + } + /* Might want to calculate nr_surfaces first, to avoid taking up so much * space for the binding table. */ diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c index 204609e..cf340db 100644 --- a/src/mesa/drivers/dri/intel/intel_context.c +++ b/src/mesa/drivers/dri/intel/intel_context.c @@ -491,6 +491,7 @@ static const struct dri_debug_control debug_control[] = { { "vs", DEBUG_VS }, { "clip", DEBUG_CLIP }, { "aub", DEBUG_AUB }, + { "shader_time", DEBUG_SHADER_TIME }, { NULL, 0 } }; @@ -746,6 +747,11 @@ intelInitContext(struct intel_context *intel, INTEL_DEBUG = driParseDebugString(getenv("INTEL_DEBUG"), debug_control); if (INTEL_DEBUG & DEBUG_BUFMGR) dri_bufmgr_set_debug(intel->bufmgr, true); + if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && intel->gen < 7) { + fprintf(stderr, + "shader_time debugging requires gen7 (Ivybridge) or better.\n"); + INTEL_DEBUG &= ~DEBUG_SHADER_TIME; + } if (INTEL_DEBUG & DEBUG_AUB) drm_intel_bufmgr_gem_set_aub_dump(intel->bufmgr, true); diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h index eeefadf..8d544dc 100644 --- a/src/mesa/drivers/dri/intel/intel_context.h +++ b/src/mesa/drivers/dri/intel/intel_context.h @@ -456,6 +456,7 @@ extern int INTEL_DEBUG; #define DEBUG_VS 0x1000000 #define DEBUG_CLIP 0x2000000 #define DEBUG_AUB 0x4000000 +#define DEBUG_SHADER_TIME 0x8000000 #ifdef HAVE_ANDROID_PLATFORM #define LOG_TAG "INTEL-MESA" -- 1.7.10.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev