typo fixed (thanks) --- >8 ---
Prior to Skylake the Gen HW timestamps were driven by a 12.5MHz clock with the convenient property of being able to scale by an integer (80) to nanosecond units. For Skylake the frequency is 12MHz or a scale factor of 83.333333 This updates gen_device_info to track a floating point timebase_scale factor and makes corresponding _queryobj.c changes to no longer assume a scale factor of 80 works across all gens. Although the gen6_ code could have been been left alone, the changes keep the code more comparable, and it now shares a few utility functions for scaling raw timestamps and calculating deltas. The utility for calculating deltas takes into account 32 or 36bit overflow depending on the current kernel version. Note: this leaves the timestamp handling of ARB_query_buffer_object untouched, which continues to use an incorrect scale of 80 on Skylake for now. This is more awkward to solve since the scaling is currently done using a very limited uint64 ALU available to the command parser that doesn't support multiply or divide where it's already taking a large number of instructions just to effectively multiple by 80. This fixes piglit arb_timer_query-timestamp-get on Skylake Signed-off-by: Robert Bragg <rob...@sixbynine.org> --- src/intel/common/gen_device_info.c | 21 +++++++++--- src/intel/common/gen_device_info.h | 24 ++++++++++++++ src/mesa/drivers/dri/i965/brw_context.c | 15 +++++++++ src/mesa/drivers/dri/i965/brw_context.h | 3 ++ src/mesa/drivers/dri/i965/brw_queryobj.c | 53 ++++++++++++++++++++++++++++--- src/mesa/drivers/dri/i965/gen6_queryobj.c | 28 +++++----------- 6 files changed, 115 insertions(+), 29 deletions(-) diff --git a/src/intel/common/gen_device_info.c b/src/intel/common/gen_device_info.c index 30df0b2..20594b0 100644 --- a/src/intel/common/gen_device_info.c +++ b/src/intel/common/gen_device_info.c @@ -35,6 +35,7 @@ static const struct gen_device_info gen_device_info_i965 = { .urb = { .size = 256, }, + .timebase_scale = 80, }; static const struct gen_device_info gen_device_info_g4x = { @@ -50,6 +51,7 @@ static const struct gen_device_info gen_device_info_g4x = { .urb = { .size = 384, }, + .timebase_scale = 80, }; static const struct gen_device_info gen_device_info_ilk = { @@ -64,6 +66,7 @@ static const struct gen_device_info gen_device_info_ilk = { .urb = { .size = 1024, }, + .timebase_scale = 80, }; static const struct gen_device_info gen_device_info_snb_gt1 = { @@ -84,6 +87,7 @@ static const struct gen_device_info gen_device_info_snb_gt1 = { .max_vs_entries = 256, .max_gs_entries = 256, }, + .timebase_scale = 80, }; static const struct gen_device_info gen_device_info_snb_gt2 = { @@ -104,6 +108,7 @@ static const struct gen_device_info gen_device_info_snb_gt2 = { .max_vs_entries = 256, .max_gs_entries = 256, }, + .timebase_scale = 80, }; #define GEN7_FEATURES \ @@ -112,7 +117,8 @@ static const struct gen_device_info gen_device_info_snb_gt2 = { .must_use_separate_stencil = true, \ .has_llc = true, \ .has_pln = true, \ - .has_surface_tile_offset = true + .has_surface_tile_offset = true, \ + .timebase_scale = 80 static const struct gen_device_info gen_device_info_ivb_gt1 = { GEN7_FEATURES, .is_ivybridge = true, .gt = 1, @@ -254,7 +260,8 @@ static const struct gen_device_info gen_device_info_hsw_gt3 = { .max_tcs_threads = 504, \ .max_tes_threads = 504, \ .max_gs_threads = 504, \ - .max_wm_threads = 384 + .max_wm_threads = 384, \ + .timebase_scale = 80 static const struct gen_device_info gen_device_info_bdw_gt1 = { GEN8_FEATURES, .gt = 1, @@ -351,16 +358,19 @@ static const struct gen_device_info gen_device_info_skl_gt1 = { GEN9_FEATURES, .gt = 1, .num_slices = 1, .urb.size = 192, + .timebase_scale = 1000000000.0 / 12000000.0, }; static const struct gen_device_info gen_device_info_skl_gt2 = { GEN9_FEATURES, .gt = 2, .num_slices = 1, + .timebase_scale = 1000000000.0 / 12000000.0, }; static const struct gen_device_info gen_device_info_skl_gt3 = { GEN9_FEATURES, .gt = 3, .num_slices = 2, + .timebase_scale = 1000000000.0 / 12000000.0, }; static const struct gen_device_info gen_device_info_skl_gt4 = { @@ -375,6 +385,7 @@ static const struct gen_device_info gen_device_info_skl_gt4 = { * only 1008KB of this will be used." */ .urb.size = 1008 / 3, + .timebase_scale = 1000000000.0 / 12000000.0, }; static const struct gen_device_info gen_device_info_bxt = { @@ -397,7 +408,8 @@ static const struct gen_device_info gen_device_info_bxt = { .max_tcs_entries = 256, .max_tes_entries = 416, .max_gs_entries = 256, - } + }, + .timebase_scale = 1000000000.0 / 19200123.0, }; static const struct gen_device_info gen_device_info_bxt_2x6 = { @@ -420,7 +432,8 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = { .max_tcs_entries = 128, .max_tes_entries = 208, .max_gs_entries = 128, - } + }, + .timebase_scale = 1000000000.0 / 19200123.0, }; /* * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+. diff --git a/src/intel/common/gen_device_info.h b/src/intel/common/gen_device_info.h index 10324e6..d17931e 100644 --- a/src/intel/common/gen_device_info.h +++ b/src/intel/common/gen_device_info.h @@ -142,6 +142,30 @@ struct gen_device_info unsigned max_tes_entries; unsigned max_gs_entries; } urb; + + /** + * For the longest time the timestamp frequency for Gen's timestamp counter + * could be assumed to be 12.5MHz, where the least significant bit neatly + * corresponded to 80 nanoseconds. + * + * Since Gen9 the numbers aren't so round, with a a frequency of 12MHz for + * SKL (or scale factor of 83.33333333) and a frequency of 19200123Hz for + * BXT. + * + * For simplicty to fit with the current code scaling by a single constant + * to map from raw timestamps to nanoseconds we now do the conversion in + * floating point instead of integer arithmetic. + * + * In general it's probably worth noting that the documented constants we + * have for the per-platform timestamp frequencies aren't perfect and + * shouldn't be trusted for scaling and comparing timestamps with a large + * delta. + * + * E.g. with crude testing on my system using the 'correct' scale factor I'm + * seeing a drift of ~2 milliseconds per second. + */ + double timebase_scale; + /** @} */ }; diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index af8ed2c..2f71f52 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -519,6 +519,21 @@ brw_initialize_context_constants(struct brw_context *brw) ctx->Const.MaxCombinedShaderOutputResources = MAX_IMAGE_UNITS + BRW_MAX_DRAW_BUFFERS; + /* The timestamp register we can read for glGetTimestamp() is + * sometimes only 32 bits, before scaling to nanoseconds (depending + * on kernel). + * + * Once scaled to nanoseconds the timestamp would roll over at a + * non-power-of-two, so an application couldn't use + * GL_QUERY_COUNTER_BITS to handle rollover correctly. Instead, we + * report 36 bits and truncate at that (rolling over 5 times as + * often as the HW counter), and when the 32-bit counter rolls + * over, it happens to also be at a rollover in the reported value + * from near (1<<36) to 0. + * + * The low 32 bits rolls over in ~343 seconds. Our 36-bit result + * rolls over every ~69 seconds. + */ ctx->Const.QueryCounterBits.Timestamp = 36; ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */ diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 5c64c2f..f2dcff5 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1349,6 +1349,9 @@ void brw_emit_query_begin(struct brw_context *brw); void brw_emit_query_end(struct brw_context *brw); void brw_query_counter(struct gl_context *ctx, struct gl_query_object *q); bool brw_is_query_pipelined(struct brw_query_object *query); +uint64_t brw_timebase_scale(struct brw_context *brw, uint64_t gpu_timestamp); +uint64_t brw_raw_timestamp_delta(struct brw_context *brw, + uint64_t time0, uint64_t time1); /** gen6_queryobj.c */ void gen6_init_queryobj_functions(struct dd_function_table *functions); diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c index dda17de..ebd5776 100644 --- a/src/mesa/drivers/dri/i965/brw_queryobj.c +++ b/src/mesa/drivers/dri/i965/brw_queryobj.c @@ -42,6 +42,37 @@ #include "brw_state.h" #include "intel_batchbuffer.h" +uint64_t +brw_timebase_scale(struct brw_context *brw, uint64_t gpu_timestamp) +{ + const struct gen_device_info *devinfo = &brw->screen->devinfo; + + return (double)gpu_timestamp * devinfo->timebase_scale; +} + +/* As best we know currently, the Gen HW timestamps are 36bits across + * all platforms, which we need to account for when calculating a + * delta to measure elapsed time. + * + * The timestamps read via glGetTimestamp() / brw_get_timestamp() sometimes + * only have 32bits due to a kernel bug and so in that case we make sure to + * treat all raw timestamps as 32bits so they overflow consistently and remain + * comparable. + */ +uint64_t +brw_raw_timestamp_delta(struct brw_context *brw, uint64_t time0, uint64_t time1) +{ + if (brw->screen->hw_has_timestamp == 2) { + /* Kernel clips timestamps to 32bits in this case */ + return (uint32_t)time1 - (uint32_t)time0; + } else { + if (time0 > time1) + return (1ULL << 36) + time1 - time0; + else + return time1 - time0; + } +} + /** * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. */ @@ -117,12 +148,18 @@ brw_queryobj_get_results(struct gl_context *ctx, /* The query BO contains the starting and ending timestamps. * Subtract the two and convert to nanoseconds. */ - query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32)); + query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]); + query->Base.Result = brw_timebase_scale(brw, query->Base.Result); break; case GL_TIMESTAMP: /* The query BO contains a single timestamp value in results[0]. */ - query->Base.Result = 1000 * (results[0] >> 32); + query->Base.Result = brw_timebase_scale(brw, results[0]); + + /* Ensure the scaled timestamp overflows according to + * GL_QUERY_COUNTER_BITS + */ + query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1; break; case GL_SAMPLES_PASSED_ARB: @@ -508,9 +545,15 @@ brw_get_timestamp(struct gl_context *ctx) break; } - /* See logic in brw_queryobj_get_results() */ - result *= 80; - result &= (1ull << 36) - 1; + /* Scale to nanosecond units */ + result = brw_timebase_scale(brw, result); + + /* Ensure the scaled timestamp overflows according to + * GL_QUERY_COUNTER_BITS. Technically this isn't required if + * querying GL_TIMESTAMP via glGetInteger but it seems best to keep + * QueryObject and GetInteger timestamps consistent. + */ + result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1; return result; } diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c index bbd3c44..3f6edf1 100644 --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c @@ -171,30 +171,18 @@ gen6_queryobj_get_results(struct gl_context *ctx, /* The query BO contains the starting and ending timestamps. * Subtract the two and convert to nanoseconds. */ - query->Base.Result += 80 * (results[1] - results[0]); + query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]); + query->Base.Result = brw_timebase_scale(brw, query->Base.Result); break; case GL_TIMESTAMP: - /* Our timer is a clock that increments every 80ns (regardless of - * other clock scaling in the system). The timestamp register we can - * read for glGetTimestamp() masks out the top 32 bits, so we do that - * here too to let the two counters be compared against each other. - * - * If we just multiplied that 32 bits of data by 80, it would roll - * over at a non-power-of-two, so an application couldn't use - * GL_QUERY_COUNTER_BITS to handle rollover correctly. Instead, we - * report 36 bits and truncate at that (rolling over 5 times as often - * as the HW counter), and when the 32-bit counter rolls over, it - * happens to also be at a rollover in the reported value from near - * (1<<36) to 0. - * - * The low 32 bits rolls over in ~343 seconds. Our 36-bit result - * rolls over every ~69 seconds. - * - * The query BO contains a single timestamp value in results[0]. + /* The query BO contains a single timestamp value in results[0]. */ + query->Base.Result = brw_timebase_scale(brw, results[0]); + + /* Ensure the scaled timestamp overflows according to + * GL_QUERY_COUNTER_BITS */ - query->Base.Result = 80 * (results[0] & 0xffffffff); - query->Base.Result &= (1ull << 36) - 1; + query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1; break; case GL_SAMPLES_PASSED_ARB: -- 2.10.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev