We can use our fence tracking mechanism for fine-grained waiting on results.
Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> --- src/mesa/drivers/dri/i965/brw_conditional_render.c | 4 +- src/mesa/drivers/dri/i965/brw_context.c | 2 + src/mesa/drivers/dri/i965/brw_context.h | 10 +- src/mesa/drivers/dri/i965/brw_queryobj.c | 17 +-- src/mesa/drivers/dri/i965/gen6_queryobj.c | 132 +++++++++++++-------- src/mesa/drivers/dri/i965/hsw_queryobj.c | 16 +-- 6 files changed, 115 insertions(+), 66 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_conditional_render.c b/src/mesa/drivers/dri/i965/brw_conditional_render.c index 59f12d5df3..2c73697d5b 100644 --- a/src/mesa/drivers/dri/i965/brw_conditional_render.c +++ b/src/mesa/drivers/dri/i965/brw_conditional_render.c @@ -70,13 +70,13 @@ set_predicate_for_result(struct brw_context *brw, query->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, /* write domain */ - 0 /* offset */); + 8*query->index /* offset */); brw_load_register_mem64(brw, MI_PREDICATE_SRC1, query->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, /* write domain */ - 8 /* offset */); + 8*(query->index+1) /* offset */); if (inverted) load_op = MI_PREDICATE_LOADOP_LOAD; diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 541c2885fe..15e467b00a 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -998,6 +998,7 @@ brwCreateContext(gl_api api, brw->has_swizzling = screen->hw_has_swizzling; isl_device_init(&brw->isl_dev, devinfo, screen->hw_has_swizzling); + brw->query.last_index = 4096; brw->vs.base.stage = MESA_SHADER_VERTEX; brw->tcs.base.stage = MESA_SHADER_TESS_CTRL; @@ -1153,6 +1154,7 @@ intelDestroyContext(__DRIcontext * driContextPriv) brw_destroy_state(brw); brw_draw_destroy(brw); + brw_bo_put(brw->query.bo); brw_bo_put(brw->curbe.curbe_bo); brw_bo_put(brw->vs.base.scratch_bo); brw_bo_put(brw->tcs.base.scratch_bo); diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 0152f0a482..5e2df95508 100644 --- 
a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -542,11 +542,15 @@ struct brw_vertex_element { struct brw_query_object { struct gl_query_object Base; + struct brw_fence fence; + /** Last query BO associated with this query. */ brw_bo *bo; /** Last index in bo with query data for this object. */ - int last_index; + unsigned index; + + uint64_t *results; }; #define MAX_GS_INPUT_VERTICES 6 @@ -1093,6 +1097,10 @@ struct brw_context } cc; struct { + brw_bo *bo; + uint64_t *map; + unsigned last_index; + struct brw_query_object *obj; bool begin_emitted; } query; diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c index 34f55152ae..70c5d57f0a 100644 --- a/src/mesa/drivers/dri/i965/brw_queryobj.c +++ b/src/mesa/drivers/dri/i965/brw_queryobj.c @@ -125,7 +125,7 @@ brw_queryobj_get_results(struct gl_context *ctx, * run out of space in the query's BO and allocated a new one. If so, * this function was already called to accumulate the results so far. */ - for (i = 0; i < query->last_index; i++) { + for (i = 0; i < query->index; i++) { query->Base.Result += results[i * 2 + 1] - results[i * 2]; } break; @@ -135,7 +135,7 @@ brw_queryobj_get_results(struct gl_context *ctx, /* If the starting and ending PS_DEPTH_COUNT from any of the batches * differ, then some fragments passed the depth test. 
*/ - for (i = 0; i < query->last_index; i++) { + for (i = 0; i < query->index; i++) { if (results[i * 2 + 1] != results[i * 2]) { query->Base.Result = GL_TRUE; break; @@ -182,6 +182,7 @@ brw_delete_query(struct gl_context *ctx, struct gl_query_object *q) { struct brw_query_object *query = (struct brw_query_object *)q; + brw_fence_finish(&query->fence); brw_bo_put(query->bo); free(query); } @@ -242,7 +243,7 @@ brw_begin_query(struct gl_context *ctx, struct gl_query_object *q) */ brw_bo_put(query->bo); query->bo = NULL; - query->last_index = -1; + query->index = -1; brw->query.obj = query; @@ -379,7 +380,7 @@ ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query) assert(brw->gen < 6); - if (!query->bo || query->last_index * 2 + 1 >= 4096 / sizeof(uint64_t)) { + if (!query->bo || query->index * 2 + 1 >= 4096 / sizeof(uint64_t)) { if (query->bo != NULL) { /* The old query BO did not have enough space, so we allocated a new @@ -391,7 +392,7 @@ ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query) query->bo = brw_bo_create(&brw->batch, "query", 4096, 0, BO_ALLOC_FOR_RENDER); - query->last_index = 0; + query->index = 0; } } @@ -432,7 +433,7 @@ brw_emit_query_begin(struct brw_context *brw) ensure_bo_has_space(ctx, query); - brw_write_depth_count(brw, query->bo, query->last_index * 2); + brw_write_depth_count(brw, query->bo, query->index * 2); brw->query.begin_emitted = true; } @@ -454,10 +455,10 @@ brw_emit_query_end(struct brw_context *brw) if (!brw->query.begin_emitted) return; - brw_write_depth_count(brw, query->bo, query->last_index * 2 + 1); + brw_write_depth_count(brw, query->bo, query->index * 2 + 1); brw->query.begin_emitted = false; - query->last_index++; + query->index++; } /** diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c index 508398c8a2..18030090be 100644 --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c @@ -70,7 +70,8 
@@ set_query_availability(struct brw_context *brw, struct brw_query_object *query, flags |= PIPE_CONTROL_CS_STALL; brw_emit_pipe_control_write(brw, flags, - query->bo, 2 * sizeof(uint64_t), + query->bo, + (query->index + 2) * sizeof(uint64_t), available); } } @@ -168,12 +169,8 @@ gen6_queryobj_get_results(struct gl_context *ctx, struct brw_query_object *query) { struct brw_context *brw = brw_context(ctx); + uint64_t *results = query->results; - if (query->bo == NULL) - return; - - uint64_t *results = - brw_bo_map(query->bo, MAP_READ, PERF_DEBUG(brw, "GetQuery")); switch (query->Base.Target) { case GL_TIME_ELAPSED: /* The query BO contains the starting and ending timestamps. @@ -256,12 +253,46 @@ gen6_queryobj_get_results(struct gl_context *ctx, /* Now that we've processed the data stored in the query's buffer object, * we can release it. */ + brw_fence_finish(&query->fence); brw_bo_put(query->bo); query->bo = NULL; query->Base.Ready = true; } +static int gen6_alloc_query(struct brw_context *brw, + struct brw_query_object *query) +{ + int idx; + + brw_fence_finish(&query->fence); + brw_bo_put(query->bo); + + if (brw->query.last_index > 4096/sizeof(uint64_t) - 4) { + brw_bo_put(brw->query.bo); + brw->query.bo = brw_bo_create(&brw->batch, "query results", + 4096, 0, BO_ALLOC_FOR_RENDER); + brw_bo_enable_snoop(brw->query.bo); + brw->query.map = + brw_bo_map(brw->query.bo, + MAP_READ | MAP_PERSISTENT | MAP_ASYNC, + NULL); + brw->query.last_index = 0; + } + + idx = brw->query.last_index; + brw->query.last_index += 2; + if (brw->ctx.Extensions.ARB_query_buffer_object && + brw_is_query_pipelined(query)) + brw->query.last_index += 2; + + query->bo = brw_bo_get(brw->query.bo); + query->index = idx; + query->results = brw->query.map + idx; + + return idx; +} + /** * Driver hook for glBeginQuery(). 
* @@ -273,11 +304,7 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q) { struct brw_context *brw = brw_context(ctx); struct brw_query_object *query = (struct brw_query_object *)q; - - /* Since we're starting a new query, we need to throw away old results. */ - brw_bo_put(query->bo); - query->bo = brw_bo_create(&brw->batch, "query results", - 4096, 4096, BO_ALLOC_FOR_RENDER); + int idx = gen6_alloc_query(brw, query); if (brw_batch_begin(&brw->batch, 120, RENDER_RING) < 0) return; @@ -306,23 +333,23 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q) * obtain the time elapsed. Notably, this includes time elapsed while * the system was doing other work, such as running other applications. */ - brw_write_timestamp(brw, query->bo, 0); + brw_write_timestamp(brw, query->bo, idx); break; case GL_ANY_SAMPLES_PASSED: case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: case GL_SAMPLES_PASSED_ARB: - brw_write_depth_count(brw, query->bo, 0); + brw_write_depth_count(brw, query->bo, idx); break; case GL_PRIMITIVES_GENERATED: - write_primitives_generated(brw, query->bo, query->Base.Stream, 0); + write_primitives_generated(brw, query->bo, query->Base.Stream, idx); if (query->Base.Stream == 0) ctx->NewDriverState |= BRW_NEW_RASTERIZER_DISCARD; break; case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: - write_xfb_primitives_written(brw, query->bo, query->Base.Stream, 0); + write_xfb_primitives_written(brw, query->bo, query->Base.Stream, idx); break; case GL_VERTICES_SUBMITTED_ARB: @@ -336,7 +363,7 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q) case GL_COMPUTE_SHADER_INVOCATIONS_ARB: case GL_TESS_CONTROL_SHADER_PATCHES_ARB: case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB: - emit_pipeline_stat(brw, query->bo, query->Base.Stream, query->Base.Target, 0); + emit_pipeline_stat(brw, query->bo, query->Base.Stream, query->Base.Target, idx); break; default: @@ -359,6 +386,7 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q) { 
struct brw_context *brw = brw_context(ctx); struct brw_query_object *query = (struct brw_query_object *)q; + int idx = query->index + 1; if (brw_batch_begin(&brw->batch, 120, RENDER_RING) < 0) { query->Base.Ready = true; /* failed to submit query, return garbage */ @@ -369,23 +397,23 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q) switch (query->Base.Target) { case GL_TIME_ELAPSED: - brw_write_timestamp(brw, query->bo, 1); + brw_write_timestamp(brw, query->bo, idx); break; case GL_ANY_SAMPLES_PASSED: case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: case GL_SAMPLES_PASSED_ARB: - brw_write_depth_count(brw, query->bo, 1); + brw_write_depth_count(brw, query->bo, idx); break; case GL_PRIMITIVES_GENERATED: - write_primitives_generated(brw, query->bo, query->Base.Stream, 1); + write_primitives_generated(brw, query->bo, query->Base.Stream, idx); if (query->Base.Stream == 0) ctx->NewDriverState |= BRW_NEW_RASTERIZER_DISCARD; break; case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: - write_xfb_primitives_written(brw, query->bo, query->Base.Stream, 1); + write_xfb_primitives_written(brw, query->bo, query->Base.Stream, idx); break; case GL_VERTICES_SUBMITTED_ARB: @@ -400,19 +428,47 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q) case GL_TESS_CONTROL_SHADER_PATCHES_ARB: case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB: emit_pipeline_stat(brw, query->bo, - query->Base.Stream, query->Base.Target, 1); + query->Base.Stream, query->Base.Target, idx); break; default: unreachable("Unrecognized query target in brw_end_query()"); } - /* The current batch contains the commands to handle EndQuery(), - * but they won't actually execute until it is flushed. - */ + /* For ARB_query_buffer_object: The result is now available */ + set_query_availability(brw, query, true); + brw_batch_insert_fence(&brw->batch, &query->fence, 0); + + brw_batch_end(&brw->batch); +} + +/** + * Driver hook for glQueryCounter(). 
+ * + * This handles GL_TIMESTAMP queries, which perform a pipelined read of the + * current GPU time. This is unlike GL_TIME_ELAPSED, which measures the + * time while the query is active. + */ +static void +gen6_query_counter(struct gl_context *ctx, struct gl_query_object *q) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_query_object *query = (struct brw_query_object *) q; + int idx = gen6_alloc_query(brw, query); + + assert(q->Target == GL_TIMESTAMP); + if (brw_batch_begin(&brw->batch, 60, RENDER_RING) < 0) + return; + + brw_write_timestamp(brw, query->bo, idx); /* For ARB_query_buffer_object: The result is now available */ set_query_availability(brw, query, true); + brw_batch_insert_fence(&brw->batch, &query->fence, 0); + + /* The current batch contains the commands to handle QueryCounter(), + * but they won't actually execute until it is flushed. + */ brw_batch_end(&brw->batch); } @@ -425,12 +481,14 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q) */ static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q) { + struct brw_context *brw = brw_context(ctx); struct brw_query_object *query = (struct brw_query_object *)q; /* If the application has requested the query result, but this batch is * still contributing to it, flush it now to finish that work so the * result will become available (eventually). */ + brw_fence_wait(&query->fence, -1, PERF_DEBUG(brw, "GetQuery")); gen6_queryobj_get_results(ctx, query); }
- */ - if (query->bo == NULL) - return; - /* From the GL_ARB_occlusion_query spec: * * "Instead of allowing for an infinite loop, performing a @@ -457,25 +510,10 @@ static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q) * not ready yet on the first time it is queried. This ensures that * the async query will return true in finite time. */ - if (!brw_bo_busy(query->bo, BUSY_READ | BUSY_FLUSH, - PERF_DEBUG(brw_context(ctx), "CheckQuery"))) { - gen6_queryobj_get_results(ctx, query); - } -} - -static void -gen6_query_counter(struct gl_context *ctx, struct gl_query_object *q) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_query_object *query = (struct brw_query_object *)q; - - brw_query_counter(ctx, q); - - if (brw_batch_begin(&brw->batch, 120, RENDER_RING) < 0) + if (brw_fence_busy(&query->fence, PERF_DEBUG(brw, "CheckQuery"))) return; - set_query_availability(brw, query, true); - brw_batch_end(&brw->batch); + gen6_queryobj_get_results(ctx, query); } /* Initialize Gen6+-specific query object functions. 
*/ diff --git a/src/mesa/drivers/dri/i965/hsw_queryobj.c b/src/mesa/drivers/dri/i965/hsw_queryobj.c index 47ffb881de..357e57bd9b 100644 --- a/src/mesa/drivers/dri/i965/hsw_queryobj.c +++ b/src/mesa/drivers/dri/i965/hsw_queryobj.c @@ -203,7 +203,7 @@ hsw_result_to_gpr0(struct gl_context *ctx, struct brw_query_object *query, query->bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - 2 * sizeof(uint64_t)); + (query->index + 2) * sizeof(uint64_t)); return; } @@ -222,20 +222,20 @@ hsw_result_to_gpr0(struct gl_context *ctx, struct brw_query_object *query, query->bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - 0 * sizeof(uint64_t)); + (query->index + 0) * sizeof(uint64_t)); } else { brw_load_register_mem64(brw, HSW_CS_GPR(1), query->bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - 0 * sizeof(uint64_t)); + (query->index + 0) * sizeof(uint64_t)); brw_load_register_mem64(brw, HSW_CS_GPR(2), query->bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, - 1 * sizeof(uint64_t)); + (query->index + 1) * sizeof(uint64_t)); BEGIN_BATCH(5); OUT_BATCH(HSW_MI_MATH | (5 - 2)); @@ -299,14 +299,14 @@ store_query_result_imm(struct brw_context *brw, brw_bo *bo, } static void -set_predicate(struct brw_context *brw, brw_bo *query_bo) +set_predicate(struct brw_context *brw, struct brw_query_object *query) { brw_load_register_imm64(brw, MI_PREDICATE_SRC1, 0ull); /* Load query availability into SRC0 */ - brw_load_register_mem64(brw, MI_PREDICATE_SRC0, query_bo, + brw_load_register_mem64(brw, MI_PREDICATE_SRC0, query->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, - 2 * sizeof(uint64_t)); + (query->index + 2) * sizeof(uint64_t)); /* predicate = !(query_availability == 0); */ BEGIN_BATCH(1); @@ -376,7 +376,7 @@ hsw_store_query_result(struct gl_context *ctx, struct gl_query_object *q, */ hsw_result_to_gpr0(ctx, query, buf, offset, pname, ptype); if (pipelined) - set_predicate(brw, query->bo); + set_predicate(brw, query); store_query_result_reg(brw, 
bo->buffer, offset, ptype, HSW_CS_GPR(0), pipelined); } else { -- 2.11.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev