From: Nicolai Hähnle <nicolai.haeh...@amd.com>

The render condition data written by the query workaround is read when the render_cond_atom is emitted, so we must delay emitting the atom until after the flush.
Fixes: 0fe0320dc074 ("radeonsi: use optimal packet order when doing a pipeline sync") --- src/gallium/drivers/radeon/r600_pipe_common.h | 3 ++- src/gallium/drivers/radeon/r600_query.c | 9 ++++++--- src/gallium/drivers/radeonsi/si_state_draw.c | 15 ++++++++++----- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index dca56734cd7..f78e38b65af 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -54,21 +54,22 @@ struct u_log_context; #define R600_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) #define R600_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) #define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) #define R600_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3) #define R600_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4) #define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0) /* Pipeline & streamout query controls. */ #define R600_CONTEXT_START_PIPELINE_STATS (1u << 1) #define R600_CONTEXT_STOP_PIPELINE_STATS (1u << 2) -#define R600_CONTEXT_PRIVATE_FLAG (1u << 3) +#define R600_CONTEXT_FLUSH_FOR_RENDER_COND (1u << 3) +#define R600_CONTEXT_PRIVATE_FLAG (1u << 4) /* special primitive types */ #define R600_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX #define R600_NOT_QUERY 0xffffffff /* Debug flags. */ /* logging and features */ #define DBG_TEX (1 << 0) #define DBG_NIR (1 << 1) diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c index f937612bc1f..03ff1018a71 100644 --- a/src/gallium/drivers/radeon/r600_query.c +++ b/src/gallium/drivers/radeon/r600_query.c @@ -1828,25 +1828,28 @@ static void r600_render_condition(struct pipe_context *ctx, * from launching the compute grid. 
*/ rctx->render_cond = NULL; ctx->get_query_result_resource( ctx, query, true, PIPE_QUERY_TYPE_U64, 0, &rquery->workaround_buf->b.b, rquery->workaround_offset); /* Settings this in the render cond atom is too late, * so set it here. */ - rctx->flags |= rctx->screen->barrier_flags.L2_to_cp; - - atom->num_dw = 5; + rctx->flags |= rctx->screen->barrier_flags.L2_to_cp | + R600_CONTEXT_FLUSH_FOR_RENDER_COND; rctx->render_cond_force_off = old_force_off; + } + + if (needs_workaround) { + atom->num_dw = 5; } else { for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous) atom->num_dw += (qbuf->results_end / rquery->result_size) * 5; if (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) atom->num_dw *= R600_MAX_STREAMS; } } rctx->render_cond = query; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 1d8be49a480..81751d2186e 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1385,34 +1385,39 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH))) { /* If we have to wait for idle, set all states first, so that all * SET packets are processed in parallel with previous draw calls. * Then upload descriptors, set shader pointers, and draw, and * prefetch at the end. This ensures that the time the CUs * are idle is very short. (there are only SET_SH packets between * the wait and the draw) */ struct r600_atom *shader_pointers = &sctx->shader_pointers.atom; + unsigned masked_atoms = 1u << shader_pointers->id; - /* Emit all states except shader pointers. */ - si_emit_all_states(sctx, info, 1 << shader_pointers->id); + if (unlikely(sctx->b.flags & R600_CONTEXT_FLUSH_FOR_RENDER_COND)) + masked_atoms |= 1u << sctx->b.render_cond_atom.id; + + /* Emit all states except shader pointers and render condition. 
*/ + si_emit_all_states(sctx, info, masked_atoms); si_emit_cache_flush(sctx); /* <-- CUs are idle here. */ if (!si_upload_graphics_shader_descriptors(sctx)) return; /* Set shader pointers after descriptors are uploaded. */ - if (si_is_atom_dirty(sctx, shader_pointers)) { + if (si_is_atom_dirty(sctx, shader_pointers)) shader_pointers->emit(&sctx->b, NULL); - sctx->dirty_atoms = 0; - } + if (si_is_atom_dirty(sctx, &sctx->b.render_cond_atom)) + sctx->b.render_cond_atom.emit(&sctx->b, NULL); + sctx->dirty_atoms = 0; si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); /* <-- CUs are busy here. */ /* Start prefetches after the draw has been started. Both will run * in parallel, but starting the draw first is more important. */ if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) cik_emit_prefetch_L2(sctx); } else { -- 2.11.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev