From: Marek Olšák <marek.ol...@amd.com>

The rework moves the workaround logic out of si_emit_all_states and into
si_draw_vbo, which is needed so that the context roll caused by ACQUIRE_MEM
can also be covered by the workaround.
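In rough outline, the new flow is: si_draw_vbo zeroes context_roll_counter
before any non-draw packets are emitted, every path that rolls the context
(rasterizer/tess state, the state atoms, ACQUIRE_MEM in si_emit_surface_sync)
increments it, and the scissors are re-emitted right before the draw packets
whenever the counter is non-zero. The standalone sketch below models only
that control flow; the struct, the emit stubs and MAX_VIEWPORTS are
simplified placeholders rather than radeonsi code, and only the
context_roll_counter / handle_scissor_bug handling mirrors the patch.

/* Standalone model of the reworked workaround flow; not driver code. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_VIEWPORTS 16

struct ctx {
	bool has_graphics;
	bool has_gfx9_scissor_bug;
	unsigned context_roll_counter;	/* > 0 => the context rolled */
	unsigned scissors_dirty_mask;
};

/* Stand-in for si_emit_surface_sync: ACQUIRE_MEM has an implicit context
 * roll, so it now bumps the counter on a graphics context. */
static void emit_surface_sync(struct ctx *c)
{
	if (c->has_graphics)
		c->context_roll_counter++;
}

/* Stand-in for si_emit_all_states: state emission may roll the context. */
static void emit_all_states(struct ctx *c, bool rolls_context)
{
	if (rolls_context)
		c->context_roll_counter++;
}

/* Stand-in for si_draw_vbo: reset the counter before any packets, then
 * re-emit the scissors if anything rolled the context. */
static void draw(struct ctx *c, bool state_rolls_context)
{
	bool handle_scissor_bug = c->has_gfx9_scissor_bug;

	c->context_roll_counter = 0;

	emit_all_states(c, state_rolls_context);
	emit_surface_sync(c);	/* cache-flush path; now counted as a roll */

	if (handle_scissor_bug && c->context_roll_counter) {
		c->scissors_dirty_mask = (1u << MAX_VIEWPORTS) - 1;
		printf("re-emitting PA_SC_VPORT_SCISSOR (mask 0x%x)\n",
		       c->scissors_dirty_mask);
	}
	/* the draw packets would follow here */
}

int main(void)
{
	struct ctx c = { .has_graphics = true, .has_gfx9_scissor_bug = true };

	draw(&c, false);	/* surface sync alone now triggers the workaround */
	draw(&c, true);		/* a state-induced roll triggers it too */
	return 0;
}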
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110355
Cc: 19.0 <mesa-sta...@lists.freedesktop.org>
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 60 +++++++++++++++++-----------
 1 file changed, 36 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index e2fba41..b673c2f 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -889,20 +889,25 @@ static void si_emit_surface_sync(struct si_context *sctx,
 		radeon_emit(cs, 0);		/* CP_COHER_BASE_HI */
 		radeon_emit(cs, 0x0000000A);	/* POLL_INTERVAL */
 	} else {
 		/* ACQUIRE_MEM is only required on a compute ring. */
 		radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
 		radeon_emit(cs, cp_coher_cntl);	/* CP_COHER_CNTL */
 		radeon_emit(cs, 0xffffffff);	/* CP_COHER_SIZE */
 		radeon_emit(cs, 0);		/* CP_COHER_BASE */
 		radeon_emit(cs, 0x0000000A);	/* POLL_INTERVAL */
 	}
+
+	/* ACQUIRE_MEM has an implicit context roll if the current context
+	 * is busy. */
+	if (sctx->has_graphics)
+		sctx->context_roll_counter++;
 }
 
 void si_emit_cache_flush(struct si_context *sctx)
 {
 	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	uint32_t flags = sctx->flags;
 
 	if (!sctx->has_graphics) {
 		/* Only process compute flags. */
 		flags &= SI_CONTEXT_INV_ICACHE |
@@ -1216,40 +1221,25 @@ static void si_get_draw_start_count(struct si_context *sctx,
 	} else {
 		*start = info->start;
 		*count = info->count;
 	}
 }
 
 static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
 			       unsigned skip_atom_mask)
 {
 	unsigned num_patches = 0;
-	/* Vega10/Raven scissor bug workaround. When any context register is
-	 * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
-	 * registers must be written too.
-	 */
-	bool handle_scissor_bug = sctx->screen->has_gfx9_scissor_bug &&
-				  !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors);
-	bool context_roll = false; /* set correctly for GFX9 only */
 
-	context_roll |= si_emit_rasterizer_prim_state(sctx);
+	sctx->context_roll_counter |= si_emit_rasterizer_prim_state(sctx);
 	if (sctx->tes_shader.cso)
-		context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches);
-
-	if (handle_scissor_bug &&
-	    (info->count_from_stream_output ||
-	     sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
-	     sctx->dirty_states & si_states_that_always_roll_context() ||
-	     si_prim_restart_index_changed(sctx, info)))
-		context_roll = true;
-
-	sctx->context_roll_counter = 0;
+		sctx->context_roll_counter |=
+			si_emit_derived_tess_state(sctx, info, &num_patches);
 
 	/* Emit state atoms. */
 	unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
 	while (mask)
 		sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
 
 	sctx->dirty_atoms &= skip_atom_mask;
 
 	/* Emit states. */
 	mask = sctx->dirty_states;
@@ -1258,26 +1248,20 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
 		struct si_pm4_state *state = sctx->queued.array[i];
 
 		if (!state || sctx->emitted.array[i] == state)
 			continue;
 
 		si_pm4_emit(sctx, state);
 		sctx->emitted.array[i] = state;
 	}
 	sctx->dirty_states = 0;
 
-	if (handle_scissor_bug &&
-	    (context_roll || sctx->context_roll_counter)) {
-		sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-		sctx->atoms.s.scissors.emit(sctx);
-	}
-
 	/* Emit draw states. */
 	si_emit_vs_state(sctx, info);
 	si_emit_draw_registers(sctx, info, num_patches);
 }
 
 static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	struct pipe_resource *indexbuf = info->index.resource;
@@ -1462,20 +1446,37 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 
 	si_need_gfx_cs_space(sctx);
 
 	/* Since we've called si_context_add_resource_size for vertex buffers,
 	 * this must be called after si_need_cs_space, because we must let
 	 * need_cs_space flush before we add buffers to the buffer list.
 	 */
 	if (!si_upload_vertex_buffer_descriptors(sctx))
 		goto return_cleanup;
 
+	/* Vega10/Raven scissor bug workaround. When any context register is
+	 * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
+	 * registers must be written too.
+	 */
+	bool handle_scissor_bug = sctx->screen->has_gfx9_scissor_bug &&
+				  !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors);
+
+	/* If this is > 0 after all the non-draw packets, a context roll occurred. */
+	sctx->context_roll_counter = 0;
+
+	if (handle_scissor_bug &&
+	    (info->count_from_stream_output ||
+	     sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
+	     sctx->dirty_states & si_states_that_always_roll_context() ||
+	     si_prim_restart_index_changed(sctx, info)))
+		sctx->context_roll_counter++;
+
 	/* Use optimal packet order based on whether we need to sync the pipeline. */
 	if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
 				    SI_CONTEXT_FLUSH_AND_INV_DB |
 				    SI_CONTEXT_PS_PARTIAL_FLUSH |
 				    SI_CONTEXT_CS_PARTIAL_FLUSH))) {
 		/* If we have to wait for idle, set all states first, so that all
 		 * SET packets are processed in parallel with previous draw calls.
 		 * Then draw and prefetch at the end. This ensures that the time
 		 * the CUs are idle is very short.
 		 */
@@ -1489,20 +1490,25 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 
 		/* Emit all states except possibly render condition. */
 		si_emit_all_states(sctx, info, masked_atoms);
 		si_emit_cache_flush(sctx);
 		/* <-- CUs are idle here. */
 
 		if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
 			sctx->atoms.s.render_cond.emit(sctx);
 		sctx->dirty_atoms = 0;
 
+		if (handle_scissor_bug && sctx->context_roll_counter) {
+			sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+			sctx->atoms.s.scissors.emit(sctx);
+		}
+
 		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
 		/* <-- CUs are busy here. */
 
 		/* Start prefetches after the draw has been started. Both will run
 		 * in parallel, but starting the draw first is more important.
 		 */
 		if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
 			cik_emit_prefetch_L2(sctx, false);
 	} else {
 		/* If we don't wait for idle, start prefetches first, then set
@@ -1512,20 +1518,26 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 		si_emit_cache_flush(sctx);
 
 		/* Only prefetch the API VS and VBO descriptors. */
 		if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
 			cik_emit_prefetch_L2(sctx, true);
 
 		if (!si_upload_graphics_shader_descriptors(sctx))
 			return;
 
 		si_emit_all_states(sctx, info, 0);
+
+		if (handle_scissor_bug && sctx->context_roll_counter) {
+			sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+			sctx->atoms.s.scissors.emit(sctx);
+		}
+
 		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
 
 		/* Prefetch the remaining shaders after the draw has been
 		 * started. */
 		if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
 			cik_emit_prefetch_L2(sctx, false);
 	}
 
 	if (unlikely(sctx->current_saved_cs)) {
 		si_trace_emit(sctx);
-- 
2.7.4