There is an ugly bug here: prefetches are skipped in the wait-for-idle path, because si_emit_all_states clears all the dirty bits before si_cache_flush_and_prefetch runs. Expect a v2...
Marek On Fri, Aug 4, 2017 at 12:05 PM, Marek Olšák <mar...@gmail.com> wrote: > From: Marek Olšák <marek.ol...@amd.com> > > Process new SET packets in parallel with previous draw calls. > > This decreases [CP busy / SPI busy] by a very tiny amount (verified with > GRBM perf counters), and probably increases FPS by a very tiny amount > for apps that do pipeline syncs often. > --- > src/gallium/drivers/radeonsi/si_state_draw.c | 54 > ++++++++++++++++++++++++---- > 1 file changed, 48 insertions(+), 6 deletions(-) > > diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c > b/src/gallium/drivers/radeonsi/si_state_draw.c > index ae48115..06a18c1 100644 > --- a/src/gallium/drivers/radeonsi/si_state_draw.c > +++ b/src/gallium/drivers/radeonsi/si_state_draw.c > @@ -1173,30 +1173,31 @@ static bool si_cache_flush_and_prefetch(struct > si_context *sctx) > */ > if (!si_upload_graphics_shader_descriptors(sctx)) > return false; > > if (sctx->prefetch_L2) > cik_emit_prefetch_L2(sctx); > > return true; > } > > -static void si_emit_all_states(struct si_context *sctx, const struct > pipe_draw_info *info) > +static void si_emit_all_states(struct si_context *sctx, const struct > pipe_draw_info *info, > + unsigned skip_atom_mask) > { > /* Emit state atoms. */ > - unsigned mask = sctx->dirty_atoms; > + unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; > while (mask) { > struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)]; > > atom->emit(&sctx->b, atom); > } > - sctx->dirty_atoms = 0; > + sctx->dirty_atoms &= skip_atom_mask; > > /* Emit states. */ > mask = sctx->dirty_states; > while (mask) { > unsigned i = u_bit_scan(&mask); > struct si_pm4_state *state = sctx->queued.array[i]; > > if (!state || sctx->emitted.array[i] == state) > continue; > > @@ -1384,23 +1385,64 @@ void si_draw_vbo(struct pipe_context *ctx, const > struct pipe_draw_info *info) > */ > if (!si_upload_vertex_buffer_descriptors(sctx)) > return; > > /* GFX9 scissor bug workaround. 
There is also a more efficient but > * more involved alternative workaround. */ > if (sctx->b.chip_class == GFX9 && > si_is_atom_dirty(sctx, &sctx->b.scissors.atom)) > sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; > > - if (!si_cache_flush_and_prefetch(sctx)) > - return; > - si_emit_all_states(sctx, info); > + /* Use an optimal packet order based on whether we need to sync the > pipeline. */ > + if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB | > + SI_CONTEXT_FLUSH_AND_INV_DB | > + SI_CONTEXT_PS_PARTIAL_FLUSH | > + SI_CONTEXT_CS_PARTIAL_FLUSH))) { > + /* If we have to wait for idle, set all states first, so that > all > + * SET packets are processed in parallel with previous draw > calls. > + * Sequence: > + * - process SET packets except SET_SH packets for shader > pointers > + * - flush caches and wait for previous draw calls > + * - start CE dumps (might already be ongoing if there is no > CE-DE barrier) > + * - start prefetches > + * - process SET_SH packets for shader pointers > + * - wait for CE dumps > + * - draw > + */ > + struct r600_atom *shader_pointers = > &sctx->shader_userdata.atom; > + > + /* Emit all states except shader pointers. */ > + si_emit_all_states(sctx, info, 1 << shader_pointers->id); > + > + if (!si_cache_flush_and_prefetch(sctx)) > + return; > + > + /* Set shader pointers last. */ > + if (si_is_atom_dirty(sctx, shader_pointers)) { > + shader_pointers->emit(&sctx->b, NULL); > + sctx->dirty_atoms = 0; > + } > + } else { > + /* If we don't wait for idle, do CE dumps and start prefetches > + * first, so that they are being done in parallel with all SET > + * packets. 
Sequence: > + * - flush caches > + * - start CE dumps (might already be ongoing if CE is ahead) > + * - start prefetches > + * - process SET packets > + * - wait for CE dumps > + * - draw > + */ > + if (!si_cache_flush_and_prefetch(sctx)) > + return; > + si_emit_all_states(sctx, info, 0); > + } > > si_ce_pre_draw_synchronization(sctx); > si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); > si_ce_post_draw_synchronization(sctx); > > if (sctx->trace_buf) > si_trace_emit(sctx); > > /* Workaround for a VGT hang when streamout is enabled. > * It must be done after drawing. */ > -- > 2.7.4 > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev