On Wed, Apr 4, 2018, 6:07 AM Samuel Pitoiset <samuel.pitoi...@gmail.com> wrote:
> > > On 04/04/2018 03:59 AM, Marek Olšák wrote: > > From: Marek Olšák <marek.ol...@amd.com> > > > > so that the draw is started as soon as possible. > > --- > > src/gallium/drivers/radeonsi/si_cp_dma.c | 68 > ++++++++++++++++++---------- > > src/gallium/drivers/radeonsi/si_pipe.h | 2 +- > > src/gallium/drivers/radeonsi/si_state_draw.c | 11 ++++- > > src/util/bitscan.h | 8 ++++ > > 4 files changed, 61 insertions(+), 28 deletions(-) > > > > diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c > b/src/gallium/drivers/radeonsi/si_cp_dma.c > > index 15bd305a350..ea2c7cf7198 100644 > > --- a/src/gallium/drivers/radeonsi/si_cp_dma.c > > +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c > > @@ -514,80 +514,98 @@ static void cik_prefetch_shader_async(struct > si_context *sctx, > > static void cik_prefetch_VBO_descriptors(struct si_context *sctx) > > { > > if (!sctx->vertex_elements) > > return; > > > > cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, > > sctx->vb_descriptors_offset, > > > sctx->vertex_elements->desc_list_byte_size); > > } > > > > -void cik_emit_prefetch_L2(struct si_context *sctx) > > +/** > > + * Prefetch shaders and VBO descriptors. > > + * > > + * \param first_two Whether only the first 2 items should be > prefetched, > > + * which are usually the API VS and VBO descriptors. > > + */ > > +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two) > > { > > + unsigned mask; > > + > > + assert(sctx->prefetch_L2_mask); > > + > > + if (first_two) { > > + mask = 1 << u_bit_scan16(&sctx->prefetch_L2_mask); > > + > > + if (sctx->prefetch_L2_mask) > > + mask |= 1 << u_bit_scan16(&sctx->prefetch_L2_mask); > > Where do you reset the prefetch L2 mask? It looks to me like you > are going to prefetch VS/VBOs twice in the fast draw path. > u_bit_scan16 clears the returned bit. Marek > + } else { > > + mask = sctx->prefetch_L2_mask; > > + sctx->prefetch_L2_mask = 0; > > + } > > + > > /* Prefetch shaders and VBO descriptors to TC L2. 
*/ > > if (sctx->b.chip_class >= GFX9) { > > /* Choose the right spot for the VBO prefetch. */ > > if (sctx->tes_shader.cso) { > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) > > + if (mask & SI_PREFETCH_HS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.hs); > > - if (sctx->prefetch_L2_mask & > SI_PREFETCH_VBO_DESCRIPTORS) > > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) > > cik_prefetch_VBO_descriptors(sctx); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) > > + if (mask & SI_PREFETCH_GS) > > cik_prefetch_shader_async(sctx, sctx-> > queued.named.gs); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) > > + if (mask & SI_PREFETCH_VS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.vs); > > } else if (sctx->gs_shader.cso) { > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) > > + if (mask & SI_PREFETCH_GS) > > cik_prefetch_shader_async(sctx, sctx-> > queued.named.gs); > > - if (sctx->prefetch_L2_mask & > SI_PREFETCH_VBO_DESCRIPTORS) > > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) > > cik_prefetch_VBO_descriptors(sctx); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) > > + if (mask & SI_PREFETCH_VS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.vs); > > } else { > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) > > + if (mask & SI_PREFETCH_VS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.vs); > > - if (sctx->prefetch_L2_mask & > SI_PREFETCH_VBO_DESCRIPTORS) > > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) > > cik_prefetch_VBO_descriptors(sctx); > > } > > } else { > > /* SI-CI-VI */ > > /* Choose the right spot for the VBO prefetch. 
*/ > > if (sctx->tes_shader.cso) { > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_LS) > > + if (mask & SI_PREFETCH_LS) > > cik_prefetch_shader_async(sctx, sctx-> > queued.named.ls); > > - if (sctx->prefetch_L2_mask & > SI_PREFETCH_VBO_DESCRIPTORS) > > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) > > cik_prefetch_VBO_descriptors(sctx); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) > > + if (mask & SI_PREFETCH_HS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.hs); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES) > > + if (mask & SI_PREFETCH_ES) > > cik_prefetch_shader_async(sctx, sctx-> > queued.named.es); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) > > + if (mask & SI_PREFETCH_GS) > > cik_prefetch_shader_async(sctx, sctx-> > queued.named.gs); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) > > + if (mask & SI_PREFETCH_VS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.vs); > > } else if (sctx->gs_shader.cso) { > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES) > > + if (mask & SI_PREFETCH_ES) > > cik_prefetch_shader_async(sctx, sctx-> > queued.named.es); > > - if (sctx->prefetch_L2_mask & > SI_PREFETCH_VBO_DESCRIPTORS) > > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) > > cik_prefetch_VBO_descriptors(sctx); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) > > + if (mask & SI_PREFETCH_GS) > > cik_prefetch_shader_async(sctx, sctx-> > queued.named.gs); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) > > + if (mask & SI_PREFETCH_VS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.vs); > > } else { > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) > > + if (mask & SI_PREFETCH_VS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.vs); > > - if (sctx->prefetch_L2_mask & > SI_PREFETCH_VBO_DESCRIPTORS) > > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) > > cik_prefetch_VBO_descriptors(sctx); > > } > > } > > > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_PS) > > + if (mask & SI_PREFETCH_PS) > > cik_prefetch_shader_async(sctx, 
sctx->queued.named.ps); > > - > > - sctx->prefetch_L2_mask = 0; > > } > > > > void si_init_cp_dma_functions(struct si_context *sctx) > > { > > sctx->b.b.clear_buffer = si_pipe_clear_buffer; > > } > > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h > b/src/gallium/drivers/radeonsi/si_pipe.h > > index bb1aebdda42..62641fde5e3 100644 > > --- a/src/gallium/drivers/radeonsi/si_pipe.h > > +++ b/src/gallium/drivers/radeonsi/si_pipe.h > > @@ -688,21 +688,21 @@ enum r600_coherency { > > > > void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource > *dst, > > uint64_t offset, uint64_t size, unsigned value, > > enum r600_coherency coher); > > void si_copy_buffer(struct si_context *sctx, > > struct pipe_resource *dst, struct pipe_resource *src, > > uint64_t dst_offset, uint64_t src_offset, unsigned > size, > > unsigned user_flags); > > void cik_prefetch_TC_L2_async(struct si_context *sctx, struct > pipe_resource *buf, > > uint64_t offset, unsigned size); > > -void cik_emit_prefetch_L2(struct si_context *sctx); > > +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two); > > void si_init_cp_dma_functions(struct si_context *sctx); > > > > /* si_debug.c */ > > void si_auto_log_cs(void *data, struct u_log_context *log); > > void si_log_hw_flush(struct si_context *sctx); > > void si_log_draw_state(struct si_context *sctx, struct u_log_context > *log); > > void si_log_compute_state(struct si_context *sctx, struct > u_log_context *log); > > void si_init_debug_functions(struct si_context *sctx); > > void si_check_vm_faults(struct r600_common_context *ctx, > > struct radeon_saved_cs *saved, enum ring_type > ring); > > diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c > b/src/gallium/drivers/radeonsi/si_state_draw.c > > index 1e79ccca054..8446b1b50bc 100644 > > --- a/src/gallium/drivers/radeonsi/si_state_draw.c > > +++ b/src/gallium/drivers/radeonsi/si_state_draw.c > > @@ -1450,36 +1450,43 @@ void si_draw_vbo(struct pipe_context *ctx, const > struct 
pipe_draw_info *info) > > sctx->b.render_cond_atom.emit(&sctx->b, NULL); > > sctx->dirty_atoms = 0; > > > > si_emit_draw_packets(sctx, info, indexbuf, index_size, > index_offset); > > /* <-- CUs are busy here. */ > > > > /* Start prefetches after the draw has been started. Both > will run > > * in parallel, but starting the draw first is more > important. > > */ > > if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) > > - cik_emit_prefetch_L2(sctx); > > + cik_emit_prefetch_L2(sctx, false); > > } else { > > /* If we don't wait for idle, start prefetches first, then > set > > * states, and draw at the end. > > */ > > if (sctx->b.flags) > > si_emit_cache_flush(sctx); > > > > + /* Only prefetch the first 2 items, e.g. the API VS and VBO > > + * descriptors. */ > > if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) > > - cik_emit_prefetch_L2(sctx); > > + cik_emit_prefetch_L2(sctx, true); > > > > if (!si_upload_graphics_shader_descriptors(sctx)) > > return; > > > > si_emit_all_states(sctx, info, 0); > > si_emit_draw_packets(sctx, info, indexbuf, index_size, > index_offset); > > + > > + /* Prefetch the remaining shaders after the draw has been > > + * started. */ > > + if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) > > + cik_emit_prefetch_L2(sctx, false); > > } > > > > if (unlikely(sctx->current_saved_cs)) { > > si_trace_emit(sctx); > > si_log_draw_state(sctx, sctx->b.log); > > } > > > > /* Workaround for a VGT hang when streamout is enabled. > > * It must be done after drawing. */ > > if ((sctx->b.family == CHIP_HAWAII || > > diff --git a/src/util/bitscan.h b/src/util/bitscan.h > > index 5cc75f0beba..78ff8e0cea1 100644 > > --- a/src/util/bitscan.h > > +++ b/src/util/bitscan.h > > @@ -89,20 +89,28 @@ ffsll(long long int val); > > > > > > /* Destructively loop over all of the bits in a mask as in: > > * > > * while (mymask) { > > * int i = u_bit_scan(&mymask); > > * ... 
process element i > > * } > > * > > */ > > +static inline int > > +u_bit_scan16(uint16_t *mask) > > +{ > > + const int i = ffs(*mask) - 1; > > + *mask ^= (1u << i); > > + return i; > > +} > > + > > static inline int > > u_bit_scan(unsigned *mask) > > { > > const int i = ffs(*mask) - 1; > > *mask ^= (1u << i); > > return i; > > } > > > > static inline int > > u_bit_scan64(uint64_t *mask) > > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev