Ping On Fri, Apr 6, 2018 at 10:31 PM, Marek Olšák <mar...@gmail.com> wrote:
> From: Marek Olšák <marek.ol...@amd.com> > > (This patch doesn't enable the behavior. It will be enabled in a later > commit.) > > Draw calls from multiple IBs can be executed in parallel. > > v2: do emit partial flushes on SI > v3: invalidate all shader caches at the beginning of IBs > v4: don't call si_emit_cache_flush in si_flush_gfx_cs if not needed, > only do this for flushes invoked internally > v5: empty IBs should wait for idle if the flush requires it > v6: split the commit > > If we artificially limit the number of draw calls per IB to 5, we'll get > a lot more IBs, leading to a lot more partial flushes. Let's see how > the removal of partial flushes changes GPU utilization in that scenario: > > With partial flushes (time busy): > CP: 99% > SPI: 86% > CB: 73: > > Without partial flushes (time busy): > CP: 99% > SPI: 93% > CB: 81% > --- > src/gallium/drivers/radeon/radeon_winsys.h | 7 ++++ > src/gallium/drivers/radeonsi/si_gfx_cs.c | 52 > ++++++++++++++++++++++-------- > src/gallium/drivers/radeonsi/si_pipe.h | 1 + > 3 files changed, 46 insertions(+), 14 deletions(-) > > diff --git a/src/gallium/drivers/radeon/radeon_winsys.h > b/src/gallium/drivers/radeon/radeon_winsys.h > index 157b2e40550..fae4fb7a95d 100644 > --- a/src/gallium/drivers/radeon/radeon_winsys.h > +++ b/src/gallium/drivers/radeon/radeon_winsys.h > @@ -21,20 +21,27 @@ > * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, > * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR > * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR > THE > * USE OR OTHER DEALINGS IN THE SOFTWARE. */ > > #ifndef RADEON_WINSYS_H > #define RADEON_WINSYS_H > > /* The public winsys interface header for the radeon driver. */ > > +/* Whether the next IB can start immediately and not wait for draws and > + * dispatches from the current IB to finish. */ > +#define RADEON_FLUSH_START_NEXT_GFX_IB_NOW (1u << 31) > + > +#define RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW \ > + (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW) > + > #include "pipebuffer/pb_buffer.h" > > #include "amd/common/ac_gpu_info.h" > #include "amd/common/ac_surface.h" > > /* Tiling flags. */ > enum radeon_bo_layout { > RADEON_LAYOUT_LINEAR = 0, > RADEON_LAYOUT_TILED, > RADEON_LAYOUT_SQUARETILED, > diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c > b/src/gallium/drivers/radeonsi/si_gfx_cs.c > index 2d5e510b19e..63bff29e63a 100644 > --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c > +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c > @@ -62,25 +62,42 @@ void si_need_gfx_cs_space(struct si_context *ctx) > unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend; > if (!ctx->ws->cs_check_space(cs, need_dwords)) > si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL); > } > > void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, > struct pipe_fence_handle **fence) > { > struct radeon_winsys_cs *cs = ctx->gfx_cs; > struct radeon_winsys *ws = ctx->ws; > + unsigned wait_flags = 0; > > if (ctx->gfx_flush_in_progress) > return; > > - if (!radeon_emitted(cs, ctx->initial_gfx_cs_size)) > + if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) { > + /* DRM 3.1.0 doesn't flush TC for VI correctly. */ > + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | > + SI_CONTEXT_CS_PARTIAL_FLUSH | > + SI_CONTEXT_INV_GLOBAL_L2; > + } else if (ctx->chip_class == SI) { > + /* The kernel flushes L2 before shaders are finished. */ > + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | > + SI_CONTEXT_CS_PARTIAL_FLUSH; > + } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { > + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | > + SI_CONTEXT_CS_PARTIAL_FLUSH; > + } > + > + /* Drop this flush if it's a no-op. */ > + if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && > + (!wait_flags || !ctx->gfx_last_ib_is_busy)) > return; > > if (si_check_device_reset(ctx)) > return; > > if (ctx->screen->debug_flags & DBG(CHECK_VM)) > flags &= ~PIPE_FLUSH_ASYNC; > > /* If the state tracker is flushing the GFX IB, si_flush_from_st is > * responsible for flushing the DMA IB and merging the fences from > both. > @@ -96,27 +113,25 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned > flags, > > if (!LIST_IS_EMPTY(&ctx->active_queries)) > si_suspend_queries(ctx); > > ctx->streamout.suspended = false; > if (ctx->streamout.begin_emitted) { > si_emit_streamout_end(ctx); > ctx->streamout.suspended = true; > } > > - ctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | > - SI_CONTEXT_PS_PARTIAL_FLUSH; > - > - /* DRM 3.1.0 doesn't flush TC for VI correctly. */ > - if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) > - ctx->flags |= SI_CONTEXT_INV_GLOBAL_L2 | > - SI_CONTEXT_INV_VMEM_L1; > + if (wait_flags) { > + ctx->flags |= wait_flags; > + si_emit_cache_flush(ctx); > + } > + ctx->gfx_last_ib_is_busy = wait_flags == 0; > > /* Make sure CP DMA is idle at the end of IBs after L2 prefetches > * because the kernel doesn't wait for it. */ > if (ctx->chip_class >= CIK) > si_cp_dma_wait_for_idle(ctx); > > if (ctx->current_saved_cs) { > si_trace_emit(ctx); > si_log_hw_flush(ctx); > > @@ -180,26 +195,35 @@ static void si_begin_gfx_cs_debug(struct si_context > *ctx) > > radeon_add_to_buffer_list(ctx, ctx->gfx_cs, > ctx->current_saved_cs->trace_buf, > RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); > } > > void si_begin_new_gfx_cs(struct si_context *ctx) > { > if (ctx->is_debug) > si_begin_gfx_cs_debug(ctx); > > - /* Flush read caches at the beginning of CS not flushed by the > kernel. */ > - if (ctx->chip_class >= CIK) > - ctx->flags |= SI_CONTEXT_INV_SMEM_L1 | > - SI_CONTEXT_INV_ICACHE; > - > - ctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; > + /* Always invalidate caches at the beginning of IBs, because > external > + * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our > + * buffers. > + * > + * Note that the cache flush done by the kernel at the end of GFX > IBs > + * isn't useful here, because that flush can finish after the > following > + * IB starts drawing. > + * > + * TODO: Do we also need to invalidate CB & DB caches? > + */ > + ctx->flags |= SI_CONTEXT_INV_ICACHE | > + SI_CONTEXT_INV_SMEM_L1 | > + SI_CONTEXT_INV_VMEM_L1 | > + SI_CONTEXT_INV_GLOBAL_L2 | > + SI_CONTEXT_START_PIPELINE_STATS; > > /* set all valid group as dirty so they get reemited on > * next draw command > */ > si_pm4_reset_emitted(ctx); > > /* The CS initialization should be emitted before everything else. > */ > si_pm4_emit(ctx, ctx->init_config); > if (ctx->init_config_gs_rings) > si_pm4_emit(ctx, ctx->init_config_gs_rings); > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/ > radeonsi/si_pipe.h > index 0c90a6c6e46..f0f323ff3a7 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.h > +++ b/src/gallium/drivers/radeonsi/si_pipe.h > @@ -540,20 +540,21 @@ struct si_context { > void *vs_blit_texcoord; > struct si_screen *screen; > struct pipe_debug_callback debug; > LLVMTargetMachineRef tm; /* only non-threaded > compilation */ > struct si_shader_ctx_state fixed_func_tcs_shader; > struct r600_resource *wait_mem_scratch; > unsigned wait_mem_number; > uint16_t prefetch_L2_mask; > > bool gfx_flush_in_progress:1; > + bool gfx_last_ib_is_busy:1; > bool compute_is_busy:1; > > unsigned num_gfx_cs_flushes; > unsigned initial_gfx_cs_size; > unsigned gpu_reset_counter; > unsigned last_dirty_tex_counter; > unsigned last_compressed_colortex_counter; > unsigned last_num_draw_calls; > unsigned flags; /* flush flags */ > /* Current unaccounted memory usage. */ > -- > 2.15.1 > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev