From: Marek Olšák <marek.ol...@amd.com> Now draw calls from multiple IBs can be executed in parallel.
v2: do emit partial flushes on SI v3: invalidate all shader caches at the beginning of IBs If we artificially limit the number of draw calls per IB to 5, we'll get a lot more IBs, leading to a lot more partial flushes. Let's see how the removal of partial flushes changes GPU utilization in that scenario: With partial flushes (time busy): CP: 99% SPI: 86% CB: 73% Without partial flushes (time busy): CP: 99% SPI: 93% CB: 81% --- src/gallium/drivers/radeonsi/si_hw_context.c | 39 ++++++++++++++++++---------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 61c8d7067a1..b32b841a628 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -99,27 +99,31 @@ void si_context_gfx_flush(void *context, unsigned flags, if (!LIST_IS_EMPTY(&ctx->b.active_queries)) si_suspend_queries(&ctx->b); ctx->streamout.suspended = false; if (ctx->streamout.begin_emitted) { si_emit_streamout_end(ctx); ctx->streamout.suspended = true; } - ctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_PS_PARTIAL_FLUSH; - - /* DRM 3.1.0 doesn't flush TC for VI correctly. */ - if (ctx->b.chip_class == VI && ctx->b.screen->info.drm_minor <= 1) - ctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_INV_VMEM_L1; + if (ctx->b.chip_class == VI && ctx->b.screen->info.drm_minor <= 1) { + /* DRM 3.1.0 doesn't flush TC for VI correctly. */ + ctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH | + SI_CONTEXT_INV_GLOBAL_L2; + } else if (ctx->b.chip_class == SI) { + /* The kernel doesn't wait for idle before flushing and + * invalidating TC L2. */ + ctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } si_emit_cache_flush(ctx); if (ctx->current_saved_cs) { si_trace_emit(ctx); si_log_hw_flush(ctx); /* Save the IB for debug contexts. 
*/ si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true); ctx->current_saved_cs->flushed = true; @@ -180,26 +184,35 @@ static void si_begin_cs_debug(struct si_context *ctx) radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, ctx->current_saved_cs->trace_buf, RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); } void si_begin_new_cs(struct si_context *ctx) { if (ctx->is_debug) si_begin_cs_debug(ctx); - /* Flush read caches at the beginning of CS not flushed by the kernel. */ - if (ctx->b.chip_class >= CIK) - ctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_ICACHE; - - ctx->b.flags |= SI_CONTEXT_START_PIPELINE_STATS; + /* Always invalidate caches at the beginning of IBs, because external + * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our + * buffers. + * + * Note that the cache flush done by the kernel at the end of GFX IBs + * isn't useful here, because that flush can finish after the following + * IB starts drawing. + * + * TODO: Do we also need to invalidate CB & DB caches? + */ + ctx->b.flags |= SI_CONTEXT_INV_ICACHE | + SI_CONTEXT_INV_SMEM_L1 | + SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | + SI_CONTEXT_START_PIPELINE_STATS; /* set all valid group as dirty so they get reemited on * next draw command */ si_pm4_reset_emitted(ctx); /* The CS initialization should be emitted before everything else. */ si_pm4_emit(ctx, ctx->init_config); if (ctx->init_config_gs_rings) si_pm4_emit(ctx, ctx->init_config_gs_rings); -- 2.15.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev