On Tue, Aug 22, 2017 at 10:19 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote: > On 21.08.2017 23:54, Marek Olšák wrote: >> >> From: Marek Olšák <marek.ol...@amd.com> >> >> --- >> src/gallium/drivers/radeonsi/si_blit.c | 8 +++++--- >> src/gallium/drivers/radeonsi/si_pipe.h | 23 >> +++++++++++++++++++---- >> src/gallium/drivers/radeonsi/si_state.c | 19 +++++++++++++++---- >> src/gallium/drivers/radeonsi/si_state_draw.c | 11 +++++------ >> 4 files changed, 44 insertions(+), 17 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_blit.c >> b/src/gallium/drivers/radeonsi/si_blit.c >> index ae7f809..3228933 100644 >> --- a/src/gallium/drivers/radeonsi/si_blit.c >> +++ b/src/gallium/drivers/radeonsi/si_blit.c >> @@ -399,21 +399,22 @@ si_decompress_depth(struct si_context *sctx, >> if (inplace_planes & PIPE_MASK_Z) >> tex->dirty_level_mask = 0; >> if (inplace_planes & PIPE_MASK_S) >> tex->stencil_dirty_level_mask = 0; >> } >> } >> /* set_framebuffer_state takes care of coherency for >> single-sample. >> * The DB->CB copy uses CB for the final writes. >> */ >> if (copy_planes && tex->resource.b.b.nr_samples > 1) >> - si_make_CB_shader_coherent(sctx, >> tex->resource.b.b.nr_samples); >> + si_make_CB_shader_coherent(sctx, >> tex->resource.b.b.nr_samples, >> + false); >> } >> static void >> si_decompress_sampler_depth_textures(struct si_context *sctx, >> struct si_textures_info *textures) >> { >> unsigned i; >> unsigned mask = textures->needs_depth_decompress_mask; >> while (mask) { >> @@ -504,21 +505,22 @@ static void si_blit_decompress_color(struct >> pipe_context *ctx, >> } >> /* The texture will always be dirty if some layers aren't >> flushed. >> * I don't think this case occurs often though. 
*/ >> if (first_layer == 0 && last_layer >= max_layer) { >> rtex->dirty_level_mask &= ~(1 << level); >> } >> } >> sctx->decompression_enabled = false; >> - si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples); >> + si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples, >> + vi_dcc_enabled(rtex, first_level)); >> } >> static void >> si_decompress_color_texture(struct si_context *sctx, struct r600_texture >> *tex, >> unsigned first_level, unsigned last_level) >> { >> /* CMASK or DCC can be discarded and we can still end up here. */ >> if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset) >> return; >> @@ -1193,21 +1195,21 @@ static void si_do_CB_resolve(struct si_context >> *sctx, >> si_blitter_begin(&sctx->b.b, SI_COLOR_RESOLVE | >> (info->render_condition_enable ? 0 : >> SI_DISABLE_RENDER_COND)); >> util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, >> dst_z, >> info->src.resource, >> info->src.box.z, >> ~0, sctx->custom_blend_resolve, >> format); >> si_blitter_end(&sctx->b.b); >> /* Flush caches for possible texturing. 
*/ >> - si_make_CB_shader_coherent(sctx, 1); >> + si_make_CB_shader_coherent(sctx, 1, false); >> } >> static bool do_hardware_msaa_resolve(struct pipe_context *ctx, >> const struct pipe_blit_info *info) >> { >> struct si_context *sctx = (struct si_context*)ctx; >> struct r600_texture *src = (struct >> r600_texture*)info->src.resource; >> struct r600_texture *dst = (struct >> r600_texture*)info->dst.resource; >> MAYBE_UNUSED struct r600_texture *rtmp; >> unsigned dst_width = u_minify(info->dst.resource->width0, >> info->dst.level); >> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h >> b/src/gallium/drivers/radeonsi/si_pipe.h >> index 671c488..3e59e21 100644 >> --- a/src/gallium/drivers/radeonsi/si_pipe.h >> +++ b/src/gallium/drivers/radeonsi/si_pipe.h >> @@ -50,21 +50,24 @@ >> #define SI_CONTEXT_INV_ICACHE (R600_CONTEXT_PRIVATE_FLAG << 0) >> /* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */ >> #define SI_CONTEXT_INV_SMEM_L1 (R600_CONTEXT_PRIVATE_FLAG >> << 1) >> /* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */ >> #define SI_CONTEXT_INV_VMEM_L1 (R600_CONTEXT_PRIVATE_FLAG >> << 2) >> /* Used by everything except CB/DB, can be bypassed (SLC=1). Other >> names: TC L2 */ >> #define SI_CONTEXT_INV_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 3) >> /* Write dirty L2 lines back to memory (shader and CP DMA stores), but >> don't >> * invalidate L2. SI-CIK can't do it, so they will do complete >> invalidation. */ >> #define SI_CONTEXT_WRITEBACK_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG >> << 4) >> -/* gaps */ >> +/* Writeback & invalidate the L2 metadata cache. It can only be coupled >> with >> + * a CB or DB flush. */ >> +#define SI_CONTEXT_INV_L2_METADATA (R600_CONTEXT_PRIVATE_FLAG << 5) >> +/* gap */ >> /* Framebuffer caches. */ >> #define SI_CONTEXT_FLUSH_AND_INV_DB (R600_CONTEXT_PRIVATE_FLAG << 7) >> #define SI_CONTEXT_FLUSH_AND_INV_CB (R600_CONTEXT_PRIVATE_FLAG << 8) >> /* Engine synchronization. 
*/ >> #define SI_CONTEXT_VS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9) >> #define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10) >> #define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11) >> #define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 12) >> #define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 13) >> @@ -190,20 +193,21 @@ struct si_framebuffer { >> unsigned spi_shader_col_format_blend; >> unsigned spi_shader_col_format_blend_alpha; >> ubyte nr_samples:5; /* at most 16xAA */ >> ubyte log_samples:3; /* at most 4 = >> 16xAA */ >> ubyte compressed_cb_mask; >> ubyte color_is_int8; >> ubyte color_is_int10; >> ubyte dirty_cbufs; >> bool dirty_zsbuf; >> bool any_dst_linear; >> + bool CB_has_shader_readable_metadata; >> }; >> struct si_clip_state { >> struct r600_atom atom; >> struct pipe_clip_state state; >> bool any_nonzeros; >> }; >> struct si_sample_locs { >> struct r600_atom atom; >> @@ -588,28 +592,39 @@ si_optimal_tcc_alignment(struct si_context *sctx, >> unsigned upload_size) >> * the whole thing will fit into a cache line if we align it to >> its size. >> * The idea is that multiple small uploads can share a cache line. >> * If the upload size is greater, align it to the cache line size. >> */ >> alignment = util_next_power_of_two(upload_size); >> tcc_cache_line_size = sctx->screen->b.info.tcc_cache_line_size; >> return MIN2(alignment, tcc_cache_line_size); >> } >> static inline void >> -si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples) >> +si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples, >> + bool shaders_read_metadata) >> { >> sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB | >> SI_CONTEXT_INV_VMEM_L1; >> - /* Single-sample color is coherent with shaders on GFX9. 
*/ >> - if (sctx->b.chip_class <= VI || num_samples >= 2) >> + if (sctx->b.chip_class >= GFX9) { >> + /* Single-sample color is coherent with shaders on GFX9, >> but >> + * L2 metadata must be flushed if shaders read metadata. >> + * (DCC, CMASK). >> + */ >> + if (num_samples >= 2) >> + sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2; >> + else if (shaders_read_metadata) >> + sctx->b.flags |= SI_CONTEXT_INV_L2_METADATA; >> + } else { >> + /* SI-CI-VI */ >> sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2; >> + } >> } >> static inline void >> si_make_DB_shader_coherent(struct si_context *sctx, unsigned >> num_samples, >> bool include_stencil) >> { >> sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB | >> SI_CONTEXT_INV_VMEM_L1; >> /* Single-sample depth (not stencil) is coherent with shaders on >> GFX9. */ >> diff --git a/src/gallium/drivers/radeonsi/si_state.c >> b/src/gallium/drivers/radeonsi/si_state.c >> index d116c07..e5d8d21 100644 >> --- a/src/gallium/drivers/radeonsi/si_state.c >> +++ b/src/gallium/drivers/radeonsi/si_state.c >> @@ -2566,21 +2566,22 @@ static void si_set_framebuffer_state(struct >> pipe_context *ctx, >> * >> * When MSAA is enabled, CB and TC caches are flushed on demand >> * (after FMASK decompression). Shader write -> FB read >> transitions >> * cannot happen for MSAA textures, because MSAA shader images are >> * not supported. >> * >> * Only flush and wait for CB if there is actually a bound color >> buffer. >> */ >> if (sctx->framebuffer.nr_samples <= 1 && >> sctx->framebuffer.state.nr_cbufs) >> - si_make_CB_shader_coherent(sctx, >> sctx->framebuffer.nr_samples); >> + si_make_CB_shader_coherent(sctx, >> sctx->framebuffer.nr_samples, >> + >> sctx->framebuffer.CB_has_shader_readable_metadata); >> sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; >> /* u_blitter doesn't invoke depth decompression when it does >> multiple >> * blits in a row, but the only case when it matters for DB is >> when >> * doing generate_mipmap. 
So here we flush DB manually between >> * individual generate_mipmap blits. >> * Note that lower mipmap levels aren't compressed. >> */ >> if (sctx->generate_mipmap_for_depth) >> @@ -2601,20 +2602,21 @@ static void si_set_framebuffer_state(struct >> pipe_context *ctx, >> sctx->framebuffer.spi_shader_col_format_alpha = 0; >> sctx->framebuffer.spi_shader_col_format_blend = 0; >> sctx->framebuffer.spi_shader_col_format_blend_alpha = 0; >> sctx->framebuffer.color_is_int8 = 0; >> sctx->framebuffer.color_is_int10 = 0; >> sctx->framebuffer.compressed_cb_mask = 0; >> sctx->framebuffer.nr_samples = >> util_framebuffer_get_num_samples(state); >> sctx->framebuffer.log_samples = >> util_logbase2(sctx->framebuffer.nr_samples); >> sctx->framebuffer.any_dst_linear = false; >> + sctx->framebuffer.CB_has_shader_readable_metadata = false; >> for (i = 0; i < state->nr_cbufs; i++) { >> if (!state->cbufs[i]) >> continue; >> surf = (struct r600_surface*)state->cbufs[i]; >> rtex = (struct r600_texture*)surf->base.texture; >> if (!surf->color_initialized) { >> si_initialize_color_surface(sctx, surf); >> @@ -2635,20 +2637,23 @@ static void si_set_framebuffer_state(struct >> pipe_context *ctx, >> if (surf->color_is_int10) >> sctx->framebuffer.color_is_int10 |= 1 << i; >> if (rtex->fmask.size) { >> sctx->framebuffer.compressed_cb_mask |= 1 << i; >> } >> if (rtex->surface.is_linear) >> sctx->framebuffer.any_dst_linear = true; >> + if (vi_dcc_enabled(rtex, surf->base.u.tex.level)) >> + sctx->framebuffer.CB_has_shader_readable_metadata >> = true; >> + >> r600_context_add_resource_size(ctx, surf->base.texture); >> p_atomic_inc(&rtex->framebuffers_bound); >> if (rtex->dcc_gather_statistics) { >> /* Dirty tracking must be enabled for DCC usage >> analysis. 
*/ >> sctx->framebuffer.compressed_cb_mask |= 1 << i; >> vi_separate_dcc_start_query(ctx, rtex); >> } >> } >> @@ -4015,21 +4020,22 @@ static void si_set_tess_state(struct pipe_context >> *ctx, >> static void si_texture_barrier(struct pipe_context *ctx, unsigned >> flags) >> { >> struct si_context *sctx = (struct si_context *)ctx; >> si_update_fb_dirtiness_after_rendering(sctx); >> /* Multisample surfaces are flushed in si_decompress_textures. */ >> if (sctx->framebuffer.nr_samples <= 1 && >> sctx->framebuffer.state.nr_cbufs) >> - si_make_CB_shader_coherent(sctx, >> sctx->framebuffer.nr_samples); >> + si_make_CB_shader_coherent(sctx, >> sctx->framebuffer.nr_samples, >> + >> sctx->framebuffer.CB_has_shader_readable_metadata); >> } >> /* This only ensures coherency for shader image/buffer stores. */ >> static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) >> { >> struct si_context *sctx = (struct si_context *)ctx; >> /* Subsequent commands must wait for all shader invocations to >> * complete. */ >> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | >> @@ -4060,23 +4066,28 @@ static void si_memory_barrier(struct pipe_context >> *ctx, unsigned flags) >> } >> /* MSAA color, any depth and any stencil are flushed in >> * si_decompress_textures when needed. >> */ >> if (flags & PIPE_BARRIER_FRAMEBUFFER && >> sctx->framebuffer.nr_samples <= 1 && >> sctx->framebuffer.state.nr_cbufs) { >> sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB; >> - /* Single-sample color is coherent with TC on GFX9. */ >> - if (sctx->screen->b.chip_class <= VI) >> + if (sctx->b.chip_class >= GFX9) { >> + /* Single-sample color is coherent with TC on >> GFX9. */ >> + if >> (sctx->framebuffer.CB_has_shader_readable_metadata) >> + sctx->b.flags |= >> SI_CONTEXT_INV_L2_METADATA; > > > MemoryBarrier is about making *shader writes* visible to other parts of the > pipeline. 
>
> So I think:
>
> - the comment above is misleading
>
> - there should be no need to invalidate the metadata, since DCC must already
>   have been disabled or at least decompressed for the relevant texture, and
>   the shader doesn't modify the metadata anyway.
You are right. I've replaced that locally with:

    if (sctx->b.chip_class <= VI)
            sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;

and no comment.

Marek
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev