From: Marek Olšák <marek.ol...@amd.com>

We should get fewer context rolls with the SET_CONTEXT_REG optimization,
but it would have been for nothing if the scissor state rolled the
context anyway. Don't emit the scissor state if there is no context
roll.
---
 src/gallium/drivers/radeonsi/si_pipe.h        |  1 +
 src/gallium/drivers/radeonsi/si_state.c       | 31 ++++++++++++++----
 src/gallium/drivers/radeonsi/si_state.h       | 17 ++--------
 .../drivers/radeonsi/si_state_binning.c       |  7 ++++
 src/gallium/drivers/radeonsi/si_state_draw.c  | 32 +++++++++++--------
 .../drivers/radeonsi/si_state_shaders.c       | 23 +++++++++++++
 .../drivers/radeonsi/si_state_viewport.c      |  3 ++
 7 files changed, 80 insertions(+), 34 deletions(-)
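A note on the mechanism, for readers skimming the diff: every emit
callback now snapshots the command-stream dword count
(gfx_cs->current.cdw) before its radeon_opt_set_context_reg* calls and
bumps context_roll_counter only if the count moved, i.e. only if a
SET_CONTEXT_REG packet was actually written. Below is a minimal,
self-contained C sketch of that pattern; the struct layout and the
names cmdbuf, ctx, opt_set_context_reg and emit_some_state are
simplified stand-ins invented for illustration, not the real driver
API.

#include <stdio.h>

/* Simplified stand-ins for the driver structures (illustrative only;
 * the real ones are struct radeon_cmdbuf and struct si_context). */
struct cmdbuf { unsigned cdw; unsigned buf[64]; };

struct ctx {
	struct cmdbuf cs;
	unsigned tracked_reg;          /* last emitted value of one register */
	unsigned context_roll_counter; /* rolls seen since the last draw */
};

/* What radeon_opt_set_context_reg boils down to: emit SET_CONTEXT_REG
 * only if the value changed, otherwise write nothing at all. */
static void opt_set_context_reg(struct ctx *c, unsigned value)
{
	if (c->tracked_reg == value)
		return; /* redundant write elided -> no context roll */
	c->tracked_reg = value;
	c->cs.buf[c->cs.cdw++] = value; /* stands in for the packet */
}

/* The pattern this patch adds to every emit callback: snapshot cdw,
 * emit, and count a roll only if dwords were actually written. */
static void emit_some_state(struct ctx *c, unsigned value)
{
	unsigned initial_cdw = c->cs.cdw;

	opt_set_context_reg(c, value);

	if (initial_cdw != c->cs.cdw)
		c->context_roll_counter++;
}

int main(void)
{
	struct ctx c = {{0}};

	emit_some_state(&c, 1); /* value changes -> rolls the context */
	emit_some_state(&c, 1); /* redundant -> elided, no roll */
	printf("context rolls: %u\n", c.context_roll_counter); /* prints 1 */
	return 0;
}

At draw time, si_emit_all_states() zeroes the counter before emitting
atoms and states, and on Vega10/Raven re-emits PA_SC_VPORT_SCISSOR only
when the counter (or the rasterizer/tess emit paths, which report rolls
directly) says the context rolled.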
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 7ae17435ab6..6edc06cece7 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1016,20 +1016,21 @@ struct si_context {
 	unsigned			num_vs_flushes;
 	unsigned			num_ps_flushes;
 	unsigned			num_cs_flushes;
 	unsigned			num_cb_cache_flushes;
 	unsigned			num_db_cache_flushes;
 	unsigned			num_L2_invalidates;
 	unsigned			num_L2_writebacks;
 	unsigned			num_resident_handles;
 	uint64_t			num_alloc_tex_transfer_bytes;
 	unsigned			last_tex_ps_draw_ratio; /* for query */
+	unsigned			context_roll_counter;

 	/* Queries. */
 	/* Maintain the list of active queries for pausing between IBs. */
 	int				num_occlusion_queries;
 	int				num_perfect_occlusion_queries;
 	struct list_head		active_queries;
 	unsigned			num_cs_dw_queries_suspend;

 	/* Render condition. */
 	struct pipe_query		*render_cond;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index d3c63406dd4..fa1fea5289c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -81,34 +81,35 @@ static void si_emit_cb_render_state(struct si_context *sctx)
 	 * but there is not enough color outputs. This is undefined behavior,
 	 * so disable color writes completely.
 	 *
 	 * Reproducible with Unigine Heaven 4.0 and drirc missing.
 	 */
 	if (blend && blend->dual_src_blend &&
 	    sctx->ps_shader.cso &&
 	    (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
 		cb_target_mask = 0;

-	radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK,
-				   SI_TRACKED_CB_TARGET_MASK, cb_target_mask);
-
 	/* GFX9: Flush DFSM when CB_TARGET_MASK changes.
 	 * I think we don't have to do anything between IBs.
 	 */
 	if (sctx->screen->dfsm_allowed &&
 	    sctx->last_cb_target_mask != cb_target_mask) {
 		sctx->last_cb_target_mask = cb_target_mask;

 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
 	}

+	unsigned initial_cdw = cs->current.cdw;
+	radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK,
+				   SI_TRACKED_CB_TARGET_MASK, cb_target_mask);
+
 	if (sctx->chip_class >= VI) {
 		/* DCC MSAA workaround for blending.
 		 * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_-
 		 * COMBINER_DISABLE, but that would be more complicated.
 		 */
 		bool oc_disable = (sctx->chip_class == VI ||
 				   sctx->chip_class == GFX9) &&
 				  blend &&
 				  blend->blend_enable_4bit & cb_target_mask &&
 				  sctx->framebuffer.nr_samples >= 2;
@@ -245,20 +246,22 @@ static void si_emit_cb_render_state(struct si_context *sctx)
 			break;
 		}
 	}

 	/* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
 	radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT,
 				    SI_TRACKED_SX_PS_DOWNCONVERT,
 				    sx_ps_downconvert, sx_blend_opt_epsilon,
 				    sx_blend_opt_control);
 	}
+	if (initial_cdw != cs->current.cdw)
+		sctx->context_roll_counter++;
 }

 /*
  * Blender functions
  */

 static uint32_t si_translate_blend_function(int blend_func)
 {
 	switch (blend_func) {
 	case PIPE_BLEND_ADD:
@@ -766,31 +769,35 @@ static void si_emit_clip_regs(struct si_context *sctx)
 	/* Clip distances on points have no effect, so need to be implemented
 	 * as cull distances. This applies for the clipvertex case as well.
 	 *
 	 * Setting this for primitives other than points should have no adverse
 	 * effects.
 	 */
 	clipdist_mask &= rs->clip_plane_enable;
 	culldist_mask |= clipdist_mask;

+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 	radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
 		SI_TRACKED_PA_CL_VS_OUT_CNTL,
 		vs_sel->pa_cl_vs_out_cntl |
 		S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
 		S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
 		clipdist_mask | (culldist_mask << 8));
 	radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL,
 		SI_TRACKED_PA_CL_CLIP_CNTL,
 		rs->pa_cl_clip_cntl |
 		ucp_mask |
 		S_028810_CLIP_DISABLE(window_space));
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }

 /*
  * inferred state between framebuffer and rasterizer
  */
 static void si_update_poly_offset_state(struct si_context *sctx)
 {
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;

 	if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
@@ -1345,20 +1352,21 @@ void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
 	st->saved_compute = sctx->cs_shader_state.program;

 	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
 	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
 }

 static void si_emit_db_render_state(struct si_context *sctx)
 {
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	unsigned db_shader_control, db_render_control, db_count_control;
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;

 	/* DB_RENDER_CONTROL */
 	if (sctx->dbcb_depth_copy_enabled ||
 	    sctx->dbcb_stencil_copy_enabled) {
 		db_render_control =
 			S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
 			S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
 			S_028000_COPY_CENTROID(1) |
 			S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
 	} else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
@@ -1427,20 +1435,23 @@ static void si_emit_db_render_state(struct si_context *sctx)
 	/* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */
 	if (!rs->multisample_enable)
 		db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;

 	if (sctx->screen->has_rbplus &&
 	    !sctx->screen->rbplus_allowed)
 		db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);

 	radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,
 				   SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }

 /*
  * format translation
  */
 static uint32_t si_translate_colorformat(enum pipe_format format)
 {
 	const struct util_format_description *desc = util_format_description(format);
 	if (!desc)
 		return V_028C70_COLOR_INVALID;
@@ -3482,35 +3493,41 @@ static void si_emit_msaa_config(struct si_context *sctx)
 			db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
 				   S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
 				   S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
 				   S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
 			sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
 		} else if (sctx->smoothing_enabled) {
 			db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
 		}
 	}

+	unsigned initial_cdw = cs->current.cdw;
+
 	/* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
 	radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL,
 				    SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl,
 				    sc_aa_config);
 	/* R_028804_DB_EQAA */
 	radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA,
 				   db_eqaa);
 	/* R_028A4C_PA_SC_MODE_CNTL_1 */
 	radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1,
 				   SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);

-	/* GFX9: Flush DFSM when the AA mode changes. */
-	if (sctx->screen->dfsm_allowed) {
-		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
+	if (initial_cdw != cs->current.cdw) {
+		sctx->context_roll_counter++;
+
+		/* GFX9: Flush DFSM when the AA mode changes. */
+		if (sctx->screen->dfsm_allowed) {
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
+		}
 	}
 }

 void si_update_ps_iter_samples(struct si_context *sctx)
 {
 	if (sctx->framebuffer.nr_samples > 1)
 		si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 	if (sctx->screen->dpbb_allowed)
 		si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 }
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f52296d1119..83589e6918c 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -164,31 +164,27 @@ union si_state {
 		struct si_pm4_state		*ps;
 	} named;
 	struct si_pm4_state	*array[0];
 };

 #define SI_STATE_IDX(name) \
 	(offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *))
 #define SI_STATE_BIT(name) (1 << SI_STATE_IDX(name))
 #define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *))

-static inline unsigned si_states_that_roll_context(void)
+static inline unsigned si_states_that_always_roll_context(void)
 {
 	return (SI_STATE_BIT(blend) |
 		SI_STATE_BIT(rasterizer) |
 		SI_STATE_BIT(dsa) |
 		SI_STATE_BIT(poly_offset) |
-		SI_STATE_BIT(es) |
-		SI_STATE_BIT(gs) |
-		SI_STATE_BIT(vgt_shader_config) |
-		SI_STATE_BIT(vs) |
-		SI_STATE_BIT(ps));
+		SI_STATE_BIT(vgt_shader_config));
 }

 union si_state_atoms {
 	struct {
 		/* The order matters. */
 		struct si_atom render_cond;
 		struct si_atom streamout_begin;
 		struct si_atom streamout_enable; /* must be after streamout_begin */
 		struct si_atom framebuffer;
 		struct si_atom msaa_sample_locs;
@@ -209,39 +205,32 @@ union si_state_atoms {
 		struct si_atom scratch_state;
 		struct si_atom window_rectangles;
 	} s;
 	struct si_atom array[0];
 };

 #define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / \
 				 sizeof(struct si_atom)))
 #define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct si_atom*))

-static inline unsigned si_atoms_that_roll_context(void)
+static inline unsigned si_atoms_that_always_roll_context(void)
 {
 	return (SI_ATOM_BIT(streamout_begin) |
 		SI_ATOM_BIT(streamout_enable) |
 		SI_ATOM_BIT(framebuffer) |
 		SI_ATOM_BIT(msaa_sample_locs) |
-		SI_ATOM_BIT(db_render_state) |
-		SI_ATOM_BIT(dpbb_state) |
-		SI_ATOM_BIT(msaa_config) |
 		SI_ATOM_BIT(sample_mask) |
-		SI_ATOM_BIT(cb_render_state) |
 		SI_ATOM_BIT(blend_color) |
-		SI_ATOM_BIT(clip_regs) |
 		SI_ATOM_BIT(clip_state) |
-		SI_ATOM_BIT(guardband) |
 		SI_ATOM_BIT(scissors) |
 		SI_ATOM_BIT(viewports) |
 		SI_ATOM_BIT(stencil_ref) |
-		SI_ATOM_BIT(spi_map) |
 		SI_ATOM_BIT(scratch_state));
 }

 struct si_shader_data {
 	uint32_t	sh_base[SI_NUM_SHADERS];
 };

 /* The list of registers whose emitted values are remembered by si_context. */
 enum si_tracked_reg {
 	SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */
diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c
index 4aad94d95f9..70c129242d1 100644
--- a/src/gallium/drivers/radeonsi/si_state_binning.c
+++ b/src/gallium/drivers/radeonsi/si_state_binning.c
@@ -303,28 +303,32 @@ static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
 			{ 193,	0,	0 },
 		},
 	},
 	};

 	return si_find_bin_size(sctx->screen, table, sum);
 }

 static void si_emit_dpbb_disable(struct si_context *sctx)
 {
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
 	radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
 		SI_TRACKED_PA_SC_BINNER_CNTL_0,
 		S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
 		S_028C44_DISABLE_START_OF_PRIM(1));
 	radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
 				   SI_TRACKED_DB_DFSM_CONTROL,
 				   S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
 				   S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }

 void si_emit_dpbb_state(struct si_context *sctx)
 {
 	struct si_screen *sscreen = sctx->screen;
 	struct si_state_blend *blend = sctx->queued.named.blend;
 	struct si_state_dsa *dsa = sctx->queued.named.dsa;
 	unsigned db_shader_control = sctx->ps_db_shader_control;

 	assert(sctx->chip_class >= GFX9);
@@ -412,28 +416,31 @@ void si_emit_dpbb_state(struct si_context *sctx)
 		assert(0);
 	}

 	/* Emit registers. */
 	struct uvec2 bin_size_extend = {};
 	if (bin_size.x >= 32)
 		bin_size_extend.x = util_logbase2(bin_size.x) - 5;
 	if (bin_size.y >= 32)
 		bin_size_extend.y = util_logbase2(bin_size.y) - 5;

+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 	radeon_opt_set_context_reg(
 		sctx, R_028C44_PA_SC_BINNER_CNTL_0,
 		SI_TRACKED_PA_SC_BINNER_CNTL_0,
 		S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
 		S_028C44_BIN_SIZE_X(bin_size.x == 16) |
 		S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
 		S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
 		S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
 		S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
 		S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
 		S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
 		S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
 		S_028C44_OPTIMAL_BIN_SELECTION(1));
 	radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
 				   SI_TRACKED_DB_DFSM_CONTROL,
 				   S_028060_PUNCHOUT_MODE(punchout_mode) |
 				   S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 69f723e4e4a..83eb646b791 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1182,40 +1182,40 @@ static void si_get_draw_start_count(struct si_context *sctx,
 	} else {
 		*start = info->start;
 		*count = info->count;
 	}
 }

 static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
 			       unsigned skip_atom_mask)
 {
 	unsigned num_patches = 0;
+	/* Vega10/Raven scissor bug workaround. When any context register is
+	 * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
+	 * registers must be written too.
+	 */
+	bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) &&
+				  !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors);
 	bool context_roll = false; /* set correctly for GFX9 only */

 	context_roll |= si_emit_rasterizer_prim_state(sctx);
 	if (sctx->tes_shader.cso)
 		context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches);
-	if (info->count_from_stream_output)
+
+	if (handle_scissor_bug &&
+	    (info->count_from_stream_output ||
+	     sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
+	     sctx->dirty_states & si_states_that_always_roll_context() ||
+	     si_prim_restart_index_changed(sctx, info)))
 		context_roll = true;

-	/* Vega10/Raven scissor bug workaround. When any context register is
-	 * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
-	 * registers must be written too.
-	 */
-	if ((sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) &&
-	    (context_roll ||
-	     sctx->dirty_atoms & si_atoms_that_roll_context() ||
-	     sctx->dirty_states & si_states_that_roll_context() ||
-	     si_prim_restart_index_changed(sctx, info))) {
-		sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-		si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
-	}
+	sctx->context_roll_counter = 0;

 	/* Emit state atoms. */
 	unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
 	while (mask)
 		sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
 	sctx->dirty_atoms &= skip_atom_mask;

 	/* Emit states. */
 	mask = sctx->dirty_states;
@@ -1224,20 +1224,26 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
 		struct si_pm4_state *state = sctx->queued.array[i];

 		if (!state || sctx->emitted.array[i] == state)
 			continue;

 		si_pm4_emit(sctx, state);
 		sctx->emitted.array[i] = state;
 	}
 	sctx->dirty_states = 0;

+	if (handle_scissor_bug &&
+	    (context_roll || sctx->context_roll_counter)) {
+		sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+		sctx->atoms.s.scissors.emit(sctx);
+	}
+
 	/* Emit draw states. */
 	si_emit_vs_state(sctx, info);
 	si_emit_draw_registers(sctx, info, num_patches);
 }

 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	struct pipe_resource *indexbuf = info->index.resource;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 2bdac33586b..ad7d21e7816 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -554,37 +554,41 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
 	if (sscreen->info.chip_class <= VI) {
 		si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
 			       shader->config.rsrc2);
 	}
 }

 static void si_emit_shader_es(struct si_context *sctx)
 {
 	struct si_shader *shader = sctx->queued.named.es->shader;
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;

 	if (!shader)
 		return;

 	radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
 				   SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
 				   shader->selector->esgs_itemsize / 4);

 	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
 		radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
 					   SI_TRACKED_VGT_TF_PARAM,
 					   shader->vgt_tf_param);

 	if (shader->vgt_vertex_reuse_block_cntl)
 		radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   shader->vgt_vertex_reuse_block_cntl);
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }

 static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
 {
 	struct si_pm4_state *pm4;
 	unsigned num_user_sgprs;
 	unsigned vgpr_comp_cnt;
 	uint64_t va;
 	unsigned oc_lds_en;
@@ -755,20 +759,22 @@ static void gfx9_get_gs_info(struct si_shader_selector *es,
 	out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup *
 				      gs->gs_max_out_vertices;
 	out->lds_size = align(esgs_lds_size, 128) / 128;

 	assert(out->max_prims_per_subgroup <= max_out_prims);
 }

 static void si_emit_shader_gs(struct si_context *sctx)
 {
 	struct si_shader *shader = sctx->queued.named.gs->shader;
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
 	if (!shader)
 		return;

 	/* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
 	 * R_028A68_VGT_GSVS_RING_OFFSET_3, R_028A6C_VGT_GS_OUT_PRIM_TYPE */
 	radeon_opt_set_context_reg4(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1,
 				    SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
 				    shader->ctx_reg.gs.vgt_gsvs_ring_offset_1,
 				    shader->ctx_reg.gs.vgt_gsvs_ring_offset_2,
 				    shader->ctx_reg.gs.vgt_gsvs_ring_offset_3,
@@ -815,20 +821,23 @@ static void si_emit_shader_gs(struct si_context *sctx)
 	if (shader->key.part.gs.es->type == PIPE_SHADER_TESS_EVAL)
 		radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
 					   SI_TRACKED_VGT_TF_PARAM,
 					   shader->vgt_tf_param);

 	if (shader->vgt_vertex_reuse_block_cntl)
 		radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   shader->vgt_vertex_reuse_block_cntl);
 	}
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }

 static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 {
 	struct si_shader_selector *sel = shader->selector;
 	const ubyte *num_components = sel->info.num_stream_output_components;
 	unsigned gs_num_invocations = sel->gs_num_invocations;
 	struct si_pm4_state *pm4;
 	uint64_t va;
 	unsigned max_stream = sel->max_gs_stream;
@@ -950,20 +959,22 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 			       S_00B228_FLOAT_MODE(shader->config.float_mode));
 		si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
 			       S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
 			       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 	}
 }

 static void si_emit_shader_vs(struct si_context *sctx)
 {
 	struct si_shader *shader = sctx->queued.named.vs->shader;
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
 	if (!shader)
 		return;

 	radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE,
 				   SI_TRACKED_VGT_GS_MODE,
 				   shader->ctx_reg.vs.vgt_gs_mode);
 	radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN,
 				   SI_TRACKED_VGT_PRIMITIVEID_EN,
 				   shader->ctx_reg.vs.vgt_primitiveid_en);
@@ -987,20 +998,23 @@ static void si_emit_shader_vs(struct si_context *sctx)
 	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
 		radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
 					   SI_TRACKED_VGT_TF_PARAM,
 					   shader->vgt_tf_param);

 	if (shader->vgt_vertex_reuse_block_cntl)
 		radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   shader->vgt_vertex_reuse_block_cntl);
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }

 /**
  * Compute the state for \p shader, which will run as a vertex shader on the
  * hardware.
  *
  * If \p gs is non-NULL, it points to the geometry shader for which this shader
  * is the copy shader.
  */
 static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
@@ -1149,20 +1163,22 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
 	for (i = 0; i < num_targets; i++)
 		if (!(value & (0xf << (i * 4))))
 			value |= V_028714_SPI_SHADER_32_R << (i * 4);

 	return value;
 }

 static void si_emit_shader_ps(struct si_context *sctx)
 {
 	struct si_shader *shader = sctx->queued.named.ps->shader;
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
 	if (!shader)
 		return;

 	/* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR*/
 	radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA,
 				    SI_TRACKED_SPI_PS_INPUT_ENA,
 				    shader->ctx_reg.ps.spi_ps_input_ena,
 				    shader->ctx_reg.ps.spi_ps_input_addr);

 	radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL,
@@ -1174,20 +1190,23 @@ static void si_emit_shader_ps(struct si_context *sctx)
 	/* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */
 	radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT,
 				    SI_TRACKED_SPI_SHADER_Z_FORMAT,
 				    shader->ctx_reg.ps.spi_shader_z_format,
 				    shader->ctx_reg.ps.spi_shader_col_format);

 	radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK,
 				   SI_TRACKED_CB_SHADER_MASK,
 				   shader->ctx_reg.ps.cb_shader_mask);
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }

 static void si_shader_ps(struct si_shader *shader)
 {
 	struct tgsi_shader_info *info = &shader->selector->info;
 	struct si_pm4_state *pm4;
 	unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
 	unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
 	uint64_t va;
 	unsigned input_ena = shader->config.spi_ps_input_ena;
@@ -2842,23 +2861,27 @@ static void si_emit_spi_map(struct si_context *sctx)
 			spi_ps_input_cntl[num_written++] =
 				si_get_ps_input_cntl(sctx, vs, bcol, i,
 						     bcol_interp[i]);
 		}
 	}
 	assert(num_interp == num_written);

 	/* R_028644_SPI_PS_INPUT_CNTL_0 */
 	/* Dota 2: Only ~16% of SPI map updates set different values. */
 	/* Talos: Only ~9% of SPI map updates set different values. */
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 	radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
 				    spi_ps_input_cntl,
 				    sctx->tracked_regs.spi_ps_input_cntl, num_interp);
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }

 /**
  * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
  */
 static void si_init_config_add_vgt_flush(struct si_context *sctx)
 {
 	if (sctx->init_config_has_vgt_flush)
 		return;
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index 819c773ba8e..587422e50ca 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -251,33 +251,36 @@ static void si_emit_guardband(struct si_context *ctx)
 		/* Discard primitives that would lie entirely outside the clip
 		 * region. */
 		discard_x = MIN2(discard_x, guardband_x);
 		discard_y = MIN2(discard_y, guardband_y);
 	}

 	/* If any of the GB registers is updated, all of them must be updated.
 	 * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
 	 * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
 	 */
+	unsigned initial_cdw = ctx->gfx_cs->current.cdw;
 	radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
 				    SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ,
 				    fui(guardband_y), fui(discard_y),
 				    fui(guardband_x), fui(discard_x));
 	radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
 				   SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
 				   S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
 				   S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
 	radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,
 				   SI_TRACKED_PA_SU_VTX_CNTL,
 				   S_028BE4_PIX_CENTER(rs->half_pixel_center) |
 				   S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +
 						       vp_as_scissor.quant_mode));
+	if (initial_cdw != ctx->gfx_cs->current.cdw)
+		ctx->context_roll_counter++;
 }

 static void si_emit_scissors(struct si_context *ctx)
 {
 	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct pipe_scissor_state *states = ctx->scissors.states;
 	unsigned mask = ctx->scissors.dirty_mask;
 	bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;

 	/* The simple case: Only 1 viewport is active. */
-- 
2.17.1