Thanks. It looks good.

Marek
On Thu, Apr 25, 2019, 5:17 PM Dylan Baker <dy...@pnwbakers.com> wrote:

> Hi Marek,
>
> I've tried to apply this to 19.0, I had to pull "radeonsi: add si_debug_options
> for convenient adding/removing of options", which is fine, but this patch also
> assumes your si compute-queue only patches, which aren't present in 19.0. I've
> made a small change to get it compiling, but I'm sure it's not the right fix, so
> if you could take a look at the staging/19.0 branch and let me know what you'd
> like to do I'd appreciate it.
>
> Thanks,
> Dylan
>
> Quoting Marek Olšák (2019-04-18 14:46:27)
> > From: Marek Olšák <marek.ol...@amd.com>
> >
> > Needed to track context rolls caused by streamout and ACQUIRE_MEM.
> > ACQUIRE_MEM can occur outside of draw calls.
> >
> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110355
> >
> > v2: squashed patches and done more rework
> >
> > Cc: 19.0 <mesa-sta...@lists.freedesktop.org>
> > ---
> >  src/gallium/drivers/radeonsi/si_pipe.c        |  2 +
> >  src/gallium/drivers/radeonsi/si_pipe.h        |  3 +-
> >  src/gallium/drivers/radeonsi/si_state.c       |  8 +-
> >  .../drivers/radeonsi/si_state_binning.c       |  4 +-
> >  src/gallium/drivers/radeonsi/si_state_draw.c  | 86 +++++++++++--------
> >  .../drivers/radeonsi/si_state_shaders.c       | 10 +--
> >  .../drivers/radeonsi/si_state_streamout.c     |  1 +
> >  .../drivers/radeonsi/si_state_viewport.c      |  2 +-
> >  8 files changed, 68 insertions(+), 48 deletions(-)
> >
> > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> > index fa96ce34224..7209db9fb37 100644
> > --- a/src/gallium/drivers/radeonsi/si_pipe.c
> > +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> > @@ -1072,20 +1072,22 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
> >
> >         sscreen->has_out_of_order_rast = sscreen->info.chip_class >= VI &&
> >                                          sscreen->info.max_se >= 2 &&
> >                                          !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER));
> >         sscreen->assume_no_z_fights =
> >                 driQueryOptionb(config->options, "radeonsi_assume_no_z_fights");
> >         sscreen->commutative_blend_add =
> >                 driQueryOptionb(config->options, "radeonsi_commutative_blend_add");
> >         sscreen->clear_db_cache_before_clear =
> >                 driQueryOptionb(config->options, "radeonsi_clear_db_cache_before_clear");
> > +       sscreen->has_gfx9_scissor_bug = sscreen->info.family == CHIP_VEGA10 ||
> > +                                       sscreen->info.family == CHIP_RAVEN;
> >         sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >= CHIP_POLARIS10 &&
> >                                             sscreen->info.family <= CHIP_POLARIS12) ||
> >                                            sscreen->info.family == CHIP_VEGA10 ||
> >                                            sscreen->info.family == CHIP_RAVEN;
> >         sscreen->has_ls_vgpr_init_bug = sscreen->info.family == CHIP_VEGA10 ||
> >                                         sscreen->info.family == CHIP_RAVEN;
> >         sscreen->has_dcc_constant_encode = sscreen->info.family == CHIP_RAVEN2;
> >
> >         /* Only enable primitive binning on APUs by default. */
> >         sscreen->dpbb_allowed = sscreen->info.family == CHIP_RAVEN ||
> > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> > index aaa95f32d20..a4c90a4f69f 100644
> > --- a/src/gallium/drivers/radeonsi/si_pipe.h
> > +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> > @@ -463,20 +463,21 @@ struct si_screen {
> >         unsigned eqaa_force_coverage_samples;
> >         unsigned eqaa_force_z_samples;
> >         unsigned eqaa_force_color_samples;
> >         bool has_clear_state;
> >         bool has_distributed_tess;
> >         bool has_draw_indirect_multi;
> >         bool has_out_of_order_rast;
> >         bool assume_no_z_fights;
> >         bool commutative_blend_add;
> >         bool clear_db_cache_before_clear;
> > +       bool has_gfx9_scissor_bug;
> >         bool has_msaa_sample_loc_bug;
> >         bool has_ls_vgpr_init_bug;
> >         bool has_dcc_constant_encode;
> >         bool dpbb_allowed;
> >         bool dfsm_allowed;
> >         bool llvm_has_working_vgpr_indexing;
> >
> >         /* Whether shaders are monolithic (1-part) or separate (3-part). */
> >         bool use_monolithic_shaders;
> >         bool record_llvm_ir;
> > @@ -1062,21 +1063,21 @@ struct si_context {
> >         unsigned num_vs_flushes;
> >         unsigned num_ps_flushes;
> >         unsigned num_cs_flushes;
> >         unsigned num_cb_cache_flushes;
> >         unsigned num_db_cache_flushes;
> >         unsigned num_L2_invalidates;
> >         unsigned num_L2_writebacks;
> >         unsigned num_resident_handles;
> >         uint64_t num_alloc_tex_transfer_bytes;
> >         unsigned last_tex_ps_draw_ratio; /* for query */
> > -       unsigned context_roll_counter;
> > +       unsigned context_roll;
> >
> >         /* Queries. */
> >         /* Maintain the list of active queries for pausing between IBs. */
> >         int num_occlusion_queries;
> >         int num_perfect_occlusion_queries;
> >         struct list_head active_queries;
> >         unsigned num_cs_dw_queries_suspend;
> >
> >         /* Render condition. */
> >         struct pipe_query *render_cond;
> > diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> > index 757c17f7df8..bc7e777ad73 100644
> > --- a/src/gallium/drivers/radeonsi/si_state.c
> > +++ b/src/gallium/drivers/radeonsi/si_state.c
> > @@ -249,21 +249,21 @@ static void si_emit_cb_render_state(struct si_context *sctx)
> >                 }
> >         }
> >
> >         /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
> >         radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT,
> >                                     SI_TRACKED_SX_PS_DOWNCONVERT,
> >                                     sx_ps_downconvert, sx_blend_opt_epsilon,
> >                                     sx_blend_opt_control);
> >         }
> >         if (initial_cdw != cs->current.cdw)
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >  }
> >
> >  /*
> >   * Blender functions
> >   */
> >
> >  static uint32_t si_translate_blend_function(int blend_func)
> >  {
> >         switch (blend_func) {
> >         case PIPE_BLEND_ADD:
> > @@ -786,21 +786,21 @@ static void si_emit_clip_regs(struct si_context *sctx)
> >                    S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
> >                    S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
> >                    clipdist_mask | (culldist_mask << 8));
> >         radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL,
> >                                    SI_TRACKED_PA_CL_CLIP_CNTL,
> >                                    rs->pa_cl_clip_cntl |
> >                                    ucp_mask |
> >                                    S_028810_CLIP_DISABLE(window_space));
> >
> >         if (initial_cdw != sctx->gfx_cs->current.cdw)
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >  }
> >
> >  /*
> >   * inferred state between framebuffer and rasterizer
> >   */
> >  static void si_update_poly_offset_state(struct si_context *sctx)
> >  {
> >         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
> >
> >         if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
> > @@ -1448,21 +1448,21 @@ static void si_emit_db_render_state(struct si_context *sctx)
> >                 db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
> >
> >         if (sctx->screen->has_rbplus &&
> >             !sctx->screen->rbplus_allowed)
> >                 db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
> >
> >         radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,
> >                                    SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
> >
> >         if (initial_cdw != sctx->gfx_cs->current.cdw)
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >  }
> >
> >  /*
> >   * format translation
> >   */
> >  static uint32_t si_translate_colorformat(enum pipe_format format)
> >  {
> >         const struct util_format_description *desc = util_format_description(format);
> >         if (!desc)
> >                 return V_028C70_COLOR_INVALID;
> > @@ -3537,21 +3537,21 @@ static void si_emit_msaa_config(struct si_context *sctx)
> >                                     SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl,
> >                                     sc_aa_config);
> >         /* R_028804_DB_EQAA */
> >         radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA,
> >                                    db_eqaa);
> >         /* R_028A4C_PA_SC_MODE_CNTL_1 */
> >         radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1,
> >                                    SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);
> >
> >         if (initial_cdw != cs->current.cdw) {
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >
> >                 /* GFX9: Flush DFSM when the AA mode changes. */
> >                 if (sctx->screen->dfsm_allowed) {
> >                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> >                         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
> >                 }
> >         }
> >  }
> >
> >  void si_update_ps_iter_samples(struct si_context *sctx)
> > diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c
> > index 3516e561282..5c6c2e69b90 100644
> > --- a/src/gallium/drivers/radeonsi/si_state_binning.c
> > +++ b/src/gallium/drivers/radeonsi/si_state_binning.c
> > @@ -314,21 +314,21 @@ static void si_emit_dpbb_disable(struct si_context *sctx)
> >
> >         radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
> >                                    SI_TRACKED_PA_SC_BINNER_CNTL_0,
> >                                    S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
> >                                    S_028C44_DISABLE_START_OF_PRIM(1));
> >         radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
> >                                    SI_TRACKED_DB_DFSM_CONTROL,
> >                                    S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
> >                                    S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
> >         if (initial_cdw != sctx->gfx_cs->current.cdw)
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >  }
> >
> >  void si_emit_dpbb_state(struct si_context *sctx)
> >  {
> >         struct si_screen *sscreen = sctx->screen;
> >         struct si_state_blend *blend = sctx->queued.named.blend;
> >         struct si_state_dsa *dsa = sctx->queued.named.dsa;
> >         unsigned db_shader_control = sctx->ps_db_shader_control;
> >
> >         assert(sctx->chip_class >= GFX9);
> > @@ -436,12 +436,12 @@ void si_emit_dpbb_state(struct si_context *sctx)
> >                 S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
> >                 S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
> >                 S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
> >                 S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
> >                 S_028C44_OPTIMAL_BIN_SELECTION(1));
> >         radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
> >                                    SI_TRACKED_DB_DFSM_CONTROL,
> >                                    S_028060_PUNCHOUT_MODE(punchout_mode) |
> >                                    S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
> >         if (initial_cdw != sctx->gfx_cs->current.cdw)
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >  }
> > diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> > index 2a514f144b9..8798f9ad0a0 100644
> > --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> > +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> > @@ -59,21 +59,21 @@ static unsigned si_conv_pipe_prim(unsigned mode)
> >         return prim_conv[mode];
> >  }
> >
> >  /**
> >   * This calculates the LDS size for tessellation shaders (VS, TCS, TES).
> >   * LS.LDS_SIZE is shared by all 3 shader stages.
> >   *
> >   * The information about LDS and other non-compile-time parameters is then
> >   * written to userdata SGPRs.
> >   */
> > -static bool si_emit_derived_tess_state(struct si_context *sctx,
> > +static void si_emit_derived_tess_state(struct si_context *sctx,
> >                                        const struct pipe_draw_info *info,
> >                                        unsigned *num_patches)
> >  {
> >         struct radeon_cmdbuf *cs = sctx->gfx_cs;
> >         struct si_shader *ls_current;
> >         struct si_shader_selector *ls;
> >         /* The TES pointer will only be used for sctx->last_tcs.
> >          * It would be wrong to think that TCS = TES. */
> >         struct si_shader_selector *tcs =
> >                 sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
> > @@ -103,21 +103,21 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
> >                 ls = sctx->vs_shader.cso;
> >         }
> >
> >         if (sctx->last_ls == ls_current &&
> >             sctx->last_tcs == tcs &&
> >             sctx->last_tes_sh_base == tes_sh_base &&
> >             sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
> >             (!has_primid_instancing_bug ||
> >              (sctx->last_tess_uses_primid == tess_uses_primid))) {
> >                 *num_patches = sctx->last_num_patches;
> > -               return false;
> > +               return;
> >         }
> >
> >         sctx->last_ls = ls_current;
> >         sctx->last_tcs = tcs;
> >         sctx->last_tes_sh_base = tes_sh_base;
> >         sctx->last_num_tcs_input_cp = num_tcs_input_cp;
> >         sctx->last_tess_uses_primid = tess_uses_primid;
> >
> >         /* This calculates how shader inputs and outputs among VS, TCS, and TES
> >          * are laid out in LDS. */
> > @@ -298,23 +298,22 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
> >
> >         if (sctx->last_ls_hs_config != ls_hs_config) {
> >                 if (sctx->chip_class >= CIK) {
> >                         radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2,
> >                                                    ls_hs_config);
> >                 } else {
> >                         radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG,
> >                                                ls_hs_config);
> >                 }
> >                 sctx->last_ls_hs_config = ls_hs_config;
> > -               return true; /* true if the context rolls */
> > +               sctx->context_roll = true;
> >         }
> > -       return false;
> >  }
> >
> >  static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
> >  {
> >         switch (info->mode) {
> >         case PIPE_PRIM_PATCHES:
> >                 return info->count / info->vertices_per_patch;
> >         case PIPE_PRIM_POLYGON:
> >                 return info->count >= 3;
> >         case SI_PRIM_RECTANGLE_LIST:
> > @@ -534,44 +533,44 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
> >                      (info->instance_count > 1 &&
> >                       (info->count_from_stream_output ||
> >                        si_num_prims_for_vertices(info) <= 1))))
> >                         sctx->flags |= SI_CONTEXT_VGT_FLUSH;
> >         }
> >
> >         return ia_multi_vgt_param;
> >  }
> >
> >  /* rast_prim is the primitive type after GS. */
> > -static bool si_emit_rasterizer_prim_state(struct si_context *sctx)
> > +static void si_emit_rasterizer_prim_state(struct si_context *sctx)
> >  {
> >         struct radeon_cmdbuf *cs = sctx->gfx_cs;
> >         enum pipe_prim_type rast_prim = sctx->current_rast_prim;
> >         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
> >
> >         /* Skip this if not rendering lines. */
> >         if (!util_prim_is_lines(rast_prim))
> > -               return false;
> > +               return;
> >
> >         if (rast_prim == sctx->last_rast_prim &&
> >             rs->pa_sc_line_stipple == sctx->last_sc_line_stipple)
> > -               return false;
> > +               return;
> >
> >         /* For lines, reset the stipple pattern at each primitive. Otherwise,
> >          * reset the stipple pattern at each packet (line strips, line loops).
> >          */
> >         radeon_set_context_reg(cs, R_028A0C_PA_SC_LINE_STIPPLE,
> >                                rs->pa_sc_line_stipple |
> >                                S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2));
> >
> >         sctx->last_rast_prim = rast_prim;
> >         sctx->last_sc_line_stipple = rs->pa_sc_line_stipple;
> > -       return true; /* true if the context rolls */
> > +       sctx->context_roll = true;
> >  }
> >
> >  static void si_emit_vs_state(struct si_context *sctx,
> >                              const struct pipe_draw_info *info)
> >  {
> >         sctx->current_vs_state &= C_VS_STATE_INDEXED;
> >         sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size);
> >
> >         if (sctx->num_vs_blit_sgprs) {
> >                 /* Re-emit the state after we leave u_blitter. */
> > @@ -652,20 +651,21 @@ static void si_emit_draw_registers(struct si_context *sctx,
> >                         radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
> >                                                info->primitive_restart);
> >
> >                 sctx->last_primitive_restart_en = info->primitive_restart;
> >
> >         }
> >         if (si_prim_restart_index_changed(sctx, info)) {
> >                 radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
> >                                        info->restart_index);
> >                 sctx->last_restart_index = info->restart_index;
> > +               sctx->context_roll = true;
> >         }
> >  }
> >
> >  static void si_emit_draw_packets(struct si_context *sctx,
> >                                  const struct pipe_draw_info *info,
> >                                  struct pipe_resource *indexbuf,
> >                                  unsigned index_size,
> >                                  unsigned index_offset)
> >  {
> >         struct pipe_draw_indirect_info *indirect = info->indirect;
> > @@ -889,20 +889,25 @@ static void si_emit_surface_sync(struct si_context *sctx,
> >                 radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
> >                 radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
> >         } else {
> >                 /* ACQUIRE_MEM is only required on a compute ring. */
> >                 radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
> >                 radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
> >                 radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
> >                 radeon_emit(cs, 0);             /* CP_COHER_BASE */
> >                 radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
> >         }
> > +
> > +       /* ACQUIRE_MEM has an implicit context roll if the current context
> > +        * is busy. */
> > +       if (sctx->has_graphics)
> > +               sctx->context_roll = true;
> >  }
> >
> >  void si_emit_cache_flush(struct si_context *sctx)
> >  {
> >         struct radeon_cmdbuf *cs = sctx->gfx_cs;
> >         uint32_t flags = sctx->flags;
> >
> >         if (!sctx->has_graphics) {
> >                 /* Only process compute flags. */
> >                 flags &= SI_CONTEXT_INV_ICACHE |
> > @@ -1216,40 +1221,24 @@ static void si_get_draw_start_count(struct si_context *sctx,
> >         } else {
> >                 *start = info->start;
> >                 *count = info->count;
> >         }
> >  }
> >
> >  static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
> >                                unsigned skip_atom_mask)
> >  {
> >         unsigned num_patches = 0;
> > -       /* Vega10/Raven scissor bug workaround. When any context register is
> > -        * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
> > -        * registers must be written too.
> > -        */
> > -       bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) &&
> > -                                 !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors);
> > -       bool context_roll = false; /* set correctly for GFX9 only */
> >
> > -       context_roll |= si_emit_rasterizer_prim_state(sctx);
> > +       si_emit_rasterizer_prim_state(sctx);
> >         if (sctx->tes_shader.cso)
> > -               context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches);
> > -
> > -       if (handle_scissor_bug &&
> > -           (info->count_from_stream_output ||
> > -            sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
> > -            sctx->dirty_states & si_states_that_always_roll_context() ||
> > -            si_prim_restart_index_changed(sctx, info)))
> > -               context_roll = true;
> > -
> > -       sctx->context_roll_counter = 0;
> > +               si_emit_derived_tess_state(sctx, info, &num_patches);
> >
> >         /* Emit state atoms. */
> >         unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
> >         while (mask)
> >                 sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
> >
> >         sctx->dirty_atoms &= skip_atom_mask;
> >
> >         /* Emit states. */
> >         mask = sctx->dirty_states;
> > @@ -1258,26 +1247,20 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
> >                 struct si_pm4_state *state = sctx->queued.array[i];
> >
> >                 if (!state || sctx->emitted.array[i] == state)
> >                         continue;
> >
> >                 si_pm4_emit(sctx, state);
> >                 sctx->emitted.array[i] = state;
> >         }
> >         sctx->dirty_states = 0;
> >
> > -       if (handle_scissor_bug &&
> > -           (context_roll || sctx->context_roll_counter)) {
> > -               sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
> > -               sctx->atoms.s.scissors.emit(sctx);
> > -       }
> > -
> >         /* Emit draw states. */
> >         si_emit_vs_state(sctx, info);
> >         si_emit_draw_registers(sctx, info, num_patches);
> >  }
> >
> >  static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> >  {
> >         struct si_context *sctx = (struct si_context *)ctx;
> >         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
> >         struct pipe_resource *indexbuf = info->index.resource;
> > @@ -1462,45 +1445,66 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> >
> >         si_need_gfx_cs_space(sctx);
> >
> >         /* Since we've called si_context_add_resource_size for vertex buffers,
> >          * this must be called after si_need_cs_space, because we must let
> >          * need_cs_space flush before we add buffers to the buffer list.
> >          */
> >         if (!si_upload_vertex_buffer_descriptors(sctx))
> >                 goto return_cleanup;
> >
> > +       /* Vega10/Raven scissor bug workaround. When any context register is
> > +        * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
> > +        * registers must be written too.
> > +        */
> > +       bool has_gfx9_scissor_bug = sctx->screen->has_gfx9_scissor_bug;
> > +       unsigned masked_atoms = 0;
> > +
> > +       if (has_gfx9_scissor_bug) {
> > +               masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
> > +
> > +               if (info->count_from_stream_output ||
> > +                   sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
> > +                   sctx->dirty_states & si_states_that_always_roll_context())
> > +                       sctx->context_roll = true;
> > +       }
> > +
> >         /* Use optimal packet order based on whether we need to sync the pipeline. */
> >         if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
> >                                     SI_CONTEXT_FLUSH_AND_INV_DB |
> >                                     SI_CONTEXT_PS_PARTIAL_FLUSH |
> >                                     SI_CONTEXT_CS_PARTIAL_FLUSH))) {
> >                 /* If we have to wait for idle, set all states first, so that all
> >                  * SET packets are processed in parallel with previous draw calls.
> >                  * Then draw and prefetch at the end. This ensures that the time
> >                  * the CUs are idle is very short.
> >                  */
> > -               unsigned masked_atoms = 0;
> > -
> >                 if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
> >                         masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
> >
> >                 if (!si_upload_graphics_shader_descriptors(sctx))
> >                         goto return_cleanup;
> >
> >                 /* Emit all states except possibly render condition. */
> >                 si_emit_all_states(sctx, info, masked_atoms);
> >                 si_emit_cache_flush(sctx);
> >                 /* <-- CUs are idle here. */
> >
> >                 if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
> >                         sctx->atoms.s.render_cond.emit(sctx);
> > +
> > +               if (has_gfx9_scissor_bug &&
> > +                   (sctx->context_roll ||
> > +                    si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
> > +                       sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
> > +                       sctx->atoms.s.scissors.emit(sctx);
> > +               }
> >                 sctx->dirty_atoms = 0;
> >
> >                 si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
> >                 /* <-- CUs are busy here. */
> >
> >                 /* Start prefetches after the draw has been started. Both will run
> >                  * in parallel, but starting the draw first is more important.
> >                  */
> >                 if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
> >                         cik_emit_prefetch_L2(sctx, false);
> > @@ -1511,29 +1515,41 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> >                 if (sctx->flags)
> >                         si_emit_cache_flush(sctx);
> >
> >                 /* Only prefetch the API VS and VBO descriptors. */
> >                 if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
> >                         cik_emit_prefetch_L2(sctx, true);
> >
> >                 if (!si_upload_graphics_shader_descriptors(sctx))
> >                         return;
> >
> > -               si_emit_all_states(sctx, info, 0);
> > +               si_emit_all_states(sctx, info, masked_atoms);
> > +
> > +               if (has_gfx9_scissor_bug &&
> > +                   (sctx->context_roll ||
> > +                    si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
> > +                       sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
> > +                       sctx->atoms.s.scissors.emit(sctx);
> > +               }
> > +               sctx->dirty_atoms = 0;
> > +
> >                 si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
> >
> >                 /* Prefetch the remaining shaders after the draw has been
> >                  * started. */
> >                 if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
> >                         cik_emit_prefetch_L2(sctx, false);
> >         }
> >
> > +       /* Clear the context roll flag after the draw call. */
> > +       sctx->context_roll = false;
> > +
> >         if (unlikely(sctx->current_saved_cs)) {
> >                 si_trace_emit(sctx);
> >                 si_log_draw_state(sctx, sctx->log);
> >         }
> >
> >         /* Workaround for a VGT hang when streamout is enabled.
> >          * It must be done after drawing. */
> >         if ((sctx->family == CHIP_HAWAII ||
> >              sctx->family == CHIP_TONGA ||
> >              sctx->family == CHIP_FIJI) &&
> > diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> > index 5bdfd4f6ac1..d00bb170981 100644
> > --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> > +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> > @@ -569,21 +569,21 @@ static void si_emit_shader_es(struct si_context *sctx)
> >                 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
> >                                            SI_TRACKED_VGT_TF_PARAM,
> >                                            shader->vgt_tf_param);
> >
> >         if (shader->vgt_vertex_reuse_block_cntl)
> >                 radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
> >                                            SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
> >                                            shader->vgt_vertex_reuse_block_cntl);
> >
> >         if (initial_cdw != sctx->gfx_cs->current.cdw)
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >  }
> >
> >  static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
> >  {
> >         struct si_pm4_state *pm4;
> >         unsigned num_user_sgprs;
> >         unsigned vgpr_comp_cnt;
> >         uint64_t va;
> >         unsigned oc_lds_en;
> >
> > @@ -818,21 +818,21 @@ static void si_emit_shader_gs(struct si_context *sctx)
> >                 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
> >                                            SI_TRACKED_VGT_TF_PARAM,
> >                                            shader->vgt_tf_param);
> >                 if (shader->vgt_vertex_reuse_block_cntl)
> >                         radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
> >                                                    SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
> >                                                    shader->vgt_vertex_reuse_block_cntl);
> >         }
> >
> >         if (initial_cdw != sctx->gfx_cs->current.cdw)
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >  }
> >
> >  static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
> >  {
> >         struct si_shader_selector *sel = shader->selector;
> >         const ubyte *num_components = sel->info.num_stream_output_components;
> >         unsigned gs_num_invocations = sel->gs_num_invocations;
> >         struct si_pm4_state *pm4;
> >         uint64_t va;
> >         unsigned max_stream = sel->max_gs_stream;
> > @@ -995,21 +995,21 @@ static void si_emit_shader_vs(struct si_context *sctx)
> >                 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
> >                                            SI_TRACKED_VGT_TF_PARAM,
> >                                            shader->vgt_tf_param);
> >
> >         if (shader->vgt_vertex_reuse_block_cntl)
> >                 radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
> >                                            SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
> >                                            shader->vgt_vertex_reuse_block_cntl);
> >
> >         if (initial_cdw != sctx->gfx_cs->current.cdw)
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >  }
> >
> >  /**
> >   * Compute the state for \p shader, which will run as a vertex shader on the
> >   * hardware.
> >   *
> >   * If \p gs is non-NULL, it points to the geometry shader for which this shader
> >   * is the copy shader.
> >   */
> >  static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
> > @@ -1187,21 +1187,21 @@ static void si_emit_shader_ps(struct si_context *sctx)
> >         radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT,
> >                                     SI_TRACKED_SPI_SHADER_Z_FORMAT,
> >                                     shader->ctx_reg.ps.spi_shader_z_format,
> >                                     shader->ctx_reg.ps.spi_shader_col_format);
> >
> >         radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK,
> >                                    SI_TRACKED_CB_SHADER_MASK,
> >                                    shader->ctx_reg.ps.cb_shader_mask);
> >
> >         if (initial_cdw != sctx->gfx_cs->current.cdw)
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >  }
> >
> >  static void si_shader_ps(struct si_shader *shader)
> >  {
> >         struct tgsi_shader_info *info = &shader->selector->info;
> >         struct si_pm4_state *pm4;
> >         unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
> >         unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
> >         uint64_t va;
> >         unsigned input_ena = shader->config.spi_ps_input_ena;
> > @@ -2863,21 +2863,21 @@ static void si_emit_spi_map(struct si_context *sctx)
> >
> >         /* R_028644_SPI_PS_INPUT_CNTL_0 */
> >         /* Dota 2: Only ~16% of SPI map updates set different values. */
> >         /* Talos: Only ~9% of SPI map updates set different values. */
> >         unsigned initial_cdw = sctx->gfx_cs->current.cdw;
> >         radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
> >                                     spi_ps_input_cntl,
> >                                     sctx->tracked_regs.spi_ps_input_cntl, num_interp);
> >
> >         if (initial_cdw != sctx->gfx_cs->current.cdw)
> > -               sctx->context_roll_counter++;
> > +               sctx->context_roll = true;
> >  }
> >
> >  /**
> >   * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
> >   */
> >  static void si_init_config_add_vgt_flush(struct si_context *sctx)
> >  {
> >         if (sctx->init_config_has_vgt_flush)
> >                 return;
> >
> > diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
> > index 2bf6862c89b..2a0a4bef9a2 100644
> > --- a/src/gallium/drivers/radeonsi/si_state_streamout.c
> > +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
> > @@ -296,20 +296,21 @@ void si_emit_streamout_end(struct si_context *sctx)
> >                 radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
> >                                           t[i]->buf_filled_size,
> >                                           RADEON_USAGE_WRITE,
> >                                           RADEON_PRIO_SO_FILLED_SIZE);
> >
> >                 /* Zero the buffer size. The counters (primitives generated,
> >                  * primitives emitted) may be enabled even if there is not a
> >                  * buffer bound. This ensures that the primitives-emitted query
> >                  * won't increment. */
> >                 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
> > +               sctx->context_roll = true;
> >
> >                 t[i]->buf_filled_size_valid = true;
> >         }
> >
> >         sctx->streamout.begin_emitted = false;
> >  }
> >
> >  /* STREAMOUT CONFIG DERIVED STATE
> >   *
> >   * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
> > diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
> > index f988da4520b..6f348a9b58d 100644
> > --- a/src/gallium/drivers/radeonsi/si_state_viewport.c
> > +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
> > @@ -276,21 +276,21 @@ static void si_emit_guardband(struct si_context *ctx)
> >         radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
> >                                    SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
> >                                    S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
> >                                    S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
> >         radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,
> >                                    SI_TRACKED_PA_SU_VTX_CNTL,
> >                                    S_028BE4_PIX_CENTER(rs->half_pixel_center) |
> >                                    S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +
> >                                                        vp_as_scissor.quant_mode));
> >         if (initial_cdw != ctx->gfx_cs->current.cdw)
> > -               ctx->context_roll_counter++;
> > +               ctx->context_roll = true;
> >  }
> >
> >  static void si_emit_scissors(struct si_context *ctx)
> >  {
> >         struct radeon_cmdbuf *cs = ctx->gfx_cs;
> >         struct pipe_scissor_state *states = ctx->scissors.states;
> >         unsigned mask = ctx->scissors.dirty_mask;
> >         bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
> >
> >         /* The simple case: Only 1 viewport is active. */
> > --
> > 2.17.1
> >
> > _______________________________________________
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
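The pattern at the heart of the patch is small enough to show in isolation. Below is a minimal, self-contained C sketch, with hypothetical stand-in names (struct ctx, opt_set_context_reg, draw) rather than the real Mesa API: each emit helper compares the command-buffer size (cdw) before and after its redundancy-elided register writes and sets a single context_roll flag if anything was appended; the draw path then re-emits the scissor state on chips with the Vega10/Raven bug whenever that flag (or a dirty scissor atom) is set, and clears the flag once the draw is done.

/* Hypothetical, simplified sketch of the context-roll tracking pattern.
 * None of these names are the actual Mesa API. */
#include <stdbool.h>
#include <stdio.h>

struct cmdbuf {
        unsigned cdw; /* dwords written so far */
};

struct ctx {
        struct cmdbuf cs;
        bool context_roll;        /* set whenever a context register is written */
        bool has_gfx9_scissor_bug;
        unsigned tracked_reg;     /* shadow of the last value written */
};

/* Skip the write when the value is unchanged, in the spirit of
 * radeon_opt_set_context_reg(). */
static void opt_set_context_reg(struct ctx *c, unsigned value)
{
        if (c->tracked_reg == value)
                return;
        c->tracked_reg = value;
        c->cs.cdw += 2; /* packet header + register value */
}

/* Emit helper: a context roll happened exactly when the optimized writes
 * appended packets, which the before/after cdw comparison detects. */
static void emit_some_state(struct ctx *c, unsigned value)
{
        unsigned initial_cdw = c->cs.cdw;

        opt_set_context_reg(c, value);

        if (initial_cdw != c->cs.cdw)
                c->context_roll = true;
}

/* Draw path: on affected chips, re-emit scissors whenever the context
 * rolled, then clear the flag for the next draw. */
static void draw(struct ctx *c, unsigned state_value)
{
        emit_some_state(c, state_value);

        if (c->has_gfx9_scissor_bug && c->context_roll)
                printf("context rolled -> re-emit PA_SC_VPORT_SCISSOR\n");
        else
                printf("no roll -> scissors left alone\n");

        c->context_roll = false;
}

int main(void)
{
        struct ctx c = { .has_gfx9_scissor_bug = true };

        draw(&c, 1); /* value changes: roll detected, scissors re-emitted */
        draw(&c, 1); /* redundant write elided: no roll, no re-emit */
        return 0;
}

Replacing the old per-function return values and context_roll_counter with one sticky flag on the context is what lets paths that run outside si_emit_all_states(), such as si_emit_surface_sync() and si_emit_streamout_end() in the diff above, participate in the tracking.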