By applied patch 1, gfx IB size changes obviously.
openarena GFX-IB-size changes from 7.64k to 7.44k. glxgears GFX-IB-size changes from 8.48k to 8.416k. The adding CPU workloads are quite small in this case and it's hard to measure it. So the assumption is that writing a SET_CONTEXT_REG packet is more expensive. And we emit too many redundant packets. Some registers rarely get different values. Thanks, Sonny ________________________________ From: Dave Airlie <airl...@gmail.com> Sent: Thursday, June 7, 2018 5:20:12 PM To: Jiang, Sonny Cc: mesa-dev Subject: Re: [Mesa-dev] [PATCH 1/6] radeonsi: emit_db_render_state packets optimization On 8 June 2018 at 02:13, Sonny Jiang <sonny.ji...@amd.com> wrote: > Remembering latest states of registers to eliminate redunant SET_CONTEXT_REG > packets > Does this help anywhere btw? Numbers for an app or game. In the past I've always found this sort of optimisation pointless, the CP on the GPU side does a pretty good job at nuking pointless register sets or at least hiding them. This will increase CPU and memory usage in some ways in favour of reducing CP and possibly memory bandwidth, but without numbers I'm not sure it's a good plan. Dave. > Signed-off-by: Sonny Jiang <sonny.ji...@amd.com> > --- > src/gallium/drivers/radeonsi/si_build_pm4.h | 43 +++++++++++++++++++++ > src/gallium/drivers/radeonsi/si_gfx_cs.c | 3 ++ > src/gallium/drivers/radeonsi/si_pipe.h | 2 + > src/gallium/drivers/radeonsi/si_state.c | 60 > +++++++++++++++-------------- > src/gallium/drivers/radeonsi/si_state.h | 16 ++++++++ > 5 files changed, 95 insertions(+), 29 deletions(-) > > diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h > b/src/gallium/drivers/radeonsi/si_build_pm4.h > index 22f5558..45d943f 100644 > --- a/src/gallium/drivers/radeonsi/si_build_pm4.h > +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h > @@ -110,4 +110,47 @@ static inline void radeon_set_uconfig_reg_idx(struct > radeon_winsys_cs *cs, > radeon_emit(cs, value); > } > > +/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */ > +static inline void radeon_opt_set_context_reg(struct si_context *sctx, > unsigned offset, > + enum si_tracked_reg reg, > unsigned value) > +{ > + struct radeon_winsys_cs *cs = sctx->gfx_cs; > + > + if (!(sctx->tracked_regs.reg_saved & (1 << reg)) || > + sctx->tracked_regs.reg_value[reg] != value ) { > + > + radeon_set_context_reg(cs, offset, value); > + > + sctx->tracked_regs.reg_saved |= 1 << reg; > + sctx->tracked_regs.reg_value[reg] = value; > + } > +} > + > +/** > + * Set 2 consecutive registers if any registers value is different. > + * @param offset starting register offset > + * @param value1 is written to first register > + * @param value2 is written to second register > + */ > +static inline void radeon_opt_set_context_reg2(struct si_context *sctx, > unsigned offset, > + enum si_tracked_reg reg, > unsigned value1, > + unsigned value2) > +{ > + struct radeon_winsys_cs *cs = sctx->gfx_cs; > + > + if (!(sctx->tracked_regs.reg_saved & (1 << reg)) || > + !(sctx->tracked_regs.reg_saved & (1 << (reg + 1))) || > + sctx->tracked_regs.reg_value[reg] != value1 || > + sctx->tracked_regs.reg_value[reg+1] != value2 ) { > + > + radeon_set_context_reg_seq(cs, offset, 2); > + radeon_emit(cs, value1); > + radeon_emit(cs, value2); > + > + sctx->tracked_regs.reg_value[reg] = value1; > + sctx->tracked_regs.reg_value[reg+1] = value2; > + sctx->tracked_regs.reg_saved |= 3 << reg; > + } > +} > + > #endif > diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c > b/src/gallium/drivers/radeonsi/si_gfx_cs.c > index ec74c1b..0b9a020 100644 > --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c > +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c > @@ -321,4 +321,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx) > ctx->last_num_tcs_input_cp = -1; > > ctx->cs_shader_state.initialized = false; > + > + /* Set all saved registers state to unknown */ > + ctx->tracked_regs.reg_saved = 0; > } > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h > b/src/gallium/drivers/radeonsi/si_pipe.h > index 5d1671f..82a8263 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.h > +++ b/src/gallium/drivers/radeonsi/si_pipe.h > @@ -1033,6 +1033,8 @@ struct si_context { > > void (*dma_clear_buffer)(struct si_context *sctx, struct > pipe_resource *dst, > uint64_t offset, uint64_t size, unsigned > value); > + > + struct si_tracked_regs tracked_regs; > }; > > /* cik_sdma.c */ > diff --git a/src/gallium/drivers/radeonsi/si_state.c > b/src/gallium/drivers/radeonsi/si_state.c > index 3a7e928..c95b929 100644 > --- a/src/gallium/drivers/radeonsi/si_state.c > +++ b/src/gallium/drivers/radeonsi/si_state.c > @@ -1343,28 +1343,25 @@ void si_save_qbo_state(struct si_context *sctx, > struct si_qbo_state *st) > > static void si_emit_db_render_state(struct si_context *sctx) > { > - struct radeon_winsys_cs *cs = sctx->gfx_cs; > struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; > - unsigned db_shader_control; > - > - radeon_set_context_reg_seq(cs, R_028000_DB_RENDER_CONTROL, 2); > + unsigned db_shader_control, db_render_control, db_count_control; > > /* DB_RENDER_CONTROL */ > if (sctx->dbcb_depth_copy_enabled || > sctx->dbcb_stencil_copy_enabled) { > - radeon_emit(cs, > - > S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | > - > S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | > - S_028000_COPY_CENTROID(1) | > - S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample)); > + db_render_control = > + S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | > + > S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | > + S_028000_COPY_CENTROID(1) | > + S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); > } else if (sctx->db_flush_depth_inplace || > sctx->db_flush_stencil_inplace) { > - radeon_emit(cs, > - > S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | > - > S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace)); > + db_render_control = > + > S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | > + > S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); > } else { > - radeon_emit(cs, > - S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) > | > - > S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear)); > + db_render_control = > + S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | > + S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); > } > > /* DB_COUNT_CONTROL (occlusion queries) */ > @@ -1373,28 +1370,33 @@ static void si_emit_db_render_state(struct si_context > *sctx) > bool perfect = sctx->num_perfect_occlusion_queries > 0; > > if (sctx->chip_class >= CIK) { > - radeon_emit(cs, > - S_028004_PERFECT_ZPASS_COUNTS(perfect) | > - > S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples) | > - S_028004_ZPASS_ENABLE(1) | > - S_028004_SLICE_EVEN_ENABLE(1) | > - S_028004_SLICE_ODD_ENABLE(1)); > + db_count_control = > + S_028004_PERFECT_ZPASS_COUNTS(perfect) | > + > S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples) | > + S_028004_ZPASS_ENABLE(1) | > + S_028004_SLICE_EVEN_ENABLE(1) | > + S_028004_SLICE_ODD_ENABLE(1); > } else { > - radeon_emit(cs, > - S_028004_PERFECT_ZPASS_COUNTS(perfect) | > - > S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples)); > + db_count_control = > + S_028004_PERFECT_ZPASS_COUNTS(perfect) | > + > S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); > } > } else { > /* Disable occlusion queries. */ > if (sctx->chip_class >= CIK) { > - radeon_emit(cs, 0); > + db_count_control = 0; > } else { > - radeon_emit(cs, S_028004_ZPASS_INCREMENT_DISABLE(1)); > + db_count_control = > S_028004_ZPASS_INCREMENT_DISABLE(1); > } > } > > + radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, > + SI_TRACKED_DB_RENDER_CONTROL, > db_render_control, > + db_count_control); > + > /* DB_RENDER_OVERRIDE2 */ > - radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2, > + radeon_opt_set_context_reg(sctx, R_028010_DB_RENDER_OVERRIDE2, > + SI_TRACKED_DB_RENDER_OVERRIDE2, > > S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) > | > > S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) > | > S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples > >= 4)); > @@ -1415,8 +1417,8 @@ static void si_emit_db_render_state(struct si_context > *sctx) > !sctx->screen->rbplus_allowed) > db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); > > - radeon_set_context_reg(cs, R_02880C_DB_SHADER_CONTROL, > - db_shader_control); > + radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, > + SI_TRACKED_DB_SHADER_CONTROL, > db_shader_control); > } > > /* > diff --git a/src/gallium/drivers/radeonsi/si_state.h > b/src/gallium/drivers/radeonsi/si_state.h > index d235f31..fb5f721 100644 > --- a/src/gallium/drivers/radeonsi/si_state.h > +++ b/src/gallium/drivers/radeonsi/si_state.h > @@ -206,6 +206,22 @@ struct si_shader_data { > uint32_t sh_base[SI_NUM_SHADERS]; > }; > > +/* The list of registers whose emitted values are remembered by si_context. > */ > +enum si_tracked_reg { > + SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */ > + SI_TRACKED_DB_COUNT_CONTROL, > + > + SI_TRACKED_DB_RENDER_OVERRIDE2, > + SI_TRACKED_DB_SHADER_CONTROL, > + > + SI_NUM_TRACKED_REGS, > +}; > + > +struct si_tracked_regs { > + uint32_t reg_saved; > + uint32_t reg_value[SI_NUM_TRACKED_REGS]; > +}; > + > /* Private read-write buffer slots. */ > enum { > SI_ES_RING_ESGS, > -- > 2.7.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev