Sure
On Mon, 14 Jan 2019 at 16:50, Samuel Pitoiset <samuel.pitoi...@gmail.com> wrote: > > While you are on it, can you experiment with the tracked ctx stuff that > RadeonSI implements (ie. SI_TRACKED_XXX)? > > This approach will likely be more costly from the CPU side, but it will > reduce the number of register changes a lot more. > > Not sure if that will improve anything though, but I think it's worth a > try? > > On 1/14/19 5:12 PM, Rhys Perry wrote: > > I did and found small improvements in Rise of the Tomb Raider. I > > measured framerates ~104.3% that of without the changes for the > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3% > > for Prophets Tomb. > > > > I found no change with Dota 2 but I've heard it's cpu-bound. > > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset <samuel.pitoi...@gmail.com> > > wrote: > >> Did you benchmark? > >> > >> On 1/14/19 5:01 PM, Rhys Perry wrote: > >>> It's common in some applications to bind a new graphics pipeline without > >>> ending up changing any context registers. > >>> > >>> This has each pipeline have two command buffers: one for setting context > >>> registers and one for everything else. The context register command buffer > >>> is only emitted if it differs from the previous pipeline's. 
> >>> > >>> Signed-off-by: Rhys Perry <pendingchao...@gmail.com> > >>> --- > >>> src/amd/vulkan/radv_cmd_buffer.c | 46 +++++-- > >>> src/amd/vulkan/radv_pipeline.c | 217 ++++++++++++++++--------------- > >>> src/amd/vulkan/radv_private.h | 2 + > >>> 3 files changed, 150 insertions(+), 115 deletions(-) > >>> > >>> diff --git a/src/amd/vulkan/radv_cmd_buffer.c > >>> b/src/amd/vulkan/radv_cmd_buffer.c > >>> index f41d6c0b3e7..59903ab64d8 100644 > >>> --- a/src/amd/vulkan/radv_cmd_buffer.c > >>> +++ b/src/amd/vulkan/radv_cmd_buffer.c > >>> @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer > >>> *cmd_buffer, > >>> } > >>> } > >>> > >>> -static void > >>> +static bool > >>> radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, > >>> struct radv_pipeline *pipeline) > >>> { > >>> @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer > >>> *cmd_buffer, > >>> cmd_buffer->sample_positions_needed = true; > >>> > >>> if (old_pipeline && num_samples == > >>> old_pipeline->graphics.ms.num_samples) > >>> - return; > >>> + return false; > >>> > >>> radeon_set_context_reg_seq(cmd_buffer->cs, > >>> R_028BDC_PA_SC_LINE_CNTL, 2); > >>> radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl); > >>> @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer > >>> *cmd_buffer, > >>> radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); > >>> radeon_emit(cmd_buffer->cs, > >>> EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); > >>> } > >>> + > >>> + return true; > >>> } > >>> > >>> static void > >>> @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer > >>> *cmd_buffer) > >>> radeon_emit(cmd_buffer->cs, sx_blend_opt_control); > >>> } > >>> > >>> -static void > >>> +static bool > >>> radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) > >>> { > >>> struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; > >>> > >>> if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline) > >>> - return; 
> >>> + return false; > >>> > >>> - radv_update_multisample_state(cmd_buffer, pipeline); > >>> + bool context_roll = radv_update_multisample_state(cmd_buffer, > >>> pipeline); > >>> > >>> cmd_buffer->scratch_size_needed = > >>> MAX2(cmd_buffer->scratch_size_needed, > >>> @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > >>> *cmd_buffer) > >>> > >>> radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, > >>> pipeline->cs.cdw); > >>> > >>> + if (!cmd_buffer->state.emitted_pipeline || > >>> + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != > >>> pipeline->ctx_cs.cdw || > >>> + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != > >>> pipeline->ctx_cs_hash || > >>> + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, > >>> + pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) { > >>> + radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, > >>> pipeline->ctx_cs.cdw); > >>> + context_roll = true; > >>> + } > >>> + > >>> for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) { > >>> if (!pipeline->shaders[i]) > >>> continue; > >>> @@ -902,6 +913,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > >>> *cmd_buffer) > >>> cmd_buffer->state.emitted_pipeline = pipeline; > >>> > >>> cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE; > >>> + > >>> + return context_roll; > >>> } > >>> > >>> static void > >>> @@ -2859,6 +2872,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer > >>> *cmd_buffer) > >>> if (!pipeline || pipeline == > >>> cmd_buffer->state.emitted_compute_pipeline) > >>> return; > >>> > >>> + assert(!pipeline->ctx_cs.cdw); > >>> + > >>> cmd_buffer->state.emitted_compute_pipeline = pipeline; > >>> > >>> radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, > >>> pipeline->cs.cdw); > >>> @@ -3609,30 +3624,30 @@ radv_emit_draw_packets(struct radv_cmd_buffer > >>> *cmd_buffer, > >>> * any context registers. 
> >>> */ > >>> static bool radv_need_late_scissor_emission(struct radv_cmd_buffer > >>> *cmd_buffer, > >>> - bool indexed_draw) > >>> + bool indexed_draw, > >>> + bool pipeline_context_roll) > >>> { > >>> struct radv_cmd_state *state = &cmd_buffer->state; > >>> > >>> if (!cmd_buffer->device->physical_device->has_scissor_bug) > >>> return false; > >>> > >>> + if (pipeline_context_roll) > >>> + return true; > >>> + > >>> uint32_t used_states = > >>> cmd_buffer->state.pipeline->graphics.needed_dynamic_state | > >>> ~RADV_CMD_DIRTY_DYNAMIC_ALL; > >>> > >>> /* Index, vertex and streamout buffers don't change context regs, > >>> and > >>> - * pipeline is handled later. > >>> + * pipeline is already handled. > >>> */ > >>> used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | > >>> RADV_CMD_DIRTY_VERTEX_BUFFER | > >>> RADV_CMD_DIRTY_STREAMOUT_BUFFER | > >>> RADV_CMD_DIRTY_PIPELINE); > >>> > >>> - /* Assume all state changes except these two can imply context > >>> rolls. */ > >>> if (cmd_buffer->state.dirty & used_states) > >>> return true; > >>> > >>> - if (cmd_buffer->state.emitted_pipeline != > >>> cmd_buffer->state.pipeline) > >>> - return true; > >>> - > >>> if (indexed_draw && state->pipeline->graphics.prim_restart_enable > >>> && > >>> (state->index_type ? 
0xffffffffu : 0xffffu) != > >>> state->last_primitive_reset_index) > >>> return true; > >>> @@ -3644,14 +3659,19 @@ static void > >>> radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, > >>> const struct radv_draw_info *info) > >>> { > >>> - bool late_scissor_emission = > >>> radv_need_late_scissor_emission(cmd_buffer, info->indexed); > >>> + bool late_scissor_emission; > >>> + bool pipeline_context_roll = false; > >>> > >>> if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) || > >>> cmd_buffer->state.emitted_pipeline != > >>> cmd_buffer->state.pipeline) > >>> radv_emit_rbplus_state(cmd_buffer); > >>> > >>> if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) > >>> - radv_emit_graphics_pipeline(cmd_buffer); > >>> + pipeline_context_roll = > >>> radv_emit_graphics_pipeline(cmd_buffer); > >>> + > >>> + late_scissor_emission = > >>> + radv_need_late_scissor_emission(cmd_buffer, info->indexed, > >>> + pipeline_context_roll); > >>> > >>> if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) > >>> radv_emit_framebuffer_state(cmd_buffer); > >>> diff --git a/src/amd/vulkan/radv_pipeline.c > >>> b/src/amd/vulkan/radv_pipeline.c > >>> index 9d5da43532f..fba77584013 100644 > >>> --- a/src/amd/vulkan/radv_pipeline.c > >>> +++ b/src/amd/vulkan/radv_pipeline.c > >>> @@ -2525,7 +2525,7 @@ radv_compute_bin_size(struct radv_pipeline > >>> *pipeline, const VkGraphicsPipelineCr > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_binning_state(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_binning_state(struct radeon_cmdbuf *ctx_cs, > >>> struct radv_pipeline *pipeline, > >>> const VkGraphicsPipelineCreateInfo > >>> *pCreateInfo) > >>> { > >>> @@ -2575,15 +2575,15 @@ radv_pipeline_generate_binning_state(struct > >>> radeon_cmdbuf *cs, > >>> S_028C44_OPTIMAL_BIN_SELECTION(1); > >>> } > >>> > >>> - radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0, > >>> + radeon_set_context_reg(ctx_cs, R_028C44_PA_SC_BINNER_CNTL_0, > >>> 
pa_sc_binner_cntl_0); > >>> - radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL, > >>> + radeon_set_context_reg(ctx_cs, R_028060_DB_DFSM_CONTROL, > >>> db_dfsm_control); > >>> } > >>> > >>> > >>> static void > >>> -radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *ctx_cs, > >>> struct radv_pipeline > >>> *pipeline, > >>> const > >>> VkGraphicsPipelineCreateInfo *pCreateInfo, > >>> const struct > >>> radv_graphics_pipeline_create_info *extra) > >>> @@ -2656,35 +2656,35 @@ radv_pipeline_generate_depth_stencil_state(struct > >>> radeon_cmdbuf *cs, > >>> db_render_override |= S_02800C_DISABLE_VIEWPORT_CLAMP(1); > >>> } > >>> > >>> - radeon_set_context_reg(cs, R_028800_DB_DEPTH_CONTROL, > >>> db_depth_control); > >>> - radeon_set_context_reg(cs, R_02842C_DB_STENCIL_CONTROL, > >>> db_stencil_control); > >>> + radeon_set_context_reg(ctx_cs, R_028800_DB_DEPTH_CONTROL, > >>> db_depth_control); > >>> + radeon_set_context_reg(ctx_cs, R_02842C_DB_STENCIL_CONTROL, > >>> db_stencil_control); > >>> > >>> - radeon_set_context_reg(cs, R_028000_DB_RENDER_CONTROL, > >>> db_render_control); > >>> - radeon_set_context_reg(cs, R_02800C_DB_RENDER_OVERRIDE, > >>> db_render_override); > >>> - radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2, > >>> db_render_override2); > >>> + radeon_set_context_reg(ctx_cs, R_028000_DB_RENDER_CONTROL, > >>> db_render_control); > >>> + radeon_set_context_reg(ctx_cs, R_02800C_DB_RENDER_OVERRIDE, > >>> db_render_override); > >>> + radeon_set_context_reg(ctx_cs, R_028010_DB_RENDER_OVERRIDE2, > >>> db_render_override2); > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_blend_state(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_blend_state(struct radeon_cmdbuf *ctx_cs, > >>> struct radv_pipeline *pipeline, > >>> const struct radv_blend_state > >>> *blend) > >>> { > >>> - radeon_set_context_reg_seq(cs, R_028780_CB_BLEND0_CONTROL, 8); > >>> - 
radeon_emit_array(cs, blend->cb_blend_control, > >>> + radeon_set_context_reg_seq(ctx_cs, R_028780_CB_BLEND0_CONTROL, 8); > >>> + radeon_emit_array(ctx_cs, blend->cb_blend_control, > >>> 8); > >>> - radeon_set_context_reg(cs, R_028808_CB_COLOR_CONTROL, > >>> blend->cb_color_control); > >>> - radeon_set_context_reg(cs, R_028B70_DB_ALPHA_TO_MASK, > >>> blend->db_alpha_to_mask); > >>> + radeon_set_context_reg(ctx_cs, R_028808_CB_COLOR_CONTROL, > >>> blend->cb_color_control); > >>> + radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, > >>> blend->db_alpha_to_mask); > >>> > >>> if (pipeline->device->physical_device->has_rbplus) { > >>> > >>> - radeon_set_context_reg_seq(cs, R_028760_SX_MRT0_BLEND_OPT, > >>> 8); > >>> - radeon_emit_array(cs, blend->sx_mrt_blend_opt, 8); > >>> + radeon_set_context_reg_seq(ctx_cs, > >>> R_028760_SX_MRT0_BLEND_OPT, 8); > >>> + radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8); > >>> } > >>> > >>> - radeon_set_context_reg(cs, R_028714_SPI_SHADER_COL_FORMAT, > >>> blend->spi_shader_col_format); > >>> + radeon_set_context_reg(ctx_cs, R_028714_SPI_SHADER_COL_FORMAT, > >>> blend->spi_shader_col_format); > >>> > >>> - radeon_set_context_reg(cs, R_028238_CB_TARGET_MASK, > >>> blend->cb_target_mask); > >>> - radeon_set_context_reg(cs, R_02823C_CB_SHADER_MASK, > >>> blend->cb_shader_mask); > >>> + radeon_set_context_reg(ctx_cs, R_028238_CB_TARGET_MASK, > >>> blend->cb_target_mask); > >>> + radeon_set_context_reg(ctx_cs, R_02823C_CB_SHADER_MASK, > >>> blend->cb_shader_mask); > >>> > >>> pipeline->graphics.col_format = blend->spi_shader_col_format; > >>> pipeline->graphics.cb_target_mask = blend->cb_target_mask; > >>> @@ -2702,7 +2702,7 @@ radv_get_conservative_raster_mode(const > >>> VkPipelineRasterizationStateCreateInfo * > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_raster_state(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_raster_state(struct radeon_cmdbuf *ctx_cs, > >>> struct radv_pipeline *pipeline, > >>> 
const VkGraphicsPipelineCreateInfo > >>> *pCreateInfo) > >>> { > >>> @@ -2711,14 +2711,14 @@ radv_pipeline_generate_raster_state(struct > >>> radeon_cmdbuf *cs, > >>> radv_get_conservative_raster_mode(vkraster); > >>> uint32_t pa_sc_conservative_rast = > >>> S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1); > >>> > >>> - radeon_set_context_reg(cs, R_028810_PA_CL_CLIP_CNTL, > >>> + radeon_set_context_reg(ctx_cs, R_028810_PA_CL_CLIP_CNTL, > >>> S_028810_DX_CLIP_SPACE_DEF(1) | // vulkan > >>> uses DX conventions. > >>> > >>> S_028810_ZCLIP_NEAR_DISABLE(vkraster->depthClampEnable ? 1 : 0) | > >>> > >>> S_028810_ZCLIP_FAR_DISABLE(vkraster->depthClampEnable ? 1 : 0) | > >>> > >>> S_028810_DX_RASTERIZATION_KILL(vkraster->rasterizerDiscardEnable ? 1 : 0) > >>> | > >>> S_028810_DX_LINEAR_ATTR_CLIP_ENA(1)); > >>> > >>> - radeon_set_context_reg(cs, R_0286D4_SPI_INTERP_CONTROL_0, > >>> + radeon_set_context_reg(ctx_cs, R_0286D4_SPI_INTERP_CONTROL_0, > >>> S_0286D4_FLAT_SHADE_ENA(1) | > >>> S_0286D4_PNT_SPRITE_ENA(1) | > >>> > >>> S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | > >>> @@ -2727,12 +2727,12 @@ radv_pipeline_generate_raster_state(struct > >>> radeon_cmdbuf *cs, > >>> > >>> S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | > >>> S_0286D4_PNT_SPRITE_TOP_1(0)); /* vulkan is > >>> top to bottom - 1.0 at bottom */ > >>> > >>> - radeon_set_context_reg(cs, R_028BE4_PA_SU_VTX_CNTL, > >>> + radeon_set_context_reg(ctx_cs, R_028BE4_PA_SU_VTX_CNTL, > >>> S_028BE4_PIX_CENTER(1) | // TODO verify > >>> > >>> S_028BE4_ROUND_MODE(V_028BE4_X_ROUND_TO_EVEN) | > >>> > >>> S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH)); > >>> > >>> - radeon_set_context_reg(cs, R_028814_PA_SU_SC_MODE_CNTL, > >>> + radeon_set_context_reg(ctx_cs, R_028814_PA_SU_SC_MODE_CNTL, > >>> S_028814_FACE(vkraster->frontFace) | > >>> S_028814_CULL_FRONT(!!(vkraster->cullMode & > >>> VK_CULL_MODE_FRONT_BIT)) | > >>> S_028814_CULL_BACK(!!(vkraster->cullMode & > >>> VK_CULL_MODE_BACK_BIT)) | > >>> 
@@ -2773,37 +2773,37 @@ radv_pipeline_generate_raster_state(struct > >>> radeon_cmdbuf *cs, > >>> } > >>> } > >>> > >>> - radeon_set_context_reg(cs, > >>> R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, > >>> + radeon_set_context_reg(ctx_cs, > >>> R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, > >>> pa_sc_conservative_rast); > >>> } > >>> > >>> > >>> static void > >>> -radv_pipeline_generate_multisample_state(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_multisample_state(struct radeon_cmdbuf *ctx_cs, > >>> struct radv_pipeline > >>> *pipeline) > >>> { > >>> struct radv_multisample_state *ms = &pipeline->graphics.ms; > >>> > >>> - radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); > >>> - radeon_emit(cs, ms->pa_sc_aa_mask[0]); > >>> - radeon_emit(cs, ms->pa_sc_aa_mask[1]); > >>> + radeon_set_context_reg_seq(ctx_cs, > >>> R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); > >>> + radeon_emit(ctx_cs, ms->pa_sc_aa_mask[0]); > >>> + radeon_emit(ctx_cs, ms->pa_sc_aa_mask[1]); > >>> > >>> - radeon_set_context_reg(cs, R_028804_DB_EQAA, ms->db_eqaa); > >>> - radeon_set_context_reg(cs, R_028A4C_PA_SC_MODE_CNTL_1, > >>> ms->pa_sc_mode_cntl_1); > >>> + radeon_set_context_reg(ctx_cs, R_028804_DB_EQAA, ms->db_eqaa); > >>> + radeon_set_context_reg(ctx_cs, R_028A4C_PA_SC_MODE_CNTL_1, > >>> ms->pa_sc_mode_cntl_1); > >>> > >>> /* The exclusion bits can be set to improve rasterization > >>> efficiency > >>> * if no sample lies on the pixel boundary (-8 sample offset). It's > >>> * currently always TRUE because the driver doesn't support 16 > >>> samples. 
> >>> */ > >>> bool exclusion = > >>> pipeline->device->physical_device->rad_info.chip_class >= CIK; > >>> - radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, > >>> + radeon_set_context_reg(ctx_cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, > >>> S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | > >>> S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *cs, > >>> - const struct radv_pipeline *pipeline) > >>> +radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs, > >>> + struct radv_pipeline *pipeline) > >>> { > >>> const struct radv_vs_output_info *outinfo = > >>> get_vs_output_info(pipeline); > >>> > >>> @@ -2821,12 +2821,13 @@ radv_pipeline_generate_vgt_gs_mode(struct > >>> radeon_cmdbuf *cs, > >>> vgt_primitiveid_en = true; > >>> } > >>> > >>> - radeon_set_context_reg(cs, R_028A84_VGT_PRIMITIVEID_EN, > >>> vgt_primitiveid_en); > >>> - radeon_set_context_reg(cs, R_028A40_VGT_GS_MODE, vgt_gs_mode); > >>> + radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, > >>> vgt_primitiveid_en); > >>> + radeon_set_context_reg(ctx_cs, R_028A40_VGT_GS_MODE, vgt_gs_mode); > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *ctx_cs, > >>> + struct radeon_cmdbuf *cs, > >>> struct radv_pipeline *pipeline, > >>> struct radv_shader_variant *shader) > >>> { > >>> @@ -2847,10 +2848,10 @@ radv_pipeline_generate_hw_vs(struct radeon_cmdbuf > >>> *cs, > >>> outinfo->writes_layer || > >>> outinfo->writes_viewport_index; > >>> > >>> - radeon_set_context_reg(cs, R_0286C4_SPI_VS_OUT_CONFIG, > >>> + radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG, > >>> S_0286C4_VS_EXPORT_COUNT(MAX2(1, > >>> outinfo->param_exports) - 1)); > >>> > >>> - radeon_set_context_reg(cs, R_02870C_SPI_SHADER_POS_FORMAT, > >>> + radeon_set_context_reg(ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT, > >>> > >>> 
S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | > >>> > >>> S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? > >>> > >>> V_02870C_SPI_SHADER_4COMP : > >>> @@ -2862,13 +2863,13 @@ radv_pipeline_generate_hw_vs(struct radeon_cmdbuf > >>> *cs, > >>> > >>> V_02870C_SPI_SHADER_4COMP : > >>> > >>> V_02870C_SPI_SHADER_NONE)); > >>> > >>> - radeon_set_context_reg(cs, R_028818_PA_CL_VTE_CNTL, > >>> + radeon_set_context_reg(ctx_cs, R_028818_PA_CL_VTE_CNTL, > >>> S_028818_VTX_W0_FMT(1) | > >>> S_028818_VPORT_X_SCALE_ENA(1) | > >>> S_028818_VPORT_X_OFFSET_ENA(1) | > >>> S_028818_VPORT_Y_SCALE_ENA(1) | > >>> S_028818_VPORT_Y_OFFSET_ENA(1) | > >>> S_028818_VPORT_Z_SCALE_ENA(1) | > >>> S_028818_VPORT_Z_OFFSET_ENA(1)); > >>> > >>> - radeon_set_context_reg(cs, R_02881C_PA_CL_VS_OUT_CNTL, > >>> + radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL, > >>> > >>> S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) | > >>> > >>> S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) | > >>> > >>> S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) | > >>> @@ -2880,7 +2881,7 @@ radv_pipeline_generate_hw_vs(struct radeon_cmdbuf > >>> *cs, > >>> clip_dist_mask); > >>> > >>> if (pipeline->device->physical_device->rad_info.chip_class <= VI) > >>> - radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF, > >>> + radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF, > >>> outinfo->writes_viewport_index); > >>> } > >>> > >>> @@ -2948,7 +2949,8 @@ radv_pipeline_generate_hw_hs(struct radeon_cmdbuf > >>> *cs, > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs, > >>> + struct radeon_cmdbuf *cs, > >>> struct radv_pipeline *pipeline, > >>> const struct radv_tessellation_state > >>> *tess) > >>> { > >>> @@ -2964,11 +2966,12 @@ radv_pipeline_generate_vertex_shader(struct > >>> radeon_cmdbuf *cs, > >>> else if (vs->info.vs.as_es) > >>> 
radv_pipeline_generate_hw_es(cs, pipeline, vs); > >>> else > >>> - radv_pipeline_generate_hw_vs(cs, pipeline, vs); > >>> + radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs); > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *ctx_cs, > >>> + struct radeon_cmdbuf *cs, > >>> struct radv_pipeline *pipeline, > >>> const struct radv_tessellation_state > >>> *tess) > >>> { > >>> @@ -2984,24 +2987,25 @@ radv_pipeline_generate_tess_shaders(struct > >>> radeon_cmdbuf *cs, > >>> if (tes->info.tes.as_es) > >>> radv_pipeline_generate_hw_es(cs, pipeline, tes); > >>> else > >>> - radv_pipeline_generate_hw_vs(cs, pipeline, tes); > >>> + radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, > >>> tes); > >>> } > >>> > >>> radv_pipeline_generate_hw_hs(cs, pipeline, tcs, tess); > >>> > >>> - radeon_set_context_reg(cs, R_028B6C_VGT_TF_PARAM, > >>> + radeon_set_context_reg(ctx_cs, R_028B6C_VGT_TF_PARAM, > >>> tess->tf_param); > >>> > >>> if (pipeline->device->physical_device->rad_info.chip_class >= CIK) > >>> - radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, > >>> + radeon_set_context_reg_idx(ctx_cs, > >>> R_028B58_VGT_LS_HS_CONFIG, 2, > >>> tess->ls_hs_config); > >>> else > >>> - radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, > >>> + radeon_set_context_reg(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, > >>> tess->ls_hs_config); > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs, > >>> + struct radeon_cmdbuf *cs, > >>> struct radv_pipeline *pipeline, > >>> const struct radv_gs_state > >>> *gs_state) > >>> { > >>> @@ -3022,32 +3026,32 @@ radv_pipeline_generate_geometry_shader(struct > >>> radeon_cmdbuf *cs, > >>> > >>> offset = num_components[0] * gs_max_out_vertices; > >>> > >>> - radeon_set_context_reg_seq(cs, 
R_028A60_VGT_GSVS_RING_OFFSET_1, 3); > >>> - radeon_emit(cs, offset); > >>> + radeon_set_context_reg_seq(ctx_cs, R_028A60_VGT_GSVS_RING_OFFSET_1, > >>> 3); > >>> + radeon_emit(ctx_cs, offset); > >>> if (max_stream >= 1) > >>> offset += num_components[1] * gs_max_out_vertices; > >>> - radeon_emit(cs, offset); > >>> + radeon_emit(ctx_cs, offset); > >>> if (max_stream >= 2) > >>> offset += num_components[2] * gs_max_out_vertices; > >>> - radeon_emit(cs, offset); > >>> + radeon_emit(ctx_cs, offset); > >>> if (max_stream >= 3) > >>> offset += num_components[3] * gs_max_out_vertices; > >>> - radeon_set_context_reg(cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset); > >>> + radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, > >>> offset); > >>> > >>> - radeon_set_context_reg(cs, R_028B38_VGT_GS_MAX_VERT_OUT, > >>> gs->info.gs.vertices_out); > >>> + radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, > >>> gs->info.gs.vertices_out); > >>> > >>> - radeon_set_context_reg_seq(cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4); > >>> - radeon_emit(cs, num_components[0]); > >>> - radeon_emit(cs, (max_stream >= 1) ? num_components[1] : 0); > >>> - radeon_emit(cs, (max_stream >= 2) ? num_components[2] : 0); > >>> - radeon_emit(cs, (max_stream >= 3) ? num_components[3] : 0); > >>> + radeon_set_context_reg_seq(ctx_cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, > >>> 4); > >>> + radeon_emit(ctx_cs, num_components[0]); > >>> + radeon_emit(ctx_cs, (max_stream >= 1) ? num_components[1] : 0); > >>> + radeon_emit(ctx_cs, (max_stream >= 2) ? num_components[2] : 0); > >>> + radeon_emit(ctx_cs, (max_stream >= 3) ? 
num_components[3] : 0); > >>> > >>> uint32_t gs_num_invocations = gs->info.gs.invocations; > >>> - radeon_set_context_reg(cs, R_028B90_VGT_GS_INSTANCE_CNT, > >>> + radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT, > >>> S_028B90_CNT(MIN2(gs_num_invocations, 127)) > >>> | > >>> S_028B90_ENABLE(gs_num_invocations > 0)); > >>> > >>> - radeon_set_context_reg(cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, > >>> + radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, > >>> gs_state->vgt_esgs_ring_itemsize); > >>> > >>> va = radv_buffer_get_va(gs->bo) + gs->bo_offset; > >>> @@ -3061,8 +3065,8 @@ radv_pipeline_generate_geometry_shader(struct > >>> radeon_cmdbuf *cs, > >>> radeon_emit(cs, gs->rsrc1); > >>> radeon_emit(cs, gs->rsrc2 | > >>> S_00B22C_LDS_SIZE(gs_state->lds_size)); > >>> > >>> - radeon_set_context_reg(cs, R_028A44_VGT_GS_ONCHIP_CNTL, > >>> gs_state->vgt_gs_onchip_cntl); > >>> - radeon_set_context_reg(cs, > >>> R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, > >>> gs_state->vgt_gs_max_prims_per_subgroup); > >>> + radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, > >>> gs_state->vgt_gs_onchip_cntl); > >>> + radeon_set_context_reg(ctx_cs, > >>> R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, > >>> gs_state->vgt_gs_max_prims_per_subgroup); > >>> } else { > >>> radeon_set_sh_reg_seq(cs, R_00B220_SPI_SHADER_PGM_LO_GS, > >>> 4); > >>> radeon_emit(cs, va >> 8); > >>> @@ -3071,7 +3075,7 @@ radv_pipeline_generate_geometry_shader(struct > >>> radeon_cmdbuf *cs, > >>> radeon_emit(cs, gs->rsrc2); > >>> } > >>> > >>> - radv_pipeline_generate_hw_vs(cs, pipeline, > >>> pipeline->gs_copy_shader); > >>> + radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, > >>> pipeline->gs_copy_shader); > >>> } > >>> > >>> static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade) > >>> @@ -3093,8 +3097,8 @@ static uint32_t offset_to_ps_input(uint32_t offset, > >>> bool flat_shade) > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *cs, 
> >>> - struct radv_pipeline *pipeline) > >>> +radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, > >>> + struct radv_pipeline *pipeline) > >>> { > >>> struct radv_shader_variant *ps = > >>> pipeline->shaders[MESA_SHADER_FRAGMENT]; > >>> const struct radv_vs_output_info *outinfo = > >>> get_vs_output_info(pipeline); > >>> @@ -3165,9 +3169,9 @@ radv_pipeline_generate_ps_inputs(struct > >>> radeon_cmdbuf *cs, > >>> } > >>> > >>> if (ps_offset) { > >>> - radeon_set_context_reg_seq(cs, > >>> R_028644_SPI_PS_INPUT_CNTL_0, ps_offset); > >>> + radeon_set_context_reg_seq(ctx_cs, > >>> R_028644_SPI_PS_INPUT_CNTL_0, ps_offset); > >>> for (unsigned i = 0; i < ps_offset; i++) { > >>> - radeon_emit(cs, ps_input_cntl[i]); > >>> + radeon_emit(ctx_cs, ps_input_cntl[i]); > >>> } > >>> } > >>> } > >>> @@ -3205,7 +3209,8 @@ radv_compute_db_shader_control(const struct > >>> radv_device *device, > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *ctx_cs, > >>> + struct radeon_cmdbuf *cs, > >>> struct radv_pipeline *pipeline) > >>> { > >>> struct radv_shader_variant *ps; > >>> @@ -3221,22 +3226,22 @@ radv_pipeline_generate_fragment_shader(struct > >>> radeon_cmdbuf *cs, > >>> radeon_emit(cs, ps->rsrc1); > >>> radeon_emit(cs, ps->rsrc2); > >>> > >>> - radeon_set_context_reg(cs, R_02880C_DB_SHADER_CONTROL, > >>> + radeon_set_context_reg(ctx_cs, R_02880C_DB_SHADER_CONTROL, > >>> > >>> radv_compute_db_shader_control(pipeline->device, > >>> pipeline, > >>> ps)); > >>> > >>> - radeon_set_context_reg(cs, R_0286CC_SPI_PS_INPUT_ENA, > >>> + radeon_set_context_reg(ctx_cs, R_0286CC_SPI_PS_INPUT_ENA, > >>> ps->config.spi_ps_input_ena); > >>> > >>> - radeon_set_context_reg(cs, R_0286D0_SPI_PS_INPUT_ADDR, > >>> + radeon_set_context_reg(ctx_cs, R_0286D0_SPI_PS_INPUT_ADDR, > >>> ps->config.spi_ps_input_addr); > >>> > >>> - radeon_set_context_reg(cs, 
R_0286D8_SPI_PS_IN_CONTROL, > >>> + radeon_set_context_reg(ctx_cs, R_0286D8_SPI_PS_IN_CONTROL, > >>> > >>> S_0286D8_NUM_INTERP(ps->info.fs.num_interp)); > >>> > >>> - radeon_set_context_reg(cs, R_0286E0_SPI_BARYC_CNTL, > >>> pipeline->graphics.spi_baryc_cntl); > >>> + radeon_set_context_reg(ctx_cs, R_0286E0_SPI_BARYC_CNTL, > >>> pipeline->graphics.spi_baryc_cntl); > >>> > >>> - radeon_set_context_reg(cs, R_028710_SPI_SHADER_Z_FORMAT, > >>> + radeon_set_context_reg(ctx_cs, R_028710_SPI_SHADER_Z_FORMAT, > >>> > >>> ac_get_spi_shader_z_format(ps->info.info.ps.writes_z, > >>> > >>> ps->info.info.ps.writes_stencil, > >>> > >>> ps->info.info.ps.writes_sample_mask)); > >>> @@ -3249,7 +3254,7 @@ radv_pipeline_generate_fragment_shader(struct > >>> radeon_cmdbuf *cs, > >>> } > >>> > >>> static void > >>> -radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *cs, > >>> +radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *ctx_cs, > >>> struct radv_pipeline *pipeline) > >>> { > >>> if (pipeline->device->physical_device->rad_info.family < > >>> CHIP_POLARIS10) > >>> @@ -3260,7 +3265,7 @@ radv_pipeline_generate_vgt_vertex_reuse(struct > >>> radeon_cmdbuf *cs, > >>> radv_get_shader(pipeline, > >>> MESA_SHADER_TESS_EVAL)->info.tes.spacing == TESS_SPACING_FRACTIONAL_ODD) { > >>> vtx_reuse_depth = 14; > >>> } > >>> - radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, > >>> + radeon_set_context_reg(ctx_cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, > >>> S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth)); > >>> } > >>> > >>> @@ -3330,38 +3335,46 @@ radv_pipeline_generate_pm4(struct radv_pipeline > >>> *pipeline, > >>> const struct radv_gs_state *gs, > >>> unsigned prim, unsigned gs_out) > >>> { > >>> - pipeline->cs.buf = malloc(4 * 256); > >>> - pipeline->cs.max_dw = 256; > >>> - > >>> - radv_pipeline_generate_depth_stencil_state(&pipeline->cs, pipeline, > >>> pCreateInfo, extra); > >>> - radv_pipeline_generate_blend_state(&pipeline->cs, pipeline, blend); > >>> - 
radv_pipeline_generate_raster_state(&pipeline->cs, pipeline, > >>> pCreateInfo); > >>> - radv_pipeline_generate_multisample_state(&pipeline->cs, pipeline); > >>> - radv_pipeline_generate_vgt_gs_mode(&pipeline->cs, pipeline); > >>> - radv_pipeline_generate_vertex_shader(&pipeline->cs, pipeline, tess); > >>> - radv_pipeline_generate_tess_shaders(&pipeline->cs, pipeline, tess); > >>> - radv_pipeline_generate_geometry_shader(&pipeline->cs, pipeline, gs); > >>> - radv_pipeline_generate_fragment_shader(&pipeline->cs, pipeline); > >>> - radv_pipeline_generate_ps_inputs(&pipeline->cs, pipeline); > >>> - radv_pipeline_generate_vgt_vertex_reuse(&pipeline->cs, pipeline); > >>> - radv_pipeline_generate_binning_state(&pipeline->cs, pipeline, > >>> pCreateInfo); > >>> - > >>> - radeon_set_context_reg(&pipeline->cs, R_0286E8_SPI_TMPRING_SIZE, > >>> + struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs; > >>> + struct radeon_cmdbuf *cs = &pipeline->cs; > >>> + > >>> + cs->max_dw = 64; > >>> + ctx_cs->max_dw = 256; > >>> + cs->buf = malloc(4 * (cs->max_dw + ctx_cs->max_dw)); > >>> + ctx_cs->buf = cs->buf + cs->max_dw; > >>> + > >>> + radv_pipeline_generate_depth_stencil_state(ctx_cs, pipeline, > >>> pCreateInfo, extra); > >>> + radv_pipeline_generate_blend_state(ctx_cs, pipeline, blend); > >>> + radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo); > >>> + radv_pipeline_generate_multisample_state(ctx_cs, pipeline); > >>> + radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline); > >>> + radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess); > >>> + radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline, tess); > >>> + radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline, gs); > >>> + radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline); > >>> + radv_pipeline_generate_ps_inputs(ctx_cs, pipeline); > >>> + radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline); > >>> + radv_pipeline_generate_binning_state(ctx_cs, pipeline, pCreateInfo); > >>> + > 
>>> + radeon_set_context_reg(ctx_cs, R_0286E8_SPI_TMPRING_SIZE, > >>> S_0286E8_WAVES(pipeline->max_waves) | > >>> > >>> S_0286E8_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10)); > >>> > >>> - radeon_set_context_reg(&pipeline->cs, > >>> R_028B54_VGT_SHADER_STAGES_EN, > >>> radv_compute_vgt_shader_stages_en(pipeline)); > >>> + radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, > >>> radv_compute_vgt_shader_stages_en(pipeline)); > >>> > >>> if (pipeline->device->physical_device->rad_info.chip_class >= CIK) > >>> { > >>> - radeon_set_uconfig_reg_idx(&pipeline->cs, > >>> R_030908_VGT_PRIMITIVE_TYPE, 1, prim); > >>> + radeon_set_uconfig_reg_idx(cs, R_030908_VGT_PRIMITIVE_TYPE, > >>> 1, prim); > >>> } else { > >>> - radeon_set_config_reg(&pipeline->cs, > >>> R_008958_VGT_PRIMITIVE_TYPE, prim); > >>> + radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, > >>> prim); > >>> } > >>> - radeon_set_context_reg(&pipeline->cs, > >>> R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out); > >>> + radeon_set_context_reg(ctx_cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, > >>> gs_out); > >>> > >>> - radeon_set_context_reg(&pipeline->cs, R_02820C_PA_SC_CLIPRECT_RULE, > >>> radv_compute_cliprect_rule(pCreateInfo)); > >>> + radeon_set_context_reg(ctx_cs, R_02820C_PA_SC_CLIPRECT_RULE, > >>> radv_compute_cliprect_rule(pCreateInfo)); > >>> > >>> - assert(pipeline->cs.cdw <= pipeline->cs.max_dw); > >>> + pipeline->ctx_cs_hash = _mesa_hash_data(ctx_cs->buf, ctx_cs->cdw * > >>> 4); > >>> + > >>> + assert(ctx_cs->cdw <= ctx_cs->max_dw); > >>> + assert(cs->cdw <= cs->max_dw); > >>> } > >>> > >>> static struct radv_ia_multi_vgt_param_helpers > >>> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h > >>> index 6089ee6a607..f6534ae3309 100644 > >>> --- a/src/amd/vulkan/radv_private.h > >>> +++ b/src/amd/vulkan/radv_private.h > >>> @@ -1365,6 +1365,8 @@ struct radv_pipeline { > >>> VkShaderStageFlags active_stages; > >>> > >>> struct radeon_cmdbuf cs; > >>> + uint32_t ctx_cs_hash; > >>> 
+ struct radeon_cmdbuf ctx_cs; > >>> > >>> struct radv_vertex_elements_info vertex_elements; > >>> _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev