On Fri, Feb 10, 2017 at 11:02:17AM -0800, Jason Ekstrand wrote: > This helps Dota 2 on Broadwell by 8-9%. I also hacked up the driver and > used the Sascha "shadowmapping" demo to get some results. Setting > uses_kill to true dropped the framerate on the demo by 25-30%. Enabling > the PMA fix brought it back up to around 90% of the original framerate. > > Reviewed-by: Lionel Landwerlin <lionel.g.landwer...@intel.com> > --- > src/intel/vulkan/TODO | 1 - > src/intel/vulkan/anv_cmd_buffer.c | 2 + > src/intel/vulkan/anv_genX.h | 3 + > src/intel/vulkan/anv_private.h | 17 +++++ > src/intel/vulkan/gen7_cmd_buffer.c | 7 ++ > src/intel/vulkan/gen8_cmd_buffer.c | 131 > +++++++++++++++++++++++++++++++++++++ > src/intel/vulkan/genX_blorp_exec.c | 5 ++ > src/intel/vulkan/genX_cmd_buffer.c | 15 ++++- > src/intel/vulkan/genX_pipeline.c | 42 ++++++++++++ > 9 files changed, 221 insertions(+), 2 deletions(-)
This patch is, Reviewed-by: Nanley Chery <nanley.g.ch...@intel.com> > > diff --git a/src/intel/vulkan/TODO b/src/intel/vulkan/TODO > index 38acc0d..f8b73a1 100644 > --- a/src/intel/vulkan/TODO > +++ b/src/intel/vulkan/TODO > @@ -12,5 +12,4 @@ Performance: > - Compressed multisample support > - Pushing pieces of UBOs? > - Enable guardband clipping > - - pma stall workaround > - Use soft-pin to avoid relocations > diff --git a/src/intel/vulkan/anv_cmd_buffer.c > b/src/intel/vulkan/anv_cmd_buffer.c > index 5886fa6..8c08f8d 100644 > --- a/src/intel/vulkan/anv_cmd_buffer.c > +++ b/src/intel/vulkan/anv_cmd_buffer.c > @@ -135,6 +135,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer) > state->restart_index = UINT32_MAX; > state->dynamic = default_dynamic_state; > state->need_query_wa = true; > + state->pma_fix_enabled = false; > + state->hiz_enabled = false; > > if (state->attachments != NULL) { > vk_free(&cmd_buffer->pool->alloc, state->attachments); > diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h > index d04fe38..67147b0 100644 > --- a/src/intel/vulkan/anv_genX.h > +++ b/src/intel/vulkan/anv_genX.h > @@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct > anv_cmd_buffer *cmd_buffer); > > void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer); > > +void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, > + bool enable); > + > void > genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, > const struct gen_l3_config *l3_config, > diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h > index 4fe3ebc..fa6032e 100644 > --- a/src/intel/vulkan/anv_private.h > +++ b/src/intel/vulkan/anv_private.h > @@ -1163,6 +1163,20 @@ struct anv_cmd_state { > bool need_query_wa; > > /** > + * Whether or not the gen8 PMA fix is enabled. We ensure that, at the top > + * of any command buffer it is disabled by disabling it in > EndCommandBuffer > + * and before invoking the secondary in ExecuteCommands. > + */ > + bool pma_fix_enabled; > + > + /** > + * Whether or not we know for certain that HiZ is enabled for the current > + * subpass. If, for whatever reason, we are unsure as to whether HiZ is > + * enabled or not, this will be false. > + */ > + bool hiz_enabled; > + > + /** > * Array length is anv_cmd_state::pass::attachment_count. Array content is > * valid only when recording a render pass instance. > */ > @@ -1465,8 +1479,11 @@ struct anv_pipeline { > > uint32_t cs_right_mask; > > + bool writes_depth; > + bool depth_test_enable; > bool writes_stencil; > bool depth_clamp_enable; > + bool kill_pixel; > > struct { > uint32_t sf[7]; > diff --git a/src/intel/vulkan/gen7_cmd_buffer.c > b/src/intel/vulkan/gen7_cmd_buffer.c > index 013ed87..c1a25e8 100644 > --- a/src/intel/vulkan/gen7_cmd_buffer.c > +++ b/src/intel/vulkan/gen7_cmd_buffer.c > @@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_state)(struct > anv_cmd_buffer *cmd_buffer) > cmd_buffer->state.dirty = 0; > } > > +void > +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, > + bool enable) > +{ > + /* The NP PMA fix doesn't exist on gen7 */ > +} > + > void genX(CmdSetEvent)( > VkCommandBuffer commandBuffer, > VkEvent event, > diff --git a/src/intel/vulkan/gen8_cmd_buffer.c > b/src/intel/vulkan/gen8_cmd_buffer.c > index 8c8de62..0628f3a 100644 > --- a/src/intel/vulkan/gen8_cmd_buffer.c > +++ b/src/intel/vulkan/gen8_cmd_buffer.c > @@ -155,6 +155,133 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer) > #endif > > void > +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool > enable) > +{ > +#if GEN_GEN == 8 > + if (cmd_buffer->state.pma_fix_enabled == enable) > + return; > + > + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { > + pc.DepthCacheFlushEnable = true; > + pc.CommandStreamerStallEnable = true; > + pc.RenderTargetCacheFlushEnable = true; > + } > + > + uint32_t cache_mode; > + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1), > + .NPPMAFixEnable = enable, > + .NPEarlyZFailsDisable = enable, > + .NPPMAFixEnableMask = true, > + .NPEarlyZFailsDisableMask = true); > + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { > + lri.RegisterOffset = GENX(CACHE_MODE_1_num); > + lri.DataDWord = cache_mode; > + } > + > + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache > + * Flush bits is often necessary. We do it regardless because it's > easier. > + * The render cache flush is also necessary if stencil writes are enabled. > + */ > + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { > + pc.DepthStallEnable = true; > + pc.DepthCacheFlushEnable = true; > + pc.RenderTargetCacheFlushEnable = true; > + } > + > + cmd_buffer->state.pma_fix_enabled = enable; > +#endif /* GEN_GEN == 8 */ > +} > + > +static inline bool > +want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer) > +{ > + assert(GEN_GEN == 8); > + > + /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE: > + * > + * SW must set this bit in order to enable this fix when following > + * expression is TRUE. > + * > + * 3DSTATE_WM::ForceThreadDispatch != 1 && > + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) && > + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && > + * (3DSTATE_DEPTH_BUFFER::HIZ Enable) && > + * !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) && > + * (3DSTATE_PS_EXTRA::PixelShaderValid) && > + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || > + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || > + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || > + * 3DSTATE_WM_HZ_OP::StencilBufferClear) && > + * (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) && > + * (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || > + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || > + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || > + * 3DSTATE_PS_BLEND::AlphaTestEnable || > + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) && > + * 3DSTATE_WM::ForceKillPix != ForceOff && > + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && > + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) || > + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && > + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE && > + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) || > + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != > PSCDEPTH_OFF)) > + */ > + > + /* These are always true: > + * 3DSTATE_WM::ForceThreadDispatch != 1 && > + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) > + */ > + > + /* We only enable the PMA fix if we know for certain that HiZ is enabled. > + * If we don't know whether HiZ is enabled or not, we disable the PMA fix > + * and there is no harm. > + * > + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && > + * 3DSTATE_DEPTH_BUFFER::HIZ Enable > + */ > + if (!cmd_buffer->state.hiz_enabled) > + return false; > + > + /* 3DSTATE_PS_EXTRA::PixelShaderValid */ > + struct anv_pipeline *pipeline = cmd_buffer->state.pipeline; > + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) > + return false; > + > + /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */ > + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); > + if (wm_prog_data->early_fragment_tests) > + return false; > + > + /* We never use anv_pipeline for HiZ ops so this is trivially true: > + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || > + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || > + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || > + * 3DSTATE_WM_HZ_OP::StencilBufferClear) > + */ > + > + /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */ > + if (!pipeline->depth_test_enable) > + return false; > + > + /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || > + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || > + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || > + * 3DSTATE_PS_BLEND::AlphaTestEnable || > + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) && > + * 3DSTATE_WM::ForceKillPix != ForceOff && > + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && > + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) || > + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && > + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE && > + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) || > + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF)) > + */ > + return (pipeline->kill_pixel && (pipeline->writes_depth || > + pipeline->writes_stencil)) || > + wm_prog_data->computed_depth_mode != PSCDEPTH_OFF; > +} > + > +void > genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) > { > struct anv_pipeline *pipeline = cmd_buffer->state.pipeline; > @@ -211,6 +338,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct > anv_cmd_buffer *cmd_buffer) > } > > if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE | > + ANV_CMD_DIRTY_RENDER_TARGETS | > ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK > | > ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) > { > uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)]; > @@ -234,6 +362,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct > anv_cmd_buffer *cmd_buffer) > > anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw, > pipeline->gen8.wm_depth_stencil); > + > + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, > + want_depth_pma_fix(cmd_buffer)); > } > #else > if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) { > diff --git a/src/intel/vulkan/genX_blorp_exec.c > b/src/intel/vulkan/genX_blorp_exec.c > index 663e6c9..6f0b063 100644 > --- a/src/intel/vulkan/genX_blorp_exec.c > +++ b/src/intel/vulkan/genX_blorp_exec.c > @@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch, > > genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer); > > + /* BLORP doesn't do anything fancy with depth such as discards, so we want > + * the PMA fix off. Also, off is always the safe option. > + */ > + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false); > + > blorp_exec(batch, params); > > cmd_buffer->state.vb_dirty = ~0; > diff --git a/src/intel/vulkan/genX_cmd_buffer.c > b/src/intel/vulkan/genX_cmd_buffer.c > index b6b7f74..66de93a 100644 > --- a/src/intel/vulkan/genX_cmd_buffer.c > +++ b/src/intel/vulkan/genX_cmd_buffer.c > @@ -627,6 +627,11 @@ genX(EndCommandBuffer)( > { > ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); > > + /* We want every command buffer to start with the PMA fix in a known > state, > + * so we disable it at the end of the command buffer. > + */ > + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false); > + > genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); > > anv_cmd_buffer_end_batch_buffer(cmd_buffer); > @@ -644,6 +649,11 @@ genX(CmdExecuteCommands)( > > assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); > > + /* The secondary command buffers will assume that the PMA fix is disabled > + * when they begin executing. Make sure this is true. > + */ > + genX(cmd_buffer_enable_pma_fix)(primary, false); > + > for (uint32_t i = 0; i < commandBufferCount; i++) { > ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]); > > @@ -2181,7 +2191,8 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer > *cmd_buffer) > const bool has_stencil = > image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT); > > - /* FIXME: Implement the PMA stall W/A */ > + cmd_buffer->state.hiz_enabled = has_hiz; > + > /* FIXME: Width and Height are wrong */ > > genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer); > @@ -2419,6 +2430,8 @@ void genX(CmdEndRenderPass)( > > anv_cmd_buffer_resolve_subpass(cmd_buffer); > > + cmd_buffer->state.hiz_enabled = false; > + > #ifndef NDEBUG > anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer); > #endif > diff --git a/src/intel/vulkan/genX_pipeline.c > b/src/intel/vulkan/genX_pipeline.c > index 102b75b..5fe829b 100644 > --- a/src/intel/vulkan/genX_pipeline.c > +++ b/src/intel/vulkan/genX_pipeline.c > @@ -444,6 +444,10 @@ emit_rs_state(struct anv_pipeline *pipeline, > */ > #if GEN_GEN >= 8 > raster.DXMultisampleRasterizationEnable = true; > + /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix > + * computations. If we ever set this bit to a different value, they will > + * need to be updated accordingly. > + */ > raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0; > raster.ForceMultisampling = false; > #else > @@ -653,6 +657,8 @@ emit_ds_state(struct anv_pipeline *pipeline, > * to make sure it's initialized to something useful. > */ > pipeline->writes_stencil = false; > + pipeline->writes_depth = false; > + pipeline->depth_test_enable = false; > memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw)); > return; > } > @@ -711,6 +717,9 @@ emit_ds_state(struct anv_pipeline *pipeline, > if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL) > depth_stencil.DepthBufferWriteEnable = false; > > + pipeline->writes_depth = depth_stencil.DepthBufferWriteEnable; > + pipeline->depth_test_enable = depth_stencil.DepthTestEnable; > + > #if GEN_GEN <= 7 > GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil); > #else > @@ -1429,6 +1438,38 @@ emit_3dstate_vf_topology(struct anv_pipeline *pipeline) > } > #endif > > +static void > +compute_kill_pixel(struct anv_pipeline *pipeline, > + const VkPipelineMultisampleStateCreateInfo *ms_info, > + const struct anv_subpass *subpass) > +{ > + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { > + pipeline->kill_pixel = false; > + return; > + } > + > + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); > + > + /* This computes the KillPixel portion of the computation for whether or > + * not we want to enable the PMA fix on gen8. It's given by this chunk of > + * the giant formula: > + * > + * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || > + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || > + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || > + * 3DSTATE_PS_BLEND::AlphaTestEnable || > + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) > + * > + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is > + * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept > + * of an alpha test. > + */ > + pipeline->kill_pixel = > + subpass->has_ds_self_dep || wm_prog_data->uses_kill || > + wm_prog_data->uses_omask || > + (ms_info && ms_info->alphaToCoverageEnable); > +} > + > static VkResult > genX(graphics_pipeline_create)( > VkDevice _device, > @@ -1466,6 +1507,7 @@ genX(graphics_pipeline_create)( > emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass); > emit_cb_state(pipeline, pCreateInfo->pColorBlendState, > pCreateInfo->pMultisampleState); > + compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass); > > emit_urb_setup(pipeline); > > -- > 2.5.0.400.gff86faf > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev