The AccessUAV bit is not quite what we want because it's more about coherency between storage operations than it is about dispatch. Also, the 3DSTATE_PS_EXTRA::PixelShaderHasUAV bit seems to cause hangs on Broadwell for unknown reasons so it's best to just leave it off. The 3DSTATE_WM::ForceThreadDispatchEnable bit, on the other hand, does exactly what we want and seems to work fine. --- src/mesa/drivers/dri/i965/brw_defines.h | 2 ++ src/mesa/drivers/dri/i965/gen8_ps_state.c | 52 ++++++++++--------------------- 2 files changed, 19 insertions(+), 35 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 3c5c6c4..9ad36ca 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -2733,6 +2733,8 @@ enum brw_barycentric_mode { # define GEN7_WM_MSDISPMODE_PERSAMPLE (0 << 31) # define GEN7_WM_MSDISPMODE_PERPIXEL (1 << 31) # define HSW_WM_UAV_ONLY (1 << 30) +# define GEN8_WM_FORCE_THREAD_DISPATCH_OFF (1 << 19) +# define GEN8_WM_FORCE_THREAD_DISPATCH_ON (2 << 19) #define _3DSTATE_PS 0x7820 /* GEN7+ */ /* DW1: kernel pointer */ diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index 0346826..cca57e6 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -74,39 +74,6 @@ gen8_upload_ps_extra(struct brw_context *brw, if (brw->gen >= 9 && prog_data->pulls_bary) dw1 |= GEN9_PSX_SHADER_PULLS_BARY; - /* The stricter cross-primitive coherency guarantees that the hardware - * gives us with the "Accesses UAV" bit set for at least one shader stage - * and the "UAV coherency required" bit set on the 3DPRIMITIVE command are - * redundant within the current image, atomic counter and SSBO GL APIs, - * which all have very loose ordering and coherency requirements and - * generally rely on the application to insert explicit barriers when a - * shader invocation is expected to see the memory writes performed by the - * invocations of some previous primitive. Regardless of the value of "UAV - * coherency required", the "Accesses UAV" bits will implicitly cause an in - * most cases useless DC flush when the lowermost stage with the bit set - * finishes execution. - * - * It would be nice to disable it, but in some cases we can't because on - * Gen8+ it also has an influence on rasterization via the PS UAV-only - * signal (which could be set independently from the coherency mechanism in - * the 3DSTATE_WM command on Gen7), and because in some cases it will - * determine whether the hardware skips execution of the fragment shader or - * not via the ThreadDispatchEnable signal. However if we know that - * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and - * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any - * difference so we may just disable it here. - * - * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't - * take into account KillPixels when no depth or stencil writes are enabled. - * In order for occlusion queries to work correctly with no attachments, we - * need to force-enable here. - * - * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR - */ - if ((prog_data->has_side_effects || prog_data->uses_kill) && - !brw_color_buffer_write_enabled(brw)) - dw1 |= GEN8_PSX_SHADER_HAS_UAV; - if (prog_data->computed_stencil) { assert(brw->gen >= 9); dw1 |= GEN9_PSX_SHADER_COMPUTES_STENCIL; @@ -127,7 +94,6 @@ upload_ps_extra(struct brw_context *brw) const struct brw_tracked_state gen8_ps_extra = { .dirty = { - .mesa = _NEW_BUFFERS | _NEW_COLOR, .brw = BRW_NEW_BLORP | BRW_NEW_CONTEXT | BRW_NEW_FRAGMENT_PROGRAM | @@ -169,6 +135,20 @@ upload_wm_state(struct brw_context *brw) else if (wm_prog_data->has_side_effects) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC; + /* In most cases, the computation for the WM_INT::ThreadDispatchEnable + * does exactly what you want. However, when you don't have any color + * buffers or when depth/stencil writes are disabled, it misses a few + * cases. In those cases, we force it to dispatch the PS by using the + * 3DSTATE_WM::ForceThreadDispatchEnable bit. According to the docs, + * this bit is not validated and shouldn't be used. However, it seems + * to work fine and this is exactly the type of thing you would use such + * + * BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS | _NEW_COLOR + */ + if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) && + !brw_color_buffer_write_enabled(brw)) + dw1 |= GEN8_WM_FORCE_THREAD_DISPATCH_ON; + BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_WM << 16 | (2 - 2)); OUT_BATCH(dw1); @@ -177,7 +157,9 @@ upload_wm_state(struct brw_context *brw) const struct brw_tracked_state gen8_wm_state = { .dirty = { - .mesa = _NEW_LINE | + .mesa = _NEW_BUFFERS | + _NEW_COLOR | + _NEW_LINE | _NEW_POLYGON, .brw = BRW_NEW_BLORP | BRW_NEW_CONTEXT | -- 2.5.0.400.gff86faf _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev