On Tue, Jul 11, 2017 at 3:34 PM, Nanley Chery <nanleych...@gmail.com> wrote:
> On Mon, Jul 10, 2017 at 04:00:23PM -0700, Jason Ekstrand wrote: > > On Wed, Jun 28, 2017 at 2:14 PM, Nanley Chery <nanleych...@gmail.com> > wrote: > > > > > Image layouts only let us know that an image *may* be fast-cleared. For > > > this reason we can end up with redundant resolves. Testing has shown > > > that such resolves can measurably hurt performance and that predicating > > > them can avoid the penalty. > > > > > > Signed-off-by: Nanley Chery <nanley.g.ch...@intel.com> > > > --- > > > src/intel/vulkan/anv_blorp.c | 3 +- > > > src/intel/vulkan/anv_private.h | 13 ++++-- > > > src/intel/vulkan/genX_cmd_buffer.c | 87 > ++++++++++++++++++++++++++++++ > > > ++++++-- > > > 3 files changed, 95 insertions(+), 8 deletions(-) > > > > > > diff --git a/src/intel/vulkan/anv_blorp.c > b/src/intel/vulkan/anv_blorp.c > > > index 35317ba6be..d06d7e2cc3 100644 > > > --- a/src/intel/vulkan/anv_blorp.c > > > +++ b/src/intel/vulkan/anv_blorp.c > > > @@ -1619,7 +1619,8 @@ anv_ccs_resolve(struct anv_cmd_buffer * const > > > cmd_buffer, > > > return; > > > > > > struct blorp_batch batch; > > > - blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, > 0); > > > + blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, > > > + BLORP_BATCH_PREDICATE_ENABLE); > > > > > > struct blorp_surf surf; > > > get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_COLOR_BIT, > > > diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_ > > > private.h > > > index be1623f3c3..951cf50842 100644 > > > --- a/src/intel/vulkan/anv_private.h > > > +++ b/src/intel/vulkan/anv_private.h > > > @@ -2118,11 +2118,16 @@ anv_fast_clear_state_entry_size(const struct > > > anv_device *device) > > > { > > > assert(device); > > > /* Entry contents: > > > - * +----------------------+ > > > - * | clear value dword(s) | > > > - * +----------------------+ > > > + * +--------------------------------------------+ > > > + * | clear value dword(s) | needs resolve dword | > > > + * +--------------------------------------------+ > > > */ > > > - return device->isl_dev.ss.clear_value_size; > > > + > > > + /* Ensure that the needs resolve dword is in fact dword-aligned to > > > enable > > > + * GPU memcpy operations. > > > + */ > > > + assert(device->isl_dev.ss.clear_value_size % 4 == 0); > > > + return device->isl_dev.ss.clear_value_size + 4; > > > } > > > > > > /* Returns true if a HiZ-enabled depth buffer can be sampled from. */ > > > diff --git a/src/intel/vulkan/genX_cmd_buffer.c > > > b/src/intel/vulkan/genX_cmd_buffer.c > > > index 62a2f22782..65d9c92783 100644 > > > --- a/src/intel/vulkan/genX_cmd_buffer.c > > > +++ b/src/intel/vulkan/genX_cmd_buffer.c > > > @@ -421,6 +421,59 @@ get_fast_clear_state_entry_offset(const struct > > > anv_device *device, > > > return offset; > > > } > > > > > > +#define MI_PREDICATE_SRC0 0x2400 > > > +#define MI_PREDICATE_SRC1 0x2408 > > > + > > > +enum ccs_resolve_state { > > > + CCS_RESOLVE_NOT_NEEDED, > > > + CCS_RESOLVE_NEEDED, > > > > > > > Are these two values sufficient? Do we ever have a scenario where we do > a > > partial resolve and then a full resolve? Do we need to be able to track > > that? > > > > > > Yes, they are. We don't currently have such a scenario. This may come up > later if we start temporarily enabling CCS_E, but I can't think of what > sequence of events would trigger that to happen. > > > > + CCS_RESOLVE_STARTING, > > > +}; > > > + > > > +/* Manages the state of an color image subresource to ensure resolves > are > > > + * performed properly. > > > + */ > > > +static void > > > +genX(set_resolve_state)(struct anv_cmd_buffer *cmd_buffer, > > > + const struct anv_image *image, > > > + unsigned level, > > > + enum ccs_resolve_state state) > > > +{ > > > + assert(cmd_buffer && image); > > > + assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); > > > + assert(level < anv_image_aux_levels(image)); > > > + > > > + const uint32_t resolve_flag_offset = > > > + get_fast_clear_state_entry_offset(cmd_buffer->device, image, > > > level) + > > > + cmd_buffer->device->isl_dev.ss.clear_value_size; > > > + > > > + if (state != CCS_RESOLVE_STARTING) { > > > + assert(state == CCS_RESOLVE_NEEDED || state == > > > CCS_RESOLVE_NOT_NEEDED); > > > + /* The HW docs say that there is no way to guarantee the > completion > > > of > > > + * the following command. We use it nevertheless because it > shows no > > > + * issues in testing is currently being used in the GL driver. > > > + */ > > > + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), > sdi) { > > > + sdi.Address = (struct anv_address) { image->bo, > > > resolve_flag_offset }; > > > + sdi.ImmediateData = state == CCS_RESOLVE_NEEDED; > > > + } > > > + } else { > > > + /* Make the pending predicated resolve a no-op if one is not > needed. > > > + * predicate = do_resolve = resolve_flag != 0; > > > + */ > > > + emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 , 0); > > > + emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0); > > > + emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 , 0); > > > + emit_lrm(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, > > > + image->bo, resolve_flag_offset); > > > + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { > > > + mip.LoadOperation = LOAD_LOADINV; > > > + mip.CombineOperation = COMBINE_SET; > > > + mip.CompareOperation = COMPARE_SRCS_EQUAL; > > > + } > > > + } > > > > > > > This function does two very different things hidden behind an enum that, > in > > my view, only clouds what's going on. If state != STARTING, it *sets* > the > > flag, otherwise it *checks* the flag and emits MI_PREDICATE. Having them > > in the same function is confusing. Why not make this two separate > > functions: set_image_needs_resolve and setup_resolve_predicate or > something > > like that. > > > > > > Sounds good. It did feel awkward to use the RESOLVE_STARTING enum with > this function. I'll go with: > * load_needs_resolve_predicate(), > * set_image_needs_resolve(), > Those are good names. > * and replace the enums with true/false. > Sounds good to me. > Thanks, > Nanley > > > > +} > > > + > > > static void > > > init_fast_clear_state_entry(struct anv_cmd_buffer *cmd_buffer, > > > const struct anv_image *image, > > > @@ -430,6 +483,16 @@ init_fast_clear_state_entry(struct anv_cmd_buffer > > > *cmd_buffer, > > > assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); > > > assert(level < anv_image_aux_levels(image)); > > > > > > + /* The resolve flag should updated to signify that > > > fast-clear/compression > > > + * data needs to be removed when leaving the undefined layout. Such > > > data > > > + * may need to be removed if it would cause accesses to the color > > > buffer > > > + * to return incorrect data. The fast clear data in CCS_D buffers > > > should > > > + * be removed because CCS_D isn't enabled all the time. > > > + */ > > > + genX(set_resolve_state)(cmd_buffer, image, level, > > > + image->aux_usage == ISL_AUX_USAGE_CCS_E ? > > > + CCS_RESOLVE_NOT_NEEDED : > CCS_RESOLVE_NEEDED); > > > + > > > /* The fast clear value dword(s) will be copied into a surface > state > > > object. > > > * Ensure that the restrictions of the fields in the dword(s) are > > > followed. > > > * > > > @@ -666,6 +729,8 @@ transition_color_buffer(struct anv_cmd_buffer > > > *cmd_buffer, > > > layer_count = MIN2(layer_count, anv_image_aux_layers(image, > > > level)); > > > } > > > > > > + genX(set_resolve_state)(cmd_buffer, image, level, > > > CCS_RESOLVE_STARTING); > > > + > > > /* Create a surface state with the right clear color and > perform the > > > * resolve. > > > */ > > > @@ -697,6 +762,8 @@ transition_color_buffer(struct anv_cmd_buffer > > > *cmd_buffer, > > > image->aux_usage == ISL_AUX_USAGE_CCS_E ? > > > BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL : > > > BLORP_FAST_CLEAR_OP_RESOLVE_FULL); > > > + > > > + genX(set_resolve_state)(cmd_buffer, image, level, > > > CCS_RESOLVE_NOT_NEEDED); > > > } > > > > > > cmd_buffer->state.pending_pipe_bits |= > > > @@ -2447,9 +2514,6 @@ void genX(CmdDispatch)( > > > #define GPGPU_DISPATCHDIMY 0x2504 > > > #define GPGPU_DISPATCHDIMZ 0x2508 > > > > > > -#define MI_PREDICATE_SRC0 0x2400 > > > -#define MI_PREDICATE_SRC1 0x2408 > > > - > > > void genX(CmdDispatchIndirect)( > > > VkCommandBuffer commandBuffer, > > > VkBuffer _buffer, > > > @@ -2856,6 +2920,23 @@ cmd_buffer_subpass_sync_fast_ > clear_values(struct > > > anv_cmd_buffer *cmd_buffer) > > > genX(copy_fast_clear_dwords)(cmd_buffer, > > > att_state->color_rt_state, > > > iview->image, > iview->isl.base_level, > > > true /* copy from ss */); > > > + > > > + /* Fast-clears impact whether or not a resolve will be > > > necessary. */ > > > + if (iview->image->aux_usage == ISL_AUX_USAGE_CCS_E && > > > + att_state->clear_color_is_zero) { > > > + /* This image always has the auxiliary buffer enabled. We > can > > > mark > > > + * the subresource as not needing a resolve because the > clear > > > color > > > + * will match what's in every RENDER_SURFACE_STATE object > > > when it's > > > + * being used for sampling. > > > + */ > > > + genX(set_resolve_state)(cmd_buffer, iview->image, > > > + iview->isl.base_level, > > > + CCS_RESOLVE_NOT_NEEDED); > > > + } else { > > > + genX(set_resolve_state)(cmd_buffer, iview->image, > > > + iview->isl.base_level, > > > + CCS_RESOLVE_NEEDED); > > > + } > > > } else if (rp_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) { > > > /* The attachment may have been fast-cleared in a previous > render > > > * pass and the value is needed now. Update the surface > state(s). > > > -- > > > 2.13.1 > > > > > > _______________________________________________ > > > mesa-dev mailing list > > > mesa-dev@lists.freedesktop.org > > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > > > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev