This patch is also:

Reviewed-by: Kenneth Graunke <kenn...@whitecape.org>

though it would probably be best if Jason or Lionel reviewed it as well.

On Friday, August 9, 2019 5:20:39 PM PDT Francisco Jerez wrote:
> See "i965/gen9: Optimize slice and subslice load balancing behavior."
> for the rationale.  Marked optional because no performance evaluation
> has been done on this commit, it is provided to match the hashing
> settings of the Iris driver.  Test reports welcome.
> ---
>  src/intel/vulkan/anv_genX.h        |  4 ++
>  src/intel/vulkan/anv_private.h     |  6 ++
>  src/intel/vulkan/genX_blorp_exec.c |  6 ++
>  src/intel/vulkan/genX_cmd_buffer.c | 96 ++++++++++++++++++++++++++++++
>  4 files changed, 112 insertions(+)
>
> diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
> index a5435e566a3..06c6b467acf 100644
> --- a/src/intel/vulkan/anv_genX.h
> +++ b/src/intel/vulkan/anv_genX.h
> @@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
>
>  void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer);
>
> +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
> +                                        unsigned width, unsigned height,
> +                                        unsigned scale);
> +
>  void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
>  void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);
>
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> index 2465f264354..b381386a716 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -2421,6 +2421,12 @@ struct anv_cmd_state {
>
>     bool                                         conditional_render_enabled;
>
> +   /**
> +    * Last rendering scale argument provided to
> +    * genX(cmd_buffer_emit_hashing_mode)().
> +    */
> +   unsigned                                     current_hash_scale;
> +
>     /**
>      * Array length is anv_cmd_state::pass::attachment_count. Array content is
>      * valid only when recording a render pass instance.
> diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
> index 1592e7f7e3d..e9eedc06696 100644
> --- a/src/intel/vulkan/genX_blorp_exec.c
> +++ b/src/intel/vulkan/genX_blorp_exec.c
> @@ -223,6 +223,12 @@ genX(blorp_exec)(struct blorp_batch *batch,
>        genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
>     }
>
> +   const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
> +   if (cmd_buffer->state.current_hash_scale != scale) {
> +      genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0,
> +                                         params->y1 - params->y0, scale);
> +   }
> +
>  #if GEN_GEN >= 11
>     /* The PIPE_CONTROL command description says:
>      *
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
> index 86ef1663ac4..e9e5570d49f 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)(
>      */
>     primary->state.current_pipeline = UINT32_MAX;
>     primary->state.current_l3_config = NULL;
> +   primary->state.current_hash_scale = 0;
>
>     /* Each of the secondary command buffers will use its own state base
>      * address.  We need to re-emit state base address for the primary after
> @@ -2663,6 +2664,9 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
>
>     genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
>
> +   if (cmd_buffer->state.current_hash_scale != 1)
> +      genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
> +
>     genX(flush_pipeline_select_3d)(cmd_buffer);
>
>     if (vb_emit) {
> @@ -3925,6 +3929,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
>     }
>  }
>
> +/**
> + * Update the pixel hashing modes that determine the balancing of PS threads
> + * across subslices and slices.
> + *
> + * \param width Width bound of the rendering area (already scaled down if \p
> + *              scale is greater than 1).
> + * \param height Height bound of the rendering area (already scaled down if \p
> + *               scale is greater than 1).
> + * \param scale The number of framebuffer samples that could potentially be
> + *              affected by an individual channel of the PS thread.  This is
> + *              typically one for single-sampled rendering, but for operations
> + *              like CCS resolves and fast clears a single PS invocation may
> + *              update a huge number of pixels, in which case a finer
> + *              balancing is desirable in order to maximally utilize the
> + *              bandwidth available.  UINT_MAX can be used as shorthand for
> + *              "finest hashing mode available".
> + */
> +void
> +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
> +                                   unsigned width, unsigned height,
> +                                   unsigned scale)
> +{
> +#if GEN_GEN == 9
> +   const struct gen_device_info *devinfo = &cmd_buffer->device->info;
> +   const unsigned slice_hashing[] = {
> +      /* Because all Gen9 platforms with more than one slice require
> +       * three-way subslice hashing, a single "normal" 16x16 slice hashing
> +       * block is guaranteed to suffer from substantial imbalance, with one
> +       * subslice receiving twice as much work as the other two in the
> +       * slice.
> +       *
> +       * The performance impact of that would be particularly severe when
> +       * three-way hashing is also in use for slice balancing (which is the
> +       * case for all Gen9 GT4 platforms), because one of the slices
> +       * receives one every three 16x16 blocks in either direction, which
> +       * is roughly the periodicity of the underlying subslice imbalance
> +       * pattern ("roughly" because in reality the hardware's
> +       * implementation of three-way hashing doesn't do exact modulo 3
> +       * arithmetic, which somewhat decreases the magnitude of this effect
> +       * in practice).  This leads to a systematic subslice imbalance
> +       * within that slice regardless of the size of the primitive.  The
> +       * 32x32 hashing mode guarantees that the subslice imbalance within a
> +       * single slice hashing block is minimal, largely eliminating this
> +       * effect.
> +       */
> +      _32x32,
> +      /* Finest slice hashing mode available. */
> +      NORMAL
> +   };
> +   const unsigned subslice_hashing[] = {
> +      /* 16x16 would provide a slight cache locality benefit especially
> +       * visible in the sampler L1 cache efficiency of low-bandwidth
> +       * non-LLC platforms, but it comes at the cost of greater subslice
> +       * imbalance for primitives of dimensions approximately intermediate
> +       * between 16x4 and 16x16.
> +       */
> +      _16x4,
> +      /* Finest subslice hashing mode available. */
> +      _8x4
> +   };
> +   /* Dimensions of the smallest hashing block of a given hashing mode.  If
> +    * the rendering area is smaller than this there can't possibly be any
> +    * benefit from switching to this mode, so we optimize out the
> +    * transition.
> +    */
> +   const unsigned min_size[][2] = {
> +      { 16, 4 },
> +      {  8, 4 }
> +   };
> +   const unsigned idx = scale > 1;
> +
> +   if (width > min_size[idx][0] || height > min_size[idx][1]) {
> +      uint32_t gt_mode;
> +
> +      anv_pack_struct(&gt_mode, GENX(GT_MODE),
> +                      .SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0),
> +                      .SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0),
> +                      .SubsliceHashing = subslice_hashing[idx],
> +                      .SubsliceHashingMask = -1);
> +
> +      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> +         pc.StallAtPixelScoreboard = true;
> +         pc.CommandStreamerStallEnable = true;
> +      }
> +
> +      emit_lri(&cmd_buffer->batch, GENX(GT_MODE_num), gt_mode);
> +
> +      cmd_buffer->state.current_hash_scale = scale;
> +   }
> +#endif
> +}
> +
>  static void
>  cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
>  {
>
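For anyone who wants to sanity-check when the new path actually reprograms
GT_MODE, here is a small self-contained sketch (not anv code; the enum,
struct, and function names below are hypothetical) that mirrors only the
mode-selection logic of genX(cmd_buffer_emit_hashing_mode)() quoted above,
so the scale/min_size thresholds are easy to verify in isolation:

/* Standalone illustration (not anv code): mirrors the mode-selection logic
 * of the patch's genX(cmd_buffer_emit_hashing_mode)() so the thresholds can
 * be checked by hand.  All names here are made up for illustration.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

enum slice_mode    { SLICE_32x32, SLICE_NORMAL };
enum subslice_mode { SUBSLICE_16x4, SUBSLICE_8x4 };

struct hashing_choice {
   bool emit;                 /* false: area below min_size, keep current mode */
   enum slice_mode slice;
   enum subslice_mode subslice;
};

static struct hashing_choice
choose_hashing(unsigned width, unsigned height, unsigned scale)
{
   /* Smallest hashing block of each mode pair, matching the patch's
    * min_size[] table: {16, 4} for the coarse modes, {8, 4} for the fine ones.
    */
   static const unsigned min_size[][2] = { { 16, 4 }, { 8, 4 } };
   const unsigned idx = scale > 1;

   return (struct hashing_choice) {
      .emit = width > min_size[idx][0] || height > min_size[idx][1],
      .slice = idx ? SLICE_NORMAL : SLICE_32x32,
      .subslice = idx ? SUBSLICE_8x4 : SUBSLICE_16x4,
   };
}

int
main(void)
{
   /* Ordinary single-sampled draw: coarse 32x32 slice / 16x4 subslice hashing. */
   const struct hashing_choice draw = choose_hashing(1920, 1080, 1);
   /* Fast clear or CCS resolve: finest NORMAL / 8x4 hashing. */
   const struct hashing_choice clear = choose_hashing(1920, 1080, UINT_MAX);

   printf("draw:  emit=%d slice=%d subslice=%d\n",
          draw.emit, draw.slice, draw.subslice);
   printf("clear: emit=%d slice=%d subslice=%d\n",
          clear.emit, clear.slice, clear.subslice);
   return 0;
}

Running it shows a regular draw keeping the coarse modes while a fast clear
or CCS resolve (scale = UINT_MAX) selects the finest hashing, which is the
behavior the min_size[]/idx logic in the patch encodes.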