Reviewed-by: Jason Ekstrand <ja...@jlekstrand.net>

Thanks for figuring this out!
On Mon, Aug 12, 2019 at 4:19 PM Francisco Jerez <curroje...@riseup.net> wrote:
> See "i965/gen9: Optimize slice and subslice load balancing behavior."
> for the rationale.  According to Jason, improves Aztec Ruins
> performance by 2.7%.
>
> Reviewed-by: Kenneth Graunke <kenn...@whitecape.org> (v1)
>
> v2: Undo CPU performance micro-optimization done in i965 and iris due
>     to lack of data justifying it on anv.  Use
>     cmd_buffer_apply_pipe_flushes wrapper instead of emitting pipe
>     control command directly. (Jason)
> ---
>  src/intel/vulkan/anv_genX.h        |  4 ++
>  src/intel/vulkan/anv_private.h     |  6 ++
>  src/intel/vulkan/genX_blorp_exec.c |  4 ++
>  src/intel/vulkan/genX_cmd_buffer.c | 95 ++++++++++++++++++++++++++++++
>  4 files changed, 109 insertions(+)
>
> diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
> index a5435e566a3..06c6b467acf 100644
> --- a/src/intel/vulkan/anv_genX.h
> +++ b/src/intel/vulkan/anv_genX.h
> @@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
>
>  void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer);
>
> +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
> +                                        unsigned width, unsigned height,
> +                                        unsigned scale);
> +
>  void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
>  void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);
>
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> index 2465f264354..b381386a716 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -2421,6 +2421,12 @@ struct anv_cmd_state {
>
>     bool                                         conditional_render_enabled;
>
> +   /**
> +    * Last rendering scale argument provided to
> +    * genX(cmd_buffer_emit_hashing_mode)().
> +    */
> +   unsigned                                     current_hash_scale;
> +
>     /**
>      * Array length is anv_cmd_state::pass::attachment_count. Array content is
>      * valid only when recording a render pass instance.
> diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
> index 1592e7f7e3d..239bfb47433 100644
> --- a/src/intel/vulkan/genX_blorp_exec.c
> +++ b/src/intel/vulkan/genX_blorp_exec.c
> @@ -223,6 +223,10 @@ genX(blorp_exec)(struct blorp_batch *batch,
>        genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
>     }
>
> +   const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
> +   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0,
> +                                      params->y1 - params->y0, scale);
> +
>  #if GEN_GEN >= 11
>     /* The PIPE_CONTROL command description says:
>      *
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
> index 86ef1663ac4..10da132115c 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)(
>      */
>     primary->state.current_pipeline = UINT32_MAX;
>     primary->state.current_l3_config = NULL;
> +   primary->state.current_hash_scale = 0;
>
>     /* Each of the secondary command buffers will use its own state base
>      * address.  We need to re-emit state base address for the primary after
> @@ -2663,6 +2664,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
>
>     genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
>
> +   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
> +
>     genX(flush_pipeline_select_3d)(cmd_buffer);
>
>     if (vb_emit) {
> @@ -3925,6 +3928,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
>     }
>  }
>
> +/**
> + * Update the pixel hashing modes that determine the balancing of PS threads
> + * across subslices and slices.
> + *
> + * \param width Width bound of the rendering area (already scaled down if \p
> + *              scale is greater than 1).
> + * \param height Height bound of the rendering area (already scaled down if \p
> + *               scale is greater than 1).
> + * \param scale The number of framebuffer samples that could potentially be
> + *              affected by an individual channel of the PS thread.  This is
> + *              typically one for single-sampled rendering, but for operations
> + *              like CCS resolves and fast clears a single PS invocation may
> + *              update a huge number of pixels, in which case a finer
> + *              balancing is desirable in order to maximally utilize the
> + *              bandwidth available.  UINT_MAX can be used as shorthand for
> + *              "finest hashing mode available".
> + */
> +void
> +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
> +                                   unsigned width, unsigned height,
> +                                   unsigned scale)
> +{
> +#if GEN_GEN == 9
> +   const struct gen_device_info *devinfo = &cmd_buffer->device->info;
> +   const unsigned slice_hashing[] = {
> +      /* Because all Gen9 platforms with more than one slice require
> +       * three-way subslice hashing, a single "normal" 16x16 slice hashing
> +       * block is guaranteed to suffer from substantial imbalance, with one
> +       * subslice receiving twice as much work as the other two in the
> +       * slice.
> +       *
> +       * The performance impact of that would be particularly severe when
> +       * three-way hashing is also in use for slice balancing (which is the
> +       * case for all Gen9 GT4 platforms), because one of the slices
> +       * receives one every three 16x16 blocks in either direction, which
> +       * is roughly the periodicity of the underlying subslice imbalance
> +       * pattern ("roughly" because in reality the hardware's
> +       * implementation of three-way hashing doesn't do exact modulo 3
> +       * arithmetic, which somewhat decreases the magnitude of this effect
> +       * in practice).  This leads to a systematic subslice imbalance
> +       * within that slice regardless of the size of the primitive.  The
> +       * 32x32 hashing mode guarantees that the subslice imbalance within a
> +       * single slice hashing block is minimal, largely eliminating this
> +       * effect.
> +       */
> +      _32x32,
> +      /* Finest slice hashing mode available. */
> +      NORMAL
> +   };
> +   const unsigned subslice_hashing[] = {
> +      /* 16x16 would provide a slight cache locality benefit especially
> +       * visible in the sampler L1 cache efficiency of low-bandwidth
> +       * non-LLC platforms, but it comes at the cost of greater subslice
> +       * imbalance for primitives of dimensions approximately intermediate
> +       * between 16x4 and 16x16.
> +       */
> +      _16x4,
> +      /* Finest subslice hashing mode available. */
> +      _8x4
> +   };
> +   /* Dimensions of the smallest hashing block of a given hashing mode.  If
> +    * the rendering area is smaller than this there can't possibly be any
> +    * benefit from switching to this mode, so we optimize out the
> +    * transition.
> +    */
> +   const unsigned min_size[][2] = {
> +      { 16, 4 },
> +      {  8, 4 }
> +   };
> +   const unsigned idx = scale > 1;
> +
> +   if (cmd_buffer->state.current_hash_scale != scale &&
> +       (width > min_size[idx][0] || height > min_size[idx][1])) {
> +      uint32_t gt_mode;
> +
> +      anv_pack_struct(&gt_mode, GENX(GT_MODE),
> +                      .SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0),
> +                      .SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0),
> +                      .SubsliceHashing = subslice_hashing[idx],
> +                      .SubsliceHashingMask = -1);
> +
> +      cmd_buffer->state.pending_pipe_bits |=
> +         ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
> +      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
> +
> +      emit_lri(&cmd_buffer->batch, GENX(GT_MODE_num), gt_mode);
> +
> +      cmd_buffer->state.current_hash_scale = scale;
> +   }
> +#endif
> +}
> +
>  static void
>  cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
>  {
> --
> 2.22.0
>
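A couple of notes for the archives, in case anyone wants to port this to
another driver.  The way I read the two call sites above, the contract of the
new entry point boils down to something like the sketch below.  The names
is_fast_clear_or_resolve, render_width and render_height are invented for
illustration; they are not part of the patch:

   if (is_fast_clear_or_resolve) {
      /* A single PS invocation may write a large number of pixels, so
       * request the finest hashing mode available regardless of the
       * rendering area.
       */
      genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, render_width,
                                         render_height, UINT_MAX);
   } else {
      /* Ordinary single-sampled 3D rendering: coarse hashing, with
       * UINT_MAX standing in for an unknown rendering area.
       */
      genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX,
                                         UINT_MAX, 1);
   }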
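The 2:1:1 imbalance argument in the slice_hashing comment is also easy to
sanity-check with a trivial counting program like the one below.  It assumes
an idealized round-robin three-way hash, which the comment itself notes the
hardware only approximates:

   #include <stdio.h>

   /* Count how many sub_w x sub_h subslice hashing blocks land on each
    * of three subslices within one slice_w x slice_h slice hashing
    * block, assuming an idealized round-robin three-way hash.
    */
   static void
   subslice_balance(unsigned slice_w, unsigned slice_h,
                    unsigned sub_w, unsigned sub_h)
   {
      const unsigned n = (slice_w / sub_w) * (slice_h / sub_h);
      unsigned count[3] = { 0, };

      for (unsigned i = 0; i < n; i++)
         count[i % 3]++;

      printf("%ux%u slice block, %ux%u subslice blocks -> %u:%u:%u\n",
             slice_w, slice_h, sub_w, sub_h,
             count[0], count[1], count[2]);
   }

   int
   main(void)
   {
      subslice_balance(16, 16, 16, 4); /* -> 2:1:1, one subslice does 2x */
      subslice_balance(32, 32, 16, 4); /* -> 6:5:5, nearly even */
      return 0;
   }

A 16x16 slice block only holds four 16x4 subslice blocks, so one of three
subslices necessarily gets two of them; at 32x32 there are sixteen, and the
worst-case skew drops to 6:5:5.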
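Finally, for readers unfamiliar with the *Mask fields in the GT_MODE pack: as
I understand it, GT_MODE is one of the masked registers where the high 16
bits of the written payload act as per-bit write enables for the low 16 bits,
which is why SliceHashingMask = 0 leaves the slice hashing field untouched on
single-slice parts.  Conceptually the hardware does something like this
(illustrative only; the real field layout comes from genxml):

   /* How a write to a masked register like GT_MODE is applied: only
    * the bits whose enable bit (high half of the payload) is set get
    * updated, the rest keep their previous value.
    */
   static uint32_t
   apply_masked_write(uint32_t old, uint32_t payload)
   {
      const uint32_t enables = payload >> 16;
      return (old & ~enables) | (payload & enables);
   }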
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev