This gets us +2.7% on Aztec Ruins (5 runs on each branch). On Sat, Aug 10, 2019 at 7:31 AM Jason Ekstrand <ja...@jlekstrand.net> wrote:
> Let's hold this for at least a tiny bit. I'll run some benchmarks and may > want to tweak how it interacts with the rest of the vulkan state > tracking. > Thanks for figuring out a heuristic! > > --Jason > > On August 9, 2019 19:22:48 Francisco Jerez <curroje...@riseup.net> wrote: > > > See "i965/gen9: Optimize slice and subslice load balancing behavior." > > for the rationale. Marked optional because no performance evaluation > > has been done on this commit, it is provided to match the hashing > > settings of the Iris driver. Test reports welcome. > > --- > > src/intel/vulkan/anv_genX.h | 4 ++ > > src/intel/vulkan/anv_private.h | 6 ++ > > src/intel/vulkan/genX_blorp_exec.c | 6 ++ > > src/intel/vulkan/genX_cmd_buffer.c | 96 ++++++++++++++++++++++++++++++ > > 4 files changed, 112 insertions(+) > > > > diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h > > index a5435e566a3..06c6b467acf 100644 > > --- a/src/intel/vulkan/anv_genX.h > > +++ b/src/intel/vulkan/anv_genX.h > > @@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct > > anv_cmd_buffer *cmd_buffer); > > > > void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer > *cmd_buffer); > > > > +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer > *cmd_buffer, > > + unsigned width, unsigned height, > > + unsigned scale); > > + > > void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer); > > void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer > *cmd_buffer); > > > > diff --git a/src/intel/vulkan/anv_private.h > b/src/intel/vulkan/anv_private.h > > index 2465f264354..b381386a716 100644 > > --- a/src/intel/vulkan/anv_private.h > > +++ b/src/intel/vulkan/anv_private.h > > @@ -2421,6 +2421,12 @@ struct anv_cmd_state { > > > > bool > conditional_render_enabled; > > > > + /** > > + * Last rendering scale argument provided to > > + * genX(cmd_buffer_emit_hashing_mode)(). 
> > + */ > > + unsigned current_hash_scale; > > + > > /** > > * Array length is anv_cmd_state::pass::attachment_count. Array > content is > > * valid only when recording a render pass instance. > > diff --git a/src/intel/vulkan/genX_blorp_exec.c > > b/src/intel/vulkan/genX_blorp_exec.c > > index 1592e7f7e3d..e9eedc06696 100644 > > --- a/src/intel/vulkan/genX_blorp_exec.c > > +++ b/src/intel/vulkan/genX_blorp_exec.c > > @@ -223,6 +223,12 @@ genX(blorp_exec)(struct blorp_batch *batch, > > genX(cmd_buffer_config_l3)(cmd_buffer, cfg); > > } > > > > + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; > > + if (cmd_buffer->state.current_hash_scale != scale) { > > + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - > params->x0, > > + params->y1 - params->y0, > scale); > > + } > > + > > #if GEN_GEN >= 11 > > /* The PIPE_CONTROL command description says: > > * > > diff --git a/src/intel/vulkan/genX_cmd_buffer.c > > b/src/intel/vulkan/genX_cmd_buffer.c > > index 86ef1663ac4..e9e5570d49f 100644 > > --- a/src/intel/vulkan/genX_cmd_buffer.c > > +++ b/src/intel/vulkan/genX_cmd_buffer.c > > @@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)( > > */ > > primary->state.current_pipeline = UINT32_MAX; > > primary->state.current_l3_config = NULL; > > + primary->state.current_hash_scale = 0; > > > > /* Each of the secondary command buffers will use its own state base > > * address. 
We need to re-emit state base address for the primary > after > > @@ -2663,6 +2664,9 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer > > *cmd_buffer) > > > > genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); > > > > + if (cmd_buffer->state.current_hash_scale != 1) > > + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, > UINT_MAX, 1); > > + > > genX(flush_pipeline_select_3d)(cmd_buffer); > > > > if (vb_emit) { > > @@ -3925,6 +3929,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct > > anv_cmd_buffer *cmd_buffer) > > } > > } > > > > +/** > > + * Update the pixel hashing modes that determine the balancing of PS > threads > > + * across subslices and slices. > > + * > > + * \param width Width bound of the rendering area (already scaled down > if \p > > + * scale is greater than 1). > > + * \param height Height bound of the rendering area (already scaled > down if \p > > + * scale is greater than 1). > > + * \param scale The number of framebuffer samples that could > potentially be > > + * affected by an individual channel of the PS thread. > This is > > + * typically one for single-sampled rendering, but for > operations > > + * like CCS resolves and fast clears a single PS > invocation may > > + * update a huge number of pixels, in which case a finer > > + * balancing is desirable in order to maximally utilize the > > + * bandwidth available. UINT_MAX can be used as shorthand > for > > + * "finest hashing mode available". 
> > + */ > > +void > > +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, > > + unsigned width, unsigned height, > > + unsigned scale) > > +{ > > +#if GEN_GEN == 9 > > + const struct gen_device_info *devinfo = &cmd_buffer->device->info; > > + const unsigned slice_hashing[] = { > > + /* Because all Gen9 platforms with more than one slice require > > + * three-way subslice hashing, a single "normal" 16x16 slice > hashing > > + * block is guaranteed to suffer from substantial imbalance, with > one > > + * subslice receiving twice as much work as the other two in the > > + * slice. > > + * > > + * The performance impact of that would be particularly severe > when > > + * three-way hashing is also in use for slice balancing (which is > the > > + * case for all Gen9 GT4 platforms), because one of the slices > > + * receives one every three 16x16 blocks in either direction, > which > > + * is roughly the periodicity of the underlying subslice imbalance > > + * pattern ("roughly" because in reality the hardware's > > + * implementation of three-way hashing doesn't do exact modulo 3 > > + * arithmetic, which somewhat decreases the magnitude of this > effect > > + * in practice). This leads to a systematic subslice imbalance > > + * within that slice regardless of the size of the primitive. The > > + * 32x32 hashing mode guarantees that the subslice imbalance > within a > > + * single slice hashing block is minimal, largely eliminating this > > + * effect. > > + */ > > + _32x32, > > + /* Finest slice hashing mode available. */ > > + NORMAL > > + }; > > + const unsigned subslice_hashing[] = { > > + /* 16x16 would provide a slight cache locality benefit especially > > + * visible in the sampler L1 cache efficiency of low-bandwidth > > + * non-LLC platforms, but it comes at the cost of greater subslice > > + * imbalance for primitives of dimensions approximately > intermediate > > + * between 16x4 and 16x16. 
> > + */ > > + _16x4, > > + /* Finest subslice hashing mode available. */ > > + _8x4 > > + }; > > + /* Dimensions of the smallest hashing block of a given hashing > mode. If > > + * the rendering area is smaller than this there can't possibly be > any > > + * benefit from switching to this mode, so we optimize out the > > + * transition. > > + */ > > + const unsigned min_size[][2] = { > > + { 16, 4 }, > > + { 8, 4 } > > + }; > > + const unsigned idx = scale > 1; > > + > > + if (width > min_size[idx][0] || height > min_size[idx][1]) { > > + uint32_t gt_mode; > > + > > + anv_pack_struct(&gt_mode, GENX(GT_MODE), > > + .SliceHashing = (devinfo->num_slices > 1 ? > > slice_hashing[idx] : 0), > > + .SliceHashingMask = (devinfo->num_slices > 1 ? -1 > : 0), > > + .SubsliceHashing = subslice_hashing[idx], > > + .SubsliceHashingMask = -1); > > + > > + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { > > + pc.StallAtPixelScoreboard = true; > > + pc.CommandStreamerStallEnable = true; > > + } > > + > > + emit_lri(&cmd_buffer->batch, GENX(GT_MODE_num), gt_mode); > > + > > + cmd_buffer->state.current_hash_scale = scale; > > + } > > +#endif > > +} > > + > > static void > > cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) > > { > > -- > > 2.22.0 > > > > _______________________________________________ > > mesa-dev mailing list > > mesa-dev@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > > > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev