Reviewed-by: Jason Ekstrand <ja...@jlekstrand.net>

Thanks for figuring this out!
On Mon, Aug 12, 2019 at 4:19 PM Francisco Jerez <curroje...@riseup.net> wrote:
> See "i965/gen9: Optimize slice and subslice load balancing behavior."
> for the rationale.  According to Jason, improves Aztec Ruins
> performance by 2.7%.
>
> Reviewed-by: Kenneth Graunke <kenn...@whitecape.org> (v1)
>
> v2: Undo CPU performance micro-optimization done in i965 and iris due
>     to lack of data justifying it on anv.  Use
>     cmd_buffer_apply_pipe_flushes wrapper instead of emitting pipe
>     control command directly. (Jason)
> ---
>  src/intel/vulkan/anv_genX.h        |  4 ++
>  src/intel/vulkan/anv_private.h     |  6 ++
>  src/intel/vulkan/genX_blorp_exec.c |  4 ++
>  src/intel/vulkan/genX_cmd_buffer.c | 95 ++++++++++++++++++++++++++++++
>  4 files changed, 109 insertions(+)
>
> diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
> index a5435e566a3..06c6b467acf 100644
> --- a/src/intel/vulkan/anv_genX.h
> +++ b/src/intel/vulkan/anv_genX.h
> @@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
>
>  void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer);
>
> +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
> +                                        unsigned width, unsigned height,
> +                                        unsigned scale);
> +
>  void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
>  void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);
>
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> index 2465f264354..b381386a716 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -2421,6 +2421,12 @@ struct anv_cmd_state {
>
>     bool                                         conditional_render_enabled;
>
> +   /**
> +    * Last rendering scale argument provided to
> +    * genX(cmd_buffer_emit_hashing_mode)().
> +    */
> +   unsigned                                     current_hash_scale;
> +
>     /**
>      * Array length is anv_cmd_state::pass::attachment_count. Array content is
>      * valid only when recording a render pass instance.
> diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
> index 1592e7f7e3d..239bfb47433 100644
> --- a/src/intel/vulkan/genX_blorp_exec.c
> +++ b/src/intel/vulkan/genX_blorp_exec.c
> @@ -223,6 +223,10 @@ genX(blorp_exec)(struct blorp_batch *batch,
>        genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
>     }
>
> +   const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
> +   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0,
> +                                      params->y1 - params->y0, scale);
> +
>  #if GEN_GEN >= 11
>     /* The PIPE_CONTROL command description says:
>      *
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
> index 86ef1663ac4..10da132115c 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)(
>      */
>     primary->state.current_pipeline = UINT32_MAX;
>     primary->state.current_l3_config = NULL;
> +   primary->state.current_hash_scale = 0;
>
>     /* Each of the secondary command buffers will use its own state base
>      * address.  We need to re-emit state base address for the primary after
> @@ -2663,6 +2664,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
>
>     genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
>
> +   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
> +
>     genX(flush_pipeline_select_3d)(cmd_buffer);
>
>     if (vb_emit) {
> @@ -3925,6 +3928,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
>     }
>  }
>
> +/**
> + * Update the pixel hashing modes that determine the balancing of PS threads
> + * across subslices and slices.
> + *
> + * \param width Width bound of the rendering area (already scaled down if \p
> + *              scale is greater than 1).
> + * \param height Height bound of the rendering area (already scaled down if \p
> + *               scale is greater than 1).
> + * \param scale The number of framebuffer samples that could potentially be
> + *              affected by an individual channel of the PS thread.  This is
> + *              typically one for single-sampled rendering, but for operations
> + *              like CCS resolves and fast clears a single PS invocation may
> + *              update a huge number of pixels, in which case a finer
> + *              balancing is desirable in order to maximally utilize the
> + *              bandwidth available.  UINT_MAX can be used as shorthand for
> + *              "finest hashing mode available".
> + */
> +void
> +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
> +                                   unsigned width, unsigned height,
> +                                   unsigned scale)
> +{
> +#if GEN_GEN == 9
> +   const struct gen_device_info *devinfo = &cmd_buffer->device->info;
> +   const unsigned slice_hashing[] = {
> +      /* Because all Gen9 platforms with more than one slice require
> +       * three-way subslice hashing, a single "normal" 16x16 slice hashing
> +       * block is guaranteed to suffer from substantial imbalance, with one
> +       * subslice receiving twice as much work as the other two in the
> +       * slice.
> +       *
> +       * The performance impact of that would be particularly severe when
> +       * three-way hashing is also in use for slice balancing (which is the
> +       * case for all Gen9 GT4 platforms), because one of the slices
> +       * receives one every three 16x16 blocks in either direction, which
> +       * is roughly the periodicity of the underlying subslice imbalance
> +       * pattern ("roughly" because in reality the hardware's
> +       * implementation of three-way hashing doesn't do exact modulo 3
> +       * arithmetic, which somewhat decreases the magnitude of this effect
> +       * in practice).  This leads to a systematic subslice imbalance
> +       * within that slice regardless of the size of the primitive.  The
> +       * 32x32 hashing mode guarantees that the subslice imbalance within a
> +       * single slice hashing block is minimal, largely eliminating this
> +       * effect.
> +       */
> +      _32x32,
> +      /* Finest slice hashing mode available. */
> +      NORMAL
> +   };
> +   const unsigned subslice_hashing[] = {
> +      /* 16x16 would provide a slight cache locality benefit especially
> +       * visible in the sampler L1 cache efficiency of low-bandwidth
> +       * non-LLC platforms, but it comes at the cost of greater subslice
> +       * imbalance for primitives of dimensions approximately intermediate
> +       * between 16x4 and 16x16.
> +       */
> +      _16x4,
> +      /* Finest subslice hashing mode available. */
> +      _8x4
> +   };
> +   /* Dimensions of the smallest hashing block of a given hashing mode.  If
> +    * the rendering area is smaller than this there can't possibly be any
> +    * benefit from switching to this mode, so we optimize out the
> +    * transition.
> +    */
> +   const unsigned min_size[][2] = {
> +      { 16, 4 },
> +      {  8, 4 }
> +   };
> +   const unsigned idx = scale > 1;
> +
> +   if (cmd_buffer->state.current_hash_scale != scale &&
> +       (width > min_size[idx][0] || height > min_size[idx][1])) {
> +      uint32_t gt_mode;
> +
> +      anv_pack_struct(&gt_mode, GENX(GT_MODE),
> +                      .SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0),
> +                      .SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0),
> +                      .SubsliceHashing = subslice_hashing[idx],
> +                      .SubsliceHashingMask = -1);
> +
> +      cmd_buffer->state.pending_pipe_bits |=
> +         ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
> +      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
> +
> +      emit_lri(&cmd_buffer->batch, GENX(GT_MODE_num), gt_mode);
> +
> +      cmd_buffer->state.current_hash_scale = scale;
> +   }
> +#endif
> +}
> +
>  static void
>  cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
>  {
> --
> 2.22.0
>
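A couple of notes for the archives, in case anyone wants to port this to
another driver.  The way I read the two call sites above, the contract of the
new entry point boils down to something like the sketch below.  The names
is_fast_clear_or_resolve, render_width and render_height are invented for
illustration; they are not part of the patch:

   if (is_fast_clear_or_resolve) {
      /* A single PS invocation may write a large number of pixels, so
       * request the finest hashing mode available regardless of the
       * rendering area.
       */
      genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, render_width,
                                         render_height, UINT_MAX);
   } else {
      /* Ordinary single-sampled 3D rendering: coarse hashing, with
       * UINT_MAX standing in for an unknown rendering area.
       */
      genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX,
                                         UINT_MAX, 1);
   }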
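The 2:1:1 imbalance argument in the slice_hashing comment is also easy to
sanity-check with a trivial counting program like the one below.  It assumes
an idealized round-robin three-way hash, which the comment itself notes the
hardware only approximates:

   #include <stdio.h>

   /* Count how many sub_w x sub_h subslice hashing blocks land on each
    * of three subslices within one slice_w x slice_h slice hashing
    * block, assuming an idealized round-robin three-way hash.
    */
   static void
   subslice_balance(unsigned slice_w, unsigned slice_h,
                    unsigned sub_w, unsigned sub_h)
   {
      const unsigned n = (slice_w / sub_w) * (slice_h / sub_h);
      unsigned count[3] = { 0, };

      for (unsigned i = 0; i < n; i++)
         count[i % 3]++;

      printf("%ux%u slice block, %ux%u subslice blocks -> %u:%u:%u\n",
             slice_w, slice_h, sub_w, sub_h,
             count[0], count[1], count[2]);
   }

   int
   main(void)
   {
      subslice_balance(16, 16, 16, 4); /* -> 2:1:1, one subslice does 2x */
      subslice_balance(32, 32, 16, 4); /* -> 6:5:5, nearly even */
      return 0;
   }

A 16x16 slice block only holds four 16x4 subslice blocks, so one of three
subslices necessarily gets two of them; at 32x32 there are sixteen, and the
worst-case skew drops to 6:5:5.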
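Finally, for readers unfamiliar with the *Mask fields in the GT_MODE pack: as
I understand it, GT_MODE is one of the masked registers where the high 16
bits of the written payload act as per-bit write enables for the low 16 bits,
which is why SliceHashingMask = 0 leaves the slice hashing field untouched on
single-slice parts.  Conceptually the hardware does something like this
(illustrative only; the real field layout comes from genxml):

   /* How a write to a masked register like GT_MODE is applied: only
    * the bits whose enable bit (high half of the payload) is set get
    * updated, the rest keep their previous value.
    */
   static uint32_t
   apply_masked_write(uint32_t old, uint32_t payload)
   {
      const uint32_t enables = payload >> 16;
      return (old & ~enables) | (payload & enables);
   }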
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev