i915: Make sure we have enough memory bandwidth on ICL

Maarten Lankhorst Wed, 27 Mar 2019 07:12:34 -0700

Op 20-03-2019 om 22:46 schreef Ville Syrjala:
> From: Ville Syrjälä <ville.syrj...@linux.intel.com>
>
> ICL has so many planes that it can easily exceed the maximum
> effective memory bandwidth of the system. We must therefore check
> that we don't exceed that limit.
>
> The algorithm is very magic number heavy and lacks sufficient
> explanation for now. We also have no sane way to query the
> memory clock and timings, so we must rely on a combination of
> raw readout from the memory controller and hardcoded assumptions.
> The memory controller values obviously change as the system
> jumps between the different SAGV points, so we try to stabilize
> it first by disabling SAGV for the duration of the readout.
>
> The utilized bandwidth is tracked via a device wide atomic
> private object. That is actually not robust because we can't
> afford to enforce strict global ordering between the pipes.
> Thus I think I'll need to change this to simply chop up the
> available bandwidth between all the active pipes. Each pipe
> can then do whatever it wants as long as it doesn't exceed
> its budget. That scheme will also require that we assume that
> any number of planes could be active at any time.
>
> TODO: make it robust and deal with all the open questions
>
> Signed-off-by: Ville Syrjälä <ville.syrj...@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/Makefile             |   1 +
>  drivers/gpu/drm/i915/i915_drv.c           | 346 ++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_drv.h           |  10 +
>  drivers/gpu/drm/i915/intel_atomic_plane.c |  20 ++
>  drivers/gpu/drm/i915/intel_bw.c           | 190 ++++++++++++
>  drivers/gpu/drm/i915/intel_display.c      |  39 ++-
>  drivers/gpu/drm/i915/intel_drv.h          |  32 ++
>  7 files changed, 637 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/i915/intel_bw.c
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 68fecf355471..2d24bdd501c4 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -126,6 +126,7 @@ i915-y += intel_audio.o \
>         intel_atomic.o \
>         intel_atomic_plane.o \
>         intel_bios.o \
> +       intel_bw.o \
>         intel_cdclk.o \
>         intel_color.o \
>         intel_combo_phy.o \
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 8b37ec0e0676..134d1b1a93f1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1451,6 +1451,348 @@ bxt_get_dram_info(struct drm_i915_private *dev_priv)
>       return 0;
>  }
>  
> +#define SA_PERF_STATUS_0_0_0_MCHBAR_PC _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5918)
> +#define  SKL_QCLK_RATIO_MASK (0x7f << 0)
> +#define  SKL_QCLK_RATIO_SHIT 0
> +#define  SKL_QCLK_REFERENCE (1 << 7)
> +#define  CNL_QCLK_RATIO_MASK (0x7f << 2)
> +#define  CNL_QCLK_RATIO_SHIT 2
> +#define  CNL_QCLK_REFERENCE (1 << 9)
> +#define  ICL_QCLK_RATIO_MASK (0xff << 2)
> +#define  ICL_QCLK_RATIO_SHIT 2
> +#define  ICL_QCLK_REFERENCE (1 << 10)
> +
> +#define MCHBAR_CH0_CR_TC_PRE_0_0_0_MCHBAR _MMIO(MCHBAR_MIRROR_BASE_SNB + 
> 0x4000)
> +#define MCHBAR_CH1_CR_TC_PRE_0_0_0_MCHBAR _MMIO(MCHBAR_MIRROR_BASE_SNB + 
> 0x4400)
> +#define  SKL_DRAM_T_WRPRE_MASK (0x7f << 24)
> +#define  SKL_DRAM_T_WRPRE_SHIFT 24
> +#define  SKL_DRAM_T_RDPRE_MASK (0xf << 16)
> +#define  SKL_DRAM_T_RDPRE_SHIFT 16
> +#define  SKL_DRAM_T_RAS_MASK (0x7f << 8)
> +#define  SKL_DRAM_T_RAS_SHIFT 8
> +#define  SKL_DRAM_T_RPAB_EXT_MASK (0x3 << 6)
> +#define  SKL_DRAM_T_RPAB_EXT_SHIFT 6
> +#define  SKL_DRAM_T_RP_MASK (0x3f << 0)
> +#define  SKL_DRAM_T_RP_SHIFT 0
> +#define  CNL_DRAM_T_WRPRE_MASK (0xff << 24)
> +#define  CNL_DRAM_T_WRPRE_SHIFT 24
> +#define  CNL_DRAM_T_PPD_MASK (0x7 << 21)
> +#define  CNL_DRAM_T_PPD_SHIFT 21
> +#define  CNL_DRAM_T_RDPRE_MASK (0x1f << 16)
> +#define  CNL_DRAM_T_RDPRE_SHIFT 16
> +#define  CNL_DRAM_T_RAS_MASK (0x7f << 9)
> +#define  CNL_DRAM_T_RAS_SHIFT 9
> +#define  CNL_DRAM_T_RPAB_EXT_MASK (0x7 << 6)
> +#define  CNL_DRAM_T_RPAB_EXT_SHIFT 6
> +#define  CNL_DRAM_T_RP_MASK (0x3f << 0)
> +#define  CNL_DRAM_T_RP_SHIFT 0
> +
> +struct intel_dram_timings {
> +     u8 t_rp, t_rdpre, t_ras, t_bl;
> +};
> +
> +static int icl_get_dclk(struct drm_i915_private *dev_priv)
> +{
> +     int ratio, ref;
> +     u32 val;
> +
> +     val = I915_READ(SA_PERF_STATUS_0_0_0_MCHBAR_PC);
> +
> +     DRM_DEBUG_KMS("SA_PERF = 0x%x\n", val);
> +     DRM_DEBUG_KMS("BIOS_DATA = 0x%x\n",
> +                   I915_READ(SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU));
> +
> +     ratio = (val & ICL_QCLK_RATIO_MASK) >> ICL_QCLK_RATIO_SHIT;
> +
> +     if (val & ICL_QCLK_REFERENCE)
> +             ref = 6; /* 6 * 16.666 MHz = 100 MHz */
> +     else
> +             ref = 8; /* 8 * 16.666 MHz = 133 MHz */
> +
> +     return ratio * ref;
> +}
> +
> +#if 0
> +static void skl_get_dram_ch_timings(struct intel_dram_timings *t,
> +                                 int channel, enum intel_dram_type type,
> +                                 u32 val)
> +{
> +     t->t_rp = (val & SKL_DRAM_T_RP_MASK) >> SKL_DRAM_T_RP_SHIFT;
> +     t->t_rdpre = (val & SKL_DRAM_T_RDPRE_MASK) >> SKL_DRAM_T_RDPRE_SHIFT;
> +     t->t_ras = (val & SKL_DRAM_T_RAS_MASK) >> SKL_DRAM_T_RAS_SHIFT;
> +     t->t_bl = type == INTEL_DRAM_DDR4 ? 4 : 8;
> +
> +     DRM_DEBUG_KMS("CH%d tRP=%d tRDPRE=%d tRAS=%d tBL=%d\n",
> +                   channel, t->t_rp, t->t_rdpre, t->t_ras, t->t_bl);
> +}
> +
> +static void skl_get_dram_timings(struct drm_i915_private *dev_priv,
> +                              const struct dram_info *dram,
> +                              struct intel_dram_timings *t)
> +{
> +     if (dram->channels & BIT(0)) {
> +             u32 val = I915_READ(MCHBAR_CH0_CR_TC_PRE_0_0_0_MCHBAR);
> +
> +             skl_get_dram_ch_timings(t, 0, dram->type, val);
> +     } else if (dram->channels & BIT(1)) {
> +             u32 val = I915_READ(MCHBAR_CH1_CR_TC_PRE_0_0_0_MCHBAR);
> +
> +             skl_get_dram_ch_timings(t, 1, dram->type, val);
> +     }
> +}
> +#endif
> +
> +static void cnl_get_dram_ch_timings(struct intel_dram_timings *t,
> +                                 int channel, enum intel_dram_type type,
> +                                 u32 val)
> +{
> +     t->t_rp = (val & CNL_DRAM_T_RP_MASK) >> CNL_DRAM_T_RP_SHIFT;
> +     t->t_rdpre = (val & CNL_DRAM_T_RDPRE_MASK) >> CNL_DRAM_T_RDPRE_SHIFT;
> +     t->t_ras = (val & CNL_DRAM_T_RAS_MASK) >> CNL_DRAM_T_RAS_SHIFT;
> +     t->t_bl = type == INTEL_DRAM_DDR4 ? 4 : 8;
> +
> +     DRM_DEBUG_KMS("CH%d tRP=%d tRDPRE=%d tRAS=%d tBL=%d\n",
> +                   channel, t->t_rp, t->t_rdpre, t->t_ras, t->t_bl);
> +}
> +
> +static void cnl_get_dram_timings(struct drm_i915_private *dev_priv,
> +                              const struct dram_info *dram,
> +                              struct intel_dram_timings *t)
> +{
> +     u32 val;
> +
> +     if (dram->channels & BIT(0)) {
> +             val = I915_READ(MCHBAR_CH0_CR_TC_PRE_0_0_0_MCHBAR);
> +             cnl_get_dram_ch_timings(t, 0, dram->type, val);
> +     } else if (dram->channels & BIT(1)) {
> +             val = I915_READ(MCHBAR_CH1_CR_TC_PRE_0_0_0_MCHBAR);
> +             cnl_get_dram_ch_timings(t, 1, dram->type, val);
> +     }
> +}
> +
> +struct intel_sagv_point {
> +     u16 dclk, t_rp, t_rdpre, t_rc, t_ras, t_rcd;
> +};
> +
> +struct intel_sagv_info {
> +     struct intel_sagv_point points[3];
> +     int num_points;
> +};
> +
> +static void icl_get_sagv_points(struct drm_i915_private *dev_priv,
> +                             struct intel_sagv_info *si,
> +                             const struct intel_dram_timings *t)
> +{
> +     int dclk, i;
> +
> +     dclk = icl_get_dclk(dev_priv);
> +
> +     si->num_points = 3;
> +
> +     /*
> +      * ICL Hardcoded
> +      * Name  Description         MC clock(MHz)     DDR data rate(MT / s)  
> Gear
> +      * Low   Min voltage point   1066              2133                   2
> +      * Med   Max DDR rate point  Max DDR freq / 2  Max DDR freq           2
> +      * High  Min latency point   2667              Same as MC clock       1
> +      */
> +     si->points[0].dclk = min(64, dclk);
> +     si->points[1].dclk = dclk;
> +     si->points[2].dclk = min(80, dclk);
> +
> +     for (i = 0; i < si->num_points; i++) {
> +             struct intel_sagv_point *sp = &si->points[i];
> +
> +             /*
> +              * We assume these scale linearly.
> +              * Seems to match observed behaviour.
> +              */
> +             sp->t_rp = DIV_ROUND_UP(t->t_rp * sp->dclk, dclk);
> +             sp->t_rdpre = DIV_ROUND_UP(t->t_rdpre * sp->dclk, dclk);
> +             sp->t_ras = DIV_ROUND_UP(t->t_ras * sp->dclk, dclk);
> +
> +             sp->t_rcd = sp->t_rp;
> +             sp->t_rc = sp->t_rp + sp->t_ras;
> +
> +             DRM_DEBUG_KMS("SAGV %d DCLK=%d tRP=%d tRDPRE=%d tRAS=%d tRCD=%d 
> tRC=%d\n",
> +                           i, sp->dclk, sp->t_rp, sp->t_rdpre, sp->t_ras,
> +                           sp->t_rcd, sp->t_rc);
> +     }
> +}
> +
> +static int icl_calc_bw(int dclk, int num, int den)
> +{
> +     /* multiples of 2 x 16.666MHz (100/6) */
> +     return DIV_ROUND_CLOSEST(num * dclk * 2 * 100, den * 6);
> +}
> +
> +static int icl_sagv_max_dclk(const struct intel_sagv_info *si)
> +{
> +     u16 dclk = 0;
> +     int i;
> +
> +     for (i = 0; i < si->num_points; i++)
> +             dclk = max(dclk, si->points[i].dclk);
> +
> +     return dclk;
> +}
> +
> +struct intel_sa_info {
> +     u8 deburst, mpagesize, deprogbwlimit, displayrtids;
> +};
> +
> +static const struct intel_sa_info icl_sa_info = {
> +     .deburst = 8,
> +     .mpagesize = 16,
> +     .deprogbwlimit = 25, /* GB/s */
> +     .displayrtids = 128,
> +};
> +
> +static void icl_get_bw_info(struct drm_i915_private *dev_priv)
> +{
> +     const struct dram_info *dram = &dev_priv->dram_info;
> +     struct intel_sagv_info si = {};
> +     struct intel_dram_timings t = {};
> +     const struct intel_sa_info *sa = &icl_sa_info;
> +     bool is_y_tile = true; /* assume y tile may be used */
> +     int num_channels = hweight8(dram->channels);
> +     int deinterleave;
> +#if 0
> +     int clpchpblock;
> +     int pagelimit;
> +#endif
> +     int ipqdepth, ipqdepthpch;
> +     int dclk_max;
> +     int maxdebw;
> +     int i;
> +
> +     /*
> +      * Try to muzzle SAGV to prevent it from
> +      * messing up the memory controller readout.
> +      */
> +     intel_disable_sagv(dev_priv);
> +
> +     /*
> +      * Magic sleep to avoid observing very high DDR clock.
> +      * Not sure what's going on but on a system with DDR4-3200
> +      * clock of 4800 MT/s is often observed here. A short
> +      * sleep manages to hide that.. Is that actually
> +      * the "min latency" SAGV point?. Maybe the SA clocks
> +      * things way up when there is no memory traffic?
> +      * But polling the register never seems to show this
> +      * except during i915 unload/load. Sleeping before the
> +      * SAGV disable usually returns 2133 MT/s.
> +      *
> +      * FIXME what is going on?
> +      */
> +     msleep(5);
> +
> +     cnl_get_dram_timings(dev_priv, dram, &t);
> +
> +     icl_get_sagv_points(dev_priv, &si, &t);
> +
> +     deinterleave = DIV_ROUND_UP(num_channels, is_y_tile ? 4 : 2);
> +     dclk_max = icl_sagv_max_dclk(&si);
> +
> +     ipqdepthpch = 16;
> +
> +     maxdebw = min(sa->deprogbwlimit * 1000,
> +                   icl_calc_bw(dclk_max, 16, 1) * 6 / 10); /* 60% */
> +#if 0
> +     clpchpblock = deinterleave * 8 / num_channels;
> +     pagelimit = sa->mpagesize * deinterleave * 2 / num_channels;
> +#endif
> +     ipqdepth = min(ipqdepthpch, sa->displayrtids / num_channels);
> +
> +     DRM_DEBUG_KMS("maxdebw = %d\n", maxdebw);
> +     DRM_DEBUG_KMS("ipqdepth = %d\n", ipqdepth);
> +     DRM_DEBUG_KMS("deinterleave = %d\n", deinterleave);
> +     DRM_DEBUG_KMS("dclk_max = %d\n", dclk_max);
> +
> +     for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> +             struct intel_bw_info *bi = &dev_priv->max_bw[i];
> +             int clpchgroup;
> +             int j;
> +
> +             clpchgroup = (sa->deburst * deinterleave / num_channels) << i;
> +             bi->num_planes = (ipqdepth - clpchgroup) / clpchgroup + 1;
> +
> +             DRM_DEBUG_KMS("clpchgroup = %d\n", clpchgroup);
> +             DRM_DEBUG_KMS("num_planes = %d\n", bi->num_planes);
> +
> +             for (j = 0; j < si.num_points; j++) {
> +                     const struct intel_sagv_point *sp = &si.points[j];
> +                     int ct, bw;
> +
> +                     /*
> +                      * Max row cycle time
> +                      *
> +                      * FIXME what is the logic behind the
> +                      * assumed burst length?
> +                      */
> +                     ct = max_t(int, sp->t_rc, sp->t_rp + sp->t_rcd +
> +                                (clpchgroup - 1) * t.t_bl + sp->t_rdpre);
> +                     bw = icl_calc_bw(sp->dclk, clpchgroup * 32 * 
> num_channels, ct);
> +
> +                     DRM_DEBUG_KMS("ct = %d\n", ct);
> +                     DRM_DEBUG_KMS("bw = %d\n", bw);
> +
> +                     bi->deratedbw[j] = min(maxdebw,
> +                                            bw * 9 / 10); /* 90% */
> +             }
> +
> +             if (bi->num_planes == 1)
> +                     break;
> +     }
> +}
> +
> +static unsigned int icl_max_bw(struct drm_i915_private *dev_priv,
> +                            int num_planes, int sagv)
> +{
> +     int i;
> +
> +     for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> +             const struct intel_bw_info *bi =
> +                     &dev_priv->max_bw[i];
> +
> +             if (num_planes >= bi->num_planes)
> +                     return bi->deratedbw[sagv];
> +     }
> +
> +     return 0;
> +}
> +
> +unsigned int intel_max_data_rate(struct drm_i915_private *dev_priv,
> +                              int num_planes)
> +{
> +     if (IS_ICELAKE(dev_priv))
> +             /*
> +              * FIXME with SAGV disabled maybe we can assume
> +              * point 1 will always be used? Seems to match
> +              * the behaviour observed in the wild.
> +              */
> +             return min3(icl_max_bw(dev_priv, num_planes, 0),
> +                         icl_max_bw(dev_priv, num_planes, 1),
> +                         icl_max_bw(dev_priv, num_planes, 2));
> +     else
> +             return UINT_MAX;
> +}
> +
> +static void icl_dump_max_bw(struct drm_i915_private *dev_priv)
> +{
> +     int i;
> +
> +     for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> +             const struct intel_bw_info *bi = &dev_priv->max_bw[i];
> +             int j;
> +
> +             for (j = 0; j < ARRAY_SIZE(bi->deratedbw); j++) {
> +                     DRM_DEBUG_KMS("BW%d SAGV%d: num_planes=%d 
> deratedbw=%d\n",
> +                                   i, j, bi->num_planes, bi->deratedbw[j]);
> +             }
> +     }
> +}
> +
>  static void
>  intel_get_dram_info(struct drm_i915_private *dev_priv)
>  {
> @@ -1629,6 +1971,10 @@ static int i915_driver_init_hw(struct drm_i915_private 
> *dev_priv)
>        */
>       intel_get_dram_info(dev_priv);
>  
> +     if (INTEL_GEN(dev_priv) >= 11) {
> +             icl_get_bw_info(dev_priv);
> +             icl_dump_max_bw(dev_priv);
> +     }
>  
>       return 0;
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index f638c0c74955..825bea3176fc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -54,6 +54,7 @@
>  #include <drm/drm_cache.h>
>  #include <drm/drm_util.h>
>  #include <drm/drm_dsc.h>
> +#include <drm/drm_atomic.h>
>  #include <drm/drm_connector.h>
>  #include <drm/i915_mei_hdcp_interface.h>
>  
> @@ -1845,6 +1846,13 @@ struct drm_i915_private {
>               } type;
>       } dram_info;
>  
> +     struct intel_bw_info {
> +             int num_planes;
> +             int deratedbw[3];
> +     } max_bw[6];
> +
> +     struct drm_private_obj bw_obj;
> +
>       struct i915_runtime_pm runtime_pm;
>  
>       struct {
> @@ -2634,6 +2642,8 @@ extern unsigned long i915_mch_val(struct 
> drm_i915_private *dev_priv);
>  extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv);
>  extern void i915_update_gfx_val(struct drm_i915_private *dev_priv);
>  int vlv_force_gfx_clock(struct drm_i915_private *dev_priv, bool on);
> +unsigned int intel_max_data_rate(struct drm_i915_private *dev_priv,
> +                              int num_planes);
>  
>  int intel_engines_init_mmio(struct drm_i915_private *dev_priv);
>  int intel_engines_init(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/intel_atomic_plane.c 
> b/drivers/gpu/drm/i915/intel_atomic_plane.c
> index 9d32a6fcf840..de6b23ee6306 100644
> --- a/drivers/gpu/drm/i915/intel_atomic_plane.c
> +++ b/drivers/gpu/drm/i915/intel_atomic_plane.c
> @@ -111,6 +111,22 @@ intel_plane_destroy_state(struct drm_plane *plane,
>       drm_atomic_helper_plane_destroy_state(plane, state);
>  }
>  
> +unsigned int intel_plane_data_rate(const struct intel_crtc_state *crtc_state,
> +                                const struct intel_plane_state *plane_state)
> +{
> +     const struct drm_framebuffer *fb = plane_state->base.fb;
> +     unsigned int cpp = 0;
> +     int i;
> +
> +     if (!plane_state->base.visible)
> +             return 0;
> +
> +     for (i = 0; i < fb->format->num_planes; i++)
> +             cpp += fb->format->cpp[i];
> +
> +     return cpp * crtc_state->pixel_rate;


This doesn't take into account plane width/height? Surely that affects 
bandwidth as well?

This breaks kms_atomic_transition, which tries to decrease plane size to reduce 
load when max plane width/height is not supported.

According to this calculation, 6 256x256 overlay planes will take the same 
bandwidth as 6 planes filled with 4k fb's

~Maarten

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Re: [Intel-gfx] [PATCH 2/2] drm/i915: Make sure we have enough memory bandwidth on ICL

Reply via email to