On Wed,  7 Jun 2017 17:13:35 -0700
Eric Anholt <e...@anholt.net> wrote:

> The T tiling format is what V3D uses for textures, with no raster
> support at all until later revisions of the hardware (and always at a
> large 3D performance penalty).  If we can't scan out V3D's format,
> then we often need to do a relayout at some stage of the pipeline,
> either right before texturing from the scanout buffer (common in X11
> without a compositor) or between a tiled screen buffer right before
> scanout (an option I've considered in trying to resolve this
> inconsistency, but which means needing to use the dirty fb ioctl and
> having some update policy).
> 
> T-format scanout lets us avoid either of those shadow copies, for a
> massive, obvious performance improvement to X11 window dragging
> without a compositor.  Unfortunately, enabling a compositor to work
> around the discrepancy has turned out to be too costly in memory
> consumption for the Raspbian distribution.
> 
> Because the HVS operates a scanline at a time, compositing from T does
> increase the memory bandwidth cost of scanout.  On my 1920x1080@32bpp
> display on a RPi3, we go from about 15% of system memory bandwidth
> with linear to about 20% with tiled.  However, for X11 this still ends
> up being a huge performance win in active usage.
> 
> This patch doesn't yet handle src_x/src_y offsetting within the tiled
> buffer.  However, we fail to do so for untiled buffers already.
> 
> Signed-off-by: Eric Anholt <e...@anholt.net>

Reviewed-by: Boris Brezillon <boris.brezil...@free-electrons.com>

> ---
>  drivers/gpu/drm/vc4/vc4_plane.c | 31 +++++++++++++++++++++++++++----
>  drivers/gpu/drm/vc4/vc4_regs.h  | 19 +++++++++++++++++++
>  include/uapi/drm/drm_fourcc.h   | 23 ++++++++++++++++++++++-
>  3 files changed, 68 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
> index da18dec21696..fa6809d8b0fe 100644
> --- a/drivers/gpu/drm/vc4/vc4_plane.c
> +++ b/drivers/gpu/drm/vc4/vc4_plane.c
> @@ -500,8 +500,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
>       u32 ctl0_offset = vc4_state->dlist_count;
>       const struct hvs_format *format = 
> vc4_get_hvs_format(fb->format->format);
>       int num_planes = drm_format_num_planes(format->drm);
> -     u32 scl0, scl1;
> -     u32 lbm_size;
> +     u32 scl0, scl1, pitch0;
> +     u32 lbm_size, tiling;
>       unsigned long irqflags;
>       int ret, i;
>  
> @@ -542,11 +542,31 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
>               scl1 = vc4_get_scl_field(state, 0);
>       }
>  
> +     switch (fb->modifier) {
> +     case DRM_FORMAT_MOD_LINEAR:
> +             tiling = SCALER_CTL0_TILING_LINEAR;
> +             pitch0 = VC4_SET_FIELD(fb->pitches[0], SCALER_SRC_PITCH);
> +             break;
> +     case DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED:
> +             tiling = SCALER_CTL0_TILING_256B_OR_T;
> +
> +             pitch0 = (VC4_SET_FIELD(0, SCALER_PITCH0_TILE_Y_OFFSET),
> +                       VC4_SET_FIELD(0, SCALER_PITCH0_TILE_WIDTH_L),
> +                       VC4_SET_FIELD((vc4_state->src_w[0] + 31) >> 5,
> +                                     SCALER_PITCH0_TILE_WIDTH_R));
> +             break;
> +     default:
> +             DRM_DEBUG_KMS("Unsupported FB tiling flag 0x%16llx",
> +                           (long long)fb->modifier);
> +             return -EINVAL;
> +     }
> +
>       /* Control word */
>       vc4_dlist_write(vc4_state,
>                       SCALER_CTL0_VALID |
>                       (format->pixel_order << SCALER_CTL0_ORDER_SHIFT) |
>                       (format->hvs << SCALER_CTL0_PIXEL_FORMAT_SHIFT) |
> +                     VC4_SET_FIELD(tiling, SCALER_CTL0_TILING) |
>                       (vc4_state->is_unity ? SCALER_CTL0_UNITY : 0) |
>                       VC4_SET_FIELD(scl0, SCALER_CTL0_SCL0) |
>                       VC4_SET_FIELD(scl1, SCALER_CTL0_SCL1));
> @@ -600,8 +620,11 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
>       for (i = 0; i < num_planes; i++)
>               vc4_dlist_write(vc4_state, 0xc0c0c0c0);
>  
> -     /* Pitch word 0/1/2 */
> -     for (i = 0; i < num_planes; i++) {
> +     /* Pitch word 0 */
> +     vc4_dlist_write(vc4_state, pitch0);
> +
> +     /* Pitch word 1/2 */
> +     for (i = 1; i < num_planes; i++) {
>               vc4_dlist_write(vc4_state,
>                               VC4_SET_FIELD(fb->pitches[i], 
> SCALER_SRC_PITCH));
>       }
> diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
> index 932093936178..d382c34c1b9e 100644
> --- a/drivers/gpu/drm/vc4/vc4_regs.h
> +++ b/drivers/gpu/drm/vc4/vc4_regs.h
> @@ -709,6 +709,13 @@ enum hvs_pixel_format {
>  #define SCALER_CTL0_SIZE_MASK                        VC4_MASK(29, 24)
>  #define SCALER_CTL0_SIZE_SHIFT                       24
>  
> +#define SCALER_CTL0_TILING_MASK                      VC4_MASK(21, 20)
> +#define SCALER_CTL0_TILING_SHIFT             20
> +#define SCALER_CTL0_TILING_LINEAR            0
> +#define SCALER_CTL0_TILING_64B                       1
> +#define SCALER_CTL0_TILING_128B                      2
> +#define SCALER_CTL0_TILING_256B_OR_T         3
> +
>  #define SCALER_CTL0_HFLIP                       BIT(16)
>  #define SCALER_CTL0_VFLIP                       BIT(15)
>  
> @@ -838,7 +845,19 @@ enum hvs_pixel_format {
>  #define SCALER_PPF_KERNEL_OFFSET_SHIFT               0
>  #define SCALER_PPF_KERNEL_UNCACHED           BIT(31)
>  
> +/* PITCH0/1/2 fields for raster. */
>  #define SCALER_SRC_PITCH_MASK                        VC4_MASK(15, 0)
>  #define SCALER_SRC_PITCH_SHIFT                       0
>  
> +/* PITCH0 fields for T-tiled. */
> +#define SCALER_PITCH0_TILE_WIDTH_L_MASK              VC4_MASK(22, 16)
> +#define SCALER_PITCH0_TILE_WIDTH_L_SHIFT     16
> +#define SCALER_PITCH0_TILE_LINE_DIR          BIT(15)
> +#define SCALER_PITCH0_TILE_INITIAL_LINE_DIR  BIT(14)
> +/* Y offset within a tile. */
> +#define SCALER_PITCH0_TILE_Y_OFFSET_MASK     VC4_MASK(13, 7)
> +#define SCALER_PITCH0_TILE_Y_OFFSET_SHIFT    7
> +#define SCALER_PITCH0_TILE_WIDTH_R_MASK              VC4_MASK(6, 0)
> +#define SCALER_PITCH0_TILE_WIDTH_R_SHIFT     0
> +
>  #endif /* VC4_REGS_H */
> diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
> index 55e301047b3e..7586c46f68bf 100644
> --- a/include/uapi/drm/drm_fourcc.h
> +++ b/include/uapi/drm/drm_fourcc.h
> @@ -182,6 +182,7 @@ extern "C" {
>  #define DRM_FORMAT_MOD_VENDOR_SAMSUNG 0x04
>  #define DRM_FORMAT_MOD_VENDOR_QCOM    0x05
>  #define DRM_FORMAT_MOD_VENDOR_VIVANTE 0x06
> +#define DRM_FORMAT_MOD_VENDOR_BROADCOM 0x07
>  /* add more to the end as needed */
>  
>  #define fourcc_mod_code(vendor, val) \
> @@ -306,7 +307,6 @@ extern "C" {
>   */
>  #define DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED fourcc_mod_code(VIVANTE, 4)
>  
> -
>  /* NVIDIA Tegra frame buffer modifiers */
>  
>  /*
> @@ -351,6 +351,27 @@ extern "C" {
>   */
>  #define NV_FORMAT_MOD_TEGRA_16BX2_BLOCK(v) fourcc_mod_tegra_code(2, v)
>  
> +/*
> + * Broadcom VC4 "T" format
> + *
> + * This is the primary layout that the V3D GPU can texture from (it
> + * can't do linear).  The T format has:
> + *
> + * - 64b utiles of pixels in a raster-order grid according to cpp.  It's 4x4
> + *   pixels at 32 bit depth.
> + *
> + * - 1k subtiles made of a 4x4 raster-order grid of 64b utiles (so usually
> + *   16x16 pixels).
> + *
> + * - 4k tiles made of a 2x2 grid of 1k subtiles (so usually 32x32 pixels).  
> On
> + *   even 4k tile rows, they're arranged as (BL, TL, TR, BR), and on odd rows
> + *   they're (TR, BR, BL, TL), where bottom left is start of memory.
> + *
> + * - an image made of 4k tiles in rows either left-to-right (even rows of 4k
> + *   tiles) or right-to-left (odd rows of 4k tiles).
> + */
> +#define DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED fourcc_mod_code(BROADCOM, 1)
> +
>  #if defined(__cplusplus)
>  }
>  #endif

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Reply via email to