From: Marek Olšák <marek.ol...@amd.com>

A compute shader is used to reorder DCC data from aligned to unaligned.
---
 src/amd/common/ac_gpu_info.c                  |   1 +
 src/amd/common/ac_gpu_info.h                  |   3 +
 src/amd/common/ac_surface.c                   | 125 ++++++++++++++++--
 src/amd/common/ac_surface.h                   |  15 ++-
 src/gallium/drivers/radeonsi/cik_sdma.c       |   3 +-
 src/gallium/drivers/radeonsi/si_blit.c        |   3 +
 .../drivers/radeonsi/si_compute_blit.c        |  80 +++++++++++
 src/gallium/drivers/radeonsi/si_pipe.c        |   2 +
 src/gallium/drivers/radeonsi/si_pipe.h        |  15 ++-
 .../drivers/radeonsi/si_shaderlib_tgsi.c      |  73 ++++++++++
 src/gallium/drivers/radeonsi/si_texture.c     |  83 +++++++++++-
 11 files changed, 389 insertions(+), 14 deletions(-)
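Conceptually, the retile map built in ac_surface.c is a flat array of
<aligned_offset, unaligned_offset> byte pairs, and the retile blit scatters
DCC bytes through it. A CPU-side sketch of the same reordering (a hypothetical
helper, not part of the patch; the real work is done by the
si_create_dcc_retile_cs compute shader below):

    #include <stdint.h>

    /* "map" holds num_elements offsets laid out as <src, dst> pairs; each
     * pair moves one byte of DCC metadata from the RB/pipe-aligned layout
     * to the unaligned layout that display hardware expects. The compute
     * shader does the same thing, with one thread consuming two pairs.
     */
    static void retile_dcc_cpu(const uint32_t *map, unsigned num_elements,
                               const uint8_t *aligned_dcc,
                               uint8_t *display_dcc)
    {
        for (unsigned i = 0; i + 1 < num_elements; i += 2)
            display_dcc[map[i + 1]] = aligned_dcc[map[i]];
    }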
diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index a6d249a6d2f..d890172227c 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -497,20 +497,21 @@ void ac_print_gpu_info(struct radeon_info *info)
 	       info->pci_dev, info->pci_func);
 	printf("    pci_id = 0x%x\n", info->pci_id);
 	printf("    family = %i\n", info->family);
 	printf("    chip_class = %i\n", info->chip_class);
 	printf("    num_compute_rings = %u\n", info->num_compute_rings);
 	printf("    num_sdma_rings = %i\n", info->num_sdma_rings);
 	printf("    clock_crystal_freq = %i\n", info->clock_crystal_freq);
 	printf("    tcc_cache_line_size = %u\n", info->tcc_cache_line_size);
 	printf("    use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned);
+	printf("    use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit);

 	printf("Memory info:\n");
 	printf("    pte_fragment_size = %u\n", info->pte_fragment_size);
 	printf("    gart_page_size = %u\n", info->gart_page_size);
 	printf("    gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size, 1024*1024));
 	printf("    vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size, 1024*1024));
 	printf("    vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size, 1024*1024));
 	printf("    gds_size = %u kB\n", info->gds_size / 1024);
 	printf("    gds_gfx_partition_size = %u kB\n", info->gds_gfx_partition_size / 1024);
 	printf("    max_alloc_size = %i MB\n",
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 99fed520618..5241c28f2a7 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -49,22 +49,25 @@ struct radeon_info {
 	/* Device info. */
 	const char *name;
 	uint32_t pci_id;
 	enum radeon_family family;
 	enum chip_class chip_class;
 	uint32_t num_compute_rings;
 	uint32_t num_sdma_rings;
 	uint32_t clock_crystal_freq;
 	uint32_t tcc_cache_line_size;

+	/* There are 2 display DCC codepaths, because display expects unaligned DCC. */
 	/* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */
 	bool use_display_dcc_unaligned;
+	/* Allocate both aligned and unaligned DCC and use the retile blit. */
+	bool use_display_dcc_with_retile_blit;

 	/* Memory info. */
 	uint32_t pte_fragment_size;
 	uint32_t gart_page_size;
 	uint64_t gart_size;
 	uint64_t vram_size;
 	uint64_t vram_vis_size;
 	unsigned gds_size;
 	unsigned gds_gfx_partition_size;
 	uint64_t max_alloc_size;
diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c
index 6802ab2badb..7225317f3e7 100644
--- a/src/amd/common/ac_surface.c
+++ b/src/amd/common/ac_surface.c
@@ -1072,20 +1072,21 @@ gfx9_get_preferred_swizzle_mode(ADDR_HANDLE addrlib,
 	ret = Addr2GetPreferredSurfaceSetting(addrlib, &sin, &sout);
 	if (ret != ADDR_OK)
 		return ret;

 	*swizzle_mode = sout.swizzleMode;
 	return 0;
 }

 static int gfx9_compute_miptree(ADDR_HANDLE addrlib,
+				const struct radeon_info *info,
 				const struct ac_surf_config *config,
 				struct radeon_surf *surf, bool compressed,
 				ADDR2_COMPUTE_SURFACE_INFO_INPUT *in)
 {
 	ADDR2_MIP_INFO mip_info[RADEON_SURF_MAX_LEVELS] = {};
 	ADDR2_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
 	ADDR_E_RETURNCODE ret;

 	out.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_OUTPUT);
 	out.pMipInfo = mip_info;
@@ -1209,21 +1210,20 @@ static int gfx9_compute_miptree(ADDR_HANDLE addrlib,
 			din.numFrags = in->numFrags;
 			din.numMipLevels = in->numMipLevels;
 			din.dataSurfaceSize = out.surfSize;

 			ret = Addr2ComputeDccInfo(addrlib, &din, &dout);
 			if (ret != ADDR_OK)
 				return ret;

 			surf->u.gfx9.dcc.rb_aligned = din.dccKeyFlags.rbAligned;
 			surf->u.gfx9.dcc.pipe_aligned = din.dccKeyFlags.pipeAligned;
-			surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1;
 			surf->dcc_size = dout.dccRamSize;
 			surf->dcc_alignment = dout.dccRamBaseAlign;
 			surf->num_dcc_levels = in->numMipLevels;

 			/* Disable DCC for levels that are in the mip tail.
 			 *
 			 * There are two issues that this is intended to
 			 * address:
 			 *
 			 * 1. Multiple mip levels may share a cache line. This
@@ -1245,20 +1245,120 @@ static int gfx9_compute_miptree(ADDR_HANDLE addrlib,
 			 */
 			for (unsigned i = 0; i < in->numMipLevels; i++) {
 				if (meta_mip_info[i].inMiptail) {
 					surf->num_dcc_levels = i;
 					break;
 				}
 			}

 			if (!surf->num_dcc_levels)
 				surf->dcc_size = 0;
+
+			surf->u.gfx9.display_dcc_size = surf->dcc_size;
+			surf->u.gfx9.display_dcc_alignment = surf->dcc_alignment;
+			surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1;
+
+			/* Compute displayable DCC. */
+			if (in->flags.display &&
+			    surf->num_dcc_levels &&
+			    info->use_display_dcc_with_retile_blit) {
+				/* Compute displayable DCC info. */
+				din.dccKeyFlags.pipeAligned = 0;
+				din.dccKeyFlags.rbAligned = 0;
+
+				assert(din.numSlices == 1);
+				assert(din.numMipLevels == 1);
+				assert(din.numFrags == 1);
+				assert(surf->tile_swizzle == 0);
+				assert(surf->u.gfx9.dcc.pipe_aligned ||
+				       surf->u.gfx9.dcc.rb_aligned);
+
+				ret = Addr2ComputeDccInfo(addrlib, &din, &dout);
+				if (ret != ADDR_OK)
+					return ret;
+
+				surf->u.gfx9.display_dcc_size = dout.dccRamSize;
+				surf->u.gfx9.display_dcc_alignment = dout.dccRamBaseAlign;
+				surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1;
+				assert(surf->u.gfx9.display_dcc_size <= surf->dcc_size);
+
+				/* Compute address mapping from non-displayable to displayable DCC. */
+				ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT addrin = {};
+				addrin.size = sizeof(addrin);
+				addrin.colorFlags.color = 1;
+				addrin.swizzleMode = din.swizzleMode;
+				addrin.resourceType = din.resourceType;
+				addrin.bpp = din.bpp;
+				addrin.unalignedWidth = din.unalignedWidth;
+				addrin.unalignedHeight = din.unalignedHeight;
+				addrin.numSlices = 1;
+				addrin.numMipLevels = 1;
+				addrin.numFrags = 1;
+
+				ADDR2_COMPUTE_DCC_ADDRFROMCOORD_OUTPUT addrout = {};
+				addrout.size = sizeof(addrout);
+
+				surf->u.gfx9.dcc_retile_num_elements =
+					DIV_ROUND_UP(in->width, dout.compressBlkWidth) *
+					DIV_ROUND_UP(in->height, dout.compressBlkHeight) * 2;
+				/* Align the size to 4 (for the compute shader). */
+				surf->u.gfx9.dcc_retile_num_elements =
+					align(surf->u.gfx9.dcc_retile_num_elements, 4);
+
+				surf->u.gfx9.dcc_retile_map =
+					malloc(surf->u.gfx9.dcc_retile_num_elements * 4);
+				if (!surf->u.gfx9.dcc_retile_map)
+					return ADDR_OUTOFMEMORY;
+
+				unsigned index = 0;
+				surf->u.gfx9.dcc_retile_use_uint16 = true;
+
+				for (unsigned y = 0; y < in->height; y += dout.compressBlkHeight) {
+					addrin.y = y;
+
+					for (unsigned x = 0; x < in->width; x += dout.compressBlkWidth) {
+						addrin.x = x;
+
+						/* Compute src DCC address */
+						addrin.dccKeyFlags.pipeAligned = surf->u.gfx9.dcc.pipe_aligned;
+						addrin.dccKeyFlags.rbAligned = surf->u.gfx9.dcc.rb_aligned;
+						addrout.addr = 0;
+
+						ret = Addr2ComputeDccAddrFromCoord(addrlib, &addrin, &addrout);
+						if (ret != ADDR_OK)
+							return ret;
+
+						surf->u.gfx9.dcc_retile_map[index * 2] = addrout.addr;
+						if (addrout.addr > USHRT_MAX)
+							surf->u.gfx9.dcc_retile_use_uint16 = false;
+
+						/* Compute dst DCC address */
+						addrin.dccKeyFlags.pipeAligned = 0;
+						addrin.dccKeyFlags.rbAligned = 0;
+						addrout.addr = 0;
+
+						ret = Addr2ComputeDccAddrFromCoord(addrlib, &addrin, &addrout);
+						if (ret != ADDR_OK)
+							return ret;
+
+						surf->u.gfx9.dcc_retile_map[index * 2 + 1] = addrout.addr;
+						if (addrout.addr > USHRT_MAX)
+							surf->u.gfx9.dcc_retile_use_uint16 = false;
+
+						assert(index * 2 + 1 < surf->u.gfx9.dcc_retile_num_elements);
+						index++;
+					}
+				}
+				/* Fill the remaining pairs with the last one (for the compute shader). */
+				for (unsigned i = index * 2; i < surf->u.gfx9.dcc_retile_num_elements; i++)
+					surf->u.gfx9.dcc_retile_map[i] = surf->u.gfx9.dcc_retile_map[i - 2];
+			}
 		}

 		/* FMASK */
 		if (in->numSamples > 1) {
 			ADDR2_COMPUTE_FMASK_INFO_INPUT fin = {0};
 			ADDR2_COMPUTE_FMASK_INFO_OUTPUT fout = {0};

 			fin.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_INPUT);
 			fout.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_OUTPUT);
@@ -1494,56 +1594,59 @@ static int gfx9_compute_surface(ADDR_HANDLE addrlib,
 	surf->num_dcc_levels = 0;
 	surf->surf_size = 0;
 	surf->fmask_size = 0;
 	surf->dcc_size = 0;
 	surf->htile_size = 0;
 	surf->htile_slice_size = 0;
 	surf->u.gfx9.surf_offset = 0;
 	surf->u.gfx9.stencil_offset = 0;
 	surf->cmask_size = 0;
+	surf->u.gfx9.dcc_retile_use_uint16 = false;
+	surf->u.gfx9.dcc_retile_num_elements = 0;
+	surf->u.gfx9.dcc_retile_map = NULL;

 	/* Calculate texture layout information. */
-	r = gfx9_compute_miptree(addrlib, config, surf, compressed,
+	r = gfx9_compute_miptree(addrlib, info, config, surf, compressed,
 				 &AddrSurfInfoIn);
 	if (r)
-		return r;
+		goto error;

 	/* Calculate texture layout information for stencil. */
 	if (surf->flags & RADEON_SURF_SBUFFER) {
 		AddrSurfInfoIn.flags.stencil = 1;
 		AddrSurfInfoIn.bpp = 8;
 		AddrSurfInfoIn.format = ADDR_FMT_8;

 		if (!AddrSurfInfoIn.flags.depth) {
 			r = gfx9_get_preferred_swizzle_mode(addrlib, &AddrSurfInfoIn, false,
 							    &AddrSurfInfoIn.swizzleMode);
 			if (r)
-				return r;
+				goto error;
 		} else
 			AddrSurfInfoIn.flags.depth = 0;

-		r = gfx9_compute_miptree(addrlib, config, surf, compressed,
+		r = gfx9_compute_miptree(addrlib, info, config, surf, compressed,
 					 &AddrSurfInfoIn);
 		if (r)
-			return r;
+			goto error;
 	}

 	surf->is_linear = surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR;

 	/* Query whether the surface is displayable. */
 	bool displayable = false;
 	if (!config->is_3d && !config->is_cube) {
 		r = Addr2IsValidDisplaySwizzleMode(addrlib, surf->u.gfx9.surf.swizzle_mode,
 						   surf->bpe * 8, &displayable);
 		if (r)
-			return r;
+			goto error;

 		/* Display needs unaligned DCC. */
 		if (info->use_display_dcc_unaligned &&
 		    surf->num_dcc_levels &&
 		    (surf->u.gfx9.dcc.pipe_aligned ||
 		     surf->u.gfx9.dcc.rb_aligned))
 			displayable = false;
 	}
 	surf->is_displayable = displayable;
@@ -1581,38 +1684,44 @@ static int gfx9_compute_surface(ADDR_HANDLE addrlib,
 	case ADDR_SW_64KB_R_T:
 	case ADDR_SW_4KB_R_X:
 	case ADDR_SW_64KB_R_X:
 	case ADDR_SW_VAR_R_X:
 		/* The rotated micro tile mode doesn't work if both CMASK and RB+ are
 		 * used at the same time. This case is not currently expected to occur
 		 * because we don't use rotated. Enforce this restriction on all chips
 		 * to facilitate testing.
 		 */
 		assert(!"rotate micro tile mode is unsupported");
-		return ADDR_ERROR;
+		r = ADDR_ERROR;
+		goto error;

 	/* Z = depth. */
 	case ADDR_SW_4KB_Z:
 	case ADDR_SW_64KB_Z:
 	case ADDR_SW_VAR_Z:
 	case ADDR_SW_64KB_Z_T:
 	case ADDR_SW_4KB_Z_X:
 	case ADDR_SW_64KB_Z_X:
 	case ADDR_SW_VAR_Z_X:
 		surf->micro_tile_mode = RADEON_MICRO_MODE_DEPTH;
 		break;

 	default:
 		assert(0);
 	}

 	return 0;
+
+error:
+	free(surf->u.gfx9.dcc_retile_map);
+	surf->u.gfx9.dcc_retile_map = NULL;
+	return r;
 }

 int ac_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *info,
 		       const struct ac_surf_config *config,
 		       enum radeon_surf_mode mode,
 		       struct radeon_surf *surf)
 {
 	int r;

 	r = surf_config_sanity(config, surf->flags);
diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h
index eb50c37c3c2..10d25e23d32 100644
--- a/src/amd/common/ac_surface.h
+++ b/src/amd/common/ac_surface.h
@@ -20,20 +20,21 @@
  *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
  */

 #ifndef AC_SURFACE_H
 #define AC_SURFACE_H

 #include <stdint.h>
+#include <stdbool.h>

 #include "amd_family.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /* Forward declarations. */
 typedef void* ADDR_HANDLE;

@@ -142,23 +143,33 @@ struct gfx9_surf_layout {
 	enum gfx9_resource_type resource_type; /* 1D, 2D or 3D */
 	uint16_t surf_pitch; /* in blocks */
 	uint16_t surf_height;

 	uint64_t surf_offset; /* 0 unless imported with an offset */
 	/* The size of the 2D plane containing all mipmap levels. */
 	uint64_t surf_slice_size;
 	/* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */
 	uint32_t offset[RADEON_SURF_MAX_LEVELS];

-	uint16_t display_dcc_pitch_max; /* (mip chain pitch - 1) */
-
 	uint64_t stencil_offset; /* separate stencil */
+
+	/* Displayable DCC. This is always rb_aligned=0 and pipe_aligned=0.
+	 * The 3D engine doesn't support that layout except for chips with 1 RB.
+	 * All other chips must set rb_aligned=1.
+	 * A compute shader needs to convert from aligned DCC to unaligned.
+	 */
+	uint32_t display_dcc_size;
+	uint32_t display_dcc_alignment;
+	uint16_t display_dcc_pitch_max; /* (mip chain pitch - 1) */
+	bool dcc_retile_use_uint16; /* if all values fit into uint16_t */
+	uint32_t dcc_retile_num_elements;
+	uint32_t *dcc_retile_map;
 };

 struct radeon_surf {
 	/* Format properties. */
 	unsigned blk_w:4;
 	unsigned blk_h:4;
 	unsigned bpe:5;

 	/* Number of mipmap levels where DCC is enabled starting from level 0.
 	 * Non-zero levels may be disabled due to alignment constraints, but not
 	 * the first level.
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 096f75e508f..da9b25a442d 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -489,21 +489,22 @@ static void cik_sdma_copy(struct pipe_context *ctx,
 			  unsigned src_level,
 			  const struct pipe_box *src_box)
 {
 	struct si_context *sctx = (struct si_context *)ctx;

 	if (!sctx->dma_cs ||
 	    src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
 	    dst->flags & PIPE_RESOURCE_FLAG_SPARSE)
 		goto fallback;

-	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+	/* If src is a buffer and dst is a texture, we are uploading metadata. */
+	if (src->target == PIPE_BUFFER) {
 		cik_sdma_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width);
 		return;
 	}

 	if ((sctx->chip_class == CIK || sctx->chip_class == VI) &&
 	    cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz,
 				  src, src_level, src_box))
 		return;

 fallback:
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 7613a63e3cb..610de289a20 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -1311,20 +1311,23 @@ static void si_flush_resource(struct pipe_context *ctx,
 	/* st/dri calls flush twice per frame (not a bug), this prevents double
 	 * decompression. */
 	if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty)
 		return;

 	if (!tex->is_depth && (tex->cmask_buffer || tex->dcc_offset)) {
 		si_blit_decompress_color(sctx, tex, 0, res->last_level,
 					 0, util_max_layer(res, 0),
 					 tex->dcc_separate_buffer != NULL);
+
+		if (tex->display_dcc_offset)
+			si_retile_dcc(sctx, tex);
 	}

 	/* Always do the analysis even if DCC is disabled at the moment. */
 	if (tex->dcc_gather_statistics) {
 		bool separate_dcc_dirty = tex->separate_dcc_dirty;

 		/* If the color buffer hasn't been unbound and fast clear hasn't
 		 * been used, separate_dcc_dirty is false, but there may have been
 		 * new rendering. Check if the color buffer is bound and assume
 		 * it's dirty.
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 2ce56d6a81a..3935d9c754d 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -412,20 +412,100 @@ void si_compute_copy_image(struct si_context *sctx,
 	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
 		       (sctx->chip_class <= VI ?
 			SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
 		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);

 	ctx->bind_compute_state(ctx, saved_cs);
 	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
 	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
 	si_compute_internal_end(sctx);
 }

+void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
+{
+	struct pipe_context *ctx = &sctx->b;
+
+	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		       SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
+		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
+	si_emit_cache_flush(sctx);
+
+	/* Save states. */
+	void *saved_cs = sctx->cs_shader_state.program;
+	struct pipe_image_view saved_img[3] = {};
+
+	for (unsigned i = 0; i < 3; i++) {
+		util_copy_image_view(&saved_img[i],
+				     &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
+	}
+
+	/* Set images. */
+	bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
+	unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
+	struct pipe_image_view img[3];
+
+	assert(tex->dcc_retile_map_offset && tex->dcc_retile_map_offset <= UINT_MAX);
+	assert(tex->dcc_offset && tex->dcc_offset <= UINT_MAX);
+	assert(tex->display_dcc_offset && tex->display_dcc_offset <= UINT_MAX);
+
+	for (unsigned i = 0; i < 3; i++) {
+		img[i].resource = &tex->buffer.b.b;
+		img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
+		img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
+	}
+
+	img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT :
+				     PIPE_FORMAT_R32G32B32A32_UINT;
+	img[0].u.buf.offset = tex->dcc_retile_map_offset;
+	img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);
+
+	img[1].format = PIPE_FORMAT_R8_UINT;
+	img[1].u.buf.offset = tex->dcc_offset;
+	img[1].u.buf.size = tex->surface.dcc_size;
+
+	img[2].format = PIPE_FORMAT_R8_UINT;
+	img[2].u.buf.offset = tex->display_dcc_offset;
+	img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;
+
+	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);
+
+	/* Bind the compute shader. */
+	if (!sctx->cs_dcc_retile)
+		sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
+	ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);
+
+	/* Dispatch compute. */
+	/* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
+	unsigned num_threads = num_elements / 4;
+
+	struct pipe_grid_info info = {};
+	info.block[0] = 64;
+	info.block[1] = 1;
+	info.block[2] = 1;
+	info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
+	info.grid[1] = 1;
+	info.grid[2] = 1;
+	sctx->compute_last_block[0] = num_threads % 64;
+
+	ctx->launch_grid(ctx, &info);
+
+	sctx->compute_last_block[0] = 0; /* reset */
+
+	/* Don't flush caches or wait. The driver will wait at the end of this IB,
+	 * and L2 will be flushed by the kernel fence.
+	 */
+
+	/* Restore states. */
+	ctx->bind_compute_state(ctx, saved_cs);
+	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);
+}
+
 void si_init_compute_blit_functions(struct si_context *sctx)
 {
 	sctx->b.clear_buffer = si_pipe_clear_buffer;
 }

 /* Clear a region of a color surface to a constant value. */
 void si_compute_clear_render_target(struct pipe_context *ctx,
 				    struct pipe_surface *dstsurf,
 				    const union pipe_color_union *color,
 				    unsigned dstx, unsigned dsty,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 9b1eab8284b..aa79a4e967c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -202,20 +202,22 @@ static void si_destroy_context(struct pipe_context *context)
 	if (sctx->cs_copy_buffer)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
 	if (sctx->cs_copy_image)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
 	if (sctx->cs_copy_image_1d_array)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
 	if (sctx->cs_clear_render_target)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target);
 	if (sctx->cs_clear_render_target_1d_array)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array);
+	if (sctx->cs_dcc_retile)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile);

 	if (sctx->blitter)
 		util_blitter_destroy(sctx->blitter);

 	/* Release DCC stats. */
 	for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
 		assert(!sctx->dcc_stats[i].query_active);

 		for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
 			if (sctx->dcc_stats[i].ps_stats[j])
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 6765dcb3275..d1e7b8c0b4a 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -270,26 +270,36 @@ struct si_transfer {
 	unsigned offset;
 };

 struct si_texture {
 	struct si_resource buffer;

 	struct radeon_surf surface;
 	uint64_t size;
 	struct si_texture *flushed_depth_texture;

-	/* Colorbuffer compression and fast clear. */
+	/* One texture allocation can contain these buffers:
+	 * - image (pixel data)
+	 * - FMASK buffer (MSAA compression)
+	 * - CMASK buffer (MSAA compression and/or legacy fast color clear)
+	 * - HTILE buffer (Z/S compression and fast Z/S clear)
+	 * - DCC buffer (color compression and new fast color clear)
+	 * - displayable DCC buffer (if the DCC buffer is not displayable)
+	 * - DCC retile mapping buffer (if the DCC buffer is not displayable)
+	 */
 	uint64_t fmask_offset;
 	uint64_t cmask_offset;
 	uint64_t cmask_base_address_reg;
 	struct si_resource *cmask_buffer;
 	uint64_t dcc_offset; /* 0 = disabled */
+	uint64_t display_dcc_offset;
+	uint64_t dcc_retile_map_offset;
 	unsigned cb_color_info; /* fast clear enable bit */
 	unsigned color_clear_value[2];
 	unsigned last_msaa_resolve_target_micro_mode;
 	unsigned num_level0_transfers;

 	/* Depth buffer compression and fast clear. */
 	uint64_t htile_offset;
 	float depth_clear_value;
 	uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */
 	uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
@@ -820,20 +830,21 @@ struct si_context {
 	void *vs_blit_pos_layered;
 	void *vs_blit_color;
 	void *vs_blit_color_layered;
 	void *vs_blit_texcoord;
 	void *cs_clear_buffer;
 	void *cs_copy_buffer;
 	void *cs_copy_image;
 	void *cs_copy_image_1d_array;
 	void *cs_clear_render_target;
 	void *cs_clear_render_target_1d_array;
+	void *cs_dcc_retile;
 	struct si_screen *screen;
 	struct pipe_debug_callback debug;
 	struct ac_llvm_compiler compiler; /* only non-threaded compilation */
 	struct si_shader_ctx_state fixed_func_tcs_shader;
 	struct si_resource *wait_mem_scratch;
 	unsigned wait_mem_number;
 	uint16_t prefetch_L2_mask;

 	bool has_graphics;
 	bool gfx_flush_in_progress:1;
@@ -1210,20 +1221,21 @@ void si_compute_copy_image(struct si_context *sctx,
 			   struct pipe_resource *src,
 			   unsigned src_level,
 			   unsigned dstx, unsigned dsty, unsigned dstz,
 			   const struct pipe_box *src_box);
 void si_compute_clear_render_target(struct pipe_context *ctx,
 				    struct pipe_surface *dstsurf,
 				    const union pipe_color_union *color,
 				    unsigned dstx, unsigned dsty,
 				    unsigned width, unsigned height,
 				    bool render_condition_enabled);
+void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
 void si_init_compute_blit_functions(struct si_context *sctx);

 /* si_cp_dma.c */
 #define SI_CPDMA_SKIP_CHECK_CS_SPACE	(1 << 0) /* don't call need_cs_space */
 #define SI_CPDMA_SKIP_SYNC_AFTER	(1 << 1) /* don't wait for DMA after the copy */
 #define SI_CPDMA_SKIP_SYNC_BEFORE	(1 << 2) /* don't wait for DMA before the copy (RAW hazards) */
 #define SI_CPDMA_SKIP_GFX_SYNC		(1 << 3) /* don't flush caches and don't wait for PS/CS */
 #define SI_CPDMA_SKIP_BO_LIST_UPDATE	(1 << 4) /* don't update the BO list */
 #define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \
 			   SI_CPDMA_SKIP_SYNC_AFTER | \
@@ -1328,20 +1340,21 @@ void si_resume_queries(struct si_context *sctx);
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
 			unsigned num_layers);
 void *si_create_fixed_func_tcs(struct si_context *sctx);
 void *si_create_dma_compute_shader(struct pipe_context *ctx,
 				   unsigned num_dwords_per_thread,
 				   bool dst_stream_cache_policy, bool is_copy);
 void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
 void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
 void *si_clear_render_target_shader(struct pipe_context *ctx);
 void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
+void *si_create_dcc_retile_cs(struct pipe_context *ctx);
 void *si_create_query_result_cs(struct si_context *sctx);

 /* si_test_dma.c */
 void si_test_dma(struct si_screen *sscreen);

 /* si_test_clearbuffer.c */
 void si_test_dma_perf(struct si_screen *sscreen);

 /* si_uvd.c */
 struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index 8ff9ebda9ba..b68fd2ff236 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -219,20 +219,93 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx,
 	state.prog = ureg_get_tokens(ureg, NULL);

 	void *cs = ctx->create_compute_state(ctx, &state);
 	ureg_destroy(ureg);
 	ureg_free_tokens(state.prog);
 	free(values);
 	return cs;
 }

+/* Create a compute shader that copies DCC from one buffer to another
+ * where each DCC buffer has a different layout.
+ *
+ * image[0]: offset remap table (pairs of <src_offset, dst_offset>),
+ *           2 pairs are read
+ * image[1]: DCC source buffer, typed r8_uint
+ * image[2]: DCC destination buffer, typed r8_uint
+ */
+void *si_create_dcc_retile_cs(struct pipe_context *ctx)
+{
+	struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+	if (!ureg)
+		return NULL;
+
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+	/* Compute the global thread ID (in idx). */
+	struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+	struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+	struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg),
+					     TGSI_WRITEMASK_X);
+	ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid);
+
+	/* Load 2 pairs of offsets for DCC load & store. */
+	struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false);
+	struct ureg_dst offsets = ureg_DECL_temporary(ureg);
+	struct ureg_src map_load_args[] = {map, ureg_src(idx)};
+
+	ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2,
+			 TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
+
+	struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER,
+						  0, false, false);
+	struct ureg_dst dcc_dst = ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER,
+							   0, true, false));
+	struct ureg_dst dcc_value[2];
+
+	/* Copy DCC values:
+	 *   dst[offsets.y] = src[offsets.x];
+	 *   dst[offsets.w] = src[offsets.z];
+	 */
+	for (unsigned i = 0; i < 2; i++) {
+		dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+
+		struct ureg_src load_args[] =
+			{dcc_src, ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i*2)};
+		ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2,
+				 TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
+	}
+
+	dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X);
+
+	for (unsigned i = 0; i < 2; i++) {
+		struct ureg_src store_args[] = {
+			ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i*2),
+			ureg_src(dcc_value[i])
+		};
+		ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2,
+				 TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
+	}
+	ureg_END(ureg);
+
+	struct pipe_compute_state state = {};
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = ureg_get_tokens(ureg, NULL);
+
+	void *cs = ctx->create_compute_state(ctx, &state);
+	ureg_destroy(ureg);
+	return cs;
+}
+
 /* Create the compute shader that is used to collect the results.
  *
  * One compute grid with a single thread is launched for every query result
  * buffer. The thread (optionally) reads a previous summary buffer, then
  * accumulates data from the query result buffer, and writes the result either
  * to a summary buffer to be consumed by the next grid invocation or to the
  * user-supplied buffer.
  *
  * Data layout:
  *
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
index cb62f153e59..8211f9cf325 100644
--- a/src/gallium/drivers/radeonsi/si_texture.c
+++ b/src/gallium/drivers/radeonsi/si_texture.c
@@ -424,27 +424,31 @@ static bool si_can_disable_dcc(struct si_texture *tex)
 {
 	/* We can't disable DCC if it can be written by another process. */
 	return tex->dcc_offset &&
 	       (!tex->buffer.b.is_shared ||
 		!(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE));
 }

 static bool si_texture_discard_dcc(struct si_screen *sscreen,
 				   struct si_texture *tex)
 {
-	if (!si_can_disable_dcc(tex))
+	if (!si_can_disable_dcc(tex)) {
+		assert(tex->display_dcc_offset == 0);
 		return false;
+	}

 	assert(tex->dcc_separate_buffer == NULL);

 	/* Disable DCC. */
 	tex->dcc_offset = 0;
+	tex->display_dcc_offset = 0;
+	tex->dcc_retile_map_offset = 0;

 	/* Notify all contexts about the change. */
 	p_atomic_inc(&sscreen->dirty_tex_counter);
 	return true;
 }

 /**
  * Disable DCC for the texture. (first decompress, then discard metadata).
  *
  * There is unresolved multi-context synchronization issue between
@@ -618,21 +622,23 @@ static void si_set_tex_bo_metadata(struct si_screen *sscreen,
 	struct radeon_surf *surface = &tex->surface;
 	struct pipe_resource *res = &tex->buffer.b.b;
 	struct radeon_bo_metadata md;

 	memset(&md, 0, sizeof(md));

 	if (sscreen->info.chip_class >= GFX9) {
 		md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode;

 		if (tex->dcc_offset && !tex->dcc_separate_buffer) {
-			uint64_t dcc_offset = tex->dcc_offset;
+			uint64_t dcc_offset =
+				tex->display_dcc_offset ? tex->display_dcc_offset
+							: tex->dcc_offset;

 			assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24));
 			md.u.gfx9.dcc_offset_256B = dcc_offset >> 8;
 			md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max;
 			md.u.gfx9.dcc_independent_64B = 1;
 		}
 	} else {
 		md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ?
 					RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
 		md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ?
@@ -756,20 +762,25 @@ static bool si_has_displayable_dcc(struct si_texture *tex)
 	/* This needs a cache flush before scanout.
 	 * (it can't be scanned out and rendered to simultaneously)
 	 */
 	if (sscreen->info.use_display_dcc_unaligned &&
 	    tex->dcc_offset &&
 	    !tex->surface.u.gfx9.dcc.pipe_aligned &&
 	    !tex->surface.u.gfx9.dcc.rb_aligned)
 		return true;

+	/* This needs an explicit flush (flush_resource). */
+	if (sscreen->info.use_display_dcc_with_retile_blit &&
+	    tex->display_dcc_offset)
+		return true;
+
 	return false;
 }

 static boolean si_texture_get_handle(struct pipe_screen* screen,
 				     struct pipe_context *ctx,
 				     struct pipe_resource *resource,
 				     struct winsys_handle *whandle,
 				     unsigned usage)
 {
 	struct si_screen *sscreen = (struct si_screen*)screen;
@@ -903,23 +914,27 @@ static boolean si_texture_get_handle(struct pipe_screen* screen,
 		res->external_usage = usage;
 	}

 	return sscreen->ws->buffer_get_handle(res->buf, stride, offset,
 					      slice_size, whandle);
 }

 static void si_texture_destroy(struct pipe_screen *screen,
 			       struct pipe_resource *ptex)
 {
+	struct si_screen *sscreen = (struct si_screen*)screen;
 	struct si_texture *tex = (struct si_texture*)ptex;
 	struct si_resource *resource = &tex->buffer;

+	if (sscreen->info.chip_class >= GFX9)
+		free(tex->surface.u.gfx9.dcc_retile_map);
+
 	si_texture_reference(&tex->flushed_depth_texture, NULL);

 	if (tex->cmask_buffer != &tex->buffer) {
 		si_resource_reference(&tex->cmask_buffer, NULL);
 	}
 	pb_reference(&resource->buf, NULL);
 	si_resource_reference(&tex->dcc_separate_buffer, NULL);
 	si_resource_reference(&tex->last_dcc_separate_buffer, NULL);
 	FREE(tex);
 }

@@ -1247,24 +1262,46 @@ si_texture_create_object(struct pipe_screen *screen,
 			goto error;
 		}

 		/* Shared textures must always set up DCC here.
 		 * If it's not present, it will be disabled by
 		 * apply_opaque_metadata later.
 		 */
 		if (tex->surface.dcc_size &&
 		    (buf || !(sscreen->debug_flags & DBG(NO_DCC))) &&
 		    (sscreen->info.use_display_dcc_unaligned ||
+		     sscreen->info.use_display_dcc_with_retile_blit ||
 		     !(tex->surface.flags & RADEON_SURF_SCANOUT))) {
 			/* Add space for the DCC buffer. */
 			tex->dcc_offset = align64(tex->size, tex->surface.dcc_alignment);
 			tex->size = tex->dcc_offset + tex->surface.dcc_size;
+
+			if (sscreen->info.chip_class >= GFX9 &&
+			    tex->surface.u.gfx9.dcc_retile_num_elements) {
+				/* Add space for the displayable DCC buffer. */
+				tex->display_dcc_offset =
+					align64(tex->size, tex->surface.u.gfx9.display_dcc_alignment);
+				tex->size = tex->display_dcc_offset +
+					    tex->surface.u.gfx9.display_dcc_size;
+
+				/* Add space for the DCC retile buffer. (16-bit or 32-bit elements) */
+				tex->dcc_retile_map_offset =
+					align64(tex->size, sscreen->info.tcc_cache_line_size);
+
+				if (tex->surface.u.gfx9.dcc_retile_use_uint16) {
+					tex->size = tex->dcc_retile_map_offset +
+						    tex->surface.u.gfx9.dcc_retile_num_elements * 2;
+				} else {
+					tex->size = tex->dcc_retile_map_offset +
+						    tex->surface.u.gfx9.dcc_retile_num_elements * 4;
+				}
+			}
 		}
 	}

 	/* Now create the backing buffer. */
 	if (!buf) {
 		si_init_resource_fields(sscreen, resource, tex->size,
 					tex->surface.surf_alignment);

 		if (!si_alloc_resource(sscreen, resource))
 			goto error;
@@ -1346,20 +1383,60 @@ si_texture_create_object(struct pipe_screen *screen,
 				}

 				/* Mipmap levels without DCC. */
 				if (size != tex->surface.dcc_size) {
 					si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
 							       tex->dcc_offset + size,
 							       tex->surface.dcc_size - size,
 							       DCC_UNCOMPRESSED);
 				}
 			}
 		}
+
+		/* Upload the DCC retile map. */
+		if (tex->dcc_retile_map_offset) {
+			/* Use a staging buffer for the upload, because
+			 * the buffer backing the texture is unmappable.
+			 */
+			bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
+			unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
+			struct si_resource *buf =
+				si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM,
+							 num_elements * (use_uint16 ? 2 : 4),
+							 sscreen->info.tcc_cache_line_size);
+			uint32_t *ui = (uint32_t*)sscreen->ws->buffer_map(buf->buf, NULL,
+									  PIPE_TRANSFER_WRITE);
+			uint16_t *us = (uint16_t*)ui;
+
+			/* Upload the retile map into a staging buffer. */
+			if (use_uint16) {
+				for (unsigned i = 0; i < num_elements; i++)
+					us[i] = tex->surface.u.gfx9.dcc_retile_map[i];
+			} else {
+				for (unsigned i = 0; i < num_elements; i++)
+					ui[i] = tex->surface.u.gfx9.dcc_retile_map[i];
+			}
+
+			/* Copy the staging buffer to the buffer backing the texture. */
+			struct si_context *sctx = (struct si_context*)sscreen->aux_context;
+			struct pipe_box box;
+			u_box_1d(0, buf->b.b.width0, &box);
+
+			assert(tex->dcc_retile_map_offset <= UINT_MAX);
+			mtx_lock(&sscreen->aux_context_lock);
+			sctx->dma_copy(&sctx->b, &tex->buffer.b.b, 0,
+				       tex->dcc_retile_map_offset, 0, 0,
+				       &buf->b.b, 0, &box);
+			sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
+			mtx_unlock(&sscreen->aux_context_lock);
+
+			si_resource_reference(&buf, NULL);
+		}
 	}

 	/* Initialize the CMASK base register value. */
 	tex->cmask_base_address_reg =
 		(tex->buffer.gpu_address + tex->cmask_offset) >> 8;

 	if (sscreen->debug_flags & DBG(VM)) {
 		fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n",
 			tex->buffer.gpu_address,
 			tex->buffer.gpu_address + tex->buffer.buf->size,
@@ -1374,20 +1451,22 @@ si_texture_create_object(struct pipe_screen *screen,
 		si_print_texture_info(sscreen, tex, &log);
 		u_log_new_page_print(&log, stdout);
 		fflush(stdout);
 		u_log_context_destroy(&log);
 	}

 	return tex;

 error:
 	FREE(tex);
+	if (sscreen->info.chip_class >= GFX9)
+		free(surface->u.gfx9.dcc_retile_map);
 	return NULL;
 }

 static enum radeon_surf_mode
 si_choose_tiling(struct si_screen *sscreen,
 		 const struct pipe_resource *templ, bool tc_compatible_htile)
 {
 	const struct util_format_description *desc = util_format_description(templ->format);
 	bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING;
 	bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) &&
-- 
2.17.1
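A worked example of the dispatch sizing in si_retile_dcc above (the element
count is made up; the block size of 64 and the 4-elements-per-thread packing
mirror the patch):

    #include <stdio.h>

    /* Each retile-map element is one byte offset; elements are stored as
     * <src, dst> pairs and padded to a multiple of 4 in ac_surface.c, so
     * a thread reading one RGBA texel of image[0] consumes 2 pairs.
     * Threads run in blocks of 64; compute_last_block trims the partial
     * final block.
     */
    int main(void)
    {
        unsigned num_elements = 16320; /* hypothetical map size (4-aligned) */
        unsigned num_threads = num_elements / 4;       /* 4080 */
        unsigned num_blocks = (num_threads + 63) / 64; /* DIV_ROUND_UP -> 64 */
        unsigned last_block = num_threads % 64;        /* 48-thread partial block */

        printf("threads=%u blocks=%u last_block=%u\n",
               num_threads, num_blocks, last_block);
        return 0;
    }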