From: Marek Olšák <marek.ol...@amd.com>

A compute shader is used to reorder DCC data from aligned to unaligned.
---
 src/amd/common/ac_gpu_info.c                  |   1 +
 src/amd/common/ac_gpu_info.h                  |   3 +
 src/amd/common/ac_surface.c                   | 125 ++++++++++++++++--
 src/amd/common/ac_surface.h                   |  15 ++-
 src/gallium/drivers/radeonsi/cik_sdma.c       |   3 +-
 src/gallium/drivers/radeonsi/si_blit.c        |   3 +
 .../drivers/radeonsi/si_compute_blit.c        |  80 +++++++++++
 src/gallium/drivers/radeonsi/si_pipe.c        |   2 +
 src/gallium/drivers/radeonsi/si_pipe.h        |  15 ++-
 .../drivers/radeonsi/si_shaderlib_tgsi.c      |  73 ++++++++++
 src/gallium/drivers/radeonsi/si_texture.c     |  83 +++++++++++-
 11 files changed, 389 insertions(+), 14 deletions(-)
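Conceptually, the retile map built in ac_surface.c is a flat array of
<aligned_offset, unaligned_offset> byte pairs, and the retile blit scatters
DCC bytes through it. A CPU-side sketch of the same reordering (a hypothetical
helper, not part of the patch; the real work is done by the
si_create_dcc_retile_cs compute shader below):

    #include <stdint.h>

    /* "map" holds num_elements offsets laid out as <src, dst> pairs; each
     * pair moves one byte of DCC metadata from the RB/pipe-aligned layout
     * to the unaligned layout that display hardware expects. The compute
     * shader does the same thing, with one thread consuming two pairs.
     */
    static void retile_dcc_cpu(const uint32_t *map, unsigned num_elements,
                               const uint8_t *aligned_dcc,
                               uint8_t *display_dcc)
    {
        for (unsigned i = 0; i + 1 < num_elements; i += 2)
            display_dcc[map[i + 1]] = aligned_dcc[map[i]];
    }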
diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index a6d249a6d2f..d890172227c 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -497,20 +497,21 @@ void ac_print_gpu_info(struct radeon_info *info)
 	       info->pci_dev, info->pci_func);
 	printf("    pci_id = 0x%x\n", info->pci_id);
 	printf("    family = %i\n", info->family);
 	printf("    chip_class = %i\n", info->chip_class);
 	printf("    num_compute_rings = %u\n", info->num_compute_rings);
 	printf("    num_sdma_rings = %i\n", info->num_sdma_rings);
 	printf("    clock_crystal_freq = %i\n", info->clock_crystal_freq);
 	printf("    tcc_cache_line_size = %u\n", info->tcc_cache_line_size);
 	printf("    use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned);
+	printf("    use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit);

 	printf("Memory info:\n");
 	printf("    pte_fragment_size = %u\n", info->pte_fragment_size);
 	printf("    gart_page_size = %u\n", info->gart_page_size);
 	printf("    gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size, 1024*1024));
 	printf("    vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size, 1024*1024));
 	printf("    vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size, 1024*1024));
 	printf("    gds_size = %u kB\n", info->gds_size / 1024);
 	printf("    gds_gfx_partition_size = %u kB\n", info->gds_gfx_partition_size / 1024);
 	printf("    max_alloc_size = %i MB\n",
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 99fed520618..5241c28f2a7 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -49,22 +49,25 @@ struct radeon_info {
 	/* Device info. */
 	const char *name;
 	uint32_t pci_id;
 	enum radeon_family family;
 	enum chip_class chip_class;
 	uint32_t num_compute_rings;
 	uint32_t num_sdma_rings;
 	uint32_t clock_crystal_freq;
 	uint32_t tcc_cache_line_size;

+	/* There are 2 display DCC codepaths, because display expects unaligned DCC. */
 	/* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */
 	bool use_display_dcc_unaligned;
+	/* Allocate both aligned and unaligned DCC and use the retile blit. */
+	bool use_display_dcc_with_retile_blit;

 	/* Memory info. */
 	uint32_t pte_fragment_size;
 	uint32_t gart_page_size;
 	uint64_t gart_size;
 	uint64_t vram_size;
 	uint64_t vram_vis_size;
 	unsigned gds_size;
 	unsigned gds_gfx_partition_size;
 	uint64_t max_alloc_size;
diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c
index 6802ab2badb..7225317f3e7 100644
--- a/src/amd/common/ac_surface.c
+++ b/src/amd/common/ac_surface.c
@@ -1072,20 +1072,21 @@ gfx9_get_preferred_swizzle_mode(ADDR_HANDLE addrlib,
 	ret = Addr2GetPreferredSurfaceSetting(addrlib, &sin, &sout);
 	if (ret != ADDR_OK)
 		return ret;

 	*swizzle_mode = sout.swizzleMode;
 	return 0;
 }

 static int gfx9_compute_miptree(ADDR_HANDLE addrlib,
+				const struct radeon_info *info,
 				const struct ac_surf_config *config,
 				struct radeon_surf *surf, bool compressed,
 				ADDR2_COMPUTE_SURFACE_INFO_INPUT *in)
 {
 	ADDR2_MIP_INFO mip_info[RADEON_SURF_MAX_LEVELS] = {};
 	ADDR2_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
 	ADDR_E_RETURNCODE ret;

 	out.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_OUTPUT);
 	out.pMipInfo = mip_info;
@@ -1209,21 +1210,20 @@ static int gfx9_compute_miptree(ADDR_HANDLE addrlib,
 			din.numFrags = in->numFrags;
 			din.numMipLevels = in->numMipLevels;
 			din.dataSurfaceSize = out.surfSize;

 			ret = Addr2ComputeDccInfo(addrlib, &din, &dout);
 			if (ret != ADDR_OK)
 				return ret;

 			surf->u.gfx9.dcc.rb_aligned = din.dccKeyFlags.rbAligned;
 			surf->u.gfx9.dcc.pipe_aligned = din.dccKeyFlags.pipeAligned;
-			surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1;
 			surf->dcc_size = dout.dccRamSize;
 			surf->dcc_alignment = dout.dccRamBaseAlign;
 			surf->num_dcc_levels = in->numMipLevels;

 			/* Disable DCC for levels that are in the mip tail.
 			 *
 			 * There are two issues that this is intended to
 			 * address:
 			 *
 			 * 1. Multiple mip levels may share a cache line. This
@@ -1245,20 +1245,120 @@ static int gfx9_compute_miptree(ADDR_HANDLE addrlib,
 			 */
 			for (unsigned i = 0; i < in->numMipLevels; i++) {
 				if (meta_mip_info[i].inMiptail) {
 					surf->num_dcc_levels = i;
 					break;
 				}
 			}

 			if (!surf->num_dcc_levels)
 				surf->dcc_size = 0;
+
+			surf->u.gfx9.display_dcc_size = surf->dcc_size;
+			surf->u.gfx9.display_dcc_alignment = surf->dcc_alignment;
+			surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1;
+
+			/* Compute displayable DCC. */
+			if (in->flags.display &&
+			    surf->num_dcc_levels &&
+			    info->use_display_dcc_with_retile_blit) {
+				/* Compute displayable DCC info. */
+				din.dccKeyFlags.pipeAligned = 0;
+				din.dccKeyFlags.rbAligned = 0;
+
+				assert(din.numSlices == 1);
+				assert(din.numMipLevels == 1);
+				assert(din.numFrags == 1);
+				assert(surf->tile_swizzle == 0);
+				assert(surf->u.gfx9.dcc.pipe_aligned ||
+				       surf->u.gfx9.dcc.rb_aligned);
+
+				ret = Addr2ComputeDccInfo(addrlib, &din, &dout);
+				if (ret != ADDR_OK)
+					return ret;
+
+				surf->u.gfx9.display_dcc_size = dout.dccRamSize;
+				surf->u.gfx9.display_dcc_alignment = dout.dccRamBaseAlign;
+				surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1;
+				assert(surf->u.gfx9.display_dcc_size <= surf->dcc_size);
+
+				/* Compute address mapping from non-displayable to displayable DCC. */
+				ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT addrin = {};
+				addrin.size = sizeof(addrin);
+				addrin.colorFlags.color = 1;
+				addrin.swizzleMode = din.swizzleMode;
+				addrin.resourceType = din.resourceType;
+				addrin.bpp = din.bpp;
+				addrin.unalignedWidth = din.unalignedWidth;
+				addrin.unalignedHeight = din.unalignedHeight;
+				addrin.numSlices = 1;
+				addrin.numMipLevels = 1;
+				addrin.numFrags = 1;
+
+				ADDR2_COMPUTE_DCC_ADDRFROMCOORD_OUTPUT addrout = {};
+				addrout.size = sizeof(addrout);
+
+				surf->u.gfx9.dcc_retile_num_elements =
+					DIV_ROUND_UP(in->width, dout.compressBlkWidth) *
+					DIV_ROUND_UP(in->height, dout.compressBlkHeight) * 2;
+				/* Align the size to 4 (for the compute shader). */
+				surf->u.gfx9.dcc_retile_num_elements =
+					align(surf->u.gfx9.dcc_retile_num_elements, 4);
+
+				surf->u.gfx9.dcc_retile_map =
+					malloc(surf->u.gfx9.dcc_retile_num_elements * 4);
+				if (!surf->u.gfx9.dcc_retile_map)
+					return ADDR_OUTOFMEMORY;
+
+				unsigned index = 0;
+				surf->u.gfx9.dcc_retile_use_uint16 = true;
+
+				for (unsigned y = 0; y < in->height; y += dout.compressBlkHeight) {
+					addrin.y = y;
+
+					for (unsigned x = 0; x < in->width; x += dout.compressBlkWidth) {
+						addrin.x = x;
+
+						/* Compute src DCC address */
+						addrin.dccKeyFlags.pipeAligned = surf->u.gfx9.dcc.pipe_aligned;
+						addrin.dccKeyFlags.rbAligned = surf->u.gfx9.dcc.rb_aligned;
+						addrout.addr = 0;
+
+						ret = Addr2ComputeDccAddrFromCoord(addrlib, &addrin, &addrout);
+						if (ret != ADDR_OK)
+							return ret;
+
+						surf->u.gfx9.dcc_retile_map[index * 2] = addrout.addr;
+						if (addrout.addr > USHRT_MAX)
+							surf->u.gfx9.dcc_retile_use_uint16 = false;
+
+						/* Compute dst DCC address */
+						addrin.dccKeyFlags.pipeAligned = 0;
+						addrin.dccKeyFlags.rbAligned = 0;
+						addrout.addr = 0;
+
+						ret = Addr2ComputeDccAddrFromCoord(addrlib, &addrin, &addrout);
+						if (ret != ADDR_OK)
+							return ret;
+
+						surf->u.gfx9.dcc_retile_map[index * 2 + 1] = addrout.addr;
+						if (addrout.addr > USHRT_MAX)
+							surf->u.gfx9.dcc_retile_use_uint16 = false;
+
+						assert(index * 2 + 1 < surf->u.gfx9.dcc_retile_num_elements);
+						index++;
+					}
+				}
+				/* Fill the remaining pairs with the last one (for the compute shader). */
+				for (unsigned i = index * 2; i < surf->u.gfx9.dcc_retile_num_elements; i++)
+					surf->u.gfx9.dcc_retile_map[i] = surf->u.gfx9.dcc_retile_map[i - 2];
+			}
 		}

 		/* FMASK */
 		if (in->numSamples > 1) {
 			ADDR2_COMPUTE_FMASK_INFO_INPUT fin = {0};
 			ADDR2_COMPUTE_FMASK_INFO_OUTPUT fout = {0};

 			fin.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_INPUT);
 			fout.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_OUTPUT);
@@ -1494,56 +1594,59 @@ static int gfx9_compute_surface(ADDR_HANDLE addrlib,
 	surf->num_dcc_levels = 0;
 	surf->surf_size = 0;
 	surf->fmask_size = 0;
 	surf->dcc_size = 0;
 	surf->htile_size = 0;
 	surf->htile_slice_size = 0;
 	surf->u.gfx9.surf_offset = 0;
 	surf->u.gfx9.stencil_offset = 0;
 	surf->cmask_size = 0;
+	surf->u.gfx9.dcc_retile_use_uint16 = false;
+	surf->u.gfx9.dcc_retile_num_elements = 0;
+	surf->u.gfx9.dcc_retile_map = NULL;

 	/* Calculate texture layout information. */
-	r = gfx9_compute_miptree(addrlib, config, surf, compressed,
+	r = gfx9_compute_miptree(addrlib, info, config, surf, compressed,
 				 &AddrSurfInfoIn);
 	if (r)
-		return r;
+		goto error;

 	/* Calculate texture layout information for stencil. */
 	if (surf->flags & RADEON_SURF_SBUFFER) {
 		AddrSurfInfoIn.flags.stencil = 1;
 		AddrSurfInfoIn.bpp = 8;
 		AddrSurfInfoIn.format = ADDR_FMT_8;

 		if (!AddrSurfInfoIn.flags.depth) {
 			r = gfx9_get_preferred_swizzle_mode(addrlib, &AddrSurfInfoIn, false,
 							    &AddrSurfInfoIn.swizzleMode);
 			if (r)
-				return r;
+				goto error;
 		} else
 			AddrSurfInfoIn.flags.depth = 0;

-		r = gfx9_compute_miptree(addrlib, config, surf, compressed,
+		r = gfx9_compute_miptree(addrlib, info, config, surf, compressed,
 					 &AddrSurfInfoIn);
 		if (r)
-			return r;
+			goto error;
 	}

 	surf->is_linear = surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR;

 	/* Query whether the surface is displayable. */
 	bool displayable = false;
 	if (!config->is_3d && !config->is_cube) {
 		r = Addr2IsValidDisplaySwizzleMode(addrlib, surf->u.gfx9.surf.swizzle_mode,
 						   surf->bpe * 8, &displayable);
 		if (r)
-			return r;
+			goto error;

 		/* Display needs unaligned DCC. */
 		if (info->use_display_dcc_unaligned &&
 		    surf->num_dcc_levels &&
 		    (surf->u.gfx9.dcc.pipe_aligned ||
 		     surf->u.gfx9.dcc.rb_aligned))
 			displayable = false;
 	}
 	surf->is_displayable = displayable;
@@ -1581,38 +1684,44 @@ static int gfx9_compute_surface(ADDR_HANDLE addrlib,
 	case ADDR_SW_64KB_R_T:
 	case ADDR_SW_4KB_R_X:
 	case ADDR_SW_64KB_R_X:
 	case ADDR_SW_VAR_R_X:
 		/* The rotated micro tile mode doesn't work if both CMASK and RB+ are
 		 * used at the same time. This case is not currently expected to occur
 		 * because we don't use rotated. Enforce this restriction on all chips
 		 * to facilitate testing.
 		 */
 		assert(!"rotate micro tile mode is unsupported");
-		return ADDR_ERROR;
+		r = ADDR_ERROR;
+		goto error;

 	/* Z = depth. */
 	case ADDR_SW_4KB_Z:
 	case ADDR_SW_64KB_Z:
 	case ADDR_SW_VAR_Z:
 	case ADDR_SW_64KB_Z_T:
 	case ADDR_SW_4KB_Z_X:
 	case ADDR_SW_64KB_Z_X:
 	case ADDR_SW_VAR_Z_X:
 		surf->micro_tile_mode = RADEON_MICRO_MODE_DEPTH;
 		break;

 	default:
 		assert(0);
 	}

 	return 0;
+
+error:
+	free(surf->u.gfx9.dcc_retile_map);
+	surf->u.gfx9.dcc_retile_map = NULL;
+	return r;
 }

 int ac_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *info,
 		       const struct ac_surf_config *config,
 		       enum radeon_surf_mode mode,
 		       struct radeon_surf *surf)
 {
 	int r;

 	r = surf_config_sanity(config, surf->flags);
diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h
index eb50c37c3c2..10d25e23d32 100644
--- a/src/amd/common/ac_surface.h
+++ b/src/amd/common/ac_surface.h
@@ -20,20 +20,21 @@
  *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
  */

 #ifndef AC_SURFACE_H
 #define AC_SURFACE_H

 #include <stdint.h>
+#include <stdbool.h>

 #include "amd_family.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /* Forward declarations. */
 typedef void* ADDR_HANDLE;

@@ -142,23 +143,33 @@ struct gfx9_surf_layout {
 	enum gfx9_resource_type resource_type; /* 1D, 2D or 3D */
 	uint16_t surf_pitch; /* in blocks */
 	uint16_t surf_height;

 	uint64_t surf_offset; /* 0 unless imported with an offset */
 	/* The size of the 2D plane containing all mipmap levels. */
 	uint64_t surf_slice_size;
 	/* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */
 	uint32_t offset[RADEON_SURF_MAX_LEVELS];

-	uint16_t display_dcc_pitch_max; /* (mip chain pitch - 1) */
-
 	uint64_t stencil_offset; /* separate stencil */
+
+	/* Displayable DCC. This is always rb_aligned=0 and pipe_aligned=0.
+	 * The 3D engine doesn't support that layout except for chips with 1 RB.
+	 * All other chips must set rb_aligned=1.
+	 * A compute shader needs to convert from aligned DCC to unaligned.
+	 */
+	uint32_t display_dcc_size;
+	uint32_t display_dcc_alignment;
+	uint16_t display_dcc_pitch_max; /* (mip chain pitch - 1) */
+	bool dcc_retile_use_uint16; /* if all values fit into uint16_t */
+	uint32_t dcc_retile_num_elements;
+	uint32_t *dcc_retile_map;
 };

 struct radeon_surf {
 	/* Format properties. */
 	unsigned blk_w:4;
 	unsigned blk_h:4;
 	unsigned bpe:5;

 	/* Number of mipmap levels where DCC is enabled starting from level 0.
 	 * Non-zero levels may be disabled due to alignment constraints, but not
 	 * the first level.
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 096f75e508f..da9b25a442d 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -489,21 +489,22 @@ static void cik_sdma_copy(struct pipe_context *ctx,
 			  unsigned src_level,
 			  const struct pipe_box *src_box)
 {
 	struct si_context *sctx = (struct si_context *)ctx;

 	if (!sctx->dma_cs ||
 	    src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
 	    dst->flags & PIPE_RESOURCE_FLAG_SPARSE)
 		goto fallback;

-	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+	/* If src is a buffer and dst is a texture, we are uploading metadata. */
+	if (src->target == PIPE_BUFFER) {
 		cik_sdma_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width);
 		return;
 	}

 	if ((sctx->chip_class == CIK || sctx->chip_class == VI) &&
 	    cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz,
 				  src, src_level, src_box))
 		return;

 fallback:
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 7613a63e3cb..610de289a20 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -1311,20 +1311,23 @@ static void si_flush_resource(struct pipe_context *ctx,
 	/* st/dri calls flush twice per frame (not a bug), this prevents double
 	 * decompression. */
 	if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty)
 		return;

 	if (!tex->is_depth && (tex->cmask_buffer || tex->dcc_offset)) {
 		si_blit_decompress_color(sctx, tex, 0, res->last_level,
 					 0, util_max_layer(res, 0),
 					 tex->dcc_separate_buffer != NULL);
+
+		if (tex->display_dcc_offset)
+			si_retile_dcc(sctx, tex);
 	}

 	/* Always do the analysis even if DCC is disabled at the moment. */
 	if (tex->dcc_gather_statistics) {
 		bool separate_dcc_dirty = tex->separate_dcc_dirty;

 		/* If the color buffer hasn't been unbound and fast clear hasn't
 		 * been used, separate_dcc_dirty is false, but there may have been
 		 * new rendering. Check if the color buffer is bound and assume
 		 * it's dirty.
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 2ce56d6a81a..3935d9c754d 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -412,20 +412,100 @@ void si_compute_copy_image(struct si_context *sctx,
 	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
 		       (sctx->chip_class <= VI ?
 			SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
 		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);

 	ctx->bind_compute_state(ctx, saved_cs);
 	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
 	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
 	si_compute_internal_end(sctx);
 }

+void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
+{
+	struct pipe_context *ctx = &sctx->b;
+
+	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		       SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
+		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
+	si_emit_cache_flush(sctx);
+
+	/* Save states. */
+	void *saved_cs = sctx->cs_shader_state.program;
+	struct pipe_image_view saved_img[3] = {};
+
+	for (unsigned i = 0; i < 3; i++) {
+		util_copy_image_view(&saved_img[i],
+				     &sctx->images[PIPE_SHADER_COMPUTE].views[i]);
+	}
+
+	/* Set images. */
+	bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
+	unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
+	struct pipe_image_view img[3];
+
+	assert(tex->dcc_retile_map_offset && tex->dcc_retile_map_offset <= UINT_MAX);
+	assert(tex->dcc_offset && tex->dcc_offset <= UINT_MAX);
+	assert(tex->display_dcc_offset && tex->display_dcc_offset <= UINT_MAX);
+
+	for (unsigned i = 0; i < 3; i++) {
+		img[i].resource = &tex->buffer.b.b;
+		img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ;
+		img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER;
+	}
+
+	img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT :
+				     PIPE_FORMAT_R32G32B32A32_UINT;
+	img[0].u.buf.offset = tex->dcc_retile_map_offset;
+	img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4);
+
+	img[1].format = PIPE_FORMAT_R8_UINT;
+	img[1].u.buf.offset = tex->dcc_offset;
+	img[1].u.buf.size = tex->surface.dcc_size;
+
+	img[2].format = PIPE_FORMAT_R8_UINT;
+	img[2].u.buf.offset = tex->display_dcc_offset;
+	img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size;
+
+	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img);
+
+	/* Bind the compute shader. */
+	if (!sctx->cs_dcc_retile)
+		sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx);
+	ctx->bind_compute_state(ctx, sctx->cs_dcc_retile);
+
+	/* Dispatch compute. */
+	/* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */
+	unsigned num_threads = num_elements / 4;
+
+	struct pipe_grid_info info = {};
+	info.block[0] = 64;
+	info.block[1] = 1;
+	info.block[2] = 1;
+	info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */
+	info.grid[1] = 1;
+	info.grid[2] = 1;
+	sctx->compute_last_block[0] = num_threads % 64;
+
+	ctx->launch_grid(ctx, &info);
+
+	sctx->compute_last_block[0] = 0; /* reset */
+
+	/* Don't flush caches or wait. The driver will wait at the end of this IB,
+	 * and L2 will be flushed by the kernel fence.
+	 */
+
+	/* Restore states. */
+	ctx->bind_compute_state(ctx, saved_cs);
+	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);
+}
+
 void si_init_compute_blit_functions(struct si_context *sctx)
 {
 	sctx->b.clear_buffer = si_pipe_clear_buffer;
 }

 /* Clear a region of a color surface to a constant value. */
 void si_compute_clear_render_target(struct pipe_context *ctx,
 				    struct pipe_surface *dstsurf,
 				    const union pipe_color_union *color,
 				    unsigned dstx, unsigned dsty,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 9b1eab8284b..aa79a4e967c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -202,20 +202,22 @@ static void si_destroy_context(struct pipe_context *context)
 	if (sctx->cs_copy_buffer)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
 	if (sctx->cs_copy_image)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
 	if (sctx->cs_copy_image_1d_array)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
 	if (sctx->cs_clear_render_target)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target);
 	if (sctx->cs_clear_render_target_1d_array)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array);
+	if (sctx->cs_dcc_retile)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile);

 	if (sctx->blitter)
 		util_blitter_destroy(sctx->blitter);

 	/* Release DCC stats. */
 	for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
 		assert(!sctx->dcc_stats[i].query_active);

 		for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
 			if (sctx->dcc_stats[i].ps_stats[j])
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 6765dcb3275..d1e7b8c0b4a 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -270,26 +270,36 @@ struct si_transfer {
 	unsigned offset;
 };

 struct si_texture {
 	struct si_resource buffer;

 	struct radeon_surf surface;
 	uint64_t size;
 	struct si_texture *flushed_depth_texture;

-	/* Colorbuffer compression and fast clear. */
+	/* One texture allocation can contain these buffers:
+	 * - image (pixel data)
+	 * - FMASK buffer (MSAA compression)
+	 * - CMASK buffer (MSAA compression and/or legacy fast color clear)
+	 * - HTILE buffer (Z/S compression and fast Z/S clear)
+	 * - DCC buffer (color compression and new fast color clear)
+	 * - displayable DCC buffer (if the DCC buffer is not displayable)
+	 * - DCC retile mapping buffer (if the DCC buffer is not displayable)
+	 */
 	uint64_t fmask_offset;
 	uint64_t cmask_offset;
 	uint64_t cmask_base_address_reg;
 	struct si_resource *cmask_buffer;
 	uint64_t dcc_offset; /* 0 = disabled */
+	uint64_t display_dcc_offset;
+	uint64_t dcc_retile_map_offset;
 	unsigned cb_color_info; /* fast clear enable bit */
 	unsigned color_clear_value[2];
 	unsigned last_msaa_resolve_target_micro_mode;
 	unsigned num_level0_transfers;

 	/* Depth buffer compression and fast clear. */
 	uint64_t htile_offset;
 	float depth_clear_value;
 	uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */
 	uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
@@ -820,20 +830,21 @@ struct si_context {
 	void *vs_blit_pos_layered;
 	void *vs_blit_color;
 	void *vs_blit_color_layered;
 	void *vs_blit_texcoord;
 	void *cs_clear_buffer;
 	void *cs_copy_buffer;
 	void *cs_copy_image;
 	void *cs_copy_image_1d_array;
 	void *cs_clear_render_target;
 	void *cs_clear_render_target_1d_array;
+	void *cs_dcc_retile;
 	struct si_screen *screen;
 	struct pipe_debug_callback debug;
 	struct ac_llvm_compiler compiler; /* only non-threaded compilation */
 	struct si_shader_ctx_state fixed_func_tcs_shader;
 	struct si_resource *wait_mem_scratch;
 	unsigned wait_mem_number;
 	uint16_t prefetch_L2_mask;

 	bool has_graphics;
 	bool gfx_flush_in_progress:1;
@@ -1210,20 +1221,21 @@ void si_compute_copy_image(struct si_context *sctx,
 			   struct pipe_resource *src,
 			   unsigned src_level,
 			   unsigned dstx, unsigned dsty, unsigned dstz,
 			   const struct pipe_box *src_box);
 void si_compute_clear_render_target(struct pipe_context *ctx,
 				    struct pipe_surface *dstsurf,
 				    const union pipe_color_union *color,
 				    unsigned dstx, unsigned dsty,
 				    unsigned width, unsigned height,
 				    bool render_condition_enabled);
+void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
 void si_init_compute_blit_functions(struct si_context *sctx);

 /* si_cp_dma.c */
 #define SI_CPDMA_SKIP_CHECK_CS_SPACE	(1 << 0) /* don't call need_cs_space */
 #define SI_CPDMA_SKIP_SYNC_AFTER	(1 << 1) /* don't wait for DMA after the copy */
 #define SI_CPDMA_SKIP_SYNC_BEFORE	(1 << 2) /* don't wait for DMA before the copy (RAW hazards) */
 #define SI_CPDMA_SKIP_GFX_SYNC		(1 << 3) /* don't flush caches and don't wait for PS/CS */
 #define SI_CPDMA_SKIP_BO_LIST_UPDATE	(1 << 4) /* don't update the BO list */
 #define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \
 			   SI_CPDMA_SKIP_SYNC_AFTER | \
@@ -1328,20 +1340,21 @@ void si_resume_queries(struct si_context *sctx);
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
 			unsigned num_layers);
 void *si_create_fixed_func_tcs(struct si_context *sctx);
 void *si_create_dma_compute_shader(struct pipe_context *ctx,
 				   unsigned num_dwords_per_thread,
 				   bool dst_stream_cache_policy, bool is_copy);
 void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
 void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
 void *si_clear_render_target_shader(struct pipe_context *ctx);
 void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
+void *si_create_dcc_retile_cs(struct pipe_context *ctx);
 void *si_create_query_result_cs(struct si_context *sctx);

 /* si_test_dma.c */
 void si_test_dma(struct si_screen *sscreen);

 /* si_test_clearbuffer.c */
 void si_test_dma_perf(struct si_screen *sscreen);

 /* si_uvd.c */
 struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index 8ff9ebda9ba..b68fd2ff236 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -219,20 +219,93 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx,
 	state.prog = ureg_get_tokens(ureg, NULL);

 	void *cs = ctx->create_compute_state(ctx, &state);
 	ureg_destroy(ureg);
 	ureg_free_tokens(state.prog);
 	free(values);
 	return cs;
 }

+/* Create a compute shader that copies DCC from one buffer to another
+ * where each DCC buffer has a different layout.
+ *
+ * image[0]: offset remap table (pairs of <src_offset, dst_offset>),
+ *           2 pairs are read
+ * image[1]: DCC source buffer, typed r8_uint
+ * image[2]: DCC destination buffer, typed r8_uint
+ */
+void *si_create_dcc_retile_cs(struct pipe_context *ctx)
+{
+	struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+	if (!ureg)
+		return NULL;
+
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+	/* Compute the global thread ID (in idx). */
+	struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+	struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+	struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg),
+					     TGSI_WRITEMASK_X);
+	ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid);
+
+	/* Load 2 pairs of offsets for DCC load & store. */
+	struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false);
+	struct ureg_dst offsets = ureg_DECL_temporary(ureg);
+	struct ureg_src map_load_args[] = {map, ureg_src(idx)};
+
+	ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2,
+			 TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
+
+	struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER,
+						  0, false, false);
+	struct ureg_dst dcc_dst = ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER,
+							   0, true, false));
+	struct ureg_dst dcc_value[2];
+
+	/* Copy DCC values:
+	 *   dst[offsets.y] = src[offsets.x];
+	 *   dst[offsets.w] = src[offsets.z];
+	 */
+	for (unsigned i = 0; i < 2; i++) {
+		dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+
+		struct ureg_src load_args[] =
+			{dcc_src, ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i*2)};
+		ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2,
+				 TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
+	}
+
+	dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X);
+
+	for (unsigned i = 0; i < 2; i++) {
+		struct ureg_src store_args[] = {
+			ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i*2),
+			ureg_src(dcc_value[i])
+		};
+		ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2,
+				 TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0);
+	}
+	ureg_END(ureg);
+
+	struct pipe_compute_state state = {};
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = ureg_get_tokens(ureg, NULL);
+
+	void *cs = ctx->create_compute_state(ctx, &state);
+	ureg_destroy(ureg);
+	return cs;
+}
+
 /* Create the compute shader that is used to collect the results.
  *
  * One compute grid with a single thread is launched for every query result
  * buffer. The thread (optionally) reads a previous summary buffer, then
  * accumulates data from the query result buffer, and writes the result either
  * to a summary buffer to be consumed by the next grid invocation or to the
  * user-supplied buffer.
  *
  * Data layout:
  *
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
index cb62f153e59..8211f9cf325 100644
--- a/src/gallium/drivers/radeonsi/si_texture.c
+++ b/src/gallium/drivers/radeonsi/si_texture.c
@@ -424,27 +424,31 @@ static bool si_can_disable_dcc(struct si_texture *tex)
 {
 	/* We can't disable DCC if it can be written by another process. */
 	return tex->dcc_offset &&
 	       (!tex->buffer.b.is_shared ||
 		!(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE));
 }

 static bool si_texture_discard_dcc(struct si_screen *sscreen,
 				   struct si_texture *tex)
 {
-	if (!si_can_disable_dcc(tex))
+	if (!si_can_disable_dcc(tex)) {
+		assert(tex->display_dcc_offset == 0);
 		return false;
+	}

 	assert(tex->dcc_separate_buffer == NULL);

 	/* Disable DCC. */
 	tex->dcc_offset = 0;
+	tex->display_dcc_offset = 0;
+	tex->dcc_retile_map_offset = 0;

 	/* Notify all contexts about the change. */
 	p_atomic_inc(&sscreen->dirty_tex_counter);
 	return true;
 }

 /**
  * Disable DCC for the texture. (first decompress, then discard metadata).
  *
  * There is unresolved multi-context synchronization issue between
@@ -618,21 +622,23 @@ static void si_set_tex_bo_metadata(struct si_screen *sscreen,
 	struct radeon_surf *surface = &tex->surface;
 	struct pipe_resource *res = &tex->buffer.b.b;
 	struct radeon_bo_metadata md;

 	memset(&md, 0, sizeof(md));

 	if (sscreen->info.chip_class >= GFX9) {
 		md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode;

 		if (tex->dcc_offset && !tex->dcc_separate_buffer) {
-			uint64_t dcc_offset = tex->dcc_offset;
+			uint64_t dcc_offset =
+				tex->display_dcc_offset ? tex->display_dcc_offset
+							: tex->dcc_offset;

 			assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24));
 			md.u.gfx9.dcc_offset_256B = dcc_offset >> 8;
 			md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max;
 			md.u.gfx9.dcc_independent_64B = 1;
 		}
 	} else {
 		md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ?
 					RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
 		md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ?
@@ -756,20 +762,25 @@ static bool si_has_displayable_dcc(struct si_texture *tex)
 	/* This needs a cache flush before scanout.
 	 * (it can't be scanned out and rendered to simultaneously)
 	 */
 	if (sscreen->info.use_display_dcc_unaligned &&
 	    tex->dcc_offset &&
 	    !tex->surface.u.gfx9.dcc.pipe_aligned &&
 	    !tex->surface.u.gfx9.dcc.rb_aligned)
 		return true;

+	/* This needs an explicit flush (flush_resource). */
+	if (sscreen->info.use_display_dcc_with_retile_blit &&
+	    tex->display_dcc_offset)
+		return true;
+
 	return false;
 }

 static boolean si_texture_get_handle(struct pipe_screen* screen,
 				     struct pipe_context *ctx,
 				     struct pipe_resource *resource,
 				     struct winsys_handle *whandle,
 				     unsigned usage)
 {
 	struct si_screen *sscreen = (struct si_screen*)screen;
@@ -903,23 +914,27 @@ static boolean si_texture_get_handle(struct pipe_screen* screen,
 		res->external_usage = usage;
 	}

 	return sscreen->ws->buffer_get_handle(res->buf, stride, offset,
 					      slice_size, whandle);
 }

 static void si_texture_destroy(struct pipe_screen *screen,
 			       struct pipe_resource *ptex)
 {
+	struct si_screen *sscreen = (struct si_screen*)screen;
 	struct si_texture *tex = (struct si_texture*)ptex;
 	struct si_resource *resource = &tex->buffer;

+	if (sscreen->info.chip_class >= GFX9)
+		free(tex->surface.u.gfx9.dcc_retile_map);
+
 	si_texture_reference(&tex->flushed_depth_texture, NULL);

 	if (tex->cmask_buffer != &tex->buffer) {
 		si_resource_reference(&tex->cmask_buffer, NULL);
 	}
 	pb_reference(&resource->buf, NULL);
 	si_resource_reference(&tex->dcc_separate_buffer, NULL);
 	si_resource_reference(&tex->last_dcc_separate_buffer, NULL);
 	FREE(tex);
 }

@@ -1247,24 +1262,46 @@ si_texture_create_object(struct pipe_screen *screen,
 			goto error;
 		}

 		/* Shared textures must always set up DCC here.
 		 * If it's not present, it will be disabled by
 		 * apply_opaque_metadata later.
 		 */
 		if (tex->surface.dcc_size &&
 		    (buf || !(sscreen->debug_flags & DBG(NO_DCC))) &&
 		    (sscreen->info.use_display_dcc_unaligned ||
+		     sscreen->info.use_display_dcc_with_retile_blit ||
 		     !(tex->surface.flags & RADEON_SURF_SCANOUT))) {
 			/* Add space for the DCC buffer. */
 			tex->dcc_offset = align64(tex->size, tex->surface.dcc_alignment);
 			tex->size = tex->dcc_offset + tex->surface.dcc_size;
+
+			if (sscreen->info.chip_class >= GFX9 &&
+			    tex->surface.u.gfx9.dcc_retile_num_elements) {
+				/* Add space for the displayable DCC buffer. */
+				tex->display_dcc_offset =
+					align64(tex->size, tex->surface.u.gfx9.display_dcc_alignment);
+				tex->size = tex->display_dcc_offset +
+					    tex->surface.u.gfx9.display_dcc_size;
+
+				/* Add space for the DCC retile buffer. (16-bit or 32-bit elements) */
+				tex->dcc_retile_map_offset =
+					align64(tex->size, sscreen->info.tcc_cache_line_size);
+
+				if (tex->surface.u.gfx9.dcc_retile_use_uint16) {
+					tex->size = tex->dcc_retile_map_offset +
+						    tex->surface.u.gfx9.dcc_retile_num_elements * 2;
+				} else {
+					tex->size = tex->dcc_retile_map_offset +
+						    tex->surface.u.gfx9.dcc_retile_num_elements * 4;
+				}
+			}
 		}
 	}

 	/* Now create the backing buffer. */
 	if (!buf) {
 		si_init_resource_fields(sscreen, resource, tex->size,
 					tex->surface.surf_alignment);

 		if (!si_alloc_resource(sscreen, resource))
 			goto error;
@@ -1346,20 +1383,60 @@ si_texture_create_object(struct pipe_screen *screen,
 				}

 				/* Mipmap levels without DCC. */
 				if (size != tex->surface.dcc_size) {
 					si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
 							       tex->dcc_offset + size,
 							       tex->surface.dcc_size - size,
 							       DCC_UNCOMPRESSED);
 				}
 			}
 		}
+
+		/* Upload the DCC retile map. */
+		if (tex->dcc_retile_map_offset) {
+			/* Use a staging buffer for the upload, because
+			 * the buffer backing the texture is unmappable.
+			 */
+			bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16;
+			unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements;
+			struct si_resource *buf =
+				si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM,
+							 num_elements * (use_uint16 ? 2 : 4),
+							 sscreen->info.tcc_cache_line_size);
+			uint32_t *ui = (uint32_t*)sscreen->ws->buffer_map(buf->buf, NULL,
+									  PIPE_TRANSFER_WRITE);
+			uint16_t *us = (uint16_t*)ui;
+
+			/* Upload the retile map into a staging buffer. */
+			if (use_uint16) {
+				for (unsigned i = 0; i < num_elements; i++)
+					us[i] = tex->surface.u.gfx9.dcc_retile_map[i];
+			} else {
+				for (unsigned i = 0; i < num_elements; i++)
+					ui[i] = tex->surface.u.gfx9.dcc_retile_map[i];
+			}
+
+			/* Copy the staging buffer to the buffer backing the texture. */
+			struct si_context *sctx = (struct si_context*)sscreen->aux_context;
+			struct pipe_box box;
+			u_box_1d(0, buf->b.b.width0, &box);
+
+			assert(tex->dcc_retile_map_offset <= UINT_MAX);
+			mtx_lock(&sscreen->aux_context_lock);
+			sctx->dma_copy(&sctx->b, &tex->buffer.b.b, 0,
+				       tex->dcc_retile_map_offset, 0, 0,
+				       &buf->b.b, 0, &box);
+			sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
+			mtx_unlock(&sscreen->aux_context_lock);
+
+			si_resource_reference(&buf, NULL);
+		}
 	}

 	/* Initialize the CMASK base register value. */
 	tex->cmask_base_address_reg =
 		(tex->buffer.gpu_address + tex->cmask_offset) >> 8;

 	if (sscreen->debug_flags & DBG(VM)) {
 		fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n",
 			tex->buffer.gpu_address,
 			tex->buffer.gpu_address + tex->buffer.buf->size,
@@ -1374,20 +1451,22 @@ si_texture_create_object(struct pipe_screen *screen,
 		si_print_texture_info(sscreen, tex, &log);
 		u_log_new_page_print(&log, stdout);
 		fflush(stdout);
 		u_log_context_destroy(&log);
 	}

 	return tex;

 error:
 	FREE(tex);
+	if (sscreen->info.chip_class >= GFX9)
+		free(surface->u.gfx9.dcc_retile_map);
 	return NULL;
 }

 static enum radeon_surf_mode
 si_choose_tiling(struct si_screen *sscreen,
 		 const struct pipe_resource *templ, bool tc_compatible_htile)
 {
 	const struct util_format_description *desc = util_format_description(templ->format);
 	bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING;
 	bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) &&
-- 
2.17.1
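A worked example of the dispatch sizing in si_retile_dcc above (the element
count is made up; the block size of 64 and the 4-elements-per-thread packing
mirror the patch):

    #include <stdio.h>

    /* Each retile-map element is one byte offset; elements are stored as
     * <src, dst> pairs and padded to a multiple of 4 in ac_surface.c, so
     * a thread reading one RGBA texel of image[0] consumes 2 pairs.
     * Threads run in blocks of 64; compute_last_block trims the partial
     * final block.
     */
    int main(void)
    {
        unsigned num_elements = 16320; /* hypothetical map size (4-aligned) */
        unsigned num_threads = num_elements / 4;       /* 4080 */
        unsigned num_blocks = (num_threads + 63) / 64; /* DIV_ROUND_UP -> 64 */
        unsigned last_block = num_threads % 64;        /* 48-thread partial block */

        printf("threads=%u blocks=%u last_block=%u\n",
               num_threads, num_blocks, last_block);
        return 0;
    }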