From: Dave Airlie <airl...@redhat.com>

This enables a transfer queue using the SDMA engine on CIK/VI/Polaris GPUs.

TODO:
- decide what to do with the HW limitations inherited from radeonsi
  (fail to record?)
- add a linear bounds check to the buffer->image copies

dEQP-VK.synchronization.op.multi_queue.fence.*:
  Passed:        1294/2688 (48.1%)
  Failed:           0/2688 ( 0.0%)
  Not supported: 1394/2688 (51.9%)
  Warnings:         0/2688 ( 0.0%)

Signed-off-by: Dave Airlie <airl...@redhat.com>
---
 src/amd/vulkan/Makefile.sources               |   1 +
 src/amd/vulkan/radv_cik_sdma.c                | 747 ++++++++++++++++++++++++++
 src/amd/vulkan/radv_cmd_buffer.c              |   7 +
 src/amd/vulkan/radv_device.c                  |  20 +
 src/amd/vulkan/radv_meta_buffer.c             |  42 +-
 src/amd/vulkan/radv_meta_copy.c               |  19 +
 src/amd/vulkan/radv_private.h                 |  38 ++
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c |  12 +-
 8 files changed, 877 insertions(+), 9 deletions(-)
 create mode 100644 src/amd/vulkan/radv_cik_sdma.c
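
For reviewers, a minimal sketch of how an application is expected to pick
up the new family (illustrative only, not part of the patch;
find_transfer_family is an invented helper name):

  #include <stdlib.h>
  #include <vulkan/vulkan.h>

  uint32_t find_transfer_family(VkPhysicalDevice pdev)
  {
  	uint32_t count = 0, family = UINT32_MAX;
  	vkGetPhysicalDeviceQueueFamilyProperties(pdev, &count, NULL);
  	VkQueueFamilyProperties *props = calloc(count, sizeof(*props));
  	vkGetPhysicalDeviceQueueFamilyProperties(pdev, &count, props);
  	for (uint32_t i = 0; i < count; i++) {
  		/* The SDMA family advertises TRANSFER without
  		 * GRAPHICS/COMPUTE; image transfers on it must honour
  		 * the 8x8x1 minImageTransferGranularity this patch
  		 * reports. */
  		if ((props[i].queueFlags & VK_QUEUE_TRANSFER_BIT) &&
  		    !(props[i].queueFlags & (VK_QUEUE_GRAPHICS_BIT |
  					     VK_QUEUE_COMPUTE_BIT)))
  			family = i;
  	}
  	free(props);
  	return family;
  }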
diff --git a/src/amd/vulkan/Makefile.sources b/src/amd/vulkan/Makefile.sources
index 425a00f..6cd9621 100644
--- a/src/amd/vulkan/Makefile.sources
+++ b/src/amd/vulkan/Makefile.sources
@@ -31,6 +31,7 @@ RADV_WS_AMDGPU_FILES := \
 	winsys/amdgpu/radv_amdgpu_winsys_public.h
 
 VULKAN_FILES := \
+	radv_cik_sdma.c \
 	radv_cmd_buffer.c \
 	radv_cs.h \
 	radv_device.c \
diff --git a/src/amd/vulkan/radv_cik_sdma.c b/src/amd/vulkan/radv_cik_sdma.c
new file mode 100644
index 0000000..1229d9c
--- /dev/null
+++ b/src/amd/vulkan/radv_cik_sdma.c
@@ -0,0 +1,747 @@
+/*
+ * Copyright © 2016 Red Hat.
+ *
+ * based on cik_sdma.c:
+ * Copyright 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include "radv_private.h"
+#include "sid.h"
+#include "vk_format.h"
+#include "radv_cs.h"
+
+static VkFormat get_format_from_aspect_mask(VkImageAspectFlags aspectMask,
+					    VkFormat format)
+{
+	if (aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
+		format = vk_format_depth_only(format);
+	else if (aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
+		format = vk_format_stencil_only(format);
+	return format;
+}
+
+static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
+{
+	width = radv_minify(width, level);
+	return DIV_ROUND_UP(width, blk_w);
+}
+
+static const struct radeon_surf_level *get_base_level_info(const struct radv_image *img,
+							    VkImageAspectFlags aspectMask,
+							    int base_mip_level)
+{
+	if (aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
+		return &img->surface.stencil_level[base_mip_level];
+	return &img->surface.level[base_mip_level];
+}
+
+static void get_image_info(struct radv_cmd_buffer *cmd_buffer,
+			   const struct radv_image *img,
+			   const VkImageSubresourceLayers *subres,
+			   uint64_t *va_p, uint32_t *bpp_p,
+			   uint32_t *pitch, uint32_t *slice_pitch)
+{
+	const struct radeon_surf_level *base_level =
+		get_base_level_info(img, subres->aspectMask, subres->mipLevel);
+	VkFormat format = get_format_from_aspect_mask(subres->aspectMask, img->vk_format);
+	uint32_t bpp = vk_format_get_blocksize(format);
+	uint64_t va = cmd_buffer->device->ws->buffer_get_va(img->bo);
+
+	va += img->offset;
+	*pitch = base_level->nblk_x;
+	*slice_pitch = base_level->slice_size / bpp;
+	if (bpp_p)
+		*bpp_p = bpp;
+	*va_p = va;
+}
+
+static unsigned encode_tile_info(struct radv_cmd_buffer *cmd_buffer,
+				 struct radv_image *image, unsigned level,
+				 bool set_bpp)
+{
+	struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
+	unsigned tile_index = image->surface.tiling_index[level];
+	unsigned macro_tile_index = image->surface.macro_tile_index;
+	unsigned tile_mode = info->si_tile_mode_array[tile_index];
+	unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index];
+
+	return (set_bpp ? util_logbase2(image->surface.bpe) : 0) |
+		(G_009910_ARRAY_MODE(tile_mode) << 3) |
+		(G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) |
+		/* Non-depth modes don't have TILE_SPLIT set. */
+		((util_logbase2(image->surface.tile_split >> 6)) << 11) |
+		(G_009990_BANK_WIDTH(macro_tile_mode) << 15) |
+		(G_009990_BANK_HEIGHT(macro_tile_mode) << 18) |
+		(G_009990_NUM_BANKS(macro_tile_mode) << 21) |
+		(G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) |
+		(G_009910_PIPE_CONFIG(tile_mode) << 26);
+}
+
+static void
+get_buffer_info(struct radv_cmd_buffer *cmd_buffer,
+		const struct radv_buffer *buffer,
+		const VkBufferImageCopy *region,
+		uint32_t bpp,
+		uint64_t *va_p, uint32_t *pitch, uint32_t *slice_pitch)
+{
+	uint64_t va = cmd_buffer->device->ws->buffer_get_va(buffer->bo);
+
+	va += buffer->offset;
+	va += region->bufferOffset;
+
+	*va_p = va;
+	if (region->bufferRowLength)
+		*pitch = region->bufferRowLength / bpp;
+	else
+		*pitch = region->imageExtent.width;
+	if (region->bufferImageHeight)
+		*slice_pitch = *pitch * region->bufferImageHeight;
+	else
+		*slice_pitch = *pitch * region->imageExtent.height;
+}
+
+static void
+get_bufimage_depth_info(VkImageType type,
+			const VkBufferImageCopy *region,
+			uint32_t *zoffset, uint32_t *depth)
+{
+	if (type == VK_IMAGE_TYPE_3D) {
+		*depth = region->imageExtent.depth;
+		*zoffset = region->imageOffset.z;
+	} else {
+		*depth = region->imageSubresource.layerCount;
+		*zoffset = region->imageSubresource.baseArrayLayer;
+	}
+}
+
+static bool
+linear_buffer_workaround(struct radv_cmd_buffer *cmd_buffer,
+			 struct radv_image *image,
+			 uint32_t level,
+			 uint32_t bpp, unsigned *granularity_p)
+{
+	struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
+	unsigned til_tile_index = image->surface.tiling_index[level];
+	unsigned til_tile_mode = info->si_tile_mode_array[til_tile_index];
+	unsigned til_micro_mode = G_009910_MICRO_TILE_MODE_NEW(til_tile_mode);
+	unsigned granularity;
+
+	/* Deduce the size of reads from the linear surface. */
+	switch (til_micro_mode) {
+	case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING:
+		granularity = bpp == 1 ? 64 / (8*bpp) :
+			      128 / (8*bpp);
+		break;
+	case V_009910_ADDR_SURF_THIN_MICRO_TILING:
+	case V_009910_ADDR_SURF_DEPTH_MICRO_TILING:
+		if (0 /* TODO: THICK microtiling */)
+			granularity = bpp == 1 ? 32 / (8*bpp) :
+				      bpp == 2 ? 64 / (8*bpp) :
+				      bpp <= 8 ? 128 / (8*bpp) :
+				      256 / (8*bpp);
+		else
+			granularity = bpp <= 2 ? 64 / (8*bpp) :
+				      bpp <= 8 ? 128 / (8*bpp) :
+				      256 / (8*bpp);
+		break;
+	default:
+		return false;
+	}
+
+	*granularity_p = granularity;
+	return true;
+}
+
+/* L2L buffer->image + image->buffer */
+static void
+radv_cik_dma_copy_one_lin_to_lin(struct radv_cmd_buffer *cmd_buffer,
+				 struct radv_buffer *buffer,
+				 struct radv_image *image,
+				 const VkBufferImageCopy *region,
+				 bool buf2img)
+{
+	uint64_t buf_va, img_va;
+	uint64_t src_va, dst_va;
+	unsigned depth;
+	unsigned zoffset;
+	uint32_t bpp, pitch, slice_pitch;
+	unsigned linear_pitch;
+	unsigned linear_slice_pitch;
+
+	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 13);
+	get_image_info(cmd_buffer, image, &region->imageSubresource, &img_va,
+		       &bpp, &pitch, &slice_pitch);
+	get_buffer_info(cmd_buffer, buffer, region, bpp, &buf_va,
+			&linear_pitch, &linear_slice_pitch);
+
+	get_bufimage_depth_info(image->type, region, &zoffset, &depth);
+
+	src_va = buf2img ? buf_va : img_va;
+	dst_va = buf2img ? img_va : buf_va;
+
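+	/* COPY/LINEAR_SUB_WINDOW dword layout, matching the emits below
+	 * (annotation added for review): DW0 header (log2(bpp) in bits
+	 * 29-31); DW1-2 src address; DW3 src x|y; DW4 src z | (pitch-1)<<16;
+	 * DW5 src slice pitch - 1; DW6-7 dst address; DW8 dst x|y;
+	 * DW9 dst z | (pitch-1)<<16; DW10 dst slice pitch - 1;
+	 * DW11 width|height (minus 1 on VI+); DW12 depth (minus 1 on VI+). */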
+	/* The buffer side of the copy has no x/y/z offset and uses the
+	 * linear pitches; the image side carries imageOffset and the image
+	 * pitches, so select the window fields per direction. */
+	uint32_t buf_xy = 0;
+	uint32_t buf_zpitch = (linear_pitch - 1) << 16;
+	uint32_t img_xy = region->imageOffset.x | (region->imageOffset.y << 16);
+	uint32_t img_zpitch = zoffset | ((pitch - 1) << 16);
+
+	radeon_emit(cmd_buffer->cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+						    CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
+		    (util_logbase2(bpp) << 29));
+	radeon_emit(cmd_buffer->cs, src_va);
+	radeon_emit(cmd_buffer->cs, src_va >> 32);
+	radeon_emit(cmd_buffer->cs, buf2img ? buf_xy : img_xy);
+	radeon_emit(cmd_buffer->cs, buf2img ? buf_zpitch : img_zpitch);
+	radeon_emit(cmd_buffer->cs, (buf2img ? linear_slice_pitch : slice_pitch) - 1);
+	radeon_emit(cmd_buffer->cs, dst_va);
+	radeon_emit(cmd_buffer->cs, dst_va >> 32);
+	radeon_emit(cmd_buffer->cs, buf2img ? img_xy : buf_xy);
+	radeon_emit(cmd_buffer->cs, buf2img ? img_zpitch : buf_zpitch);
+	radeon_emit(cmd_buffer->cs, (buf2img ? slice_pitch : linear_slice_pitch) - 1);
+	if (cmd_buffer->device->physical_device->rad_info.chip_class == CIK) {
+		radeon_emit(cmd_buffer->cs, region->imageExtent.width | (region->imageExtent.height << 16));
+		radeon_emit(cmd_buffer->cs, depth);
+	} else {
+		radeon_emit(cmd_buffer->cs, (region->imageExtent.width - 1) | ((region->imageExtent.height - 1) << 16));
+		radeon_emit(cmd_buffer->cs, (depth - 1));
+	}
+}
+
+/* L2T buffer->image + image->buffer */
+static void
+radv_cik_dma_copy_one_lin_to_tiled(struct radv_cmd_buffer *cmd_buffer,
+				   struct radv_buffer *buffer,
+				   struct radv_image *image,
+				   const VkBufferImageCopy *region,
+				   bool buf2img)
+{
+	uint64_t buf_va, img_va;
+	unsigned depth;
+	unsigned zoffset;
+	unsigned pitch, slice_pitch, bpp;
+
+	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 14);
+	get_image_info(cmd_buffer, image, &region->imageSubresource, &img_va,
+		       &bpp, &pitch, &slice_pitch);
+
+	unsigned pitch_tile_max = pitch / 8 - 1;
+	unsigned slice_tile_max = slice_pitch / 64 - 1;
+
+	unsigned copy_width = DIV_ROUND_UP(region->imageExtent.width, image->surface.blk_w);
+	unsigned copy_width_aligned = copy_width;
+	unsigned linear_pitch;
+	unsigned linear_slice_pitch;
+
+	get_buffer_info(cmd_buffer, buffer, region, bpp, &buf_va,
+			&linear_pitch, &linear_slice_pitch);
+
+	get_bufimage_depth_info(image->type, region, &zoffset, &depth);
+
+	radeon_emit(cmd_buffer->cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+						    CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
+		    (buf2img ? 0 : (1u << 31)));
+	radeon_emit(cmd_buffer->cs, img_va);
+	radeon_emit(cmd_buffer->cs, img_va >> 32);
+	radeon_emit(cmd_buffer->cs, region->imageOffset.x | (region->imageOffset.y << 16));
+	radeon_emit(cmd_buffer->cs, zoffset | (pitch_tile_max << 16));
+	radeon_emit(cmd_buffer->cs, slice_tile_max);
+	radeon_emit(cmd_buffer->cs, encode_tile_info(cmd_buffer, image, region->imageSubresource.mipLevel, true));
+	radeon_emit(cmd_buffer->cs, buf_va);
+	radeon_emit(cmd_buffer->cs, buf_va >> 32);
+	radeon_emit(cmd_buffer->cs, 0 /* x,y */);
+	radeon_emit(cmd_buffer->cs, ((linear_pitch - 1) << 16));
+	radeon_emit(cmd_buffer->cs, linear_slice_pitch - 1);
+	if (cmd_buffer->device->physical_device->rad_info.chip_class == CIK) {
+		radeon_emit(cmd_buffer->cs, copy_width_aligned | (region->imageExtent.height << 16));
+		radeon_emit(cmd_buffer->cs, depth);
+	} else {
+		radeon_emit(cmd_buffer->cs, (copy_width_aligned - 1) | ((region->imageExtent.height - 1) << 16));
+		radeon_emit(cmd_buffer->cs, (depth - 1));
+	}
+}
+
+void radv_cik_dma_copy_buffer_to_image(struct radv_cmd_buffer *cmd_buffer,
+				       struct radv_buffer *src_buffer,
+				       struct radv_image *dest_image,
+				       uint32_t region_count,
+				       const VkBufferImageCopy *pRegions)
+{
+	uint32_t r;
+	for (r = 0; r < region_count; r++) {
+		const VkBufferImageCopy *region = &pRegions[r];
+
+		if (dest_image->surface.level[region->imageSubresource.mipLevel].mode == RADEON_SURF_MODE_LINEAR_ALIGNED) {
+			/* L -> L */
+			radv_cik_dma_copy_one_lin_to_lin(cmd_buffer, src_buffer, dest_image,
+							 region, true);
+		} else {
+			/* L -> T */
+			radv_cik_dma_copy_one_lin_to_tiled(cmd_buffer, src_buffer, dest_image,
+							   region, true);
+		}
+	}
+}
+
+void radv_cik_dma_copy_image_to_buffer(struct radv_cmd_buffer *cmd_buffer,
+				       struct radv_image *src_image,
+				       struct radv_buffer *dest_buffer,
+				       uint32_t region_count,
+				       const VkBufferImageCopy *pRegions)
+{
+	uint32_t r;
+	for (r = 0; r < region_count; r++) {
+		const VkBufferImageCopy *region = &pRegions[r];
+
+		if (src_image->surface.level[region->imageSubresource.mipLevel].mode == RADEON_SURF_MODE_LINEAR_ALIGNED) {
+			/* L -> L */
+			radv_cik_dma_copy_one_lin_to_lin(cmd_buffer, dest_buffer, src_image,
+							 region, false);
+		} else {
+			/* T -> L */
+			radv_cik_dma_copy_one_lin_to_tiled(cmd_buffer, dest_buffer, src_image,
+							   region, false);
+		}
+	}
+}
+
+/* L2L image->image */
+static void
+radv_cik_dma_copy_one_image_lin_to_lin(struct radv_cmd_buffer *cmd_buffer,
+				       struct radv_image *src_image,
+				       struct radv_image *dst_image,
+				       const VkImageCopy *region)
+{
+	uint64_t src_va, dst_va;
+	unsigned src_pitch, src_slice_pitch, src_zoffset;
+	unsigned dst_pitch, dst_slice_pitch, dst_zoffset;
+	unsigned depth;
+	unsigned bpp;
+
+	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 13);
+	get_image_info(cmd_buffer, src_image, &region->srcSubresource, &src_va,
+		       &bpp, &src_pitch, &src_slice_pitch);
+	get_image_info(cmd_buffer, dst_image, &region->dstSubresource, &dst_va,
+		       NULL, &dst_pitch, &dst_slice_pitch);
+
+	if (src_image->type == VK_IMAGE_TYPE_3D) {
+		depth = region->extent.depth;
+		src_zoffset = region->srcOffset.z;
+	} else {
+		depth = region->srcSubresource.layerCount;
+		src_zoffset = region->srcSubresource.baseArrayLayer;
+	}
+
+	if (dst_image->type == VK_IMAGE_TYPE_3D) {
+		dst_zoffset = region->dstOffset.z;
+	} else {
+		dst_zoffset = region->dstSubresource.baseArrayLayer;
+	}
+
+	radeon_emit(cmd_buffer->cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+						    CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
+		    (util_logbase2(bpp) << 29));
+	radeon_emit(cmd_buffer->cs, src_va);
+	radeon_emit(cmd_buffer->cs, src_va >> 32);
+	radeon_emit(cmd_buffer->cs, region->srcOffset.x | (region->srcOffset.y << 16));
+	radeon_emit(cmd_buffer->cs, src_zoffset | ((src_pitch - 1) << 16));
+	radeon_emit(cmd_buffer->cs, src_slice_pitch - 1);
+	radeon_emit(cmd_buffer->cs, dst_va);
+	radeon_emit(cmd_buffer->cs, dst_va >> 32);
+	radeon_emit(cmd_buffer->cs, region->dstOffset.x | (region->dstOffset.y << 16));
+	radeon_emit(cmd_buffer->cs, dst_zoffset | ((dst_pitch - 1) << 16));
+	radeon_emit(cmd_buffer->cs, dst_slice_pitch - 1);
+	if (cmd_buffer->device->physical_device->rad_info.chip_class == CIK) {
+		radeon_emit(cmd_buffer->cs, region->extent.width | (region->extent.height << 16));
+		radeon_emit(cmd_buffer->cs, depth);
+	} else {
+		radeon_emit(cmd_buffer->cs, (region->extent.width - 1) | ((region->extent.height - 1) << 16));
+		radeon_emit(cmd_buffer->cs, (depth - 1));
+	}
+}
+
+/* L2T + T2L image<->image */
+static void
+radv_cik_dma_copy_one_image_lin_to_tiled(struct radv_cmd_buffer *cmd_buffer,
+					 struct radv_image *lin_image,
+					 const VkImageSubresourceLayers *lin_sub_resource,
+					 const VkOffset3D *lin_offset,
+					 struct radv_image *til_image,
+					 const VkImageSubresourceLayers *til_sub_resource,
+					 const VkOffset3D *til_offset,
+					 const VkExtent3D *extent, bool lin2tiled)
+{
+	uint64_t lin_va, til_va;
+	unsigned lin_pitch, lin_slice_pitch, lin_zoffset;
+	unsigned til_pitch, til_slice_pitch, til_zoffset;
+	unsigned bpp;
+	unsigned lin_width = minify_as_blocks(lin_image->extent.width,
+					      lin_sub_resource->mipLevel, lin_image->surface.blk_w);
+	unsigned til_width = minify_as_blocks(til_image->extent.width,
+					      til_sub_resource->mipLevel, til_image->surface.blk_w);
+
+	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 14);
+	get_image_info(cmd_buffer, lin_image, lin_sub_resource, &lin_va,
+		       &bpp, &lin_pitch, &lin_slice_pitch);
+	get_image_info(cmd_buffer, til_image, til_sub_resource, &til_va,
+		       NULL, &til_pitch, &til_slice_pitch);
+
+	assert(til_pitch % 8 == 0);
+	assert(til_slice_pitch % 64 == 0);
+	unsigned pitch_tile_max = til_pitch / 8 - 1;
+	unsigned slice_tile_max = til_slice_pitch / 64 - 1;
+	unsigned xalign = MAX2(1, 4 / bpp);
+	unsigned copy_width = DIV_ROUND_UP(extent->width, til_image->surface.blk_w);
+	unsigned copy_height = DIV_ROUND_UP(extent->height, til_image->surface.blk_h);
+	unsigned copy_width_aligned = copy_width;
+	unsigned copy_depth;
+
+	if (lin_image->type == VK_IMAGE_TYPE_3D) {
+		copy_depth = extent->depth;
+		lin_zoffset = lin_offset->z;
+	} else {
+		copy_depth = lin_sub_resource->layerCount;
+		lin_zoffset = lin_sub_resource->baseArrayLayer;
+	}
+
+	if (til_image->type == VK_IMAGE_TYPE_3D) {
+		til_zoffset = til_offset->z;
+	} else {
+		til_zoffset = til_sub_resource->baseArrayLayer;
+	}
+
+	/* If the region ends at the last pixel and is unaligned, we
+	 * can copy the remainder of the line that is not visible to
+	 * make it aligned.
+	 */
+	if (copy_width % xalign != 0 &&
+	    lin_offset->x + copy_width == lin_width &&
+	    til_offset->x + copy_width == til_width &&
+	    lin_offset->x + align(copy_width, xalign) <= lin_pitch &&
+	    til_offset->x + align(copy_width, xalign) <= til_pitch)
+		copy_width_aligned = align(copy_width, xalign);
+
+	/* TODO HW Limitations - how do we handle those in vk? */
+
+	/* The hw can read outside of the given linear buffer bounds,
+	 * or access those pages but not touch the memory in case
+	 * of writes. (it still causes a VM fault)
+	 *
+	 * Out-of-bounds memory access or page directory access must
+	 * be prevented.
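+	 *
+	 * Worked example with illustrative values: bpp = 4 and DISPLAY
+	 * micro-tiling give granularity = 128 / (8*4) = 4 texels. With
+	 * til_offset->x = 5 and lin_offset->x = 0, the engine begins
+	 * reading bpp * (5 % 4) = 4 bytes before the linear address,
+	 * which is why start_linear_address below is biased downwards
+	 * and then range-checked against the BO size.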
+	 */
+	int64_t start_linear_address, end_linear_address;
+	bool ret;
+	unsigned granularity;
+
+	ret = linear_buffer_workaround(cmd_buffer, til_image,
+				       til_sub_resource->mipLevel,
+				       bpp, &granularity);
+	if (!ret) {
+		cmd_buffer->record_fail = true;
+		return;
+	}
+
+	/* The linear reads start at tiled_x & ~(granularity - 1).
+	 * If linear_x == 0 && tiled_x % granularity != 0, the hw
+	 * starts reading from an address preceding linear_address!
+	 */
+	start_linear_address =
+		lin_image->surface.level[lin_sub_resource->mipLevel].offset +
+		bpp * (lin_offset->z * lin_slice_pitch +
+		       lin_offset->y * lin_pitch +
+		       lin_offset->x);
+	start_linear_address -= (int)(bpp * (til_offset->x % granularity));
+
+	end_linear_address =
+		lin_image->surface.level[lin_sub_resource->mipLevel].offset +
+		bpp * ((lin_offset->z + copy_depth - 1) * lin_slice_pitch +
+		       (lin_offset->y + copy_height - 1) * lin_pitch +
+		       (lin_offset->x + copy_width));
+
+	if ((til_offset->x + copy_width) % granularity)
+		end_linear_address += granularity -
+			(til_offset->x + copy_width) % granularity;
+
+	if (start_linear_address < 0 ||
+	    end_linear_address > lin_image->surface.bo_size) {
+		cmd_buffer->record_fail = true;
+		return;
+	}
+
+	radeon_emit(cmd_buffer->cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+						    CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
+		    (lin2tiled ? 0 : (1u << 31)));
+	radeon_emit(cmd_buffer->cs, til_va);
+	radeon_emit(cmd_buffer->cs, til_va >> 32);
+	radeon_emit(cmd_buffer->cs, til_offset->x | (til_offset->y << 16));
+	radeon_emit(cmd_buffer->cs, til_zoffset | (pitch_tile_max << 16));
+	radeon_emit(cmd_buffer->cs, slice_tile_max);
+	radeon_emit(cmd_buffer->cs, encode_tile_info(cmd_buffer, til_image, til_sub_resource->mipLevel, true));
+	radeon_emit(cmd_buffer->cs, lin_va);
+	radeon_emit(cmd_buffer->cs, lin_va >> 32);
+	radeon_emit(cmd_buffer->cs, lin_offset->x | (lin_offset->y << 16));
+	radeon_emit(cmd_buffer->cs, lin_zoffset | ((lin_pitch - 1) << 16));
+	radeon_emit(cmd_buffer->cs, lin_slice_pitch - 1);
+	if (cmd_buffer->device->physical_device->rad_info.chip_class == CIK) {
+		radeon_emit(cmd_buffer->cs, copy_width_aligned | (copy_height << 16));
+		radeon_emit(cmd_buffer->cs, copy_depth);
+	} else {
+		radeon_emit(cmd_buffer->cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16));
+		radeon_emit(cmd_buffer->cs, (copy_depth - 1));
+	}
+}
+
+/* T2T image->image */
+static void
+radv_cik_dma_copy_one_image_tiled_to_tiled(struct radv_cmd_buffer *cmd_buffer,
+					   struct radv_image *src_image,
+					   struct radv_image *dst_image,
+					   const VkImageCopy *region)
+{
+	uint64_t src_va, dst_va;
+	unsigned src_pitch, src_slice_pitch, src_zoffset;
+	unsigned dst_pitch, dst_slice_pitch, dst_zoffset;
+	unsigned depth;
+	unsigned bpp;
+	unsigned dst_width = minify_as_blocks(dst_image->extent.width,
+					      region->dstSubresource.mipLevel, dst_image->surface.blk_w);
+	unsigned src_width = minify_as_blocks(src_image->extent.width,
+					      region->srcSubresource.mipLevel, src_image->surface.blk_w);
+	unsigned dst_height = minify_as_blocks(dst_image->extent.height,
+					       region->dstSubresource.mipLevel, dst_image->surface.blk_h);
+	unsigned src_height = minify_as_blocks(src_image->extent.height,
+					       region->srcSubresource.mipLevel, src_image->surface.blk_h);
+
+	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15);
+	get_image_info(cmd_buffer, src_image, &region->srcSubresource, &src_va,
+		       &bpp, &src_pitch, &src_slice_pitch);
+	get_image_info(cmd_buffer, dst_image, &region->dstSubresource, &dst_va,
+		       NULL, &dst_pitch, &dst_slice_pitch);
+
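+	/* The tiled operands' pitches are programmed in units of 8-texel
+	 * micro-tile rows and their slices in 8x8-texel micro-tiles, hence
+	 * the /8 and /64 below (annotation added for review). */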
+	unsigned src_pitch_tile_max = src_pitch / 8 - 1;
+	unsigned src_slice_tile_max = src_slice_pitch / 64 - 1;
+
+	unsigned dst_pitch_tile_max = dst_pitch / 8 - 1;
+	unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1;
+
+	unsigned copy_width = DIV_ROUND_UP(region->extent.width, src_image->surface.blk_w);
+	unsigned copy_height = DIV_ROUND_UP(region->extent.height, src_image->surface.blk_h);
+
+	unsigned copy_width_aligned = copy_width;
+	unsigned copy_height_aligned = copy_height;
+
+	if (copy_width % 8 != 0 &&
+	    region->srcOffset.x + copy_width == src_width &&
+	    region->dstOffset.x + copy_width == dst_width)
+		copy_width_aligned = align(copy_width, 8);
+
+	if (copy_height % 8 != 0 &&
+	    region->srcOffset.y + copy_height == src_height &&
+	    region->dstOffset.y + copy_height == dst_height)
+		copy_height_aligned = align(copy_height, 8);
+
+	if (src_image->type == VK_IMAGE_TYPE_3D) {
+		depth = region->extent.depth;
+		src_zoffset = region->srcOffset.z;
+	} else {
+		depth = region->srcSubresource.layerCount;
+		src_zoffset = region->srcSubresource.baseArrayLayer;
+	}
+
+	if (dst_image->type == VK_IMAGE_TYPE_3D) {
+		dst_zoffset = region->dstOffset.z;
+	} else {
+		dst_zoffset = region->dstSubresource.baseArrayLayer;
+	}
+
+	radeon_emit(cmd_buffer->cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+						    CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0));
+	radeon_emit(cmd_buffer->cs, src_va);
+	radeon_emit(cmd_buffer->cs, src_va >> 32);
+	radeon_emit(cmd_buffer->cs, region->srcOffset.x | (region->srcOffset.y << 16));
+	radeon_emit(cmd_buffer->cs, src_zoffset | (src_pitch_tile_max << 16));
+	radeon_emit(cmd_buffer->cs, src_slice_tile_max);
+	radeon_emit(cmd_buffer->cs, encode_tile_info(cmd_buffer, src_image, region->srcSubresource.mipLevel, true));
+	radeon_emit(cmd_buffer->cs, dst_va);
+	radeon_emit(cmd_buffer->cs, dst_va >> 32);
+	radeon_emit(cmd_buffer->cs, region->dstOffset.x | (region->dstOffset.y << 16));
+	radeon_emit(cmd_buffer->cs, dst_zoffset | (dst_pitch_tile_max << 16));
+	radeon_emit(cmd_buffer->cs, dst_slice_tile_max);
+	radeon_emit(cmd_buffer->cs, encode_tile_info(cmd_buffer, dst_image, region->dstSubresource.mipLevel, false));
+	if (cmd_buffer->device->physical_device->rad_info.chip_class == CIK) {
+		radeon_emit(cmd_buffer->cs, copy_width_aligned | (copy_height_aligned << 16));
+		radeon_emit(cmd_buffer->cs, depth);
+	} else {
+		radeon_emit(cmd_buffer->cs, (copy_width_aligned - 8) | ((copy_height_aligned - 8) << 16));
+		radeon_emit(cmd_buffer->cs, (depth - 1));
+	}
+}
+
+void radv_cik_dma_copy_image(struct radv_cmd_buffer *cmd_buffer,
+			     struct radv_image *src_image,
+			     VkImageLayout src_image_layout,
+			     struct radv_image *dest_image,
+			     VkImageLayout dest_image_layout,
+			     uint32_t region_count,
+			     const VkImageCopy *pRegions)
+{
+	uint32_t r;
+	for (r = 0; r < region_count; r++) {
+		const VkImageCopy *region = &pRegions[r];
+		bool src_is_linear = src_image->surface.level[region->srcSubresource.mipLevel].mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
+		bool dst_is_linear = dest_image->surface.level[region->dstSubresource.mipLevel].mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
+
+		if (src_is_linear && dst_is_linear) {
+			/* L -> L */
+			radv_cik_dma_copy_one_image_lin_to_lin(cmd_buffer,
+							       src_image,
+							       dest_image,
+							       region);
+		} else if (!src_is_linear && dst_is_linear) {
+			/* T -> L */
+			radv_cik_dma_copy_one_image_lin_to_tiled(cmd_buffer,
+								 dest_image,
+								 &region->dstSubresource,
+								 &region->dstOffset,
+								 src_image,
+								 &region->srcSubresource,
+								 &region->srcOffset,
+								 &region->extent,
+								 false);
+		} else if (src_is_linear && !dst_is_linear) {
+			/* L -> T */
+			radv_cik_dma_copy_one_image_lin_to_tiled(cmd_buffer,
+								 src_image,
+								 &region->srcSubresource,
+								 &region->srcOffset,
+								 dest_image,
+								 &region->dstSubresource,
+								 &region->dstOffset,
+								 &region->extent,
+								 true);
+		} else {
+			/* T -> T */
+			radv_cik_dma_copy_one_image_tiled_to_tiled(cmd_buffer,
+								   src_image,
+								   dest_image,
+								   region);
+		}
+	}
+}
+
+static void
+radv_cik_sdma_do_copy_buffer_one(struct radv_cmd_buffer *cmd_buffer,
+				 struct radv_buffer *src_buffer,
+				 struct radv_buffer *dst_buffer,
+				 const VkBufferCopy *region)
+{
+	unsigned ncopy, i;
+	uint64_t src_va, dst_va;
+	VkDeviceSize size = region->size;
+
+	src_va = cmd_buffer->device->ws->buffer_get_va(src_buffer->bo);
+	dst_va = cmd_buffer->device->ws->buffer_get_va(dst_buffer->bo);
+
+	src_va += src_buffer->offset;
+	dst_va += dst_buffer->offset;
+	src_va += region->srcOffset;
+	dst_va += region->dstOffset;
+
+	/* Split the copy into chunks the linear copy packet can express. */
+	ncopy = DIV_ROUND_UP(region->size, CIK_SDMA_COPY_MAX_SIZE);
+
+	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, ncopy * 7);
+	for (i = 0; i < ncopy; i++) {
+		unsigned csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
+
+		radeon_emit(cmd_buffer->cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+							    CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
+							    0));
+		radeon_emit(cmd_buffer->cs, csize);
+		radeon_emit(cmd_buffer->cs, 0);
+		radeon_emit(cmd_buffer->cs, src_va);
+		radeon_emit(cmd_buffer->cs, src_va >> 32);
+		radeon_emit(cmd_buffer->cs, dst_va);
+		radeon_emit(cmd_buffer->cs, dst_va >> 32);
+		dst_va += csize;
+		src_va += csize;
+		size -= csize;
+	}
+}
+
+void radv_cik_dma_copy_buffer(struct radv_cmd_buffer *cmd_buffer,
+			      struct radv_buffer *src_buffer,
+			      struct radv_buffer *dest_buffer,
+			      uint32_t region_count,
+			      const VkBufferCopy *pRegions)
+{
+	uint32_t r;
+
+	for (r = 0; r < region_count; r++)
+		radv_cik_sdma_do_copy_buffer_one(cmd_buffer,
+						 src_buffer,
+						 dest_buffer,
+						 &pRegions[r]);
+}
+
+void radv_cik_dma_update_buffer(struct radv_cmd_buffer *cmd_buffer,
+				struct radv_buffer *dst_buffer,
+				VkDeviceSize dst_offset,
+				VkDeviceSize data_size,
+				const void *data)
+{
+	uint64_t dst_va = cmd_buffer->device->ws->buffer_get_va(dst_buffer->bo);
+	int num_dw = (data_size + 3) / 4;
+
+	dst_va += dst_buffer->offset;
+	dst_va += dst_offset;
+
+	const uint32_t *data_dw = data;
+	int left_dw = num_dw;
+	do {
+		/* Emit as much as fits in the current CS chunk. */
+		int can_dw = cmd_buffer->cs->max_dw - cmd_buffer->cs->cdw - 4;
+		int this_dw = MIN2(left_dw, can_dw);
+
+		radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, this_dw + 4);
+		radeon_emit(cmd_buffer->cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_WRITE,
+							    SDMA_WRITE_SUB_OPCODE_LINEAR, 0));
+		radeon_emit(cmd_buffer->cs, dst_va);
+		radeon_emit(cmd_buffer->cs, dst_va >> 32);
+		radeon_emit(cmd_buffer->cs, this_dw);
+		radeon_emit_array(cmd_buffer->cs, data_dw, this_dw);
+
+		data_dw += this_dw;
+		left_dw -= this_dw;
+
+		if (left_dw)
+			radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, left_dw + 4);
+	} while (left_dw > 0);
+}
+
+void radv_cik_dma_fill_buffer(struct radv_cmd_buffer *cmd_buffer,
+			      struct radv_buffer *dst_buffer,
+			      VkDeviceSize dst_offset,
+			      VkDeviceSize fillSize,
+			      uint32_t data)
+{
+	uint64_t dst_va = cmd_buffer->device->ws->buffer_get_va(dst_buffer->bo);
+
+	dst_va += dst_buffer->offset;
+	dst_va += dst_offset;
+
+	/* Reserve space for the 5-dword fill packet, like the other
+	 * entrypoints in this file. */
+	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 5);
+	radeon_emit(cmd_buffer->cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL,
+						    0, 0));
+	radeon_emit(cmd_buffer->cs, dst_va);
+	radeon_emit(cmd_buffer->cs, dst_va >> 32);
+	radeon_emit(cmd_buffer->cs, data);
+	radeon_emit(cmd_buffer->cs, fillSize);
+}
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f281f33..3f84ace 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2777,6 +2777,13 @@ void radv_CmdPipelineBarrier(
 	VkAccessFlags src_flags = 0;
 	VkAccessFlags dst_flags = 0;
 	uint32_t b;
+
+	if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER) {
+		/* NOP waits for idle on CIK and later. */
+		radeon_emit(cmd_buffer->cs, 0x00000000); /* NOP */
+		return;
+	}
+
 	for (uint32_t i = 0; i < memoryBarrierCount; i++) {
 		src_flags |= pMemoryBarriers[i].srcAccessMask;
 		dst_flags |= pMemoryBarriers[i].dstAccessMask;
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 98d4b91..7e704d4 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -302,6 +302,7 @@ static const struct debug_control radv_debug_options[] = {
 	{"nohiz", RADV_DEBUG_NO_HIZ},
 	{"nocompute", RADV_DEBUG_NO_COMPUTE_QUEUE},
 	{"unsafemath", RADV_DEBUG_UNSAFE_MATH},
+	{"notransfer", RADV_DEBUG_NO_TRANSFER_QUEUE},
 	{NULL, 0}
 };
 
@@ -636,6 +637,11 @@ void radv_GetPhysicalDeviceQueueFamilyProperties(
 	    !(pdevice->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE))
 		num_queue_families++;
 
+	if (pdevice->rad_info.sdma_rings > 0 &&
+	    pdevice->rad_info.chip_class >= CIK &&
+	    !(pdevice->instance->debug_flags & RADV_DEBUG_NO_TRANSFER_QUEUE))
+		num_queue_families++;
+
 	if (pQueueFamilyProperties == NULL) {
 		*pCount = num_queue_families;
 		return;
@@ -670,6 +676,20 @@ void radv_GetPhysicalDeviceQueueFamilyProperties(
 			idx++;
 		}
 	}
+
+	if (pdevice->rad_info.sdma_rings > 0 &&
+	    pdevice->rad_info.chip_class >= CIK &&
+	    !(pdevice->instance->debug_flags & RADV_DEBUG_NO_TRANSFER_QUEUE)) {
+		if (*pCount > idx) {
+			pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
+				.queueFlags = VK_QUEUE_TRANSFER_BIT,
+				.queueCount = pdevice->rad_info.sdma_rings,
+				.timestampValidBits = 64,
+				.minImageTransferGranularity = (VkExtent3D) { 8, 8, 1 },
+			};
+			idx++;
+		}
+	}
 	*pCount = idx;
 }
diff --git a/src/amd/vulkan/radv_meta_buffer.c b/src/amd/vulkan/radv_meta_buffer.c
index cd2973f..21ec4d3 100644
--- a/src/amd/vulkan/radv_meta_buffer.c
+++ b/src/amd/vulkan/radv_meta_buffer.c
@@ -481,6 +481,10 @@ void radv_CmdFillBuffer(
 	if (fillSize == VK_WHOLE_SIZE)
 		fillSize = (dst_buffer->size - dstOffset) & ~3ull;
 
+	if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER) {
+		radv_cik_dma_fill_buffer(cmd_buffer, dst_buffer, dstOffset, fillSize, data);
+		return;
+	}
 	radv_fill_buffer(cmd_buffer, dst_buffer->bo, dst_buffer->offset + dstOffset,
 			 fillSize, data);
 }
@@ -496,6 +500,10 @@ void radv_CmdCopyBuffer(
 	RADV_FROM_HANDLE(radv_buffer, src_buffer, srcBuffer);
 	RADV_FROM_HANDLE(radv_buffer, dest_buffer, destBuffer);
 
+	if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER) {
+		radv_cik_dma_copy_buffer(cmd_buffer, src_buffer, dest_buffer, regionCount, pRegions);
+		return;
+	}
 	for (unsigned r = 0; r < regionCount; r++) {
 		uint64_t src_offset = src_buffer->offset + pRegions[r].srcOffset;
 		uint64_t dest_offset = dest_buffer->offset + pRegions[r].dstOffset;
@@ -506,15 +514,13 @@ void radv_CmdCopyBuffer(
 	}
 }
 
-void radv_CmdUpdateBuffer(
-	VkCommandBuffer commandBuffer,
-	VkBuffer dstBuffer,
-	VkDeviceSize dstOffset,
-	VkDeviceSize dataSize,
-	const void* pData)
+static void
+radv_update_buffer(struct radv_cmd_buffer *cmd_buffer,
+		   struct radv_buffer *dst_buffer,
+		   VkDeviceSize dstOffset,
+		   VkDeviceSize dataSize,
+		   const void *pData)
 {
-	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
-	RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
 	bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
 	uint64_t words = dataSize / 4;
 	uint64_t va = cmd_buffer->device->ws->buffer_get_va(dst_buffer->bo);
@@ -543,3 +549,23 @@ void radv_CmdUpdateBuffer(
 		buf_offset, dstOffset + dst_buffer->offset, dataSize);
 	}
 }
+
+void radv_CmdUpdateBuffer(
+	VkCommandBuffer commandBuffer,
+	VkBuffer dstBuffer,
+	VkDeviceSize dstOffset,
+	VkDeviceSize dataSize,
+	const void* pData)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
+
+	if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER) {
+		radv_cik_dma_update_buffer(cmd_buffer, dst_buffer,
+					   dstOffset, dataSize, pData);
+		return;
+	}
+
+	radv_update_buffer(cmd_buffer, dst_buffer, dstOffset,
+			   dataSize, pData);
+}
diff --git a/src/amd/vulkan/radv_meta_copy.c b/src/amd/vulkan/radv_meta_copy.c
index 2bd20b5..d8dacf2 100644
--- a/src/amd/vulkan/radv_meta_copy.c
+++ b/src/amd/vulkan/radv_meta_copy.c
@@ -218,6 +218,12 @@ void radv_CmdCopyBufferToImage(
 	RADV_FROM_HANDLE(radv_image, dest_image, destImage);
 	RADV_FROM_HANDLE(radv_buffer, src_buffer, srcBuffer);
 
+	if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER) {
+		radv_cik_dma_copy_buffer_to_image(cmd_buffer, src_buffer,
+						  dest_image, regionCount,
+						  pRegions);
+		return;
+	}
 	meta_copy_buffer_to_image(cmd_buffer, src_buffer, dest_image,
 				  regionCount, pRegions);
 }
@@ -314,6 +320,13 @@ void radv_CmdCopyImageToBuffer(
 	RADV_FROM_HANDLE(radv_image, src_image, srcImage);
 	RADV_FROM_HANDLE(radv_buffer, dst_buffer, destBuffer);
 
+	if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER) {
+		radv_cik_dma_copy_image_to_buffer(cmd_buffer, src_image,
+						  dst_buffer, regionCount,
+						  pRegions);
+		return;
+	}
+
 	meta_copy_image_to_buffer(cmd_buffer, dst_buffer, src_image,
 				  regionCount, pRegions);
 }
@@ -427,6 +440,12 @@ void radv_CmdCopyImage(
 	RADV_FROM_HANDLE(radv_image, src_image, srcImage);
 	RADV_FROM_HANDLE(radv_image, dest_image, destImage);
 
+	if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER) {
+		radv_cik_dma_copy_image(cmd_buffer, src_image, srcImageLayout,
+					dest_image, destImageLayout,
+					regionCount, pRegions);
+		return;
+	}
 	meta_copy_image(cmd_buffer, src_image, dest_image,
 			regionCount, pRegions);
 }
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 25ed5de..90523b2 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -110,6 +110,7 @@ enum {
 	RADV_DEBUG_NO_HIZ = 0x20,
 	RADV_DEBUG_NO_COMPUTE_QUEUE = 0x40,
 	RADV_DEBUG_UNSAFE_MATH = 0x80,
+	RADV_DEBUG_NO_TRANSFER_QUEUE = 0x100,
 };
 
 #define radv_printflike(a, b) __attribute__((__format__(__printf__, a, b)))
@@ -1309,6 +1310,43 @@ struct radv_fence {
 	bool signalled;
 };
 
+void radv_cik_dma_copy_buffer_to_image(struct radv_cmd_buffer *cmd_buffer,
+				       struct radv_buffer *src_buffer,
+				       struct radv_image *dest_image,
+				       uint32_t region_count,
+				       const VkBufferImageCopy *pRegions);
+void radv_cik_dma_copy_image_to_buffer(struct radv_cmd_buffer *cmd_buffer,
+				       struct radv_image *src_image,
+				       struct radv_buffer *dest_buffer,
+				       uint32_t region_count,
+				       const VkBufferImageCopy *pRegions);
+
+void radv_cik_dma_copy_image(struct radv_cmd_buffer *cmd_buffer,
+			     struct radv_image *src_image,
+			     VkImageLayout src_image_layout,
+			     struct radv_image *dest_image,
+			     VkImageLayout dest_image_layout,
+			     uint32_t region_count,
+			     const VkImageCopy *pRegions);
+
+void radv_cik_dma_copy_buffer(struct radv_cmd_buffer *cmd_buffer,
+			      struct radv_buffer *src_buffer,
+			      struct radv_buffer *dest_buffer,
+			      uint32_t region_count,
+			      const VkBufferCopy *pRegions);
+
+void radv_cik_dma_update_buffer(struct radv_cmd_buffer *cmd_buffer,
+				struct radv_buffer *dst_buffer,
+				VkDeviceSize dst_offset,
+				VkDeviceSize data_size,
+				const void *data);
+
+void radv_cik_dma_fill_buffer(struct radv_cmd_buffer *cmd_buffer,
+			      struct radv_buffer *dst_buffer,
+			      VkDeviceSize dst_offset,
+			      VkDeviceSize fillSize,
+			      uint32_t data);
+
 #define RADV_DEFINE_HANDLE_CASTS(__radv_type, __VkType)		\
 								\
 	static inline struct __radv_type *			\
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index b58f5db..afe9068 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -290,10 +290,20 @@ static void radv_amdgpu_cs_grow(struct radeon_winsys_cs *_cs, size_t min_size)
 static bool radv_amdgpu_cs_finalize(struct radeon_winsys_cs *_cs)
 {
 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
+	struct radeon_winsys *ws = (struct radeon_winsys*)cs->ws;
+	uint32_t pad_word = 0xffff1000;
+
+	/* Pad with the NOP encoding each ring understands: SDMA's NOP
+	 * opcode is 0, SI's DMA ring wants 0xf0000000, the SI GFX ring
+	 * only knows type-2 NOPs (0x80000000), and everything else takes
+	 * the CIK+ PKT3 NOP (0xffff1000). */
+	if (radv_amdgpu_winsys(ws)->family == FAMILY_SI) {
+		if (cs->hw_ip == AMDGPU_HW_IP_DMA)
+			pad_word = 0xf0000000;
+		else
+			pad_word = 0x80000000;
+	} else if (cs->hw_ip == AMDGPU_HW_IP_DMA)
+		pad_word = 0x00000000;
 
 	if (cs->ws->use_ib_bos) {
 		while (!cs->base.cdw || (cs->base.cdw & 7) != 0)
-			cs->base.buf[cs->base.cdw++] = 0xffff1000;
+			cs->base.buf[cs->base.cdw++] = pad_word;
 
 		*cs->ib_size_ptr |= cs->base.cdw;
-- 
2.9.3

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev