On Tue, Jan 15, 2019 at 3:54 PM Axel Davy <davyax...@gmail.com> wrote:
> On 15/01/2019 18:50, Marek Olšák wrote:
> >
> > +void si_compute_copy_image(struct si_context *sctx,
> > +                           struct pipe_resource *dst,
> > +                           unsigned dst_level,
> > +                           struct pipe_resource *src,
> > +                           unsigned src_level,
> > +                           unsigned dstx, unsigned dsty, unsigned dstz,
> > +                           const struct pipe_box *src_box)
> > +{
> > +   struct pipe_context *ctx = &sctx->b;
> > +   unsigned width = src_box->width;
> > +   unsigned height = src_box->height;
> > +   unsigned depth = src_box->depth;
> > +
> > +   unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};
> > +
> > +   if (width == 0 || height == 0)
> > +      return;
> > +
> > +   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> > +                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
> > +   si_make_CB_shader_coherent(sctx, dst->nr_samples, true);
> > +
> > +   struct pipe_constant_buffer saved_cb = {};
> > +   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
> > +
> > +   struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
> > +   struct pipe_image_view saved_image[2] = {0};
> > +   util_copy_image_view(&saved_image[0], &images->views[0]);
> > +   util_copy_image_view(&saved_image[1], &images->views[1]);
> > +
> > +   void *saved_cs = sctx->cs_shader_state.program;
> > +
> > +   struct pipe_constant_buffer cb = {};
> > +   cb.buffer_size = sizeof(data);
> > +   cb.user_buffer = data;
> > +   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
> > +
> > +   struct pipe_image_view image[2] = {0};
> > +   image[0].resource = src;
> > +   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
> > +   image[0].format = util_format_linear(src->format);
> > +   image[0].u.tex.level = src_level;
> > +   image[0].u.tex.first_layer = 0;
> > +   image[0].u.tex.last_layer =
> > +      src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
> > +                                     : (unsigned)(src->array_size - 1);
> > +   image[1].resource = dst;
> > +   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
> > +   image[1].format = util_format_linear(dst->format);
> > +   image[1].u.tex.level = dst_level;
> > +   image[1].u.tex.first_layer = 0;
> > +   image[1].u.tex.last_layer =
> > +      dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
> > +                                     : (unsigned)(dst->array_size - 1);
> > +
> > +   if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
> > +      image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
> > +
> > +   /* SNORM8 blitting has precision issues on some chips. Use the SINT
> > +    * equivalent instead, which doesn't force DCC decompression.
> > +    * Note that some chips avoid this issue by using SDMA.
> > +    */
> > +   if (util_format_is_snorm8(dst->format)) {
> > +      image[0].format = image[1].format =
> > +         util_format_snorm8_to_sint8(dst->format);
> > +   }
> > +
> > +   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
> > +
> > +   struct pipe_grid_info info = {0};
> > +
> > +   if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
> > +      if (!sctx->cs_copy_image_1d_array)
> > +         sctx->cs_copy_image_1d_array =
> > +            si_create_copy_image_compute_shader_1d_array(ctx);
> > +      ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
> > +      info.block[0] = 64;
> > +      info.last_block[0] = width % 64;
> > +      info.block[1] = 1;
> > +      info.block[2] = 1;
> > +      info.grid[0] = DIV_ROUND_UP(width, 64);
> > +      info.grid[1] = depth;
> > +      info.grid[2] = 1;
> > +   } else {
> > +      if (!sctx->cs_copy_image)
> > +         sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
> > +      ctx->bind_compute_state(ctx, sctx->cs_copy_image);
> > +      info.block[0] = 8;
> > +      info.last_block[0] = width % 8;
> > +      info.block[1] = 8;
> > +      info.last_block[1] = height % 8;
> > +      info.block[2] = 1;
> > +      info.grid[0] = DIV_ROUND_UP(width, 8);
> > +      info.grid[1] = DIV_ROUND_UP(height, 8);
> > +      info.grid[2] = depth;
> > +   }
> > +
> > +   ctx->launch_grid(ctx, &info);
> > +
> > +   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> > +                  (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
> > +                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
> > +   ctx->bind_compute_state(ctx, saved_cs);
> > +   ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
> > +   ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
> > +}
> > +
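(For reference, not part of the patch: this is roughly how the block/grid/last_block
setup above plays out for a copy region that isn't a multiple of the 8x8 block size.
The 100x50x4 box is just a made-up example.)

/* Standalone illustration (not driver code) of the block/grid math above
 * for a hypothetical 100x50x4 copy region that isn't a multiple of 8x8. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
   unsigned width = 100, height = 50, depth = 4;   /* hypothetical src_box */

   unsigned block[3]      = { 8, 8, 1 };
   unsigned last_block[3] = { width % 8, height % 8, 0 };
   unsigned grid[3]       = { DIV_ROUND_UP(width, 8),
                              DIV_ROUND_UP(height, 8), depth };

   /* Prints grid = 13x7x4 and last_block = 4x2x0: the right/bottom edge
    * groups are trimmed to 4x2 threads, so nothing lands outside the box. */
   printf("block = %ux%ux%u, grid = %ux%ux%u, last_block = %ux%ux%u\n",
          block[0], block[1], block[2],
          grid[0], grid[1], grid[2],
          last_block[0], last_block[1], last_block[2]);
   return 0;
}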
> >
> > +void *si_create_copy_image_compute_shader(struct pipe_context *ctx)
> > +{
> > +   static const char text[] =
> > +      "COMP\n"
> > +      "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
> > +      "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
> > +      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
> > +      "DCL SV[0], THREAD_ID\n"
> > +      "DCL SV[1], BLOCK_ID\n"
> > +      "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
> > +      "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
> > +      "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
> > +      "DCL TEMP[0..4], LOCAL\n"
> > +      "IMM[0] UINT32 {8, 1, 0, 0}\n"
> > +      "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
> > +      "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
> > +      "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
> > +      "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
> > +      "MOV TEMP[4].xyz, CONST[0][1].xyzw\n"
> > +      "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n"
> > +      "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
> > +      "END\n";
> > +
> > +   struct tgsi_token tokens[1024];
> > +   struct pipe_compute_state state = {0};
> > +
> > +   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
> > +      assert(false);
> > +      return NULL;
> > +   }
> > +
> > +   state.ir_type = PIPE_SHADER_IR_TGSI;
> > +   state.prog = tokens;
> > +
> > +   return ctx->create_compute_state(ctx, &state);
> > +}
> > +
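(For reference, the TGSI above amounts to roughly the following per-work-item
logic. This is only a C model: the flat arrays and the struct/helper names are
made up for illustration and are not in the patch or the driver.)

/* C model of one work item of the 8x8x1 shader above. 'consts' mirrors
 * CONST[0][0..1]: {src_x, src_y, src_z, 0, dst_x, dst_y, dst_z, 0}.
 * Flat RGBA32F arrays stand in for the two image units. */
#include <string.h>

struct image2d_array {
   float   *texels;          /* layer-major, then row-major, 4 floats per texel */
   unsigned width, height;
};

static float *texel_ptr(struct image2d_array *img,
                        unsigned x, unsigned y, unsigned z)
{
   return img->texels + 4 * ((z * img->height + y) * img->width + x);
}

static void copy_one_texel(const unsigned consts[8],
                           const unsigned block_id[3],
                           const unsigned thread_id[3],
                           struct image2d_array *src,
                           struct image2d_array *dst)
{
   /* UMAD TEMP[1]: position within the copy box; the block size is 8x8x1. */
   unsigned x = block_id[0] * 8 + thread_id[0];
   unsigned y = block_id[1] * 8 + thread_id[1];
   unsigned z = block_id[2] * 1 + thread_id[2];

   /* UADD + LOAD: read the texel at (src offset + position). */
   float texel[4];
   memcpy(texel, texel_ptr(src, consts[0] + x, consts[1] + y, consts[2] + z),
          sizeof(texel));

   /* UADD + STORE: write it at (dst offset + position). */
   memcpy(texel_ptr(dst, consts[4] + x, consts[5] + y, consts[6] + z),
          texel, sizeof(texel));
}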
>
> Hi,
>
> Here is my summary of my understanding of the proposed copy
> implementation:
>
> . Store input and output (x, y, z) offsets into a constant buffer
> . (8, 8) workgroups
> . Each workitem copies pixel (x+get_group_id(0)*8+get_local_id(0),
>   y+get_group_id(1)*8+get_local_id(1),
>   z+get_group_id(2)*8+get_local_id(2)). The pixel is RGBA.
>   The block size in Z is 1.
>
> Some questions:
>
> . What happens when the textures do not have some components? R32F, for
>   example.

Components that don't exist are not stored.

> . I'm not familiar with using images in compute shaders, but is it OK to
>   declare them as ARGB32F even if the input/output data is not float?

The driver ignores the format specified by shaders.

> Some comments:
>
> . If src_x, dstx, etc. are not multiples of (8, 8), the workgroups won't
>   be aligned well with the tiling pattern. Fortunately the cache should
>   mitigate the loss, but if that's an important case to handle, one could
>   write the shader differently to have all workgroups (except at the
>   borders) aligned. I guess one could benchmark to see whether that tiling
>   alignment matters much here.

That would complicate the shaders too much.

> . Overhead can be reduced by copying several pixels per work-item.

Patches welcome. :)

> . If the src and dst regions are perfectly aligned with the tiling
>   pattern, the copy can be reduced to just moving a rectangle of memory
>   (no tiling) and could be implemented with dma_copy if no conversion is
>   needed, or with a shader using buffers (no images), which would avoid
>   using the image sampling hw, which I believe can be more limiting than
>   sampling a buffer when there are a lot of wavefronts. The data conversion
>   can be done at no cost in the shader, as it should be memory bound.

Too complicated.

> . (8, 8) is not optimal for linearly tiled images (but I guess we don't
>   often get to use them with resource_copy_region).

Yes, linear -> linear copies are slower. Tiled <-> linear copies wouldn't be
improved.

Marek
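P.S. As a rough illustration of the "several pixels per work-item" idea above:
a variant where each thread copies a short horizontal strip might look like the
sketch below, again with a flat RGBA32F array standing in for the images and
both images assumed to share the same dimensions. Purely illustrative, made-up
names, not a proposed patch.

/* Each thread copies a strip of 4 texels instead of 1, so grid[0] would be
 * DIV_ROUND_UP(width, 8 * 4) instead of DIV_ROUND_UP(width, 8). */
#include <string.h>

#define TEXELS_PER_ITEM 4

static void copy_strip(const unsigned consts[8],   /* {sx,sy,sz,0, dx,dy,dz,0} */
                       const unsigned block_id[3],
                       const unsigned thread_id[3],
                       const float *src, float *dst,
                       unsigned width, unsigned height, unsigned copy_width)
{
   unsigned x0 = (block_id[0] * 8 + thread_id[0]) * TEXELS_PER_ITEM;
   unsigned y  =  block_id[1] * 8 + thread_id[1];
   unsigned z  =  block_id[2] + thread_id[2];   /* thread_id[2] is always 0 */

   for (unsigned i = 0; i < TEXELS_PER_ITEM && x0 + i < copy_width; i++) {
      unsigned sidx = ((consts[2] + z) * height + (consts[1] + y)) * width +
                      (consts[0] + x0 + i);
      unsigned didx = ((consts[6] + z) * height + (consts[5] + y)) * width +
                      (consts[4] + x0 + i);

      memcpy(&dst[4 * didx], &src[4 * sidx], 4 * sizeof(float));
   }
}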