This reduces the intermediate VRAM used for RGB decoding by a factor of 100 for 6k video. It also speeds the decoder up by 16% for 4k RGB24 and by 31% for 6k video.
This is equivalent to what the software decoder does, but with less pointers. --- libavcodec/vulkan/Makefile | 3 +- libavcodec/vulkan/ffv1_dec.comp | 158 ++++++++++++---- libavcodec/vulkan/ffv1_dec_rct.comp | 88 --------- libavcodec/vulkan_ffv1.c | 283 ++++++++-------------------- libavutil/vulkan_functions.h | 1 + 5 files changed, 203 insertions(+), 330 deletions(-) delete mode 100644 libavcodec/vulkan/ffv1_dec_rct.comp diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index e6bad486bd..feb5d2ea51 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -14,8 +14,7 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/common.o \ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \ vulkan/rangecoder.o vulkan/ffv1_vlc.o \ vulkan/ffv1_common.o vulkan/ffv1_reset.o \ - vulkan/ffv1_dec_setup.o vulkan/ffv1_dec.o \ - vulkan/ffv1_dec_rct.o + vulkan/ffv1_dec_setup.o vulkan/ffv1_dec.o VULKAN = $(subst $(SRC_PATH)/,,$(wildcard $(SRC_PATH)/libavcodec/vulkan/*.comp)) .SECONDARY: $(VULKAN:.comp=.c) diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp index 1954c050f8..ae0324cb26 100644 --- a/libavcodec/vulkan/ffv1_dec.comp +++ b/libavcodec/vulkan/ffv1_dec.comp @@ -20,23 +20,69 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -ivec2 get_pred(ivec2 pos, ivec2 off, int p, int sw, uint8_t quant_table_idx) +#ifndef RGB +#define LADDR(p) (p) +#else +#define RGB_LINECACHE 2 +#define RGB_LBUF (RGB_LINECACHE - 1) +#define LADDR(p) (ivec2((p).x, ((p).y & RGB_LBUF))) +#endif + +#ifdef RGB +ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx) +{ + const ivec2 yoff_border1 = off.x == 0 ? 
ivec2(1, -1) : ivec2(0, 0); + + /* Thanks to the same coincidence as below, we can skip checking if off == 0, 1 */ + VTYPE3 top = VTYPE3(TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1, -1) + yoff_border1))[0]), + TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(0, -1)))[0]), + TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(min(1, sw - off.x - 1), -1)))[0])); + + /* Normally, we'd need to check if off != ivec2(0, 0) here, since otherwise, we must + * return zero. However, ivec2(-1, 0) + ivec2(1, -1) == ivec2(0, -1), e.g. previous + * row, 0 offset, same slice, which is zero since we zero out the buffer for RGB */ + TYPE cur = TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1, 0) + yoff_border1))[0]); + + int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] + + quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] + + quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK]; + + if ((quant_table[quant_table_idx][3][127] != 0) || + (quant_table[quant_table_idx][4][127] != 0)) { + TYPE cur2 = TYPE(0); + if (off.x > 0) { + const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0); + cur2 = TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-2, 0) + yoff_border2))[0]); + } + base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK]; + + /* top-2 became current upon swap */ + TYPE top2 = TYPE(imageLoad(dec[p], sp + LADDR(off))[0]); + base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK]; + } + + /* context, prediction */ + return ivec2(base, predict(cur, VTYPE2(top))); +} +#else +ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx) { const ivec2 yoff_border1 = off.x == 0 ? 
ivec2(1, -1) : ivec2(0, 0); + sp += off; VTYPE3 top = VTYPE3(TYPE(0), TYPE(0), TYPE(0)); if (off.y > 0 && off != ivec2(0, 1)) - top[0] = TYPE(imageLoad(dst[p], pos + ivec2(-1, -1) + yoff_border1)[0]); + top[0] = TYPE(imageLoad(dec[p], sp + ivec2(-1, -1) + yoff_border1)[0]); if (off.y > 0) { - top[1] = TYPE(imageLoad(dst[p], pos + ivec2(0, -1))[0]); - top[2] = TYPE(imageLoad(dst[p], pos + ivec2(min(1, sw - off.x - 1), -1))[0]); + top[1] = TYPE(imageLoad(dec[p], sp + ivec2(0, -1))[0]); + top[2] = TYPE(imageLoad(dec[p], sp + ivec2(min(1, sw - off.x - 1), -1))[0]); } TYPE cur = TYPE(0); if (off != ivec2(0, 0)) - cur = TYPE(imageLoad(dst[p], pos + ivec2(-1, 0) + yoff_border1)[0]); + cur = TYPE(imageLoad(dec[p], sp + ivec2(-1, 0) + yoff_border1)[0]); int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] + quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] + @@ -47,19 +93,20 @@ ivec2 get_pred(ivec2 pos, ivec2 off, int p, int sw, uint8_t quant_table_idx) TYPE cur2 = TYPE(0); if (off.x > 0 && off != ivec2(1, 0)) { const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0); - cur2 = TYPE(imageLoad(dst[p], pos + ivec2(-2, 0) + yoff_border2)[0]); + cur2 = TYPE(imageLoad(dec[p], sp + ivec2(-2, 0) + yoff_border2)[0]); } base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK]; TYPE top2 = TYPE(0); if (off.y > 1) - top2 = TYPE(imageLoad(dst[p], pos + ivec2(0, -2))[0]); + top2 = TYPE(imageLoad(dec[p], sp + ivec2(0, -2))[0]); base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK]; } /* context, prediction */ return ivec2(base, predict(cur, VTYPE2(top))); } +#endif #ifndef GOLOMB int get_isymbol(inout RangeCoder c, uint64_t state) @@ -89,11 +136,8 @@ int get_isymbol(inout RangeCoder c, uint64_t state) return get_rac(c, state - 11 + min(e, 10)) ? 
-a : a; } -void decode_line_pcm(inout SliceContext sc, int y, int p, int bits) +void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int bits) { - ivec2 sp = sc.slice_pos; - int w = sc.slice_dim.x; - #ifndef RGB if (p > 0 && p < 3) { w >>= chroma_shift.x; @@ -106,16 +150,14 @@ void decode_line_pcm(inout SliceContext sc, int y, int p, int bits) for (int i = (bits - 1); i >= 0; i--) v |= uint(get_rac_equi(sc.c)) << i; - imageStore(dst[p], sp + ivec2(x, y), uvec4(v)); + imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v)); } } -void decode_line(inout SliceContext sc, uint64_t state, - int y, int p, int bits, const int run_index) +void decode_line(inout SliceContext sc, ivec2 sp, int w, + int y, int p, int bits, uint64_t state, + const int run_index) { - ivec2 sp = sc.slice_pos; - int w = sc.slice_dim.x; - #ifndef RGB if (p > 0 && p < 3) { w >>= chroma_shift.x; @@ -124,7 +166,7 @@ void decode_line(inout SliceContext sc, uint64_t state, #endif for (int x = 0; x < w; x++) { - ivec2 pr = get_pred(sp + ivec2(x, y), ivec2(x, y), p, w, + ivec2 pr = get_pred(sp, ivec2(x, y), p, w, sc.quant_table_idx[p]); int diff = get_isymbol(sc.c, state + CONTEXT_SIZE*abs(pr[0])); @@ -132,18 +174,16 @@ void decode_line(inout SliceContext sc, uint64_t state, diff = -diff; uint v = zero_extend(pr[1] + diff, bits); - imageStore(dst[p], sp + ivec2(x, y), uvec4(v)); + imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v)); } } #else /* GOLOMB */ -void decode_line(inout SliceContext sc, uint64_t state, - int y, int p, int bits, inout int run_index) +void decode_line(inout SliceContext sc, ivec2 sp, int w, + int y, int p, int bits, uint64_t state, + inout int run_index) { - ivec2 sp = sc.slice_pos; - int w = sc.slice_dim.x; - #ifndef RGB if (p > 0 && p < 3) { w >>= chroma_shift.x; @@ -157,7 +197,7 @@ void decode_line(inout SliceContext sc, uint64_t state, for (int x = 0; x < w; x++) { ivec2 pos = sp + ivec2(x, y); int diff; - ivec2 pr = get_pred(sp + ivec2(x, y), ivec2(x, y), 
p, w, + ivec2 pr = get_pred(sp, ivec2(x, y), p, w, sc.quant_table_idx[p]); VlcState sb = VlcState(state + VLC_STATE_SIZE*abs(pr[0])); @@ -202,7 +242,44 @@ void decode_line(inout SliceContext sc, uint64_t state, diff = -diff; uint v = zero_extend(pr[1] + diff, bits); - imageStore(dst[p], sp + ivec2(x, y), uvec4(v)); + imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v)); + } +} +#endif + +#ifdef RGB +ivec4 transform_sample(ivec4 pix, ivec2 rct_coef) +{ + pix.b -= rct_offset; + pix.r -= rct_offset; + pix.g -= (pix.b*rct_coef.y + pix.r*rct_coef.x) >> 2; + pix.b += pix.g; + pix.r += pix.g; + return ivec4(pix[fmt_lut[0]], pix[fmt_lut[1]], + pix[fmt_lut[2]], pix[fmt_lut[3]]); +} + +void writeout_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct) +{ + for (int x = 0; x < w; x++) { + ivec2 lpos = sp + LADDR(ivec2(x, y)); + ivec2 pos = sc.slice_pos + ivec2(x, y); + + ivec4 pix; + pix.r = int(imageLoad(dec[2], lpos)[0]); + pix.g = int(imageLoad(dec[0], lpos)[0]); + pix.b = int(imageLoad(dec[1], lpos)[0]); + if (transparency != 0) + pix.a = int(imageLoad(dec[3], lpos)[0]); + + if (apply_rct) + pix = transform_sample(pix, sc.slice_rct_coef); + + imageStore(dst[0], pos, pix); + if (planar_rgb != 0) { + for (int i = 1; i < color_planes; i++) + imageStore(dst[i], pos, ivec4(pix[i])); + } } } #endif @@ -210,6 +287,8 @@ void decode_line(inout SliceContext sc, uint64_t state, void decode_slice(inout SliceContext sc, const uint slice_idx) { int run_index = 0; + int w = sc.slice_dim.x; + ivec2 sp = sc.slice_pos; #ifndef RGB int bits = bits_per_raw_sample; @@ -217,6 +296,8 @@ void decode_slice(inout SliceContext sc, const uint slice_idx) int bits = 9; if (bits != 8 || sc.slice_coding_mode != 0) bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1); + + sp.y = int(gl_WorkGroupID.y)*RGB_LINECACHE; #endif /* PCM coding */ @@ -229,12 +310,14 @@ void decode_slice(inout SliceContext sc, const uint slice_idx) h >>= chroma_shift.y; for (int y = 0; y < h; y++) - 
decode_line_pcm(sc, y, p, bits); + decode_line_pcm(sc, sp, w, y, p, bits); } #else for (int y = 0; y < sc.slice_dim.y; y++) { for (int p = 0; p < color_planes; p++) - decode_line_pcm(sc, y, p, bits); + decode_line_pcm(sc, sp, w, y, p, bits); + + writeout_rgb(sc, sp, w, y, false); } #endif } else @@ -242,8 +325,9 @@ void decode_slice(inout SliceContext sc, const uint slice_idx) /* Arithmetic coding */ #endif { - uint64_t slice_state_off = uint64_t(slice_state) + - slice_idx*plane_state_size*codec_planes; + u64vec4 slice_state_off = (uint64_t(slice_state) + + slice_idx*plane_state_size*codec_planes) + + plane_state_size*uvec4(0, 1, 1, 2); #ifndef RGB for (int p = 0; p < planes; p++) { @@ -252,18 +336,16 @@ void decode_slice(inout SliceContext sc, const uint slice_idx) h >>= chroma_shift.y; for (int y = 0; y < h; y++) - decode_line(sc, slice_state_off, y, p, bits, run_index); - - /* For the second chroma plane, reuse the first plane's state */ - if (p != 1) - slice_state_off += plane_state_size; + decode_line(sc, sp, w, y, p, bits, + slice_state_off[p], run_index); } #else for (int y = 0; y < sc.slice_dim.y; y++) { for (int p = 0; p < color_planes; p++) - decode_line(sc, - slice_state_off + plane_state_size*((p + 1) >> 1), - y, p, bits, run_index); + decode_line(sc, sp, w, y, p, bits, + slice_state_off[p], run_index); + + writeout_rgb(sc, sp, w, y, true); } #endif } diff --git a/libavcodec/vulkan/ffv1_dec_rct.comp b/libavcodec/vulkan/ffv1_dec_rct.comp deleted file mode 100644 index a550a5fcb8..0000000000 --- a/libavcodec/vulkan/ffv1_dec_rct.comp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * FFv1 codec - * - * Copyright (c) 2025 Lynne <d...@lynne.ee> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -void bypass_block(in SliceContext sc) -{ - ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos; - ivec2 end = sc.slice_pos + sc.slice_dim; - - for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y) { - for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x) { - ivec2 pos = ivec2(x, y); - ivec4 pix; - for (int i = 0; i < color_planes; i++) - pix[i] = int(imageLoad(src[i], pos)[0]); - - imageStore(dst[0], pos, pix); - if (planar_rgb != 0) { - for (int i = 1; i < color_planes; i++) - imageStore(dst[i], pos, ivec4(pix[i])); - } - } - } -} - -void transform_sample(ivec2 pos, ivec2 rct_coef) -{ - ivec4 pix; - pix.r = int(imageLoad(src[2], pos)[0]); - pix.g = int(imageLoad(src[0], pos)[0]); - pix.b = int(imageLoad(src[1], pos)[0]); - if (transparency != 0) - pix.a = int(imageLoad(src[3], pos)[0]); - - pix.b -= offset; - pix.r -= offset; - pix.g -= (pix.b*rct_coef.y + pix.r*rct_coef.x) >> 2; - pix.b += pix.g; - pix.r += pix.g; - - pix = ivec4(pix[fmt_lut[0]], pix[fmt_lut[1]], - pix[fmt_lut[2]], pix[fmt_lut[3]]); - - imageStore(dst[0], pos, pix); - if (planar_rgb != 0) { - for (int i = 1; i < color_planes; i++) - imageStore(dst[i], pos, ivec4(pix[i])); - } -} - -void transform_block(in SliceContext sc) -{ - const ivec2 rct_coef = sc.slice_rct_coef; - const ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos; - const ivec2 end = sc.slice_pos + sc.slice_dim; - - for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y) - for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x) - 
transform_sample(ivec2(x, y), rct_coef); -} - -void main() -{ - const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; - - if (slice_ctx[slice_idx].slice_coding_mode == 1) - bypass_block(slice_ctx[slice_idx]); - else - transform_block(slice_ctx[slice_idx]); -} diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c index e511840a01..5584b72385 100644 --- a/libavcodec/vulkan_ffv1.c +++ b/libavcodec/vulkan_ffv1.c @@ -33,7 +33,6 @@ extern const char *ff_source_ffv1_common_comp; extern const char *ff_source_ffv1_dec_setup_comp; extern const char *ff_source_ffv1_reset_comp; extern const char *ff_source_ffv1_dec_comp; -extern const char *ff_source_ffv1_dec_rct_comp; const FFVulkanDecodeDescriptor ff_vk_dec_ffv1_desc = { .codec_id = AV_CODEC_ID_FFV1, @@ -66,7 +65,6 @@ typedef struct FFv1VulkanDecodeContext { FFVulkanShader setup; FFVulkanShader reset[2]; /* AC/Golomb */ FFVulkanShader decode[2][2][2]; /* 16/32 bit, AC/Golomb, Normal/RGB */ - FFVulkanShader rct[2]; /* 16/32 bit */ FFVkBuffer rangecoder_static_buf; FFVkBuffer quant_buf; @@ -85,11 +83,13 @@ typedef struct FFv1VkParameters { VkDeviceAddress slice_state; VkDeviceAddress scratch_data; + int fmt_lut[4]; uint32_t img_size[2]; uint32_t chroma_shift[2]; uint32_t plane_state_size; uint32_t crcref; + int rct_offset; uint8_t bits_per_raw_sample; uint8_t quant_table_count; @@ -100,6 +100,7 @@ typedef struct FFv1VkParameters { uint8_t codec_planes; uint8_t color_planes; uint8_t transparency; + uint8_t planar_rgb; uint8_t colorspace; uint8_t ec; uint8_t golomb; @@ -116,11 +117,13 @@ static void add_push_data(FFVulkanShader *shd) GLSLC(1, u8buf slice_state; ); GLSLC(1, u8buf scratch_data; ); GLSLC(0, ); + GLSLC(1, ivec4 fmt_lut; ); GLSLC(1, uvec2 img_size; ); GLSLC(1, uvec2 chroma_shift; ); GLSLC(0, ); GLSLC(1, uint plane_state_size; ); GLSLC(1, uint32_t crcref; ); + GLSLC(1, int rct_offset; ); GLSLC(0, ); GLSLC(1, uint8_t bits_per_raw_sample; ); GLSLC(1, uint8_t quant_table_count; ); @@ 
-131,6 +134,7 @@ static void add_push_data(FFVulkanShader *shd) GLSLC(1, uint8_t codec_planes; ); GLSLC(1, uint8_t color_planes; ); GLSLC(1, uint8_t transparency; ); + GLSLC(1, uint8_t planar_rgb; ); GLSLC(1, uint8_t colorspace; ); GLSLC(1, uint8_t ec; ); GLSLC(1, uint8_t golomb; ); @@ -349,11 +353,17 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) return err; if (is_rgb) { - RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, vp->dpb_frame, - VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); RET(ff_vk_create_imageviews(&ctx->s, exec, rct_image_views, vp->dpb_frame, FF_VK_REP_NATIVE)); + RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, vp->dpb_frame, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_CLEAR_BIT)); + ff_vk_frame_barrier(&ctx->s, exec, decode_dst, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); } if (!(f->picture.f->flags & AV_FRAME_FLAG_KEY)) { @@ -391,6 +401,8 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, .pBufferMemoryBarriers = buf_bar, .bufferMemoryBarrierCount = nb_buf_bar, }); @@ -431,6 +443,7 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) .plane_state_size = fp->plane_state_size, .crcref = f->crcref, + .rct_offset = 1 << bits, .bits_per_raw_sample = bits, .quant_table_count = f->quant_table_count, @@ -441,11 +454,23 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) .codec_planes = f->plane_count, .color_planes = color_planes, .transparency = f->transparency, + .planar_rgb = ff_vk_mt_is_np_rgb(sw_format) && + (ff_vk_count_images((AVVkFrame *)f->picture.f->data[0]) > 1), .colorspace = f->colorspace, .ec = f->ec, .golomb = f->ac == AC_GOLOMB_RICE, .check_crc = 
!!(avctx->err_recognition & AV_EF_CRCCHECK), }; + + /* For some reason the C FFv1 encoder/decoder treats these differently */ + if (sw_format == AV_PIX_FMT_GBRP10 || sw_format == AV_PIX_FMT_GBRP12 || + sw_format == AV_PIX_FMT_GBRP14) + memcpy(pd.fmt_lut, (int [4]) { 2, 1, 0, 3 }, 4*sizeof(int)); + else if (sw_format == AV_PIX_FMT_X2BGR10) + memcpy(pd.fmt_lut, (int [4]) { 0, 2, 1, 3 }, 4*sizeof(int)); + else + ff_vk_set_perm(sw_format, pd.fmt_lut, 0); + for (int i = 0; i < MAX_QUANT_TABLES; i++) pd.context_count[i] = f->context_count[i]; @@ -455,6 +480,18 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, 1); + if (is_rgb) { + AVVkFrame *vkf = (AVVkFrame *)vp->dpb_frame->data[0]; + for (int i = 0; i < color_planes; i++) + vk->CmdClearColorImage(exec->buf, vkf->img[i], VK_IMAGE_LAYOUT_GENERAL, + &((VkClearColorValue) { 0 }), + 1, &((VkImageSubresourceRange) { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, + })); + } + /* Reset shader */ reset_shader = &fv->reset[f->ac == AC_GOLOMB_RICE]; ff_vk_shader_update_desc_buffer(&ctx->s, exec, reset_shader, @@ -493,12 +530,15 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) }; vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, .pBufferMemoryBarriers = buf_bar, .bufferMemoryBarrierCount = nb_buf_bar, }); slice_state->stage = buf_bar[0].dstStageMask; slice_state->access = buf_bar[0].dstAccessMask; nb_buf_bar = 0; + nb_img_bar = 0; vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, f->plane_count); @@ -515,6 +555,12 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) 1, 1, VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); + if (is_rgb) + ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader, + f->picture.f, vp->view.out, + 1, 2, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); 
ff_vk_exec_bind_shader(&ctx->s, exec, decode_shader); ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader, @@ -537,12 +583,20 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) }; /* Input frame barrier */ - ff_vk_frame_barrier(&ctx->s, exec, decode_dst, img_bar, &nb_img_bar, + ff_vk_frame_barrier(&ctx->s, exec, f->picture.f, img_bar, &nb_img_bar, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + VK_ACCESS_SHADER_WRITE_BIT | + (!is_rgb ? VK_ACCESS_SHADER_READ_BIT : 0), VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_IGNORED); + if (is_rgb) + ff_vk_frame_barrier(&ctx->s, exec, vp->dpb_frame, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, @@ -558,74 +612,6 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, 1); - /* RCT */ - if (is_rgb) { - FFVulkanShader *rct_shader = &fv->rct[f->use32bit]; - FFv1VkRCTParameters pd_rct; - - ff_vk_shader_update_desc_buffer(&ctx->s, exec, rct_shader, - 1, 0, 0, - slice_state, - 0, fp->slice_data_size*f->slice_count, - VK_FORMAT_UNDEFINED); - ff_vk_shader_update_img_array(&ctx->s, exec, rct_shader, - decode_dst, decode_dst_view, - 1, 1, - VK_IMAGE_LAYOUT_GENERAL, - VK_NULL_HANDLE); - ff_vk_shader_update_img_array(&ctx->s, exec, rct_shader, - f->picture.f, vp->view.out, - 1, 2, - VK_IMAGE_LAYOUT_GENERAL, - VK_NULL_HANDLE); - - ff_vk_exec_bind_shader(&ctx->s, exec, rct_shader); - - pd_rct = (FFv1VkRCTParameters) { - .offset = 1 << bits, - .bits = bits, - .planar_rgb = ff_vk_mt_is_np_rgb(sw_format) && - (ff_vk_count_images((AVVkFrame *)f->picture.f->data[0]) > 1), - .color_planes = color_planes, - .transparency = 
f->transparency, - }; - - /* For some reason the C FFv1 encoder/decoder treats these differently */ - if (sw_format == AV_PIX_FMT_GBRP10 || sw_format == AV_PIX_FMT_GBRP12 || - sw_format == AV_PIX_FMT_GBRP14) - memcpy(pd_rct.fmt_lut, (int [4]) { 2, 1, 0, 3 }, 4*sizeof(int)); - else if (sw_format == AV_PIX_FMT_X2BGR10) - memcpy(pd_rct.fmt_lut, (int [4]) { 0, 2, 1, 3 }, 4*sizeof(int)); - else - ff_vk_set_perm(sw_format, pd_rct.fmt_lut, 0); - - ff_vk_shader_update_push_const(&ctx->s, exec, rct_shader, - VK_SHADER_STAGE_COMPUTE_BIT, - 0, sizeof(pd_rct), &pd_rct); - - ff_vk_frame_barrier(&ctx->s, exec, decode_dst, img_bar, &nb_img_bar, - VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL, - VK_QUEUE_FAMILY_IGNORED); - ff_vk_frame_barrier(&ctx->s, exec, f->picture.f, img_bar, &nb_img_bar, - VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL, - VK_QUEUE_FAMILY_IGNORED); - - vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .pImageMemoryBarriers = img_bar, - .imageMemoryBarrierCount = nb_img_bar, - }); - nb_img_bar = 0; - - vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, 1); - } - err = ff_vk_exec_submit(&ctx->s, exec); if (err < 0) return err; @@ -845,7 +831,9 @@ fail: static int init_decode_shader(FFV1Context *f, FFVulkanContext *s, FFVkExecPool *pool, FFVkSPIRVCompiler *spv, - FFVulkanShader *shd, AVHWFramesContext *frames_ctx, + FFVulkanShader *shd, + AVHWFramesContext *dec_frames_ctx, + AVHWFramesContext *out_frames_ctx, int use32bit, int ac, int rgb) { int err; @@ -910,127 +898,28 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s, .buf_elems = f->max_slice_count, }, { - .name = "dst", - .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - .dimensions = 2, - .mem_layout = ff_vk_shader_rep_fmt(frames_ctx->sw_format, - 
FF_VK_REP_NATIVE), - .elems = av_pix_fmt_count_planes(frames_ctx->sw_format), - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - }, - }; - RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0)); - - GLSLD(ff_source_ffv1_dec_comp); - - RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main", - &spv_opaque)); - RET(ff_vk_shader_link(s, shd, spv_data, spv_len, "main")); - - RET(ff_vk_shader_register_exec(s, pool, shd)); - -fail: - if (spv_opaque) - spv->free_shader(spv, &spv_opaque); - - return err; -} - -static int init_rct_shader(FFV1Context *f, FFVulkanContext *s, - FFVkExecPool *pool, FFVkSPIRVCompiler *spv, - FFVulkanShader *shd, int use32bit, - AVHWFramesContext *src_ctx, AVHWFramesContext *dst_ctx) -{ - int err; - FFVulkanDescriptorSetBinding *desc_set; - - uint8_t *spv_data; - size_t spv_len; - void *spv_opaque = NULL; - int wg_count = sqrt(s->props.properties.limits.maxComputeWorkGroupInvocations); - - RET(ff_vk_shader_init(s, shd, "ffv1_rct", - VK_SHADER_STAGE_COMPUTE_BIT, - (const char *[]) { "GL_EXT_buffer_reference", - "GL_EXT_buffer_reference2" }, 2, - wg_count, wg_count, 1, - 0)); - - /* Common codec header */ - GLSLD(ff_source_common_comp); - - GLSLC(0, layout(push_constant, scalar) uniform pushConstants { ); - GLSLC(1, ivec4 fmt_lut; ); - GLSLC(1, int offset; ); - GLSLC(1, uint8_t bits; ); - GLSLC(1, uint8_t planar_rgb; ); - GLSLC(1, uint8_t color_planes; ); - GLSLC(1, uint8_t transparency; ); - GLSLC(1, uint8_t version; ); - GLSLC(1, uint8_t micro_version; ); - GLSLC(1, uint8_t padding[2]; ); - GLSLC(0, }; ); - ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkRCTParameters), - VK_SHADER_STAGE_COMPUTE_BIT); - - av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES); - av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS); - av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE); - - desc_set = (FFVulkanDescriptorSetBinding []) { - { - .name = "rangecoder_static_buf", - .type 
= VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .mem_layout = "scalar", - .buf_content = "uint8_t zero_one_state[512];", - }, - { - .name = "quant_buf", - .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .mem_layout = "scalar", - .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]" - "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];", - }, - }; - RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 1, 0)); - - define_shared_code(shd, use32bit); - - desc_set = (FFVulkanDescriptorSetBinding []) { - { - .name = "slice_data_buf", - .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .mem_quali = "readonly", - .stages = VK_SHADER_STAGE_COMPUTE_BIT, - .buf_content = "SliceContext slice_ctx", - .buf_elems = f->max_slice_count, - }, - { - .name = "src", + .name = "dec", .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, .dimensions = 2, - .mem_layout = ff_vk_shader_rep_fmt(src_ctx->sw_format, + .mem_layout = ff_vk_shader_rep_fmt(dec_frames_ctx->sw_format, FF_VK_REP_NATIVE), - .mem_quali = "readonly", - .elems = av_pix_fmt_count_planes(src_ctx->sw_format), + .elems = av_pix_fmt_count_planes(dec_frames_ctx->sw_format), .stages = VK_SHADER_STAGE_COMPUTE_BIT, }, { .name = "dst", .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, .dimensions = 2, - .mem_layout = ff_vk_shader_rep_fmt(dst_ctx->sw_format, + .mem_layout = ff_vk_shader_rep_fmt(out_frames_ctx->sw_format, FF_VK_REP_NATIVE), .mem_quali = "writeonly", - .elems = av_pix_fmt_count_planes(dst_ctx->sw_format), + .elems = av_pix_fmt_count_planes(out_frames_ctx->sw_format), .stages = VK_SHADER_STAGE_COMPUTE_BIT, }, }; - RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3, 0, 0)); + RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2 + rgb, 0, 0)); - GLSLD(ff_source_ffv1_dec_rct_comp); + GLSLD(ff_source_ffv1_dec_comp); RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main", &spv_opaque)); @@ -1051,6 +940,7 @@ static int init_indirect(AVCodecContext *avctx, 
FFVulkanContext *s, int err; AVHWFramesContext *frames_ctx; AVVulkanFramesContext *vk_frames; + FFV1Context *f = avctx->priv_data; *dst = av_hwframe_ctx_alloc(s->device_ref); if (!(*dst)) @@ -1059,13 +949,14 @@ static int init_indirect(AVCodecContext *avctx, FFVulkanContext *s, frames_ctx = (AVHWFramesContext *)((*dst)->data); frames_ctx->format = AV_PIX_FMT_VULKAN; frames_ctx->sw_format = sw_format; - frames_ctx->width = FFALIGN(s->frames->width, 32); - frames_ctx->height = FFALIGN(s->frames->height, 32); + frames_ctx->width = s->frames->width; + frames_ctx->height = f->num_v_slices*2; vk_frames = frames_ctx->hwctx; vk_frames->tiling = VK_IMAGE_TILING_OPTIMAL; - vk_frames->usage = VK_IMAGE_USAGE_STORAGE_BIT; vk_frames->img_flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + vk_frames->usage = VK_IMAGE_USAGE_STORAGE_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT; err = av_hwframe_ctx_init(*dst); if (err < 0) { @@ -1095,9 +986,6 @@ static void vk_decode_ffv1_uninit(FFVulkanDecodeShared *ctx) for (int k = 0; k < 2; k++) /* Normal/RGB */ ff_vk_shader_free(&ctx->s, &fv->decode[i][j][k]); - for (int i = 0; i < 2; i++) /* 16/32 bit */ - ff_vk_shader_free(&ctx->s, &fv->rct[i]); - ff_vk_free_buf(&ctx->s, &fv->quant_buf); ff_vk_free_buf(&ctx->s, &fv->rangecoder_static_buf); ff_vk_free_buf(&ctx->s, &fv->crc_tab_buf); @@ -1165,12 +1053,13 @@ static int vk_decode_ffv1_init(AVCodecContext *avctx) for (int i = 0; i < 2; i++) { /* 16/32 bit */ for (int j = 0; j < 2; j++) { /* AC/Golomb */ for (int k = 0; k < 2; k++) { /* Normal/RGB */ - AVHWFramesContext *frames_ctx; - frames_ctx = k ? (AVHWFramesContext *)fv->intermediate_frames_ref[i]->data : - (AVHWFramesContext *)avctx->hw_frames_ctx->data; + AVHWFramesContext *dec_frames_ctx; + dec_frames_ctx = k ? 
(AVHWFramesContext *)fv->intermediate_frames_ref[i]->data : + (AVHWFramesContext *)avctx->hw_frames_ctx->data; err = init_decode_shader(f, &ctx->s, &ctx->exec_pool, spv, &fv->decode[i][j][k], - frames_ctx, + dec_frames_ctx, + (AVHWFramesContext *)avctx->hw_frames_ctx->data, i, !j ? AC_RANGE_CUSTOM_TAB : AC_GOLOMB_RICE, k); @@ -1180,16 +1069,6 @@ static int vk_decode_ffv1_init(AVCodecContext *avctx) } } - /* RCT shaders */ - for (int i = 0; i < 2; i++) { /* 16/32 bit */ - err = init_rct_shader(f, &ctx->s, &ctx->exec_pool, - spv, &fv->rct[i], i, - (AVHWFramesContext *)fv->intermediate_frames_ref[i]->data, - (AVHWFramesContext *)avctx->hw_frames_ctx->data); - if (err < 0) - return err; - } - /* Range coder data */ err = ff_ffv1_vk_init_state_transition_data(&ctx->s, &fv->rangecoder_static_buf, diff --git a/libavutil/vulkan_functions.h b/libavutil/vulkan_functions.h index 85279dd082..8f2bbb38c9 100644 --- a/libavutil/vulkan_functions.h +++ b/libavutil/vulkan_functions.h @@ -147,6 +147,7 @@ typedef uint64_t FFVulkanExtensions; MACRO(1, 1, FF_VK_EXT_NO_FLAG, CmdPipelineBarrier) \ MACRO(1, 1, FF_VK_EXT_NO_FLAG, CmdCopyBufferToImage) \ MACRO(1, 1, FF_VK_EXT_NO_FLAG, CmdCopyImageToBuffer) \ + MACRO(1, 1, FF_VK_EXT_NO_FLAG, CmdClearColorImage) \ MACRO(1, 1, FF_VK_EXT_NO_FLAG, CmdCopyBuffer) \ \ /* Buffer */ \ -- 2.47.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".