This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit cebe0b577e5dc8b74f5e1de0b4cbc7c39dddf2c5 Author: IndecisiveTurtle <[email protected]> AuthorDate: Wed Sep 3 22:28:34 2025 +0300 Commit: Lynne <[email protected]> CommitDate: Thu Mar 5 14:02:39 2026 +0000 lavc: implement a Vulkan-based prores encoder Adds a vulkan implementation of the reference prores kostya encoder. Provides about 3-4x speedup over the CPU code --- configure | 1 + libavcodec/Makefile | 1 + libavcodec/allcodecs.c | 1 + libavcodec/proresenc_kostya_vulkan.c | 1015 ++++++++++++++++++++ libavcodec/vulkan/Makefile | 6 + libavcodec/vulkan/dct.glsl | 49 + libavcodec/vulkan/prores_ks_alpha_data.comp.glsl | 87 ++ libavcodec/vulkan/prores_ks_encode_slice.comp.glsl | 273 ++++++ .../vulkan/prores_ks_estimate_slice.comp.glsl | 302 ++++++ libavcodec/vulkan/prores_ks_slice_data.comp.glsl | 179 ++++ libavcodec/vulkan/prores_ks_trellis_node.comp.glsl | 194 ++++ 11 files changed, 2108 insertions(+) diff --git a/configure b/configure index b81b7d40a2..1bfc08bc69 100755 --- a/configure +++ b/configure @@ -3239,6 +3239,7 @@ prores_decoder_select="blockdsp idctdsp" prores_encoder_select="fdctdsp" prores_aw_encoder_select="fdctdsp" prores_ks_encoder_select="fdctdsp" +prores_ks_vulkan_encoder_select="vulkan spirv_compiler" prores_raw_decoder_select="blockdsp idctdsp" qcelp_decoder_select="lsp" qdm2_decoder_select="mpegaudiodsp" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index e19a69d884..7b57fe4564 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -648,6 +648,7 @@ OBJS-$(CONFIG_PRORES_DECODER) += proresdec.o proresdsp.o proresdata.o OBJS-$(CONFIG_PRORES_ENCODER) += proresenc_anatoliy.o proresdata.o OBJS-$(CONFIG_PRORES_AW_ENCODER) += proresenc_anatoliy.o proresdata.o OBJS-$(CONFIG_PRORES_KS_ENCODER) += proresenc_kostya.o proresdata.o proresenc_kostya_common.o +OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += proresenc_kostya_vulkan.o proresdata.o proresenc_kostya_common.o OBJS-$(CONFIG_PRORES_RAW_DECODER) += prores_raw.o proresdsp.o proresdata.o 
OBJS-$(CONFIG_PRORES_VIDEOTOOLBOX_ENCODER) += videotoolboxenc.o OBJS-$(CONFIG_PROSUMER_DECODER) += prosumer.o diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 042b07c895..ad22162b0e 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -271,6 +271,7 @@ extern const FFCodec ff_prores_encoder; extern const FFCodec ff_prores_decoder; extern const FFCodec ff_prores_aw_encoder; extern const FFCodec ff_prores_ks_encoder; +extern const FFCodec ff_prores_ks_vulkan_encoder; extern const FFCodec ff_prores_raw_decoder; extern const FFCodec ff_prosumer_decoder; extern const FFCodec ff_psd_decoder; diff --git a/libavcodec/proresenc_kostya_vulkan.c b/libavcodec/proresenc_kostya_vulkan.c new file mode 100644 index 0000000000..11b11675b7 --- /dev/null +++ b/libavcodec/proresenc_kostya_vulkan.c @@ -0,0 +1,1015 @@ +/* + * Apple ProRes encoder + * + * Copyright (c) 2011 Anatoliy Wasserman + * Copyright (c) 2012 Konstantin Shishkov + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/buffer.h" +#include "libavutil/macros.h" +#include "libavutil/mem.h" +#include "libavutil/mem_internal.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/vulkan_spirv.h" +#include "libavutil/hwcontext_vulkan.h" +#include "libavutil/vulkan_loader.h" +#include "libavutil/vulkan.h" +#include "avcodec.h" +#include "codec.h" +#include "codec_internal.h" +#include "encode.h" +#include "packet.h" +#include "put_bits.h" +#include "profiles.h" +#include "bytestream.h" +#include "proresdata.h" +#include "proresenc_kostya_common.h" +#include "hwconfig.h" + +#define DCTSIZE 8 + +typedef struct ProresDataTables { + int16_t qmat[128][64]; + int16_t qmat_chroma[128][64]; +} ProresDataTables; + +typedef struct SliceDataInfo { + int plane; + int line_add; + int bits_per_sample; +} SliceDataInfo; + +typedef struct EncodeSliceInfo { + VkDeviceAddress bytestream; + VkDeviceAddress seek_table; +} EncodeSliceInfo; + +typedef struct SliceData { + uint32_t mbs_per_slice; + int16_t rows[MAX_PLANES * MAX_MBS_PER_SLICE * 256]; +} SliceData; + +typedef struct SliceScore { + int bits[MAX_STORED_Q][4]; + int error[MAX_STORED_Q][4]; + int total_bits[MAX_STORED_Q]; + int total_error[MAX_STORED_Q]; + int overquant; + int buf_start; + int quant; +} SliceScore; + +typedef struct VulkanEncodeProresFrameData { + /* Intermediate buffers */ + AVBufferRef *out_data_ref[2]; + AVBufferRef *slice_data_ref[2]; + AVBufferRef *slice_score_ref[2]; + AVBufferRef *frame_size_ref[2]; + + /* Copied from the source */ + int64_t pts; + int64_t duration; + void *frame_opaque; + AVBufferRef *frame_opaque_ref; + enum AVColorTransferCharacteristic color_trc; + enum AVColorSpace colorspace; + enum AVColorPrimaries color_primaries; + int 
key_frame; + int flags; +} VulkanEncodeProresFrameData; + +typedef struct ProresVulkanContext { + ProresContext ctx; + + /* Vulkan state */ + FFVulkanContext vkctx; + AVVulkanDeviceQueueFamily *qf; + FFVkExecPool e; + AVVulkanDeviceQueueFamily *transfer_qf; + FFVkExecPool transfer_exec_pool; + AVBufferPool *pkt_buf_pool; + AVBufferPool *slice_data_buf_pool; + AVBufferPool *slice_score_buf_pool; + AVBufferPool *frame_size_buf_pool; + + FFVulkanShader alpha_data_shd; + FFVulkanShader slice_data_shd[2]; + FFVulkanShader estimate_slice_shd; + FFVulkanShader encode_slice_shd; + FFVulkanShader trellis_node_shd; + FFVkBuffer prores_data_tables_buf; + + int *slice_quants; + SliceScore *slice_scores; + ProresDataTables *tables; + + int in_flight; + int async_depth; + AVFrame *frame; + VulkanEncodeProresFrameData *exec_ctx_info; +} ProresVulkanContext; + +extern const unsigned char ff_prores_ks_alpha_data_comp_spv_data[]; +extern const unsigned int ff_prores_ks_alpha_data_comp_spv_len; + +extern const unsigned char ff_prores_ks_slice_data_comp_spv_data[]; +extern const unsigned int ff_prores_ks_slice_data_comp_spv_len; + +extern const unsigned char ff_prores_ks_estimate_slice_comp_spv_data[]; +extern const unsigned int ff_prores_ks_estimate_slice_comp_spv_len; + +extern const unsigned char ff_prores_ks_trellis_node_comp_spv_data[]; +extern const unsigned int ff_prores_ks_trellis_node_comp_spv_len; + +extern const unsigned char ff_prores_ks_encode_slice_comp_spv_data[]; +extern const unsigned int ff_prores_ks_encode_slice_comp_spv_len; + +static int init_slice_data_pipeline(ProresVulkanContext *pv, FFVulkanShader *shd, int blocks_per_mb) +{ + int err = 0; + FFVulkanContext *vkctx = &pv->vkctx; + FFVulkanDescriptorSetBinding *desc; + + SPEC_LIST_CREATE(sl, 5, 5 * sizeof(uint32_t)) + SPEC_LIST_ADD(sl, 0, 32, pv->ctx.mbs_per_slice); + SPEC_LIST_ADD(sl, 1, 32, blocks_per_mb); + SPEC_LIST_ADD(sl, 2, 32, pv->ctx.mb_width); + SPEC_LIST_ADD(sl, 3, 32, pv->ctx.pictures_per_frame); + 
SPEC_LIST_ADD(sl, 16, 32, blocks_per_mb * pv->ctx.mbs_per_slice); /* nb_blocks */ + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl, + (uint32_t []) { DCTSIZE, blocks_per_mb, pv->ctx.mbs_per_slice }, 0); + + desc = (FFVulkanDescriptorSetBinding []) { + { + .name = "SliceBuffer", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "planes", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .elems = av_pix_fmt_count_planes(vkctx->frames->sw_format), + }, + }; + RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0)); + + ff_vk_shader_add_push_const(shd, 0, sizeof(SliceDataInfo), VK_SHADER_STAGE_COMPUTE_BIT); + + RET(ff_vk_shader_link(vkctx, shd, + ff_prores_ks_slice_data_comp_spv_data, + ff_prores_ks_slice_data_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd)); + +fail: + return err; +} + +static int init_alpha_data_pipeline(ProresVulkanContext *pv, FFVulkanShader* shd) +{ + int err = 0; + FFVulkanContext *vkctx = &pv->vkctx; + FFVulkanDescriptorSetBinding *desc; + + SPEC_LIST_CREATE(sl, 4, 4 * sizeof(uint32_t)) + SPEC_LIST_ADD(sl, 0, 32, pv->ctx.alpha_bits); + SPEC_LIST_ADD(sl, 1, 32, pv->ctx.slices_width); + SPEC_LIST_ADD(sl, 2, 32, pv->ctx.mb_width); + SPEC_LIST_ADD(sl, 3, 32, pv->ctx.mbs_per_slice); + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl, + (uint32_t []) { 16, 16, 1 }, 0); + + desc = (FFVulkanDescriptorSetBinding []) { + { + .name = "SliceBuffer", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "plane", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0)); + + RET(ff_vk_shader_link(vkctx, shd, + ff_prores_ks_alpha_data_comp_spv_data, + ff_prores_ks_alpha_data_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd)); + +fail: 
+ return err; +} + +static int init_estimate_slice_pipeline(ProresVulkanContext *pv, FFVulkanShader* shd) +{ + int err = 0; + FFVulkanContext *vkctx = &pv->vkctx; + FFVulkanDescriptorSetBinding *desc; + int subgroup_size = vkctx->subgroup_props.maxSubgroupSize; + int dim_x = pv->ctx.alpha_bits ? subgroup_size : (subgroup_size / 3) * 3; + + SPEC_LIST_CREATE(sl, 8, 8 * sizeof(uint32_t)) + SPEC_LIST_ADD(sl, 0, 32, pv->ctx.mbs_per_slice); + SPEC_LIST_ADD(sl, 1, 32, pv->ctx.chroma_factor); + SPEC_LIST_ADD(sl, 2, 32, pv->ctx.alpha_bits); + SPEC_LIST_ADD(sl, 3, 32, pv->ctx.num_planes); + SPEC_LIST_ADD(sl, 4, 32, pv->ctx.slices_per_picture); + SPEC_LIST_ADD(sl, 5, 32, pv->ctx.force_quant ? 0 : pv->ctx.profile_info->min_quant); + SPEC_LIST_ADD(sl, 6, 32, pv->ctx.force_quant ? 0 : pv->ctx.profile_info->max_quant); + SPEC_LIST_ADD(sl, 7, 32, pv->ctx.bits_per_mb); + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl, + (uint32_t []) { dim_x, 1, 1 }, 0); + + desc = (FFVulkanDescriptorSetBinding []) { + { + .name = "SliceBuffer", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "SliceScores", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "ProresDataTables", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 3, 0, 0)); + + RET(ff_vk_shader_link(vkctx, shd, + ff_prores_ks_estimate_slice_comp_spv_data, + ff_prores_ks_estimate_slice_comp_spv_len, "main")); + RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd)); + +fail: + return err; +} + +static int init_trellis_node_pipeline(ProresVulkanContext *pv, FFVulkanShader* shd) +{ + int err = 0; + FFVulkanContext *vkctx = &pv->vkctx; + FFVulkanDescriptorSetBinding *desc; + int subgroup_size = vkctx->subgroup_props.maxSubgroupSize; + int num_subgroups = FFALIGN(pv->ctx.mb_height, subgroup_size) / subgroup_size; + + 
SPEC_LIST_CREATE(sl, 8, 8 * sizeof(uint32_t)) + SPEC_LIST_ADD(sl, 0, 32, pv->ctx.slices_width); + SPEC_LIST_ADD(sl, 1, 32, num_subgroups); + SPEC_LIST_ADD(sl, 2, 32, pv->ctx.num_planes); + SPEC_LIST_ADD(sl, 3, 32, pv->ctx.force_quant); + SPEC_LIST_ADD(sl, 4, 32, pv->ctx.profile_info->min_quant); + SPEC_LIST_ADD(sl, 5, 32, pv->ctx.profile_info->max_quant); + SPEC_LIST_ADD(sl, 6, 32, pv->ctx.mbs_per_slice); + SPEC_LIST_ADD(sl, 7, 32, pv->ctx.bits_per_mb); + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl, + (uint32_t []) { pv->ctx.mb_height, 1, 1 }, 0); + + desc = (FFVulkanDescriptorSetBinding []) { + { + .name = "FrameSize", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "SliceScores", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 2, 0, 0)); + + RET(ff_vk_shader_link(vkctx, shd, + ff_prores_ks_trellis_node_comp_spv_data, + ff_prores_ks_trellis_node_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd)); + +fail: + return err; +} + +static int init_encode_slice_pipeline(ProresVulkanContext *pv, FFVulkanShader* shd) +{ + int err = 0; + FFVulkanContext *vkctx = &pv->vkctx; + FFVulkanDescriptorSetBinding *desc; + + SPEC_LIST_CREATE(sl, 6, 6 * sizeof(uint32_t)) + SPEC_LIST_ADD(sl, 0, 32, pv->ctx.mbs_per_slice); + SPEC_LIST_ADD(sl, 1, 32, pv->ctx.chroma_factor); + SPEC_LIST_ADD(sl, 2, 32, pv->ctx.alpha_bits); + SPEC_LIST_ADD(sl, 3, 32, pv->ctx.num_planes); + SPEC_LIST_ADD(sl, 4, 32, pv->ctx.slices_per_picture); + SPEC_LIST_ADD(sl, 5, 32, pv->ctx.force_quant ? 
pv->ctx.force_quant : pv->ctx.profile_info->max_quant); + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl, + (uint32_t []) { 64, 1, 1 }, 0); + + desc = (FFVulkanDescriptorSetBinding []) { + { + .name = "SliceBuffer", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "SliceScores", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "ProresDataTables", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 3, 0, 0)); + + ff_vk_shader_add_push_const(shd, 0, sizeof(EncodeSliceInfo), VK_SHADER_STAGE_COMPUTE_BIT); + + RET(ff_vk_shader_link(vkctx, shd, + ff_prores_ks_encode_slice_comp_spv_data, + ff_prores_ks_encode_slice_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(vkctx, &pv->e, shd)); + +fail: + return err; +} + +static int vulkan_encode_prores_submit_frame(AVCodecContext *avctx, FFVkExecContext *exec, + AVFrame *frame, int picture_idx) +{ + ProresVulkanContext *pv = avctx->priv_data; + ProresContext *ctx = &pv->ctx; + VulkanEncodeProresFrameData *pd = exec->opaque; + FFVulkanContext *vkctx = &pv->vkctx; + FFVulkanFunctions *vk = &vkctx->vkfn; + int err = 0, nb_img_bar = 0, i, is_chroma; + int min_quant = ctx->profile_info->min_quant; + int max_quant = ctx->profile_info->max_quant; + int subgroup_size = vkctx->subgroup_props.maxSubgroupSize; + int estimate_dim_x = ctx->alpha_bits ? 
subgroup_size : (subgroup_size / 3) * 3; + int transfer_slices = vkctx->extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(vkctx->frames->sw_format); + VkImageView views[AV_NUM_DATA_POINTERS]; + VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS]; + FFVkBuffer *pkt_vk_buf, *slice_data_buf, *slice_score_buf, *frame_size_buf; + SliceDataInfo slice_data_info; + EncodeSliceInfo encode_info; + FFVulkanShader *shd; + + /* Start recording */ + ff_vk_exec_start(vkctx, exec); + + /* Get a pooled buffer for writing output data */ + RET(ff_vk_get_pooled_buffer(vkctx, &pv->pkt_buf_pool, &pd->out_data_ref[picture_idx], + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, NULL, + ctx->frame_size_upper_bound + FF_INPUT_BUFFER_MIN_SIZE, + transfer_slices ? VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT + : (VK_MEMORY_PROPERTY_HOST_CACHED_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))); + pkt_vk_buf = (FFVkBuffer*)pd->out_data_ref[picture_idx]->data; + ff_vk_exec_add_dep_buf(vkctx, exec, &pd->out_data_ref[picture_idx], 1, 1); + + /* Allocate buffer for writing slice data */ + RET(ff_vk_get_pooled_buffer(vkctx, &pv->slice_data_buf_pool, &pd->slice_data_ref[picture_idx], + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, + ctx->slices_per_picture * sizeof(SliceData), + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); + slice_data_buf = (FFVkBuffer*)pd->slice_data_ref[picture_idx]->data; + ff_vk_exec_add_dep_buf(vkctx, exec, &pd->slice_data_ref[picture_idx], 1, 1); + + /* Allocate buffer for writing slice scores */ + RET(ff_vk_get_pooled_buffer(vkctx, &pv->slice_score_buf_pool, &pd->slice_score_ref[picture_idx], + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, + ctx->slices_per_picture * sizeof(SliceScore), + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); + slice_score_buf = 
(FFVkBuffer*)pd->slice_score_ref[picture_idx]->data; + ff_vk_exec_add_dep_buf(vkctx, exec, &pd->slice_score_ref[picture_idx], 1, 1); + + /* Allocate buffer for writing frame size */ + RET(ff_vk_get_pooled_buffer(vkctx, &pv->frame_size_buf_pool, &pd->frame_size_ref[picture_idx], + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, + sizeof(int), + VK_MEMORY_PROPERTY_HOST_CACHED_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)); + frame_size_buf = (FFVkBuffer*)pd->frame_size_ref[picture_idx]->data; + ff_vk_exec_add_dep_buf(vkctx, exec, &pd->frame_size_ref[picture_idx], 1, 1); + + /* Generate barriers and image views for frame images. */ + RET(ff_vk_exec_add_dep_frame(vkctx, exec, frame, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + RET(ff_vk_create_imageviews(vkctx, exec, views, frame, FF_VK_REP_INT)); + ff_vk_frame_barrier(vkctx, exec, frame, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + /* Submit the image barriers. */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + + /* Apply FDCT on input image data for future passes */ + slice_data_info = (SliceDataInfo) { + .line_add = ctx->pictures_per_frame == 1 ? 
0 : picture_idx ^ !(frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST), + }; + for (i = 0; i < ctx->num_planes; i++) { + is_chroma = (i == 1 || i == 2); + shd = &pv->slice_data_shd[!is_chroma || ctx->chroma_factor == CFACTOR_Y444]; + if (i < 3) { + slice_data_info.plane = i; + slice_data_info.bits_per_sample = desc->comp[i].depth; + ff_vk_shader_update_desc_buffer(vkctx, exec, shd, 0, 0, 0, + slice_data_buf, 0, slice_data_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_img_array(vkctx, exec, shd, frame, views, 0, 1, + VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); + ff_vk_exec_bind_shader(vkctx, exec, shd); + ff_vk_shader_update_push_const(vkctx, exec, shd, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(SliceDataInfo), &slice_data_info); + vk->CmdDispatch(exec->buf, ctx->slices_width, ctx->mb_height, 1); + } else { + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->alpha_data_shd, 0, 0, 0, + slice_data_buf, 0, slice_data_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_img(vkctx, exec, &pv->alpha_data_shd, 0, 1, 0, views[3], + VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); + ff_vk_exec_bind_shader(vkctx, exec, &pv->alpha_data_shd); + vk->CmdDispatch(exec->buf, ctx->mb_width, ctx->mb_height, 1); + } + } + + /* Wait for writes to slice buffer. 
*/ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .pNext = NULL, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_data_buf->buf, + .offset = 0, + .size = slice_data_buf->size, + }, + .bufferMemoryBarrierCount = 1, + }); + + /* Estimate slice bits and error for each quant */ + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 0, 0, + slice_data_buf, 0, slice_data_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 1, 0, + slice_score_buf, 0, slice_score_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->estimate_slice_shd, 0, 2, 0, + &pv->prores_data_tables_buf, 0, pv->prores_data_tables_buf.size, + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(vkctx, exec, &pv->estimate_slice_shd); + vk->CmdDispatch(exec->buf, (ctx->slices_per_picture * ctx->num_planes + estimate_dim_x - 1) / estimate_dim_x, + ctx->force_quant ? 1 : (max_quant - min_quant + 1), 1); + + /* Wait for writes to score buffer. 
*/ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .pNext = NULL, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = slice_score_buf->buf, + .offset = 0, + .size = slice_score_buf->size, + }, + .bufferMemoryBarrierCount = 1, + }); + + /* Compute optimal quant value for each slice */ + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->trellis_node_shd, 0, 0, 0, + frame_size_buf, 0, frame_size_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->trellis_node_shd, 0, 1, 0, + slice_score_buf, 0, slice_score_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(vkctx, exec, &pv->trellis_node_shd); + vk->CmdDispatch(exec->buf, 1, 1, 1); + + /* Wait for writes to quant buffer. */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = & (VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .pNext = NULL, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_SHADER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = frame_size_buf->buf, + .offset = 0, + .size = frame_size_buf->size, + }, + .bufferMemoryBarrierCount = 1, + }); + + /* Encode slices. 
*/ + encode_info = (EncodeSliceInfo) { + .seek_table = pkt_vk_buf->address, + .bytestream = pkt_vk_buf->address + ctx->slices_per_picture * 2, + }; + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 0, 0, + slice_data_buf, 0, slice_data_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 1, 0, + slice_score_buf, 0, slice_score_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &pv->encode_slice_shd, 0, 2, 0, + &pv->prores_data_tables_buf, 0, pv->prores_data_tables_buf.size, + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(vkctx, exec, &pv->encode_slice_shd); + ff_vk_shader_update_push_const(vkctx, exec, &pv->encode_slice_shd, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(encode_info), &encode_info); + vk->CmdDispatch(exec->buf, FFALIGN(ctx->slices_per_picture, 64) / 64, + ctx->num_planes, 1); + +fail: + return err; +} + +static uint8_t *write_frame_header(AVCodecContext *avctx, ProresContext *ctx, + uint8_t **orig_buf, int flags, + enum AVColorPrimaries color_primaries, + enum AVColorTransferCharacteristic color_trc, + enum AVColorSpace colorspace) +{ + uint8_t *buf, *tmp; + uint8_t frame_flags; + + // frame atom + *orig_buf += 4; // frame size + bytestream_put_be32 (orig_buf, FRAME_ID); // frame container ID + buf = *orig_buf; + + // frame header + tmp = buf; + buf += 2; // frame header size will be stored here + bytestream_put_be16 (&buf, ctx->chroma_factor != CFACTOR_Y422 || ctx->alpha_bits ? 1 : 0); + bytestream_put_buffer(&buf, (uint8_t*)ctx->vendor, 4); + bytestream_put_be16 (&buf, avctx->width); + bytestream_put_be16 (&buf, avctx->height); + + frame_flags = ctx->chroma_factor << 6; + if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) + frame_flags |= (flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 
0x04 : 0x08; + bytestream_put_byte (&buf, frame_flags); + + bytestream_put_byte (&buf, 0); // reserved + bytestream_put_byte (&buf, color_primaries); + bytestream_put_byte (&buf, color_trc); + bytestream_put_byte (&buf, colorspace); + bytestream_put_byte (&buf, ctx->alpha_bits >> 3); + bytestream_put_byte (&buf, 0); // reserved + if (ctx->quant_sel != QUANT_MAT_DEFAULT) { + bytestream_put_byte (&buf, 0x03); // matrix flags - both matrices are present + bytestream_put_buffer(&buf, ctx->quant_mat, 64); // luma quantisation matrix + bytestream_put_buffer(&buf, ctx->quant_chroma_mat, 64); // chroma quantisation matrix + } else { + bytestream_put_byte (&buf, 0x00); // matrix flags - default matrices are used + } + bytestream_put_be16 (&tmp, buf - *orig_buf); // write back frame header size + return buf; +} + +static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, AVPacket *pkt) +{ + ProresVulkanContext *pv = avctx->priv_data; + ProresContext *ctx = &pv->ctx; + VulkanEncodeProresFrameData *pd = exec->opaque; + FFVulkanContext *vkctx = &pv->vkctx; + FFVulkanFunctions *vk = &vkctx->vkfn; + FFVkExecContext *transfer_exec; + uint8_t *orig_buf, *buf, *slice_sizes; + uint8_t *picture_size_pos; + int picture_idx, err = 0; + int frame_size, picture_size; + int pkt_size = ctx->frame_size_upper_bound; + int transfer_slices = vkctx->extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY; + FFVkBuffer *out_data_buf, *frame_size_buf; + VkMappedMemoryRange invalidate_data; + AVBufferRef *mapped_ref; + FFVkBuffer *mapped_buf; + + /* Allocate packet */ + RET(ff_get_encode_buffer(avctx, pkt, pkt_size + FF_INPUT_BUFFER_MIN_SIZE, 0)); + + /* Initialize packet. 
*/ + pkt->pts = pd->pts; + pkt->dts = pd->pts; + pkt->duration = pd->duration; + pkt->flags |= AV_PKT_FLAG_KEY * pd->key_frame; + + if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) { + pkt->opaque = pd->frame_opaque; + pkt->opaque_ref = pd->frame_opaque_ref; + pd->frame_opaque_ref = NULL; + } + + /* Write frame atom */ + orig_buf = pkt->data; + buf = write_frame_header(avctx, ctx, &orig_buf, pd->flags, + pd->color_primaries, pd->color_trc, + pd->colorspace); + + /* Make sure encoding's done */ + ff_vk_exec_wait(vkctx, exec); + + /* Roll transfer execution context */ + if (transfer_slices) { + RET(ff_vk_host_map_buffer(vkctx, &mapped_ref, pkt->data, pkt->buf, + VK_BUFFER_USAGE_TRANSFER_DST_BIT)); + mapped_buf = (FFVkBuffer *)mapped_ref->data; + transfer_exec = ff_vk_exec_get(vkctx, &pv->transfer_exec_pool); + ff_vk_exec_start(vkctx, transfer_exec); + } + + for (picture_idx = 0; picture_idx < ctx->pictures_per_frame; picture_idx++) { + /* Fetch buffers for the current picture. */ + out_data_buf = (FFVkBuffer *)pd->out_data_ref[picture_idx]->data; + frame_size_buf = (FFVkBuffer *)pd->frame_size_ref[picture_idx]->data; + + /* Invalidate slice/output data if needed */ + invalidate_data = (VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .offset = 0, + .size = VK_WHOLE_SIZE, + }; + if (!(frame_size_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { + invalidate_data.memory = frame_size_buf->mem; + vk->InvalidateMappedMemoryRanges(vkctx->hwctx->act_dev, 1, &invalidate_data); + } + + /* Write picture header */ + picture_size_pos = buf + 1; + bytestream_put_byte(&buf, 0x40); // picture header size (in bits) + buf += 4; // picture data size will be stored here + bytestream_put_be16(&buf, ctx->slices_per_picture); + bytestream_put_byte(&buf, av_log2(ctx->mbs_per_slice) << 4); // slice width and height in MBs + + /* Skip over seek table */ + slice_sizes = buf; + buf += ctx->slices_per_picture * 2; + + /* Calculate final size */ + buf += 
*(int*)frame_size_buf->mapped_mem; + + if (transfer_slices) { + /* Perform host mapped transfer of slice data */ + ff_vk_exec_add_dep_buf(vkctx, transfer_exec, &pd->out_data_ref[picture_idx], 1, 0); + ff_vk_exec_add_dep_buf(vkctx, transfer_exec, &mapped_ref, 1, 0); + vk->CmdCopyBuffer(transfer_exec->buf, out_data_buf->buf, mapped_buf->buf, 1, & (VkBufferCopy) { + .srcOffset = 0, + .dstOffset = mapped_buf->virtual_offset + slice_sizes - pkt->data, + .size = buf - slice_sizes, + }); + } else { + /* Fallback to regular memcpy if transfer is not available */ + if (!(out_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { + invalidate_data.memory = out_data_buf->mem; + vk->InvalidateMappedMemoryRanges(vkctx->hwctx->act_dev, 1, &invalidate_data); + } + memcpy(slice_sizes, out_data_buf->mapped_mem, buf - slice_sizes); + av_buffer_unref(&pd->out_data_ref[picture_idx]); + } + + /* Write picture size with header */ + picture_size = buf - (picture_size_pos - 1); + bytestream_put_be32(&picture_size_pos, picture_size); + + /* Slice output buffers no longer needed */ + av_buffer_unref(&pd->slice_data_ref[picture_idx]); + av_buffer_unref(&pd->slice_score_ref[picture_idx]); + av_buffer_unref(&pd->frame_size_ref[picture_idx]); + } + + /* Write frame size in header */ + orig_buf -= 8; + frame_size = buf - orig_buf; + bytestream_put_be32(&orig_buf, frame_size); + + av_shrink_packet(pkt, frame_size); + av_log(avctx, AV_LOG_VERBOSE, "Encoded data: %iMiB\n", pkt->size / (1024*1024)); + + /* Wait for slice transfer */ + if (transfer_slices) { + RET(ff_vk_exec_submit(vkctx, transfer_exec)); + ff_vk_exec_wait(vkctx, transfer_exec); + } + +fail: + return err; +} + +static int vulkan_encode_prores_receive_packet(AVCodecContext *avctx, AVPacket *pkt) +{ + int err; + ProresVulkanContext *pv = avctx->priv_data; + ProresContext *ctx = &pv->ctx; + VulkanEncodeProresFrameData *pd; + FFVkExecContext *exec; + AVFrame *frame; + + while (1) { + /* Roll an execution context */ + exec = 
ff_vk_exec_get(&pv->vkctx, &pv->e); + + /* If it had a frame, immediately output it */ + if (exec->had_submission) { + exec->had_submission = 0; + pv->in_flight--; + return get_packet(avctx, exec, pkt); + } + + /* Get next frame to encode */ + frame = pv->frame; + err = ff_encode_get_frame(avctx, frame); + if (err < 0 && err != AVERROR_EOF) { + return err; + } else if (err == AVERROR_EOF) { + if (!pv->in_flight) + return err; + continue; + } + + /* Encode frame: stash the per-frame metadata get_packet() needs later */ + pd = exec->opaque; + pd->color_primaries = frame->color_primaries; + pd->color_trc = frame->color_trc; + pd->colorspace = frame->colorspace; + pd->pts = frame->pts; + pd->duration = frame->duration; + pd->flags = frame->flags; + if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) { + pd->frame_opaque = frame->opaque; + pd->frame_opaque_ref = frame->opaque_ref; + frame->opaque_ref = NULL; + } + + err = vulkan_encode_prores_submit_frame(avctx, exec, frame, 0); + if (err >= 0 && ctx->pictures_per_frame > 1) + err = vulkan_encode_prores_submit_frame(avctx, exec, frame, 1); + + /* Submit execution context; err holds the first recording failure, if any */ + ff_vk_exec_submit(&pv->vkctx, exec); + av_frame_unref(frame); + if (err < 0) + return err; + + pv->in_flight++; + if (pv->in_flight < pv->async_depth) + return AVERROR(EAGAIN); + } + + return 0; +} + +static av_cold int encode_close(AVCodecContext *avctx) +{ + ProresVulkanContext *pv = avctx->priv_data; + ProresContext *ctx = &pv->ctx; + FFVulkanContext *vkctx = &pv->vkctx; + + ff_vk_exec_pool_free(vkctx, &pv->e); + ff_vk_exec_pool_free(vkctx, &pv->transfer_exec_pool); + + if (ctx->alpha_bits) + ff_vk_shader_free(vkctx, &pv->alpha_data_shd); + + ff_vk_shader_free(vkctx, &pv->slice_data_shd[0]); + ff_vk_shader_free(vkctx, &pv->slice_data_shd[1]); + ff_vk_shader_free(vkctx, &pv->estimate_slice_shd); + ff_vk_shader_free(vkctx, &pv->encode_slice_shd); + ff_vk_shader_free(vkctx, &pv->trellis_node_shd); + + ff_vk_free_buf(vkctx, &pv->prores_data_tables_buf); + + av_buffer_pool_uninit(&pv->pkt_buf_pool); + 
av_buffer_pool_uninit(&pv->slice_data_buf_pool); + av_buffer_pool_uninit(&pv->slice_score_buf_pool); + av_buffer_pool_uninit(&pv->frame_size_buf_pool); + + ff_vk_uninit(vkctx); + + return 0; +} + +static av_cold int encode_init(AVCodecContext *avctx) +{ + ProresVulkanContext *pv = avctx->priv_data; + ProresContext *ctx = &pv->ctx; + int err = 0, i, q; + FFVulkanContext *vkctx = &pv->vkctx; + + /* Init vulkan */ + RET(ff_vk_init(vkctx, avctx, NULL, avctx->hw_frames_ctx)); + + pv->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0); + if (!pv->qf) { + av_log(avctx, AV_LOG_ERROR, "Device has no compute queues!\n"); + return AVERROR(ENOTSUP); + } + + RET(ff_vk_exec_pool_init(vkctx, pv->qf, &pv->e, 1, 0, 0, 0, NULL)); + + pv->transfer_qf = ff_vk_qf_find(vkctx, VK_QUEUE_TRANSFER_BIT, 0); + if (!pv->transfer_qf) { + av_log(avctx, AV_LOG_ERROR, "Device has no transfer queues!\n"); + return err; + } + + RET(ff_vk_exec_pool_init(vkctx, pv->transfer_qf, &pv->transfer_exec_pool, + pv->async_depth, 0, 0, 0, NULL)); + + /* Init common prores structures */ + err = ff_prores_kostya_encode_init(avctx, ctx, vkctx->frames->sw_format); + if (err < 0) + return err; + + /* Temporary frame */ + pv->frame = av_frame_alloc(); + if (!pv->frame) + return AVERROR(ENOMEM); + + /* Async data pool */ + pv->async_depth = pv->e.pool_size; + pv->exec_ctx_info = av_calloc(pv->async_depth, sizeof(*pv->exec_ctx_info)); + if (!pv->exec_ctx_info) + return AVERROR(ENOMEM); + for (int i = 0; i < pv->async_depth; i++) + pv->e.contexts[i].opaque = &pv->exec_ctx_info[i]; + + /* Compile shaders used by encoder */ + init_slice_data_pipeline(pv, &pv->slice_data_shd[0], 2); + init_slice_data_pipeline(pv, &pv->slice_data_shd[1], 4); + init_estimate_slice_pipeline(pv, &pv->estimate_slice_shd); + init_trellis_node_pipeline(pv, &pv->trellis_node_shd); + init_encode_slice_pipeline(pv, &pv->encode_slice_shd); + + if (ctx->alpha_bits) + init_alpha_data_pipeline(pv, &pv->alpha_data_shd); + + /* Create prores data tables 
uniform buffer. */ + RET(ff_vk_create_buf(vkctx, &pv->prores_data_tables_buf, + sizeof(ProresDataTables), NULL, NULL, + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); + RET(ff_vk_map_buffer(vkctx, &pv->prores_data_tables_buf, (void *)&pv->tables, 0)); + for (q = 0; q < MAX_STORED_Q; ++q) { + for (i = 0; i < 64; i++) { + pv->tables->qmat[q][i] = ctx->quants[q][ctx->scantable[i]]; + pv->tables->qmat_chroma[q][i] = ctx->quants_chroma[q][ctx->scantable[i]]; + } + } + for (q = MAX_STORED_Q; q < 128; ++q) { + for (i = 0; i < 64; i++) { + pv->tables->qmat[q][i] = ctx->quant_mat[ctx->scantable[i]] * q; + pv->tables->qmat_chroma[q][i] = ctx->quant_chroma_mat[ctx->scantable[i]] * q; + } + } + +fail: + return err; +} + +#define OFFSET(x) offsetof(ProresVulkanContext, x) +#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM + +static const AVOption options[] = { + { "mbs_per_slice", "macroblocks per slice", OFFSET(ctx.mbs_per_slice), + AV_OPT_TYPE_INT, { .i64 = 8 }, 1, MAX_MBS_PER_SLICE, VE }, + { "profile", NULL, OFFSET(ctx.profile), AV_OPT_TYPE_INT, + { .i64 = PRORES_PROFILE_AUTO }, + PRORES_PROFILE_AUTO, PRORES_PROFILE_4444XQ, VE, .unit = "profile" }, + { "auto", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_AUTO }, + 0, 0, VE, .unit = "profile" }, + { "proxy", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_PROXY }, + 0, 0, VE, .unit = "profile" }, + { "lt", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_LT }, + 0, 0, VE, .unit = "profile" }, + { "standard", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_STANDARD }, + 0, 0, VE, .unit = "profile" }, + { "hq", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_HQ }, + 0, 0, VE, .unit = "profile" }, + { "4444", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444 }, + 0, 0, VE, .unit = "profile" }, + { "4444xq", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_4444XQ }, + 
0, 0, VE, .unit = "profile" },
+    { "vendor", "vendor ID", OFFSET(ctx.vendor),
+        AV_OPT_TYPE_STRING, { .str = "Lavc" }, 0, 0, VE },
+    { "bits_per_mb", "desired bits per macroblock", OFFSET(ctx.bits_per_mb),
+        AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 8192, VE },
+    { "quant_mat", "quantiser matrix", OFFSET(ctx.quant_sel), AV_OPT_TYPE_INT,
+        { .i64 = -1 }, -1, QUANT_MAT_DEFAULT, VE, .unit = "quant_mat" }, /* -1 is the "auto" constant below */
+    { "auto", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = -1 },
+        0, 0, VE, .unit = "quant_mat" },
+    { "proxy", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_PROXY },
+        0, 0, VE, .unit = "quant_mat" },
+    { "lt", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_LT },
+        0, 0, VE, .unit = "quant_mat" },
+    { "standard", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_STANDARD },
+        0, 0, VE, .unit = "quant_mat" },
+    { "hq", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_HQ },
+        0, 0, VE, .unit = "quant_mat" },
+    { "default", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = QUANT_MAT_DEFAULT },
+        0, 0, VE, .unit = "quant_mat" },
+    { "alpha_bits", "bits for alpha plane", OFFSET(ctx.alpha_bits), AV_OPT_TYPE_INT,
+        { .i64 = 16 }, 0, 16, VE },
+    { "async_depth", "Internal parallelization depth", OFFSET(async_depth), AV_OPT_TYPE_INT,
+        { .i64 = 1 }, 1, INT_MAX, VE }, /* NOTE(review): encode_init() later replaces this value with the compute pool size */
+    { NULL }
+};
+/* AVClass exposing the option table above for this encoder's private context. */
+static const AVClass proresenc_class = {
+    .class_name = "ProRes vulkan encoder",
+    .item_name = av_default_item_name,
+    .option = options,
+    .version = LIBAVUTIL_VERSION_INT,
+};
+/* Hardware configurations advertised by the encoder (NULL-terminated). */
+static const AVCodecHWConfigInternal *const prores_ks_hw_configs[] = {
+    HW_CONFIG_ENCODER_FRAMES(VULKAN, VULKAN),
+    HW_CONFIG_ENCODER_DEVICE(NONE, VULKAN),
+    NULL,
+};
+/* Codec registration entry: packets are produced through the receive_packet
+ * callback; init/close are encode_init()/encode_close(). */
+const FFCodec ff_prores_ks_vulkan_encoder = {
+    .p.name = "prores_ks_vulkan",
+    CODEC_LONG_NAME("Apple ProRes (iCodec Pro)"),
+    .p.type = AVMEDIA_TYPE_VIDEO,
+    .p.id = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresVulkanContext),
+    .init = encode_init,
+    .close = encode_close,
+    FF_CODEC_RECEIVE_PACKET_CB(&vulkan_encode_prores_receive_packet),
+    .p.capabilities =
AV_CODEC_CAP_DELAY |
+                      AV_CODEC_CAP_HARDWARE |
+                      AV_CODEC_CAP_ENCODER_FLUSH |
+                      AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
+    CODEC_PIXFMTS(AV_PIX_FMT_VULKAN),
+    .hw_configs = prores_ks_hw_configs,
+    .color_ranges = AVCOL_RANGE_MPEG,
+    .p.priv_class = &proresenc_class,
+    .p.profiles = NULL_IF_CONFIG_SMALL(ff_prores_profiles),
+    .caps_internal = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_EOF_FLUSH, /* INIT_CLEANUP: encode_close() also runs after a failed init */
+};
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index 93133a9d2d..cd21a53f88 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -18,6 +18,12 @@ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/ffv1_dec_setup.comp.spv.o \
                                        vulkan/ffv1_dec_rgb.comp.spv.o \
                                        vulkan/ffv1_dec_rgb_golomb.comp.spv.o
 
+OBJS-$(CONFIG_PRORES_KS_VULKAN_ENCODER) += vulkan/prores_ks_alpha_data.comp.spv.o \
+                                           vulkan/prores_ks_slice_data.comp.spv.o \
+                                           vulkan/prores_ks_estimate_slice.comp.spv.o \
+                                           vulkan/prores_ks_encode_slice.comp.spv.o \
+                                           vulkan/prores_ks_trellis_node.comp.spv.o
+
 OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/prores_raw_decode.comp.spv.o \
                                             vulkan/prores_raw_idct.comp.spv.o
 
diff --git a/libavcodec/vulkan/dct.glsl b/libavcodec/vulkan/dct.glsl
index 159d4873ad..76f799b3e1 100644
--- a/libavcodec/vulkan/dct.glsl
+++ b/libavcodec/vulkan/dct.glsl
@@ -118,4 +118,53 @@ void idct8(uint block, uint offset, uint stride)
     blocks[block][7*stride + offset] = u7;
 }
 
+void fdct8(uint block, uint offset, uint stride)
+{   /* In-place 8-point forward DCT-II over the eight values
+     * blocks[block][offset + k*stride], k = 0..7. Scaling is orthonormal:
+     * c_norm = 1/sqrt(8), and the sqrt(2) factor of the non-DC terms is
+     * folded into the cosine constants c_a..c_f. */
+    const float c_pi   = radians(180);
+    const float c_rt2  = sqrt(2.0);
+    const float c_norm = 1 / sqrt(8.0);
+    const float c_a    = c_rt2 * cos(    c_pi / 16);
+    const float c_b    = c_rt2 * cos(    c_pi /  8);
+    const float c_c    = c_rt2 * cos(3 * c_pi / 16);
+    const float c_d    = c_rt2 * cos(5 * c_pi / 16);
+    const float c_e    = c_rt2 * cos(3 * c_pi /  8);
+    const float c_f    = c_rt2 * cos(7 * c_pi / 16);
+
+    float u0, u1, u2, u3, u4, u5, u6, u7;
+
+    /* Input */
+    u0 = blocks[block][0*stride + offset];
+    u1 = blocks[block][1*stride + offset];
+    u2 = blocks[block][2*stride + offset];
+    u3 = blocks[block][3*stride + offset];
+    u4 = blocks[block][4*stride + offset];
+    u5 = blocks[block][5*stride + offset];
+    u6 = blocks[block][6*stride + offset];
+    u7 = blocks[block][7*stride + offset];
+
+    float X07P = u0 + u7; /* sums of mirrored samples feed the even outputs */
+    float X16P = u1 + u6;
+    float X25P = u2 + u5;
+    float X34P = u3 + u4;
+
+    float X07M = u0 - u7; /* differences feed the odd outputs; note the u6-u1 and u4-u3 sign convention */
+    float X61M = u6 - u1;
+    float X25M = u2 - u5;
+    float X43M = u4 - u3;
+
+    float X07P34PP = X07P + X34P;
+    float X07P34PM = X07P - X34P;
+    float X16P25PP = X16P + X25P;
+    float X16P25PM = X16P - X25P;
+
+    blocks[block][0*stride + offset] = c_norm * (X07P34PP + X16P25PP); /* DC: plain sum of all eight inputs */
+    blocks[block][2*stride + offset] = c_norm * (c_b * X07P34PM + c_e * X16P25PM);
+    blocks[block][4*stride + offset] = c_norm * (X07P34PP - X16P25PP);
+    blocks[block][6*stride + offset] = c_norm * (c_e * X07P34PM - c_b * X16P25PM);
+    blocks[block][1*stride + offset] = c_norm * (c_a * X07M - c_c * X61M + c_d * X25M - c_f * X43M);
+    blocks[block][3*stride + offset] = c_norm * (c_c * X07M + c_f * X61M - c_a * X25M + c_d * X43M);
+    blocks[block][5*stride + offset] = c_norm * (c_d * X07M + c_a * X61M + c_f * X25M - c_c * X43M);
+    blocks[block][7*stride + offset] = c_norm * (c_f * X07M + c_d * X61M + c_c * X25M + c_a * X43M);
+}
+
 #endif /* VULKAN_DCT_H */
diff --git a/libavcodec/vulkan/prores_ks_alpha_data.comp.glsl b/libavcodec/vulkan/prores_ks_alpha_data.comp.glsl
new file mode 100644
index 0000000000..434a15fdea
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_alpha_data.comp.glsl
@@ -0,0 +1,87 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#version 460
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) in;
+
+layout (constant_id = 0) const int alpha_bits = 0;        /* 8 selects the narrowing path in main() */
+layout (constant_id = 1) const int slices_per_row = 0;    /* row pitch of the slices[] array */
+layout (constant_id = 2) const int width_in_mb = 0;
+layout (constant_id = 3) const int max_mbs_per_slice = 0; /* size of every non-edge slice */
+
+struct SliceData {
+    uint mbs_per_slice;
+    int16_t coeffs[4][8 * 256]; /* this shader only writes index 3 */
+};
+
+layout (set = 0, binding = 0, scalar) writeonly buffer SliceBuffer {
+    SliceData slices[];
+};
+layout (set = 0, binding = 1) uniform readonly iimage2D plane;
+
+/* Table of possible edge slice configurations: entry k gives the macroblock
+ * counts of up to three edge slices covering k leftover macroblocks. */
+const uvec3 edge_mps_table[8] = uvec3[](
+    uvec3(0, 0, 0),
+    uvec3(1, 0, 0),
+    uvec3(2, 0, 0),
+    uvec3(2, 1, 0),
+    uvec3(4, 0, 0),
+    uvec3(4, 1, 0),
+    uvec3(4, 2, 0),
+    uvec3(4, 2, 1)
+);
+/* Load one alpha sample per invocation, rescale it and store it in the
+ * coeffs[3] array of the slice that contains it. */
+void main()
+{
+    ivec2 coord = min(ivec2(gl_GlobalInvocationID.xy), imageSize(plane) - ivec2(1)); /* clamp to the last row/column of the image */
+    uint16_t alpha = uint16_t(imageLoad(plane, coord).x);
+
+    if (alpha_bits == 8) /* NOTE(review): both shifts look like they assume 10-bit input samples -- confirm */
+        alpha >>= 2;
+    else
+        alpha = (alpha << 6) | (alpha >> 4);
+
+    uint mbs_per_slice = max_mbs_per_slice;
+    uint slices_width = width_in_mb / mbs_per_slice; /* number of full-size slices per row */
+    uint mb_width = slices_width * mbs_per_slice;    /* macroblocks covered by full-size slices */
+    uint slice_x = gl_WorkGroupID.x / mbs_per_slice; /* workgroup x index is used as a macroblock column */
+    uint slice_y = gl_WorkGroupID.y;
+    uvec2 slice_base = uvec2(slice_x * mbs_per_slice * 16u, slice_y * 16u);
+
+    /* Handle slice macroblock size reduction on edge slices */
+    if (gl_WorkGroupID.x >= mb_width) {
+        uint edge_mb = gl_WorkGroupID.x - mb_width;
+        uvec3 table = edge_mps_table[width_in_mb - mb_width];
+        uvec3 base = uvec3(0, table.x, table.x + table.y); /* first macroblock of each edge slice */
+        uint edge_slice = edge_mb < base.y ? 0 : (edge_mb < base.z ? 1 : 2);
+        slice_x += edge_slice;
+        slice_base.x += base[edge_slice] * 16u;
+        mbs_per_slice = table[edge_slice];
+    }
+
+    uint slice = slice_y * slices_per_row + slice_x;
+    uvec2 coeff_coord = uvec2(coord) - slice_base;
+    uint coeff = coeff_coord.y * (mbs_per_slice * 16u) + coeff_coord.x; /* row-major index inside the slice */
+    slices[slice].coeffs[3][coeff] = int16_t(alpha);
+}
diff --git a/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl b/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl
new file mode 100644
index 0000000000..7105ad8dae
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_encode_slice.comp.glsl
@@ -0,0 +1,273 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#version 460
+#pragma shader_stage(compute)
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_GOOGLE_include_directive : require
+
+#define PB_UNALIGNED
+#include "common.glsl"
+
+layout (constant_id = 0) const int max_mbs_per_slice = 8;
+layout (constant_id = 1) const int chroma_factor = 0;
+layout (constant_id = 2) const int alpha_bits = 0;
+layout (constant_id = 3) const int num_planes = 0;
+layout (constant_id = 4) const int slices_per_picture = 0;
+layout (constant_id = 5) const int max_quant = 0;
+
+struct SliceData {
+    uint32_t mbs_per_slice;
+    int16_t coeffs[4][8 * 256]; /* one array per plane; index 3 is read as alpha below */
+};
+
+struct SliceScore {
+    ivec4 bits[16]; /* per-plane bit counts, indexed by quantiser slot */
+    ivec4 score[16];
+    int total_bits[16];
+    int total_score[16];
+    int overquant;
+    int buf_start;  /* byte offset of this slice inside bytestream */
+    int quant;      /* quantiser this shader encodes with */
+};
+
+layout(push_constant, scalar) uniform EncodeSliceInfo {
+    u8buf bytestream;
+    u8vec2buf seek_table;
+};
+
+layout (set = 0, binding = 0, scalar) readonly buffer SliceBuffer {
+    SliceData slices[];
+};
+layout (set = 0, binding = 1, scalar) readonly buffer SliceScores {
+    SliceScore scores[];
+};
+layout (set = 0, binding = 2, scalar) uniform ProresDataTables {
+    int16_t qmat[128][64]; int16_t qmat_chroma[128][64];
+};
+
+#define CFACTOR_Y444 3
+/* Write val with the adaptive code described by codebook: a Rice code for
+ * small values, switching to an exp-Golomb code at switch_val. */
+void encode_vlc_codeword(inout PutBitContext pb, uint codebook, int val)
+{
+    /* number of prefix bits to switch between Rice and expGolomb */
+    uint switch_bits = (codebook & 3) + 1;
+    uint rice_order  =  codebook >> 5;       /* rice code order         */
+    uint exp_order   = (codebook >> 2) & 7;  /* exp golomb code order   */
+
+    uint switch_val  = switch_bits << rice_order;
+
+    if (val >= switch_val) {
+        val -= int(switch_val - (1 << exp_order));
+        int exponent = findMSB(val);
+
+        put_bits(pb, exponent - exp_order + switch_bits, 0); /* zero prefix */
+        put_bits(pb, exponent + 1, val);
+    } else {
+        int exponent = val >> rice_order;
+        if (exponent != 0)
+            put_bits(pb, exponent, 0); /* unary quotient */
+        put_bits(pb, 1, 1);            /* terminator */
+        if (rice_order != 0)
+            put_bits(pb, rice_order, zero_extend(val, rice_order)); /* remainder */
+    }
+}
+
+#define GET_SIGN(x)  ((x) >> 31)                 /* 0 for x >= 0, -1 for x < 0 */
+#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x))   /* maps 0, -1, 1, -2, ... to 0, 1, 2, 3, ... */
+
+#define FIRST_DC_CB 0xB8 /* rice_order = 5, exp_golomb_order = 6, low two bits = 0 (so switch_bits = 1) */
+/* Emit the DC coefficients of the current plane: the first one directly,
+ * the others as deltas to the previous DC. */
+void encode_dcs(inout PutBitContext pb, bool is_chroma, int q)
+{
+    const uint8_t dc_codebook[7] = { U8(0x04), U8(0x28), U8(0x28), U8(0x4D), U8(0x4D), U8(0x70), U8(0x70) };
+
+    uint slice = gl_GlobalInvocationID.x;
+    uint plane = gl_GlobalInvocationID.y;
+    uint blocks_per_mb = is_chroma && chroma_factor != CFACTOR_Y444 ? 2 : 4;
+    uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
+    int codebook = 5;
+    int scale = is_chroma ? qmat_chroma[q][0] : qmat[q][0];
+    int coeff = slices[slice].coeffs[plane][0];
+    int prev_dc = (coeff - 0x4000) / scale;
+    encode_vlc_codeword(pb, FIRST_DC_CB, MAKE_CODE(prev_dc));
+    int sign = 0;
+    for (int i = 1; i < blocks_per_slice; i++) {
+        coeff = slices[slice].coeffs[plane][i];
+        int dc = (coeff - 0x4000) / scale;
+        int delta = dc - prev_dc;
+        int new_sign = GET_SIGN(delta);
+        delta = (delta ^ sign) - sign; /* negate when the previous delta was negative */
+        int code = MAKE_CODE(delta);
+        encode_vlc_codeword(pb, dc_codebook[codebook], code);
+        codebook = min(code, 6);       /* next codebook depends on this code's magnitude */
+        sign = new_sign;
+        prev_dc = dc;
+    }
+}
+/* Emit the AC coefficients of the current plane as (run, level, sign)
+ * triples, walking positions 1..63 with all blocks of the slice interleaved. */
+void encode_acs(inout PutBitContext pb, bool is_chroma, int q)
+{
+    const uint8_t run_to_cb[16] = { U8(0x06), U8(0x06), U8(0x05), U8(0x05), U8(0x04), U8(0x29),
+                                    U8(0x29), U8(0x29), U8(0x29), U8(0x28), U8(0x28), U8(0x28),
+                                    U8(0x28), U8(0x28), U8(0x28), U8(0x4C) };
+
+    const uint8_t level_to_cb[10] = { U8(0x04), U8(0x0A), U8(0x05), U8(0x06), U8(0x04), U8(0x28),
+                                      U8(0x28), U8(0x28), U8(0x28), U8(0x4C) };
+
+    uint slice = gl_GlobalInvocationID.x;
+    uint plane = gl_GlobalInvocationID.y;
+    uint blocks_per_mb = is_chroma && chroma_factor != CFACTOR_Y444 ? 2 : 4;
+    uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
+    int prev_run = 4;
+    int prev_level = 2;
+    int run = 0;
+
+    for (uint i = 1; i < 64; i++) {
+        int quant = is_chroma ? qmat_chroma[q][i] : qmat[q][i];
+        for (uint j = 0; j < blocks_per_slice; j++) {
+            uint idx = i * blocks_per_slice + j; /* coefficient-major layout: position i of block j */
+            int coeff = slices[slice].coeffs[plane][idx];
+            int level = coeff / quant;
+            if (level != 0) {
+                int abs_level = abs(level);
+                encode_vlc_codeword(pb, run_to_cb[prev_run], run);
+                encode_vlc_codeword(pb, level_to_cb[prev_level], abs_level - 1);
+                put_bits(pb, 1, zero_extend(GET_SIGN(level), 1)); /* sign bit: 1 for negative */
+                prev_run = min(run, 15);
+                prev_level = min(abs_level, 9);
+                run = 0;
+            } else {
+                run++;
+            }
+        }
+    }
+}
+/* Encode one luma/chroma plane: DCs first, then ACs. */
+void encode_slice_plane(inout PutBitContext pb, int q)
+{
+    uint plane = gl_GlobalInvocationID.y;
+    bool is_chroma = plane == 1 || plane == 2;
+    encode_dcs(pb, is_chroma, q);
+    encode_acs(pb, is_chroma, q);
+}
+/* Write the difference between two alpha samples: a short sign/magnitude
+ * form for small non-zero differences, otherwise the full alpha_bits value. */
+void put_alpha_diff(inout PutBitContext pb, int cur, int prev)
+{
+    const int dbits = (alpha_bits == 8) ? 4 : 7;
+    const int dsize = 1 << dbits - 1; /* parses as 1 << (dbits - 1) */
+    int diff = cur - prev;
+
+    diff = zero_extend(diff, alpha_bits);
+    if (diff >= (1 << alpha_bits) - dsize)
+        diff -= 1 << alpha_bits;
+    if (diff < -dsize || diff > dsize || diff == 0) {
+        put_bits(pb, 1, 1);
+        put_bits(pb, alpha_bits, diff);
+    } else {
+        put_bits(pb, 1, 0);
+        put_bits(pb, dbits - 1, abs(diff) - 1);
+        put_bits(pb, 1, int(diff < 0));
+    }
+}
+/* Write a run length of repeated alpha samples: 1 bit for an empty run,
+ * otherwise a 0 flag followed by 4 bits (run < 16) or 15 bits. */
+void put_alpha_run(inout PutBitContext pb, int run)
+{
+    if (run != 0) {
+        put_bits(pb, 1, 0);
+        if (run < 0x10)
+            put_bits(pb, 4, run);
+        else
+            put_bits(pb, 15, run);
+    } else {
+        put_bits(pb, 1, 1);
+    }
+}
+/* Run-length encode the alpha samples (coeffs[3]) of the current slice. */
+void encode_alpha_plane(inout PutBitContext pb)
+{
+    uint slice = gl_GlobalInvocationID.x;
+    const int mask = (1 << alpha_bits) - 1;
+    const int num_coeffs = int(slices[slice].mbs_per_slice) * 256;
+    int prev = mask, cur; /* the first sample is coded as a difference from all-ones */
+    int idx = 0;
+    int run = 0;
+
+    cur = slices[slice].coeffs[3][idx++];
+    put_alpha_diff(pb, cur, prev);
+    prev = cur;
+    do {
+        cur = slices[slice].coeffs[3][idx++];
+        if (cur != prev) {
+            put_alpha_run(pb, run);
+            put_alpha_diff(pb, cur, prev);
+            prev = cur;
+            run = 0;
+        } else {
+            run++;
+        }
+    } while (idx < num_coeffs);
+    put_alpha_run(pb, run);
+}
+/* Return the low 16 bits of value with the two bytes swapped. */
+u8vec2 byteswap16(int value)
+{
+    return unpack8(uint16_t(value)).yx;
+}
+/* One invocation per (slice, plane): plane 0 also writes the slice header
+ * and the seek table entry, then every plane encodes its data at its own
+ * byte offset inside the slice. */
+void main()
+{
+    uint slice = gl_GlobalInvocationID.x;
+    if (slice >= slices_per_picture)
+        return;
+
+    uint plane = gl_GlobalInvocationID.y;
+    int q = scores[slice].quant;
+    int q_idx = min(q, max_quant + 1); /* quantisers above max_quant share slot max_quant + 1 */
+    ivec4 bits = scores[slice].bits[q_idx];
+    int slice_hdr_size = 2 * num_planes;
+    int slice_size = slice_hdr_size + ((bits.x + bits.y + bits.z + bits.w) / 8); /* relies on each per-plane bit count being a multiple of 8 */
+    int buf_start = scores[slice].buf_start;
+    u8buf buf = OFFBUF(u8buf, bytestream, buf_start);
+
+    /* Write slice header */
+    if (plane == 0) {
+        buf[0].v = uint8_t(slice_hdr_size * 8); /* header size in bits */
+        buf[1].v = uint8_t(q);
+        u8vec2buf slice_hdr = OFFBUF(u8vec2buf, buf, 2);
+        for (int i = 0; i < num_planes - 1; i++) {
+            slice_hdr[i].v = byteswap16(bits[i] / 8); /* byte size of every plane but the last */
+        }
+        seek_table[slice].v = byteswap16(slice_size);
+    }
+
+    int plane_offset = 0; /* bytes taken by the planes preceding this one */
+    for (int i = 0; i < plane; ++i)
+        plane_offset += bits[i] / 8;
+
+    /* Encode slice plane */
+    PutBitContext pb;
+    init_put_bits(pb, OFFBUF(u8buf, buf, slice_hdr_size + plane_offset), 0);
+    if (plane == 3)
+        encode_alpha_plane(pb);
+    else
+        encode_slice_plane(pb, q);
+    flush_put_bits(pb);
+}
diff --git a/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl b/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl
new file mode 100644
index 0000000000..fdaa1c810a
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_estimate_slice.comp.glsl
@@ -0,0 +1,302 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#version 460
+#pragma shader_stage(compute)
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_KHR_shader_subgroup_clustered : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+#extension GL_GOOGLE_include_directive : require
+
+#include "common.glsl"
+
+layout (constant_id = 0) const int max_mbs_per_slice = 8;
+layout (constant_id = 1) const int chroma_factor = 0;
+layout (constant_id = 2) const int alpha_bits = 0;
+layout (constant_id = 3) const int num_planes = 0;
+layout (constant_id = 4) const int slices_per_picture = 0;
+layout (constant_id = 5) const int min_quant = 0;
+layout (constant_id = 6) const int max_quant = 0;
+layout (constant_id = 7) const int bits_per_mb = 0; /* per-macroblock bit budget used by the overquant search */
+
+struct SliceData {
+    uint32_t mbs_per_slice;
+    int16_t coeffs[4][8 * 256];
+};
+
+struct SliceScore {
+    ivec4 bits[16];      /* per-plane bit counts, indexed by quantiser slot */
+    ivec4 score[16];     /* per-plane quantisation error */
+    int total_bits[16];
+    int total_score[16];
+    int overquant;
+    int buf_start;
+    int quant;
+};
+
+layout (set = 0, binding = 0, scalar) readonly buffer SliceBuffer {
+    SliceData slices[];
+};
+layout (set = 0, binding = 1, scalar) writeonly buffer SliceScores {
+    SliceScore scores[];
+};
+layout (set = 0, binding = 2, scalar) uniform ProresDataTables {
+    int16_t qmat[128][64];
+    int16_t qmat_chroma[128][64];
+};
+
+#define CFACTOR_Y444 3
+
+#define GET_SIGN(x)  ((x) >> 31)                 /* 0 for x >= 0, -1 for x < 0 */
+#define MAKE_CODE(x) (((x) * 2) ^ GET_SIGN(x))   /* maps 0, -1, 1, -2, ... to 0, 1, 2, 3, ... */
+/* Return the number of bits the adaptive Rice/exp-Golomb code described by
+ * codebook needs for val, without writing anything. */
+int estimate_vlc(uint codebook, int val)
+{
+    /* number of prefix bits to switch between Rice and expGolomb */
+    uint switch_bits = (codebook & 3) + 1;
+    uint rice_order  =  codebook >> 5;       /* rice code order         */
+    uint exp_order   = (codebook >> 2) & 7;  /* exp golomb code order   */
+
+    uint switch_val  = switch_bits << rice_order;
+
+    if (val >= switch_val) {
+        val -= int(switch_val - (1 << exp_order));
+        int exponent = findMSB(val);
+        return int(exponent * 2 - exp_order + switch_bits + 1); /* zero prefix plus exponent + 1 value bits */
+    } else {
+        return int((val >> rice_order) + rice_order + 1); /* unary quotient, terminator, remainder */
+    }
+}
+
+#define FIRST_DC_CB 0xB8 /* rice_order = 5, exp_golomb_order = 6, low two bits = 0 (so switch_bits = 1) */
+/* Estimate the bits needed for the DC coefficients of one plane at quantiser
+ * q; the quantisation remainder of every DC but the first is added to error. */
+int estimate_dcs(inout int error, uint slice, uint plane, uint q)
+{
+    const uint8_t dc_codebook[7] = { U8(0x04), U8(0x28), U8(0x28), U8(0x4D), U8(0x4D), U8(0x70), U8(0x70) };
+
+    uint blocks_per_mb = plane != 0 && chroma_factor != CFACTOR_Y444 ? 2 : 4;
+    uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
+    int codebook = 5;
+    int coeff = slices[slice].coeffs[plane][0];
+    int scale = plane != 0 ? qmat_chroma[q][0] : qmat[q][0];
+    int prev_dc = (coeff - 0x4000) / scale;
+    int bits = estimate_vlc(FIRST_DC_CB, MAKE_CODE(prev_dc));
+    int sign = 0;
+
+    for (int i = 1; i < blocks_per_slice; ++i) {
+        coeff = slices[slice].coeffs[plane][i];
+        int dc = (coeff - 0x4000) / scale;
+        error += abs(coeff - 0x4000) % scale;
+        int delta = dc - prev_dc;
+        int new_sign = GET_SIGN(delta);
+        delta = (delta ^ sign) - sign; /* negate when the previous delta was negative */
+        int code = MAKE_CODE(delta);
+        bits += estimate_vlc(dc_codebook[codebook], code);
+        codebook = min(code, 6);
+        sign = new_sign;
+        prev_dc = dc;
+    }
+
+    return bits;
+}
+
+#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1))
+#define SCORE_LIMIT   1073741823
+/* Estimate the bits needed for the AC coefficients of one plane at quantiser
+ * q, adding every coefficient's quantisation remainder to error. */
+int estimate_acs(inout int error, uint slice, uint plane, uint q)
+{
+    const uint8_t run_to_cb[16] = { U8(0x06), U8(0x06), U8(0x05), U8(0x05), U8(0x04), U8(0x29),
+                                    U8(0x29), U8(0x29), U8(0x29), U8(0x28), U8(0x28), U8(0x28),
+                                    U8(0x28), U8(0x28), U8(0x28), U8(0x4C) };
+
+    const uint8_t level_to_cb[10] = { U8(0x04), U8(0x0A), U8(0x05), U8(0x06), U8(0x04), U8(0x28),
+                                      U8(0x28), U8(0x28), U8(0x28), U8(0x4C) };
+
+    uint blocks_per_mb = plane != 0 && chroma_factor != CFACTOR_Y444 ? 2 : 4;
+    uint blocks_per_slice = slices[slice].mbs_per_slice * blocks_per_mb;
+    uint max_coeffs = blocks_per_slice << 6; /* not used further in this function */
+    int prev_run = 4;
+    int prev_level = 2;
+    int bits = 0;
+    int run = 0;
+
+    for (uint i = 1; i < 64; i++) {
+        int quant = plane != 0 ? qmat_chroma[q][i] : qmat[q][i];
+        for (uint j = 0; j < blocks_per_slice; j++) {
+            uint idx = i * blocks_per_slice + j; /* coefficient-major layout: position i of block j */
+            int coeff = slices[slice].coeffs[plane][idx];
+            int level = coeff / quant;
+            error += abs(coeff) % quant;
+            if (level != 0) {
+                int abs_level = abs(level);
+                bits += estimate_vlc(run_to_cb[prev_run], run);
+                bits += estimate_vlc(level_to_cb[prev_level], abs_level - 1) + 1; /* + 1 for the sign bit */
+                prev_run = min(run, 15);
+                prev_level = min(abs_level, 9);
+                run = 0;
+            } else {
+                run++;
+            }
+        }
+    }
+
+    return bits;
+}
+/* Estimated size in bits of one luma/chroma plane, rounded up to a whole
+ * number of bytes. */
+int estimate_slice_plane(inout int error, uint slice, uint plane, uint q)
+{
+    int bits = 0;
+    bits += estimate_dcs(error, slice, plane, q);
+    bits += estimate_acs(error, slice, plane, q);
+    return FFALIGN(bits, 8);
+}
+/* Bits needed to code the difference between two alpha samples. */
+int est_alpha_diff(int cur, int prev)
+{
+    const int dbits = (alpha_bits == 8) ? 4 : 7;
+    const int dsize = 1 << dbits - 1; /* parses as 1 << (dbits - 1) */
+    int diff = cur - prev;
+
+    diff = zero_extend(diff, alpha_bits);
+    if (diff >= (1 << alpha_bits) - dsize)
+        diff -= 1 << alpha_bits;
+    if (diff < -dsize || diff > dsize || diff == 0)
+        return alpha_bits + 1;
+    else
+        return dbits + 1;
+}
+/* Estimated size in bits of the run-length coded alpha plane of a slice.
+ * Unlike estimate_slice_plane() the result is not rounded up to bytes. */
+int estimate_alpha_plane(uint slice)
+{
+    const int mask = (1 << alpha_bits) - 1;
+    const int num_coeffs = int(slices[slice].mbs_per_slice) * 256;
+    int prev = mask, cur;
+    int idx = 0;
+    int run = 0;
+    int bits;
+
+    cur = slices[slice].coeffs[3][idx++];
+    bits = est_alpha_diff(cur, prev);
+    prev = cur;
+    do {
+        cur = slices[slice].coeffs[3][idx++];
+        if (cur != prev) {
+            if (run == 0) /* run cost: 1 bit if empty, 1 + 4 if < 16, else 1 + 15 */
+                bits++;
+            else if (run < 0x10)
+                bits += 5;
+            else
+                bits += 16;
+            bits += est_alpha_diff(cur, prev);
+            prev = cur;
+            run = 0;
+        } else {
+            run++;
+        }
+    } while (idx < num_coeffs);
+
+    if (run != 0) {
+        if (run < 0x10)
+            bits += 5;
+        else
+            bits += 16;
+    } else {
+        bits++;
+    }
+
+    return bits;
+}
+/* Sum value over the invocations that handle the planes of one slice:
+ * groups of three consecutive subgroup lanes, or clusters of four. */
+int sum_of_planes(int value)
+{
+    if (num_planes == 3) {
+        uint base = (gl_SubgroupInvocationID / 3) * 3;
+        return subgroupShuffle(value, base) + subgroupShuffle(value, base + 1) + subgroupShuffle(value, base + 2);
+    } else
+        return subgroupClusteredAdd(value, 4);
+}
+/* One invocation per (slice, plane, quantiser): estimate bits and error for
+ * that combination and store them in the score buffer. */
+void main()
+{
+    uint slice = gl_GlobalInvocationID.x / num_planes;
+    uint plane = gl_LocalInvocationID.x % num_planes;
+    uint q = min_quant + gl_GlobalInvocationID.y;
+    if (slice >= slices_per_picture)
+        return;
+
+    /* Estimate slice bits and error for specified quantizer and plane */
+    int error = 0;
+    int bits = 0;
+    if (plane == 3)
+        bits = estimate_alpha_plane(slice);
+    else
+        bits = estimate_slice_plane(error, slice, plane, q);
+
+    /* Write results to score buffer */
+    scores[slice].bits[q][plane] = FFALIGN(bits, 8);
+    scores[slice].score[q][plane] = error;
+
+    /* Accumulate total bits and error of all planes */
+    int total_bits = sum_of_planes(bits);
+    int total_score = sum_of_planes(error);
+    if (total_bits > 65000 * 8)
+
total_score = SCORE_LIMIT;
+    scores[slice].total_bits[q] = total_bits;
+    scores[slice].total_score[q] = total_score;
+
+    if (q != max_quant)
+        return;
+
+    /* Task threads that computed max_quant to also compute overquant if necessary.
+     * The result goes into the extra slot max_quant + 1. */
+    uint mbs_per_slice = slices[slice].mbs_per_slice;
+    if (total_bits <= bits_per_mb * mbs_per_slice) {
+        /* Overquant isn't needed for this slice */
+        scores[slice].total_bits[max_quant + 1] = total_bits;
+        scores[slice].total_score[max_quant + 1] = total_score + 1;
+        scores[slice].bits[max_quant + 1][plane] = FFALIGN(bits, 8);
+        scores[slice].score[max_quant + 1][plane] = error;
+        scores[slice].overquant = int(max_quant);
+    } else {
+        /* Keep searching until an encoding fits our budget */
+        for (q = max_quant + 1; q < 128; ++q) {
+            /* Estimate slice bits and error for specified quantizer and plane */
+            error = 0;
+            bits = 0;
+            if (plane == 3)
+                bits = estimate_alpha_plane(slice);
+            else
+                bits = estimate_slice_plane(error, slice, plane, q);
+
+            /* Accumulate total bits and error of all planes */
+            total_bits = sum_of_planes(bits);
+            total_score = sum_of_planes(error);
+
+            /* If estimated bits fit within budget, we are done */
+            if (total_bits <= bits_per_mb * mbs_per_slice)
+                break;
+        }
+
+        /* If nothing fit, the loop leaves q at 128, one past the last row of
+         * qmat[128][64]; the values computed last belong to q = 127, so
+         * report that quantiser instead. */
+        q = min(q, 127u);
+
+        /* Store the per-plane size byte-aligned, exactly like the other
+         * branch does: the alpha estimate is not aligned by itself and the
+         * encode shader derives byte sizes with bits / 8. */
+        scores[slice].bits[max_quant + 1][plane] = FFALIGN(bits, 8);
+        scores[slice].score[max_quant + 1][plane] = error;
+        scores[slice].total_bits[max_quant + 1] = total_bits;
+        scores[slice].total_score[max_quant + 1] = total_score;
+        scores[slice].overquant = int(q);
+    }
+}
diff --git a/libavcodec/vulkan/prores_ks_slice_data.comp.glsl b/libavcodec/vulkan/prores_ks_slice_data.comp.glsl
new file mode 100644
index 0000000000..be87b7e03a
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_slice_data.comp.glsl
@@ -0,0 +1,179 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#version 460
+#pragma shader_stage(compute)
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_shared_memory_block : require
+#extension GL_GOOGLE_include_directive : require
+
+#include "common.glsl"
+#include "dct.glsl"
+
+layout (constant_id = 0) const int max_mbs_per_slice = 8;
+layout (constant_id = 1) const int blocks_per_mb = 0;
+layout (constant_id = 2) const int width_in_mb = 0;
+layout (constant_id = 3) const int pictures_per_frame = 0;
+
+layout(push_constant, scalar) uniform SliceDataInfo {
+    int plane;           /* Index into planes[] and into SliceData.rows[] */
+    int line_add;        /* Added to the scaled line number (see main()) */
+    int bits_per_sample; /* Samples are scaled by 1 / 2^(bits_per_sample - 1) */
+};
+
+struct SliceData {
+    uint32_t mbs_per_slice;
+    /* Indexed as rows[plane][slice_row][0 = coefficients 0..3, 1 = 4..7] */
+    i16vec4 rows[4][8 * 32][2];
+};
+
+layout (set = 0, binding = 0, scalar) writeonly buffer SliceBuffer {
+    SliceData slices[];
+};
+layout (set = 0, binding = 1) uniform readonly iimage2D planes[3];
+
+/*
+ * Table of possible edge slice configurations.
+ * Indexed by the number of macroblocks left over after the full-width
+ * slices; component n is the macroblock count of the n-th edge slice.
+ */
+const uvec3 edge_mps_table[8] = uvec3[](
+    uvec3(0, 0, 0),
+    uvec3(1, 0, 0),
+    uvec3(2, 0, 0),
+    uvec3(2, 1, 0),
+    uvec3(4, 0, 0),
+    uvec3(4, 1, 0),
+    uvec3(4, 2, 0),
+    uvec3(4, 2, 1)
+);
+
+/* Scan position -> (x, y) inside an 8x8 block, used when pictures_per_frame == 1 */
+const u8vec2 progressive_scan[64] = {
+    u8vec2(0, 0), u8vec2(1, 0), u8vec2(0, 1), u8vec2(1, 1),
+    u8vec2(2, 0), u8vec2(3, 0), u8vec2(2, 1), u8vec2(3, 1),
+    u8vec2(0, 2), u8vec2(1, 2), u8vec2(0, 3), u8vec2(1, 3),
+    u8vec2(2, 2), u8vec2(3, 2), u8vec2(2, 3), u8vec2(3, 3),
+    u8vec2(4, 0), u8vec2(5, 0), u8vec2(4, 1), u8vec2(4, 2),
+    u8vec2(5, 1), u8vec2(6, 0), u8vec2(7, 0), u8vec2(6, 1),
+    u8vec2(5, 2), u8vec2(4, 3), u8vec2(5, 3), u8vec2(6, 2),
+    u8vec2(7, 1), u8vec2(7, 2), u8vec2(6, 3), u8vec2(7, 3),
+    u8vec2(0, 4), u8vec2(1, 4), u8vec2(0, 5), u8vec2(0, 6),
+    u8vec2(1, 5), u8vec2(2, 4), u8vec2(3, 4), u8vec2(2, 5),
+    u8vec2(1, 6), u8vec2(0, 7), u8vec2(1, 7), u8vec2(2, 6),
+    u8vec2(3, 5), u8vec2(4, 4), u8vec2(5, 4), u8vec2(4, 5),
+    u8vec2(3, 6), u8vec2(2, 7), u8vec2(3, 7), u8vec2(4, 6),
+    u8vec2(5, 5), u8vec2(6, 4), u8vec2(7, 4), u8vec2(6, 5),
+    u8vec2(5, 6), u8vec2(4, 7), u8vec2(5, 7), u8vec2(6, 6),
+    u8vec2(7, 5), u8vec2(7, 6), u8vec2(6, 7), u8vec2(7, 7),
+};
+
+/* Scan position -> (x, y) inside an 8x8 block, used when pictures_per_frame != 1 */
+const u8vec2 interlaced_scan[64] = {
+    u8vec2(0, 0), u8vec2(0, 1), u8vec2(1, 0), u8vec2(1, 1),
+    u8vec2(0, 2), u8vec2(0, 3), u8vec2(1, 2), u8vec2(1, 3),
+    u8vec2(2, 0), u8vec2(2, 1), u8vec2(3, 0), u8vec2(3, 1),
+    u8vec2(2, 2), u8vec2(2, 3), u8vec2(3, 2), u8vec2(3, 3),
+    u8vec2(0, 4), u8vec2(0, 5), u8vec2(1, 4), u8vec2(2, 4),
+    u8vec2(1, 5), u8vec2(0, 6), u8vec2(0, 7), u8vec2(1, 6),
+    u8vec2(2, 5), u8vec2(3, 4), u8vec2(3, 5), u8vec2(2, 6),
+    u8vec2(1, 7), u8vec2(2, 7), u8vec2(3, 6), u8vec2(3, 7),
+    u8vec2(4, 0), u8vec2(4, 1), u8vec2(5, 0), u8vec2(6, 0),
+    u8vec2(5, 1), u8vec2(4, 2), u8vec2(4, 3), u8vec2(5, 2),
+    u8vec2(6, 1), u8vec2(7, 0), u8vec2(7, 1), u8vec2(6, 2),
+    u8vec2(5, 3), u8vec2(4, 4), u8vec2(4, 5), u8vec2(5, 4),
+    u8vec2(6, 3), u8vec2(7, 2), u8vec2(7, 3), u8vec2(6, 4),
+    u8vec2(5, 5), u8vec2(4, 6), u8vec2(4, 7), u8vec2(5, 6),
+    u8vec2(6, 5), u8vec2(7, 4), u8vec2(7, 5), u8vec2(6, 6),
+    u8vec2(5, 7), u8vec2(6, 7), u8vec2(7, 6), u8vec2(7, 7),
+};
+
+#define DCTSIZE 8
+
+/**
+ * Fetch one transformed coefficient for the interleaved output layout.
+ *
+ * The linear output position (slice_row * DCTSIZE + idx) is split into a
+ * scan position (quotient by blocks_per_slice) and a block index (remainder),
+ * so coefficients that share a scan position are adjacent across the blocks
+ * of the slice. The scan table maps the scan position to (x, y) in the block.
+ *
+ * @param blocks_per_slice number of 8x8 blocks in this slice, must be > 0
+ * @param slice_row        output row, must be < blocks_per_slice * DCTSIZE
+ *                         so that the scan position stays below 64
+ * @param idx              coefficient inside the output row, 0..7
+ * @return the coefficient scaled by 2^11 and converted to int16_t
+ */
+int16_t get_swizzled_coeff(uint blocks_per_slice, uint slice_row, uint idx)
+{
+    uint coeff = slice_row * DCTSIZE + idx;
+    u8vec2 coord = pictures_per_frame == 1 ? progressive_scan[coeff / blocks_per_slice]
+                                           : interlaced_scan[coeff / blocks_per_slice];
+    uint block = coeff % blocks_per_slice;
+    /* Block rows are stored with a pitch of 9 floats */
+    float v = blocks[block][coord.y * 9 + coord.x];
+    return int16_t(v * float(1 << 11));
+}
+
+/**
+ * One workgroup processes one slice of the plane selected by the push
+ * constant. Invocation mapping: x = row inside an 8x8 block, y = block inside
+ * the macroblock, z = macroblock inside the slice.
+ *
+ * Each invocation loads one 8-sample row, takes part in the two fdct8()
+ * passes over its block in the `blocks` array (declared outside this file,
+ * presumably in dct.glsl), and finally writes one row of eight reordered
+ * coefficients to the slice buffer.
+ */
+void main()
+{
+    uint row = gl_LocalInvocationID.x;
+    uint block = gl_LocalInvocationID.y;
+    uint macroblock = gl_LocalInvocationID.z;
+    uint slice_x = gl_WorkGroupID.x;
+    uint slice_block = macroblock * blocks_per_mb + block;
+    uint slice = gl_WorkGroupID.y * gl_NumWorkGroups.x + slice_x;
+
+    /* Calculate the current thread coordinate in input plane */
+    uint mbs_per_slice = max_mbs_per_slice;
+    uint mb_width = 4u * blocks_per_mb;
+    uint slices_width = width_in_mb / max_mbs_per_slice;
+    uvec2 slice_base = gl_WorkGroupID.xy * uvec2(max_mbs_per_slice * mb_width, DCTSIZE * 2u);
+
+    /* Handle slice macroblock size reduction on edge slices */
+    if (slice_x >= slices_width) {
+        uint edge_slice = slice_x - slices_width;
+        uvec3 table = edge_mps_table[width_in_mb - slices_width * max_mbs_per_slice];
+        /* Macroblock offset of each edge slice = sum of the preceding edge slices */
+        uvec3 base = uvec3(0u, table.x, table.x + table.y);
+        slice_base.x = (max_mbs_per_slice * slices_width + base[edge_slice]) * mb_width;
+        mbs_per_slice = table[edge_slice];
+    }
+
+    uvec2 mb_base = slice_base + uvec2(macroblock * mb_width, 0u);
+    uvec2 block_coord = plane != 0 ? uvec2(block >> 1u, block & 1u) : uvec2(block & 1u, block >> 1u);
+    ivec2 coord = ivec2(mb_base + block_coord * DCTSIZE + uvec2(0u, row));
+    coord.y = coord.y * pictures_per_frame + line_add;
+
+    /*
+     * Load the samples and scale them by 1 / 2^(bits_per_sample - 1) for
+     * increased precision during the DCT.
+     * Every texel coordinate is clamped individually: clamping only the base
+     * coordinate would still read up to 7 texels past the right edge of the
+     * plane for blocks that overlap or lie beyond it. Clamping per texel
+     * replicates the last valid sample instead.
+     */
+    ivec2 max_coord = imageSize(planes[plane]) - ivec2(1);
+    [[unroll]] for (int i = 0; i < 8; i++) {
+        int c = imageLoad(planes[plane], min(coord + ivec2(i, 0), max_coord)).x;
+        blocks[slice_block][row * 9 + i] = float(c) / (1 << (bits_per_sample - 1));
+    }
+
+    /*
+     * The first pass below uses stride 9, i.e. it walks samples stored by the
+     * other row invocations of this block, so those stores must be visible
+     * first. NOTE(review): assumes fdct8(block, offset, stride) addresses
+     * blocks[block][offset + k * stride]; dct.glsl is not visible here.
+     */
+    barrier();
+
+    /* First 1-D DCT pass: elements row, row + 9, ... (pitch-9 stride) */
+    fdct8(slice_block, row, 9);
+    barrier();
+
+    /* Second 1-D DCT pass: the 8 consecutive elements starting at row * 9 */
+    fdct8(slice_block, row*9, 1);
+    barrier();
+
+    uint slice_row = slice_block * DCTSIZE + row;
+    uint blocks_per_slice = mbs_per_slice * blocks_per_mb;
+
+    slices[slice].mbs_per_slice = mbs_per_slice;
+
+    /*
+     * A slice holds blocks_per_slice * 64 coefficients, i.e.
+     * blocks_per_slice * DCTSIZE output rows. In reduced edge slices the
+     * invocations beyond that have nothing to store; letting them continue
+     * would index the 64-entry scan tables out of range (and divide by zero
+     * for an empty slice). All barriers are behind us, so skipping is safe.
+     */
+    if (slice_row < blocks_per_slice * DCTSIZE) {
+        /**
+         * Reorder coefficients by scan position, interleaved across the blocks
+         * of the slice, before storing to the output buffer.
+         * This allows for more cache friendly and coalesced coefficient loads.
+         */
+        i16vec4 dst_low;
+        dst_low.x = get_swizzled_coeff(blocks_per_slice, slice_row, 0);
+        dst_low.y = get_swizzled_coeff(blocks_per_slice, slice_row, 1);
+        dst_low.z = get_swizzled_coeff(blocks_per_slice, slice_row, 2);
+        dst_low.w = get_swizzled_coeff(blocks_per_slice, slice_row, 3);
+
+        i16vec4 dst_hi;
+        dst_hi.x = get_swizzled_coeff(blocks_per_slice, slice_row, 4);
+        dst_hi.y = get_swizzled_coeff(blocks_per_slice, slice_row, 5);
+        dst_hi.z = get_swizzled_coeff(blocks_per_slice, slice_row, 6);
+        dst_hi.w = get_swizzled_coeff(blocks_per_slice, slice_row, 7);
+
+        /* Store DCT result to slice buffer */
+        slices[slice].rows[plane][slice_row][0] = dst_low;
+        slices[slice].rows[plane][slice_row][1] = dst_hi;
+    }
+}
diff --git a/libavcodec/vulkan/prores_ks_trellis_node.comp.glsl b/libavcodec/vulkan/prores_ks_trellis_node.comp.glsl
new file mode 100644
index 0000000000..a5e5203e1e
--- /dev/null
+++ b/libavcodec/vulkan/prores_ks_trellis_node.comp.glsl
@@ -0,0 +1,194 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#version 460
+#pragma shader_stage(compute)
+
+#extension GL_EXT_control_flow_attributes : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) in;
+
+layout (constant_id = 0) const int slices_per_row = 1;
+layout (constant_id = 1) const int num_subgroups = 1;
+layout (constant_id = 2) const int num_planes = 0;
+layout (constant_id = 3) const int force_quant = 0; /* 0 selects the trellis search in main() */
+layout (constant_id = 4) const int min_quant = 0;
+layout (constant_id = 5) const int max_quant = 0;
+layout (constant_id = 6) const int mbs_per_slice = 0;
+layout (constant_id = 7) const int bits_per_mb = 0;
+/*
+ * Per-slice estimates. The arrays are indexed by quantizer; entry
+ * max_quant + 1 is the one paired with `overquant` (see find_slice_quant()).
+ * This shader reads total_bits, total_score, bits and overquant, and writes
+ * buf_start and quant.
+ */
+struct SliceScore {
+    ivec4 bits[16];      /* One component per plane; summed and divided by 8 for the slice size */
+    ivec4 score[16];
+    int total_bits[16];
+    int total_score[16];
+    int overquant;       /* Quantizer reported for the max_quant + 1 entry */
+    int buf_start;       /* Start offset of the slice in the output, written by main() */
+    int quant;           /* Quantizer chosen for the slice */
+};
+
+layout (set = 0, binding = 0, scalar) writeonly buffer FrameSize {
+    int frame_size;
+};
+layout (set = 0, binding = 1, scalar) buffer SliceScores {
+    SliceScore scores[];
+};
+
+#define TRELLIS_WIDTH 16 /* Node slots per trellis column */
+#define SCORE_LIMIT 1073741823 /* 2^30 - 1, used as the saturated "does not fit" score */
+/* One trellis state: cumulative bits/score of the best path ending here */
+struct TrellisNode {
+    int prev_node; /* Index into nodes[] of the predecessor, -1 while unset */
+    int quant;
+    int bits;
+    int score;
+};
+/* Size total of each subgroup, used by main() to build workgroup-wide offsets */
+shared int subgroup_sizes[num_subgroups];
+/* Per-invocation: size of each slice in this invocation's slice row */
+int slice_sizes[slices_per_row];
+/* Per-invocation trellis: column 0 is the start state, column x + 1 belongs to slice x */
+TrellisNode nodes[(slices_per_row + 1) * TRELLIS_WIDTH];
+/**
+ * Fill the trellis column of slice `slice_x` in this invocation's slice row.
+ *
+ * For every quantizer entry q in [min_quant, max_quant + 1] the best
+ * predecessor from the previous column is selected. A transition whose
+ * accumulated bits exceed the cumulative budget up to this slice is scored
+ * SCORE_LIMIT, and scores saturate at SCORE_LIMIT.
+ *
+ * @param slice_x horizontal slice index inside the row
+ * @return index into nodes[] of the lowest-score node of this column
+ *         (on equal scores the higher entry wins)
+ */
+int find_slice_quant(int slice_x)
+{
+    int slice = int(gl_LocalInvocationID.x) * slices_per_row + slice_x;
+
+    int trellis_node = int(slice_x + 1) * TRELLIS_WIDTH;
+    [[unroll]] for (int q = min_quant; q < max_quant + 2; q++) {
+        nodes[trellis_node + q].prev_node = -1;
+        nodes[trellis_node + q].quant = q;
+    }
+
+    int mbs = int(slice_x + 1) * mbs_per_slice;
+    nodes[trellis_node + max_quant + 1].quant = scores[slice].overquant; /* Last entry carries the overquant value */
+
+    int bits_limit = mbs * bits_per_mb; /* Budget for all slices up to and including this one */
+    for (int pq = min_quant; pq < max_quant + 2; pq++) {
+        int prev = trellis_node - TRELLIS_WIDTH + pq;
+        for (int q = min_quant; q < max_quant + 2; q++) {
+            int cur = trellis_node + q;
+            int bits = nodes[prev].bits + scores[slice].total_bits[q];
+            int error = scores[slice].total_score[q];
+            if (bits > bits_limit)
+                error = SCORE_LIMIT;
+
+            int new_score;
+            if (nodes[prev].score < SCORE_LIMIT && error < SCORE_LIMIT)
+                new_score = nodes[prev].score + error;
+            else
+                new_score = SCORE_LIMIT;
+            if (nodes[cur].prev_node == -1 || nodes[cur].score >= new_score) { /* >= : later predecessors win ties */
+                nodes[cur].bits = bits;
+                nodes[cur].score = new_score;
+                nodes[cur].prev_node = prev;
+            }
+        }
+    }
+
+    int error = nodes[trellis_node + min_quant].score;
+    int pq = trellis_node + min_quant;
+    for (int q = min_quant + 1; q < max_quant + 2; q++) {
+        if (nodes[trellis_node + q].score <= error) {
+            error = nodes[trellis_node + q].score;
+            pq = trellis_node + q;
+        }
+    }
+
+    return pq;
+}
+/**
+ * Run the trellis over all slices of this invocation's slice row, then walk
+ * the prev_node links back from the best node of the last column to assign a
+ * quantizer to every slice.
+ *
+ * Writes scores[slice].quant and slice_sizes[x] for each slice of the row.
+ *
+ * @return size of the slice row: the slice sizes plus 2 * num_planes of
+ *         header per slice
+ */
+int find_slice_row_quants()
+{
+    for (int i = min_quant; i < max_quant + 2; i++) { /* Start column: nothing spent yet */
+        nodes[i].prev_node = -1;
+        nodes[i].bits = 0;
+        nodes[i].score = 0;
+    }
+
+    int q = 0;
+    for (int slice_x = 0; slice_x < slices_per_row; ++slice_x) {
+        q = find_slice_quant(slice_x); /* After the loop: best node of the last column */
+    }
+
+    int slice_hdr_size = 2 * num_planes;
+    int slice_row_size = slice_hdr_size * slices_per_row;
+    int y = int(gl_LocalInvocationID.x);
+    for (int x = slices_per_row - 1; x >= 0; x--) {
+        int slice = x + y * slices_per_row;
+        int quant = nodes[q].quant;
+        int q_idx = min(quant, max_quant + 1); /* An overquant value maps to table entry max_quant + 1 */
+        ivec4 bits = scores[slice].bits[q_idx];
+        slice_sizes[x] = (bits.x + bits.y + bits.z + bits.w) / 8;
+        slice_row_size += slice_sizes[x];
+        scores[slice].quant = quant;
+        q = nodes[q].prev_node;
+    }
+
+    return slice_row_size;
+}
+/**
+ * Fixed-quantizer path: assign force_quant to every slice of this
+ * invocation's slice row and take the slice sizes from entry 0 of the bits
+ * table.
+ *
+ * @return size of the slice row, computed as in find_slice_row_quants()
+ */
+int force_slice_row_quants()
+{
+    int slice_hdr_size = 2 * num_planes;
+    int slice_row_size = slice_hdr_size * slices_per_row;
+    int y = int(gl_LocalInvocationID.x);
+    for (int x = slices_per_row - 1; x >= 0; x--) {
+        int slice = x + y * slices_per_row;
+        ivec4 bits = scores[slice].bits[0];
+        slice_sizes[x] = (bits.x + bits.y + bits.z + bits.w) / 8;
+        slice_row_size += slice_sizes[x];
+        scores[slice].quant = force_quant;
+    }
+
+    return slice_row_size;
+}
+/**
+ * Each invocation handles one slice row: it picks the quantizers, then the
+ * row sizes are prefix-summed across the workgroup to give every slice its
+ * start offset. The invocation with the highest x index also stores the
+ * offset just past its last slice as the frame size.
+ */
+void main()
+{
+    int slice_row_size;
+    if (force_quant == 0)
+        slice_row_size = find_slice_row_quants();
+    else
+        slice_row_size = force_slice_row_quants();
+
+    int subgroup_sum = subgroupAdd(slice_row_size);
+    subgroup_sizes[gl_SubgroupID] = subgroup_sum; /* Every invocation of the subgroup stores the same total */
+    barrier();
+
+    int buf_start = subgroupExclusiveAdd(slice_row_size); /* Rows before this one inside the subgroup */
+    [[unroll]] for (int i = 0; i < num_subgroups; ++i) {
+        if (i >= gl_SubgroupID)
+            break;
+        buf_start += subgroup_sizes[i]; /* Plus everything in the lower-numbered subgroups */
+    }
+
+    int slice_hdr_size = 2 * num_planes;
+    int y = int(gl_LocalInvocationID.x);
+    [[unroll]] for (int x = 0; x < slices_per_row; ++x) {
+        int slice = x + y * slices_per_row;
+        scores[slice].buf_start = buf_start;
+        buf_start += slice_hdr_size + slice_sizes[x];
+    }
+
+    if (y == gl_WorkGroupSize.x - 1)
+        frame_size = buf_start;
+}
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]
