From: IndecisiveTurtle <geoste...@gmail.com> Performance-wise, encoding a 1-minute 1080p video takes about 2.5 minutes with the CPU encoder running on my Ryzen 5 4600H, while it takes about 30 seconds with the Vulkan encoder on my NVIDIA GTX 1650.
Haar shader has a subgroup optimized variant that applies when configured wavelet depth allows it --- configure | 1 + libavcodec/Makefile | 3 + libavcodec/allcodecs.c | 1 + libavcodec/vc2enc_vulkan.c | 959 +++++++++++++++++++ libavcodec/vulkan/vc2_dwt_haar.comp | 82 ++ libavcodec/vulkan/vc2_dwt_haar_subgroup.comp | 75 ++ libavcodec/vulkan/vc2_dwt_hor_legall.comp | 82 ++ libavcodec/vulkan/vc2_dwt_upload.comp | 96 ++ libavcodec/vulkan/vc2_dwt_ver_legall.comp | 78 ++ libavcodec/vulkan/vc2_encode.comp | 169 ++++ libavcodec/vulkan/vc2_slice_sizes.comp | 170 ++++ 11 files changed, 1716 insertions(+) create mode 100644 libavcodec/vc2enc_vulkan.c create mode 100644 libavcodec/vulkan/vc2_dwt_haar.comp create mode 100644 libavcodec/vulkan/vc2_dwt_haar_subgroup.comp create mode 100644 libavcodec/vulkan/vc2_dwt_hor_legall.comp create mode 100644 libavcodec/vulkan/vc2_dwt_upload.comp create mode 100644 libavcodec/vulkan/vc2_dwt_ver_legall.comp create mode 100644 libavcodec/vulkan/vc2_encode.comp create mode 100644 libavcodec/vulkan/vc2_slice_sizes.comp diff --git a/configure b/configure index c94b8eac43..16ee163b05 100755 --- a/configure +++ b/configure @@ -3130,6 +3130,7 @@ utvideo_encoder_select="bswapdsp huffman llvidencdsp" vble_decoder_select="llviddsp" vbn_decoder_select="texturedsp" vbn_encoder_select="texturedspenc" +vc2_vulkan_encoder_select="vulkan spirv_compiler" vmix_decoder_select="idctdsp" vc1_decoder_select="blockdsp h264qpel intrax8 mpegvideodec qpeldsp vc1dsp" vc1image_decoder_select="vc1_decoder" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index d4ebd86866..79505f8ef1 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -770,6 +770,9 @@ OBJS-$(CONFIG_VC1_MMAL_DECODER) += mmaldec.o OBJS-$(CONFIG_VC1_QSV_DECODER) += qsvdec.o OBJS-$(CONFIG_VC1_V4L2M2M_DECODER) += v4l2_m2m_dec.o OBJS-$(CONFIG_VC2_ENCODER) += vc2enc.o vc2enc_dwt.o vc2enc_common.o diractab.o +OBJS-$(CONFIG_VC2_VULKAN_ENCODER) += vc2enc_vulkan.o vulkan/vc2_encode.o 
vulkan/vc2_slice_sizes.o \ + vulkan/vc2_dwt_hor_legall.o vulkan/vc2_dwt_ver_legall.o \ + vulkan/vc2_dwt_upload.o vulkan/vc2_dwt_haar.o vulkan/vc2_dwt_haar_subgroup.o OBJS-$(CONFIG_VCR1_DECODER) += vcr1.o OBJS-$(CONFIG_VMDAUDIO_DECODER) += vmdaudio.o OBJS-$(CONFIG_VMDVIDEO_DECODER) += vmdvideo.o diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index f10519617e..054b0d958b 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -365,6 +365,7 @@ extern const FFCodec ff_vc1image_decoder; extern const FFCodec ff_vc1_mmal_decoder; extern const FFCodec ff_vc1_qsv_decoder; extern const FFCodec ff_vc1_v4l2m2m_decoder; +extern const FFCodec ff_vc2_vulkan_encoder; extern const FFCodec ff_vc2_encoder; extern const FFCodec ff_vcr1_decoder; extern const FFCodec ff_vmdvideo_decoder; diff --git a/libavcodec/vc2enc_vulkan.c b/libavcodec/vc2enc_vulkan.c new file mode 100644 index 0000000000..d90d65e36d --- /dev/null +++ b/libavcodec/vc2enc_vulkan.c @@ -0,0 +1,959 @@ +/* + * Copyright (C) 2025 raphaelthegreat <geoste...@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/mem.h" +#include "libavutil/pixdesc.h" +#include "libavutil/opt.h" +#include "libavutil/thread.h" +#include "libavutil/version.h" +#include "libavutil/vulkan_spirv.h" +#include "libavutil/hwcontext_vulkan.h" +#include "libavutil/vulkan_loader.h" +#include "libavutil/vulkan.h" +#include "codec_internal.h" +#include "internal.h" +#include "encode.h" +#include "version.h" +#include "vc2enc_common.h" +#include "hwconfig.h" + +#define LEGALL_TILE_DIM 16 +#define LEGALL_WORKGROUP_X 64 +#define SLICE_WORKGROUP_X 128 + +extern const char *ff_source_common_comp; +extern const char *ff_source_vc2_encode_comp; +extern const char *ff_source_vc2_dwt_hor_legall_comp; +extern const char *ff_source_vc2_dwt_ver_legall_comp; +extern const char *ff_source_vc2_slice_sizes_comp; +extern const char *ff_source_vc2_dwt_upload_comp; +extern const char *ff_source_vc2_dwt_haar_comp; +extern const char *ff_source_vc2_dwt_haar_subgroup_comp; + +typedef struct VC2DwtPushData { + int s; + union { + int diff_offset; + int plane_idx; + }; + int level; +} VC2DwtPushData; + +typedef struct VC2EncAuxData { + int quant[MAX_DWT_LEVELS][4]; + int ff_dirac_qscale_tab[116]; +} VC2EncAuxData; + +typedef struct VC2EncPushData { + VkDeviceAddress pb; + int num_x; + int num_y; + int wavelet_depth; + int size_scaler; + int prefix_bytes; +} VC2EncPushData; + +typedef struct VC2EncSliceArgs { + int quant_idx; + int bytes; + int pb_start; + int pad; +} VC2EncSliceArgs; + +typedef struct VC2EncSliceCalcPushData { + int num_x; + int num_y; + int wavelet_depth; + int size_scaler; + int prefix_bytes; + int bits_ceil; + int bits_floor; +} VC2EncSliceCalcPushData; + +typedef struct VC2EncVulkanContext { + VC2EncContext base; + FFVkBuffer 
lut_buf; + FFVkBuffer slice_buf; + VC2EncSliceArgs *slice_args; + + /* Vulkan state */ + FFVulkanContext vkctx; + AVVulkanDeviceQueueFamily *qf; + FFVkExecPool e; + + FFVulkanShader dwt_haar_shd; + FFVulkanShader dwt_upload_shd; + FFVulkanShader dwt_hor_shd, dwt_ver_shd; + FFVulkanShader slice_shd; + FFVulkanShader enc_shd; + AVBufferPool* dwt_buf_pool; + int haar_subgroup; + + VkBuffer plane_buf; + VC2EncPushData enc_consts; + VC2DwtPushData dwt_consts; + VC2EncSliceCalcPushData calc_consts; + + /* Intermediate frame pool */ + AVBufferRef *intermediate_frames_ref[3]; + AVFrame *intermediate_frame[AV_NUM_DATA_POINTERS]; + VkImageView intermediate_views[AV_NUM_DATA_POINTERS]; +} VC2EncVulkanContext; + +static int init_vulkan_pipeline(VC2EncVulkanContext* s, FFVkSPIRVCompiler *spv, + FFVulkanShader* shd, int push_size, + int lg_x, int lg_y, int lg_z, + const char* pl_name, const char* pl_source, + int start_desc, int num_desc) +{ + int err = 0; + uint8_t *spv_data; + size_t spv_len; + void *spv_opaque = NULL; + FFVulkanContext *vkctx = &s->vkctx; + FFVulkanDescriptorSetBinding *desc; + + ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT, + NULL, 0, lg_x, lg_y, lg_z, 0); + + av_bprintf(&shd->src, "struct SliceArgs {int quant_idx;int bytes;int pb_start;int pad;};\n"); + + desc = (FFVulkanDescriptorSetBinding []) { + { + .name = "src_planes", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .mem_layout = ff_vk_shader_rep_fmt(vkctx->frames->sw_format, FF_VK_REP_UINT), + .dimensions = 2, + .elems = av_pix_fmt_count_planes(vkctx->frames->sw_format), + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "coef_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .mem_layout = "r32i", + .dimensions = 2, + .elems = 3, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "AuxData", + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "int lut_quant[5][4]; int 
ff_dirac_qscale_tab[116];", + }, + { + .name = "SliceBuffer", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .mem_layout = "scalar", + .buf_content = "SliceArgs slice_args[];", + }, + }; + RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc + start_desc, num_desc, 0, 0)); + + ff_vk_shader_add_push_const(shd, 0, push_size, VK_SHADER_STAGE_COMPUTE_BIT); + av_bprintf(&shd->src, "#define PB_UNALIGNED\n"); + av_bprintf(&shd->src, "#define PLANE_FMT %d\n", vkctx->frames->sw_format); + GLSLD(ff_source_common_comp); + GLSLD(pl_source); + + /* Compile Haar shader */ + RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque)); + RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main")); + RET(ff_vk_shader_register_exec(vkctx, &s->e, shd)); + +fail: + return err; +} + +static int init_frame_pools(AVCodecContext *avctx) +{ + int i, err = 0; + VC2EncVulkanContext *sv = avctx->priv_data; + AVHWFramesContext *frames_ctx; + AVVulkanFramesContext *vk_frames; + enum AVPixelFormat sw_format = AV_PIX_FMT_GRAY32; + + for (i = 0; i < 3; i++) { + sv->intermediate_frames_ref[i] = av_hwframe_ctx_alloc(sv->vkctx.device_ref); + if (!sv->intermediate_frames_ref[i]) + return AVERROR(ENOMEM); + + frames_ctx = (AVHWFramesContext *)sv->intermediate_frames_ref[i]->data; + frames_ctx->format = AV_PIX_FMT_VULKAN; + frames_ctx->sw_format = sw_format; + frames_ctx->width = sv->base.plane[i].dwt_width; + frames_ctx->height = sv->base.plane[i].dwt_height; + + vk_frames = frames_ctx->hwctx; + vk_frames->tiling = VK_IMAGE_TILING_OPTIMAL; + vk_frames->usage = VK_IMAGE_USAGE_STORAGE_BIT; + vk_frames->img_flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + + err = av_hwframe_ctx_init(sv->intermediate_frames_ref[i]); + if (err < 0) { + av_log(avctx, AV_LOG_ERROR, "Unable to initialize frame pool with format %s: %s\n", + av_get_pix_fmt_name(sw_format), av_err2str(err)); + av_buffer_unref(&sv->intermediate_frames_ref[i]); + return err; + } + } + 
+ return err; +} + +static int vc2_init_vulkan(AVCodecContext *avctx) +{ + VC2EncVulkanContext *sv = avctx->priv_data; + VC2EncContext *s = &sv->base; + FFVulkanContext *vkctx = &sv->vkctx; + FFVkSPIRVCompiler *spv; + VC2EncAuxData *ad = NULL; + int err = 0; + unsigned int subgroup_size = vkctx->subgroup_props.maxSubgroupSize; + + /* Initialize spirv compiler */ + spv = ff_vk_spirv_init(); + if (!spv) { + av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n"); + return -1; + } + + ff_vk_exec_pool_init(vkctx, sv->qf, &sv->e, 1, 0, 0, 0, NULL); + + /* Initialize Haar push data */ + sv->dwt_consts.diff_offset = s->diff_offset; + sv->dwt_consts.s = s->wavelet_idx == VC2_TRANSFORM_HAAR_S ? 1 : 0; + sv->dwt_consts.level = 0; + + /* Initializer slice calculation push data */ + sv->calc_consts.num_x = s->num_x; + sv->calc_consts.num_y = s->num_y; + sv->calc_consts.wavelet_depth = s->wavelet_depth; + sv->calc_consts.prefix_bytes = s->prefix_bytes; + + /* Initialize encoder push data */ + sv->enc_consts.wavelet_depth = s->wavelet_depth; + sv->enc_consts.num_x = s->num_x; + sv->enc_consts.num_y = s->num_y; + + /* Create buffer for encoder auxilary data. */ + RET(ff_vk_create_buf(vkctx, &sv->lut_buf, sizeof(VC2EncAuxData), NULL, NULL, + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); + RET(ff_vk_map_buffer(vkctx, &sv->lut_buf, (void *)&ad, 0)); + vc2_init_quant_matrix(s, ad->quant); + memcpy(ad->ff_dirac_qscale_tab, ff_dirac_qscale_tab, sizeof(ff_dirac_qscale_tab)); + RET(ff_vk_unmap_buffer(vkctx, &sv->lut_buf, 1)); + + /* Create buffer for encoder auxilary data. 
*/ + RET(ff_vk_create_buf(vkctx, &sv->slice_buf, + sizeof(VC2EncSliceArgs) * s->num_x * s->num_y, + NULL, NULL, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)); + RET(ff_vk_map_buffer(vkctx, &sv->slice_buf, (void *)&sv->slice_args, 0)); + memset(sv->slice_args, 0, sv->slice_buf.size); + + /* Initialize intermediate frame pool. */ + RET(init_frame_pools(avctx)); + + /* Initialize encoding pipelines */ + init_vulkan_pipeline(sv, spv, &sv->dwt_upload_shd, sizeof(VC2DwtPushData), + 8, 8, 1, "dwt_upload_pl", ff_source_vc2_dwt_upload_comp, 0, 2); + init_vulkan_pipeline(sv, spv, &sv->slice_shd, sizeof(VC2EncPushData), + SLICE_WORKGROUP_X, 1, 1, "slice_pl", ff_source_vc2_slice_sizes_comp, 1, 3); + init_vulkan_pipeline(sv, spv, &sv->enc_shd, sizeof(VC2EncPushData), + SLICE_WORKGROUP_X, 1, 1, "enc_pl", ff_source_vc2_encode_comp, 1, 3); + sv->haar_subgroup = 0; + + if (s->wavelet_idx == VC2_TRANSFORM_HAAR || s->wavelet_idx == VC2_TRANSFORM_HAAR_S) { + if (subgroup_size == 32 && s->wavelet_depth < 3) { + init_vulkan_pipeline(sv, spv, &sv->dwt_haar_shd, sizeof(VC2DwtPushData), + 64, 1, 1, "dwt_haar_pl", ff_source_vc2_dwt_haar_subgroup_comp, 1, 1); + sv->haar_subgroup = 1; + } else if (subgroup_size == 64 && s->wavelet_depth < 4) { + init_vulkan_pipeline(sv, spv, &sv->dwt_haar_shd, sizeof(VC2DwtPushData), + 64, 1, 1, "dwt_haar_pl", ff_source_vc2_dwt_haar_subgroup_comp, 1, 1); + sv->haar_subgroup = 1; + } else { + init_vulkan_pipeline(sv, spv, &sv->dwt_haar_shd, sizeof(VC2DwtPushData), + 32, 32, 1, "dwt_haar_pl", ff_source_vc2_dwt_haar_comp, 1, 1); + } + } else if (s->wavelet_idx == VC2_TRANSFORM_5_3) { + init_vulkan_pipeline(sv, spv, &sv->dwt_hor_shd, sizeof(VC2DwtPushData), + LEGALL_WORKGROUP_X, 1, 1, "dwt_hor_pl", ff_source_vc2_dwt_hor_legall_comp, 1, 1); + init_vulkan_pipeline(sv, spv, &sv->dwt_ver_shd, sizeof(VC2DwtPushData), + LEGALL_WORKGROUP_X, 1, 1, 
"dwt_ver_pl", ff_source_vc2_dwt_ver_legall_comp, 1, 1); + } + +fail: + return err; +} + +static void vulkan_bind_img_planes(FFVulkanContext *s, FFVkExecContext *e, + FFVulkanShader *shd, VkImageView *views, + int set, int binding) +{ + for (int i = 0; i < 3; i++) + ff_vk_shader_update_img(s, e, shd, set, binding, i, + views[i], VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); +} + +static void dwt_plane_haar(VC2EncVulkanContext *s, FFVkExecContext *exec, + VkImageMemoryBarrier2* img_bar, int nb_img_bar) +{ + int p, group_x, group_y; + FFVulkanContext *vkctx = &s->vkctx; + FFVulkanFunctions *vk = &vkctx->vkfn; + Plane* plane; + + s->dwt_consts.level = s->base.wavelet_depth; + vulkan_bind_img_planes(vkctx, exec, &s->dwt_haar_shd, s->intermediate_views, 0, 0); + ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_haar_shd); + + /* Haar pass */ + for (p = 0; p < 3; p++) { + plane = &s->base.plane[p]; + s->dwt_consts.plane_idx = p; + if (s->haar_subgroup) { + group_x = FFALIGN(plane->dwt_width, 8) >> 3; + group_y = FFALIGN(plane->dwt_height, 8) >> 3; + } else { + group_x = FFALIGN(plane->dwt_width, 32) >> 5; + group_y = FFALIGN(plane->dwt_height, 32) >> 5; + } + + ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_haar_shd, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(VC2DwtPushData), &s->dwt_consts); + vk->CmdDispatch(exec->buf, group_x, group_y, 1); + } + + /* Wait for haar dispatches to complete */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); +} + +static void dwt_plane_legall(VC2EncVulkanContext *s, FFVkExecContext *exec, + VkImageMemoryBarrier2* img_bar, int nb_img_bar) +{ + FFVulkanContext *vkctx = &s->vkctx; + FFVulkanFunctions *vk = &vkctx->vkfn; + int legall_group_x = (s->base.plane[0].dwt_height + LEGALL_WORKGROUP_X - 1) >> 6; + int legall_group_y = (s->base.plane[0].dwt_width + LEGALL_WORKGROUP_X - 1) >> 6; + int i; + + /* Perform 
legall wavelet trasform */ + for (i = 0; i < s->base.wavelet_depth; i++) { + s->dwt_consts.level = i; + + /* Horizontal legall pass */ + vulkan_bind_img_planes(vkctx, exec, &s->dwt_hor_shd, s->intermediate_views, 0, 0); + ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_hor_shd); + ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_hor_shd, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(VC2DwtPushData), &s->dwt_consts); + vk->CmdDispatch(exec->buf, legall_group_x, 1, 3); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + + /* Vertical legall pass */ + vulkan_bind_img_planes(vkctx, exec, &s->dwt_ver_shd, s->intermediate_views, 0, 0); + ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_ver_shd); + ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_ver_shd, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(VC2DwtPushData), &s->dwt_consts); + vk->CmdDispatch(exec->buf, legall_group_y, 1, 3); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + } +} + +static int vulkan_dwt_plane(VC2EncVulkanContext *s, FFVkExecContext *exec, AVFrame *frame) +{ + int i, err = 0, nb_img_bar = 0; + int wavelet_idx = s->base.wavelet_idx; + int group_x = s->base.plane[0].dwt_width >> 3; + int group_y = s->base.plane[0].dwt_height >> 3; + FFVulkanContext *vkctx = &s->vkctx; + FFVulkanFunctions *vk = &vkctx->vkfn; + VkImageView views[AV_NUM_DATA_POINTERS]; + VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS]; + + /* Generate barriers and image views for frame images. 
*/ + RET(ff_vk_exec_add_dep_frame(vkctx, exec, frame, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + RET(ff_vk_create_imageviews(vkctx, exec, views, frame, FF_VK_REP_UINT)); + ff_vk_frame_barrier(vkctx, exec, frame, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + /* Submit the image barriers. */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + + /* Create a temporaty frames */ + nb_img_bar = 0; + for (i = 0; i < 3; i++) { + s->intermediate_frame[i] = av_frame_alloc(); + if (!s->intermediate_frame[i]) + return AVERROR(ENOMEM); + + RET(av_hwframe_get_buffer(s->intermediate_frames_ref[i], + s->intermediate_frame[i], 0)); + RET(ff_vk_exec_add_dep_frame(vkctx, exec, s->intermediate_frame[i], + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + RET(ff_vk_create_imageviews(vkctx, exec, &s->intermediate_views[i], + s->intermediate_frame[i], FF_VK_REP_INT)); + ff_vk_frame_barrier(vkctx, exec, s->intermediate_frame[i], img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + } + + /* Submit the image barriers. */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + + /* Bind input images to the shader. 
*/ + ff_vk_shader_update_img_array(vkctx, exec, &s->dwt_upload_shd, frame, views, 0, 0, + VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE); + vulkan_bind_img_planes(vkctx, exec, &s->dwt_upload_shd, s->intermediate_views, 0, 1); + + /* Upload coefficients from planes to the buffer. */ + s->dwt_consts.diff_offset = s->base.diff_offset; + ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_upload_shd); + ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_upload_shd, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(VC2DwtPushData), &s->dwt_consts); + vk->CmdDispatch(exec->buf, group_x, group_y, 1); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + + /* Perform wavelet trasform. */ + if (wavelet_idx == VC2_TRANSFORM_HAAR || wavelet_idx == VC2_TRANSFORM_HAAR_S) + dwt_plane_haar(s, exec, img_bar, nb_img_bar); + else if (wavelet_idx == VC2_TRANSFORM_5_3) + dwt_plane_legall(s, exec, img_bar, nb_img_bar); + +fail: + return err; +} + +static void vulkan_encode_slices(VC2EncVulkanContext *s, FFVkExecContext *exec) +{ + int num_slices = s->base.num_x * s->base.num_y; + int num_slice_groups = (num_slices + SLICE_WORKGROUP_X - 1) >> 7; + int i, skip = 0; + FFVulkanContext *vkctx = &s->vkctx; + FFVulkanFunctions *vk = &vkctx->vkfn; + + /* Calculate slice sizes. 
*/ + vulkan_bind_img_planes(vkctx, exec, &s->slice_shd, s->intermediate_views, 0, 0); + ff_vk_shader_update_desc_buffer(vkctx, exec, &s->slice_shd, + 0, 1, 0, &s->lut_buf, 0, + sizeof(VC2EncAuxData), + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &s->slice_shd, + 0, 2, 0, &s->slice_buf, 0, + s->slice_buf.size, + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(vkctx, exec, &s->slice_shd); + ff_vk_shader_update_push_const(vkctx, exec, &s->slice_shd, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(VC2EncSliceCalcPushData), &s->calc_consts); + vk->CmdDispatch(exec->buf, num_slice_groups, 1, 1); + + flush_put_bits(&s->base.pb); + s->enc_consts.pb += put_bytes_output(&s->base.pb); + + /* Wait for slice sizes to be written. */ + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = s->slice_buf.buf, + .size = sizeof(VC2EncSliceArgs) * num_slices, + .offset = 0, + }, + .bufferMemoryBarrierCount = 1U, + }); + + /* Bind encoding shader. 
*/ + vulkan_bind_img_planes(vkctx, exec, &s->enc_shd, s->intermediate_views, 0, 0); + ff_vk_shader_update_desc_buffer(vkctx, exec, &s->enc_shd, + 0, 1, 0, &s->lut_buf, 0, + sizeof(VC2EncAuxData), + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(vkctx, exec, &s->enc_shd, + 0, 2, 0, &s->slice_buf, 0, + s->slice_buf.size, + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(vkctx, exec, &s->enc_shd); + ff_vk_shader_update_push_const(vkctx, exec, &s->enc_shd, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(VC2EncPushData), &s->enc_consts); + + vk->CmdDispatch(exec->buf, num_slice_groups, 1, 1); + + ff_vk_exec_submit(vkctx, exec); + ff_vk_exec_wait(vkctx, exec); + + for (int slice_y = 0; slice_y < s->base.num_y; slice_y++) { + for (int slice_x = 0; slice_x < s->base.num_x; slice_x++) { + VC2EncSliceArgs *args = &s->slice_args[s->base.num_x * slice_y + slice_x]; + skip += args->bytes; + } + } + + /* Skip forward to write end header */ + skip_put_bytes(&s->base.pb, skip); + + /* Free allocated intermediate frames */ + for (i = 0; i < 3; i++) + av_frame_free(&s->intermediate_frame[i]); +} + +static int encode_frame(VC2EncVulkanContext *sv, AVPacket *avpkt, const AVFrame *frame, + const char *aux_data, const int header_size, int field) +{ + int ret; + int64_t max_frame_bytes; + AVBufferRef *avpkt_buf = NULL; + FFVkBuffer* buf_vk = NULL; + VC2EncContext* s = &sv->base; + FFVulkanContext *vkctx = &sv->vkctx; + FFVkExecContext *exec = ff_vk_exec_get(vkctx, &sv->e); + + ff_vk_exec_start(vkctx, exec); + + /* Perform wavelet pass on the inpute frame. 
*/ + vulkan_dwt_plane(sv, exec, (AVFrame*)frame); + + /* Allocate a buffer that can fit at all all 3 planes of data */ + max_frame_bytes = header_size + s->avctx->width * s->avctx->height * sizeof(dwtcoef); + + /* Get a pooled device local host visible buffer for writing output data */ + if (field < 2) { + ret = ff_vk_get_pooled_buffer(vkctx, &sv->dwt_buf_pool, &avpkt_buf, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, + max_frame_bytes << s->interlaced, + VK_MEMORY_PROPERTY_HOST_CACHED_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + avpkt->buf = avpkt_buf; + buf_vk = (FFVkBuffer *)avpkt_buf->data; + avpkt->data = buf_vk->mapped_mem; + avpkt->size = max_frame_bytes << s->interlaced; + sv->enc_consts.pb = buf_vk->address; + ff_vk_exec_add_dep_buf(vkctx, exec, &avpkt_buf, 1, 1); + + if (ret < 0) + return ret; + init_put_bits(&s->pb, avpkt->data, avpkt->size); + } + + /* Sequence header */ + vc2_encode_parse_info(s, DIRAC_PCODE_SEQ_HEADER); + vc2_encode_seq_header(s); + + /* Encoder version */ + if (aux_data) { + vc2_encode_parse_info(s, DIRAC_PCODE_AUX); + ff_put_string(&s->pb, aux_data, 1); + } + + /* Picture header */ + vc2_encode_parse_info(s, DIRAC_PCODE_PICTURE_HQ); + vc2_encode_picture_start(s); + + /* Encode slices */ + vulkan_encode_slices(sv, exec); + + /* End sequence */ + vc2_encode_parse_info(s, DIRAC_PCODE_END_SEQ); + + return 0; +} + +static av_cold int vc2_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, + const AVFrame *frame, int *got_packet) +{ + int ret = 0; + int slice_ceil, sig_size = 256; + VC2EncVulkanContext *sv = avctx->priv_data; + VC2EncContext *s = &sv->base; + const int bitexact = avctx->flags & AV_CODEC_FLAG_BITEXACT; + const char *aux_data = bitexact ? "Lavc" : LIBAVCODEC_IDENT; + const int aux_data_size = bitexact ? 
sizeof("Lavc") : sizeof(LIBAVCODEC_IDENT); + const int header_size = 100 + aux_data_size; + int64_t r_bitrate = avctx->bit_rate >> (s->interlaced); + + s->avctx = avctx; + s->size_scaler = 2; + s->prefix_bytes = 0; + s->last_parse_code = 0; + s->next_parse_offset = 0; + + /* Rate control */ + s->frame_max_bytes = (av_rescale(r_bitrate, s->avctx->time_base.num, + s->avctx->time_base.den) >> 3) - header_size; + s->slice_max_bytes = slice_ceil = av_rescale(s->frame_max_bytes, 1, s->num_x * s->num_y); + + /* Find an appropriate size scaler */ + while (sig_size > 255) { + int r_size = SSIZE_ROUND(s->slice_max_bytes); + if (r_size > slice_ceil) { + s->slice_max_bytes -= r_size - slice_ceil; + r_size = SSIZE_ROUND(s->slice_max_bytes); + } + sig_size = r_size/s->size_scaler; /* Signalled slize size */ + s->size_scaler <<= 1; + } + + s->slice_min_bytes = s->slice_max_bytes - s->slice_max_bytes*(s->tolerance/100.0f); + if (s->slice_min_bytes < 0) + return AVERROR(EINVAL); + + /* Update slice calc push data */ + sv->calc_consts.size_scaler = s->size_scaler; + sv->calc_consts.bits_ceil = s->slice_max_bytes << 3; + sv->calc_consts.bits_floor = s->slice_min_bytes << 3; + sv->enc_consts.prefix_bytes = 0; + sv->enc_consts.size_scaler = s->size_scaler; + + ret = encode_frame(sv, avpkt, frame, aux_data, header_size, s->interlaced); + if (ret) + return ret; + if (s->interlaced) { + ret = encode_frame(sv, avpkt, frame, aux_data, header_size, 2); + if (ret) + return ret; + } + + flush_put_bits(&s->pb); + av_shrink_packet(avpkt, put_bytes_output(&s->pb)); + avpkt->flags |= AV_PKT_FLAG_KEY; + *got_packet = 1; + + return 0; +} + +static av_cold int vc2_encode_end(AVCodecContext *avctx) +{ + VC2EncVulkanContext *sv = avctx->priv_data; + FFVulkanContext *vkctx = &sv->vkctx; + int i; + + ff_vk_exec_pool_free(vkctx, &sv->e); + + ff_vk_shader_free(vkctx, &sv->dwt_upload_shd); + ff_vk_shader_free(vkctx, &sv->dwt_haar_shd); + ff_vk_shader_free(vkctx, &sv->dwt_hor_shd); + ff_vk_shader_free(vkctx, 
&sv->dwt_ver_shd); + ff_vk_shader_free(vkctx, &sv->slice_shd); + ff_vk_shader_free(vkctx, &sv->enc_shd); + + ff_vk_free_buf(vkctx, &sv->slice_buf); + ff_vk_free_buf(vkctx, &sv->lut_buf); + + for (i = 0; i < 3; i++) { + ff_vc2enc_free_transforms(&sv->base.transform_args[i].t); + av_freep(&sv->base.plane[i].coef_buf); + av_buffer_unref(&sv->intermediate_frames_ref[i]); + } + + av_buffer_pool_uninit(&sv->dwt_buf_pool); + ff_vk_uninit(vkctx); + + return 0; +} + +static av_cold int vc2_encode_init(AVCodecContext *avctx) +{ + static AVOnce init_static_once = AV_ONCE_INIT; + Plane *p; + SubBand *b; + int i, level, o, ret, depth; + const AVPixFmtDescriptor *fmt; + VC2EncVulkanContext *sv = avctx->priv_data; + VC2EncContext *s = &sv->base; + FFVulkanContext *vkctx = &sv->vkctx; + + /* Init Vulkan */ + ret = ff_vk_init(&sv->vkctx, avctx, NULL, avctx->hw_frames_ctx); + if (ret < 0) + return ret; + + sv->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0); + if (!sv->qf) { + av_log(avctx, AV_LOG_ERROR, "Device has no compute queues!\n"); + return ret; + } + + s->picture_number = 0; + + /* Total allowed quantization range */ + s->q_ceil = DIRAC_MAX_QUANT_INDEX; + + s->ver.major = 2; + s->ver.minor = 0; + s->profile = 3; + s->level = 3; + + s->base_vf = -1; + s->strict_compliance = 1; + + s->q_avg = 0; + s->slice_max_bytes = 0; + s->slice_min_bytes = 0; + + /* Mark unknown as progressive */ + s->interlaced = !((avctx->field_order == AV_FIELD_UNKNOWN) || + (avctx->field_order == AV_FIELD_PROGRESSIVE)); + if (s->interlaced != 0) { + av_log(avctx, AV_LOG_ERROR, "Interlaced video is unsupported by this encoder\n"); + return AVERROR(ENOTSUP); + } + + for (i = 0; i < base_video_fmts_len; i++) { + const VC2BaseVideoFormat *fmt = &base_video_fmts[i]; + if (avctx->pix_fmt != fmt->pix_fmt || avctx->time_base.num != fmt->time_base.num || + avctx->time_base.den != fmt->time_base.den || avctx->width != fmt->width || + avctx->height != fmt->height || s->interlaced != fmt->interlaced) + 
continue; + s->base_vf = i; + s->level = base_video_fmts[i].level; + break; + } + + if (s->interlaced) + av_log(avctx, AV_LOG_WARNING, "Interlacing enabled!\n"); + + if ((s->slice_width & (s->slice_width - 1)) || + (s->slice_height & (s->slice_height - 1))) { + av_log(avctx, AV_LOG_ERROR, "Slice size is not a power of two!\n"); + return AVERROR_UNKNOWN; + } + + if ((s->slice_width > avctx->width) || + (s->slice_height > avctx->height)) { + av_log(avctx, AV_LOG_ERROR, "Slice size is bigger than the image!\n"); + return AVERROR_UNKNOWN; + } + + if (s->base_vf <= 0) { + if (avctx->strict_std_compliance < FF_COMPLIANCE_STRICT) { + s->strict_compliance = s->base_vf = 0; + av_log(avctx, AV_LOG_WARNING, "Format does not strictly comply with VC2 specs\n"); + } else { + av_log(avctx, AV_LOG_WARNING, "Given format does not strictly comply with " + "the specifications, decrease strictness to use it.\n"); + return AVERROR_UNKNOWN; + } + } else { + av_log(avctx, AV_LOG_INFO, "Selected base video format = %i (%s)\n", + s->base_vf, base_video_fmts[s->base_vf].name); + } + + /* Chroma subsampling */ + ret = av_pix_fmt_get_chroma_sub_sample(vkctx->frames->sw_format, &s->chroma_x_shift, + &s->chroma_y_shift); + if (ret) + return ret; + + /* Bit depth and color range index */ + fmt = av_pix_fmt_desc_get(vkctx->frames->sw_format); + depth = fmt->comp[0].depth; + + /* 16-bit depth is unsupported by this encoder */ + if (depth == 16) { + av_log(avctx, AV_LOG_ERROR, "16-bit pixel format depth is unsupported by this encoder\n"); + return AVERROR(ENOTSUP); + } + + if (depth == 8 && avctx->color_range == AVCOL_RANGE_JPEG) { + s->bpp = 1; + s->bpp_idx = 1; + s->diff_offset = 128; + } else if (depth == 8 && (avctx->color_range == AVCOL_RANGE_MPEG || + avctx->color_range == AVCOL_RANGE_UNSPECIFIED)) { + s->bpp = 1; + s->bpp_idx = 2; + s->diff_offset = 128; + } else if (depth == 10) { + s->bpp = 2; + s->bpp_idx = 3; + s->diff_offset = 512; + } else { + s->bpp = 2; + s->bpp_idx = 4; + 
s->diff_offset = 2048;
+    }
+
+    /* Planes initialization: planes 1 and 2 are scaled down by the chroma shifts, interlaced content halves the height, and the DWT size is the plane size padded to a multiple of 2^wavelet_depth */
+    for (i = 0; i < 3; i++) {
+        int w, h;
+        p = &s->plane[i];
+        p->width = avctx->width >> (i ? s->chroma_x_shift : 0);
+        p->height = avctx->height >> (i ? s->chroma_y_shift : 0);
+        if (s->interlaced)
+            p->height >>= 1;
+        p->dwt_width = w = FFALIGN(p->width, (1 << s->wavelet_depth));
+        p->dwt_height = h = FFALIGN(p->height, (1 << s->wavelet_depth));
+        p->coef_stride = FFALIGN(p->dwt_width, 32);
+        for (level = s->wavelet_depth-1; level >= 0; level--) { /* band size halves with every step towards level 0 */
+            w = w >> 1;
+            h = h >> 1;
+            for (o = 0; o < 4; o++) {
+                b = &p->band[level][o];
+                b->width = w;
+                b->height = h;
+                b->stride = p->coef_stride;
+                b->shift = (o > 1)*b->height*b->stride + (o & 1)*b->width; /* o & 1 selects the right half, o > 1 the bottom half of this level's area */
+            }
+        }
+
+        /* DWT init: any non-zero result is reported as ENOMEM */
+        if (ff_vc2enc_init_transforms(&s->transform_args[i].t,
+                                      s->plane[i].coef_stride,
+                                      s->plane[i].dwt_height,
+                                      s->slice_width, s->slice_height))
+            return AVERROR(ENOMEM);
+    }
+
+    /* Slices: the slice grid is derived from the padded DWT size of plane 0 */
+    s->num_x = s->plane[0].dwt_width/s->slice_width;
+    s->num_y = s->plane[0].dwt_height/s->slice_height;
+
+    s->slice_args = av_calloc(s->num_x*s->num_y, sizeof(SliceArgs));
+    if (!s->slice_args)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < 116; i++) { /* one two-word entry per quantiser index; NOTE(review): 116 is presumably the size of ff_dirac_qscale_tab - confirm and prefer a named constant */
+        const uint64_t qf = ff_dirac_qscale_tab[i];
+        const uint32_t m = av_log2(qf);
+        const uint32_t t = (1ULL << (m + 32)) / qf;
+        const uint32_t r = (t*qf + qf) & UINT32_MAX;
+        if (!(qf & (qf - 1))) { /* qf is a power of two */
+            s->qmagic_lut[i][0] = 0xFFFFFFFF;
+            s->qmagic_lut[i][1] = 0xFFFFFFFF;
+        } else if (r <= 1 << m) { /* NOTE(review): 1 << m is an int shift; confirm m stays below 31 for every table entry */
+            s->qmagic_lut[i][0] = t + 1;
+            s->qmagic_lut[i][1] = 0;
+        } else {
+            s->qmagic_lut[i][0] = t;
+            s->qmagic_lut[i][1] = t;
+        }
+    }
+
+    ff_thread_once(&init_static_once, vc2_init_static_data);
+
+    vc2_init_vulkan(avctx); /* NOTE(review): the result of vc2_init_vulkan() is discarded, so init returns 0 regardless - confirm it cannot fail, or propagate its error */
+
+    return 0;
+}
+
+#define VC2ENC_FLAGS (AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption vc2enc_options[] = { /* private options; the AV_OPT_TYPE_CONST rows name values for the option sharing their .unit */
+    {"tolerance", "Max undershoot in percent", offsetof(VC2EncContext, tolerance), AV_OPT_TYPE_DOUBLE, {.dbl = 5.0f}, 0.0f, 45.0f, VC2ENC_FLAGS, .unit = "tolerance"},
+    {"slice_width", "Slice width", offsetof(VC2EncContext, slice_width), AV_OPT_TYPE_INT, {.i64 = 32}, 32, 1024, VC2ENC_FLAGS, .unit = "slice_width"},
+    {"slice_height", "Slice height", offsetof(VC2EncContext, slice_height), AV_OPT_TYPE_INT, {.i64 = 8}, 8, 1024, VC2ENC_FLAGS, .unit = "slice_height"},
+    {"wavelet_depth", "Transform depth", offsetof(VC2EncContext, wavelet_depth), AV_OPT_TYPE_INT, {.i64 = 4}, 1, 5, VC2ENC_FLAGS, .unit = "wavelet_depth"},
+    {"wavelet_type", "Transform type", offsetof(VC2EncContext, wavelet_idx), AV_OPT_TYPE_INT, {.i64 = VC2_TRANSFORM_HAAR_S}, 0, VC2_TRANSFORMS_NB, VC2ENC_FLAGS, .unit = "wavelet_idx"}, /* NOTE(review): the upper bound is VC2_TRANSFORMS_NB itself and only three transforms are named below - confirm other values are rejected elsewhere */
+    {"5_3", "LeGall (5,3)", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_5_3}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "wavelet_idx"},
+    {"haar", "Haar (with shift)", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_HAAR_S}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "wavelet_idx"},
+    {"haar_noshift", "Haar (without shift)", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_HAAR}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "wavelet_idx"},
+    {"qm", "Custom quantization matrix", offsetof(VC2EncContext, quant_matrix), AV_OPT_TYPE_INT, {.i64 = VC2_QM_DEF}, 0, VC2_QM_NB, VC2ENC_FLAGS, .unit = "quant_matrix"}, /* NOTE(review): same question for VC2_QM_NB as an accepted value */
+    {"default", "Default from the specifications", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_DEF}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "quant_matrix"},
+    {"color", "Prevents low bitrate discoloration", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_COL}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "quant_matrix"},
+    {"flat", "Optimize for PSNR", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_FLAT}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "quant_matrix"},
+    {NULL}
+};
+
+static const AVClass vc2enc_class = { /* AVClass exposing vc2enc_options for this encoder */
+    .class_name = "vc2_vulkan_encoder",
+    .category = AV_CLASS_CATEGORY_ENCODER,
+    .option = vc2enc_options,
+    .item_name = av_default_item_name,
+    .version = LIBAVUTIL_VERSION_INT
+};
+
+static const FFCodecDefault vc2enc_defaults[] = { /* overrides the generic "b" (bitrate) default */
+    { "b", "600000000" },
+    { NULL },
+};
+
+const AVCodecHWConfigInternal *const ff_vc2_hw_configs[] = { /* NOTE(review): only referenced by the codec entry below in this chunk - confirm whether it needs external linkage */
+    HW_CONFIG_ENCODER_FRAMES(VULKAN, VULKAN),
+    HW_CONFIG_ENCODER_DEVICE(NONE, VULKAN),
+    NULL,
+};
+
+const FFCodec ff_vc2_vulkan_encoder = { /* codec registration: Dirac/VC-2 encoder taking AV_PIX_FMT_VULKAN frames */
+    .p.name = "vc2_vulkan",
+    CODEC_LONG_NAME("SMPTE VC-2"),
+    .p.type = AVMEDIA_TYPE_VIDEO,
+    .p.id = AV_CODEC_ID_DIRAC,
+    .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_HARDWARE,
+    .caps_internal = FF_CODEC_CAP_INIT_CLEANUP,
+    .priv_data_size = sizeof(VC2EncVulkanContext),
+    .init = vc2_encode_init,
+    .close = vc2_encode_end,
+    FF_CODEC_ENCODE_CB(vc2_encode_frame),
+    .p.priv_class = &vc2enc_class,
+    .defaults = vc2enc_defaults,
+    CODEC_PIXFMTS(AV_PIX_FMT_VULKAN),
+    .hw_configs = ff_vc2_hw_configs,
+};
diff --git a/libavcodec/vulkan/vc2_dwt_haar.comp b/libavcodec/vulkan/vc2_dwt_haar.comp
new file mode 100644
index 0000000000..4806cca729
--- /dev/null
+++ b/libavcodec/vulkan/vc2_dwt_haar.comp
@@ -0,0 +1,82 @@
+/*
+ * VC2 codec
+ *
+ * Copyright (c) 2025 raphaelthegreat <geoste...@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+
+#define LOCAL_X 1024
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    int s;
+    int plane_idx;
+    int wavelet_depth;
+};
+
+/* One slot per invocation of the workgroup, used to exchange coefficients. */
+shared int local_coef[LOCAL_X];
+
+/*
+ * In-place multi-level Haar transform of coef_buf[plane_idx].
+ *
+ * Every invocation owns one coefficient. At level i only the invocations
+ * whose x and y are both multiples of 2^i take part; each pairs up with the
+ * invocation 2^i to its right (horizontal pass, scaled by 1 << s) and then
+ * with the one 2^i rows below (vertical pass, no scaling). Partners are
+ * found by XOR-ing the flat local index, with rows 32 entries apart, so
+ * this assumes a 32x32 workgroup (the local size is set outside this file).
+ *
+ * barrier() must be reached by every invocation of the workgroup, so
+ * invocations that have dropped out keep iterating and only skip the
+ * arithmetic. A second barrier after each read phase keeps a fast
+ * invocation from overwriting its slot before its partner has read it.
+ */
+void main()
+{
+    ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
+    uint self_id = gl_LocalInvocationIndex;
+    int value = imageLoad(coef_buf[plane_idx], coord).x;
+
+    /* Perform Haar wavelet on the 32x32 local workgroup with shared memory */
+    for (int i = 0; i < wavelet_depth; i++)
+    {
+        /* The mask only grows, so an invocation that drops out stays out. */
+        ivec2 mask = ivec2((1 << i) - 1);
+        bool takes_part = all(equal(coord & mask, ivec2(0)));
+
+        /* Offset between valid hor pixels for each level, +1, +2, +4 etc */
+        int dist = (1 << i);
+
+        if (takes_part)
+            local_coef[self_id] = value;
+        barrier();
+
+        /* Horizontal haar wavelet */
+        if (takes_part)
+        {
+            uint other_id = self_id ^ uint(dist);
+            int other = local_coef[other_id];
+            bool is_low = self_id < other_id;
+            int a = is_low ? value : other;
+            int b = is_low ? other : value;
+            int dst_b = (b - a) * (1 << s);
+            int dst_a = a * (1 << s) + ((dst_b + 1) >> 1);
+            value = is_low ? dst_a : dst_b;
+        }
+        /* All horizontal reads are done before any slot is rewritten. */
+        barrier();
+
+        /* Offset between valid ver pixels for each level, +1, +2, +4 etc */
+        dist <<= 5;
+
+        if (takes_part)
+            local_coef[self_id] = value;
+        barrier();
+
+        /* Vertical haar wavelet */
+        if (takes_part)
+        {
+            uint other_id = self_id ^ uint(dist);
+            int other = local_coef[other_id];
+            bool is_low = self_id < other_id;
+            int a = is_low ? value : other;
+            int b = is_low ? other : value;
+            int dst_b = b - a;
+            int dst_a = a + ((dst_b + 1) >> 1);
+            value = is_low ? dst_a : dst_b;
+        }
+        /* All vertical reads are done before the next level rewrites slots. */
+        barrier();
+    }
+
+    /* Store value */
+    imageStore(coef_buf[plane_idx], coord, ivec4(value));
+}
diff --git a/libavcodec/vulkan/vc2_dwt_haar_subgroup.comp b/libavcodec/vulkan/vc2_dwt_haar_subgroup.comp
new file mode 100644
index 0000000000..81b0964271
--- /dev/null
+++ b/libavcodec/vulkan/vc2_dwt_haar_subgroup.comp
@@ -0,0 +1,75 @@
+/*
+ * VC2 codec
+ *
+ * Copyright (c) 2025 raphaelthegreat <geoste...@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+
+#define TILE_DIM 8
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    int s;
+    int plane_idx;
+    int wavelet_depth;
+};
+
+/*
+ * Haar transform of one 8x8 tile per workgroup, exchanging coefficients
+ * with subgroupShuffle() instead of shared memory.
+ *
+ * The pairing is the same as in the shared-memory variant, with rows
+ * 8 entries apart. An invocation leaves the loop once its tile-local
+ * coordinate is no longer a multiple of 2^i; its partner at each level
+ * has the same low bits and is therefore still running.
+ *
+ * NOTE(review): tile coordinates come from gl_LocalInvocationIndex while
+ * partners are addressed by gl_SubgroupInvocationID. This presumably relies
+ * on the host selecting this variant only when every partner, up to
+ * 1 << (wavelet_depth + 2) lanes away, sits in the same subgroup - confirm.
+ */
+void main()
+{
+    ivec2 tile_coord = ivec2(gl_WorkGroupID.xy);
+    ivec2 local_coord = ivec2(gl_LocalInvocationIndex & 7, gl_LocalInvocationIndex >> 3);
+    ivec2 coord = tile_coord * ivec2(TILE_DIM) + local_coord;
+
+    int value = imageLoad(coef_buf[plane_idx], coord).x;
+    for (int i = 0; i < wavelet_depth; i++)
+    {
+        ivec2 mask = ivec2((1 << i) - 1);
+        if (any(notEqual(local_coord & mask, ivec2(0))))
+            break;
+
+        /* Offset between valid hor pixels for each level, +1, +2, +4 etc */
+        int dist = (1 << i);
+
+        /* Horizontal haar wavelet */
+        uint other_sub_id = gl_SubgroupInvocationID ^ dist;
+        int other = subgroupShuffle(value, other_sub_id);
+        int a = gl_SubgroupInvocationID < other_sub_id ? value : other;
+        int b = gl_SubgroupInvocationID < other_sub_id ? other : value;
+        int dst_b = (b - a) * (1 << s);
+        int dst_a = a * (1 << s) + ((dst_b + 1) >> 1);
+        value = gl_SubgroupInvocationID < other_sub_id ? dst_a : dst_b;
+
+        /* Offset between valid ver pixels for each level, +1, +2, +4 etc */
+        dist <<= 3;
+
+        /* Vertical haar wavelet */
+        other_sub_id = gl_SubgroupInvocationID ^ dist;
+        other = subgroupShuffle(value, other_sub_id);
+        a = gl_SubgroupInvocationID < other_sub_id ? value : other;
+        b = gl_SubgroupInvocationID < other_sub_id ? other : value;
+        dst_b = b - a;
+        dst_a = a + ((dst_b + 1) >> 1);
+        value = gl_SubgroupInvocationID < other_sub_id ? dst_a : dst_b;
+    }
+
+    /* Store value */
+    imageStore(coef_buf[plane_idx], coord, ivec4(value));
+}
diff --git a/libavcodec/vulkan/vc2_dwt_hor_legall.comp b/libavcodec/vulkan/vc2_dwt_hor_legall.comp
new file mode 100644
index 0000000000..bada2ee1fd
--- /dev/null
+++ b/libavcodec/vulkan/vc2_dwt_hor_legall.comp
@@ -0,0 +1,82 @@
+/*
+ * VC2 codec
+ *
+ * Copyright (c) 2025 raphaelthegreat <geoste...@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    int s;
+    int diff_offset;
+    int level;
+};
+
+/* Read column coord_x of the row this invocation works on. */
+int image_load(int coord_x)
+{
+    ivec2 pos = ivec2(coord_x, int(gl_GlobalInvocationID.x));
+    return imageLoad(coef_buf[gl_GlobalInvocationID.z], pos).x;
+}
+
+/* Write value to column coord_x of the row this invocation works on. */
+void image_store(int coord_x, int value)
+{
+    ivec2 pos = ivec2(coord_x, int(gl_GlobalInvocationID.x));
+    imageStore(coef_buf[gl_GlobalInvocationID.z], pos, ivec4(value));
+}
+
+/*
+ * Horizontal LeGall (5,3) lifting pass for one row of one plane.
+ *
+ * gl_GlobalInvocationID.x selects the row and .z the plane. At the given
+ * level only every (1 << level)-th row and column is touched. The row is
+ * processed in place: samples are doubled, odd samples are predicted from
+ * their even neighbours, then even samples are updated from the new odd ones.
+ */
+void main()
+{
+    uint plane_idx = gl_GlobalInvocationID.z;
+    int row = int(gl_GlobalInvocationID.x);
+    ivec2 work_area = imageSize(coef_buf[plane_idx]);
+    int width = work_area.x;
+    int dist = 1 << level;
+    int pair = dist << 1;
+
+    /* Rows past the image or off this level's grid have nothing to do. */
+    bool on_grid = (row & (dist - 1)) == 0;
+    if (row >= work_area.y || !on_grid)
+        return;
+
+    // Shift in one bit that is used for additional precision
+    for (int col = 0; col < width; col += dist) {
+        int scaled = image_load(col) << 1;
+        image_store(col, scaled);
+    }
+
+    // Lifting stage 2: odd sample minus the rounded mean of its neighbours
+    for (int odd = dist; odd < width - dist; odd += pair) {
+        int neighbours = image_load(odd - dist) + image_load(odd + dist);
+        image_store(odd, image_load(odd) - ((neighbours + 1) >> 1));
+    }
+    // The last odd sample has no right neighbour; the left one counts twice
+    int last_odd = width - dist;
+    int edge = image_load(last_odd - dist);
+    image_store(last_odd, image_load(last_odd) - ((2 * edge + 1) >> 1));
+
+    // Lifting stage 1: even sample plus a rounded quarter of its neighbours
+    // Sample 0 has no left neighbour; the right one counts twice
+    edge = image_load(dist);
+    image_store(0, image_load(0) + ((2 * edge + 2) >> 2));
+    for (int even = pair; even <= width - pair; even += pair) {
+        int neighbours = image_load(even - dist) + image_load(even + dist);
+        image_store(even, image_load(even) + ((neighbours + 2) >> 2));
+    }
+}
diff --git
a/libavcodec/vulkan/vc2_dwt_upload.comp b/libavcodec/vulkan/vc2_dwt_upload.comp
new file mode 100644
index 0000000000..c758fd867f
--- /dev/null
+++ b/libavcodec/vulkan/vc2_dwt_upload.comp
@@ -0,0 +1,96 @@
+/*
+ * VC2 codec
+ *
+ * Copyright (c) 2025 raphaelthegreat <geoste...@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+/*
+ * Numeric pixel format identifiers compared against PLANE_FMT.
+ * NOTE(review): these are hard-coded numbers; presumably they mirror the
+ * host's AVPixelFormat values - confirm they stay in sync with pixfmt.h.
+ */
+#define AV_PIX_FMT_XV30 214
+#define AV_PIX_FMT_XV36 216
+#define AV_PIX_FMT_XV48 242
+#define AV_PIX_FMT_P212 222
+#define AV_PIX_FMT_P012 209
+#define AV_PIX_FMT_P210 198
+#define AV_PIX_FMT_P016 169
+#define AV_PIX_FMT_P010 158
+#define AV_PIX_FMT_NV16 101
+#define AV_PIX_FMT_NV12 23
+
+/* Destination plane indices in coef_buf */
+#define Y 0
+#define U 1
+#define V 2
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    int s;
+    int diff_offset;
+    int level;
+};
+
+/* Load the texel of source plane plane_idx at this invocation's position. */
+uvec4 load_plane(uint plane_idx)
+{
+    ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
+    return imageLoad(src_planes[plane_idx], coord);
+}
+
+/* Store value - diff_offset as a signed coefficient of plane plane_idx. */
+void store_plane(uint plane_idx, uint value)
+{
+    int result = int(value - diff_offset);
+    ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
+    imageStore(coef_buf[plane_idx], coord, ivec4(result));
+}
+
+/*
+ * Split one source texel into Y, U and V samples and store them, offset by
+ * diff_offset, into the three coefficient planes. PLANE_FMT selects the
+ * unpacking at compile time; any format without a branch of its own is
+ * treated as three separate single-component planes.
+ *
+ * NOTE(review): every plane is read and written at the same coordinate, so
+ * for subsampled chroma this presumably relies on out-of-range image
+ * accesses being harmless - confirm against the dispatch size.
+ */
+void main()
+{
+    uvec4 p0 = load_plane(0);
+#if PLANE_FMT == AV_PIX_FMT_XV30
+    /* Three 10-bit fields packed into one component */
+    store_plane(Y, (p0.x >> 10) & 0x3FF);
+    store_plane(U, p0.x & 0x3FF);
+    store_plane(V, (p0.x >> 20) & 0x3FF);
+#elif PLANE_FMT == AV_PIX_FMT_XV36
+    /* Packed U, Y, V components with the data in the high bits */
+    store_plane(Y, p0.y >> 4);
+    store_plane(U, p0.x >> 4);
+    store_plane(V, p0.z >> 4);
+#elif PLANE_FMT == AV_PIX_FMT_XV48
+    /* Same component order as XV36 with full 16-bit samples: no shift */
+    store_plane(Y, p0.y);
+    store_plane(U, p0.x);
+    store_plane(V, p0.z);
+#elif PLANE_FMT == AV_PIX_FMT_NV12
+    uvec4 p1 = load_plane(1);
+    /* NOTE(review): the NV16 branch uses p0.x alone for luma; confirm why
+     * a second component is merged in here. */
+    store_plane(Y, p0.x | p0.y << 8);
+    store_plane(U, p1.x);
+    store_plane(V, p1.y);
+#elif PLANE_FMT == AV_PIX_FMT_NV16 || PLANE_FMT == AV_PIX_FMT_P016
+    /* Luma plane plus one interleaved chroma plane; P016 has no padding
+     * bits, so it needs no shift and must not read a third plane. */
+    uvec4 p1 = load_plane(1);
+    store_plane(Y, p0.x);
+    store_plane(U, p1.x);
+    store_plane(V, p1.y);
+#elif PLANE_FMT == AV_PIX_FMT_P010 || PLANE_FMT == AV_PIX_FMT_P210
+    uvec4 p1 = load_plane(1);
+    store_plane(Y, p0.x >> 6);
+    store_plane(U, p1.x >> 6);
+    store_plane(V, p1.y >> 6);
+#elif PLANE_FMT == AV_PIX_FMT_P012 || PLANE_FMT == AV_PIX_FMT_P212
+    uvec4 p1 = load_plane(1);
+    store_plane(Y, p0.x >> 4);
+    store_plane(U, p1.x >> 4);
+    store_plane(V, p1.y >> 4);
+#else
+    /* Three separate planes, one component each */
+    store_plane(Y, p0.x);
+    store_plane(U, load_plane(1).x);
+    store_plane(V, load_plane(2).x);
+#endif
+}
diff --git a/libavcodec/vulkan/vc2_dwt_ver_legall.comp b/libavcodec/vulkan/vc2_dwt_ver_legall.comp
new file mode 100644
index 0000000000..ca391cc8d8
--- /dev/null
+++ b/libavcodec/vulkan/vc2_dwt_ver_legall.comp
@@ -0,0 +1,78 @@
+/*
+ * VC2 codec
+ *
+ * Copyright (c) 2025 raphaelthegreat <geoste...@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    int s;
+    int diff_offset;
+    int level;
+};
+
+/* Read row coord_y of the column this invocation works on. */
+int image_load(int coord_y)
+{
+    ivec2 pos = ivec2(int(gl_GlobalInvocationID.x), coord_y);
+    return imageLoad(coef_buf[gl_GlobalInvocationID.z], pos).x;
+}
+
+/* Write value to row coord_y of the column this invocation works on. */
+void image_store(int coord_y, int value)
+{
+    ivec2 pos = ivec2(int(gl_GlobalInvocationID.x), coord_y);
+    imageStore(coef_buf[gl_GlobalInvocationID.z], pos, ivec4(value));
+}
+
+/*
+ * Vertical LeGall (5,3) lifting pass for one column of one plane.
+ *
+ * gl_GlobalInvocationID.x selects the column and .z the plane. At the given
+ * level only every (1 << level)-th column and row is touched. The column is
+ * processed in place: odd samples are predicted from their even neighbours,
+ * then even samples are updated from the new odd ones. Unlike the
+ * horizontal pass, no precision bit is shifted in here.
+ */
+void main()
+{
+    uint plane_idx = gl_GlobalInvocationID.z;
+    int column = int(gl_GlobalInvocationID.x);
+    ivec2 work_area = imageSize(coef_buf[plane_idx]);
+    int height = work_area.y;
+    int dist = 1 << level;
+    int pair = dist << 1;
+
+    /* Columns past the image or off this level's grid have nothing to do. */
+    bool on_grid = (column & (dist - 1)) == 0;
+    if (column >= work_area.x || !on_grid)
+        return;
+
+    // Lifting stage 2: odd sample minus the rounded mean of its neighbours
+    for (int odd = dist; odd < height - pair; odd += pair) {
+        int neighbours = image_load(odd - dist) + image_load(odd + dist);
+        image_store(odd, image_load(odd) - ((neighbours + 1) >> 1));
+    }
+    // The last odd sample has no lower neighbour; the upper one counts twice
+    int last_odd = height - dist;
+    int edge = image_load(last_odd - dist);
+    image_store(last_odd, image_load(last_odd) - ((2 * edge + 1) >> 1));
+
+    // Lifting stage 1: even sample plus a rounded quarter of its neighbours
+    // Sample 0 has no upper neighbour; the lower one counts twice
+    edge = image_load(dist);
+    image_store(0, image_load(0) + ((2 * edge + 2) >> 2));
+    for (int even = pair; even <= height - pair; even += pair) {
+        int neighbours = image_load(even + dist) + image_load(even - dist);
+        image_store(even, image_load(even) + ((neighbours + 2) >> 2));
+    }
+}
diff --git a/libavcodec/vulkan/vc2_encode.comp b/libavcodec/vulkan/vc2_encode.comp
new file mode 100644
index 0000000000..50b6214494
--- /dev/null
+++ b/libavcodec/vulkan/vc2_encode.comp
@@ -0,0
+1,169 @@
+/*
+ * VC2 codec
+ *
+ * Copyright (c) 2025 raphaelthegreat <geoste...@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+
+#define MAX_DWT_LEVELS (5)
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    u8buf bytestream;
+    ivec2 num_slices;
+    int wavelet_depth;
+    int size_scaler;
+    int prefix_bytes;
+};
+
+/*
+ * Write val as an interleaved exp-Golomb code.
+ *
+ * Zero is a single '1' bit. Otherwise every bit of val + 1 below its top
+ * set bit is emitted preceded by a '0', and the code is closed with a '1',
+ * for a total of 2 * bits + 1 bits.
+ */
+void put_vc2_ue_uint(inout PutBitContext pb, uint val)
+{
+    int pbits = 0, topbit = 1, maxval = 1, bits = 0;
+    if (val == 0)
+    {
+        put_bits(pb, 1, 1);
+        return;
+    }
+    val++;
+
+    /* Find the number of bits below the top set bit of val */
+    while (val > maxval)
+    {
+        topbit <<= 1;
+        bits++;
+        maxval <<= 1;
+        maxval |= 1;
+    }
+
+    /* Interleave: each data bit takes two positions, the upper one left 0 */
+    for (int i = 0; i < bits; i++)
+    {
+        topbit >>= 1;
+        pbits <<= 2;
+        if ((val & topbit) != 0)
+            pbits |= 1;
+    }
+
+    put_bits(pb, bits * 2 + 1, (pbits << 1) | 1);
+}
+
+/* Per-invocation quantiser index of every subband of the current slice */
+int quants[MAX_DWT_LEVELS][4];
+
+/*
+ * Map a coordinate inside a subband of level lvl to a coordinate in the
+ * coefficient image: h (0 or 1) selects the high-pass half and the result
+ * is scaled up by the remaining wavelet_depth - lvl - 1 levels.
+ */
+int subband_coord(int index, int h, int lvl)
+{
+    int coord = index;
+    coord <<= 1;
+    coord |= h;
+    coord <<= (wavelet_depth-lvl-1);
+    return coord;
+}
+
+/*
+ * Quantise and entropy-code one slice per invocation into bytestream.
+ *
+ * The slice's quantiser index, byte budget and workgroup-relative start
+ * offset are read from slice_args. For each of the three planes a length
+ * byte is reserved, all subbands are coded, and the plane is padded to a
+ * multiple of size_scaler bytes; the last plane takes whatever is left of
+ * the slice budget.
+ *
+ * NOTE(review): prefix_bytes is part of the push constants but is never
+ * consumed here, while the slice size pass includes it in the budget -
+ * confirm that it is always 0 or skip those bytes before the quant index.
+ */
+void main()
+{
+    int slice_index = int(gl_GlobalInvocationID.x);
+    int max_index = num_slices.x * num_slices.y;
+    if (slice_index >= max_index)
+        return;
+
+    /* Step 2. Quantize and encode */
+    /* The stored offset is relative to this slice's workgroup; add the
+     * end offset (start + size) of the last slice of every earlier one. */
+    int pb_start = slice_args[slice_index].pb_start;
+    int workgroup_x = int(gl_WorkGroupSize.x);
+    for (int i = 0, index = workgroup_x - 1; i < gl_WorkGroupID.x; i++) {
+        pb_start += slice_args[index].pb_start + slice_args[index].bytes;
+        index += workgroup_x;
+    }
+    ivec2 slice_coord = ivec2(slice_index % num_slices.x, slice_index / num_slices.x);
+    int slice_bytes_max = slice_args[slice_index].bytes;
+    int quant_index = slice_args[slice_index].quant_idx;
+
+    PutBitContext pb;
+    init_put_bits(pb, OFFBUF(u8buf, bytestream, pb_start), slice_bytes_max);
+
+    /* Level 0 has four bands (o = 0..3), higher levels only o = 1..3 */
+    for (int level = 0; level < wavelet_depth; level++)
+        for (int orientation = int(level > 0); orientation < 4; orientation++)
+            quants[level][orientation] = max(quant_index - lut_quant[level][orientation], 0);
+
+    /* Write quant index for this slice */
+    put_bits(pb, 8, quant_index);
+
+    /* Luma + 2 Chroma planes */
+    for (int p = 0; p < 3; p++)
+    {
+        int pad_s, pad_c;
+        int bytes_start = int32_t(put_bytes_count(pb));
+
+        /* Save current location and write a zero value */
+        uint64_t write_ptr_start = pb.buf;
+        int bit_left_start = pb.bit_left;
+        put_bits(pb, 8, 0);
+
+        ivec2 dwt_dim = imageSize(coef_buf[p]);
+        for (int level = 0; level < wavelet_depth; level++)
+        {
+            ivec2 band_size = dwt_dim >> (wavelet_depth - level);
+            for (int o = int(level > 0); o < 4; o++)
+            {
+                /* Encode subband */
+                int left = band_size.x * (slice_coord.x) / num_slices.x;
+                int right = band_size.x * (slice_coord.x+1) / num_slices.x;
+                int top = band_size.y * (slice_coord.y) / num_slices.y;
+                int bottom = band_size.y * (slice_coord.y+1) / num_slices.y;
+
+                const int q_idx = quants[level][o];
+                const int qfactor = ff_dirac_qscale_tab[q_idx];
+
+                const int yh = o >> 1;
+                const int xh = o & 1;
+
+                for (int y = top; y < bottom; y++)
+                {
+                    for (int x = left; x < right; x++)
+                    {
+                        int sx = subband_coord(x, xh, level);
+                        int sy = subband_coord(y, yh, level);
+                        int coef = imageLoad(coef_buf[p], ivec2(sx, sy)).x;
+                        /* Magnitude is scaled by 4 and divided by the scale
+                         * factor; a sign bit follows non-zero values only. */
+                        uint c_abs = uint(abs(coef));
+                        c_abs = (c_abs << 2) / qfactor;
+                        put_vc2_ue_uint(pb, c_abs);
+                        if (c_abs != 0)
+                            put_bits(pb, 1, int(coef < 0));
+                    }
+                }
+            }
+        }
+        flush_put_bits(pb);
+        /* Bytes coded for this plane, not counting its length byte */
+        int bytes_len = int32_t(put_bytes_count(pb)) - bytes_start - 1;
+        if (p == 2)
+        {
+            /* The last plane absorbs the rest of the slice budget */
+            int len_diff = slice_bytes_max - int32_t(put_bytes_count(pb));
+            pad_s = align((bytes_len + len_diff), size_scaler)/size_scaler;
+            pad_c = (pad_s*size_scaler) - bytes_len;
+        }
+        else
+        {
+            pad_s = align(bytes_len, size_scaler)/size_scaler;
+            pad_c = (pad_s*size_scaler) - bytes_len;
+        }
+        /* Patch the reserved length byte with the size in size_scaler units */
+        uint64_t start_ptr = write_ptr_start + ((BUF_BITS - bit_left_start) >> 3);
+        u8buf(start_ptr).v = uint8_t(pad_s);
+        /* vc2-reference uses that padding that decodes to '0' coeffs */
+        skip_put_bytes(pb, pad_c);
+    }
+}
diff --git a/libavcodec/vulkan/vc2_slice_sizes.comp b/libavcodec/vulkan/vc2_slice_sizes.comp
new file mode 100644
index 0000000000..61070c1dc2
--- /dev/null
+++ b/libavcodec/vulkan/vc2_slice_sizes.comp
@@ -0,0 +1,170 @@
+/*
+ * VC2 codec
+ *
+ * Copyright (c) 2025 raphaelthegreat <geoste...@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+
+#define DIRAC_MAX_QUANT_INDEX 116
+#define MAX_DWT_LEVELS 5
+
+layout(push_constant, scalar) uniform ComputeInfo {
+    ivec2 num_slices;
+    int wavelet_depth;
+    int size_scaler;
+    int prefix_bytes;
+    int bits_ceil;
+    int bits_floor;
+};
+
+/* Length in bits of the interleaved exp-Golomb code for val */
+int count_vc2_ue_uint(uint val)
+{
+    return 2 * findMSB(val + 1) + 1;
+}
+
+/* Per-invocation memo of count_hq_slice() results; 0 means "not computed" */
+int cache[DIRAC_MAX_QUANT_INDEX];
+/* Per-invocation quantiser index of every subband of the current slice */
+int quants[MAX_DWT_LEVELS][4];
+/* Byte size of every slice handled by this workgroup */
+shared int slice_sizes[gl_WorkGroupSize.x];
+
+/*
+ * Map a coordinate inside a subband of level lvl to a coordinate in the
+ * coefficient image: h (0 or 1) selects the high-pass half and the result
+ * is scaled up by the remaining wavelet_depth - lvl - 1 levels.
+ */
+int subband_coord(int index, int h, int lvl)
+{
+    int coord = index;
+    coord <<= 1;
+    coord |= h;
+    coord <<= (wavelet_depth-lvl-1);
+    return coord;
+}
+
+/*
+ * Return the size in bits of this invocation's slice when coded with
+ * quant_index, without writing anything. The count covers the prefix bytes,
+ * the quantiser index byte and, per plane, the length byte, the coded
+ * coefficients and the padding up to a multiple of size_scaler bytes.
+ * Results are memoised in cache.
+ */
+int count_hq_slice(int quant_index)
+{
+    int bits = 0;
+    if (cache[quant_index] != 0)
+        return cache[quant_index];
+
+    bits += 8*prefix_bytes;
+    bits += 8; /* quant_idx */
+
+    for (int level = 0; level < wavelet_depth; level++)
+        for (int orientation = int(level > 0); orientation < 4; orientation++)
+            quants[level][orientation] = max(quant_index - lut_quant[level][orientation], 0);
+
+    int slice_index = int(gl_GlobalInvocationID.x);
+    ivec2 slice_coord = ivec2(slice_index % num_slices.x, slice_index / num_slices.x);
+    for (int p = 0; p < 3; p++)
+    {
+        int bytes_start = bits >> 3;
+        bits += 8; /* per-plane length byte */
+
+        ivec2 dwt_dim = imageSize(coef_buf[p]);
+        for (int level = 0; level < wavelet_depth; level++)
+        {
+            ivec2 band_dim = dwt_dim >> (wavelet_depth - level);
+            for (int o = int(level > 0); o < 4; o++)
+            {
+                const int left = band_dim.x * slice_coord.x / num_slices.x;
+                const int right = band_dim.x * (slice_coord.x+1) / num_slices.x;
+                const int top = band_dim.y * slice_coord.y / num_slices.y;
+                const int bottom = band_dim.y * (slice_coord.y+1) / num_slices.y;
+
+                const int q_idx = quants[level][o];
+                const int qfactor = ff_dirac_qscale_tab[q_idx];
+
+                const int yh = o >> 1;
+                const int xh = o & 1;
+
+                for (int y = top; y < bottom; y++)
+                {
+                    for (int x = left; x < right; x++)
+                    {
+                        int sx = subband_coord(x, xh, level);
+                        int sy = subband_coord(y, yh, level);
+                        int coef = imageLoad(coef_buf[p], ivec2(sx, sy)).x;
+                        uint c_abs = uint(abs(coef));
+                        c_abs = (c_abs << 2) / qfactor;
+                        bits += count_vc2_ue_uint(c_abs);
+                        bits += int(c_abs > 0); /* sign bit */
+                    }
+                }
+            }
+        }
+        /* Round up to a whole byte, then pad the plane to size_scaler */
+        bits += align(bits, 8) - bits;
+        int bytes_len = (bits >> 3) - bytes_start - 1;
+        int pad_s = align(bytes_len, size_scaler) / size_scaler;
+        int pad_c = (pad_s * size_scaler) - bytes_len;
+        bits += pad_c * 8;
+    }
+
+    cache[quant_index] = bits;
+    return bits;
+}
+
+/* Round a byte count up to size_scaler and add 4 + prefix_bytes */
+int ssize_round(int b)
+{
+    return align(b, size_scaler) + 4 + prefix_bytes;
+}
+
+/*
+ * Pick a quantiser index for one slice per invocation so that its coded
+ * size falls between bits_floor and bits_ceil, then publish the slice's
+ * byte size and its start offset relative to the workgroup.
+ *
+ * The search starts from the index stored in slice_args and moves by step
+ * (halved each round, never below 1) until the size is in range or the
+ * index starts to oscillate, in which case the larger of the two
+ * candidates is kept.
+ *
+ * barrier() must be reached by every invocation of the workgroup, so
+ * invocations beyond the last slice are not allowed to return before it;
+ * they contribute a size of 0 and leave afterwards.
+ */
+void main()
+{
+    int slice_index = int(gl_GlobalInvocationID.x);
+    int max_index = num_slices.x * num_slices.y;
+    bool in_range = slice_index < max_index;
+    int bytes = 0;
+
+    if (in_range)
+    {
+        for (int i = 0; i < DIRAC_MAX_QUANT_INDEX; i++)
+            cache[i] = 0;
+
+        const int q_ceil = DIRAC_MAX_QUANT_INDEX;
+        const int top = bits_ceil;
+        const int bottom = bits_floor;
+        int quant_buf[2] = int[2](-1, -1);
+        int quant = slice_args[slice_index].quant_idx;
+        int step = 1;
+        int bits_last = 0;
+        int bits = count_hq_slice(quant);
+        while ((bits > top) || (bits < bottom))
+        {
+            const int signed_step = bits > top ? +step : -step;
+            quant = clamp(quant + signed_step, 0, q_ceil-1);
+            bits = count_hq_slice(quant);
+            /* Same index as two rounds ago: the search is oscillating */
+            if (quant_buf[1] == quant)
+            {
+                quant = max(quant_buf[0], quant);
+                bits = quant == quant_buf[0] ? bits_last : bits;
+                break;
+            }
+            step = clamp(step / 2, 1, (q_ceil - 1) / 2);
+            quant_buf[1] = quant_buf[0];
+            quant_buf[0] = quant;
+            bits_last = bits;
+        }
+        bytes = ssize_round(bits >> 3);
+        slice_args[slice_index].quant_idx = clamp(quant, 0, q_ceil-1);
+        slice_args[slice_index].bytes = bytes;
+    }
+
+    slice_sizes[gl_LocalInvocationIndex] = bytes;
+    barrier();
+
+    if (!in_range)
+        return;
+
+    /* Prefix sum for all slices in current workgroup */
+    int total_bytes = 0;
+    for (int i = 0; i < gl_LocalInvocationIndex; i++)
+        total_bytes += slice_sizes[i];
+    slice_args[slice_index].pb_start = total_bytes;
+}
--
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".