Added the proper dependencies to configure and separated the Vulkan encoder from the CPU one. It should now build with ./configure --enable-encoder=vc2_vulkan --enable-libglslang
Στις Κυρ 27 Οκτ 2024 στις 4:28 μ.μ., ο/η IndecisiveTurtle < geoste...@gmail.com> έγραψε: > Implements a Vulkan-based Dirac encoder. Supports Haar and LeGall wavelets > and should work with all wavelet depths. > > Performance-wise, encoding a 1080p 1-minute video takes about > 2.5 minutes with the CPU encoder running on my Ryzen 5 4600H, while it > takes about 30 seconds on my NVIDIA GTX 1650. > > The Haar shader has a subgroup-optimized variant that applies when the configured > wavelet depth allows it. > > lavapipe seems to be bugged for some reason; after a bunch of debugging > I'm not quite sure if it's a bug here or in lavapipe. But people probably > don't want to use this with a software implementation anyway. > --- > configure | 1 + > libavcodec/Makefile | 5 +- > libavcodec/allcodecs.c | 1 + > libavcodec/vc2enc.c | 501 +-------------- > libavcodec/vc2enc_common.c | 368 +++++++++++ > libavcodec/vc2enc_common.h | 279 ++++++++ > libavcodec/vc2enc_vulkan.c | 781 +++++++++++++++++++++++ > libavcodec/vulkan/dwt_haar.comp | 76 +++ > libavcodec/vulkan/dwt_haar_subgroup.comp | 94 +++ > libavcodec/vulkan/dwt_hor_legall.comp | 61 ++ > libavcodec/vulkan/dwt_legall.comp | 74 +++ > libavcodec/vulkan/dwt_upload.comp | 45 ++ > libavcodec/vulkan/dwt_ver_legall.comp | 55 ++ > libavcodec/vulkan/encode.comp | 256 ++++++++ > libavcodec/vulkan/slice_sizes.comp | 184 ++++++ > 15 files changed, 2280 insertions(+), 501 deletions(-) > create mode 100644 libavcodec/vc2enc_common.c > create mode 100644 libavcodec/vc2enc_common.h > create mode 100644 libavcodec/vc2enc_vulkan.c > create mode 100644 libavcodec/vulkan/dwt_haar.comp > create mode 100644 libavcodec/vulkan/dwt_haar_subgroup.comp > create mode 100644 libavcodec/vulkan/dwt_hor_legall.comp > create mode 100644 libavcodec/vulkan/dwt_legall.comp > create mode 100644 libavcodec/vulkan/dwt_upload.comp > create mode 100644 libavcodec/vulkan/dwt_ver_legall.comp > create mode 100644 libavcodec/vulkan/encode.comp > create mode 100644 
libavcodec/vulkan/slice_sizes.comp > > diff --git a/configure b/configure > index 9f508a2527..23156da53d 100755 > --- a/configure > +++ b/configure > @@ -3108,6 +3108,7 @@ utvideo_encoder_select="bswapdsp huffman llvidencdsp" > vble_decoder_select="llviddsp" > vbn_decoder_select="texturedsp" > vbn_encoder_select="texturedspenc" > +vc2_vulkan_encoder_select="vulkan spirv_compiler" > vmix_decoder_select="idctdsp" > vc1_decoder_select="blockdsp h264qpel intrax8 mpegvideodec qpeldsp vc1dsp" > vc1image_decoder_select="vc1_decoder" > diff --git a/libavcodec/Makefile b/libavcodec/Makefile > index dd5d0de898..7778b99adb 100644 > --- a/libavcodec/Makefile > +++ b/libavcodec/Makefile > @@ -765,7 +765,10 @@ OBJS-$(CONFIG_VC1_CUVID_DECODER) += cuviddec.o > OBJS-$(CONFIG_VC1_MMAL_DECODER) += mmaldec.o > OBJS-$(CONFIG_VC1_QSV_DECODER) += qsvdec.o > OBJS-$(CONFIG_VC1_V4L2M2M_DECODER) += v4l2_m2m_dec.o > -OBJS-$(CONFIG_VC2_ENCODER) += vc2enc.o vc2enc_dwt.o diractab.o > +OBJS-$(CONFIG_VC2_ENCODER) += vc2enc.o vc2enc_dwt.o > vc2enc_common.o diractab.o > +OBJS-$(CONFIG_VC2_VULKAN_ENCODER) += vc2enc_vulkan.o vulkan/encode.o > vulkan/slice_sizes.o \ > + vulkan/dwt_hor_legall.o > vulkan/dwt_ver_legall.o \ > + vulkan/dwt_upload.o > vulkan/dwt_haar.o vulkan/dwt_haar_subgroup.o > OBJS-$(CONFIG_VCR1_DECODER) += vcr1.o > OBJS-$(CONFIG_VMDAUDIO_DECODER) += vmdaudio.o > OBJS-$(CONFIG_VMDVIDEO_DECODER) += vmdvideo.o > diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c > index c7e5f9910c..1db44b220f 100644 > --- a/libavcodec/allcodecs.c > +++ b/libavcodec/allcodecs.c > @@ -362,6 +362,7 @@ extern const FFCodec ff_vc1image_decoder; > extern const FFCodec ff_vc1_mmal_decoder; > extern const FFCodec ff_vc1_qsv_decoder; > extern const FFCodec ff_vc1_v4l2m2m_decoder; > +extern const FFCodec ff_vc2_vulkan_encoder; > extern const FFCodec ff_vc2_encoder; > extern const FFCodec ff_vcr1_decoder; > extern const FFCodec ff_vmdvideo_decoder; > diff --git a/libavcodec/vc2enc.c b/libavcodec/vc2enc.c > 
index b82370a753..54fcbe5f83 100644 > --- a/libavcodec/vc2enc.c > +++ b/libavcodec/vc2enc.c > @@ -29,506 +29,7 @@ > #include "put_bits.h" > #include "version.h" > > -#include "vc2enc_dwt.h" > -#include "diractab.h" > - > -/* The limited size resolution of each slice forces us to do this */ > -#define SSIZE_ROUND(b) (FFALIGN((b), s->size_scaler) + 4 + > s->prefix_bytes) > - > -/* Decides the cutoff point in # of slices to distribute the leftover > bytes */ > -#define SLICE_REDIST_TOTAL 150 > - > -typedef struct VC2BaseVideoFormat { > - enum AVPixelFormat pix_fmt; > - AVRational time_base; > - int width, height; > - uint8_t interlaced, level; > - char name[13]; > -} VC2BaseVideoFormat; > - > -static const VC2BaseVideoFormat base_video_fmts[] = { > - { 0 }, /* Custom format, here just to make indexing equal to base_vf > */ > - { AV_PIX_FMT_YUV420P, { 1001, 15000 }, 176, 120, 0, 1, > "QSIF525" }, > - { AV_PIX_FMT_YUV420P, { 2, 25 }, 176, 144, 0, 1, > "QCIF" }, > - { AV_PIX_FMT_YUV420P, { 1001, 15000 }, 352, 240, 0, 1, > "SIF525" }, > - { AV_PIX_FMT_YUV420P, { 2, 25 }, 352, 288, 0, 1, "CIF" > }, > - { AV_PIX_FMT_YUV420P, { 1001, 15000 }, 704, 480, 0, 1, > "4SIF525" }, > - { AV_PIX_FMT_YUV420P, { 2, 25 }, 704, 576, 0, 1, > "4CIF" }, > - > - { AV_PIX_FMT_YUV422P10, { 1001, 30000 }, 720, 480, 1, 2, > "SD480I-60" }, > - { AV_PIX_FMT_YUV422P10, { 1, 25 }, 720, 576, 1, 2, > "SD576I-50" }, > - > - { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1280, 720, 0, 3, > "HD720P-60" }, > - { AV_PIX_FMT_YUV422P10, { 1, 50 }, 1280, 720, 0, 3, > "HD720P-50" }, > - { AV_PIX_FMT_YUV422P10, { 1001, 30000 }, 1920, 1080, 1, 3, > "HD1080I-60" }, > - { AV_PIX_FMT_YUV422P10, { 1, 25 }, 1920, 1080, 1, 3, > "HD1080I-50" }, > - { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1920, 1080, 0, 3, > "HD1080P-60" }, > - { AV_PIX_FMT_YUV422P10, { 1, 50 }, 1920, 1080, 0, 3, > "HD1080P-50" }, > - > - { AV_PIX_FMT_YUV444P12, { 1, 24 }, 2048, 1080, 0, 4, > "DC2K" }, > - { AV_PIX_FMT_YUV444P12, { 1, 24 }, 4096, 2160, 0, 5, 
> "DC4K" }, > - > - { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 3840, 2160, 0, 6, "UHDTV > 4K-60" }, > - { AV_PIX_FMT_YUV422P10, { 1, 50 }, 3840, 2160, 0, 6, "UHDTV > 4K-50" }, > - > - { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 7680, 4320, 0, 7, "UHDTV > 8K-60" }, > - { AV_PIX_FMT_YUV422P10, { 1, 50 }, 7680, 4320, 0, 7, "UHDTV > 8K-50" }, > - > - { AV_PIX_FMT_YUV422P10, { 1001, 24000 }, 1920, 1080, 0, 3, > "HD1080P-24" }, > - { AV_PIX_FMT_YUV422P10, { 1001, 30000 }, 720, 486, 1, 2, "SD > Pro486" }, > -}; > -static const int base_video_fmts_len = FF_ARRAY_ELEMS(base_video_fmts); > - > -enum VC2_QM { > - VC2_QM_DEF = 0, > - VC2_QM_COL, > - VC2_QM_FLAT, > - > - VC2_QM_NB > -}; > - > -typedef struct SubBand { > - dwtcoef *buf; > - ptrdiff_t stride; > - int width; > - int height; > -} SubBand; > - > -typedef struct Plane { > - SubBand band[MAX_DWT_LEVELS][4]; > - dwtcoef *coef_buf; > - int width; > - int height; > - int dwt_width; > - int dwt_height; > - ptrdiff_t coef_stride; > -} Plane; > - > -typedef struct SliceArgs { > - const struct VC2EncContext *ctx; > - union { > - int cache[DIRAC_MAX_QUANT_INDEX]; > - uint8_t *buf; > - }; > - int x; > - int y; > - int quant_idx; > - int bits_ceil; > - int bits_floor; > - int bytes; > -} SliceArgs; > - > -typedef struct TransformArgs { > - const struct VC2EncContext *ctx; > - Plane *plane; > - const void *idata; > - ptrdiff_t istride; > - int field; > - VC2TransformContext t; > -} TransformArgs; > - > -typedef struct VC2EncContext { > - AVClass *av_class; > - PutBitContext pb; > - Plane plane[3]; > - AVCodecContext *avctx; > - DiracVersionInfo ver; > - > - SliceArgs *slice_args; > - TransformArgs transform_args[3]; > - > - /* For conversion from unsigned pixel values to signed */ > - int diff_offset; > - int bpp; > - int bpp_idx; > - > - /* Picture number */ > - uint32_t picture_number; > - > - /* Base video format */ > - int base_vf; > - int level; > - int profile; > - > - /* Quantization matrix */ > - uint8_t 
quant[MAX_DWT_LEVELS][4]; > - int custom_quant_matrix; > - > - /* Division LUT */ > - uint32_t qmagic_lut[116][2]; > - > - int num_x; /* #slices horizontally */ > - int num_y; /* #slices vertically */ > - int prefix_bytes; > - int size_scaler; > - int chroma_x_shift; > - int chroma_y_shift; > - > - /* Rate control stuff */ > - int frame_max_bytes; > - int slice_max_bytes; > - int slice_min_bytes; > - int q_ceil; > - int q_avg; > - > - /* Options */ > - double tolerance; > - int wavelet_idx; > - int wavelet_depth; > - int strict_compliance; > - int slice_height; > - int slice_width; > - int interlaced; > - enum VC2_QM quant_matrix; > - > - /* Parse code state */ > - uint32_t next_parse_offset; > - enum DiracParseCodes last_parse_code; > -} VC2EncContext; > - > -static av_always_inline void put_vc2_ue_uint(PutBitContext *pb, uint32_t > val) > -{ > - int i; > - int bits = 0; > - unsigned topbit = 1, maxval = 1; > - uint64_t pbits = 0; > - > - if (!val++) { > - put_bits(pb, 1, 1); > - return; > - } > - > - while (val > maxval) { > - topbit <<= 1; > - maxval <<= 1; > - maxval |= 1; > - } > - > - bits = ff_log2(topbit); > - > - for (i = 0; i < bits; i++) { > - topbit >>= 1; > - av_assert2(pbits <= UINT64_MAX>>3); > - pbits <<= 2; > - if (val & topbit) > - pbits |= 0x1; > - } > - > - put_bits64(pb, bits*2 + 1, (pbits << 1) | 1); > -} > - > -static av_always_inline int count_vc2_ue_uint(uint32_t val) > -{ > - int topbit = 1, maxval = 1; > - > - if (!val++) > - return 1; > - > - while (val > maxval) { > - topbit <<= 1; > - maxval <<= 1; > - maxval |= 1; > - } > - > - return ff_log2(topbit)*2 + 1; > -} > - > -/* VC-2 10.4 - parse_info() */ > -static void encode_parse_info(VC2EncContext *s, enum DiracParseCodes > pcode) > -{ > - uint32_t cur_pos, dist; > - > - align_put_bits(&s->pb); > - > - cur_pos = put_bytes_count(&s->pb, 0); > - > - /* Magic string */ > - ff_put_string(&s->pb, "BBCD", 0); > - > - /* Parse code */ > - put_bits(&s->pb, 8, pcode); > - > - /* Next parse 
offset */ > - dist = cur_pos - s->next_parse_offset; > - AV_WB32(s->pb.buf + s->next_parse_offset + 5, dist); > - s->next_parse_offset = cur_pos; > - put_bits32(&s->pb, pcode == DIRAC_PCODE_END_SEQ ? 13 : 0); > - > - /* Last parse offset */ > - put_bits32(&s->pb, s->last_parse_code == DIRAC_PCODE_END_SEQ ? 13 : > dist); > - > - s->last_parse_code = pcode; > -} > - > -/* VC-2 11.1 - parse_parameters() > - * The level dictates what the decoder should expect in terms of > resolution > - * and allows it to quickly reject whatever it can't support. Remember, > - * this codec kinda targets cheapo FPGAs without much memory. > Unfortunately > - * it also limits us greatly in our choice of formats, hence the flag to > disable > - * strict_compliance */ > -static void encode_parse_params(VC2EncContext *s) > -{ > - put_vc2_ue_uint(&s->pb, s->ver.major); /* VC-2 demands this to be 2 */ > - put_vc2_ue_uint(&s->pb, s->ver.minor); /* ^^ and this to be 0 */ > - put_vc2_ue_uint(&s->pb, s->profile); /* 3 to signal HQ profile */ > - put_vc2_ue_uint(&s->pb, s->level); /* 3 - 1080/720, 6 - 4K */ > -} > - > -/* VC-2 11.3 - frame_size() */ > -static void encode_frame_size(VC2EncContext *s) > -{ > - put_bits(&s->pb, 1, !s->strict_compliance); > - if (!s->strict_compliance) { > - AVCodecContext *avctx = s->avctx; > - put_vc2_ue_uint(&s->pb, avctx->width); > - put_vc2_ue_uint(&s->pb, avctx->height); > - } > -} > - > -/* VC-2 11.3.3 - color_diff_sampling_format() */ > -static void encode_sample_fmt(VC2EncContext *s) > -{ > - put_bits(&s->pb, 1, !s->strict_compliance); > - if (!s->strict_compliance) { > - int idx; > - if (s->chroma_x_shift == 1 && s->chroma_y_shift == 0) > - idx = 1; /* 422 */ > - else if (s->chroma_x_shift == 1 && s->chroma_y_shift == 1) > - idx = 2; /* 420 */ > - else > - idx = 0; /* 444 */ > - put_vc2_ue_uint(&s->pb, idx); > - } > -} > - > -/* VC-2 11.3.4 - scan_format() */ > -static void encode_scan_format(VC2EncContext *s) > -{ > - put_bits(&s->pb, 1, 
!s->strict_compliance); > - if (!s->strict_compliance) > - put_vc2_ue_uint(&s->pb, s->interlaced); > -} > - > -/* VC-2 11.3.5 - frame_rate() */ > -static void encode_frame_rate(VC2EncContext *s) > -{ > - put_bits(&s->pb, 1, !s->strict_compliance); > - if (!s->strict_compliance) { > - AVCodecContext *avctx = s->avctx; > - put_vc2_ue_uint(&s->pb, 0); > - put_vc2_ue_uint(&s->pb, avctx->time_base.den); > - put_vc2_ue_uint(&s->pb, avctx->time_base.num); > - } > -} > - > -/* VC-2 11.3.6 - aspect_ratio() */ > -static void encode_aspect_ratio(VC2EncContext *s) > -{ > - put_bits(&s->pb, 1, !s->strict_compliance); > - if (!s->strict_compliance) { > - AVCodecContext *avctx = s->avctx; > - put_vc2_ue_uint(&s->pb, 0); > - put_vc2_ue_uint(&s->pb, avctx->sample_aspect_ratio.num); > - put_vc2_ue_uint(&s->pb, avctx->sample_aspect_ratio.den); > - } > -} > - > -/* VC-2 11.3.7 - clean_area() */ > -static void encode_clean_area(VC2EncContext *s) > -{ > - put_bits(&s->pb, 1, 0); > -} > - > -/* VC-2 11.3.8 - signal_range() */ > -static void encode_signal_range(VC2EncContext *s) > -{ > - put_bits(&s->pb, 1, !s->strict_compliance); > - if (!s->strict_compliance) > - put_vc2_ue_uint(&s->pb, s->bpp_idx); > -} > - > -/* VC-2 11.3.9 - color_spec() */ > -static void encode_color_spec(VC2EncContext *s) > -{ > - AVCodecContext *avctx = s->avctx; > - put_bits(&s->pb, 1, !s->strict_compliance); > - if (!s->strict_compliance) { > - int val; > - put_vc2_ue_uint(&s->pb, 0); > - > - /* primaries */ > - put_bits(&s->pb, 1, 1); > - if (avctx->color_primaries == AVCOL_PRI_BT470BG) > - val = 2; > - else if (avctx->color_primaries == AVCOL_PRI_SMPTE170M) > - val = 1; > - else if (avctx->color_primaries == AVCOL_PRI_SMPTE240M) > - val = 1; > - else > - val = 0; > - put_vc2_ue_uint(&s->pb, val); > - > - /* color matrix */ > - put_bits(&s->pb, 1, 1); > - if (avctx->colorspace == AVCOL_SPC_RGB) > - val = 3; > - else if (avctx->colorspace == AVCOL_SPC_YCOCG) > - val = 2; > - else if (avctx->colorspace == 
AVCOL_SPC_BT470BG) > - val = 1; > - else > - val = 0; > - put_vc2_ue_uint(&s->pb, val); > - > - /* transfer function */ > - put_bits(&s->pb, 1, 1); > - if (avctx->color_trc == AVCOL_TRC_LINEAR) > - val = 2; > - else if (avctx->color_trc == AVCOL_TRC_BT1361_ECG) > - val = 1; > - else > - val = 0; > - put_vc2_ue_uint(&s->pb, val); > - } > -} > - > -/* VC-2 11.3 - source_parameters() */ > -static void encode_source_params(VC2EncContext *s) > -{ > - encode_frame_size(s); > - encode_sample_fmt(s); > - encode_scan_format(s); > - encode_frame_rate(s); > - encode_aspect_ratio(s); > - encode_clean_area(s); > - encode_signal_range(s); > - encode_color_spec(s); > -} > - > -/* VC-2 11 - sequence_header() */ > -static void encode_seq_header(VC2EncContext *s) > -{ > - align_put_bits(&s->pb); > - encode_parse_params(s); > - put_vc2_ue_uint(&s->pb, s->base_vf); > - encode_source_params(s); > - put_vc2_ue_uint(&s->pb, s->interlaced); /* Frames or fields coding */ > -} > - > -/* VC-2 12.1 - picture_header() */ > -static void encode_picture_header(VC2EncContext *s) > -{ > - align_put_bits(&s->pb); > - put_bits32(&s->pb, s->picture_number++); > -} > - > -/* VC-2 12.3.4.1 - slice_parameters() */ > -static void encode_slice_params(VC2EncContext *s) > -{ > - put_vc2_ue_uint(&s->pb, s->num_x); > - put_vc2_ue_uint(&s->pb, s->num_y); > - put_vc2_ue_uint(&s->pb, s->prefix_bytes); > - put_vc2_ue_uint(&s->pb, s->size_scaler); > -} > - > -/* 1st idx = LL, second - vertical, third - horizontal, fourth - total */ > -static const uint8_t vc2_qm_col_tab[][4] = { > - {20, 9, 15, 4}, > - { 0, 6, 6, 4}, > - { 0, 3, 3, 5}, > - { 0, 3, 5, 1}, > - { 0, 11, 10, 11} > -}; > - > -static const uint8_t vc2_qm_flat_tab[][4] = { > - { 0, 0, 0, 0}, > - { 0, 0, 0, 0}, > - { 0, 0, 0, 0}, > - { 0, 0, 0, 0}, > - { 0, 0, 0, 0} > -}; > - > -static void init_quant_matrix(VC2EncContext *s) > -{ > - int level, orientation; > - > - if (s->wavelet_depth <= 4 && s->quant_matrix == VC2_QM_DEF) { > - s->custom_quant_matrix = 
0; > - for (level = 0; level < s->wavelet_depth; level++) { > - s->quant[level][0] = > ff_dirac_default_qmat[s->wavelet_idx][level][0]; > - s->quant[level][1] = > ff_dirac_default_qmat[s->wavelet_idx][level][1]; > - s->quant[level][2] = > ff_dirac_default_qmat[s->wavelet_idx][level][2]; > - s->quant[level][3] = > ff_dirac_default_qmat[s->wavelet_idx][level][3]; > - } > - return; > - } > - > - s->custom_quant_matrix = 1; > - > - if (s->quant_matrix == VC2_QM_DEF) { > - for (level = 0; level < s->wavelet_depth; level++) { > - for (orientation = 0; orientation < 4; orientation++) { > - if (level <= 3) > - s->quant[level][orientation] = > ff_dirac_default_qmat[s->wavelet_idx][level][orientation]; > - else > - s->quant[level][orientation] = > vc2_qm_col_tab[level][orientation]; > - } > - } > - } else if (s->quant_matrix == VC2_QM_COL) { > - for (level = 0; level < s->wavelet_depth; level++) { > - for (orientation = 0; orientation < 4; orientation++) { > - s->quant[level][orientation] = > vc2_qm_col_tab[level][orientation]; > - } > - } > - } else { > - for (level = 0; level < s->wavelet_depth; level++) { > - for (orientation = 0; orientation < 4; orientation++) { > - s->quant[level][orientation] = > vc2_qm_flat_tab[level][orientation]; > - } > - } > - } > -} > - > -/* VC-2 12.3.4.2 - quant_matrix() */ > -static void encode_quant_matrix(VC2EncContext *s) > -{ > - int level; > - put_bits(&s->pb, 1, s->custom_quant_matrix); > - if (s->custom_quant_matrix) { > - put_vc2_ue_uint(&s->pb, s->quant[0][0]); > - for (level = 0; level < s->wavelet_depth; level++) { > - put_vc2_ue_uint(&s->pb, s->quant[level][1]); > - put_vc2_ue_uint(&s->pb, s->quant[level][2]); > - put_vc2_ue_uint(&s->pb, s->quant[level][3]); > - } > - } > -} > - > -/* VC-2 12.3 - transform_parameters() */ > -static void encode_transform_params(VC2EncContext *s) > -{ > - put_vc2_ue_uint(&s->pb, s->wavelet_idx); > - put_vc2_ue_uint(&s->pb, s->wavelet_depth); > - > - encode_slice_params(s); > - 
encode_quant_matrix(s); > -} > - > -/* VC-2 12.2 - wavelet_transform() */ > -static void encode_wavelet_transform(VC2EncContext *s) > -{ > - encode_transform_params(s); > - align_put_bits(&s->pb); > -} > - > -/* VC-2 12 - picture_parse() */ > -static void encode_picture_start(VC2EncContext *s) > -{ > - align_put_bits(&s->pb); > - encode_picture_header(s); > - align_put_bits(&s->pb); > - encode_wavelet_transform(s); > -} > +#include "vc2enc_common.h" > > #define QUANT(c, mul, add, shift) (((mul) * (c) + (add)) >> (shift)) > > diff --git a/libavcodec/vc2enc_common.c b/libavcodec/vc2enc_common.c > new file mode 100644 > index 0000000000..3cc59c4b62 > --- /dev/null > +++ b/libavcodec/vc2enc_common.c > @@ -0,0 +1,368 @@ > +/* > + * Copyright (C) 2016 Open Broadcast Systems Ltd. > + * Author 2016 Rostislav Pehlivanov <atomnu...@gmail.com> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "vc2enc_common.h" > + > +void put_vc2_ue_uint(PutBitContext *pb, uint32_t val) > +{ > + int i; > + int bits = 0; > + unsigned topbit = 1, maxval = 1; > + uint64_t pbits = 0; > + > + if (!val++) { > + put_bits(pb, 1, 1); > + return; > + } > + > + while (val > maxval) { > + topbit <<= 1; > + maxval <<= 1; > + maxval |= 1; > + } > + > + bits = ff_log2(topbit); > + > + for (i = 0; i < bits; i++) { > + topbit >>= 1; > + av_assert2(pbits <= UINT64_MAX>>3); > + pbits <<= 2; > + if (val & topbit) > + pbits |= 0x1; > + } > + > + put_bits64(pb, bits*2 + 1, (pbits << 1) | 1); > +} > + > +int count_vc2_ue_uint(uint32_t val) > +{ > + int topbit = 1, maxval = 1; > + > + if (!val++) > + return 1; > + > + while (val > maxval) { > + topbit <<= 1; > + maxval <<= 1; > + maxval |= 1; > + } > + > + return ff_log2(topbit)*2 + 1; > +} > + > +/* VC-2 10.4 - parse_info() */ > +void encode_parse_info(VC2EncContext *s, enum DiracParseCodes pcode) > +{ > + uint32_t cur_pos, dist; > + > + align_put_bits(&s->pb); > + > + cur_pos = put_bytes_count(&s->pb, 0); > + > + /* Magic string */ > + ff_put_string(&s->pb, "BBCD", 0); > + > + /* Parse code */ > + put_bits(&s->pb, 8, pcode); > + > + /* Next parse offset */ > + dist = cur_pos - s->next_parse_offset; > + AV_WB32(s->pb.buf + s->next_parse_offset + 5, dist); > + s->next_parse_offset = cur_pos; > + put_bits32(&s->pb, pcode == DIRAC_PCODE_END_SEQ ? 13 : 0); > + > + cur_pos = put_bytes_count(&s->pb, 0); > + > + /* Last parse offset */ > + put_bits32(&s->pb, s->last_parse_code == DIRAC_PCODE_END_SEQ ? 
13 : > dist); > + > + s->last_parse_code = pcode; > +} > + > +/* VC-2 11.1 - parse_parameters() > + * The level dictates what the decoder should expect in terms of > resolution > + * and allows it to quickly reject whatever it can't support. Remember, > + * this codec kinda targets cheapo FPGAs without much memory. > Unfortunately > + * it also limits us greatly in our choice of formats, hence the flag to > disable > + * strict_compliance */ > +static void encode_parse_params(VC2EncContext *s) > +{ > + put_vc2_ue_uint(&s->pb, s->ver.major); /* VC-2 demands this to be 2 */ > + put_vc2_ue_uint(&s->pb, s->ver.minor); /* ^^ and this to be 0 */ > + put_vc2_ue_uint(&s->pb, s->profile); /* 3 to signal HQ profile */ > + put_vc2_ue_uint(&s->pb, s->level); /* 3 - 1080/720, 6 - 4K */ > +} > + > +/* VC-2 11.3 - frame_size() */ > +static void encode_frame_size(VC2EncContext *s) > +{ > + put_bits(&s->pb, 1, !s->strict_compliance); > + if (!s->strict_compliance) { > + AVCodecContext *avctx = s->avctx; > + put_vc2_ue_uint(&s->pb, avctx->width); > + put_vc2_ue_uint(&s->pb, avctx->height); > + } > +} > + > +/* VC-2 11.3.3 - color_diff_sampling_format() */ > +static void encode_sample_fmt(VC2EncContext *s) > +{ > + put_bits(&s->pb, 1, !s->strict_compliance); > + if (!s->strict_compliance) { > + int idx; > + if (s->chroma_x_shift == 1 && s->chroma_y_shift == 0) > + idx = 1; /* 422 */ > + else if (s->chroma_x_shift == 1 && s->chroma_y_shift == 1) > + idx = 2; /* 420 */ > + else > + idx = 0; /* 444 */ > + put_vc2_ue_uint(&s->pb, idx); > + } > +} > + > +/* VC-2 11.3.4 - scan_format() */ > +static void encode_scan_format(VC2EncContext *s) > +{ > + put_bits(&s->pb, 1, !s->strict_compliance); > + if (!s->strict_compliance) > + put_vc2_ue_uint(&s->pb, s->interlaced); > +} > + > +/* VC-2 11.3.5 - frame_rate() */ > +static void encode_frame_rate(VC2EncContext *s) > +{ > + put_bits(&s->pb, 1, !s->strict_compliance); > + if (!s->strict_compliance) { > + AVCodecContext *avctx = s->avctx; > + 
put_vc2_ue_uint(&s->pb, 0); > + put_vc2_ue_uint(&s->pb, avctx->time_base.den); > + put_vc2_ue_uint(&s->pb, avctx->time_base.num); > + } > +} > + > +/* VC-2 11.3.6 - aspect_ratio() */ > +static void encode_aspect_ratio(VC2EncContext *s) > +{ > + put_bits(&s->pb, 1, !s->strict_compliance); > + if (!s->strict_compliance) { > + AVCodecContext *avctx = s->avctx; > + put_vc2_ue_uint(&s->pb, 0); > + put_vc2_ue_uint(&s->pb, avctx->sample_aspect_ratio.num); > + put_vc2_ue_uint(&s->pb, avctx->sample_aspect_ratio.den); > + } > +} > + > +/* VC-2 11.3.7 - clean_area() */ > +static void encode_clean_area(VC2EncContext *s) > +{ > + put_bits(&s->pb, 1, 0); > +} > + > +/* VC-2 11.3.8 - signal_range() */ > +static void encode_signal_range(VC2EncContext *s) > +{ > + put_bits(&s->pb, 1, !s->strict_compliance); > + if (!s->strict_compliance) > + put_vc2_ue_uint(&s->pb, s->bpp_idx); > +} > + > +/* VC-2 11.3.9 - color_spec() */ > +static void encode_color_spec(VC2EncContext *s) > +{ > + AVCodecContext *avctx = s->avctx; > + put_bits(&s->pb, 1, !s->strict_compliance); > + if (!s->strict_compliance) { > + int val; > + put_vc2_ue_uint(&s->pb, 0); > + > + /* primaries */ > + put_bits(&s->pb, 1, 1); > + if (avctx->color_primaries == AVCOL_PRI_BT470BG) > + val = 2; > + else if (avctx->color_primaries == AVCOL_PRI_SMPTE170M) > + val = 1; > + else if (avctx->color_primaries == AVCOL_PRI_SMPTE240M) > + val = 1; > + else > + val = 0; > + put_vc2_ue_uint(&s->pb, val); > + > + /* color matrix */ > + put_bits(&s->pb, 1, 1); > + if (avctx->colorspace == AVCOL_SPC_RGB) > + val = 3; > + else if (avctx->colorspace == AVCOL_SPC_YCOCG) > + val = 2; > + else if (avctx->colorspace == AVCOL_SPC_BT470BG) > + val = 1; > + else > + val = 0; > + put_vc2_ue_uint(&s->pb, val); > + > + /* transfer function */ > + put_bits(&s->pb, 1, 1); > + if (avctx->color_trc == AVCOL_TRC_LINEAR) > + val = 2; > + else if (avctx->color_trc == AVCOL_TRC_BT1361_ECG) > + val = 1; > + else > + val = 0; > + put_vc2_ue_uint(&s->pb, val); 
> + } > +} > + > +/* VC-2 11.3 - source_parameters() */ > +static void encode_source_params(VC2EncContext *s) > +{ > + encode_frame_size(s); > + encode_sample_fmt(s); > + encode_scan_format(s); > + encode_frame_rate(s); > + encode_aspect_ratio(s); > + encode_clean_area(s); > + encode_signal_range(s); > + encode_color_spec(s); > +} > + > +/* VC-2 11 - sequence_header() */ > +void encode_seq_header(VC2EncContext *s) > +{ > + align_put_bits(&s->pb); > + encode_parse_params(s); > + put_vc2_ue_uint(&s->pb, s->base_vf); > + encode_source_params(s); > + put_vc2_ue_uint(&s->pb, s->interlaced); /* Frames or fields coding */ > +} > + > +/* VC-2 12.1 - picture_header() */ > +static void encode_picture_header(VC2EncContext *s) > +{ > + align_put_bits(&s->pb); > + put_bits32(&s->pb, s->picture_number++); > +} > + > +/* VC-2 12.3.4.1 - slice_parameters() */ > +static void encode_slice_params(VC2EncContext *s) > +{ > + put_vc2_ue_uint(&s->pb, s->num_x); > + put_vc2_ue_uint(&s->pb, s->num_y); > + put_vc2_ue_uint(&s->pb, s->prefix_bytes); > + put_vc2_ue_uint(&s->pb, s->size_scaler); > +} > + > +/* 1st idx = LL, second - vertical, third - horizontal, fourth - total */ > +static const uint8_t vc2_qm_col_tab[][4] = { > + {20, 9, 15, 4}, > + { 0, 6, 6, 4}, > + { 0, 3, 3, 5}, > + { 0, 3, 5, 1}, > + { 0, 11, 10, 11} > +}; > + > +static const uint8_t vc2_qm_flat_tab[][4] = { > + { 0, 0, 0, 0}, > + { 0, 0, 0, 0}, > + { 0, 0, 0, 0}, > + { 0, 0, 0, 0}, > + { 0, 0, 0, 0} > +}; > + > +void init_quant_matrix(VC2EncContext *s) > +{ > + int level, orientation; > + > + if (s->wavelet_depth <= 4 && s->quant_matrix == VC2_QM_DEF) { > + s->custom_quant_matrix = 0; > + for (level = 0; level < s->wavelet_depth; level++) { > + s->quant[level][0] = > ff_dirac_default_qmat[s->wavelet_idx][level][0]; > + s->quant[level][1] = > ff_dirac_default_qmat[s->wavelet_idx][level][1]; > + s->quant[level][2] = > ff_dirac_default_qmat[s->wavelet_idx][level][2]; > + s->quant[level][3] = > 
ff_dirac_default_qmat[s->wavelet_idx][level][3]; > + } > + return; > + } > + > + s->custom_quant_matrix = 1; > + > + if (s->quant_matrix == VC2_QM_DEF) { > + for (level = 0; level < s->wavelet_depth; level++) { > + for (orientation = 0; orientation < 4; orientation++) { > + if (level <= 3) > + s->quant[level][orientation] = > ff_dirac_default_qmat[s->wavelet_idx][level][orientation]; > + else > + s->quant[level][orientation] = > vc2_qm_col_tab[level][orientation]; > + } > + } > + } else if (s->quant_matrix == VC2_QM_COL) { > + for (level = 0; level < s->wavelet_depth; level++) { > + for (orientation = 0; orientation < 4; orientation++) { > + s->quant[level][orientation] = > vc2_qm_col_tab[level][orientation]; > + } > + } > + } else { > + for (level = 0; level < s->wavelet_depth; level++) { > + for (orientation = 0; orientation < 4; orientation++) { > + s->quant[level][orientation] = > vc2_qm_flat_tab[level][orientation]; > + } > + } > + } > +} > + > +/* VC-2 12.3.4.2 - quant_matrix() */ > +static void encode_quant_matrix(VC2EncContext *s) > +{ > + int level; > + put_bits(&s->pb, 1, s->custom_quant_matrix); > + if (s->custom_quant_matrix) { > + put_vc2_ue_uint(&s->pb, s->quant[0][0]); > + for (level = 0; level < s->wavelet_depth; level++) { > + put_vc2_ue_uint(&s->pb, s->quant[level][1]); > + put_vc2_ue_uint(&s->pb, s->quant[level][2]); > + put_vc2_ue_uint(&s->pb, s->quant[level][3]); > + } > + } > +} > + > +/* VC-2 12.3 - transform_parameters() */ > +static void encode_transform_params(VC2EncContext *s) > +{ > + put_vc2_ue_uint(&s->pb, s->wavelet_idx); > + put_vc2_ue_uint(&s->pb, s->wavelet_depth); > + > + encode_slice_params(s); > + encode_quant_matrix(s); > +} > + > +/* VC-2 12.2 - wavelet_transform() */ > +static void encode_wavelet_transform(VC2EncContext *s) > +{ > + encode_transform_params(s); > + align_put_bits(&s->pb); > +} > + > +/* VC-2 12 - picture_parse() */ > +void encode_picture_start(VC2EncContext *s) > +{ > + align_put_bits(&s->pb); > + 
encode_picture_header(s); > + align_put_bits(&s->pb); > + encode_wavelet_transform(s); > +} > diff --git a/libavcodec/vc2enc_common.h b/libavcodec/vc2enc_common.h > new file mode 100644 > index 0000000000..dfcd77752c > --- /dev/null > +++ b/libavcodec/vc2enc_common.h > @@ -0,0 +1,279 @@ > +/* > + * Copyright (C) 2016 Open Broadcast Systems Ltd. > + * Author 2016 Rostislav Pehlivanov <atomnu...@gmail.com> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#ifndef AVCODEC_VC2ENC_COMMON_H > +#define AVCODEC_VC2ENC_COMMON_H > + > +#include "avcodec.h" > +#include "dirac.h" > +#include "put_bits.h" > + > +#include "vc2enc_dwt.h" > +#include "diractab.h" > +#include "libavutil/vulkan.h" > + > +/* The limited size resolution of each slice forces us to do this */ > +#define SSIZE_ROUND(b) (FFALIGN((b), s->size_scaler) + 4 + > s->prefix_bytes) > + > +/* Decides the cutoff point in # of slices to distribute the leftover > bytes */ > +#define SLICE_REDIST_TOTAL 150 > + > +typedef struct VC2BaseVideoFormat { > + enum AVPixelFormat pix_fmt; > + AVRational time_base; > + int width, height; > + uint8_t interlaced, level; > + char name[13]; > +} VC2BaseVideoFormat; > + > +static const VC2BaseVideoFormat base_video_fmts[] = { 
> + { 0 }, /* Custom format, here just to make indexing equal to base_vf > */ > + { AV_PIX_FMT_YUV420P, { 1001, 15000 }, 176, 120, 0, 1, > "QSIF525" }, > + { AV_PIX_FMT_YUV420P, { 2, 25 }, 176, 144, 0, 1, > "QCIF" }, > + { AV_PIX_FMT_YUV420P, { 1001, 15000 }, 352, 240, 0, 1, > "SIF525" }, > + { AV_PIX_FMT_YUV420P, { 2, 25 }, 352, 288, 0, 1, "CIF" > }, > + { AV_PIX_FMT_YUV420P, { 1001, 15000 }, 704, 480, 0, 1, > "4SIF525" }, > + { AV_PIX_FMT_YUV420P, { 2, 25 }, 704, 576, 0, 1, > "4CIF" }, > + > + { AV_PIX_FMT_YUV422P10, { 1001, 30000 }, 720, 480, 1, 2, > "SD480I-60" }, > + { AV_PIX_FMT_YUV422P10, { 1, 25 }, 720, 576, 1, 2, > "SD576I-50" }, > + > + { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1280, 720, 0, 3, > "HD720P-60" }, > + { AV_PIX_FMT_YUV422P10, { 1, 50 }, 1280, 720, 0, 3, > "HD720P-50" }, > + { AV_PIX_FMT_YUV422P10, { 1001, 30000 }, 1920, 1080, 1, 3, > "HD1080I-60" }, > + { AV_PIX_FMT_YUV422P10, { 1, 25 }, 1920, 1080, 1, 3, > "HD1080I-50" }, > + { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1920, 1080, 0, 3, > "HD1080P-60" }, > + { AV_PIX_FMT_YUV422P10, { 1, 50 }, 1920, 1080, 0, 3, > "HD1080P-50" }, > + > + { AV_PIX_FMT_YUV444P12, { 1, 24 }, 2048, 1080, 0, 4, > "DC2K" }, > + { AV_PIX_FMT_YUV444P12, { 1, 24 }, 4096, 2160, 0, 5, > "DC4K" }, > + > + { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 3840, 2160, 0, 6, "UHDTV > 4K-60" }, > + { AV_PIX_FMT_YUV422P10, { 1, 50 }, 3840, 2160, 0, 6, "UHDTV > 4K-50" }, > + > + { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 7680, 4320, 0, 7, "UHDTV > 8K-60" }, > + { AV_PIX_FMT_YUV422P10, { 1, 50 }, 7680, 4320, 0, 7, "UHDTV > 8K-50" }, > + > + { AV_PIX_FMT_YUV422P10, { 1001, 24000 }, 1920, 1080, 0, 3, > "HD1080P-24" }, > + { AV_PIX_FMT_YUV422P10, { 1001, 30000 }, 720, 486, 1, 2, "SD > Pro486" }, > +}; > +static const int base_video_fmts_len = FF_ARRAY_ELEMS(base_video_fmts); > + > +enum VC2_QM { > + VC2_QM_DEF = 0, > + VC2_QM_COL, > + VC2_QM_FLAT, > + > + VC2_QM_NB > +}; > + > +typedef struct SubBand { > + dwtcoef *buf; > + ptrdiff_t stride; > + 
int width; > + int height; > + int shift; > +} SubBand; > + > +typedef struct Plane { > + SubBand band[MAX_DWT_LEVELS][4]; > + dwtcoef *coef_buf; > + int width; > + int height; > + int dwt_width; > + int dwt_height; > + ptrdiff_t coef_stride; > +} Plane; > + > +typedef struct SliceArgs { > + const struct VC2EncContext *ctx; > + union { > + int cache[DIRAC_MAX_QUANT_INDEX]; > + uint8_t *buf; > + }; > + int x; > + int y; > + int quant_idx; > + int bits_ceil; > + int bits_floor; > + int bytes; > +} SliceArgs; > + > +typedef struct TransformArgs { > + struct VC2EncContext *ctx; > + Plane *plane; > + const void *idata; > + ptrdiff_t istride; > + int field; > + VC2TransformContext t; > +} TransformArgs; > + > +typedef struct VC2DwtPlane { > + int width; > + int height; > + int dwt_width; > + int dwt_height; > +} VC2DwtPlane; > + > +typedef struct VC2DwtPushData { > + int s; > + union { > + int diff_offset; > + int plane_idx; > + }; > + int level; > + VC2DwtPlane planes[3]; > + VkDeviceAddress pbuf[3]; > +} VC2DwtPushData; > + > +typedef struct VC2EncAuxData { > + uint32_t quant[MAX_DWT_LEVELS][4]; > + int ff_dirac_qscale_tab[116]; > +} VC2EncAuxData; > + > +typedef struct VC2EncPushData { > + VkDeviceAddress p[3]; > + VkDeviceAddress pb; > + VkDeviceAddress luts; > + VkDeviceAddress slice; > + int num_x; > + int num_y; > + VC2DwtPlane planes[3]; > + int wavelet_depth; > + int size_scaler; > + int prefix_bytes; > +} VC2EncPushData; > + > +typedef struct VC2EncSliceArgs { > + int quant_idx; > + int bytes; > + int pb_start; > + int pad; > +} VC2EncSliceArgs; > + > +typedef struct VC2EncSliceCalcPushData { > + VkDeviceAddress p[3]; > + VkDeviceAddress luts; > + VkDeviceAddress slice; > + int num_x; > + int num_y; > + VC2DwtPlane planes[3]; > + int wavelet_depth; > + int size_scaler; > + int prefix_bytes; > + int bits_ceil; > + int bits_floor; > +} VC2EncSliceCalcPushData; > + > +typedef struct VC2EncContext { > + AVClass *av_class; > + PutBitContext pb; > + Plane plane[3]; > 
+ AVCodecContext *avctx; > + DiracVersionInfo ver; > + > + SliceArgs *slice_args; > + VC2EncSliceArgs* vk_slice_args; > + TransformArgs transform_args[3]; > + > + /* For conversion from unsigned pixel values to signed */ > + int diff_offset; > + int bpp; > + int bpp_idx; > + > + /* Picture number */ > + uint32_t picture_number; > + > + /* Base video format */ > + int base_vf; > + int level; > + int profile; > + > + /* Quantization matrix */ > + uint8_t quant[MAX_DWT_LEVELS][4]; > + int custom_quant_matrix; > + > + /* Division LUT */ > + uint32_t qmagic_lut[116][2]; > + > + int num_x; /* #slices horizontally */ > + int num_y; /* #slices vertically */ > + int group_x; > + int group_y; > + int prefix_bytes; > + int size_scaler; > + int chroma_x_shift; > + int chroma_y_shift; > + > + /* Rate control stuff */ > + int frame_max_bytes; > + int slice_max_bytes; > + int slice_min_bytes; > + int q_ceil; > + int q_avg; > + > + /* Options */ > + double tolerance; > + int wavelet_idx; > + int wavelet_depth; > + int strict_compliance; > + int slice_height; > + int slice_width; > + int interlaced; > + enum VC2_QM quant_matrix; > + > + /* Parse code state */ > + uint32_t next_parse_offset; > + enum DiracParseCodes last_parse_code; > + > + /* Vulkan state */ > + FFVulkanContext vkctx; > + FFVkQueueFamilyCtx qf; > + FFVkExecPool e; > + > + FFVulkanShader dwt_haar_shd; > + FFVulkanShader dwt_upload_shd; > + FFVulkanShader dwt_hor_shd, dwt_ver_shd; > + FFVulkanShader slice_shd; > + FFVulkanShader enc_shd; > + AVBufferPool* dwt_buf_pool; > + int haar_subgroup; > + > + VkBuffer plane_buf, slice_buf; > + uint32_t buf_plane_size; > + VC2EncPushData enc_consts; > + VC2DwtPushData dwt_consts; > + VC2EncSliceCalcPushData calc_consts; > +} VC2EncContext; > + > +void put_vc2_ue_uint(PutBitContext *pb, uint32_t val); > + > +int count_vc2_ue_uint(uint32_t val); > + > +void init_quant_matrix(VC2EncContext *s); > + > +void encode_parse_info(VC2EncContext *s, enum DiracParseCodes pcode); > + > 
+void encode_seq_header(VC2EncContext *s); > + > +void encode_picture_start(VC2EncContext *s); > + > +#endif > diff --git a/libavcodec/vc2enc_vulkan.c b/libavcodec/vc2enc_vulkan.c > new file mode 100644 > index 0000000000..09f2cf8fa6 > --- /dev/null > +++ b/libavcodec/vc2enc_vulkan.c > @@ -0,0 +1,781 @@ > +/* > + * Copyright (C) 2016 Open Broadcast Systems Ltd. > + * Author 2016 Rostislav Pehlivanov <atomnu...@gmail.com> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "libavutil/avassert.h" > +#include "libavutil/mem.h" > +#include "libavutil/pixdesc.h" > +#include "libavutil/opt.h" > +#include "libavutil/version.h" > +#include "libavutil/vulkan_spirv.h" > +#include "libavutil/hwcontext_vulkan.h" > +#include "libavutil/vulkan_loader.h" > +#include "libavutil/vulkan.h" > +#include "codec_internal.h" > +#include "internal.h" > +#include "encode.h" > +#include "version.h" > +#include "vc2enc_common.h" > +#include "hwconfig.h" > + > +#define LEGALL_WORKGROUP_X 64 > +#define SLICE_WORKGROUP_X 128 > + > +extern const char *ff_source_encode_comp; > +extern const char *ff_source_dwt_hor_legall_comp; > +extern const char *ff_source_dwt_ver_legall_comp; > +extern const char *ff_source_slice_sizes_comp; > +extern const char *ff_source_dwt_upload_comp; > +extern const char *ff_source_dwt_haar_comp; > +extern const char *ff_source_dwt_haar_subgroup_comp; > + > +static int init_vulkan_pipeline(VC2EncContext* s, FFVkSPIRVCompiler *spv, > + FFVulkanShader* shd, int push_size, > + int lg_x, int lg_y, int lg_z, > + const char* pl_name, const char* > pl_source, > + int plane_img) > +{ > + uint8_t *spv_data; > + size_t spv_len; > + void *spv_opaque = NULL; > + FFVulkanContext *vkctx = &s->vkctx; > + FFVulkanDescriptorSetBinding *desc; > + int err = 0; > + > + ff_vk_shader_init(vkctx, shd, pl_name, VK_SHADER_STAGE_COMPUTE_BIT, > + NULL, 0, lg_x, lg_y, lg_z, 0); > + > + if (plane_img) { > + desc = (FFVulkanDescriptorSetBinding []) { > + { > + .name = "plane_imgs", > + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, > + .mem_layout = > ff_vk_shader_rep_fmt(vkctx->frames->sw_format, FF_VK_REP_NATIVE), > + .dimensions = 2, > + .elems = 3, > + .stages = VK_SHADER_STAGE_COMPUTE_BIT, > + }, > + }; > + 
RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc, 1, 0, 0)); > + } > + > + ff_vk_shader_add_push_const(shd, 0, push_size, > VK_SHADER_STAGE_COMPUTE_BIT); > + GLSLD(pl_source); > + > + /* Compile Haar shader */ > + RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", > &spv_opaque)); > + RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main")); > + RET(ff_vk_shader_register_exec(vkctx, &s->e, shd)); > + > +fail: > + return err; > +} > + > +static int init_vulkan(AVCodecContext *avctx) > +{ > + VC2EncContext *s = avctx->priv_data; > + FFVulkanContext *vkctx = &s->vkctx; > + FFVkSPIRVCompiler *spv; > + AVBufferRef* dwt_buf = NULL; > + AVBufferRef* coef_buf = NULL; > + FFVkBuffer* vk_buf = NULL; > + VC2EncAuxData* ad = NULL; > + Plane *p; > + VC2DwtPlane vk_plane; > + int i, level, ret; > + uint32_t subgroup_size = vkctx->subgroup_props.maxSubgroupSize; > + > + /* Initialize spirv compiler */ > + spv = ff_vk_spirv_init(); > + if (!spv) { > + av_log(avctx, AV_LOG_ERROR, "Unable to initialize SPIR-V > compiler!\n"); > + return -1; > + } > + > + ff_vk_qf_init(vkctx, &s->qf, VK_QUEUE_COMPUTE_BIT); > + ff_vk_exec_pool_init(vkctx, &s->qf, &s->e, 1, 0, 0, 0, NULL); > + > + /* Allocate coefficient buffer for each plane */ > + p = &s->plane[0]; > + s->buf_plane_size = p->coef_stride*p->dwt_height*sizeof(dwtcoef); > + ret = ff_vk_get_pooled_buffer(vkctx, &s->dwt_buf_pool, &coef_buf, > + > VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | > + VK_BUFFER_USAGE_TRANSFER_DST_BIT, NULL, > + s->buf_plane_size * 3, > + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); > + vk_buf = (FFVkBuffer*)coef_buf->data; > + s->plane_buf = vk_buf->buf; > + > + for (i = 0; i < 3; i++) { > + p = &s->plane[i]; > + vk_plane.dwt_width = p->dwt_width; > + vk_plane.dwt_height = p->dwt_height; > + vk_plane.width = p->width; > + vk_plane.height = p->height; > + memcpy(&s->calc_consts.planes[i], &vk_plane, sizeof(vk_plane)); > + memcpy(&s->dwt_consts.planes[i], &vk_plane, sizeof(vk_plane)); > + 
memcpy(&s->enc_consts.planes[i], &vk_plane, sizeof(vk_plane)); > + s->enc_consts.p[i] = vk_buf->address + s->buf_plane_size * i; > + s->calc_consts.p[i] = vk_buf->address + s->buf_plane_size * i; > + s->dwt_consts.pbuf[i] = vk_buf->address + s->buf_plane_size * i; > + } > + > + /* Initialize Haar push data */ > + s->dwt_consts.diff_offset = s->diff_offset; > + s->dwt_consts.s = s->wavelet_idx == VC2_TRANSFORM_HAAR_S ? 1 : 0; > + s->dwt_consts.level = 0; > + > + /* Initializer slice calc push data */ > + s->calc_consts.num_x = s->num_x; > + s->calc_consts.num_y = s->num_y; > + s->calc_consts.wavelet_depth = s->wavelet_depth; > + s->calc_consts.prefix_bytes = s->prefix_bytes; > + > + /* Initialize encoder push data */ > + s->enc_consts.wavelet_depth = s->wavelet_depth; > + s->enc_consts.num_x = s->num_x; > + s->enc_consts.num_y = s->num_y; > + > + /* Create buffer for encoder auxilary data. */ > + ret = ff_vk_get_pooled_buffer(vkctx, &s->dwt_buf_pool, &dwt_buf, > + > VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, > + sizeof(VC2EncAuxData), > + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | > + > VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); > + vk_buf = (FFVkBuffer*)dwt_buf->data; > + s->calc_consts.luts = vk_buf->address; > + s->enc_consts.luts = vk_buf->address; > + ad = (VC2EncAuxData*)vk_buf->mapped_mem; > + if (s->wavelet_depth <= 4 && s->quant_matrix == VC2_QM_DEF) { > + s->custom_quant_matrix = 0; > + for (level = 0; level < s->wavelet_depth; level++) { > + ad->quant[level][0] = > ff_dirac_default_qmat[s->wavelet_idx][level][0]; > + ad->quant[level][1] = > ff_dirac_default_qmat[s->wavelet_idx][level][1]; > + ad->quant[level][2] = > ff_dirac_default_qmat[s->wavelet_idx][level][2]; > + ad->quant[level][3] = > ff_dirac_default_qmat[s->wavelet_idx][level][3]; > + } > + } > + memcpy(ad->ff_dirac_qscale_tab, ff_dirac_qscale_tab, > sizeof(ff_dirac_qscale_tab)); > + > + /* Create buffer for slice arguments */ > + ret = ff_vk_get_pooled_buffer(vkctx, &s->dwt_buf_pool, &dwt_buf, > + > 
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, > + sizeof(VC2EncSliceArgs) * s->num_x * > s->num_y, > + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | > + > VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); > + vk_buf = (FFVkBuffer*)dwt_buf->data; > + s->slice_buf = vk_buf->buf; > + s->vk_slice_args = (VC2EncSliceArgs*)vk_buf->mapped_mem; > + s->calc_consts.slice = vk_buf->address; > + s->enc_consts.slice = vk_buf->address; > + > + s->haar_subgroup = 0; > + > + /* Initialize encoding pipelines */ > + init_vulkan_pipeline(s, spv, &s->dwt_upload_shd, > sizeof(VC2DwtPushData), > + 8, 8, 1, "dwt_upload_pl", > ff_source_dwt_upload_comp, 1); > + init_vulkan_pipeline(s, spv, &s->slice_shd, sizeof(VC2EncPushData), > + 128, 1, 1, "slice_pl", > ff_source_slice_sizes_comp, 0); > + init_vulkan_pipeline(s, spv, &s->enc_shd, sizeof(VC2EncPushData), > + 128, 1, 1, "enc_pl", ff_source_encode_comp, 0); > + > + if (s->wavelet_idx == VC2_TRANSFORM_HAAR || s->wavelet_idx == > VC2_TRANSFORM_HAAR_S) { > + if (subgroup_size == 32 && s->wavelet_depth < 3) { > + init_vulkan_pipeline(s, spv, &s->dwt_haar_shd, > sizeof(VC2DwtPushData), > + 64, 1, 1, "dwt_haar_pl", > ff_source_dwt_haar_subgroup_comp, 0); > + s->haar_subgroup = 1; > + } else if (subgroup_size == 64 && s->wavelet_depth < 4) { > + init_vulkan_pipeline(s, spv, &s->dwt_haar_shd, > sizeof(VC2DwtPushData), > + 64, 1, 1, "dwt_haar_pl", > ff_source_dwt_haar_subgroup_comp, 0); > + s->haar_subgroup = 1; > + } else { > + init_vulkan_pipeline(s, spv, &s->dwt_haar_shd, > sizeof(VC2DwtPushData), > + 16, 16, 1, "dwt_haar_pl", > ff_source_dwt_haar_comp, 0); > + } > + } else if (s->wavelet_idx == VC2_TRANSFORM_5_3) { > + init_vulkan_pipeline(s, spv, &s->dwt_hor_shd, > sizeof(VC2DwtPushData), > + 64, 1, 1, "dwt_hor_pl", > ff_source_dwt_hor_legall_comp, 0); > + init_vulkan_pipeline(s, spv, &s->dwt_ver_shd, > sizeof(VC2DwtPushData), > + 64, 1, 1, "dwt_ver_pl", > ff_source_dwt_ver_legall_comp, 0); > + } > + > + s->group_x = s->plane[0].dwt_width >> 3; > + s->group_y 
= s->plane[0].dwt_height >> 3; > + return ret; > +} > + > +static void dwt_plane_haar(VC2EncContext *s, FFVkExecContext *exec, > VkBufferMemoryBarrier2* buf_bar) > +{ > + int p, group_x, group_y; > + FFVulkanContext *vkctx = &s->vkctx; > + FFVulkanFunctions *vk = &vkctx->vkfn; > + > + s->dwt_consts.level = s->wavelet_depth; > + ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_haar_shd); > + > + /* Haar pass */ > + for (p = 0; p < 3; p++) { > + s->dwt_consts.plane_idx = p; > + if (s->haar_subgroup) { > + group_x = FFALIGN(s->plane[p].dwt_width, 8) >> 3; > + group_y = FFALIGN(s->plane[p].dwt_height, 8) >> 3; > + } else { > + group_x = FFALIGN(s->plane[p].dwt_width, 16) >> 4; > + group_y = FFALIGN(s->plane[p].dwt_height, 16) >> 4; > + } > + > + ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_haar_shd, > VK_SHADER_STAGE_COMPUTE_BIT, > + 0, sizeof(VC2DwtPushData), > &s->dwt_consts); > + vk->CmdDispatch(exec->buf, group_x, group_y, 1); > + } > + > + /* Wait for Haar dispatches to complete */ > + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { > + .sType = > VK_STRUCTURE_TYPE_DEPENDENCY_INFO, > + .pBufferMemoryBarriers = > buf_bar, > + .bufferMemoryBarrierCount = 1U, > + }); > +} > + > +static void dwt_plane_legall(VC2EncContext *s, FFVkExecContext *exec, > VkBufferMemoryBarrier2* buf_bar) > +{ > + int i; > + int legall_group_x = (s->plane[0].dwt_height + LEGALL_WORKGROUP_X - > 1) >> 6; > + int legall_group_y = (s->plane[0].dwt_width + LEGALL_WORKGROUP_X - 1) > >> 6; > + FFVulkanContext *vkctx = &s->vkctx; > + FFVulkanFunctions *vk = &vkctx->vkfn; > + > + /* Perform Haar wavelet trasform */ > + for (i = 0; i < s->wavelet_depth; i++) { > + s->dwt_consts.level = i; > + > + /* Horizontal Haar pass */ > + ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_hor_shd); > + ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_hor_shd, > VK_SHADER_STAGE_COMPUTE_BIT, > + 0, sizeof(VC2DwtPushData), > &s->dwt_consts); > + vk->CmdDispatch(exec->buf, legall_group_x, 1, 3); > + 
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { > + .sType = > VK_STRUCTURE_TYPE_DEPENDENCY_INFO, > + .pBufferMemoryBarriers = > buf_bar, > + .bufferMemoryBarrierCount > = 1U, > + }); > + > + /* Vertical Haar pass */ > + ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_ver_shd); > + ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_ver_shd, > VK_SHADER_STAGE_COMPUTE_BIT, > + 0, sizeof(VC2DwtPushData), > &s->dwt_consts); > + vk->CmdDispatch(exec->buf, legall_group_y, 1, 3); > + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { > + .sType = > VK_STRUCTURE_TYPE_DEPENDENCY_INFO, > + .pBufferMemoryBarriers = > buf_bar, > + .bufferMemoryBarrierCount > = 1U, > + }); > + } > +} > + > +static void dwt_plane(VC2EncContext *s, FFVkExecContext *exec, AVFrame > *frame) > +{ > + int i, group_x = s->group_x, group_y = s->group_y; > + FFVulkanContext *vkctx = &s->vkctx; > + FFVulkanFunctions *vk = &vkctx->vkfn; > + uint32_t num_slice_groups = (s->num_x*s->num_y + SLICE_WORKGROUP_X - > 1) >> 7; > + VkBufferMemoryBarrier2 buf_bar; > + VkBufferMemoryBarrier2 slice_buf_bar; > + VkImageView views[AV_NUM_DATA_POINTERS]; > + VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS]; > + int nb_img_bar = 0; > + > + buf_bar = (VkBufferMemoryBarrier2) { > + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, > + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, > + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, > + .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | > VK_ACCESS_2_SHADER_READ_BIT, > + .dstAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT | > VK_ACCESS_2_SHADER_READ_BIT, > + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, > + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, > + .buffer = s->plane_buf, > + .size = s->buf_plane_size * 3, > + .offset = 0, > + }; > + > + ff_vk_exec_add_dep_frame(vkctx, exec, frame, > + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, > + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT); > + ff_vk_create_imageviews(vkctx, exec, views, frame, FF_VK_REP_UINT); 
> + ff_vk_frame_barrier(vkctx, exec, frame, img_bar, &nb_img_bar, > + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, > + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, > + VK_ACCESS_SHADER_READ_BIT, > + VK_IMAGE_LAYOUT_GENERAL, > + VK_QUEUE_FAMILY_IGNORED); > + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { > + .sType = > VK_STRUCTURE_TYPE_DEPENDENCY_INFO, > + .pImageMemoryBarriers = > img_bar, > + .imageMemoryBarrierCount = > nb_img_bar, > + }); > + > + ff_vk_shader_update_img_array(vkctx, exec, &s->dwt_upload_shd, frame, > views, 0, 0, > + VK_IMAGE_LAYOUT_GENERAL, > + VK_NULL_HANDLE); > + > + /* Upload coefficients from planes to the buffer. */ > + s->dwt_consts.diff_offset = s->diff_offset; > + ff_vk_exec_bind_shader(vkctx, exec, &s->dwt_upload_shd); > + ff_vk_shader_update_push_const(vkctx, exec, &s->dwt_upload_shd, > VK_SHADER_STAGE_COMPUTE_BIT, > + 0, sizeof(VC2DwtPushData), > &s->dwt_consts); > + vk->CmdDispatch(exec->buf, group_x, group_y, 3); > + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { > + .sType = > VK_STRUCTURE_TYPE_DEPENDENCY_INFO, > + .pBufferMemoryBarriers = > &buf_bar, > + .bufferMemoryBarrierCount = 1U, > + }); > + > + /* Perform Haar wavelet trasform */ > + if (s->wavelet_idx == VC2_TRANSFORM_HAAR || s->wavelet_idx == > VC2_TRANSFORM_HAAR_S) { > + dwt_plane_haar(s, exec, &buf_bar); > + } else if (s->wavelet_idx == VC2_TRANSFORM_5_3) { > + dwt_plane_legall(s, exec, &buf_bar); > + } > + > + /* Calculate slice sizes. 
*/ > + ff_vk_exec_bind_shader(vkctx, exec, &s->slice_shd); > + ff_vk_shader_update_push_const(vkctx, exec, &s->slice_shd, > VK_SHADER_STAGE_COMPUTE_BIT, > + 0, sizeof(VC2EncSliceCalcPushData), > &s->calc_consts); > + vk->CmdDispatch(exec->buf, num_slice_groups, 1, 1); > + > + slice_buf_bar = (VkBufferMemoryBarrier2) { > + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, > + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, > + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, > + .srcAccessMask = VK_ACCESS_2_SHADER_READ_BIT | > VK_ACCESS_2_SHADER_WRITE_BIT, > + .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT, > + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, > + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, > + .buffer = s->slice_buf, > + .size = sizeof(VC2EncSliceArgs) * s->num_x * s->num_y, > + .offset = 0, > + }; > + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { > + .sType = > VK_STRUCTURE_TYPE_DEPENDENCY_INFO, > + .pBufferMemoryBarriers = > &slice_buf_bar, > + .bufferMemoryBarrierCount = 1U, > + }); > +} > + > +static void vulkan_encode_slices(VC2EncContext *s, FFVkExecContext *exec) > +{ > + uint32_t num_slice_groups = (s->num_x*s->num_y + SLICE_WORKGROUP_X - > 1) >> 7; > + FFVulkanContext *vkctx = &s->vkctx; > + FFVulkanFunctions *vk = &vkctx->vkfn; > + int skip = 0; > + > + flush_put_bits(&s->pb); > + s->enc_consts.pb += put_bytes_output(&s->pb); > + > + ff_vk_exec_bind_shader(vkctx, exec, &s->enc_shd); > + ff_vk_shader_update_push_const(vkctx, exec, &s->enc_shd, > VK_SHADER_STAGE_COMPUTE_BIT, > + 0, sizeof(VC2EncPushData), > &s->enc_consts); > + > + vk->CmdDispatch(exec->buf, num_slice_groups, 1, 1); > + > + ff_vk_exec_submit(vkctx, exec); > + ff_vk_exec_wait(vkctx, exec); > + > + for (int slice_y = 0; slice_y < s->num_y; slice_y++) { > + for (int slice_x = 0; slice_x < s->num_x; slice_x++) { > + VC2EncSliceArgs *args = &s->vk_slice_args[s->num_x*slice_y + > slice_x]; > + skip += args->bytes; > + } > + } > + > + /* Skip forward to 
write end header */ > + skip_put_bytes(&s->pb, skip); > +} > + > +static int encode_frame(VC2EncContext *s, AVPacket *avpkt, const AVFrame > *frame, > + const char *aux_data, const int header_size, int > field) > +{ > + int ret; > + int64_t max_frame_bytes; > + AVBufferRef *avpkt_buf = NULL; > + FFVkBuffer* buf_vk = NULL; > + FFVulkanContext *vkctx = &s->vkctx; > + FFVkExecContext *exec = ff_vk_exec_get(vkctx, &s->e); > + > + ff_vk_exec_start(vkctx, exec); > + > + /* Perform Haar DWT pass on the inpute frame. */ > + dwt_plane(s, exec, (AVFrame*)frame); > + > + /* Allocate a buffer that can fit at all all 3 planes of data */ > + max_frame_bytes = header_size + s->avctx->width * s->avctx->height * > sizeof(dwtcoef); > + s->custom_quant_matrix = 0; > + > + /* Get a pooled device local host visible buffer for writing output > data */ > + if (field < 2) { > + ret = ff_vk_get_pooled_buffer(vkctx, &s->dwt_buf_pool, &avpkt_buf, > + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | > + > VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, > + max_frame_bytes << s->interlaced, > + VK_MEMORY_PROPERTY_HOST_CACHED_BIT | > + > VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | > + > VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); > + avpkt->buf = avpkt_buf; > + buf_vk = (FFVkBuffer *)avpkt_buf->data; > + avpkt->data = buf_vk->mapped_mem; > + avpkt->size = max_frame_bytes << s->interlaced; > + s->enc_consts.pb = buf_vk->address; > + > + if (ret < 0) { > + return ret; > + } > + init_put_bits(&s->pb, avpkt->data, avpkt->size); > + } > + > + /* Sequence header */ > + encode_parse_info(s, DIRAC_PCODE_SEQ_HEADER); > + encode_seq_header(s); > + > + /* Encoder version */ > + if (aux_data) { > + encode_parse_info(s, DIRAC_PCODE_AUX); > + ff_put_string(&s->pb, aux_data, 1); > + } > + > + /* Picture header */ > + encode_parse_info(s, DIRAC_PCODE_PICTURE_HQ); > + encode_picture_start(s); > + > + /* Encode slices */ > + vulkan_encode_slices(s, exec); > + > + /* End sequence */ > + encode_parse_info(s, DIRAC_PCODE_END_SEQ); > + > 
+ return 0; > +} > + > +static av_cold int vc2_encode_frame(AVCodecContext *avctx, AVPacket > *avpkt, > + const AVFrame *frame, int *got_packet) > +{ > + int ret = 0; > + int slice_ceil, sig_size = 256; > + VC2EncContext *s = avctx->priv_data; > + const int bitexact = avctx->flags & AV_CODEC_FLAG_BITEXACT; > + const char *aux_data = bitexact ? "Lavc" : LIBAVCODEC_IDENT; > + const int aux_data_size = bitexact ? sizeof("Lavc") : > sizeof(LIBAVCODEC_IDENT); > + const int header_size = 100 + aux_data_size; > + int64_t r_bitrate = avctx->bit_rate >> (s->interlaced); > + > + s->avctx = avctx; > + s->size_scaler = 2; > + s->prefix_bytes = 0; > + s->last_parse_code = 0; > + s->next_parse_offset = 0; > + > + /* Rate control */ > + s->frame_max_bytes = (av_rescale(r_bitrate, s->avctx->time_base.num, > + s->avctx->time_base.den) >> 3) - > header_size; > + s->slice_max_bytes = slice_ceil = av_rescale(s->frame_max_bytes, 1, > s->num_x*s->num_y); > + > + /* Find an appropriate size scaler */ > + while (sig_size > 255) { > + int r_size = SSIZE_ROUND(s->slice_max_bytes); > + if (r_size > slice_ceil) { > + s->slice_max_bytes -= r_size - slice_ceil; > + r_size = SSIZE_ROUND(s->slice_max_bytes); > + } > + sig_size = r_size/s->size_scaler; /* Signalled slize size */ > + s->size_scaler <<= 1; > + } > + > + s->slice_min_bytes = s->slice_max_bytes - > s->slice_max_bytes*(s->tolerance/100.0f); > + if (s->slice_min_bytes < 0) > + return AVERROR(EINVAL); > + > + /* Update slice calc push data */ > + s->calc_consts.size_scaler = s->size_scaler; > + s->calc_consts.bits_ceil = s->slice_max_bytes << 3; > + s->calc_consts.bits_floor = s->slice_min_bytes << 3; > + s->enc_consts.prefix_bytes = 0; > + s->enc_consts.size_scaler = s->size_scaler; > + > + ret = encode_frame(s, avpkt, frame, aux_data, header_size, > s->interlaced); > + if (ret) > + return ret; > + if (s->interlaced) { > + ret = encode_frame(s, avpkt, frame, aux_data, header_size, 2); > + if (ret) > + return ret; > + } > + > + 
flush_put_bits(&s->pb); > + av_shrink_packet(avpkt, put_bytes_output(&s->pb)); > + avpkt->flags |= AV_PKT_FLAG_KEY; > + *got_packet = 1; > + > + return 0; > +} > + > +static av_cold int vc2_encode_end(AVCodecContext *avctx) > +{ > + int i; > + VC2EncContext *s = avctx->priv_data; > + > + av_log(avctx, AV_LOG_INFO, "Qavg: %i\n", s->q_avg); > + > + for (i = 0; i < 3; i++) { > + ff_vc2enc_free_transforms(&s->transform_args[i].t); > + av_freep(&s->plane[i].coef_buf); > + } > + > + return 0; > +} > + > +static av_cold int vc2_encode_init(AVCodecContext *avctx) > +{ > + Plane *p; > + SubBand *b; > + int i, level, o, ret, depth; > + const AVPixFmtDescriptor *fmt; > + VC2EncContext *s = avctx->priv_data; > + FFVulkanContext *vkctx = &s->vkctx; > + > + vkctx->frames_ref = av_buffer_ref(avctx->hw_frames_ctx); > + vkctx->frames = (AVHWFramesContext *)vkctx->frames_ref->data; > + vkctx->hwfc = vkctx->frames->hwctx; > + vkctx->device = (AVHWDeviceContext *)vkctx->frames->device_ref->data; > + vkctx->hwctx = vkctx->device->hwctx; > + vkctx->extensions = > ff_vk_extensions_to_mask(vkctx->hwctx->enabled_dev_extensions, > + > vkctx->hwctx->nb_enabled_dev_extensions); > + ff_vk_load_functions(vkctx->device, &vkctx->vkfn, vkctx->extensions, > 1, 1); > + ff_vk_load_props(vkctx); > + > + s->picture_number = 0; > + > + /* Total allowed quantization range */ > + s->q_ceil = DIRAC_MAX_QUANT_INDEX; > + > + s->ver.major = 2; > + s->ver.minor = 0; > + s->profile = 3; > + s->level = 3; > + > + s->base_vf = -1; > + s->strict_compliance = 1; > + > + s->q_avg = 0; > + s->slice_max_bytes = 0; > + s->slice_min_bytes = 0; > + > + /* Mark unknown as progressive */ > + s->interlaced = !((avctx->field_order == AV_FIELD_UNKNOWN) || > + (avctx->field_order == AV_FIELD_PROGRESSIVE)); > + > + for (i = 0; i < base_video_fmts_len; i++) { > + const VC2BaseVideoFormat *fmt = &base_video_fmts[i]; > + if (avctx->pix_fmt != fmt->pix_fmt) > + continue; > + if (avctx->time_base.num != fmt->time_base.num) > + 
continue; > + if (avctx->time_base.den != fmt->time_base.den) > + continue; > + if (avctx->width != fmt->width) > + continue; > + if (avctx->height != fmt->height) > + continue; > + if (s->interlaced != fmt->interlaced) > + continue; > + s->base_vf = i; > + s->level = base_video_fmts[i].level; > + break; > + } > + > + if (s->interlaced) > + av_log(avctx, AV_LOG_WARNING, "Interlacing enabled!\n"); > + > + if ((s->slice_width & (s->slice_width - 1)) || > + (s->slice_height & (s->slice_height - 1))) { > + av_log(avctx, AV_LOG_ERROR, "Slice size is not a power of > two!\n"); > + return AVERROR_UNKNOWN; > + } > + > + if ((s->slice_width > avctx->width) || > + (s->slice_height > avctx->height)) { > + av_log(avctx, AV_LOG_ERROR, "Slice size is bigger than the > image!\n"); > + return AVERROR_UNKNOWN; > + } > + > + if (s->base_vf <= 0) { > + if (avctx->strict_std_compliance < FF_COMPLIANCE_STRICT) { > + s->strict_compliance = s->base_vf = 0; > + av_log(avctx, AV_LOG_WARNING, "Format does not strictly > comply with VC2 specs\n"); > + } else { > + av_log(avctx, AV_LOG_WARNING, "Given format does not strictly > comply with " > + "the specifications, decrease strictness to use > it.\n"); > + return AVERROR_UNKNOWN; > + } > + } else { > + av_log(avctx, AV_LOG_INFO, "Selected base video format = %i > (%s)\n", > + s->base_vf, base_video_fmts[s->base_vf].name); > + } > + > + /* Chroma subsampling */ > + ret = av_pix_fmt_get_chroma_sub_sample(vkctx->frames->sw_format, > &s->chroma_x_shift, &s->chroma_y_shift); > + if (ret) > + return ret; > + > + /* Bit depth and color range index */ > + fmt = av_pix_fmt_desc_get(vkctx->frames->sw_format); > + depth = fmt->comp[0].depth; > + if (depth == 8 && avctx->color_range == AVCOL_RANGE_JPEG) { > + s->bpp = 1; > + s->bpp_idx = 1; > + s->diff_offset = 128; > + } else if (depth == 8 && (avctx->color_range == AVCOL_RANGE_MPEG || > + avctx->color_range == AVCOL_RANGE_UNSPECIFIED)) { > + s->bpp = 1; > + s->bpp_idx = 2; > + s->diff_offset = 128; > 
+ } else if (depth == 10) { > + s->bpp = 2; > + s->bpp_idx = 3; > + s->diff_offset = 512; > + } else { > + s->bpp = 2; > + s->bpp_idx = 4; > + s->diff_offset = 2048; > + } > + > + /* Planes initialization */ > + for (i = 0; i < 3; i++) { > + int w, h; > + p = &s->plane[i]; > + p->width = avctx->width >> (i ? s->chroma_x_shift : 0); > + p->height = avctx->height >> (i ? s->chroma_y_shift : 0); > + if (s->interlaced) > + p->height >>= 1; > + p->dwt_width = w = FFALIGN(p->width, (1 << s->wavelet_depth)); > + p->dwt_height = h = FFALIGN(p->height, (1 << s->wavelet_depth)); > + p->coef_stride = FFALIGN(p->dwt_width, 32); > + for (level = s->wavelet_depth-1; level >= 0; level--) { > + w = w >> 1; > + h = h >> 1; > + for (o = 0; o < 4; o++) { > + b = &p->band[level][o]; > + b->width = w; > + b->height = h; > + b->stride = p->coef_stride; > + b->shift = (o > 1)*b->height*b->stride + (o & 1)*b->width; > + } > + } > + > + /* DWT init */ > + if (ff_vc2enc_init_transforms(&s->transform_args[i].t, > + s->plane[i].coef_stride, > + s->plane[i].dwt_height, > + s->slice_width, s->slice_height)) > + return AVERROR(ENOMEM); > + } > + > + /* Slices */ > + s->num_x = s->plane[0].dwt_width/s->slice_width; > + s->num_y = s->plane[0].dwt_height/s->slice_height; > + > + s->slice_args = av_calloc(s->num_x*s->num_y, sizeof(SliceArgs)); > + if (!s->slice_args) > + return AVERROR(ENOMEM); > + > + for (i = 0; i < 116; i++) { > + const uint64_t qf = ff_dirac_qscale_tab[i]; > + const uint32_t m = av_log2(qf); > + const uint32_t t = (1ULL << (m + 32)) / qf; > + const uint32_t r = (t*qf + qf) & UINT32_MAX; > + if (!(qf & (qf - 1))) { > + s->qmagic_lut[i][0] = 0xFFFFFFFF; > + s->qmagic_lut[i][1] = 0xFFFFFFFF; > + } else if (r <= 1 << m) { > + s->qmagic_lut[i][0] = t + 1; > + s->qmagic_lut[i][1] = 0; > + } else { > + s->qmagic_lut[i][0] = t; > + s->qmagic_lut[i][1] = t; > + } > + } > + init_vulkan(avctx); > + > + return 0; > +} > + > +#define VC2ENC_FLAGS (AV_OPT_FLAG_ENCODING_PARAM | > 
AV_OPT_FLAG_VIDEO_PARAM) > +static const AVOption vc2enc_options[] = { > + {"tolerance", "Max undershoot in percent", > offsetof(VC2EncContext, tolerance), AV_OPT_TYPE_DOUBLE, {.dbl = 5.0f}, > 0.0f, 45.0f, VC2ENC_FLAGS, .unit = "tolerance"}, > + {"slice_width", "Slice width", offsetof(VC2EncContext, > slice_width), AV_OPT_TYPE_INT, {.i64 = 32}, 32, 1024, VC2ENC_FLAGS, .unit = > "slice_width"}, > + {"slice_height", "Slice height", offsetof(VC2EncContext, > slice_height), AV_OPT_TYPE_INT, {.i64 = 16}, 8, 1024, VC2ENC_FLAGS, .unit = > "slice_height"}, > + {"wavelet_depth", "Transform depth", offsetof(VC2EncContext, > wavelet_depth), AV_OPT_TYPE_INT, {.i64 = 4}, 1, 5, VC2ENC_FLAGS, .unit = > "wavelet_depth"}, > + {"wavelet_type", "Transform type", offsetof(VC2EncContext, > wavelet_idx), AV_OPT_TYPE_INT, {.i64 = VC2_TRANSFORM_HAAR_S}, 0, > VC2_TRANSFORMS_NB, VC2ENC_FLAGS, .unit = "wavelet_idx"}, > + {"5_3", "LeGall (5,3)", 0, AV_OPT_TYPE_CONST, > {.i64 = VC2_TRANSFORM_5_3}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = > "wavelet_idx"}, > + {"haar", "Haar (with shift)", 0, AV_OPT_TYPE_CONST, > {.i64 = VC2_TRANSFORM_HAAR_S}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = > "wavelet_idx"}, > + {"haar_noshift", "Haar (without shift)", 0, AV_OPT_TYPE_CONST, > {.i64 = VC2_TRANSFORM_HAAR}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = > "wavelet_idx"}, > + {"qm", "Custom quantization matrix", offsetof(VC2EncContext, > quant_matrix), AV_OPT_TYPE_INT, {.i64 = VC2_QM_DEF}, 0, VC2_QM_NB, > VC2ENC_FLAGS, .unit = "quant_matrix"}, > + {"default", "Default from the specifications", 0, > AV_OPT_TYPE_CONST, {.i64 = VC2_QM_DEF}, INT_MIN, INT_MAX, VC2ENC_FLAGS, > .unit = "quant_matrix"}, > + {"color", "Prevents low bitrate discoloration", 0, > AV_OPT_TYPE_CONST, {.i64 = VC2_QM_COL}, INT_MIN, INT_MAX, VC2ENC_FLAGS, > .unit = "quant_matrix"}, > + {"flat", "Optimize for PSNR", 0, AV_OPT_TYPE_CONST, {.i64 = > VC2_QM_FLAT}, INT_MIN, INT_MAX, VC2ENC_FLAGS, .unit = "quant_matrix"}, > + {NULL} > +}; > + > +static 
const AVClass vc2enc_class = { > + .class_name = "vc2_vulkan_encoder", > + .category = AV_CLASS_CATEGORY_ENCODER, > + .option = vc2enc_options, > + .item_name = av_default_item_name, > + .version = LIBAVUTIL_VERSION_INT > +}; > + > +/* Default registered for the "b" option through .defaults below. */ static const FFCodecDefault vc2enc_defaults[] = { > + { "b", "600000000" }, > + { NULL }, > +}; > + > +/* NULL-terminated list referenced by .hw_configs below. */ const AVCodecHWConfigInternal *const ff_vc2_hw_configs[] = { > + HW_CONFIG_ENCODER_FRAMES(VULKAN, VULKAN), > + HW_CONFIG_ENCODER_DEVICE(NONE, VULKAN), > + NULL, > +}; > + > +/* Registers the video encoder named "vc2_vulkan" under AV_CODEC_ID_DIRAC; AV_PIX_FMT_VULKAN is the only pixel format listed. */ const FFCodec ff_vc2_vulkan_encoder = { > + .p.name = "vc2_vulkan", > + CODEC_LONG_NAME("SMPTE VC-2"), > + .p.type = AVMEDIA_TYPE_VIDEO, > + .p.id = AV_CODEC_ID_DIRAC, > + .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_HARDWARE, > + .caps_internal = FF_CODEC_CAP_INIT_CLEANUP, > + .priv_data_size = sizeof(VC2EncContext), > + .init = vc2_encode_init, > + .close = vc2_encode_end, > + FF_CODEC_ENCODE_CB(vc2_encode_frame), > + .p.priv_class = &vc2enc_class, > + .defaults = vc2enc_defaults, > + .p.pix_fmts = (const enum AVPixelFormat[]) { > + AV_PIX_FMT_VULKAN, > + AV_PIX_FMT_NONE, > + }, > + .hw_configs = ff_vc2_hw_configs, > +}; > diff --git a/libavcodec/vulkan/dwt_haar.comp > b/libavcodec/vulkan/dwt_haar.comp > new file mode 100644 > index 0000000000..69073cb17f > --- /dev/null > +++ b/libavcodec/vulkan/dwt_haar.comp > @@ -0,0 +1,76 @@ > +#extension GL_EXT_scalar_block_layout : require > +#extension GL_EXT_buffer_reference : require > + > +#define LOCAL_X 256 > + /* LOCAL_X matches the 16 * 16 invocations of one workgroup. */ > +layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in; > + > +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer > DwtCoef { > + int coef_buf[]; > +}; > + > +struct Plane { > + ivec2 dim; > + ivec2 dwt_dim; > +}; > + > +layout(push_constant, scalar) uniform ComputeInfo { > + int s; > + int plane_idx; > + int wavelet_depth; > + Plane planes[3]; > + DwtCoef pbuf[3]; > +}; > + > +/* Workgroup scratch indexed by gl_LocalInvocationIndex. */ shared int local_coef[LOCAL_X]; > + > +/* Haar transform of one 16x16 tile per workgroup; neighbouring values are exchanged through local_coef. */ void main() { > + ivec2 coord = 
ivec2(gl_GlobalInvocationID.xy);
> +    ivec2 dwt_dim = planes[plane_idx].dwt_dim;
> +    /* barrier() has to be reached by every invocation of the workgroup, so
> +     * invocations outside the plane are masked out instead of returning
> +     * early. They never load from or store to the coefficient buffer. */
> +    bool in_range = all(lessThan(coord, dwt_dim));
> +    int index = dwt_dim.x * coord.y + coord.x;
> +    int value = in_range ? pbuf[plane_idx].coef_buf[index] : 0;
> +    /* Cleared once this invocation no longer takes part in the transform */
> +    bool live = in_range;
> +
> +    /* Perform Haar wavelet on the 16x16 local workgroup with shared memory */
> +    for (int i = 0; i < wavelet_depth; i++) {
> +        /* Only coordinates that are multiples of (1 << i) carry data at this
> +         * level. Leaving the loop here would skip the barriers below for a
> +         * part of the workgroup, so the invocation is only marked idle. */
> +        ivec2 mask = ivec2((1 << i) - 1);
> +        live = live && all(equal(coord & mask, ivec2(0)));
> +
> +        /* Offset between valid hor pixels for each level, +1, +2, +4 etc */
> +        int dist = (1 << i);
> +
> +        local_coef[gl_LocalInvocationIndex] = value;
> +        barrier();
> +
> +        /* Horizontal haar wavelet */
> +        if (live) {
> +            uint other_id = gl_LocalInvocationIndex ^ dist;
> +            int other = local_coef[other_id];
> +            int a = gl_LocalInvocationIndex < other_id ? value : other;
> +            int b = gl_LocalInvocationIndex < other_id ? other : value;
> +            int dst_b = (b - a) * (1 << s);
> +            int dst_a = a * (1 << s) + ((dst_b + 1) >> 1);
> +            value = gl_LocalInvocationIndex < other_id ? dst_a : dst_b;
> +        }
> +        /* Every partner must have read the old slot before it is rewritten */
> +        barrier();
> +
> +        /* Offset between valid ver pixels for each level, +1, +2, +4 etc */
> +        dist <<= 4;
> +
> +        local_coef[gl_LocalInvocationIndex] = value;
> +        barrier();
> +
> +        /* Vertical haar wavelet */
> +        if (live) {
> +            uint other_id = gl_LocalInvocationIndex ^ dist;
> +            int other = local_coef[other_id];
> +            int a = gl_LocalInvocationIndex < other_id ? value : other;
> +            int b = gl_LocalInvocationIndex < other_id ? other : value;
> +            int dst_b = b - a;
> +            int dst_a = a + ((dst_b + 1) >> 1);
> +            value = gl_LocalInvocationIndex < other_id ? dst_a : dst_b;
> +        }
> +        /* Keep the next level from overwriting slots that are still read */
> +        barrier();
> +    }
> +
> +    /* Store value */
> +    if (in_range) {
> +        pbuf[plane_idx].coef_buf[index] = value;
> +    }
> +}
> \ No newline at end of file
> diff --git a/libavcodec/vulkan/dwt_haar_subgroup.comp
> b/libavcodec/vulkan/dwt_haar_subgroup.comp
> new file mode 100644
> index 0000000000..32fb04561a
> --- /dev/null
> +++ b/libavcodec/vulkan/dwt_haar_subgroup.comp
> @@ -0,0 +1,94 @@
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_KHR_shader_subgroup_basic : require
> +#extension GL_KHR_shader_subgroup_shuffle : require
> +#extension GL_EXT_buffer_reference : require
> +
> +#define TILE_DIM 8
> +#define LOCAL_X 64
> +
> +layout(local_size_x = LOCAL_X, local_size_y = 1, local_size_z = 1) in;
> +
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};
> +
> +struct Plane {
> +    ivec2 dim;
> +    ivec2 dwt_dim;
> +};
> +
> +layout(push_constant, scalar) uniform ComputeInfo {
> +    int s;
> +    int plane_idx;
> +    int wavelet_depth;
> +    Plane planes[3];
> +    DwtCoef pbuf[3];
> +};
> +
> +int dwt_haar_subgroup(int value, int i) {
> +    /* Offset between valid hor pixels for each level, +1, +2, +4 etc */
> +    int dist = (1 << i);
> +
> +    /* Horizontal haar wavelet */
> +    uint other_sub_id = gl_SubgroupInvocationID ^ dist;
> +    int other = subgroupShuffle(value, other_sub_id);
> +    int a = gl_SubgroupInvocationID < other_sub_id ? value : other;
> +    int b = gl_SubgroupInvocationID < other_sub_id ? other : value;
> +    int dst_b = (b - a) * (1 << s);
> +    int dst_a = a * (1 << s) + ((dst_b + 1) >> 1);
> +    value = gl_SubgroupInvocationID < other_sub_id ? dst_a : dst_b;
> +
> +    /* Offset between valid ver pixels for each level, +1, +2, +4 etc */
> +    dist <<= 3;
> +
> +    /* Vertical haar wavelet */
> +    other_sub_id = gl_SubgroupInvocationID ^ dist;
> +    other = subgroupShuffle(value, other_sub_id);
> +    a = gl_SubgroupInvocationID < other_sub_id ? value : other;
> +    b = gl_SubgroupInvocationID < other_sub_id ? 
other : value; > + dst_b = b - a; > + dst_a = a + ((dst_b + 1) >> 1); > + return gl_SubgroupInvocationID < other_sub_id ? dst_a : dst_b; > +} > + > +/* True while level i is below wavelet_depth and both components of coord are multiples of (1 << i). */ bool is_thread_active(int i, ivec2 coord) { > + if (i > wavelet_depth - 1) { > + return false; > + } > + ivec2 mask = ivec2((1 << i) - 1); > + if (any(notEqual(coord & mask, ivec2(0)))) { > + return false; > + } > + return true; > +} > + > +/* Each workgroup covers one TILE_DIM x TILE_DIM tile; local x is index & 7 and local y is index >> 3. Up to 3 levels are applied when gl_SubgroupSize is 64, otherwise up to 2. NOTE(review): the shuffles pair invocations by gl_SubgroupInvocationID, which presumes it follows gl_LocalInvocationIndex - confirm. */ void main() { > + ivec2 tile_coord = ivec2(gl_WorkGroupID.xy); > + ivec2 local_coord = ivec2(gl_LocalInvocationIndex & 7, > gl_LocalInvocationIndex >> 3); > + ivec2 coord = tile_coord * ivec2(TILE_DIM) + local_coord; > + ivec2 dwt_dim = planes[plane_idx].dwt_dim; > + if (any(greaterThanEqual(coord, dwt_dim))) { > + return; > + } > + int index = dwt_dim.x * coord.y + coord.x; > + int value = pbuf[plane_idx].coef_buf[index]; > + > + if (gl_SubgroupSize == 64) { > + for (int i = 0; i < 3; i++) { > + if (!is_thread_active(i, local_coord)) { > + break; > + } > + value = dwt_haar_subgroup(value, i); > + } > + } else { > + for (int i = 0; i < 2; i++) { > + if (!is_thread_active(i, local_coord)) { > + break; > + } > + value = dwt_haar_subgroup(value, i); > + } > + } > + > + // Store value > + pbuf[plane_idx].coef_buf[index] = value; > +} > \ No newline at end of file > diff --git a/libavcodec/vulkan/dwt_hor_legall.comp > b/libavcodec/vulkan/dwt_hor_legall.comp > new file mode 100644 > index 0000000000..9c3945825e > --- /dev/null > +++ b/libavcodec/vulkan/dwt_hor_legall.comp > @@ -0,0 +1,61 @@ > +#extension GL_EXT_scalar_block_layout : require > +#extension GL_EXT_buffer_reference : require > + > +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; > + > +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer > DwtCoef { > + int coef_buf[]; > +}; > + > +struct Plane { > + ivec2 dim; > + ivec2 dwt_dim; > +}; > + > +layout(push_constant, scalar) uniform ComputeInfo { > + int s; > + int diff_offset; > + int level; > + Plane planes[3]; > + DwtCoef pbuf[3]; > +}; > + > 
+/* Rounds x up to a multiple of a; the mask form presumes a is a power of two. */ int align(int x, int a) { > + return (x + a - 1) & ~(a - 1); > +} > + > +/* In-place horizontal lifting over row coord_y of plane gl_GlobalInvocationID.z, visiting every dist-th sample (dist = 1 << level). Rows beyond dwt_dim.y or not a multiple of dist return immediately. NOTE(review): dwt_area is computed but not used. */ void main() { > + int coord_y = int(gl_GlobalInvocationID.x); > + uint plane_idx = gl_GlobalInvocationID.z; > + ivec2 work_area = planes[plane_idx].dwt_dim; > + int dist = 1 << level; > + if (coord_y >= work_area.y || (coord_y & (dist - 1)) != 0) { > + return; > + } > + > + DwtCoef buf = pbuf[plane_idx]; > + ivec2 dwt_area = work_area >> 1; > + int stride = align(planes[plane_idx].dwt_dim.x, 32); > + int start = stride * coord_y; > + > + // Shift in one bit that is used for additional precision > + for (int x = 0; x < work_area.x; x += dist) { > + buf.coef_buf[start + x] = buf.coef_buf[start + x] << 1; > + } > + > + // Lifting stage 2 > + for (int x = 0; x < work_area.x - 2 * dist; x += 2 * dist) { > + buf.coef_buf[start + x + dist] -= (buf.coef_buf[start + x] + > + buf.coef_buf[start + x + 2 * > dist] + 1) >> 1; > + } > + /* Last odd sample has no right neighbour, so the left one is counted twice. */ buf.coef_buf[start + work_area.x - dist] -= (2 * buf.coef_buf[start + > work_area.x - 2 * dist] + 1) >> 1; > + > + // Lifting stage 1 > + /* First even sample has no left neighbour, so the right one is counted twice. */ buf.coef_buf[start] += (2 * buf.coef_buf[start + dist] + 2) >> 2; > + for (int x = 2 * dist; x < work_area.x - 2 * dist; x += 2 * dist) { > + buf.coef_buf[start + x] += (buf.coef_buf[start + x - dist] + > + buf.coef_buf[start + x + dist] + 2) > >> 2; > + } > + buf.coef_buf[start + work_area.x - 2 * dist] += (buf.coef_buf[start + > work_area.x - 3 * dist] + > + buf.coef_buf[start + > work_area.x - dist] + 2) >> 2; > +} > \ No newline at end of file > diff --git a/libavcodec/vulkan/dwt_legall.comp > b/libavcodec/vulkan/dwt_legall.comp > new file mode 100644 > index 0000000000..1c640022f5 > --- /dev/null > +++ b/libavcodec/vulkan/dwt_legall.comp > @@ -0,0 +1,74 @@ > +#extension GL_EXT_scalar_block_layout : require > +#extension GL_EXT_debug_printf : require > + /* NOTE(review): buffer_reference is used below but GL_EXT_buffer_reference is not required in this file - confirm it is enabled elsewhere. */ > +#define TILE_DIM 16 > +#define LOCAL_X 256 > + > +layout(local_size_x = TILE_DIM, local_size_y = TILE_DIM, local_size_z = > 1) in; > + > +layout(scalar, buffer_reference, 
buffer_reference_align = 4) buffer > DwtCoef { > + int coef_buf[]; > +}; > + > +struct Plane { > + ivec2 dim; > + ivec2 dwt_dim; > +}; > + > +layout(push_constant, scalar) uniform ComputeInfo { > + int s; > + int plane_idx; > + int wavelet_depth; > + Plane planes[3]; > + DwtCoef dst_buf[3]; > + DwtCoef src_buf[3]; > +}; > + > +/* Rounds x up to a multiple of a; the mask form presumes a is a power of two. */ int align(int x, int a) { > + return (x + a - 1) & ~(a - 1); > +} > + > +/* NOTE(review): local_coef is declared but not referenced in main(). */ shared uint local_coef[LOCAL_X]; > + > +/* For each level i whose (1 << i) divides both coordinates, copies row coord.y from src to dst shifted left by one bit and runs the two horizontal lifting stages on it. NOTE(review): every invocation of a row rewrites that whole row, and mask is updated but never read afterwards - confirm whether this shader is still dispatched. */ void main() { > + ivec2 coord = ivec2(gl_GlobalInvocationID.xy); > + ivec2 work_area = planes[plane_idx].dwt_dim; > + ivec2 dwt_area = work_area >> 1; > + if (any(greaterThanEqual(coord, work_area))) { > + return; > + } > + > + DwtCoef src = src_buf[plane_idx]; > + DwtCoef dst = dst_buf[plane_idx]; > + int stride = align(planes[plane_idx].dwt_dim.x, 32); > + int start = stride * coord.y; > + > + for (int i = 0; i < wavelet_depth; i++) { > + ivec2 mask = ivec2((1 << i) - 1); > + if (any(notEqual(coord & mask, ivec2(0)))) { > + break; > + } > + > + mask <<= 1; > + mask |= 1; > + > + // Shift in one bit that is used for additional precision > + for (int x = 0; x < work_area.x; x++) { > + dst.coef_buf[start + x] = src.coef_buf[start + x] << 1; > + } > + > + // Lifting stage 2 > + for (int x = 0; x < dwt_area.x - 1; x++) { > + dst.coef_buf[start + 2 * x + 1] -= (dst.coef_buf[start + 2 * > x] + > + dst.coef_buf[start + 2 * > x + 2] + 1) >> 1; > + } > + dst.coef_buf[start + work_area.x - 1] -= (2 * dst.coef_buf[start > + work_area.x - 2] + 1) >> 1; > + > + // Lifting stage 1 > + dst.coef_buf[start] += (2 * dst.coef_buf[start + 1] + 2) >> 2; > + for (int x = 1; x <= dwt_area.x - 1; x++) { > + dst.coef_buf[start + 2 * x] += (dst.coef_buf[start + 2 * x - > 1] + > + dst.coef_buf[start + 2 * x + > 1] + 2) >> 2; > + } > + } > +} > \ No newline at end of file > diff --git a/libavcodec/vulkan/dwt_upload.comp > b/libavcodec/vulkan/dwt_upload.comp > new file mode 100644 > index 0000000000..943ebf23d7 > --- /dev/null > +++ 
b/libavcodec/vulkan/dwt_upload.comp
> @@ -0,0 +1,45 @@
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_EXT_shader_explicit_arithmetic_types : require
> +#extension GL_EXT_buffer_reference : require
> +
> +layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
> +
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};
> +
> +layout(scalar, buffer_reference, buffer_reference_align = 1) buffer PlaneBuf {
> +    uint8_t data[];
> +};
> +
> +struct Plane {
> +    ivec2 dim;
> +    ivec2 dwt_dim;
> +};
> +
> +layout(push_constant, scalar) uniform ComputeInfo {
> +    int s;
> +    int diff_offset;
> +    int level;
> +    Plane planes[3];
> +    DwtCoef pbuf[3];
> +};
> +
> +/* Rounds x up to a multiple of a; the mask form presumes a is a power of two */
> +int align(int x, int a) {
> +    return (x + a - 1) & ~(a - 1);
> +}
> +
> +/* Fills the coefficient buffer of plane gl_GlobalInvocationID.z: one
> + * invocation per coefficient inside dwt_dim, storing the source texel minus
> + * diff_offset. Rows are laid out with a stride of dwt_dim.x rounded up to 32.
> + * NOTE(review): plane_imgs is not declared in this file; presumably it is
> + * provided by the host-side shader setup - confirm. */
> +void main() {
> +    ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
> +    uint plane_idx = gl_GlobalInvocationID.z;
> +    ivec2 work_area = planes[plane_idx].dwt_dim;
> +    if (any(greaterThanEqual(coord, work_area))) {
> +        return;
> +    }
> +    int stride = align(planes[plane_idx].dwt_dim.x, 32);
> +    uint coef_idx = coord.y * stride + coord.x;
> +    /* clamp() is inclusive at both ends, so the upper bound has to be the
> +     * last valid texel (dim - 1) rather than dim itself; coordinates in the
> +     * padding then repeat the edge texel instead of reading outside the
> +     * image. Assumes dim is the plane size in texels - TODO confirm. */
> +    ivec2 coord_i = clamp(coord, ivec2(0), planes[plane_idx].dim - ivec2(1));
> +    uint texel = imageLoad(plane_imgs[plane_idx], coord_i).x;
> +    int result = int(texel - diff_offset);
> +    pbuf[plane_idx].coef_buf[coef_idx] = result;
> +}
> \ No newline at end of file
> diff --git a/libavcodec/vulkan/dwt_ver_legall.comp
> b/libavcodec/vulkan/dwt_ver_legall.comp
> new file mode 100644
> index 0000000000..6662bd656f
> --- /dev/null
> +++ b/libavcodec/vulkan/dwt_ver_legall.comp
> @@ -0,0 +1,55 @@
> +#extension GL_EXT_scalar_block_layout : require
> +#extension GL_EXT_buffer_reference : require
> +
> +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
> +
> +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer DwtCoef {
> +    int coef_buf[];
> +};
> +
> +struct Plane {
> +    ivec2 dim;
> +    ivec2 dwt_dim;
> +};
> +
> 
+layout(push_constant, scalar) uniform ComputeInfo { > + int s; > + int diff_offset; > + int level; > + Plane planes[3]; > + DwtCoef pbuf[3]; > +}; > + > +/* Rounds x up to a multiple of a; the mask form presumes a is a power of two. */ int align(int x, int a) { > + return (x + a - 1) & ~(a - 1); > +} > + > +/* In-place vertical lifting over column coord_x of plane gl_GlobalInvocationID.z, visiting every dist-th row (dist = 1 << level). Columns beyond dwt_dim.x or not a multiple of dist return immediately. NOTE(review): dwt_area is computed but not used. */ void main() { > + int coord_x = int(gl_GlobalInvocationID.x); > + uint plane_idx = gl_GlobalInvocationID.z; > + ivec2 work_area = planes[plane_idx].dwt_dim; > + int dist = 1 << level; > + if (coord_x >= work_area.x || (coord_x & (dist - 1)) != 0) { > + return; > + } > + > + DwtCoef buf = pbuf[plane_idx]; > + ivec2 dwt_area = work_area >> 1; > + int stride = align(planes[plane_idx].dwt_dim.x, 32); > + > + // Lifting stage 2 > + for (int y = dist; y < work_area.y - 2 * dist; y += 2 * dist) { > + buf.coef_buf[stride * y + coord_x] -= (buf.coef_buf[stride * (y - > dist) + coord_x] + > + buf.coef_buf[stride * (y + > dist) + coord_x] + 1) >> 1; > + } > + /* Last odd row has no row below it, so the row above is counted twice. */ buf.coef_buf[stride * (work_area.y - dist) + coord_x] -= (2 * > buf.coef_buf[stride * (work_area.y - 2 * dist) + coord_x] + 1) >> 1; > + > + // Lifting stage 1 > + /* Row 0 has no row above it, so the row below is counted twice. */ buf.coef_buf[coord_x] += (2 * buf.coef_buf[stride * dist + coord_x] + > 2) >> 2; > + for (int y = 2 * dist; y < work_area.y - 2 * dist; y += 2 * dist) { > + buf.coef_buf[stride * y + coord_x] += (buf.coef_buf[stride * (y + > dist) + coord_x] + > + buf.coef_buf[stride * (y - > dist) + coord_x] + 2) >> 2; > + } > + buf.coef_buf[stride * (work_area.y - 2 * dist) + coord_x] += > (buf.coef_buf[stride * (work_area.y - 3 * dist) + coord_x] + > + > buf.coef_buf[stride * (work_area.y - dist) + coord_x] + 2) >> 2; > +} > \ No newline at end of file > diff --git a/libavcodec/vulkan/encode.comp b/libavcodec/vulkan/encode.comp > new file mode 100644 > index 0000000000..892674c6da > --- /dev/null > +++ b/libavcodec/vulkan/encode.comp > @@ -0,0 +1,256 @@ > +#extension GL_EXT_shader_explicit_arithmetic_types : require > +#extension GL_EXT_scalar_block_layout : require > +#extension GL_EXT_buffer_reference : require > +#extension GL_EXT_debug_printf : 
require > + > +#define WORKGROUP_X 128 > +layout(local_size_x = WORKGROUP_X, local_size_y = 1, local_size_z = 1) in; > + > +#define MAX_DWT_LEVELS (5) > + > +/* Per-slice record read through the slice push constant. */ struct SliceArgs { > + int quant_idx; > + int bytes; > + int pb_start; > + int pad; > +}; > + > +struct Plane { > + ivec2 dim; > + ivec2 dwt_dim; > +}; > + > +layout(std430, buffer_reference, buffer_reference_align = 16) buffer > SliceArgBuf { > + SliceArgs args[]; > +}; > +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer > DwtCoef { > + int coef_buf[]; > +}; > +/* BitBuf, BitBufByte and BitBufQword expose the output buffer as 32-bit, 8-bit and 64-bit elements. */ layout(scalar, buffer_reference, buffer_reference_align = 1) buffer > BitBuf { > + uint data[]; > +}; > +layout(scalar, buffer_reference, buffer_reference_align = 1) buffer > BitBufByte { > + uint8_t data[]; > +}; > +layout(scalar, buffer_reference, buffer_reference_align = 1) buffer > BitBufQword { > + uint64_t data[]; > +}; > +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer > QuantLuts { > + int quant[5][4]; > + int ff_dirac_qscale_tab[116]; > +}; > + > +layout(push_constant, scalar) uniform ComputeInfo { > + DwtCoef plane_dat[3]; > + BitBuf pb; > + QuantLuts luts; > + SliceArgBuf slice; > + ivec2 num_slices; > + Plane planes[3]; > + int wavelet_depth; > + int size_scaler; > + int prefix_bytes; > +}; > + > +#define BUF_BITS 64 > +#define BUF_BYTES 4 > + > +/* Bit writer state: bit_buf collects pending bits, bit_left is the number of free bits in it, write_ptr and write_start are offsets into pb counted in 32-bit words. */ uint64_t bit_buf = 0; > +int bit_left = BUF_BITS; > +int write_start = 0; > +int write_ptr = 0; > + > +/* Reverses the byte order of a 64-bit value. */ uint64_t byteswap(uint64_t value) { > + u32vec2 val = unpack32(value); > + val.x = pack32(unpack8(val.x).wzyx); > + val.y = pack32(unpack8(val.y).wzyx); > + return pack64(val.yx); > +} > + > +/* Appends the n low bits of value. Once the accumulator is full it is stored byte-swapped at word offset write_ptr and write_ptr advances by two words. */ void put_bits(int n, int value) { > + if (n < bit_left) { > + bit_buf = (bit_buf << n) | value; > + bit_left -= n; > + } else { > + bit_buf <<= bit_left; > + bit_buf |= (value >> (n - bit_left)); > + uint64_t pbb = uint64_t(pb) + write_ptr * 4; > + BitBufQword(pbb).data[0] = byteswap(bit_buf); > + write_ptr += 2; > + bit_left += BUF_BITS - n; > + bit_buf = value; > + } > 
+}
> +
> +/* Pads the accumulator with zero bits up to the next byte boundary. The
> + * accumulator is only stored when that leaves no free byte in it. */
> +void flush_put_bits() {
> +    int bit_left_lo = (bit_left >> 3) << 3;
> +    bit_buf <<= (bit_left - bit_left_lo);
> +    if (bit_left_lo == 0) {
> +        uint64_t pbb = uint64_t(pb) + write_ptr * 4;
> +        BitBufQword(pbb).data[0] = byteswap(bit_buf);
> +        write_ptr += 2;
> +        bit_left = BUF_BITS;
> +        bit_buf = 0;
> +    } else {
> +        bit_left = bit_left_lo;
> +    }
> +}
> +
> +/* Bytes produced since write_start: stored 32-bit words plus the whole bytes
> + * still pending in the accumulator. */
> +int put_bytes_count() {
> +    return (write_ptr - write_start) * BUF_BYTES + ((BUF_BITS - bit_left) >> 3);
> +}
> +
> +/* Same as skip_put_bytes in put_bits.h but fills in 0xFF */
> +void skip_put_bytes(int n) {
> +    int bytes_left = bit_left >> 3;
> +    if (n < bytes_left) {
> +        int n_bits = n << 3;
> +        /* n_bits can be as large as 56, so the mask is built in 64 bits;
> +         * shifting a 32-bit 1 that far does not give a defined result. */
> +        uint64_t mask = (uint64_t(1) << n_bits) - uint64_t(1);
> +        bit_buf <<= n_bits;
> +        bit_buf |= mask;
> +        bit_left -= n_bits;
> +        return;
> +    }
> +    if (bit_left < BUF_BITS) {
> +        /* bit_left can be as large as 63 here, same reasoning as above */
> +        uint64_t mask = (uint64_t(1) << bit_left) - uint64_t(1);
> +        bit_buf <<= bit_left;
> +        bit_buf |= mask;
> +        uint64_t pbb = uint64_t(pb) + write_ptr * 4;
> +        BitBufQword(pbb).data[0] = byteswap(bit_buf);
> +        write_ptr += 2;
> +        n -= bit_left >> 3;
> +    }
> +    /* Whole 32-bit words of padding go straight to the buffer */
> +    int skip_dwords = n >> 2;
> +    while (skip_dwords > 0) {
> +        pb.data[write_ptr++] = 0xFFFFFFFF;
> +        skip_dwords--;
> +    }
> +    /* At most 3 bytes remain, so a 32-bit mask is sufficient here */
> +    int skip_bits = (n & 3) << 3;
> +    bit_buf = (1 << skip_bits) - 1;
> +    bit_left = BUF_BITS - skip_bits;
> +}
> +
> +/* Writes val as a variable-length code of 2 * bits + 1 bits, where bits is
> + * the number of bits of (val + 1) below its top bit; zero is a single 1 bit.
> + * NOTE(review): the code word is assembled in a 32-bit int, so it only fits
> + * while bits is at most 15 - confirm the coefficient range. */
> +void put_vc2_ue_uint(uint val) {
> +    int pbits = 0, topbit = 1, maxval = 1, bits = 0;
> +    if (val == 0) {
> +        put_bits(1, 1);
> +        return;
> +    }
> +    val++;
> +
> +    while (val > maxval) {
> +        topbit <<= 1;
> +        bits++;
> +        maxval <<= 1;
> +        maxval |= 1;
> +    }
> +
> +    for (int i = 0; i < bits; i++) {
> +        topbit >>= 1;
> +        pbits <<= 2;
> +        if ((val & topbit) != 0) {
> +            pbits |= 1;
> +        }
> +    }
> +
> +    put_bits(bits * 2 + 1, (pbits << 1) | 1);
> +}
> +
> +/* Rounds x up to a multiple of a; the mask form presumes a is a power of two */
> +int align(int x, int a) {
> +    return (x + a - 1) & ~(a - 1);
> +}
> +
> +int quants[MAX_DWT_LEVELS][4];
> +
> +/* Maps a position inside a subband to a coordinate of the coefficient plane:
> + * ((index << 1) | h) << (wavelet_depth - lvl - 1). */
> +int subband_coord(int index, int h, int lvl) {
> +    int coord = index;
> +    coord <<= 1;
> +    coord |= h;
> +    coord <<= (wavelet_depth-lvl-1);
> + return coord; > +} > + > +/* Encodes one slice per invocation: an 8-bit quantiser index, then for each of the 3 planes a one-byte placeholder followed by the quantised coefficients of every subband. */ void main() { > + int slice_index = int(gl_GlobalInvocationID.x); > + int max_index = num_slices.x * num_slices.y; > + if (slice_index >= max_index) { > + return; > + } > + > + /* Step 2. Quantize and encode */ > + int pb_start = slice.args[slice_index].pb_start; > + /* Add pb_start + bytes of the last slice of every preceding workgroup. */ for (int i = 0, index = WORKGROUP_X - 1; i < gl_WorkGroupID.x; i++) { > + pb_start += slice.args[index].pb_start + slice.args[index].bytes; > + index += WORKGROUP_X; > + } > + ivec2 slice_coord = ivec2(slice_index % num_slices.x, slice_index / > num_slices.x); > + /* The bit writer counts in 32-bit words. */ write_ptr = (pb_start >> 2); > + write_start = write_ptr; > + > + int slice_bytes_max = slice.args[slice_index].bytes; > + int quant_index = slice.args[slice_index].quant_idx; > + > + /* Per-subband quantiser: slice index minus the table offset, floored at 0. */ for (int level = 0; level < wavelet_depth; level++) > + for (int orientation = int(level > 0); orientation < 4; > orientation++) > + quants[level][orientation] = max(quant_index - > luts.quant[level][orientation], 0); > + > + /* Write quant index for this slice */ > + put_bits(8, quant_index); > + > + /* Luma + 2 Chroma planes */ > + for (int p = 0; p < 3; p++) { > + int pad_s, pad_c; > + int bytes_start = put_bytes_count(); > + > + /* Save current location and write a zero value */ > + int write_ptr_start = write_ptr; > + int bit_left_start = bit_left; > + put_bits(8, 0); > + for (int level = 0; level < wavelet_depth; level++) { > + ivec2 band_size = planes[p].dwt_dim >> (wavelet_depth - > level); > + for (int o = int(level > 0); o < 4; o++) { > + /* Encode subband */ > + int left = band_size.x * (slice_coord.x) / num_slices.x; > + int right = band_size.x * (slice_coord.x+1) / > num_slices.x; > + int top = band_size.y * (slice_coord.y) / num_slices.y; > + int bottom = band_size.y * (slice_coord.y+1) / > num_slices.y; > + > + const int q_idx = quants[level][o]; > + const int qfactor = luts.ff_dirac_qscale_tab[q_idx]; > + > + /* Bit 1 of o selects the odd row phase, bit 0 the odd column phase. */ const int yh = o >> 1; > + const int xh = o & 1; > + > + int stride = align(planes[p].dwt_dim.x, 32); > + for 
(int y = top; y < bottom; y++) { > + for (int x = left; x < right; x++) { > + int sx = subband_coord(x, xh, level); > + int sy = subband_coord(y, yh, level); > + int coef = plane_dat[p].coef_buf[sy * stride + > sx]; > + /* Magnitude scaled by 4 / qfactor; a sign bit follows only for non-zero results. */ uint c_abs = uint(abs(coef)); > + c_abs = (c_abs << 2) / qfactor; > + put_vc2_ue_uint(c_abs); > + if (c_abs != 0) { > + put_bits(1, int(coef < 0)); > + } > + } > + } > + } > + } > + flush_put_bits(); > + /* Length of the plane payload, not counting the size byte. */ int bytes_len = put_bytes_count() - bytes_start - 1; > + if (p == 2) { > + /* The last plane also absorbs what is left of the slice budget. */ int len_diff = slice_bytes_max - put_bytes_count(); > + pad_s = align((bytes_len + len_diff), > size_scaler)/size_scaler; > + pad_c = (pad_s*size_scaler) - bytes_len; > + } else { > + pad_s = align(bytes_len, size_scaler)/size_scaler; > + pad_c = (pad_s*size_scaler) - bytes_len; > + } > + /* NOTE(review): the size byte is stored straight to memory; if the accumulator still holds the placeholder written by put_bits(8, 0), a later flush would overwrite it - confirm this cannot happen for short planes. */ BitBufByte pbb = BitBufByte(pb); > + int start_ptr = (write_ptr_start << 2) + ((BUF_BITS - > bit_left_start) >> 3); > + pbb.data[start_ptr] = uint8_t(pad_s); > + /* vc2-reference uses that padding that decodes to '0' coeffs */ > + skip_put_bytes(pad_c); > + } > +} > \ No newline at end of file > diff --git a/libavcodec/vulkan/slice_sizes.comp > b/libavcodec/vulkan/slice_sizes.comp > new file mode 100644 > index 0000000000..a965e911c7 > --- /dev/null > +++ b/libavcodec/vulkan/slice_sizes.comp > @@ -0,0 +1,184 @@ > +#extension GL_EXT_shader_explicit_arithmetic_types : require > +#extension GL_EXT_scalar_block_layout : require > +#extension GL_EXT_buffer_reference : require > + > +#define WORKGROUP_X 128 > +layout(local_size_x = WORKGROUP_X, local_size_y = 1, local_size_z = 1) in; > + > +#define DIRAC_MAX_QUANT_INDEX 116 > +#define MAX_DWT_LEVELS 5 > + > +struct SliceArgs { > + int quant_idx; > + int bytes; > + int pb_start; > + int pad; > +}; > + > +struct Plane { > + ivec2 dim; > + ivec2 dwt_dim; > +}; > + > +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer > DwtCoef { > + int coef_buf[]; > +}; > + > +layout(std430, buffer_reference) buffer SliceArgBuf { > + SliceArgs 
args[]; > +}; > + > +layout(scalar, buffer_reference, buffer_reference_align = 4) buffer > QuantLuts { > + int quant[5][4]; > + int ff_dirac_qscale_tab[116]; > +}; > + > +layout(push_constant, scalar) uniform ComputeInfo { > + DwtCoef plane_dat[3]; > + QuantLuts luts; > + SliceArgBuf slice; > + ivec2 num_slices; > + Plane planes[3]; > + int wavelet_depth; > + int size_scaler; > + int prefix_bytes; > + int bits_ceil; > + int bits_floor; > +}; > + > +/* Bit length of the variable-length code for val: 1 when val is 0, otherwise 2 * bits + 1 with bits = floor(log2(val + 1)). NOTE(review): topbit is updated but never read. */ int count_vc2_ue_uint(uint val) { > + uint topbit = 1, maxval = 1; > + int bits = 0; > + if (val == 0) { > + return 1; > + } > + val++; > + while (val > maxval) { > + bits++; > + topbit <<= 1; > + maxval <<= 1; > + maxval |= 1; > + } > + return bits * 2 + 1; > +} > + > +/* Rounds x up to a multiple of a; the mask form presumes a is a power of two. */ int ffalign(int x, int a) { > + return (x + a - 1) & ~(a - 1); > +} > + > +/* cache[q] holds the bit count already computed for quantiser index q, 0 meaning not computed yet. */ int cache[DIRAC_MAX_QUANT_INDEX]; > +int quants[MAX_DWT_LEVELS][4]; > +/* One slot per invocation of the workgroup, indexed by gl_LocalInvocationIndex in main(). */ shared int slice_sizes[WORKGROUP_X]; > + > +/* Maps a position inside a subband to a coordinate of the coefficient plane: ((index << 1) | h) << (wavelet_depth - lvl - 1). */ int subband_coord(int index, int h, int lvl) { > + int coord = index; > + coord <<= 1; > + coord |= h; > + coord <<= (wavelet_depth-lvl-1); > + return coord; > +} > + > +/* Size in bits of this invocation's slice when quantised with quant_index; the result is memoised in cache[]. */ int count_hq_slice(int quant_index) { > + int bits = 0; > + if (cache[quant_index] != 0) { > + return cache[quant_index]; > + } > + > + bits += 8*prefix_bytes; > + bits += 8; /* quant_idx */ > + > + for (int level = 0; level < wavelet_depth; level++) > + for (int orientation = int(level > 0); orientation < 4; > orientation++) > + quants[level][orientation] = max(quant_index - > luts.quant[level][orientation], 0); > + > + int slice_index = int(gl_GlobalInvocationID.x); > + ivec2 slice_coord = ivec2(slice_index % num_slices.x, slice_index / > num_slices.x); > + for (int p = 0; p < 3; p++) { > + int bytes_start = bits >> 3; > + /* One byte per plane for its size field. */ bits += 8; > + for (int level = 0; level < wavelet_depth; level++) { > + ivec2 band_dim = planes[p].dwt_dim >> (wavelet_depth - level); > + for (int o = int(level > 0); o < 4; o++) { > + const int left = band_dim.x * slice_coord.x / > num_slices.x; > + const int right = band_dim.x 
* (slice_coord.x+1) / num_slices.x;
> +                const int top = band_dim.y * slice_coord.y / num_slices.y;
> +                const int bottom = band_dim.y * (slice_coord.y+1) / num_slices.y;
> +
> +                const int q_idx = quants[level][o];
> +                const int qfactor = luts.ff_dirac_qscale_tab[q_idx];
> +
> +                const int yh = o >> 1;
> +                const int xh = o & 1;
> +
> +                const int stride = ffalign(planes[p].dwt_dim.x, 32);
> +                for (int y = top; y < bottom; y++) {
> +                    for (int x = left; x < right; x++) {
> +                        int sx = subband_coord(x, xh, level);
> +                        int sy = subband_coord(y, yh, level);
> +                        int coef = plane_dat[p].coef_buf[sy * stride + sx];
> +                        uint c_abs = uint(abs(coef));
> +                        c_abs = (c_abs << 2) / qfactor;
> +                        bits += count_vc2_ue_uint(c_abs);
> +                        /* Sign bit, only present for non-zero values */
> +                        bits += int(c_abs > 0);
> +                    }
> +                }
> +            }
> +        }
> +        /* Round up to whole bytes, then to a multiple of size_scaler */
> +        bits += ffalign(bits, 8) - bits;
> +        int bytes_len = (bits >> 3) - bytes_start - 1;
> +        int pad_s = ffalign(bytes_len, size_scaler) / size_scaler;
> +        int pad_c = (pad_s * size_scaler) - bytes_len;
> +        bits += pad_c * 8;
> +    }
> +
> +    cache[quant_index] = bits;
> +    return bits;
> +}
> +
> +/* Rounds a byte count up to a multiple of size_scaler and adds 4 plus
> + * prefix_bytes on top. */
> +int ssize_round(int b) {
> +    return ffalign(b, size_scaler) + 4 + prefix_bytes;
> +}
> +
> +/* Picks a quantiser index for each slice so that its size falls between
> + * bits_floor and bits_ceil, stores the index and the resulting byte size,
> + * and stores in pb_start the summed size of the preceding slices of the same
> + * workgroup. */
> +void main() {
> +    int slice_index = int(gl_GlobalInvocationID.x);
> +    int max_index = num_slices.x * num_slices.y;
> +    /* barrier() below has to be reached by every invocation of the
> +     * workgroup, so surplus invocations of the last workgroup are masked out
> +     * instead of returning early. They contribute a size of 0. */
> +    bool valid = slice_index < max_index;
> +    int bytes = 0;
> +    if (valid) {
> +        for (int i = 0; i < DIRAC_MAX_QUANT_INDEX; i++) {
> +            cache[i] = 0;
> +        }
> +        const int q_ceil = DIRAC_MAX_QUANT_INDEX;
> +        const int top = bits_ceil;
> +        const int bottom = bits_floor;
> +        /* The two most recently tried indices, used to detect oscillation */
> +        int quant_buf[2] = int[2](-1, -1);
> +        int quant = slice.args[slice_index].quant_idx;
> +        int step = 1;
> +        int bits_last = 0;
> +        int bits = count_hq_slice(quant);
> +        while ((bits > top) || (bits < bottom)) {
> +            const int signed_step = bits > top ? +step : -step;
> +            quant = clamp(quant + signed_step, 0, q_ceil-1);
> +            bits = count_hq_slice(quant);
> +            if (quant_buf[1] == quant) {
> +                /* Bouncing between two indices: keep the larger one */
> +                quant = max(quant_buf[0], quant);
> +                bits = quant == quant_buf[0] ? bits_last : bits;
> +                break;
> +            }
> +            step = clamp(step / 2, 1, (q_ceil - 1) / 2);
> +            quant_buf[1] = quant_buf[0];
> +            quant_buf[0] = quant;
> +            bits_last = bits;
> +        }
> +        bytes = ssize_round(bits >> 3);
> +        slice.args[slice_index].quant_idx = clamp(quant, 0, q_ceil-1);
> +        slice.args[slice_index].bytes = bytes;
> +    }
> +    slice_sizes[gl_LocalInvocationIndex] = bytes;
> +    barrier();
> +    if (valid) {
> +        /* Prefix sum for all slices in current workgroup */
> +        int total_bytes = 0;
> +        for (int i = 0; i < gl_LocalInvocationIndex; i++) {
> +            total_bytes += slice_sizes[i];
> +        }
> +        slice.args[slice_index].pb_start = total_bytes;
> +    }
> +}
> \ No newline at end of file
> --
> 2.47.0
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".