This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit d66552e676317308473ddb5049ad837d89e3742d Author: Lynne <[email protected]> AuthorDate: Tue May 26 11:38:29 2026 +0900 Commit: Lynne <[email protected]> CommitDate: Sat May 30 12:10:01 2026 +0900 vulkan/ffv1: add 32-bit float RGB encoding and a rice + remap path This implements 32-bit float RGB encoding and makes the Vulkan implementation on-par with the C implementation. Sponsored-by: Sovereign Tech Fund --- libavcodec/ffv1_vulkan.h | 1 + libavcodec/ffv1enc_vulkan.c | 153 +++++++++++++++++++-- libavcodec/vulkan/Makefile | 4 +- libavcodec/vulkan/ffv1_common.glsl | 1 + libavcodec/vulkan/ffv1_enc.comp.glsl | 27 +++- ...mp.glsl => ffv1_enc_rgb_float_golomb.comp.glsl} | 2 + libavcodec/vulkan/ffv1_enc_setup.comp.glsl | 127 ++++++++++++++++- libavcodec/vulkan/ffv1_enc_sort32.comp.glsl | 153 +++++++++++++++++++++ 8 files changed, 444 insertions(+), 24 deletions(-) diff --git a/libavcodec/ffv1_vulkan.h b/libavcodec/ffv1_vulkan.h index 9a206afaca..d6ae0f3fee 100644 --- a/libavcodec/ffv1_vulkan.h +++ b/libavcodec/ffv1_vulkan.h @@ -48,6 +48,7 @@ typedef struct FFv1ShaderParams { int sar[2]; int pic_mode; uint32_t slice_size_max; + uint32_t max_pixels_per_slice; } FFv1ShaderParams; #endif /* AVCODEC_FFV1_VULKAN_H */ diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c index 92d46f7ddf..7c22ced785 100644 --- a/libavcodec/ffv1enc_vulkan.c +++ b/libavcodec/ffv1enc_vulkan.c @@ -72,6 +72,7 @@ typedef struct VulkanEncodeFFv1Context { FFVulkanShader rct_search; FFVulkanShader remap; + FFVulkanShader sort32; FFVulkanShader setup; FFVulkanShader reset; FFVulkanShader enc; @@ -101,6 +102,8 @@ typedef struct VulkanEncodeFFv1Context { int optimize_rct; int is_rgb; + int is_float32; + uint32_t max_pixels_per_slice; int ppi; int chunks; } VulkanEncodeFFv1Context; @@ -141,6 +144,12 @@ extern const unsigned int ff_ffv1_enc_remap_comp_spv_len; extern const unsigned char ff_ffv1_enc_rgb_float_comp_spv_data[]; extern const unsigned int ff_ffv1_enc_rgb_float_comp_spv_len; +extern const unsigned char ff_ffv1_enc_rgb_float_golomb_comp_spv_data[]; +extern const unsigned int ff_ffv1_enc_rgb_float_golomb_comp_spv_len; + +extern const unsigned char ff_ffv1_enc_sort32_comp_spv_data[]; +extern const unsigned int ff_ffv1_enc_sort32_comp_spv_len; + static int run_rct_search(AVCodecContext *avctx, FFVkExecContext *exec, AVFrame *enc_in, VkImageView *enc_in_views, FFVkBuffer *slice_data_buf, uint32_t slice_data_size, @@ -203,6 +212,37 @@ static int run_remap(AVCodecContext *avctx, FFVkExecContext *exec, return 0; } +static int run_sort32(AVCodecContext *avctx, FFVkExecContext *exec, + AVFrame *enc_in, VkImageView *enc_in_views, + FFVkBuffer *units_buf, uint32_t units_size, + FFv1ShaderParams *pd) +{ + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFV1Context *f = &fv->ctx; + FFVulkanFunctions *vk = &fv->s.vkfn; + + /* Update descriptors */ + ff_vk_shader_update_img_array(&fv->s, exec, &fv->sort32, + enc_in, enc_in_views, + 1, 1, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->sort32, + 1, 2, 0, + units_buf, + 0, units_size*f->slice_count, + VK_FORMAT_UNDEFINED); + + ff_vk_exec_bind_shader(&fv->s, exec, &fv->sort32); + ff_vk_shader_update_push_const(&fv->s, exec, &fv->sort32, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(FFv1ShaderParams), pd); + + vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1); + + return 0; +} + static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, FFVkExecContext *exec, const AVFrame *pict) @@ -279,15 +319,19 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, slice_data_buf = (FFVkBuffer *)slice_data_ref->data; if (f->remap_mode) { - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fv->s.frames->sw_format); - remap_data_size = 4*(1 << desc->comp[0].depth)*sizeof(uint32_t); + if (fv->is_float32) { + /* Per (slice, plane): [units : max_pixels*2 uints] + [bitmap : max_pixels uints]. */ + remap_data_size = 4*fv->max_pixels_per_slice*3*sizeof(uint32_t); + } else { + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fv->s.frames->sw_format); + remap_data_size = 4*(1 << desc->comp[0].depth)*sizeof(uint32_t); + } RET(ff_vk_get_pooled_buffer(&fv->s, &fv->remap_data_pool, &remap_data_ref, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, NULL, remap_data_size*f->slice_count, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); - remap_data_buf = (FFVkBuffer *)remap_data_ref->data; } @@ -348,6 +392,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, .pic_mode = !(pict->flags & AV_FRAME_FLAG_INTERLACED) ? 3 : !(pict->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 2 : 1, .slice_size_max = out_data_buf->size / f->slice_count, + .max_pixels_per_slice = fv->max_pixels_per_slice, }; for (int i = 0; i < f->quant_table_count; i++) { @@ -420,8 +465,13 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, } if (f->remap_mode) { - RET(run_remap(avctx, exec, src, src_views, - remap_data_buf, remap_data_size, &pd)); + if (fv->is_float32) { + RET(run_sort32(avctx, exec, src, src_views, + remap_data_buf, remap_data_size, &pd)); + } else { + RET(run_remap(avctx, exec, src, src_views, + remap_data_buf, remap_data_size, &pd)); + } /* Make sure the writes are visible to the setup shader */ ff_vk_buf_barrier(buf_bar[nb_buf_bar++], remap_data_buf, @@ -519,6 +569,14 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, 0, slice_data_size*f->slice_count); + + /* Setup writes the per-pixel compact_idx (or compact_idx-of-value) + * back into the remap buffer; the encode shader reads it. */ + if (f->remap_mode) + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], remap_data_buf, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR, + 0, remap_data_size*f->slice_count); if (f->key_frame || fv->force_pcm) ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf, COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, @@ -906,6 +964,54 @@ fail: return err; } +static int init_sort32_shader(AVCodecContext *avctx, VkSpecializationInfo *sl) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFVulkanShader *shd = &fv->sort32; + + uint32_t wg_x = FFMIN(fv->max_pixels_per_slice, 256); + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl, + (uint32_t []) { wg_x, 1, 1 }, 0); + + ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1ShaderParams), + VK_SHADER_STAGE_COMPUTE_BIT); + + const FFVulkanDescriptorSetBinding desc_set_const[] = { + { /* rangecoder_buf */ + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set_const, 1, 1, 0); + + const FFVulkanDescriptorSetBinding desc_set[] = { + { /* slice_data_buf */ + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { /* src */ + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .elems = av_pix_fmt_count_planes(fv->s.frames->sw_format), + }, + { /* units */ + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0); + + RET(ff_vk_shader_link(&fv->s, shd, + ff_ffv1_enc_sort32_comp_spv_data, + ff_ffv1_enc_sort32_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd)); + +fail: + return err; +} + static int init_remap_shader(AVCodecContext *avctx, VkSpecializationInfo *sl) { int err; @@ -1105,9 +1211,14 @@ static int init_encode_shader(AVCodecContext *avctx, VkSpecializationInfo *sl) 4 + fv->is_rgb + !!f->remap_mode, 0, 0); if (f->remap_mode) { - ff_vk_shader_link(&fv->s, shd, - ff_ffv1_enc_rgb_float_comp_spv_data, - ff_ffv1_enc_rgb_float_comp_spv_len, "main"); + if (fv->ctx.ac == AC_GOLOMB_RICE) + ff_vk_shader_link(&fv->s, shd, + ff_ffv1_enc_rgb_float_golomb_comp_spv_data, + ff_ffv1_enc_rgb_float_golomb_comp_spv_len, "main"); + else + ff_vk_shader_link(&fv->s, shd, + ff_ffv1_enc_rgb_float_comp_spv_data, + ff_ffv1_enc_rgb_float_comp_spv_len, "main"); } else if (fv->ctx.ac == AC_GOLOMB_RICE) { if (fv->is_rgb) ff_vk_shader_link(&fv->s, shd, @@ -1304,6 +1415,26 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) fv->is_rgb = !(f->colorspace == 0 && avctx->sw_pix_fmt != AV_PIX_FMT_YA8) && !(avctx->sw_pix_fmt == AV_PIX_FMT_YA8); + fv->is_float32 = (avctx->sw_pix_fmt == AV_PIX_FMT_GBRPF32 || + avctx->sw_pix_fmt == AV_PIX_FMT_GBRAPF32); + + if (fv->is_float32) { + /* Compute the worst-case slice geometry. With version >= 4 the slice + * boundaries are computed via slice_coord() which rounds up, so any + * single slice has at most ceil(width/num_h_slices) * ceil(height/num_v_slices) + * pixels. */ + uint32_t mw = (avctx->width + f->num_h_slices - 1) / f->num_h_slices; + uint32_t mh = (avctx->height + f->num_v_slices - 1) / f->num_v_slices; + /* Round up to next pow2 for bitonic sort */ + uint32_t n = 1; + uint32_t pn = mw*mh; + while (n < pn) + n <<= 1; + if (n < 2) + n = 2; + fv->max_pixels_per_slice = n; + } + /* Init rct search shader */ fv->optimize_rct = fv->is_rgb && f->version >= 4 && !fv->force_pcm && fv->optimize_rct; @@ -1325,7 +1456,10 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) } if (f->remap_mode) { - err = init_remap_shader(avctx, sl); + if (fv->is_float32) + err = init_sort32_shader(avctx, sl); + else + err = init_remap_shader(avctx, sl); if (err < 0) return err; } @@ -1420,6 +1554,7 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx) ff_vk_shader_free(&fv->s, &fv->reset); ff_vk_shader_free(&fv->s, &fv->setup); ff_vk_shader_free(&fv->s, &fv->remap); + ff_vk_shader_free(&fv->s, &fv->sort32); ff_vk_shader_free(&fv->s, &fv->rct_search); if (fv->exec_ctx_info) { diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index c6817967c7..f86931727d 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -13,7 +13,9 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/ffv1_enc_setup.comp.spv.o \ vulkan/ffv1_enc_rgb_golomb.comp.spv.o \ vulkan/ffv1_enc_rct_search.comp.spv.o \ vulkan/ffv1_enc_remap.comp.spv.o \ - vulkan/ffv1_enc_rgb_float.comp.spv.o + vulkan/ffv1_enc_rgb_float.comp.spv.o \ + vulkan/ffv1_enc_rgb_float_golomb.comp.spv.o \ + vulkan/ffv1_enc_sort32.comp.spv.o OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/ffv1_dec_setup.comp.spv.o \ vulkan/ffv1_dec_reset.comp.spv.o \ diff --git a/libavcodec/vulkan/ffv1_common.glsl b/libavcodec/vulkan/ffv1_common.glsl index 8580a0777f..3d3b6753c6 100644 --- a/libavcodec/vulkan/ffv1_common.glsl +++ b/libavcodec/vulkan/ffv1_common.glsl @@ -75,6 +75,7 @@ layout (push_constant, scalar) uniform pushConstants { ivec2 sar; int pic_mode; uint slice_size_max; + uint max_pixels_per_slice; }; #include "rangecoder.glsl" diff --git a/libavcodec/vulkan/ffv1_enc.comp.glsl b/libavcodec/vulkan/ffv1_enc.comp.glsl index 90ce8293b9..1c30e91828 100644 --- a/libavcodec/vulkan/ffv1_enc.comp.glsl +++ b/libavcodec/vulkan/ffv1_enc.comp.glsl @@ -40,8 +40,8 @@ layout (set = 1, binding = 1, scalar) writeonly buffer slice_results_buf { * denormals before we get to look at them. */ layout (set = 1, binding = 3) uniform uimage2D src[]; #ifdef FLOAT -layout (set = 1, binding = 5) readonly buffer fltmap_buf { - uint fltmap[][4][65536]; +layout (set = 1, binding = 5, scalar) readonly buffer fltmap_buf { + uint fltmap[]; }; #endif @@ -239,11 +239,24 @@ ivec4 load_components(uint slice_idx, in SliceContext sc, ivec2 pos) { ivec4 pix; #ifdef FLOAT - /* Source view is r16_uint so imageLoad returns the raw fp16 bit pattern - * in .x; no conversion is performed and denormals survive. */ - for (int i = 0; i < color_planes; i++) { - uint iv = imageLoad(src[i], pos)[0] & 0xFFFFu; - pix[i] = int(fltmap[slice_idx][i][iv]); + if (c_bits >= 32) { + /* 32-bit float: per-pixel-position bitmap lookup. The bitmap region + * follows the units region in the same buffer. */ + ivec2 rel = pos - sc.slice_pos; + uint pixel_idx = uint(rel.x + sc.slice_dim.x*rel.y); + uint plane_stride = max_pixels_per_slice*3u; + for (int i = 0; i < color_planes; i++) { + uint base = (slice_idx*4u + uint(i))*plane_stride + + max_pixels_per_slice*2u; + pix[i] = int(fltmap[base + pixel_idx]); + } + } else { + /* 16-bit float: value-indexed lookup. Source view is r16_uint so + * imageLoad returns the raw fp16 bit pattern in .x. */ + for (int i = 0; i < color_planes; i++) { + uint iv = imageLoad(src[i], pos)[0] & 0xFFFFu; + pix[i] = int(fltmap[(slice_idx*4u + uint(i))*65536u + iv]); + } } #else pix = ivec4(imageLoad(src[0], pos)); diff --git a/libavcodec/vulkan/ffv1_enc_rgb_float.comp.glsl b/libavcodec/vulkan/ffv1_enc_rgb_float_golomb.comp.glsl similarity index 96% copy from libavcodec/vulkan/ffv1_enc_rgb_float.comp.glsl copy to libavcodec/vulkan/ffv1_enc_rgb_float_golomb.comp.glsl index c66440601a..e4535eb08f 100644 --- a/libavcodec/vulkan/ffv1_enc_rgb_float.comp.glsl +++ b/libavcodec/vulkan/ffv1_enc_rgb_float_golomb.comp.glsl @@ -26,6 +26,8 @@ layout (set = 1, binding = 4) uniform uimage2D tmp; +#define PB_UNALIGNED +#define GOLOMB #define FLOAT #define RGB #include "ffv1_enc.comp.glsl" diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp.glsl b/libavcodec/vulkan/ffv1_enc_setup.comp.glsl index 53a8d7f13f..f1db2aed8a 100644 --- a/libavcodec/vulkan/ffv1_enc_setup.comp.glsl +++ b/libavcodec/vulkan/ffv1_enc_setup.comp.glsl @@ -23,13 +23,13 @@ #pragma shader_stage(compute) #extension GL_GOOGLE_include_directive : require -#define NB_CONTEXTS 2 +#define NB_CONTEXTS 6 #define FULL_RENORM #include "common.glsl" #include "ffv1_common.glsl" -layout (set = 1, binding = 1) buffer fltmap_buf { - uint fltmap[][4][65536]; +layout (set = 1, binding = 1, scalar) buffer fltmap_buf { + uint fltmap[]; }; void init_slice(inout SliceContext sc, uint slice_idx) @@ -81,6 +81,7 @@ void encode_histogram_remap(uint slice_idx, inout SliceContext sc) const int flip = (remap_mode == 2) ? 0x7FFF : 0; for (int p = 0; p < color_planes; p++) { + const uint base = (slice_idx*4u + uint(p))*65536u; uint j = 0; uint lu = 0; int run = 0; @@ -90,15 +91,15 @@ void encode_histogram_remap(uint slice_idx, inout SliceContext sc) put_usymbol(0, 0); - for (int i = 0; i < NB_CONTEXTS; i++) + for (int i = 0; i < NB_CONTEXTS*CONTEXT_SIZE; i++) rc_state[i] = uint8_t(128); int cnt = 0; for (int i = 0; i < rct_offset; i++) { int ri = i ^ (((i & 0x8000) != 0) ? 0 : flip); - uint u = uint(fltmap[slice_idx][p][ri] != 0); + uint u = uint(fltmap[base + uint(ri)] != 0u); - fltmap[slice_idx][p][ri] = uint16_t(j); + fltmap[base + uint(ri)] = j; j += u; if (lu == u) { @@ -117,6 +118,115 @@ void encode_histogram_remap(uint slice_idx, inout SliceContext sc) } } +/* The 32-bit float remap uses 6 contexts: state[lu][category][bit] with + * lu = 0,1 and category = 0 (run/step-1), 1 (delta, unused here), 2 (mul). */ +#define CTX_F32(lu, cat) ((uint(lu)*3u + uint(cat))*CONTEXT_SIZE) + +void encode_float32_remap(uint slice_idx, inout SliceContext sc) +{ + const uint slice_w = uint(sc.slice_dim.x); + const uint slice_h = uint(sc.slice_dim.y); + const uint pixel_num = slice_w * slice_h; + const uint plane_stride = max_pixels_per_slice*3u; + + for (int p = 0; p < color_planes; p++) { + /* Layout: per (slice, plane) we have units (max_pixels*8 bytes) + * followed by bitmap (max_pixels*4 bytes). The units region is + * read-only here, the bitmap region is written. */ + const uint plane_base = (slice_idx*4u + uint(p))*plane_stride; + const uint bitmap_base = plane_base + max_pixels_per_slice*2u; + + for (int i = 0; i < NB_CONTEXTS*CONTEXT_SIZE; i++) + rc_state[i] = uint8_t(128); + + put_usymbol(1, CTX_F32(0, 0)); + + for (int i = 0; i < NB_CONTEXTS*CONTEXT_SIZE; i++) + rc_state[i] = uint8_t(128); + + /* last_val is the last unique value (or 0xFFFFFFFF as the "before + * any value" sentinel, this lets step = val - last_val give val+1 + * for the first emission via unsigned wraparound). */ + uint last_val = 0xFFFFFFFFu; + uint lu = 0; + uint run = 0; + int ci = -1; + bool emit_first_mul = true; + + for (uint i = 0; i < pixel_num; i++) { + uint u_val = fltmap[plane_base + 2u*i + 0u]; + uint u_ndx = fltmap[plane_base + 2u*i + 1u]; + + /* Duplicate of the previous unique value? Reuse ci. */ + if (i > 0u && last_val == u_val) { + fltmap[bitmap_base + u_ndx] = uint(ci); + continue; + } + + uint step = u_val - last_val; + + if (lu == 0u) { + put_usymbol(step - 1u, CTX_F32(0, 0)); + + if (emit_first_mul) { + put_usymbol(1, CTX_F32(0, 2)); + emit_first_mul = false; + } + + last_val = u_val; + if (step == 1u) { + lu = 1; + run = 0; + } + } else { + if (step == 1u) { + run++; + last_val = u_val; + } else { + if (run > 0u) { + put_usymbol(run, CTX_F32(1, 0)); + put_usymbol(0, CTX_F32(1, 0)); + last_val += 2u; + } else { + put_usymbol(0, CTX_F32(1, 0)); + last_val += 1u; + } + lu = 0; + run = 0; + + step = u_val - last_val; + put_usymbol(step - 1u, CTX_F32(0, 0)); + + last_val = u_val; + if (step == 1u) { + lu = 1; + run = 0; + } + } + } + + ci++; + fltmap[bitmap_base + u_ndx] = uint(ci); + } + + if (lu == 1u) { + if (run > 0u) { + put_usymbol(run, CTX_F32(1, 0)); + put_usymbol(0, CTX_F32(1, 0)); + last_val += 2u; + } else { + put_usymbol(0, CTX_F32(1, 0)); + last_val += 1u; + } + } + + if (last_val != 0xFFFFFFFFu) + put_usymbol(0xFFFFFFFFu - last_val, CTX_F32(0, 0)); + + sc.remap_count[p] = ci + 1; + } +} + void write_slice_header(uint slice_idx, inout SliceContext sc) { [[unroll]] @@ -149,7 +259,10 @@ void write_slice_header(uint slice_idx, inout SliceContext sc) if (remap_mode != 0) { put_usymbol(remap_mode, 0); - encode_histogram_remap(slice_idx, sc); + if (c_bits >= 32) + encode_float32_remap(slice_idx, sc); + else + encode_histogram_remap(slice_idx, sc); } } } diff --git a/libavcodec/vulkan/ffv1_enc_sort32.comp.glsl b/libavcodec/vulkan/ffv1_enc_sort32.comp.glsl new file mode 100644 index 0000000000..4d40d94577 --- /dev/null +++ b/libavcodec/vulkan/ffv1_enc_sort32.comp.glsl @@ -0,0 +1,153 @@ +/* + * FFv1 codec + * + * Copyright (c) 2026 Lynne <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#pragma shader_stage(compute) +#extension GL_GOOGLE_include_directive : require + +#define SB_QUALI readonly +#include "common.glsl" +#include "ffv1_common.glsl" + +layout (set = 1, binding = 1) uniform uimage2D src[]; + +layout (set = 1, binding = 2, scalar) buffer fltmap_buf { + uint fltmap[]; +}; + +/* The shared fltmap_buf is laid out per (slice, plane) as a + * max_pixels_per_slice*3 uint block, where the first + * max_pixels_per_slice*2 entries hold interleaved (val, ndx) pairs and + * the trailing [max_pixels_per_slice] entries are the bitmap region used + * by the setup/encode shaders. Padding past pixel_num is the sentinel + * (UINT32_MAX, UINT32_MAX) so it sorts at the end. */ + +/* Per-workgroup bitonic-sort buffer. Limits a slice's pow2 size; large + * slices fall back to working in global memory */ +shared u32vec2 smem[8192]; + +void main(void) +{ + const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; + uvec2 img_size = imageSize(src[0]); + + uint sxs = slice_coord(img_size.x, gl_WorkGroupID.x + 0, + gl_NumWorkGroups.x, 0); + uint sxe = slice_coord(img_size.x, gl_WorkGroupID.x + 1, + gl_NumWorkGroups.x, 0); + uint sys = slice_coord(img_size.y, gl_WorkGroupID.y + 0, + gl_NumWorkGroups.y, 0); + uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1, + gl_NumWorkGroups.y, 0); + + uint slice_w = sxe - sxs; + uint slice_h = sye - sys; + uint pixel_num = slice_w * slice_h; + + /* Round up to next pow2 for bitonic sort */ + uint N = 1; + while (N < pixel_num) + N <<= 1; + N = max(N, 2); + if (N > max_pixels_per_slice) + N = max_pixels_per_slice; + + const uint plane_stride = max_pixels_per_slice*3u; + const bool use_smem = N <= 8192u; + + for (int p = 0; p < color_planes; p++) { + uint base = (slice_idx*4u + uint(p))*plane_stride; + + /* Load pixels */ + for (uint i = gl_LocalInvocationIndex; i < N; + i += gl_WorkGroupSize.x * gl_WorkGroupSize.y) { + uint v, ndx; + if (i < pixel_num) { + uint y = i / slice_w; + uint x = i - y*slice_w; + v = imageLoad(src[p], ivec2(sxs + x, sys + y))[0]; + if (remap_mode == 2) + v = ((v & 0x80000000u) != 0u) ? v : (v ^ 0x7FFFFFFFu); + ndx = i; + } else { + v = 0xFFFFFFFFu; + ndx = 0xFFFFFFFFu; + } + if (use_smem) { + smem[i] = u32vec2(v, ndx); + } else { + fltmap[base + 2u*i + 0u] = v; + fltmap[base + 2u*i + 1u] = ndx; + } + } + barrier(); + if (!use_smem) memoryBarrierBuffer(); + + /* Bitonic sort of the (val, ndx) pairs. */ + for (uint k = 2; k <= N; k <<= 1) { + for (uint j = k >> 1; j > 0; j >>= 1) { + for (uint i = gl_LocalInvocationIndex; i < N; + i += gl_WorkGroupSize.x * gl_WorkGroupSize.y) { + uint partner = i ^ j; + if (partner > i) { + bool ascending = (i & k) == 0; + u32vec2 a, b; + if (use_smem) { + a = smem[i]; + b = smem[partner]; + } else { + a = u32vec2(fltmap[base + 2u*i + 0u], + fltmap[base + 2u*i + 1u]); + b = u32vec2(fltmap[base + 2u*partner + 0u], + fltmap[base + 2u*partner + 1u]); + } + bool a_gt_b = (a.x > b.x) || + (a.x == b.x && a.y > b.y); + if (a_gt_b == ascending) { + if (use_smem) { + smem[i] = b; + smem[partner] = a; + } else { + fltmap[base + 2u*i + 0u] = b.x; + fltmap[base + 2u*i + 1u] = b.y; + fltmap[base + 2u*partner + 0u] = a.x; + fltmap[base + 2u*partner + 1u] = a.y; + } + } + } + } + barrier(); + if (!use_smem) memoryBarrierBuffer(); + } + } + + /* Write sorted pairs back to global */ + if (use_smem) { + for (uint i = gl_LocalInvocationIndex; i < N; + i += gl_WorkGroupSize.x * gl_WorkGroupSize.y) { + u32vec2 u = smem[i]; + fltmap[base + 2u*i + 0u] = u.x; + fltmap[base + 2u*i + 1u] = u.y; + } + barrier(); + } + } +} _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
