This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 8d433a5b4d325807fb049668dddf15f27c2ccaf1
Author:     Lynne <[email protected]>
AuthorDate: Fri Jun 12 14:11:15 2026 +0900
Commit:     Lynne <[email protected]>
CommitDate: Thu Jul 2 16:45:08 2026 +0900

    avcodec/vulkan_apv: store coefficients in a flat buffer, not the image
    
    The entropy pass wrote each decoded coefficient into the output image as
    16-bit scratch, then the iDCT pass read it back, dequantised it and
    overwrote the same texel with the final pixel. Routing the coefficients
    through the image couples the two passes to the image layout and forces
    the entropy shader to address the descriptor-indexed image.
    
    Add a dedicated device-local int16 buffer instead: the entropy shader
    writes coefficients into it (one plane per component, MB-aligned coded
    size) and the iDCT reads them, so the image is written exactly once, by
    the iDCT. The buffer is zero-filled before the entropy dispatch so blocks
    with no coded coefficients read as zero, and the decode->iDCT hand-off
    becomes a buffer barrier. The output image is still cleared so any padding
    the iDCT does not cover stays zero.
---
 libavcodec/vulkan/apv_decode.comp.glsl |  25 ++++++--
 libavcodec/vulkan/apv_idct.comp.glsl   |  22 ++++++-
 libavcodec/vulkan_apv.c                | 105 +++++++++++++++++++++++++++++----
 3 files changed, 133 insertions(+), 19 deletions(-)

diff --git a/libavcodec/vulkan/apv_decode.comp.glsl b/libavcodec/vulkan/apv_decode.comp.glsl
index 6db24b5372..af54ae2f29 100644
--- a/libavcodec/vulkan/apv_decode.comp.glsl
+++ b/libavcodec/vulkan/apv_decode.comp.glsl
@@ -35,6 +35,9 @@
 #define APV_MB_SIZE         (ivec2(16, 16))
 
 layout (set = 0, binding = 0) uniform writeonly uimage2D dst[];
+layout (set = 0, binding = 2, scalar) writeonly buffer coeffs_out_buf {
+    int16_t coeffs_out[];
+};
 layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {
     uvec2 tile_offset[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT];
     uint8_t q_matrix[APV_MAX_NUM_COMP][8][8];
@@ -107,7 +110,7 @@ int prev_dc;
 int prev_k_dc;
 int prev_1st_ac_level;
 
-void decode_block(ivec2 pos, uint comp)
+void decode_block(uint cbase, int cstride, ivec2 pos, uint comp)
 {
     int dc_coeff;
     int abs_diff = apv_read_vlc(prev_k_dc);
@@ -125,7 +128,7 @@ void decode_block(ivec2 pos, uint comp)
         dc_coeff > APV_MAX_TRANS_COEFF)
         return;
 
-    imageStore(dst[comp], pos, uvec4(uint(dc_coeff) & 0xFFFFu));
+    coeffs_out[cbase + uint(pos.y * cstride + pos.x)] = int16_t(dc_coeff);
     prev_dc   = dc_coeff;
     prev_k_dc = min(abs_diff >> 1, 5);
 
@@ -165,7 +168,8 @@ void decode_block(ivec2 pos, uint comp)
                 return;
 
             int zz = int(zigzag[scan_pos]);
-            imageStore(dst[comp], pos + ivec2(zz & 7, zz >> 3), uvec4(uint(level) & 0xFFFFu));
+            coeffs_out[cbase + uint((pos.y + (zz >> 3)) * cstride +
+                                    pos.x + (zz & 7))] = int16_t(level);
 
             prev_level = abs_ac_coeff_minus1 + 1;
             if (first_ac != 0) {
@@ -194,6 +198,19 @@ void main(void)
     init_get_bits(gb, u8buf(tile_data + tile_bs.x), int(tile_bs.y));
 
     ivec2 sub_shift = comp_idx == 0 ? ivec2(0) : log2_chroma_sub;
+
+    /* This component's plane inside the flat coefficient buffer. Plane
+     * dims are the MB-aligned coded size (the closing entries of the tile
+     * col/row tables), in component resolution. */
+    const int cw0 = int(tile_col[tile_count.x]);
+    const int ch0 = int(tile_row[tile_count.y]);
+    uint cbase = 0u;
+    for (uint i = 0u; i < comp_idx; i++) {
+        ivec2 ss = i == 0u ? ivec2(0) : log2_chroma_sub;
+        cbase += uint((cw0 >> ss.x) * (ch0 >> ss.y));
+    }
+    const int cstride = cw0 >> sub_shift.x;
+
     ivec2 tile_start = ivec2(tile_col[tile_pos.x], tile_row[tile_pos.y]);
     ivec2 tile_dim = ivec2(tile_col[tile_pos.x + 1],
                            tile_row[tile_pos.y + 1]) - tile_start;
@@ -208,7 +225,7 @@ void main(void)
                     ivec2 pos = (APV_MB_SIZE*mb +
                                  APV_TR_SIZE*blk + tile_start) >> sub_shift;
 
-                    decode_block(pos, comp_idx);
+                    decode_block(cbase, cstride, pos, comp_idx);
                 }
             }
         }
diff --git a/libavcodec/vulkan/apv_idct.comp.glsl b/libavcodec/vulkan/apv_idct.comp.glsl
index 2b56bc02fc..679b8e7623 100644
--- a/libavcodec/vulkan/apv_idct.comp.glsl
+++ b/libavcodec/vulkan/apv_idct.comp.glsl
@@ -33,6 +33,9 @@
 #define APV_BLOCKS_PER_WG   8
 
 layout (set = 0, binding = 0) uniform uimage2D dst[];
+layout (set = 0, binding = 2, scalar) readonly buffer coeffs_in_buf {
+    int16_t coeffs_in[];
+};
 layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {
     uvec2 tile_offset[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT];
     uint8_t q_matrix[APV_MAX_NUM_COMP][8][8];
@@ -87,11 +90,26 @@ void main(void)
     const float fact = float(half_range);
     const float norm = 1.0f / (1024.0f * fact); /* DCT normalization const */
 
+    /* This component's plane inside the flat coefficient buffer */
+    const int cw0 = int(tile_col[tile_count.x]);
+    const int ch0 = int(tile_row[tile_count.y]);
+    uint cbase = 0u;
+    for (uint i = 0u; i < comp; i++) {
+        ivec2 ss = i == 0u ? ivec2(0) : log2_chroma_sub;
+        cbase += uint((cw0 >> ss.x) * (ch0 >> ss.y));
+    }
+    const int cstride = cw0 >> sub_shift.x;
+    const int cheight = ch0 >> sub_shift.y;
+
+    /* blocks fully outside the coded area have nothing stored for them */
+    const bool oob = pos.x >= cstride || pos.y >= cheight;
+
     [[unroll]]
     for (uint y = 0u; y < 8u; y++) {
         /* load */
-        int   raw   = int(imageLoad(dst[comp], pos + ivec2(col, y)).x);
-        int   coeff = sign_extend(raw, 16);
+        int   coeff = oob ? 0
+                    : int(coeffs_in[cbase + uint((pos.y + int(y)) * cstride +
+                                                 pos.x + int(col))]);
         /* dequant + norm */
         int   qs    = level_scale * int(q_matrix[comp][col][y]) * (1 << qp_shift);
         float v     = float(coeff * qs) * norm;
diff --git a/libavcodec/vulkan_apv.c b/libavcodec/vulkan_apv.c
index 98f4cc2335..4c9b61d7e6 100644
--- a/libavcodec/vulkan_apv.c
+++ b/libavcodec/vulkan_apv.c
@@ -48,6 +48,11 @@ typedef struct APVVulkanDecodeContext {
     FFVulkanShader idct;
 
     AVBufferPool *frame_data_pool;
+
+    /* Flat per-frame coefficient buffer: entropy writes it, the iDCT reads it,
+     * instead of bouncing coefficients through the output image. */
+    AVBufferPool *coeff_pool;
+    size_t        coeff_size;
 } APVVulkanDecodeContext;
 
 typedef struct DecodePushData {
@@ -178,6 +183,8 @@ static int vk_apv_end_frame(AVCodecContext *avctx)
 
     VkImageMemoryBarrier2 img_bar[8];
     int nb_img_bar = 0;
+    VkBufferMemoryBarrier2 buf_bar[2];
+    int nb_buf_bar = 0;
 
     FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool);
     ff_vk_exec_start(&ctx->s, exec);
@@ -244,6 +251,39 @@ static int vk_apv_end_frame(AVCodecContext *avctx)
     });
     nb_img_bar = 0;
 
+    /* Zero-filled first, since entropy writes only the nonzero coefficients. */
+    AVBufferRef *coeff_ref;
+    err = ff_vk_get_pooled_buffer(&ctx->s, &apvvk->coeff_pool, &coeff_ref,
+                                  VK_BUFFER_USAGE_TRANSFER_DST_BIT |
+                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+                                  NULL, apvvk->coeff_size,
+                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+    if (err < 0)
+        return err;
+    FFVkBuffer *coeff_buf = (FFVkBuffer *)coeff_ref->data;
+    RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &coeff_ref, 1, 0));
+
+    vk->CmdFillBuffer(exec->buf, coeff_buf->buf, 0, VK_WHOLE_SIZE, 0);
+
+    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+        .srcStageMask  = VK_PIPELINE_STAGE_2_CLEAR_BIT,
+        .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
+        .dstStageMask  = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
+                         VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .buffer = coeff_buf->buf,
+        .size   = VK_WHOLE_SIZE,
+    };
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pBufferMemoryBarriers = buf_bar,
+        .bufferMemoryBarrierCount = nb_buf_bar,
+    });
+    nb_buf_bar = 0;
+
     /* Setup push data */
     DecodePushData pd = (DecodePushData) {
         .tile_data = slices_buf->address,
@@ -264,6 +304,10 @@ static int vk_apv_end_frame(AVCodecContext *avctx)
                                     frame_data_buf,
                                     0, frame_data_buf->size,
                                     VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(&ctx->s, exec, &apvvk->decode,
+                                    0, 2, 0,
+                                    coeff_buf, 0, coeff_buf->size,
+                                    VK_FORMAT_UNDEFINED);
 
     ff_vk_exec_bind_shader(&ctx->s, exec, &apvvk->decode);
     ff_vk_shader_update_push_const(&ctx->s, exec, &apvvk->decode,
@@ -274,20 +318,24 @@ static int vk_apv_end_frame(AVCodecContext *avctx)
                     apv->tile_info.tile_cols, apv->tile_info.tile_rows,
                     desc->nb_components);
 
-    /* Wait for all decoding to finish */
-    ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame, img_bar, &nb_img_bar,
-                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-                        VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                        VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-                        VK_IMAGE_LAYOUT_GENERAL,
-                        VK_QUEUE_FAMILY_IGNORED);
+    /* Wait for the coefficient writes before the iDCT reads them */
+    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+        .srcStageMask  = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+        .srcAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+        .dstStageMask  = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .buffer = coeff_buf->buf,
+        .size   = VK_WHOLE_SIZE,
+    };
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
-        .pImageMemoryBarriers = img_bar,
-        .imageMemoryBarrierCount = nb_img_bar,
+        .pBufferMemoryBarriers = buf_bar,
+        .bufferMemoryBarrierCount = nb_buf_bar,
     });
-    nb_img_bar = 0;
+    nb_buf_bar = 0;
 
     /* iDCT */
     ff_vk_shader_update_img_array(&ctx->s, exec, &apvvk->idct,
@@ -300,6 +348,10 @@ static int vk_apv_end_frame(AVCodecContext *avctx)
                                     frame_data_buf,
                                     0, frame_data_buf->size,
                                     VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(&ctx->s, exec, &apvvk->idct,
+                                    0, 2, 0,
+                                    coeff_buf, 0, coeff_buf->size,
+                                    VK_FORMAT_UNDEFINED);
 
     ff_vk_exec_bind_shader(&ctx->s, exec, &apvvk->idct);
     ff_vk_shader_update_push_const(&ctx->s, exec, &apvvk->idct,
@@ -350,9 +402,14 @@ static int init_decode_shader(AVCodecContext *avctx, FFVulkanContext *s,
             .name        = "frame_data_buf",
             .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
             .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+        {
+            .name        = "coeffs_out_buf",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
         }
     };
-    ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0);
+    ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3, 0, 0);
 
     RET(ff_vk_shader_link(s, shd,
                           ff_apv_decode_comp_spv_data,
@@ -401,8 +458,13 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s,
             .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
             .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
         },
+        {
+            .name        = "coeffs_in_buf",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
     };
-    ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0);
+    ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3, 0, 0);
 
     RET(ff_vk_shader_link(s, shd,
                           ff_apv_idct_comp_spv_data,
@@ -422,6 +484,7 @@ static void vk_decode_apv_uninit(FFVulkanDecodeShared *ctx)
     ff_vk_shader_free(&ctx->s, &apvvk->idct);
 
     av_buffer_pool_uninit(&apvvk->frame_data_pool);
+    av_buffer_pool_uninit(&apvvk->coeff_pool);
 
     av_freep(&apvvk);
 }
@@ -444,6 +507,22 @@ static int vk_decode_apv_init(AVCodecContext *avctx)
 
     ctx->sd_ctx_free = &vk_decode_apv_uninit;
 
+    /* Size the flat coefficient buffer: one int16 per sample of the
+     * MB-aligned coded area, summed over components. */
+    {
+        const AVPixFmtDescriptor *pd =
+            av_pix_fmt_desc_get(avctx->sw_pix_fmt);
+        int cw = FFALIGN(avctx->coded_width,  16);
+        int ch = FFALIGN(avctx->coded_height, 16);
+        apvvk->coeff_size = 0;
+        for (int i = 0; i < pd->nb_components; i++) {
+            int sx = (i == 1 || i == 2) ? pd->log2_chroma_w : 0;
+            int sy = (i == 1 || i == 2) ? pd->log2_chroma_h : 0;
+            apvvk->coeff_size += (size_t)(cw >> sx) * (ch >> sy);
+        }
+        apvvk->coeff_size *= sizeof(int16_t);
+    }
+
     RET(init_decode_shader(avctx, &ctx->s, &ctx->exec_pool,
                            &apvvk->decode));
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to