This reduces the intermediate VRAM used for RGB decoding by a
factor of 100 for 6k video.
This also speeds the decoder up by 16% for 4k RGB24 and 31% for 6k video.

This is equivalent to what the software decoder does, but with fewer pointers.
---
 libavcodec/vulkan/Makefile          |   3 +-
 libavcodec/vulkan/ffv1_dec.comp     | 158 ++++++++++++----
 libavcodec/vulkan/ffv1_dec_rct.comp |  88 ---------
 libavcodec/vulkan_ffv1.c            | 283 ++++++++--------------------
 libavutil/vulkan_functions.h        |   1 +
 5 files changed, 203 insertions(+), 330 deletions(-)
 delete mode 100644 libavcodec/vulkan/ffv1_dec_rct.comp

diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index e6bad486bd..feb5d2ea51 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -14,8 +14,7 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER)  +=  vulkan/common.o \
 OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)  +=  vulkan/common.o \
                                        vulkan/rangecoder.o vulkan/ffv1_vlc.o \
                                        vulkan/ffv1_common.o 
vulkan/ffv1_reset.o \
-                                       vulkan/ffv1_dec_setup.o 
vulkan/ffv1_dec.o \
-                                       vulkan/ffv1_dec_rct.o
+                                       vulkan/ffv1_dec_setup.o 
vulkan/ffv1_dec.o
 
 VULKAN = $(subst $(SRC_PATH)/,,$(wildcard 
$(SRC_PATH)/libavcodec/vulkan/*.comp))
 .SECONDARY: $(VULKAN:.comp=.c)
diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp
index 1954c050f8..ae0324cb26 100644
--- a/libavcodec/vulkan/ffv1_dec.comp
+++ b/libavcodec/vulkan/ffv1_dec.comp
@@ -20,23 +20,69 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-ivec2 get_pred(ivec2 pos, ivec2 off, int p, int sw, uint8_t quant_table_idx)
+#ifndef RGB
+#define LADDR(p) (p)
+#else
+#define RGB_LINECACHE 2
+#define RGB_LBUF (RGB_LINECACHE - 1)
+#define LADDR(p) (ivec2((p).x, ((p).y & RGB_LBUF)))
+#endif
+
+#ifdef RGB
+ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
+{
+    const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
+
+    /* Thanks to the same coincidence as below, we can skip checking if off == 
0, 1 */
+    VTYPE3 top  = VTYPE3(TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1, -1) 
+ yoff_border1))[0]),
+                         TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(0, 
-1)))[0]),
+                         TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(min(1, 
sw - off.x - 1), -1)))[0]));
+
+    /* Normally, we'd need to check if off != ivec2(0, 0) here, since 
otherwise, we must
+     * return zero. However, ivec2(-1,  0) + ivec2(1, -1) == ivec2(0, -1), 
e.g. previous
+     * row, 0 offset, same slice, which is zero since we zero out the buffer 
for RGB */
+    TYPE cur = TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1,  0) + 
yoff_border1))[0]);
+
+    int base = quant_table[quant_table_idx][0][(cur    - top[0]) & 
MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][1][(top[0] - top[1]) & 
MAX_QUANT_TABLE_MASK] +
+               quant_table[quant_table_idx][2][(top[1] - top[2]) & 
MAX_QUANT_TABLE_MASK];
+
+    if ((quant_table[quant_table_idx][3][127] != 0) ||
+        (quant_table[quant_table_idx][4][127] != 0)) {
+        TYPE cur2 = TYPE(0);
+        if (off.x > 0) {
+            const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
+            cur2 = TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-2,  0) + 
yoff_border2))[0]);
+        }
+        base += quant_table[quant_table_idx][3][(cur2 - cur) & 
MAX_QUANT_TABLE_MASK];
+
+        /* top-2 became current upon swap */
+        TYPE top2 = TYPE(imageLoad(dec[p], sp + LADDR(off))[0]);
+        base += quant_table[quant_table_idx][4][(top2 - top[1]) & 
MAX_QUANT_TABLE_MASK];
+    }
+
+    /* context, prediction */
+    return ivec2(base, predict(cur, VTYPE2(top)));
+}
+#else
+ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
 {
     const ivec2 yoff_border1 = off.x == 0 ? ivec2(1, -1) : ivec2(0, 0);
+    sp += off;
 
     VTYPE3 top  = VTYPE3(TYPE(0),
                          TYPE(0),
                          TYPE(0));
     if (off.y > 0 && off != ivec2(0, 1))
-        top[0] = TYPE(imageLoad(dst[p], pos + ivec2(-1, -1) + 
yoff_border1)[0]);
+        top[0] = TYPE(imageLoad(dec[p], sp + ivec2(-1, -1) + yoff_border1)[0]);
     if (off.y > 0) {
-        top[1] = TYPE(imageLoad(dst[p], pos + ivec2(0, -1))[0]);
-        top[2] = TYPE(imageLoad(dst[p], pos + ivec2(min(1, sw - off.x - 1), 
-1))[0]);
+        top[1] = TYPE(imageLoad(dec[p], sp + ivec2(0, -1))[0]);
+        top[2] = TYPE(imageLoad(dec[p], sp + ivec2(min(1, sw - off.x - 1), 
-1))[0]);
     }
 
     TYPE cur = TYPE(0);
     if (off != ivec2(0, 0))
-        cur = TYPE(imageLoad(dst[p], pos + ivec2(-1,  0) + yoff_border1)[0]);
+        cur = TYPE(imageLoad(dec[p], sp + ivec2(-1,  0) + yoff_border1)[0]);
 
     int base = quant_table[quant_table_idx][0][(cur - top[0]) & 
MAX_QUANT_TABLE_MASK] +
                quant_table[quant_table_idx][1][(top[0] - top[1]) & 
MAX_QUANT_TABLE_MASK] +
@@ -47,19 +93,20 @@ ivec2 get_pred(ivec2 pos, ivec2 off, int p, int sw, uint8_t 
quant_table_idx)
         TYPE cur2 = TYPE(0);
         if (off.x > 0 && off != ivec2(1, 0)) {
             const ivec2 yoff_border2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
-            cur2 = TYPE(imageLoad(dst[p], pos + ivec2(-2,  0) + 
yoff_border2)[0]);
+            cur2 = TYPE(imageLoad(dec[p], sp + ivec2(-2,  0) + 
yoff_border2)[0]);
         }
         base += quant_table[quant_table_idx][3][(cur2 - cur) & 
MAX_QUANT_TABLE_MASK];
 
         TYPE top2 = TYPE(0);
         if (off.y > 1)
-            top2 = TYPE(imageLoad(dst[p], pos + ivec2(0, -2))[0]);
+            top2 = TYPE(imageLoad(dec[p], sp + ivec2(0, -2))[0]);
         base += quant_table[quant_table_idx][4][(top2 - top[1]) & 
MAX_QUANT_TABLE_MASK];
     }
 
     /* context, prediction */
     return ivec2(base, predict(cur, VTYPE2(top)));
 }
+#endif
 
 #ifndef GOLOMB
 int get_isymbol(inout RangeCoder c, uint64_t state)
@@ -89,11 +136,8 @@ int get_isymbol(inout RangeCoder c, uint64_t state)
     return get_rac(c, state - 11 + min(e, 10)) ? -a : a;
 }
 
-void decode_line_pcm(inout SliceContext sc, int y, int p, int bits)
+void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int 
bits)
 {
-    ivec2 sp = sc.slice_pos;
-    int w = sc.slice_dim.x;
-
 #ifndef RGB
     if (p > 0 && p < 3) {
         w >>= chroma_shift.x;
@@ -106,16 +150,14 @@ void decode_line_pcm(inout SliceContext sc, int y, int p, 
int bits)
         for (int i = (bits - 1); i >= 0; i--)
             v |= uint(get_rac_equi(sc.c)) << i;
 
-        imageStore(dst[p], sp + ivec2(x, y), uvec4(v));
+        imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v));
     }
 }
 
-void decode_line(inout SliceContext sc, uint64_t state,
-                 int y, int p, int bits, const int run_index)
+void decode_line(inout SliceContext sc, ivec2 sp, int w,
+                 int y, int p, int bits, uint64_t state,
+                 const int run_index)
 {
-    ivec2 sp = sc.slice_pos;
-    int w = sc.slice_dim.x;
-
 #ifndef RGB
     if (p > 0 && p < 3) {
         w >>= chroma_shift.x;
@@ -124,7 +166,7 @@ void decode_line(inout SliceContext sc, uint64_t state,
 #endif
 
     for (int x = 0; x < w; x++) {
-        ivec2 pr = get_pred(sp + ivec2(x, y), ivec2(x, y), p, w,
+        ivec2 pr = get_pred(sp, ivec2(x, y), p, w,
                             sc.quant_table_idx[p]);
 
         int diff = get_isymbol(sc.c, state + CONTEXT_SIZE*abs(pr[0]));
@@ -132,18 +174,16 @@ void decode_line(inout SliceContext sc, uint64_t state,
             diff = -diff;
 
         uint v = zero_extend(pr[1] + diff, bits);
-        imageStore(dst[p], sp + ivec2(x, y), uvec4(v));
+        imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v));
     }
 }
 
 #else /* GOLOMB */
 
-void decode_line(inout SliceContext sc, uint64_t state,
-                 int y, int p, int bits, inout int run_index)
+void decode_line(inout SliceContext sc, ivec2 sp, int w,
+                 int y, int p, int bits, uint64_t state,
+                 inout int run_index)
 {
-    ivec2 sp = sc.slice_pos;
-    int w = sc.slice_dim.x;
-
 #ifndef RGB
     if (p > 0 && p < 3) {
         w >>= chroma_shift.x;
@@ -157,7 +197,7 @@ void decode_line(inout SliceContext sc, uint64_t state,
     for (int x = 0; x < w; x++) {
         ivec2 pos = sp + ivec2(x, y);
         int diff;
-        ivec2 pr = get_pred(sp + ivec2(x, y), ivec2(x, y), p, w,
+        ivec2 pr = get_pred(sp, ivec2(x, y), p, w,
                             sc.quant_table_idx[p]);
 
         VlcState sb = VlcState(state + VLC_STATE_SIZE*abs(pr[0]));
@@ -202,7 +242,44 @@ void decode_line(inout SliceContext sc, uint64_t state,
             diff = -diff;
 
         uint v = zero_extend(pr[1] + diff, bits);
-        imageStore(dst[p], sp + ivec2(x, y), uvec4(v));
+        imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v));
+    }
+}
+#endif
+
+#ifdef RGB
+ivec4 transform_sample(ivec4 pix, ivec2 rct_coef)
+{
+    pix.b -= rct_offset;
+    pix.r -= rct_offset;
+    pix.g -= (pix.b*rct_coef.y + pix.r*rct_coef.x) >> 2;
+    pix.b += pix.g;
+    pix.r += pix.g;
+    return ivec4(pix[fmt_lut[0]], pix[fmt_lut[1]],
+                 pix[fmt_lut[2]], pix[fmt_lut[3]]);
+}
+
+void writeout_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct)
+{
+    for (int x = 0; x < w; x++) {
+        ivec2 lpos = sp + LADDR(ivec2(x, y));
+        ivec2 pos = sc.slice_pos + ivec2(x, y);
+
+        ivec4 pix;
+        pix.r = int(imageLoad(dec[2], lpos)[0]);
+        pix.g = int(imageLoad(dec[0], lpos)[0]);
+        pix.b = int(imageLoad(dec[1], lpos)[0]);
+        if (transparency != 0)
+            pix.a = int(imageLoad(dec[3], lpos)[0]);
+
+        if (apply_rct)
+            pix = transform_sample(pix, sc.slice_rct_coef);
+
+        imageStore(dst[0], pos, pix);
+        if (planar_rgb != 0) {
+            for (int i = 1; i < color_planes; i++)
+                imageStore(dst[i], pos, ivec4(pix[i]));
+        }
     }
 }
 #endif
@@ -210,6 +287,8 @@ void decode_line(inout SliceContext sc, uint64_t state,
 void decode_slice(inout SliceContext sc, const uint slice_idx)
 {
     int run_index = 0;
+    int w = sc.slice_dim.x;
+    ivec2 sp = sc.slice_pos;
 
 #ifndef RGB
     int bits = bits_per_raw_sample;
@@ -217,6 +296,8 @@ void decode_slice(inout SliceContext sc, const uint 
slice_idx)
     int bits = 9;
     if (bits != 8 || sc.slice_coding_mode != 0)
         bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1);
+
+    sp.y = int(gl_WorkGroupID.y)*RGB_LINECACHE;
 #endif
 
     /* PCM coding */
@@ -229,12 +310,14 @@ void decode_slice(inout SliceContext sc, const uint 
slice_idx)
                 h >>= chroma_shift.y;
 
             for (int y = 0; y < h; y++)
-                decode_line_pcm(sc, y, p, bits);
+                decode_line_pcm(sc, sp, w, y, p, bits);
         }
 #else
         for (int y = 0; y < sc.slice_dim.y; y++) {
             for (int p = 0; p < color_planes; p++)
-                decode_line_pcm(sc, y, p, bits);
+                decode_line_pcm(sc, sp, w, y, p, bits);
+
+            writeout_rgb(sc, sp, w, y, false);
         }
 #endif
     } else
@@ -242,8 +325,9 @@ void decode_slice(inout SliceContext sc, const uint 
slice_idx)
     /* Arithmetic coding */
 #endif
     {
-        uint64_t slice_state_off = uint64_t(slice_state) +
-                                   slice_idx*plane_state_size*codec_planes;
+        u64vec4 slice_state_off = (uint64_t(slice_state) +
+                                   slice_idx*plane_state_size*codec_planes) +
+                                  plane_state_size*uvec4(0, 1, 1, 2);
 
 #ifndef RGB
         for (int p = 0; p < planes; p++) {
@@ -252,18 +336,16 @@ void decode_slice(inout SliceContext sc, const uint 
slice_idx)
                 h >>= chroma_shift.y;
 
             for (int y = 0; y < h; y++)
-                decode_line(sc, slice_state_off, y, p, bits, run_index);
-
-            /* For the second chroma plane, reuse the first plane's state */
-            if (p != 1)
-                slice_state_off += plane_state_size;
+                decode_line(sc, sp, w, y, p, bits,
+                            slice_state_off[p], run_index);
         }
 #else
         for (int y = 0; y < sc.slice_dim.y; y++) {
             for (int p = 0; p < color_planes; p++)
-                decode_line(sc,
-                            slice_state_off + plane_state_size*((p + 1) >> 1),
-                            y, p, bits, run_index);
+                decode_line(sc, sp, w, y, p, bits,
+                            slice_state_off[p], run_index);
+
+            writeout_rgb(sc, sp, w, y, true);
         }
 #endif
     }
diff --git a/libavcodec/vulkan/ffv1_dec_rct.comp 
b/libavcodec/vulkan/ffv1_dec_rct.comp
deleted file mode 100644
index a550a5fcb8..0000000000
--- a/libavcodec/vulkan/ffv1_dec_rct.comp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * FFv1 codec
- *
- * Copyright (c) 2025 Lynne <d...@lynne.ee>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-void bypass_block(in SliceContext sc)
-{
-    ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos;
-    ivec2 end = sc.slice_pos + sc.slice_dim;
-
-    for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y) {
-        for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x) {
-            ivec2 pos = ivec2(x, y);
-            ivec4 pix;
-            for (int i = 0; i < color_planes; i++)
-                pix[i] = int(imageLoad(src[i], pos)[0]);
-
-            imageStore(dst[0], pos, pix);
-            if (planar_rgb != 0) {
-                for (int i = 1; i < color_planes; i++)
-                    imageStore(dst[i], pos, ivec4(pix[i]));
-            }
-        }
-    }
-}
-
-void transform_sample(ivec2 pos, ivec2 rct_coef)
-{
-    ivec4 pix;
-    pix.r = int(imageLoad(src[2], pos)[0]);
-    pix.g = int(imageLoad(src[0], pos)[0]);
-    pix.b = int(imageLoad(src[1], pos)[0]);
-    if (transparency != 0)
-        pix.a = int(imageLoad(src[3], pos)[0]);
-
-    pix.b -= offset;
-    pix.r -= offset;
-    pix.g -= (pix.b*rct_coef.y + pix.r*rct_coef.x) >> 2;
-    pix.b += pix.g;
-    pix.r += pix.g;
-
-    pix = ivec4(pix[fmt_lut[0]], pix[fmt_lut[1]],
-                pix[fmt_lut[2]], pix[fmt_lut[3]]);
-
-    imageStore(dst[0], pos, pix);
-    if (planar_rgb != 0) {
-        for (int i = 1; i < color_planes; i++)
-            imageStore(dst[i], pos, ivec4(pix[i]));
-    }
-}
-
-void transform_block(in SliceContext sc)
-{
-    const ivec2 rct_coef = sc.slice_rct_coef;
-    const ivec2 start = ivec2(gl_LocalInvocationID) + sc.slice_pos;
-    const ivec2 end = sc.slice_pos + sc.slice_dim;
-
-    for (uint y = start.y; y < end.y; y += gl_WorkGroupSize.y)
-        for (uint x = start.x; x < end.x; x += gl_WorkGroupSize.x)
-            transform_sample(ivec2(x, y), rct_coef);
-}
-
-void main()
-{
-    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + 
gl_WorkGroupID.x;
-
-    if (slice_ctx[slice_idx].slice_coding_mode == 1)
-        bypass_block(slice_ctx[slice_idx]);
-    else
-        transform_block(slice_ctx[slice_idx]);
-}
diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c
index e511840a01..5584b72385 100644
--- a/libavcodec/vulkan_ffv1.c
+++ b/libavcodec/vulkan_ffv1.c
@@ -33,7 +33,6 @@ extern const char *ff_source_ffv1_common_comp;
 extern const char *ff_source_ffv1_dec_setup_comp;
 extern const char *ff_source_ffv1_reset_comp;
 extern const char *ff_source_ffv1_dec_comp;
-extern const char *ff_source_ffv1_dec_rct_comp;
 
 const FFVulkanDecodeDescriptor ff_vk_dec_ffv1_desc = {
     .codec_id         = AV_CODEC_ID_FFV1,
@@ -66,7 +65,6 @@ typedef struct FFv1VulkanDecodeContext {
     FFVulkanShader setup;
     FFVulkanShader reset[2]; /* AC/Golomb */
     FFVulkanShader decode[2][2][2]; /* 16/32 bit, AC/Golomb, Normal/RGB */
-    FFVulkanShader rct[2]; /* 16/32 bit */
 
     FFVkBuffer rangecoder_static_buf;
     FFVkBuffer quant_buf;
@@ -85,11 +83,13 @@ typedef struct FFv1VkParameters {
     VkDeviceAddress slice_state;
     VkDeviceAddress scratch_data;
 
+    int fmt_lut[4];
     uint32_t img_size[2];
     uint32_t chroma_shift[2];
 
     uint32_t plane_state_size;
     uint32_t crcref;
+    int rct_offset;
 
     uint8_t bits_per_raw_sample;
     uint8_t quant_table_count;
@@ -100,6 +100,7 @@ typedef struct FFv1VkParameters {
     uint8_t codec_planes;
     uint8_t color_planes;
     uint8_t transparency;
+    uint8_t planar_rgb;
     uint8_t colorspace;
     uint8_t ec;
     uint8_t golomb;
@@ -116,11 +117,13 @@ static void add_push_data(FFVulkanShader *shd)
     GLSLC(1,    u8buf slice_state;                                  );
     GLSLC(1,    u8buf scratch_data;                                 );
     GLSLC(0,                                                        );
+    GLSLC(1,    ivec4 fmt_lut;                                      );
     GLSLC(1,    uvec2 img_size;                                     );
     GLSLC(1,    uvec2 chroma_shift;                                 );
     GLSLC(0,                                                        );
     GLSLC(1,    uint plane_state_size;                              );
     GLSLC(1,    uint32_t crcref;                                    );
+    GLSLC(1,    int rct_offset;                                     );
     GLSLC(0,                                                        );
     GLSLC(1,    uint8_t bits_per_raw_sample;                        );
     GLSLC(1,    uint8_t quant_table_count;                          );
@@ -131,6 +134,7 @@ static void add_push_data(FFVulkanShader *shd)
     GLSLC(1,    uint8_t codec_planes;                               );
     GLSLC(1,    uint8_t color_planes;                               );
     GLSLC(1,    uint8_t transparency;                               );
+    GLSLC(1,    uint8_t planar_rgb;                                 );
     GLSLC(1,    uint8_t colorspace;                                 );
     GLSLC(1,    uint8_t ec;                                         );
     GLSLC(1,    uint8_t golomb;                                     );
@@ -349,11 +353,17 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
         return err;
 
     if (is_rgb) {
-        RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, vp->dpb_frame,
-                                     VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
-                                     VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
         RET(ff_vk_create_imageviews(&ctx->s, exec, rct_image_views,
                                     vp->dpb_frame, FF_VK_REP_NATIVE));
+        RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, vp->dpb_frame,
+                                     VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                                     VK_PIPELINE_STAGE_2_CLEAR_BIT));
+        ff_vk_frame_barrier(&ctx->s, exec, decode_dst, img_bar, &nb_img_bar,
+                            VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                            VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                            VK_ACCESS_2_TRANSFER_WRITE_BIT,
+                            VK_IMAGE_LAYOUT_GENERAL,
+                            VK_QUEUE_FAMILY_IGNORED);
     }
 
     if (!(f->picture.f->flags & AV_FRAME_FLAG_KEY)) {
@@ -391,6 +401,8 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
 
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pImageMemoryBarriers = img_bar,
+        .imageMemoryBarrierCount = nb_img_bar,
         .pBufferMemoryBarriers = buf_bar,
         .bufferMemoryBarrierCount = nb_buf_bar,
     });
@@ -431,6 +443,7 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
 
         .plane_state_size = fp->plane_state_size,
         .crcref = f->crcref,
+        .rct_offset = 1 << bits,
 
         .bits_per_raw_sample = bits,
         .quant_table_count = f->quant_table_count,
@@ -441,11 +454,23 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
         .codec_planes = f->plane_count,
         .color_planes = color_planes,
         .transparency = f->transparency,
+        .planar_rgb = ff_vk_mt_is_np_rgb(sw_format) &&
+                      (ff_vk_count_images((AVVkFrame *)f->picture.f->data[0]) 
> 1),
         .colorspace = f->colorspace,
         .ec = f->ec,
         .golomb = f->ac == AC_GOLOMB_RICE,
         .check_crc = !!(avctx->err_recognition & AV_EF_CRCCHECK),
     };
+
+    /* For some reason the C FFv1 encoder/decoder treats these differently */
+    if (sw_format == AV_PIX_FMT_GBRP10 || sw_format == AV_PIX_FMT_GBRP12 ||
+        sw_format == AV_PIX_FMT_GBRP14)
+        memcpy(pd.fmt_lut, (int [4]) { 2, 1, 0, 3 }, 4*sizeof(int));
+    else if (sw_format == AV_PIX_FMT_X2BGR10)
+        memcpy(pd.fmt_lut, (int [4]) { 0, 2, 1, 3 }, 4*sizeof(int));
+    else
+        ff_vk_set_perm(sw_format, pd.fmt_lut, 0);
+
     for (int i = 0; i < MAX_QUANT_TABLES; i++)
         pd.context_count[i] = f->context_count[i];
 
@@ -455,6 +480,18 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
 
     vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, 1);
 
+    if (is_rgb) {
+        AVVkFrame *vkf = (AVVkFrame *)vp->dpb_frame->data[0];
+        for (int i = 0; i < color_planes; i++)
+            vk->CmdClearColorImage(exec->buf, vkf->img[i], 
VK_IMAGE_LAYOUT_GENERAL,
+                                   &((VkClearColorValue) { 0 }),
+                                   1, &((VkImageSubresourceRange) {
+                                       .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                                       .levelCount = 1,
+                                       .layerCount = 1,
+                                   }));
+    }
+
     /* Reset shader */
     reset_shader = &fv->reset[f->ac == AC_GOLOMB_RICE];
     ff_vk_shader_update_desc_buffer(&ctx->s, exec, reset_shader,
@@ -493,12 +530,15 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
     };
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pImageMemoryBarriers = img_bar,
+        .imageMemoryBarrierCount = nb_img_bar,
         .pBufferMemoryBarriers = buf_bar,
         .bufferMemoryBarrierCount = nb_buf_bar,
     });
     slice_state->stage = buf_bar[0].dstStageMask;
     slice_state->access = buf_bar[0].dstAccessMask;
     nb_buf_bar = 0;
+    nb_img_bar = 0;
 
     vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices,
                     f->plane_count);
@@ -515,6 +555,12 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
                                   1, 1,
                                   VK_IMAGE_LAYOUT_GENERAL,
                                   VK_NULL_HANDLE);
+    if (is_rgb)
+        ff_vk_shader_update_img_array(&ctx->s, exec, decode_shader,
+                                      f->picture.f, vp->view.out,
+                                      1, 2,
+                                      VK_IMAGE_LAYOUT_GENERAL,
+                                      VK_NULL_HANDLE);
 
     ff_vk_exec_bind_shader(&ctx->s, exec, decode_shader);
     ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader,
@@ -537,12 +583,20 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
     };
 
     /* Input frame barrier */
-    ff_vk_frame_barrier(&ctx->s, exec, decode_dst, img_bar, &nb_img_bar,
+    ff_vk_frame_barrier(&ctx->s, exec, f->picture.f, img_bar, &nb_img_bar,
                         VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                         VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-                        VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
+                        VK_ACCESS_SHADER_WRITE_BIT |
+                        (!is_rgb ? VK_ACCESS_SHADER_READ_BIT : 0),
                         VK_IMAGE_LAYOUT_GENERAL,
                         VK_QUEUE_FAMILY_IGNORED);
+    if (is_rgb)
+        ff_vk_frame_barrier(&ctx->s, exec, vp->dpb_frame, img_bar, &nb_img_bar,
+                            VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                            VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                            VK_ACCESS_SHADER_READ_BIT | 
VK_ACCESS_SHADER_WRITE_BIT,
+                            VK_IMAGE_LAYOUT_GENERAL,
+                            VK_QUEUE_FAMILY_IGNORED);
 
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
@@ -558,74 +612,6 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
 
     vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, 1);
 
-    /* RCT */
-    if (is_rgb) {
-        FFVulkanShader *rct_shader = &fv->rct[f->use32bit];
-        FFv1VkRCTParameters pd_rct;
-
-        ff_vk_shader_update_desc_buffer(&ctx->s, exec, rct_shader,
-                                        1, 0, 0,
-                                        slice_state,
-                                        0, fp->slice_data_size*f->slice_count,
-                                        VK_FORMAT_UNDEFINED);
-        ff_vk_shader_update_img_array(&ctx->s, exec, rct_shader,
-                                      decode_dst, decode_dst_view,
-                                      1, 1,
-                                      VK_IMAGE_LAYOUT_GENERAL,
-                                      VK_NULL_HANDLE);
-        ff_vk_shader_update_img_array(&ctx->s, exec, rct_shader,
-                                      f->picture.f, vp->view.out,
-                                      1, 2,
-                                      VK_IMAGE_LAYOUT_GENERAL,
-                                      VK_NULL_HANDLE);
-
-        ff_vk_exec_bind_shader(&ctx->s, exec, rct_shader);
-
-        pd_rct = (FFv1VkRCTParameters) {
-            .offset = 1 << bits,
-            .bits = bits,
-            .planar_rgb = ff_vk_mt_is_np_rgb(sw_format) &&
-                          (ff_vk_count_images((AVVkFrame 
*)f->picture.f->data[0]) > 1),
-            .color_planes = color_planes,
-            .transparency = f->transparency,
-        };
-
-        /* For some reason the C FFv1 encoder/decoder treats these differently 
*/
-        if (sw_format == AV_PIX_FMT_GBRP10 || sw_format == AV_PIX_FMT_GBRP12 ||
-            sw_format == AV_PIX_FMT_GBRP14)
-            memcpy(pd_rct.fmt_lut, (int [4]) { 2, 1, 0, 3 }, 4*sizeof(int));
-        else if (sw_format == AV_PIX_FMT_X2BGR10)
-            memcpy(pd_rct.fmt_lut, (int [4]) { 0, 2, 1, 3 }, 4*sizeof(int));
-        else
-            ff_vk_set_perm(sw_format, pd_rct.fmt_lut, 0);
-
-        ff_vk_shader_update_push_const(&ctx->s, exec, rct_shader,
-                                       VK_SHADER_STAGE_COMPUTE_BIT,
-                                       0, sizeof(pd_rct), &pd_rct);
-
-        ff_vk_frame_barrier(&ctx->s, exec, decode_dst, img_bar, &nb_img_bar,
-                            VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
-                            VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-                            VK_ACCESS_SHADER_READ_BIT,
-                            VK_IMAGE_LAYOUT_GENERAL,
-                            VK_QUEUE_FAMILY_IGNORED);
-        ff_vk_frame_barrier(&ctx->s, exec, f->picture.f, img_bar, &nb_img_bar,
-                            VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
-                            VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-                            VK_ACCESS_SHADER_WRITE_BIT,
-                            VK_IMAGE_LAYOUT_GENERAL,
-                            VK_QUEUE_FAMILY_IGNORED);
-
-        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
-            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
-            .pImageMemoryBarriers = img_bar,
-            .imageMemoryBarrierCount = nb_img_bar,
-        });
-        nb_img_bar = 0;
-
-        vk->CmdDispatch(exec->buf, f->num_h_slices, f->num_v_slices, 1);
-    }
-
     err = ff_vk_exec_submit(&ctx->s, exec);
     if (err < 0)
         return err;
@@ -845,7 +831,9 @@ fail:
 
 static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
                               FFVkExecPool *pool, FFVkSPIRVCompiler *spv,
-                              FFVulkanShader *shd, AVHWFramesContext 
*frames_ctx,
+                              FFVulkanShader *shd,
+                              AVHWFramesContext *dec_frames_ctx,
+                              AVHWFramesContext *out_frames_ctx,
                               int use32bit, int ac, int rgb)
 {
     int err;
@@ -910,127 +898,28 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s,
             .buf_elems   = f->max_slice_count,
         },
         {
-            .name       = "dst",
-            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .dimensions = 2,
-            .mem_layout = ff_vk_shader_rep_fmt(frames_ctx->sw_format,
-                                               FF_VK_REP_NATIVE),
-            .elems      = av_pix_fmt_count_planes(frames_ctx->sw_format),
-            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
-        },
-    };
-    RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0));
-
-    GLSLD(ff_source_ffv1_dec_comp);
-
-    RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main",
-                            &spv_opaque));
-    RET(ff_vk_shader_link(s, shd, spv_data, spv_len, "main"));
-
-    RET(ff_vk_shader_register_exec(s, pool, shd));
-
-fail:
-    if (spv_opaque)
-        spv->free_shader(spv, &spv_opaque);
-
-    return err;
-}
-
-static int init_rct_shader(FFV1Context *f, FFVulkanContext *s,
-                           FFVkExecPool *pool, FFVkSPIRVCompiler *spv,
-                           FFVulkanShader *shd, int use32bit,
-                           AVHWFramesContext *src_ctx, AVHWFramesContext *dst_ctx)
-{
-    int err;
-    FFVulkanDescriptorSetBinding *desc_set;
-
-    uint8_t *spv_data;
-    size_t spv_len;
-    void *spv_opaque = NULL;
-    int wg_count = sqrt(s->props.properties.limits.maxComputeWorkGroupInvocations);
-
-    RET(ff_vk_shader_init(s, shd, "ffv1_rct",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_EXT_buffer_reference",
-                                             "GL_EXT_buffer_reference2" }, 2,
-                          wg_count, wg_count, 1,
-                          0));
-
-    /* Common codec header */
-    GLSLD(ff_source_common_comp);
-
-    GLSLC(0, layout(push_constant, scalar) uniform pushConstants {             );
-    GLSLC(1,    ivec4 fmt_lut;                                                 );
-    GLSLC(1,    int offset;                                                    );
-    GLSLC(1,    uint8_t bits;                                                  );
-    GLSLC(1,    uint8_t planar_rgb;                                            );
-    GLSLC(1,    uint8_t color_planes;                                          );
-    GLSLC(1,    uint8_t transparency;                                          );
-    GLSLC(1,    uint8_t version;                                               );
-    GLSLC(1,    uint8_t micro_version;                                         );
-    GLSLC(1,    uint8_t padding[2];                                            );
-    GLSLC(0, };                                                                );
-    ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkRCTParameters),
-                                VK_SHADER_STAGE_COMPUTE_BIT);
-
-    av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES);
-    av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS);
-    av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE);
-
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "rangecoder_static_buf",
-            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .mem_layout  = "scalar",
-            .buf_content = "uint8_t zero_one_state[512];",
-        },
-        {
-            .name        = "quant_buf",
-            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .mem_layout  = "scalar",
-            .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]"
-                           "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];",
-        },
-    };
-    RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 1, 0));
-
-    define_shared_code(shd, use32bit);
-
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "slice_data_buf",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .mem_quali   = "readonly",
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "SliceContext slice_ctx",
-            .buf_elems   = f->max_slice_count,
-        },
-        {
-            .name       = "src",
+            .name       = "dec",
             .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
             .dimensions = 2,
-            .mem_layout = ff_vk_shader_rep_fmt(src_ctx->sw_format,
+            .mem_layout = ff_vk_shader_rep_fmt(dec_frames_ctx->sw_format,
                                                FF_VK_REP_NATIVE),
-            .mem_quali  = "readonly",
-            .elems      = av_pix_fmt_count_planes(src_ctx->sw_format),
+            .elems      = av_pix_fmt_count_planes(dec_frames_ctx->sw_format),
             .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
         },
         {
             .name       = "dst",
             .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
             .dimensions = 2,
-            .mem_layout = ff_vk_shader_rep_fmt(dst_ctx->sw_format,
+            .mem_layout = ff_vk_shader_rep_fmt(out_frames_ctx->sw_format,
                                                FF_VK_REP_NATIVE),
             .mem_quali  = "writeonly",
-            .elems      = av_pix_fmt_count_planes(dst_ctx->sw_format),
+            .elems      = av_pix_fmt_count_planes(out_frames_ctx->sw_format),
             .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3, 0, 0));
+    RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2 + rgb, 0, 0));
 
-    GLSLD(ff_source_ffv1_dec_rct_comp);
+    GLSLD(ff_source_ffv1_dec_comp);
 
     RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main",
                             &spv_opaque));
@@ -1051,6 +940,7 @@ static int init_indirect(AVCodecContext *avctx, FFVulkanContext *s,
     int err;
     AVHWFramesContext *frames_ctx;
     AVVulkanFramesContext *vk_frames;
+    FFV1Context *f = avctx->priv_data;
 
     *dst = av_hwframe_ctx_alloc(s->device_ref);
     if (!(*dst))
@@ -1059,13 +949,14 @@ static int init_indirect(AVCodecContext *avctx, FFVulkanContext *s,
     frames_ctx = (AVHWFramesContext *)((*dst)->data);
     frames_ctx->format    = AV_PIX_FMT_VULKAN;
     frames_ctx->sw_format = sw_format;
-    frames_ctx->width     = FFALIGN(s->frames->width, 32);
-    frames_ctx->height    = FFALIGN(s->frames->height, 32);
+    frames_ctx->width     = s->frames->width;
+    frames_ctx->height    = f->num_v_slices*2;
 
     vk_frames = frames_ctx->hwctx;
     vk_frames->tiling    = VK_IMAGE_TILING_OPTIMAL;
-    vk_frames->usage     = VK_IMAGE_USAGE_STORAGE_BIT;
     vk_frames->img_flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
+    vk_frames->usage     = VK_IMAGE_USAGE_STORAGE_BIT |
+                           VK_IMAGE_USAGE_TRANSFER_DST_BIT;
 
     err = av_hwframe_ctx_init(*dst);
     if (err < 0) {
@@ -1095,9 +986,6 @@ static void vk_decode_ffv1_uninit(FFVulkanDecodeShared *ctx)
             for (int k = 0; k < 2; k++) /* Normal/RGB */
                 ff_vk_shader_free(&ctx->s, &fv->decode[i][j][k]);
 
-    for (int i = 0; i < 2; i++) /* 16/32 bit */
-        ff_vk_shader_free(&ctx->s, &fv->rct[i]);
-
     ff_vk_free_buf(&ctx->s, &fv->quant_buf);
     ff_vk_free_buf(&ctx->s, &fv->rangecoder_static_buf);
     ff_vk_free_buf(&ctx->s, &fv->crc_tab_buf);
@@ -1165,12 +1053,13 @@ static int vk_decode_ffv1_init(AVCodecContext *avctx)
     for (int i = 0; i < 2; i++) { /* 16/32 bit */
         for (int j = 0; j < 2; j++) { /* AC/Golomb */
             for (int k = 0; k < 2; k++) { /* Normal/RGB */
-                AVHWFramesContext *frames_ctx;
-                frames_ctx = k ? (AVHWFramesContext *)fv->intermediate_frames_ref[i]->data :
-                                 (AVHWFramesContext *)avctx->hw_frames_ctx->data;
+                AVHWFramesContext *dec_frames_ctx;
+                dec_frames_ctx = k ? (AVHWFramesContext *)fv->intermediate_frames_ref[i]->data :
+                                     (AVHWFramesContext *)avctx->hw_frames_ctx->data;
                 err = init_decode_shader(f, &ctx->s, &ctx->exec_pool,
                                          spv, &fv->decode[i][j][k],
-                                         frames_ctx,
+                                         dec_frames_ctx,
+                                         (AVHWFramesContext *)avctx->hw_frames_ctx->data,
                                          i,
                                          !j ? AC_RANGE_CUSTOM_TAB : AC_GOLOMB_RICE,
                                          k);
@@ -1180,16 +1069,6 @@ static int vk_decode_ffv1_init(AVCodecContext *avctx)
         }
     }
 
-    /* RCT shaders */
-    for (int i = 0; i < 2; i++) { /* 16/32 bit */
-        err = init_rct_shader(f, &ctx->s, &ctx->exec_pool,
-                              spv, &fv->rct[i], i,
-                              (AVHWFramesContext *)fv->intermediate_frames_ref[i]->data,
-                              (AVHWFramesContext *)avctx->hw_frames_ctx->data);
-        if (err < 0)
-            return err;
-    }
-
     /* Range coder data */
     err = ff_ffv1_vk_init_state_transition_data(&ctx->s,
                                                 &fv->rangecoder_static_buf,
diff --git a/libavutil/vulkan_functions.h b/libavutil/vulkan_functions.h
index 85279dd082..8f2bbb38c9 100644
--- a/libavutil/vulkan_functions.h
+++ b/libavutil/vulkan_functions.h
@@ -147,6 +147,7 @@ typedef uint64_t FFVulkanExtensions;
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              CmdPipelineBarrier)                      \
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              CmdCopyBufferToImage)                    \
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              CmdCopyImageToBuffer)                    \
+    MACRO(1, 1, FF_VK_EXT_NO_FLAG,              CmdClearColorImage)                                    \
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              CmdCopyBuffer)                                         \
                                                                                          \
     /* Buffer */                                                                         \
-- 
2.47.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to