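Speeds up everything on AMD by 3x. This uses 32 local invocations to load the range coder context state into shared memory (and to write it back after each symbol is decoded), as well as to do the RCT faster by spreading the pixels of each line across the invocations. The cached reader is only enabled when the RADV driver is detected and Golomb-Rice coding is not in use; otherwise a single local invocation is used as before. --- libavcodec/vulkan/ffv1_dec.comp | 71 ++++++++++++++++++++------------- libavcodec/vulkan_ffv1.c | 7 +++- 2 files changed, 50 insertions(+), 28 deletions(-)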
diff --git a/libavcodec/vulkan/ffv1_dec.comp b/libavcodec/vulkan/ffv1_dec.comp index 9eba322b27..3c46ee1771 100644 --- a/libavcodec/vulkan/ffv1_dec.comp +++ b/libavcodec/vulkan/ffv1_dec.comp @@ -108,34 +108,37 @@ ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx) #endif #ifndef GOLOMB -int get_isymbol(inout RangeCoder c, uint64_t state) +#ifdef CACHED_SYMBOL_READER +shared uint8_t state[CONTEXT_SIZE]; +#define READ(c, off) get_rac_direct(c, state[off]) +#else +#define READ(c, off) get_rac(c, uint64_t(slice_state) + state_off + off) +#endif + +int get_isymbol(inout RangeCoder c, uint state_off) { - if (expectEXT(get_rac(c, state), false)) + if (expectEXT(READ(c, 0), false)) return 0; - state += 1; - - int e; - for (e = 0; e < 32; e++) - if (!get_rac(c, state + min(e, 9))) + int e = 1; + for (; e < 33; e++) + if (!READ(c, min(e, 10))) break; - if (expectEXT(e == 0, false)) { - return get_rac(c, state + 10) ? -1 : 1; - } else if (expectEXT(e > 31, false)) { + if (expectEXT(e == 1, false)) { + return READ(c, 11) ? -1 : 1; + } else if (expectEXT(e == 33, false)) { corrupt = true; return 0; } - state += 21; - int a = 1; - for (int i = e - 1; i >= 0; i--) { + for (int i = e + 20; i >= 22; i--) { a <<= 1; - a |= int(get_rac(c, state + min(i, 9))); // 22..31 + a |= int(READ(c, min(i, 31))); } - return get_rac(c, state - 11 + min(e, 10)) ? -a : a; + return READ(c, min(e + 10, 21)) ? 
-a : a; } void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int bits) @@ -157,7 +160,7 @@ void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int b } void decode_line(inout SliceContext sc, ivec2 sp, int w, - int y, int p, int bits, uint64_t state, + int y, int p, int bits, uint state_off, uint8_t quant_table_idx, const int run_index) { #ifndef RGB @@ -171,19 +174,33 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w, ivec2 pr = get_pred(sp, ivec2(x, y), p, w, quant_table_idx); - int diff = get_isymbol(sc.c, state + CONTEXT_SIZE*abs(pr[0])); - if (pr[0] < 0) - diff = -diff; + uint context_off = state_off + CONTEXT_SIZE*abs(pr[0]); +#ifdef CACHED_SYMBOL_READER + u8buf sb = u8buf(uint64_t(slice_state) + context_off + gl_LocalInvocationID.x); + state[gl_LocalInvocationID.x] = sb.v; + barrier(); + if (gl_LocalInvocationID.x == 0) { - uint v = zero_extend(pr[1] + diff, bits); - imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v)); +#endif + + int diff = get_isymbol(sc.c, context_off); + if (pr[0] < 0) + diff = -diff; + + uint v = zero_extend(pr[1] + diff, bits); + imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v)); + +#ifdef CACHED_SYMBOL_READER + } + sb.v = state[gl_LocalInvocationID.x]; +#endif } } #else /* GOLOMB */ void decode_line(inout SliceContext sc, ivec2 sp, int w, - int y, int p, int bits, uint64_t state, + int y, int p, int bits, uint state_off, uint8_t quant_table_idx, inout int run_index) { #ifndef RGB @@ -202,7 +219,7 @@ void decode_line(inout SliceContext sc, ivec2 sp, int w, ivec2 pr = get_pred(sp, ivec2(x, y), p, w, quant_table_idx); - VlcState sb = VlcState(state + VLC_STATE_SIZE*abs(pr[0])); + VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*abs(pr[0])); if (pr[0] == 0 && run_mode == 0) run_mode = 1; @@ -263,7 +280,7 @@ ivec4 transform_sample(ivec4 pix, ivec2 rct_coef) void writeout_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct) { - for (int x = 0; x 
< w; x++) { + for (uint x = gl_LocalInvocationID.x; x < w; x += gl_WorkGroupSize.x) { ivec2 lpos = sp + LADDR(ivec2(x, y)); ivec2 pos = sc.slice_pos + ivec2(x, y); @@ -305,6 +322,8 @@ void decode_slice(inout SliceContext sc, const uint slice_idx) /* PCM coding */ #ifndef GOLOMB if (sc.slice_coding_mode == 1) { + if (gl_LocalInvocationID.x > 0) + return; #ifndef RGB for (int p = 0; p < planes; p++) { int h = sc.slice_dim.y; @@ -328,9 +347,7 @@ void decode_slice(inout SliceContext sc, const uint slice_idx) #endif { u8vec4 quant_table_idx = sc.quant_table_idx.xyyz; - u64vec4 slice_state_off = (uint64_t(slice_state) + - slice_idx*plane_state_size*codec_planes) + - plane_state_size*uvec4(0, 1, 1, 2); + u32vec4 slice_state_off = (slice_idx*codec_planes + uvec4(0, 1, 1, 2))*plane_state_size; #ifndef RGB for (int p = 0; p < planes; p++) { diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c index c1875711bc..33c4e9114d 100644 --- a/libavcodec/vulkan_ffv1.c +++ b/libavcodec/vulkan_ffv1.c @@ -823,12 +823,14 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s, uint8_t *spv_data; size_t spv_len; void *spv_opaque = NULL; + int use_cached_reader = ac != AC_GOLOMB_RICE && + s->driver_props.driverID == VK_DRIVER_ID_MESA_RADV; RET(ff_vk_shader_init(s, shd, "ffv1_dec", VK_SHADER_STAGE_COMPUTE_BIT, (const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2, - 1, 1, 1, + use_cached_reader ? 
32 : 1, 1, 1, 0)); if (ac == AC_GOLOMB_RICE) @@ -837,6 +839,9 @@ static int init_decode_shader(FFV1Context *f, FFVulkanContext *s, if (rgb) av_bprintf(&shd->src, "#define RGB\n"); + if (use_cached_reader) + av_bprintf(&shd->src, "#define CACHED_SYMBOL_READER 1\n"); + /* Common codec header */ GLSLD(ff_source_common_comp); -- 2.47.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".