The branch, master has been updated
via 62d43ba2e3c5832cd70c2e852e0e416c7c74fb02 (commit)
via e8213f766f7dfadb258081b2fc8a6a6207e6cd02 (commit)
via 7d65ce776311a75fc0f370143de04a70fa6eff71 (commit)
via 26dee5b43ee4831263e37eaea9e51a9ec8f3a34c (commit)
via 71ff349cc1a01484e638dd682dfe3a6aafeacd51 (commit)
via 2e12b3251d555e809b2cb3a61dc9a2e4adc44b80 (commit)
via 3fac2d85933c75b936884d6b7d3e14354ed65244 (commit)
from 36896af64a9ecf6835758186e7161a2e954c364d (commit)
- Log -----------------------------------------------------------------
commit 62d43ba2e3c5832cd70c2e852e0e416c7c74fb02
Author: Michael Yang <[email protected]>
AuthorDate: Fri Oct 17 08:00:13 2025 +1100
Commit: Lynne <[email protected]>
CommitDate: Thu Oct 16 21:32:43 2025 +0000
libavfilter/vf_nlmeans_vulkan: fix str defaults
Revert back to NAN, as -1.0 was erroneously clamped to 0.0 to fit in the
options range.
Add special handling of str as requested.
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index 3803c493b8..b69e8ac0a2 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -34,9 +34,6 @@
#define TYPE_BLOCK_SIZE (TYPE_SIZE * TYPE_BLOCK_ELEMS)
#define WG_SIZE 32
-// prevent macro expansion in GLSL
-#undef isinf
-
typedef struct NLMeansVulkanContext {
FFVulkanContext vkctx;
@@ -160,7 +157,7 @@ static av_cold int init_integral_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *
GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.y);
);
GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z);
);
GLSLC(0,
);
- GLSLC(1, if (isinf(strength[comp_idx]))
);
+ GLSLC(1, if (strength[comp_idx] == 0.0)
);
GLSLC(2, return;
);
GLSLC(0,
);
GLSLC(1, offset = integral_size * (invoc_idx * nb_components +
comp_idx); );
@@ -245,7 +242,7 @@ static av_cold int init_integral_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *
GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.y);
);
GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z);
);
GLSLC(0,
);
- GLSLC(1, if (isinf(strength[comp_idx]))
);
+ GLSLC(1, if (strength[comp_idx] == 0.0)
);
GLSLC(2, return;
);
GLSLC(0,
);
GLSLC(1, offset = integral_size * (invoc_idx * nb_components +
comp_idx); );
@@ -417,7 +414,7 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, c_plane = comp_plane[comp_idx];
);
GLSLC(1, p = patch_size[comp_idx];
);
GLSLC(1, s = strength[comp_idx];
);
- GLSLC(1, if (isinf(s) || pos.x < p || pos.y < p || pos.x >=
width[c_plane] - p || pos.y >= height[c_plane] - p) );
+ GLSLC(1, if (s == 0.0 || pos.x < p || pos.y < p || pos.x >=
width[c_plane] - p || pos.y >= height[c_plane] - p) );
GLSLC(2, return;
);
GLSLC(0,
);
GLSLC(1, offset = integral_size * (invoc_idx * nb_components +
comp_idx); );
@@ -635,12 +632,16 @@ static av_cold int init_filter(AVFilterContext *ctx)
}
for (int i = 0; i < 4; i++) {
- double str = s->opts.sc[i] != -1.0 ? s->opts.sc[i] : s->opts.s;
+ double str = !isnan(s->opts.sc[i]) ? s->opts.sc[i] : s->opts.s;
int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);
- str = 10.0f*str;
- str *= -str;
- str = 255.0*255.0 / str;
- s->strength[i] = str;
+ if (str == 0.0) {
+ s->strength[i] = 0.0;
+ } else {
+ str = 10.0f*str;
+ str *= -str;
+ str = 255.0*255.0 / str;
+ s->strength[i] = str;
+ }
if (!(ps & 1)) {
ps |= 1;
av_log(ctx, AV_LOG_WARNING, "Patch size should be odd, setting to
%i",
@@ -1178,10 +1179,10 @@ static const AVOption nlmeans_vulkan_options[] = {
{ "r", "research window size", OFFSET(opts.r), AV_OPT_TYPE_INT, { .i64 =
7*2+1 }, 0, 99, FLAGS },
{ "t", "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 8 }, 1,
64, FLAGS },
- { "s1", "denoising strength for component 1", OFFSET(opts.sc[0]),
AV_OPT_TYPE_DOUBLE, { .dbl = -1.0 }, 0.0, 100.0, FLAGS },
- { "s2", "denoising strength for component 2", OFFSET(opts.sc[1]),
AV_OPT_TYPE_DOUBLE, { .dbl = -1.0 }, 0.0, 100.0, FLAGS },
- { "s3", "denoising strength for component 3", OFFSET(opts.sc[2]),
AV_OPT_TYPE_DOUBLE, { .dbl = -1.0 }, 0.0, 100.0, FLAGS },
- { "s4", "denoising strength for component 4", OFFSET(opts.sc[3]),
AV_OPT_TYPE_DOUBLE, { .dbl = -1.0 }, 0.0, 100.0, FLAGS },
+ { "s1", "denoising strength for component 1", OFFSET(opts.sc[0]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
+ { "s2", "denoising strength for component 2", OFFSET(opts.sc[1]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
+ { "s3", "denoising strength for component 3", OFFSET(opts.sc[2]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
+ { "s4", "denoising strength for component 4", OFFSET(opts.sc[3]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
{ "p1", "patch size for component 1", OFFSET(opts.pc[0]), AV_OPT_TYPE_INT,
{ .i64 = 0 }, 0, 99, FLAGS },
{ "p2", "patch size for component 2", OFFSET(opts.pc[1]), AV_OPT_TYPE_INT,
{ .i64 = 0 }, 0, 99, FLAGS },
commit e8213f766f7dfadb258081b2fc8a6a6207e6cd02
Author: Michael Yang <[email protected]>
AuthorDate: Thu Oct 16 11:42:10 2025 +1100
Commit: Lynne <[email protected]>
CommitDate: Thu Oct 16 21:32:43 2025 +0000
libavfilter/vf_nlmeans_vulkan: amend doc
diff --git a/doc/filters.texi b/doc/filters.texi
index 5863041d1a..259162f6b7 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -29101,7 +29101,7 @@ The filter accepts the following options.
@table @option
@item s
-Set denoising strength for all components. Default is 1.0. Must be in range
[1.0, 100.0].
+Set denoising strength for all components. Default is 1.0. Must be in range
[0.0, 100.0].
@item p
Set patch size for all planes. Default is 7. Must be odd number in range [0,
99].
@@ -29110,17 +29110,16 @@ Set patch size for all planes. Default is 7. Must be
odd number in range [0, 99]
Set research size. Default is 15. Must be odd number in range [0, 99].
@item t
-Set parallelism. Default is 36. Must be a number in the range [1, 168].
-Larger values may speed up processing, at the cost of more VRAM.
-Lower values will slow it down, reducing VRAM usage.
-Only supported on GPUs with atomic float operations (RDNA3+, Ampere+).
+Set parallelism. Default is 8. Must be a number in the range [1, 64].
+Larger values will use more VRAM but may not result in greater speed.
+The optimal value is hardware and input dependent.
@item s0
@item s1
@item s2
@item s3
-Set denoising strength for a specific component. Default is @var{1}, equal to
@option{s}.
-Must be odd number in range [1, 100].
+Set denoising strength for a specific component. Default is @var{1.0}, equal
to @option{s}.
+Must be in range [0.0, 100.0]. 0.0 disables denoising in that component.
@item p0
@item p1
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index e69a0883cc..3803c493b8 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -1175,7 +1175,7 @@ static void nlmeans_vulkan_uninit(AVFilterContext *avctx)
static const AVOption nlmeans_vulkan_options[] = {
{ "s", "denoising strength for all components", OFFSET(opts.s),
AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 0.0, 100.0, FLAGS },
{ "p", "patch size for all components", OFFSET(opts.p), AV_OPT_TYPE_INT,
{ .i64 = 3*2+1 }, 0, 99, FLAGS },
- { "r", "research window radius", OFFSET(opts.r), AV_OPT_TYPE_INT, { .i64
= 7*2+1 }, 0, 99, FLAGS },
+ { "r", "research window size", OFFSET(opts.r), AV_OPT_TYPE_INT, { .i64 =
7*2+1 }, 0, 99, FLAGS },
{ "t", "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 8 }, 1,
64, FLAGS },
{ "s1", "denoising strength for component 1", OFFSET(opts.sc[0]),
AV_OPT_TYPE_DOUBLE, { .dbl = -1.0 }, 0.0, 100.0, FLAGS },
commit 7d65ce776311a75fc0f370143de04a70fa6eff71
Author: Michael Yang <[email protected]>
AuthorDate: Wed Oct 15 17:55:56 2025 +1100
Commit: Lynne <[email protected]>
CommitDate: Thu Oct 16 21:32:43 2025 +0000
libavfilter/vf_nlmeans_vulkan: clean up defaults
Change per-plane strength defaults to -1.0.
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index a3536eaaf6..e69a0883cc 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -34,6 +34,7 @@
#define TYPE_BLOCK_SIZE (TYPE_SIZE * TYPE_BLOCK_ELEMS)
#define WG_SIZE 32
+// prevent macro expansion in GLSL
#undef isinf
typedef struct NLMeansVulkanContext {
@@ -416,7 +417,7 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, c_plane = comp_plane[comp_idx];
);
GLSLC(1, p = patch_size[comp_idx];
);
GLSLC(1, s = strength[comp_idx];
);
- GLSLC(1, if (pos.y < p || pos.y >= height[c_plane] - p || pos.x < p ||
pos.x >= width[c_plane] - p || isinf(s)) );
+ GLSLC(1, if (isinf(s) || pos.x < p || pos.y < p || pos.x >=
width[c_plane] - p || pos.y >= height[c_plane] - p) );
GLSLC(2, return;
);
GLSLC(0,
);
GLSLC(1, offset = integral_size * (invoc_idx * nb_components +
comp_idx); );
@@ -424,7 +425,7 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
for (int i = 0; i < TYPE_ELEMS; i++)
GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i];
,i,TYPE_ELEMS,i);
GLSLC(0,
);
- GLSLC(1, ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y *
ws_stride[comp_idx]; );
+ GLSLC(1, ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y *
ws_stride[comp_idx] + pos.x; );
GLSLC(1, size = imageSize(input_img[c_plane]);
);
GLSLC(0,
);
GLSLC(1, DTYPE a;
);
@@ -460,8 +461,8 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, w_sum = w[0] + w[1] + w[2] + w[3];
);
GLSLC(1, sum = dot(w, src * 255);
);
GLSLC(0,
);
- GLSLC(1, weights[ws_off + pos.x] += w_sum;
);
- GLSLC(1, sums[ws_off + pos.x] += sum;
);
+ GLSLC(1, weights[ws_off] += w_sum;
);
+ GLSLC(1, sums[ws_off] += sum;
);
GLSLC(0, }
);
RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main",
&spv_opaque));
@@ -634,7 +635,7 @@ static av_cold int init_filter(AVFilterContext *ctx)
}
for (int i = 0; i < 4; i++) {
- double str = !isnan(s->opts.sc[i]) ? s->opts.sc[i] : s->opts.s;
+ double str = s->opts.sc[i] != -1.0 ? s->opts.sc[i] : s->opts.s;
int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);
str = 10.0f*str;
str *= -str;
@@ -1177,10 +1178,10 @@ static const AVOption nlmeans_vulkan_options[] = {
{ "r", "research window radius", OFFSET(opts.r), AV_OPT_TYPE_INT, { .i64
= 7*2+1 }, 0, 99, FLAGS },
{ "t", "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 8 }, 1,
64, FLAGS },
- { "s1", "denoising strength for component 1", OFFSET(opts.sc[0]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
- { "s2", "denoising strength for component 2", OFFSET(opts.sc[1]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
- { "s3", "denoising strength for component 3", OFFSET(opts.sc[2]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
- { "s4", "denoising strength for component 4", OFFSET(opts.sc[3]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
+ { "s1", "denoising strength for component 1", OFFSET(opts.sc[0]),
AV_OPT_TYPE_DOUBLE, { .dbl = -1.0 }, 0.0, 100.0, FLAGS },
+ { "s2", "denoising strength for component 2", OFFSET(opts.sc[1]),
AV_OPT_TYPE_DOUBLE, { .dbl = -1.0 }, 0.0, 100.0, FLAGS },
+ { "s3", "denoising strength for component 3", OFFSET(opts.sc[2]),
AV_OPT_TYPE_DOUBLE, { .dbl = -1.0 }, 0.0, 100.0, FLAGS },
+ { "s4", "denoising strength for component 4", OFFSET(opts.sc[3]),
AV_OPT_TYPE_DOUBLE, { .dbl = -1.0 }, 0.0, 100.0, FLAGS },
{ "p1", "patch size for component 1", OFFSET(opts.pc[0]), AV_OPT_TYPE_INT,
{ .i64 = 0 }, 0, 99, FLAGS },
{ "p2", "patch size for component 2", OFFSET(opts.pc[1]), AV_OPT_TYPE_INT,
{ .i64 = 0 }, 0, 99, FLAGS },
commit 26dee5b43ee4831263e37eaea9e51a9ec8f3a34c
Author: Michael Yang <[email protected]>
AuthorDate: Sun Oct 12 19:00:05 2025 +1100
Commit: Lynne <[email protected]>
CommitDate: Thu Oct 16 21:32:43 2025 +0000
libavfilter/vf_nlmeans_vulkan: reverse img_bar
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index b0c4887763..a3536eaaf6 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -824,7 +824,7 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link,
AVFrame *in)
FFVkExecContext *exec;
VkImageView in_views[AV_NUM_DATA_POINTERS];
VkImageView out_views[AV_NUM_DATA_POINTERS];
- VkImageMemoryBarrier2 img_bar[2];
+ VkImageMemoryBarrier2 img_bar[8];
int nb_img_bar = 0;
VkBufferMemoryBarrier2 buf_bar[2];
int nb_buf_bar = 0;
commit 71ff349cc1a01484e638dd682dfe3a6aafeacd51
Author: Michael Yang <[email protected]>
AuthorDate: Sun Oct 12 18:38:29 2025 +1100
Commit: Lynne <[email protected]>
CommitDate: Thu Oct 16 21:32:43 2025 +0000
libavfilter/vf_nlmeans_vulkan: lower strength min
Lower (per-component) strength minimum from 1.0 to 0.0, with 0.0 skipping
integral and weights calculations.
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index ad2d63900e..b0c4887763 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -34,6 +34,8 @@
#define TYPE_BLOCK_SIZE (TYPE_SIZE * TYPE_BLOCK_ELEMS)
#define WG_SIZE 32
+#undef isinf
+
typedef struct NLMeansVulkanContext {
FFVulkanContext vkctx;
@@ -70,6 +72,7 @@ typedef struct NLMeansVulkanContext {
typedef struct IntegralPushData {
uint32_t width[4];
uint32_t height[4];
+ float strength[4];
uint32_t comp_off[4];
uint32_t comp_plane[4];
VkDeviceAddress integral_base;
@@ -99,6 +102,7 @@ static void shared_shd_def(FFVulkanShader *shd) {
GLSLC(0, layout(push_constant, std430) uniform pushConstants {
);
GLSLC(1, uvec4 width;
);
GLSLC(1, uvec4 height;
);
+ GLSLC(1, vec4 strength;
);
GLSLC(1, uvec4 comp_off;
);
GLSLC(1, uvec4 comp_plane;
);
GLSLC(1, DataBuffer integral_base;
);
@@ -155,6 +159,9 @@ static av_cold int init_integral_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *
GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.y);
);
GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z);
);
GLSLC(0,
);
+ GLSLC(1, if (isinf(strength[comp_idx]))
);
+ GLSLC(2, return;
);
+ GLSLC(0,
);
GLSLC(1, offset = integral_size * (invoc_idx * nb_components +
comp_idx); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset);
);
GLSLC(0,
);
@@ -237,6 +244,9 @@ static av_cold int init_integral_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *
GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.y);
);
GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z);
);
GLSLC(0,
);
+ GLSLC(1, if (isinf(strength[comp_idx]))
);
+ GLSLC(2, return;
);
+ GLSLC(0,
);
GLSLC(1, offset = integral_size * (invoc_idx * nb_components +
comp_idx); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset);
);
for (int i = 0; i < TYPE_ELEMS; i++)
@@ -389,6 +399,7 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, ivec2 pos;
);
GLSLC(1, ivec2 pos_off;
);
GLSLC(1, int p;
);
+ GLSLC(1, float s;
);
GLSLC(0,
);
GLSLC(1, DataBuffer integral_data;
);
GLSLF(1, ivec2 offs[%i];
,TYPE_ELEMS);
@@ -404,7 +415,8 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, c_off = comp_off[comp_idx];
);
GLSLC(1, c_plane = comp_plane[comp_idx];
);
GLSLC(1, p = patch_size[comp_idx];
);
- GLSLC(1, if (pos.y < p || pos.y >= height[c_plane] - p || pos.x < p ||
pos.x >= width[c_plane] - p) );
+ GLSLC(1, s = strength[comp_idx];
);
+ GLSLC(1, if (pos.y < p || pos.y >= height[c_plane] - p || pos.x < p ||
pos.x >= width[c_plane] - p || isinf(s)) );
GLSLC(2, return;
);
GLSLC(0,
);
GLSLC(1, offset = integral_size * (invoc_idx * nb_components +
comp_idx); );
@@ -444,7 +456,7 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, d = dst.v[pos.x + p];
);
GLSLC(0,
);
GLSLC(1, patch_diff = d + a - b - c;
);
- GLSLC(1, w = exp(patch_diff * strength[comp_idx]);
);
+ GLSLC(1, w = exp(patch_diff * s);
);
GLSLC(1, w_sum = w[0] + w[1] + w[2] + w[3];
);
GLSLC(1, sum = dot(w, src * 255);
);
GLSLC(0,
);
@@ -622,7 +634,7 @@ static av_cold int init_filter(AVFilterContext *ctx)
}
for (int i = 0; i < 4; i++) {
- double str = (s->opts.sc[i] > 1.0) ? s->opts.sc[i] : s->opts.s;
+ double str = !isnan(s->opts.sc[i]) ? s->opts.sc[i] : s->opts.s;
int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);
str = 10.0f*str;
str *= -str;
@@ -969,6 +981,7 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link,
AVFrame *in)
IntegralPushData pd = {
{ plane_widths[0], plane_widths[1], plane_widths[2],
plane_widths[3] },
{ plane_heights[0], plane_heights[1], plane_heights[2],
plane_heights[3] },
+ { s->strength[0], s->strength[1], s->strength[2], s->strength[3],
},
{ comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
{ comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
integral_vk->address,
@@ -1159,15 +1172,15 @@ static void nlmeans_vulkan_uninit(AVFilterContext
*avctx)
#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
static const AVOption nlmeans_vulkan_options[] = {
- { "s", "denoising strength for all components", OFFSET(opts.s),
AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
+ { "s", "denoising strength for all components", OFFSET(opts.s),
AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 0.0, 100.0, FLAGS },
{ "p", "patch size for all components", OFFSET(opts.p), AV_OPT_TYPE_INT,
{ .i64 = 3*2+1 }, 0, 99, FLAGS },
{ "r", "research window radius", OFFSET(opts.r), AV_OPT_TYPE_INT, { .i64
= 7*2+1 }, 0, 99, FLAGS },
{ "t", "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 8 }, 1,
64, FLAGS },
- { "s1", "denoising strength for component 1", OFFSET(opts.sc[0]),
AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
- { "s2", "denoising strength for component 2", OFFSET(opts.sc[1]),
AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
- { "s3", "denoising strength for component 3", OFFSET(opts.sc[2]),
AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
- { "s4", "denoising strength for component 4", OFFSET(opts.sc[3]),
AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
+ { "s1", "denoising strength for component 1", OFFSET(opts.sc[0]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
+ { "s2", "denoising strength for component 2", OFFSET(opts.sc[1]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
+ { "s3", "denoising strength for component 3", OFFSET(opts.sc[2]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
+ { "s4", "denoising strength for component 4", OFFSET(opts.sc[3]),
AV_OPT_TYPE_DOUBLE, { .dbl = NAN }, 0.0, 100.0, FLAGS },
{ "p1", "patch size for component 1", OFFSET(opts.pc[0]), AV_OPT_TYPE_INT,
{ .i64 = 0 }, 0, 99, FLAGS },
{ "p2", "patch size for component 2", OFFSET(opts.pc[1]), AV_OPT_TYPE_INT,
{ .i64 = 0 }, 0, 99, FLAGS },
commit 2e12b3251d555e809b2cb3a61dc9a2e4adc44b80
Author: Michael Yang <[email protected]>
AuthorDate: Sat Oct 11 20:52:05 2025 +1100
Commit: Lynne <[email protected]>
CommitDate: Thu Oct 16 21:32:43 2025 +0000
libavfilter/vf_nlmeans_vulkan: clean up naming
Add `nb_components` to push data.
Rename `ws_total_*` to `ws_*`.
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index bffca4066a..ad2d63900e 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -76,6 +76,7 @@ typedef struct IntegralPushData {
uint64_t integral_size;
uint64_t int_stride;
uint32_t xyoffs_start;
+ uint32_t nb_components;
} IntegralPushData;
static void shared_shd_def(FFVulkanShader *shd) {
@@ -104,6 +105,7 @@ static void shared_shd_def(FFVulkanShader *shd) {
GLSLC(1, uint64_t integral_size;
);
GLSLC(1, uint64_t int_stride;
);
GLSLC(1, uint xyoffs_start;
);
+ GLSLC(1, uint nb_components;
);
GLSLC(0, };
);
GLSLC(0,
);
@@ -150,10 +152,10 @@ static av_cold int init_integral_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *
GLSLC(0,
);
GLSLC(1, uint c_plane;
);
GLSLC(0,
);
- GLSLC(1, int comp_idx = int(gl_WorkGroupID.y);
);
- GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z);
);
+ GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.y);
);
+ GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z);
);
GLSLC(0,
);
- GLSLF(1, offset = integral_size * (invoc_idx * %i + comp_idx);
,desc->nb_components);
+ GLSLC(1, offset = integral_size * (invoc_idx * nb_components +
comp_idx); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset);
);
GLSLC(0,
);
GLSLC(1, c_plane = comp_plane[comp_idx];
);
@@ -232,10 +234,10 @@ static av_cold int init_integral_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *
GLSLC(1, uint c_off;
);
GLSLC(1, uint c_plane;
);
GLSLC(0,
);
- GLSLC(1, int comp_idx = int(gl_WorkGroupID.y);
);
- GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z);
);
+ GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.y);
);
+ GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z);
);
GLSLC(0,
);
- GLSLF(1, offset = integral_size * (invoc_idx * %i + comp_idx);
,desc->nb_components);
+ GLSLC(1, offset = integral_size * (invoc_idx * nb_components +
comp_idx); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset);
);
for (int i = 0; i < TYPE_ELEMS; i++)
GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i];
,i,TYPE_ELEMS,i);
@@ -290,7 +292,8 @@ typedef struct WeightsPushData {
uint64_t integral_size;
uint64_t int_stride;
uint32_t xyoffs_start;
- uint32_t ws_total_count;
+ uint32_t ws_count;
+ uint32_t nb_components;
} WeightsPushData;
static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool
*exec,
@@ -333,7 +336,8 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, uint64_t integral_size;
);
GLSLC(1, uint64_t int_stride;
);
GLSLC(1, uint xyoffs_start;
);
- GLSLC(1, uint ws_total_count;
);
+ GLSLC(1, uint ws_count;
);
+ GLSLC(1, uint nb_components;
);
GLSLC(0, };
);
GLSLC(0,
);
@@ -394,8 +398,8 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, uint ws_off;
);
GLSLC(0,
);
GLSLC(1, pos = ivec2(gl_GlobalInvocationID.xy);
);
- GLSLF(1, int comp_idx = int(gl_WorkGroupID.z) %% %i;
,desc->nb_components);
- GLSLF(1, int invoc_idx = int(gl_WorkGroupID.z) / %i;
,desc->nb_components);
+ GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.z) %% nb_components;
);
+ GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z) / nb_components;
);
GLSLC(0,
);
GLSLC(1, c_off = comp_off[comp_idx];
);
GLSLC(1, c_plane = comp_plane[comp_idx];
);
@@ -403,12 +407,12 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, if (pos.y < p || pos.y >= height[c_plane] - p || pos.x < p ||
pos.x >= width[c_plane] - p) );
GLSLC(2, return;
);
GLSLC(0,
);
- GLSLF(1, offset = integral_size * (invoc_idx * %i + comp_idx);
,desc->nb_components);
+ GLSLC(1, offset = integral_size * (invoc_idx * nb_components +
comp_idx); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset);
);
for (int i = 0; i < TYPE_ELEMS; i++)
GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i];
,i,TYPE_ELEMS,i);
GLSLC(0,
);
- GLSLC(1, ws_off = ws_total_count * invoc_idx + ws_offset[comp_idx] +
pos.y * ws_stride[comp_idx]; );
+ GLSLC(1, ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y *
ws_stride[comp_idx]; );
GLSLC(1, size = imageSize(input_img[c_plane]);
);
GLSLC(0,
);
GLSLC(1, DTYPE a;
);
@@ -465,8 +469,9 @@ typedef struct DenoisePushData {
uint32_t comp_plane[4];
uint32_t ws_offset[4];
uint32_t ws_stride[4];
- uint32_t ws_total_count;
+ uint32_t ws_count;
uint32_t t;
+ uint32_t nb_components;
} DenoisePushData;
static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool
*exec,
@@ -490,8 +495,9 @@ static av_cold int init_denoise_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, uvec4 comp_plane; );
GLSLC(1, uvec4 ws_offset; );
GLSLC(1, uvec4 ws_stride; );
- GLSLC(1, uint32_t ws_total_count; );
+ GLSLC(1, uint32_t ws_count; );
GLSLC(1, uint32_t t; );
+ GLSLC(1, uint32_t nb_components; );
GLSLC(0, }; );
ff_vk_shader_add_push_const(shd, 0, sizeof(DenoisePushData),
@@ -552,19 +558,19 @@ static av_cold int init_denoise_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(1, float sum;
);
GLSLC(1, vec4 src;
);
GLSLC(1, vec4 r;
);
- GLSLC(1, int invoc_idx;
);
- GLSLC(1, int comp_idx;
);
+ GLSLC(1, uint invoc_idx;
);
+ GLSLC(1, uint comp_idx;
);
GLSLC(0,
);
GLSLC(1, if (!IS_WITHIN(pos, size))
);
GLSLC(2, return;
);
GLSLC(0,
);
GLSLC(1, src = imageLoad(input_img[plane], pos);
);
- GLSLF(1, for (comp_idx = 0; comp_idx < %i; comp_idx++) {
,desc->nb_components);
+ GLSLC(1, for (comp_idx = 0; comp_idx < nb_components; comp_idx++) {
);
GLSLC(2, if (plane == comp_plane[comp_idx]) {
);
GLSLC(3, w_sum = 0.0;
);
GLSLC(3, sum = 0.0;
);
GLSLC(3, for (invoc_idx = 0; invoc_idx < t; invoc_idx++) {
);
- GLSLC(4, ws_off = ws_total_count * invoc_idx +
ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; );
+ GLSLC(4, ws_off = ws_count * invoc_idx +
ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; );
GLSLC(4, w_sum += weights[ws_off];
);
GLSLC(4, sum += sums[ws_off];
);
GLSLC(3, }
);
@@ -716,7 +722,7 @@ fail:
static int denoise_pass(NLMeansVulkanContext *s, FFVkExecContext *exec,
FFVkBuffer *ws_vk, uint32_t comp_offs[4], uint32_t
comp_planes[4],
uint32_t ws_offset[4], uint32_t ws_stride[4],
- uint32_t ws_total_count, int t)
+ uint32_t ws_count, uint32_t t, uint32_t nb_components)
{
FFVulkanContext *vkctx = &s->vkctx;
FFVulkanFunctions *vk = &vkctx->vkfn;
@@ -728,8 +734,9 @@ static int denoise_pass(NLMeansVulkanContext *s,
FFVkExecContext *exec,
{ comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
{ ws_offset[0], ws_offset[1], ws_offset[2], ws_offset[3] },
{ ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
- ws_total_count,
+ ws_count,
t,
+ nb_components,
};
/* Denoise pass pipeline */
@@ -797,15 +804,15 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
/* Weights/sums */
AVBufferRef *ws_buf = NULL;
FFVkBuffer *ws_vk;
- uint32_t ws_total_count = 0;
+ uint32_t ws_count = 0;
uint32_t ws_offset[4];
uint32_t ws_stride[4];
- size_t ws_total_size;
+ size_t ws_size;
FFVkExecContext *exec;
VkImageView in_views[AV_NUM_DATA_POINTERS];
VkImageView out_views[AV_NUM_DATA_POINTERS];
- VkImageMemoryBarrier2 img_bar[8];
+ VkImageMemoryBarrier2 img_bar[2];
int nb_img_bar = 0;
VkBufferMemoryBarrier2 buf_bar[2];
int nb_buf_bar = 0;
@@ -832,11 +839,11 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
comp_planes[i] = desc->comp[i].plane;
ws_stride[i] = plane_widths[i];
- ws_offset[i] = ws_total_count;
- ws_total_count += ws_stride[i] * plane_heights[i];
+ ws_offset[i] = ws_count;
+ ws_count += ws_stride[i] * plane_heights[i];
}
- ws_total_size = ws_total_count * sizeof(float);
+ ws_size = ws_count * sizeof(float);
/* Buffers */
err = ff_vk_get_pooled_buffer(&s->vkctx, &s->integral_buf_pool,
&integral_buf,
@@ -854,7 +861,7 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link,
AVFrame *in)
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
NULL,
- ws_total_size * s-> opts.t * 2,
+ ws_size * s-> opts.t * 2,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
if (err < 0)
return err;
@@ -937,10 +944,10 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
ff_vk_shader_update_img_array(vkctx, exec, &s->shd_weights, in, in_views,
0, 0,
VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights, 0,
1, 0,
- ws_vk, 0, ws_total_size * s-> opts.t,
+ ws_vk, 0, ws_size * s-> opts.t,
VK_FORMAT_UNDEFINED));
RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights, 0,
2, 0,
- ws_vk, ws_total_size * s-> opts.t,
ws_total_size * s-> opts.t,
+ ws_vk, ws_size * s-> opts.t, ws_size *
s-> opts.t,
VK_FORMAT_UNDEFINED));
/* Update denoise descriptors */
@@ -949,10 +956,10 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
ff_vk_shader_update_img_array(vkctx, exec, &s->shd_denoise, out,
out_views, 0, 1,
VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise, 1,
0, 0,
- ws_vk, 0, ws_total_size * s-> opts.t,
+ ws_vk, 0, ws_size * s-> opts.t,
VK_FORMAT_UNDEFINED));
RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise, 1,
1, 0,
- ws_vk, ws_total_size * s-> opts.t,
ws_total_size * s-> opts.t,
+ ws_vk, ws_size * s-> opts.t, ws_size *
s-> opts.t,
VK_FORMAT_UNDEFINED));
do {
@@ -968,6 +975,7 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link,
AVFrame *in)
(uint64_t)int_size,
(uint64_t)int_stride,
offsets_dispatched,
+ desc->nb_components,
};
ff_vk_exec_bind_shader(vkctx, exec, &s->shd_vertical);
@@ -997,8 +1005,10 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
integral_vk->access = buf_bar[0].dstAccessMask;
/* End of vertical pass */
- vk->CmdDispatch(exec->buf, FFALIGN(vkctx->output_width,
s->shd_vertical.lg_size[0])/s->shd_vertical.lg_size[0],
- desc->nb_components, wg_invoc);
+ vk->CmdDispatch(exec->buf,
+ FFALIGN(vkctx->output_width,
s->shd_vertical.lg_size[0])/s->shd_vertical.lg_size[0],
+ desc->nb_components,
+ wg_invoc);
ff_vk_exec_bind_shader(vkctx, exec, &s->shd_horizontal);
ff_vk_shader_update_push_const(vkctx, exec, &s->shd_horizontal,
@@ -1028,8 +1038,10 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
integral_vk->access = buf_bar[0].dstAccessMask;
/* End of horizontal pass */
- vk->CmdDispatch(exec->buf, FFALIGN(vkctx->output_height,
s->shd_horizontal.lg_size[0])/s->shd_horizontal.lg_size[0],
- desc->nb_components, wg_invoc);
+ vk->CmdDispatch(exec->buf,
+ FFALIGN(vkctx->output_height,
s->shd_horizontal.lg_size[0])/s->shd_horizontal.lg_size[0],
+ desc->nb_components,
+ wg_invoc);
/* Weights pipeline */
WeightsPushData wpd = {
@@ -1045,7 +1057,8 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
(uint64_t)int_size,
(uint64_t)int_stride,
offsets_dispatched,
- ws_total_count,
+ ws_count,
+ desc->nb_components,
};
ff_vk_exec_bind_shader(vkctx, exec, &s->shd_weights);
@@ -1099,7 +1112,7 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
} while (offsets_dispatched < s->nb_offsets);
RET(denoise_pass(s, exec, ws_vk, comp_offs, comp_planes, ws_offset,
ws_stride,
- ws_total_count, s->opts.t));
+ ws_count, s->opts.t, desc->nb_components));
err = ff_vk_exec_submit(vkctx, exec);
if (err < 0)
commit 3fac2d85933c75b936884d6b7d3e14354ed65244
Author: Michael Yang <[email protected]>
AuthorDate: Tue Sep 30 12:40:21 2025 +1000
Commit: Lynne <[email protected]>
CommitDate: Thu Oct 16 21:32:43 2025 +0000
avfilter/vf_nlmeans_vulkan: rewrite filter
This is a major rewrite of the existing nlmeans vulkan code, with bug
fixes and major performance improvements.
Fix visual artifacts found in tickets #10661 and #10733. Add OOB checks for
image loading and patch sized area around the border. Correct chroma
plane height, strength and buffer barrier index.
Improve parallelism with component workgroup axis and more but smaller
workgroups. Split weights pass into vertical/horizontal (integral) and
weights passes. Remove h/v order logic to always calculate sum on
vertical pass. Remove atomic float requirement, which causes high memory
locking contentions, at the cost of higher memory usage of w/s buffer.
Use cache blocking in h pass to reduce memory bandwidth usage.
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index 22a2a73eae..bffca4066a 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -30,6 +30,9 @@
#define TYPE_NAME "vec4"
#define TYPE_ELEMS 4
#define TYPE_SIZE (TYPE_ELEMS*4)
+#define TYPE_BLOCK_ELEMS 16
+#define TYPE_BLOCK_SIZE (TYPE_SIZE * TYPE_BLOCK_ELEMS)
+#define WG_SIZE 32
typedef struct NLMeansVulkanContext {
FFVulkanContext vkctx;
@@ -43,7 +46,8 @@ typedef struct NLMeansVulkanContext {
FFVkBuffer xyoffsets_buf;
- int pl_weights_rows;
+ FFVulkanShader shd_horizontal;
+ FFVulkanShader shd_vertical;
FFVulkanShader shd_weights;
FFVulkanShader shd_denoise;
@@ -63,189 +67,251 @@ typedef struct NLMeansVulkanContext {
} opts;
} NLMeansVulkanContext;
-static void insert_first(FFVulkanShader *shd, int r, const char *off, int
horiz, int plane, int comp)
-{
- GLSLF(4, s1 = imageLoad(input_img[%i], pos + ivec2(%i + %s, %i +
%s))[%i];
- ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ?
off : "0", comp);
-
- GLSLF(4, s2[0] = imageLoad(input_img[%i], pos + offs[0] + ivec2(%i + %s,
%i + %s))[%i];
- ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ?
off : "0", comp);
- GLSLF(4, s2[1] = imageLoad(input_img[%i], pos + offs[1] + ivec2(%i + %s,
%i + %s))[%i];
- ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ?
off : "0", comp);
- GLSLF(4, s2[2] = imageLoad(input_img[%i], pos + offs[2] + ivec2(%i + %s,
%i + %s))[%i];
- ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ?
off : "0", comp);
- GLSLF(4, s2[3] = imageLoad(input_img[%i], pos + offs[3] + ivec2(%i + %s,
%i + %s))[%i];
- ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ?
off : "0", comp);
-
- GLSLC(4, s2 = (s1 - s2) * (s1 - s2);
);
-}
+typedef struct IntegralPushData {
+ uint32_t width[4];
+ uint32_t height[4];
+ uint32_t comp_off[4];
+ uint32_t comp_plane[4];
+ VkDeviceAddress integral_base;
+ uint64_t integral_size;
+ uint64_t int_stride;
+ uint32_t xyoffs_start;
+} IntegralPushData;
-static void insert_horizontal_pass(FFVulkanShader *shd, int nb_rows, int
first, int plane, int comp)
-{
- GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i;
,nb_rows);
- if (!first)
- GLSLC(1, barrier();
);
- GLSLC(0,
);
- GLSLF(1, if (pos.y < height[%i]) {
,plane);
- GLSLC(2, #pragma unroll(1)
);
- GLSLF(2, for (r = 0; r < %i; r++) {
,nb_rows);
- GLSLC(3, prefix_sum = DTYPE(0);
);
- GLSLC(3, offset = int_stride * uint64_t(pos.y + r);
);
- GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset);
);
- GLSLC(0,
);
- GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) {
,plane);
- if (first)
- insert_first(shd, 0, "r", 0, plane, comp);
- else
- GLSLC(4, s2 = dst.v[pos.x];
);
- GLSLC(4, dst.v[pos.x] = s2 + prefix_sum;
);
- GLSLC(4, prefix_sum += s2;
);
- GLSLC(3, }
);
- GLSLC(2, }
);
- GLSLC(1, }
);
- GLSLC(0,
);
-}
+static void shared_shd_def(FFVulkanShader *shd) {
+ GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require
);
+ GLSLC(0,
);
+ GLSLF(0, #define DTYPE %s
,TYPE_NAME);
+ GLSLF(0, #define T_ALIGN %i
,TYPE_SIZE);
+ GLSLF(0, #define T_BLOCK_ELEMS %i
,TYPE_BLOCK_ELEMS);
+ GLSLF(0, #define T_BLOCK_ALIGN %i
,TYPE_BLOCK_SIZE);
+ GLSLC(0,
);
+ GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer
DataBuffer { );
+ GLSLC(1, DTYPE v[];
);
+ GLSLC(0, };
);
+ GLSLC(0, struct Block {
);
+ GLSLC(1, DTYPE data[T_BLOCK_ELEMS];
);
+ GLSLC(0, };
);
+ GLSLC(0, layout(buffer_reference, buffer_reference_align = T_BLOCK_ALIGN)
buffer BlockBuffer { );
+ GLSLC(1, Block v[];
);
+ GLSLC(0, };
);
+ GLSLC(0, layout(push_constant, std430) uniform pushConstants {
);
+ GLSLC(1, uvec4 width;
);
+ GLSLC(1, uvec4 height;
);
+ GLSLC(1, uvec4 comp_off;
);
+ GLSLC(1, uvec4 comp_plane;
);
+ GLSLC(1, DataBuffer integral_base;
);
+ GLSLC(1, uint64_t integral_size;
);
+ GLSLC(1, uint64_t int_stride;
);
+ GLSLC(1, uint xyoffs_start;
);
+ GLSLC(0, };
);
+ GLSLC(0,
);
-static void insert_vertical_pass(FFVulkanShader *shd, int nb_rows, int first,
int plane, int comp)
-{
- GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i;
,nb_rows);
- GLSLC(1, #pragma unroll(1)
);
- GLSLF(1, for (r = 0; r < %i; r++)
,nb_rows);
- GLSLC(2, psum[r] = DTYPE(0);
);
- GLSLC(0,
);
- if (!first)
- GLSLC(1, barrier();
);
- GLSLC(0,
);
- GLSLF(1, if (pos.x < width[%i]) {
,plane);
- GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) {
,plane);
- GLSLC(3, offset = int_stride * uint64_t(pos.y);
);
- GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset);
);
- GLSLC(0,
);
- GLSLC(3, #pragma unroll(1)
);
- GLSLF(3, for (r = 0; r < %i; r++) {
,nb_rows);
- if (first)
- insert_first(shd, 0, "r", 1, plane, comp);
- else
- GLSLC(4, s2 = dst.v[pos.x + r];
);
- GLSLC(4, dst.v[pos.x + r] = s2 + psum[r];
);
- GLSLC(4, psum[r] += s2;
);
- GLSLC(3, }
);
- GLSLC(2, }
);
- GLSLC(1, }
);
- GLSLC(0,
);
+ ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData),
+ VK_SHADER_STAGE_COMPUTE_BIT);
}
-static void insert_weights_pass(FFVulkanShader *shd, int nb_rows, int vert,
- int t, int dst_comp, int plane, int comp)
+static av_cold int init_integral_pipeline(FFVulkanContext *vkctx, FFVkExecPool
*exec,
+ FFVulkanShader *shd_horizontal,
+ FFVulkanShader *shd_vertical,
+ FFVkSPIRVCompiler *spv,
+ const AVPixFmtDescriptor *desc, int
planes)
{
- GLSLF(1, p = patch_size[%i];
,dst_comp);
- GLSLC(0,
);
- GLSLC(1, barrier();
);
- GLSLC(0,
);
- if (!vert) {
- GLSLF(1, for (pos.y = 0; pos.y < height[%i]; pos.y++) {
,plane);
- GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i])
,nb_rows, plane);
- GLSLC(3, break;
);
- GLSLF(2, for (r = 0; r < %i; r++) {
,nb_rows);
- GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r;
,nb_rows);
- } else {
- GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) {
,plane);
- GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i])
,nb_rows, plane);
- GLSLC(3, break;
);
- GLSLF(2, for (r = 0; r < %i; r++) {
,nb_rows);
- GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r;
,nb_rows);
- }
- GLSLC(0,
);
- GLSLC(3, a = DTYPE(0);
);
- GLSLC(3, b = DTYPE(0);
);
- GLSLC(3, c = DTYPE(0);
);
- GLSLC(3, d = DTYPE(0);
);
- GLSLC(0,
);
- GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0);
);
- GLSLC(0,
);
- GLSLF(3, src[0] = imageLoad(input_img[%i], pos + offs[0])[%i];
,plane, comp);
- GLSLF(3, src[1] = imageLoad(input_img[%i], pos + offs[1])[%i];
,plane, comp);
- GLSLF(3, src[2] = imageLoad(input_img[%i], pos + offs[2])[%i];
,plane, comp);
- GLSLF(3, src[3] = imageLoad(input_img[%i], pos + offs[3])[%i];
,plane, comp);
- GLSLC(0,
);
- GLSLC(3, if (lt == false) {
);
- GLSLC(3, offset = int_stride * uint64_t(pos.y - p);
);
- GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset);
);
- GLSLC(4, a = dst.v[pos.x - p];
);
- GLSLC(4, c = dst.v[pos.x + p];
);
- GLSLC(3, offset = int_stride * uint64_t(pos.y + p);
);
- GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset);
);
- GLSLC(4, b = dst.v[pos.x - p];
);
- GLSLC(4, d = dst.v[pos.x + p];
);
- GLSLC(3, }
);
- GLSLC(0,
);
- GLSLC(3, patch_diff = d + a - b - c;
);
- GLSLF(3, w = exp(patch_diff * strength[%i]);
,dst_comp);
- GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3];
);
- GLSLC(3, sum = dot(w, src*255);
);
- GLSLC(0,
);
- if (t > 1) {
- GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x],
w_sum); ,dst_comp, dst_comp);
- GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum);
,dst_comp, dst_comp);
- } else {
- GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum;
,dst_comp, dst_comp);
- GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum;
,dst_comp, dst_comp);
+ int err;
+ uint8_t *spv_data;
+ size_t spv_len;
+ void *spv_opaque = NULL;
+ FFVulkanShader *shd;
+ FFVulkanDescriptorSetBinding *desc_set;
+
+ shd = shd_horizontal;
+ RET(ff_vk_shader_init(vkctx, shd, "nlmeans_horizontal",
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ (const char *[]) { "GL_EXT_buffer_reference",
+ "GL_EXT_buffer_reference2" }, 2,
+ WG_SIZE, 1, 1,
+ 0));
+ shared_shd_def(shd);
+
+ GLSLC(0,
);
+ GLSLC(0, void main()
);
+ GLSLC(0, {
);
+ GLSLC(1, uint64_t offset;
);
+ GLSLC(1, DataBuffer dst;
);
+ GLSLC(1, BlockBuffer b_dst;
);
+ GLSLC(1, Block block;
);
+ GLSLC(1, DTYPE s2;
);
+ GLSLC(1, DTYPE prefix_sum;
);
+ GLSLC(1, ivec2 pos;
);
+ GLSLC(1, int k;
);
+ GLSLC(1, int o;
);
+ GLSLC(0,
);
+ GLSLC(1, DataBuffer integral_data;
);
+ GLSLC(0,
);
+ GLSLC(1, uint c_plane;
);
+ GLSLC(0,
);
+ GLSLC(1, int comp_idx = int(gl_WorkGroupID.y);
);
+ GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z);
);
+ GLSLC(0,
);
+ GLSLF(1, offset = integral_size * (invoc_idx * %i + comp_idx);
,desc->nb_components);
+ GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset);
);
+ GLSLC(0,
);
+ GLSLC(1, c_plane = comp_plane[comp_idx];
);
+ GLSLC(0,
);
+ GLSLC(1, pos.y = int(gl_GlobalInvocationID.x);
);
+ GLSLC(1, if (pos.y < height[c_plane]) {
);
+ GLSLC(2, prefix_sum = DTYPE(0);
);
+ GLSLC(2, offset = int_stride * uint64_t(pos.y);
);
+ GLSLC(2, b_dst = BlockBuffer(uint64_t(integral_data) + offset);
);
+ GLSLC(0,
);
+ GLSLC(2, for (k = 0; k * T_BLOCK_ELEMS < width[c_plane]; k++) {
);
+ GLSLC(3, block = b_dst.v[k];
);
+ GLSLC(3, for (o = 0; o < T_BLOCK_ELEMS; o++) {
);
+ GLSLC(4, s2 = block.data[o];
);
+ GLSLC(4, block.data[o] = s2 + prefix_sum;
);
+ GLSLC(4, prefix_sum += s2;
);
+ GLSLC(3, }
);
+ GLSLC(3, b_dst.v[k] = block;
);
+ GLSLC(2, }
);
+ GLSLC(1, }
);
+ GLSLC(0, }
);
+
+ RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main",
&spv_opaque));
+ RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+
+ RET(ff_vk_shader_register_exec(vkctx, exec, shd));
+
+ shd = shd_vertical;
+ RET(ff_vk_shader_init(vkctx, shd, "nlmeans_vertical",
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ (const char *[]) { "GL_EXT_buffer_reference",
+ "GL_EXT_buffer_reference2" }, 2,
+ WG_SIZE, 1, 1,
+ 0));
+ shared_shd_def(shd);
+
+ desc_set = (FFVulkanDescriptorSetBinding []) {
+ {
+ .name = "input_img",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ .mem_layout = ff_vk_shader_rep_fmt(vkctx->input_format,
FF_VK_REP_FLOAT),
+ .mem_quali = "readonly",
+ .dimensions = 2,
+ .elems = planes,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ },
+ };
+ RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 0, 0));
+
+ desc_set = (FFVulkanDescriptorSetBinding []) {
+ {
+ .name = "xyoffsets_buffer",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .mem_quali = "readonly",
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .buf_content = "ivec2 xyoffsets[];",
+ },
+ };
+ RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 1, 0));
+
+ GLSLC(0,
);
+ GLSLC(0, void main()
);
+ GLSLC(0, {
);
+ GLSLC(1, uint64_t offset;
);
+ GLSLC(1, DataBuffer dst;
);
+ GLSLC(1, float s1;
);
+ GLSLC(1, DTYPE s2;
);
+ GLSLC(1, DTYPE prefix_sum;
);
+ GLSLC(1, uvec2 size;
);
+ GLSLC(1, ivec2 pos;
);
+ GLSLC(1, ivec2 pos_off;
);
+ GLSLC(0,
);
+ GLSLC(1, DataBuffer integral_data;
);
+ GLSLF(1, ivec2 offs[%i];
,TYPE_ELEMS);
+ GLSLC(0,
);
+ GLSLC(1, uint c_off;
);
+ GLSLC(1, uint c_plane;
);
+ GLSLC(0,
);
+ GLSLC(1, int comp_idx = int(gl_WorkGroupID.y);
);
+ GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z);
);
+ GLSLC(0,
);
+ GLSLF(1, offset = integral_size * (invoc_idx * %i + comp_idx);
,desc->nb_components);
+ GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset);
);
+ for (int i = 0; i < TYPE_ELEMS; i++)
+ GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i];
,i,TYPE_ELEMS,i);
+ GLSLC(0,
);
+ GLSLC(1, c_off = comp_off[comp_idx];
);
+ GLSLC(1, c_plane = comp_plane[comp_idx];
);
+ GLSLC(1, size = imageSize(input_img[c_plane]);
);
+ GLSLC(0,
);
+ GLSLC(1, pos.x = int(gl_GlobalInvocationID.x);
);
+ GLSLC(1, if (pos.x < width[c_plane]) {
);
+ GLSLC(2, prefix_sum = DTYPE(0);
);
+ GLSLC(2, for (pos.y = 0; pos.y < height[c_plane]; pos.y++) {
);
+ GLSLC(3, offset = int_stride * uint64_t(pos.y);
);
+ GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset);
);
+ GLSLC(4, s1 = imageLoad(input_img[c_plane], pos)[c_off];
);
+ for (int i = 0; i < TYPE_ELEMS; i++) {
+ GLSLF(4, pos_off = pos + offs[%i];
,i);
+ GLSLC(4, if (!IS_WITHIN(uvec2(pos_off), size))
);
+ GLSLF(5, s2[%i] = s1;
,i);
+ GLSLC(4, else
);
+ GLSLF(5, s2[%i] = imageLoad(input_img[c_plane],
pos_off)[c_off]; ,i);
}
- GLSLC(2, }
);
- GLSLC(1, }
);
+ GLSLC(4, s2 = (s1 - s2) * (s1 - s2);
);
+ GLSLC(3, dst.v[pos.x] = s2 + prefix_sum;
);
+ GLSLC(3, prefix_sum += s2;
);
+ GLSLC(2, }
);
+ GLSLC(1, }
);
+ GLSLC(0, }
);
+
+ RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main",
&spv_opaque));
+ RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+
+ RET(ff_vk_shader_register_exec(vkctx, exec, shd));
+
+fail:
+ if (spv_opaque)
+ spv->free_shader(spv, &spv_opaque);
+
+ return err;
}
-typedef struct HorizontalPushData {
+typedef struct WeightsPushData {
uint32_t width[4];
uint32_t height[4];
+ uint32_t ws_offset[4];
uint32_t ws_stride[4];
int32_t patch_size[4];
float strength[4];
+ uint32_t comp_off[4];
+ uint32_t comp_plane[4];
VkDeviceAddress integral_base;
uint64_t integral_size;
uint64_t int_stride;
uint32_t xyoffs_start;
-} HorizontalPushData;
+ uint32_t ws_total_count;
+} WeightsPushData;
static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool
*exec,
FFVulkanShader *shd,
FFVkSPIRVCompiler *spv,
- int width, int height, int t,
const AVPixFmtDescriptor *desc,
- int planes, int *nb_rows)
+ int planes)
{
int err;
uint8_t *spv_data;
size_t spv_len;
void *spv_opaque = NULL;
FFVulkanDescriptorSetBinding *desc_set;
- int max_dim = FFMAX(width, height);
- uint32_t max_wg =
vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
- int wg_size, wg_rows;
-
- /* Round the max workgroup size to the previous power of two */
- wg_size = max_wg;
- wg_rows = 1;
-
- if (max_wg > max_dim) {
- wg_size = max_dim;
- } else if (max_wg < max_dim) {
- /* Make it fit */
- while (wg_size*wg_rows < max_dim)
- wg_rows++;
- }
RET(ff_vk_shader_init(vkctx, shd, "nlmeans_weights",
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference",
"GL_EXT_buffer_reference2" }, 2,
- wg_size, 1, 1,
+ WG_SIZE, WG_SIZE, 1,
0));
- *nb_rows = wg_rows;
-
- if (t > 1)
- GLSLC(0, #extension GL_EXT_shader_atomic_float : require
);
GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require
);
GLSLC(0,
);
GLSLF(0, #define DTYPE %s
,TYPE_NAME);
@@ -254,21 +320,24 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer
DataBuffer { );
GLSLC(1, DTYPE v[];
);
GLSLC(0, };
);
- GLSLC(0,
);
GLSLC(0, layout(push_constant, std430) uniform pushConstants {
);
GLSLC(1, uvec4 width;
);
GLSLC(1, uvec4 height;
);
+ GLSLC(1, uvec4 ws_offset;
);
GLSLC(1, uvec4 ws_stride;
);
GLSLC(1, ivec4 patch_size;
);
GLSLC(1, vec4 strength;
);
+ GLSLC(1, uvec4 comp_off;
);
+ GLSLC(1, uvec4 comp_plane;
);
GLSLC(1, DataBuffer integral_base;
);
GLSLC(1, uint64_t integral_size;
);
GLSLC(1, uint64_t int_stride;
);
GLSLC(1, uint xyoffs_start;
);
+ GLSLC(1, uint ws_total_count;
);
GLSLC(0, };
);
GLSLC(0,
);
- ff_vk_shader_add_push_const(shd, 0, sizeof(HorizontalPushData),
+ ff_vk_shader_add_push_const(shd, 0, sizeof(WeightsPushData),
VK_SHADER_STAGE_COMPUTE_BIT);
desc_set = (FFVulkanDescriptorSetBinding []) {
@@ -282,55 +351,19 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
{
- .name = "weights_buffer_0",
+ .name = "weights_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float weights_0[];",
+ .buf_content = "float weights[];",
},
{
- .name = "sums_buffer_0",
+ .name = "sums_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float sums_0[];",
- },
- {
- .name = "weights_buffer_1",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float weights_1[];",
- },
- {
- .name = "sums_buffer_1",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float sums_1[];",
- },
- {
- .name = "weights_buffer_2",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float weights_2[];",
- },
- {
- .name = "sums_buffer_2",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float sums_2[];",
- },
- {
- .name = "weights_buffer_3",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float weights_3[];",
- },
- {
- .name = "sums_buffer_3",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float sums_3[];",
+ .buf_content = "float sums[];",
},
};
- RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1 +
2*desc->nb_components, 0, 0));
+ RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 3, 0, 0));
desc_set = (FFVulkanDescriptorSetBinding []) {
{
@@ -348,57 +381,71 @@ static av_cold int init_weights_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
GLSLC(0, {
);
GLSLC(1, uint64_t offset;
);
GLSLC(1, DataBuffer dst;
);
- GLSLC(1, float s1;
);
- GLSLC(1, DTYPE s2;
);
- GLSLC(1, DTYPE prefix_sum;
);
- GLSLF(1, DTYPE psum[%i];
,*nb_rows);
- GLSLC(1, int r;
);
+ GLSLC(1, uvec2 size;
);
GLSLC(1, ivec2 pos;
);
+ GLSLC(1, ivec2 pos_off;
);
GLSLC(1, int p;
);
GLSLC(0,
);
GLSLC(1, DataBuffer integral_data;
);
GLSLF(1, ivec2 offs[%i];
,TYPE_ELEMS);
GLSLC(0,
);
- GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z);
);
+ GLSLC(1, uint c_off;
);
+ GLSLC(1, uint c_plane;
);
+ GLSLC(1, uint ws_off;
);
GLSLC(0,
);
- GLSLC(1, offset = integral_size * invoc_idx;
);
+ GLSLC(1, pos = ivec2(gl_GlobalInvocationID.xy);
);
+ GLSLF(1, int comp_idx = int(gl_WorkGroupID.z) %% %i;
,desc->nb_components);
+ GLSLF(1, int invoc_idx = int(gl_WorkGroupID.z) / %i;
,desc->nb_components);
+ GLSLC(0,
);
+ GLSLC(1, c_off = comp_off[comp_idx];
);
+ GLSLC(1, c_plane = comp_plane[comp_idx];
);
+ GLSLC(1, p = patch_size[comp_idx];
);
+ GLSLC(1, if (pos.y < p || pos.y >= height[c_plane] - p || pos.x < p ||
pos.x >= width[c_plane] - p) );
+ GLSLC(2, return;
);
+ GLSLC(0,
);
+ GLSLF(1, offset = integral_size * (invoc_idx * %i + comp_idx);
,desc->nb_components);
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset);
);
for (int i = 0; i < TYPE_ELEMS; i++)
GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i];
,i,TYPE_ELEMS,i);
GLSLC(0,
);
+ GLSLC(1, ws_off = ws_total_count * invoc_idx + ws_offset[comp_idx] +
pos.y * ws_stride[comp_idx]; );
+ GLSLC(1, size = imageSize(input_img[c_plane]);
);
+ GLSLC(0,
);
GLSLC(1, DTYPE a;
);
GLSLC(1, DTYPE b;
);
GLSLC(1, DTYPE c;
);
GLSLC(1, DTYPE d;
);
GLSLC(0,
);
GLSLC(1, DTYPE patch_diff;
);
- if (TYPE_ELEMS == 4) {
- GLSLC(1, vec4 src;
);
- GLSLC(1, vec4 w;
);
- } else {
- GLSLC(1, vec4 src[4];
);
- GLSLC(1, vec4 w[4];
);
- }
+ GLSLC(1, vec4 src;
);
+ GLSLC(1, vec4 w;
);
GLSLC(1, float w_sum;
);
GLSLC(1, float sum;
);
GLSLC(0,
);
- GLSLC(1, bool lt;
);
- GLSLC(1, bool gt;
);
- GLSLC(0,
);
-
- for (int i = 0; i < desc->nb_components; i++) {
- int off = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth, 8)/8);
- if (width >= height) {
- insert_horizontal_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
- insert_vertical_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
- insert_weights_pass(shd, *nb_rows, 0, t, i, desc->comp[i].plane,
off);
- } else {
- insert_vertical_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
- insert_horizontal_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
- insert_weights_pass(shd, *nb_rows, 1, t, i, desc->comp[i].plane,
off);
- }
+ for (int i = 0; i < 4; i++) {
+ GLSLF(1, pos_off = pos + offs[%i];
,i);
+ GLSLC(1, if (!IS_WITHIN(uvec2(pos_off), size))
);
+ GLSLF(2, src[%i] = imageLoad(input_img[c_plane], pos)[c_off];
,i);
+ GLSLC(1, else
);
+ GLSLF(2, src[%i] = imageLoad(input_img[c_plane],
pos_off)[c_off]; ,i);
}
-
+ GLSLC(0,
);
+ GLSLC(1, offset = int_stride * uint64_t(pos.y - p);
);
+ GLSLC(1, dst = DataBuffer(uint64_t(integral_data) + offset);
);
+ GLSLC(1, a = dst.v[pos.x - p];
);
+ GLSLC(1, c = dst.v[pos.x + p];
);
+ GLSLC(1, offset = int_stride * uint64_t(pos.y + p);
);
+ GLSLC(1, dst = DataBuffer(uint64_t(integral_data) + offset);
);
+ GLSLC(1, b = dst.v[pos.x - p];
);
+ GLSLC(1, d = dst.v[pos.x + p];
);
+ GLSLC(0,
);
+ GLSLC(1, patch_diff = d + a - b - c;
);
+ GLSLC(1, w = exp(patch_diff * strength[comp_idx]);
);
+ GLSLC(1, w_sum = w[0] + w[1] + w[2] + w[3];
);
+ GLSLC(1, sum = dot(w, src * 255);
);
+ GLSLC(0,
);
+ GLSLC(1, weights[ws_off + pos.x] += w_sum;
);
+ GLSLC(1, sums[ws_off + pos.x] += sum;
);
GLSLC(0, }
);
RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main",
&spv_opaque));
@@ -414,7 +461,12 @@ fail:
}
typedef struct DenoisePushData {
+ uint32_t comp_off[4];
+ uint32_t comp_plane[4];
+ uint32_t ws_offset[4];
uint32_t ws_stride[4];
+ uint32_t ws_total_count;
+ uint32_t t;
} DenoisePushData;
static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool
*exec,
@@ -426,16 +478,20 @@ static av_cold int init_denoise_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
size_t spv_len;
void *spv_opaque = NULL;
FFVulkanDescriptorSetBinding *desc_set;
-
RET(ff_vk_shader_init(vkctx, shd, "nlmeans_denoise",
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference",
"GL_EXT_buffer_reference2" }, 2,
- 32, 32, 1,
+ WG_SIZE, WG_SIZE, 1,
0));
GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
+ GLSLC(1, uvec4 comp_off; );
+ GLSLC(1, uvec4 comp_plane; );
+ GLSLC(1, uvec4 ws_offset; );
GLSLC(1, uvec4 ws_stride; );
+ GLSLC(1, uint32_t ws_total_count; );
+ GLSLC(1, uint32_t t; );
GLSLC(0, }; );
ff_vk_shader_add_push_const(shd, 0, sizeof(DenoisePushData),
@@ -465,92 +521,58 @@ static av_cold int init_denoise_pipeline(FFVulkanContext
*vkctx, FFVkExecPool *e
desc_set = (FFVulkanDescriptorSetBinding []) {
{
- .name = "weights_buffer_0",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .mem_quali = "readonly",
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float weights_0[];",
- },
- {
- .name = "sums_buffer_0",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .mem_quali = "readonly",
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float sums_0[];",
- },
- {
- .name = "weights_buffer_1",
+ .name = "weights_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float weights_1[];",
+ .buf_content = "float weights[];",
},
{
- .name = "sums_buffer_1",
+ .name = "sums_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float sums_1[];",
- },
- {
- .name = "weights_buffer_2",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .mem_quali = "readonly",
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float weights_2[];",
- },
- {
- .name = "sums_buffer_2",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .mem_quali = "readonly",
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float sums_2[];",
- },
- {
- .name = "weights_buffer_3",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .mem_quali = "readonly",
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float weights_3[];",
- },
- {
- .name = "sums_buffer_3",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .mem_quali = "readonly",
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float sums_3[];",
+ .buf_content = "float sums[];",
},
};
- RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set,
2*desc->nb_components, 0, 0));
+ RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2, 0, 0));
GLSLC(0, void main()
);
GLSLC(0, {
);
- GLSLC(1, ivec2 size;
);
GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
);
GLSLC(1, const uint plane = uint(gl_WorkGroupID.z);
);
+ GLSLC(1, const uvec2 size = imageSize(output_img[plane]);
);
+ GLSLC(0,
);
+ GLSLC(1, uint c_off;
);
+ GLSLC(1, uint c_plane;
);
+ GLSLC(1, uint ws_off;
);
GLSLC(0,
);
GLSLC(1, float w_sum;
);
GLSLC(1, float sum;
);
GLSLC(1, vec4 src;
);
GLSLC(1, vec4 r;
);
+ GLSLC(1, int invoc_idx;
);
+ GLSLC(1, int comp_idx;
);
GLSLC(0,
);
- GLSLC(1, size = imageSize(output_img[plane]);
);
GLSLC(1, if (!IS_WITHIN(pos, size))
);
GLSLC(2, return;
);
GLSLC(0,
);
GLSLC(1, src = imageLoad(input_img[plane], pos);
);
- GLSLC(0,
);
- for (int c = 0; c < desc->nb_components; c++) {
- int off = desc->comp[c].offset / (FFALIGN(desc->comp[c].depth, 8)/8);
- GLSLF(1, if (plane == %i) {
,desc->comp[c].plane);
- GLSLF(2, w_sum = weights_%i[pos.y*ws_stride[%i] + pos.x];
,c, c);
- GLSLF(2, sum = sums_%i[pos.y*ws_stride[%i] + pos.x];
,c, c);
- GLSLF(2, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255;
,off, off);
- GLSLC(1, }
);
- GLSLC(0,
);
- }
- GLSLC(1, imageStore(output_img[plane], pos, r);
);
+ GLSLF(1, for (comp_idx = 0; comp_idx < %i; comp_idx++) {
,desc->nb_components);
+ GLSLC(2, if (plane == comp_plane[comp_idx]) {
);
+ GLSLC(3, w_sum = 0.0;
);
+ GLSLC(3, sum = 0.0;
);
+ GLSLC(3, for (invoc_idx = 0; invoc_idx < t; invoc_idx++) {
);
+ GLSLC(4, ws_off = ws_total_count * invoc_idx +
ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; );
+ GLSLC(4, w_sum += weights[ws_off];
);
+ GLSLC(4, sum += sums[ws_off];
);
+ GLSLC(3, }
);
+ GLSLC(3, c_off = comp_off[comp_idx];
);
+ GLSLC(3, r[c_off] = (sum + src[c_off] * 255) / (1.0 + w_sum) /
255; );
+ GLSLC(2, }
);
+ GLSLC(1, }
);
+ GLSLC(1, imageStore(output_img[plane], pos, r);
);
GLSLC(0, }
);
RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main",
&spv_opaque));
@@ -640,11 +662,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
RET(ff_vk_unmap_buffer(&s->vkctx, &s->xyoffsets_buf, 1));
s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) /
TYPE_ELEMS));
- if (!vkctx->atomic_float_feats.shaderBufferFloat32AtomicAdd) {
- av_log(ctx, AV_LOG_WARNING, "Device doesn't support atomic float adds,
"
- "disabling dispatch parallelism\n");
- s->opts.t = 1;
- }
spv = ff_vk_spirv_init();
if (!spv) {
@@ -661,21 +678,25 @@ static av_cold int init_filter(AVFilterContext *ctx)
RET(ff_vk_exec_pool_init(vkctx, s->qf, &s->e, 1, 0, 0, 0, NULL));
- RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights,
- spv, s->vkctx.output_width,
s->vkctx.output_height,
- s->opts.t, desc, planes, &s->pl_weights_rows));
+ RET(init_integral_pipeline(vkctx, &s->e, &s->shd_horizontal,
&s->shd_vertical,
+ spv, desc, planes));
- RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise,
- spv, desc, planes));
+ RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, spv, desc,
planes));
+
+ RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, spv, desc,
planes));
+
+ RET(ff_vk_shader_update_desc_buffer(vkctx, &s->e.contexts[0],
&s->shd_vertical,
+ 1, 0, 0,
+ &s->xyoffsets_buf, 0,
s->xyoffsets_buf.size,
+ VK_FORMAT_UNDEFINED));
RET(ff_vk_shader_update_desc_buffer(vkctx, &s->e.contexts[0],
&s->shd_weights,
1, 0, 0,
- &s->xyoffsets_buf, 0,
s->xyoffsets_buf.size,
- VK_FORMAT_UNDEFINED));
+ &s->xyoffsets_buf, 0,
s->xyoffsets_buf.size,
+ VK_FORMAT_UNDEFINED));
do {
int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS,
s->opts.t);
- wg_invoc = FFMIN(wg_invoc,
vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
offsets_dispatched += wg_invoc * TYPE_ELEMS;
nb_dispatches++;
} while (offsets_dispatched < s->nb_offsets);
@@ -693,15 +714,22 @@ fail:
}
static int denoise_pass(NLMeansVulkanContext *s, FFVkExecContext *exec,
- FFVkBuffer *ws_vk, uint32_t ws_stride[4])
+ FFVkBuffer *ws_vk, uint32_t comp_offs[4], uint32_t
comp_planes[4],
+ uint32_t ws_offset[4], uint32_t ws_stride[4],
+ uint32_t ws_total_count, int t)
{
FFVulkanContext *vkctx = &s->vkctx;
FFVulkanFunctions *vk = &vkctx->vkfn;
- VkBufferMemoryBarrier2 buf_bar[8];
+ VkBufferMemoryBarrier2 buf_bar[2];
int nb_buf_bar = 0;
DenoisePushData pd = {
+ { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
+ { comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
+ { ws_offset[0], ws_offset[1], ws_offset[2], ws_offset[3] },
{ ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
+ ws_total_count,
+ t,
};
/* Denoise pass pipeline */
@@ -753,6 +781,8 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link,
AVFrame *in)
FFVulkanFunctions *vk = &vkctx->vkfn;
const AVPixFmtDescriptor *desc;
+ int comp_offs[4];
+ int comp_planes[4];
int plane_widths[4];
int plane_heights[4];
@@ -767,18 +797,17 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
/* Weights/sums */
AVBufferRef *ws_buf = NULL;
FFVkBuffer *ws_vk;
- VkDeviceSize weights_offs[4];
- VkDeviceSize sums_offs[4];
+ uint32_t ws_total_count = 0;
+ uint32_t ws_offset[4];
uint32_t ws_stride[4];
- size_t ws_size[4];
- size_t ws_total_size = 0;
+ size_t ws_total_size;
FFVkExecContext *exec;
VkImageView in_views[AV_NUM_DATA_POINTERS];
VkImageView out_views[AV_NUM_DATA_POINTERS];
VkImageMemoryBarrier2 img_bar[8];
int nb_img_bar = 0;
- VkBufferMemoryBarrier2 buf_bar[8];
+ VkBufferMemoryBarrier2 buf_bar[2];
int nb_buf_bar = 0;
if (!s->initialized)
@@ -789,27 +818,32 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
return AVERROR(EINVAL);
/* Integral image */
- int_stride = s->shd_weights.lg_size[0]*s->pl_weights_rows*TYPE_SIZE;
- int_size = s->shd_weights.lg_size[0]*s->pl_weights_rows*int_stride;
+ int_stride = FFALIGN(vkctx->output_width, s->shd_vertical.lg_size[0]) *
TYPE_SIZE;
+ int_size = FFALIGN(vkctx->output_height, s->shd_horizontal.lg_size[0]) *
int_stride;
/* Plane dimensions */
for (int i = 0; i < desc->nb_components; i++) {
plane_widths[i] = !i || (i == 3) ? vkctx->output_width :
AV_CEIL_RSHIFT(vkctx->output_width, desc->log2_chroma_w);
- plane_heights[i] = !i || (i == 3) ? vkctx->output_height :
AV_CEIL_RSHIFT(vkctx->output_height, desc->log2_chroma_w);
+ plane_heights[i] = !i || (i == 3) ? vkctx->output_height :
AV_CEIL_RSHIFT(vkctx->output_height, desc->log2_chroma_h);
plane_widths[i] = FFALIGN(plane_widths[i],
s->shd_denoise.lg_size[0]);
plane_heights[i] = FFALIGN(plane_heights[i],
s->shd_denoise.lg_size[1]);
+ comp_offs[i] = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth,
8)/8);
+ comp_planes[i] = desc->comp[i].plane;
+
ws_stride[i] = plane_widths[i];
- ws_size[i] = ws_stride[i] * plane_heights[i] * sizeof(float);
- ws_total_size += ws_size[i];
+ ws_offset[i] = ws_total_count;
+ ws_total_count += ws_stride[i] * plane_heights[i];
}
+ ws_total_size = ws_total_count * sizeof(float);
+
/* Buffers */
err = ff_vk_get_pooled_buffer(&s->vkctx, &s->integral_buf_pool,
&integral_buf,
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
NULL,
- s->opts.t * int_size,
+ int_size * s->opts.t * desc->nb_components,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
if (err < 0)
return err;
@@ -820,19 +854,12 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink
*link, AVFrame *in)
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
NULL,
- ws_total_size * 2,
+ ws_total_size * s-> opts.t * 2,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
if (err < 0)
return err;
ws_vk = (FFVkBuffer *)ws_buf->data;
- weights_offs[0] = 0;
- sums_offs[0] = ws_total_size;
- for (int i = 1; i < desc->nb_components; i++) {
- weights_offs[i] = weights_offs[i - 1] + ws_size[i - 1];
- sums_offs[i] = sums_offs[i - 1] + ws_size[i - 1];
- }
-
/* Output frame */
out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
if (!out) {
@@ -889,19 +916,6 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link,
AVFrame *in)
.size = ws_vk->size,
.offset = 0,
};
- buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
- .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
- .srcStageMask = integral_vk->stage,
- .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
- .srcAccessMask = integral_vk->access,
- .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
- VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
- .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .buffer = integral_vk->buf,
- .size = integral_vk->size,
- .offset = 0,
- };
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
@@ -912,118 +926,180 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
});
ws_vk->stage = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;
- integral_vk->stage = buf_bar[1].dstStageMask;
- integral_vk->access = buf_bar[1].dstAccessMask;
/* Buffer zeroing */
vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
- nb_buf_bar = 0;
- buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
- .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
- .srcStageMask = ws_vk->stage,
- .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
- .srcAccessMask = ws_vk->access,
- .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
- VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
- .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .buffer = ws_vk->buf,
- .size = ws_vk->size,
- .offset = 0,
- };
-
- vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
- .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
- .pBufferMemoryBarriers = buf_bar,
- .bufferMemoryBarrierCount = nb_buf_bar,
- });
- ws_vk->stage = buf_bar[0].dstStageMask;
- ws_vk->access = buf_bar[0].dstAccessMask;
-
+ /* Update integral descriptors */
+    ff_vk_shader_update_img_array(vkctx, exec, &s->shd_vertical, in, in_views, 0, 0,
+ VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
/* Update weights descriptors */
ff_vk_shader_update_img_array(vkctx, exec, &s->shd_weights, in, in_views,
0, 0,
VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
- for (int i = 0; i < desc->nb_components; i++) {
- RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights,
0, 1 + i*2 + 0, 0,
- ws_vk, weights_offs[i], ws_size[i],
- VK_FORMAT_UNDEFINED));
- RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights,
0, 1 + i*2 + 1, 0,
- ws_vk, sums_offs[i], ws_size[i],
- VK_FORMAT_UNDEFINED));
- }
+ RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights, 0,
1, 0,
+ ws_vk, 0, ws_total_size * s-> opts.t,
+ VK_FORMAT_UNDEFINED));
+ RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights, 0,
2, 0,
+ ws_vk, ws_total_size * s-> opts.t,
ws_total_size * s-> opts.t,
+ VK_FORMAT_UNDEFINED));
/* Update denoise descriptors */
ff_vk_shader_update_img_array(vkctx, exec, &s->shd_denoise, in, in_views,
0, 0,
VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
ff_vk_shader_update_img_array(vkctx, exec, &s->shd_denoise, out,
out_views, 0, 1,
VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
- for (int i = 0; i < desc->nb_components; i++) {
- RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise,
1, i*2 + 0, 0,
- ws_vk, weights_offs[i], ws_size[i],
- VK_FORMAT_UNDEFINED));
- RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise,
1, i*2 + 1, 0,
- ws_vk, sums_offs[i], ws_size[i],
- VK_FORMAT_UNDEFINED));
- }
-
- /* Weights pipeline */
- ff_vk_exec_bind_shader(vkctx, exec, &s->shd_weights);
+ RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise, 1,
0, 0,
+ ws_vk, 0, ws_total_size * s-> opts.t,
+ VK_FORMAT_UNDEFINED));
+ RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise, 1,
1, 0,
+ ws_vk, ws_total_size * s-> opts.t,
ws_total_size * s-> opts.t,
+ VK_FORMAT_UNDEFINED));
do {
- int wg_invoc;
- HorizontalPushData pd = {
+ int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS,
s->opts.t);
+
+ /* Integral pipeline */
+ IntegralPushData pd = {
{ plane_widths[0], plane_widths[1], plane_widths[2],
plane_widths[3] },
{ plane_heights[0], plane_heights[1], plane_heights[2],
plane_heights[3] },
- { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
- { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
- { s->strength[0], s->strength[1], s->strength[2], s->strength[2],
},
+ { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
+ { comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
integral_vk->address,
(uint64_t)int_size,
(uint64_t)int_stride,
offsets_dispatched,
};
- /* Push data */
- ff_vk_shader_update_push_const(vkctx, exec, &s->shd_weights,
+ ff_vk_exec_bind_shader(vkctx, exec, &s->shd_vertical);
+ ff_vk_shader_update_push_const(vkctx, exec, &s->shd_vertical,
VK_SHADER_STAGE_COMPUTE_BIT,
0, sizeof(pd), &pd);
- if (offsets_dispatched) {
- nb_buf_bar = 0;
- buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
- .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
- .srcStageMask = integral_vk->stage,
- .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
- .srcAccessMask = integral_vk->access,
- .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
- VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
- .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .buffer = integral_vk->buf,
- .size = integral_vk->size,
- .offset = 0,
- };
-
- vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
- .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
- .pBufferMemoryBarriers = buf_bar,
- .bufferMemoryBarrierCount = nb_buf_bar,
- });
- integral_vk->stage = buf_bar[1].dstStageMask;
- integral_vk->access = buf_bar[1].dstAccessMask;
- }
+ nb_buf_bar = 0;
+ buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = integral_vk->stage,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ .srcAccessMask = integral_vk->access,
+ .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = integral_vk->buf,
+ .size = integral_vk->size,
+ .offset = 0,
+ };
+ vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .pBufferMemoryBarriers = buf_bar,
+ .bufferMemoryBarrierCount = nb_buf_bar,
+ });
+ integral_vk->stage = buf_bar[0].dstStageMask;
+ integral_vk->access = buf_bar[0].dstAccessMask;
- wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS,
s->opts.t);
- wg_invoc = FFMIN(wg_invoc,
vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
+ /* End of vertical pass */
+ vk->CmdDispatch(exec->buf, FFALIGN(vkctx->output_width,
s->shd_vertical.lg_size[0])/s->shd_vertical.lg_size[0],
+ desc->nb_components, wg_invoc);
+
+ ff_vk_exec_bind_shader(vkctx, exec, &s->shd_horizontal);
+ ff_vk_shader_update_push_const(vkctx, exec, &s->shd_horizontal,
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ 0, sizeof(pd), &pd);
+
+ nb_buf_bar = 0;
+ buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = integral_vk->stage,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ .srcAccessMask = integral_vk->access,
+ .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = integral_vk->buf,
+ .size = integral_vk->size,
+ .offset = 0,
+ };
+ vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .pBufferMemoryBarriers = buf_bar,
+ .bufferMemoryBarrierCount = nb_buf_bar,
+ });
+ integral_vk->stage = buf_bar[0].dstStageMask;
+ integral_vk->access = buf_bar[0].dstAccessMask;
/* End of horizontal pass */
- vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);
+ vk->CmdDispatch(exec->buf, FFALIGN(vkctx->output_height,
s->shd_horizontal.lg_size[0])/s->shd_horizontal.lg_size[0],
+ desc->nb_components, wg_invoc);
+
+ /* Weights pipeline */
+ WeightsPushData wpd = {
+ { plane_widths[0], plane_widths[1], plane_widths[2],
plane_widths[3] },
+ { plane_heights[0], plane_heights[1], plane_heights[2],
plane_heights[3] },
+ { ws_offset[0], ws_offset[1], ws_offset[2], ws_offset[3] },
+ { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
+ { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
+ { s->strength[0], s->strength[1], s->strength[2], s->strength[3],
},
+ { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
+ { comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
+ integral_vk->address,
+ (uint64_t)int_size,
+ (uint64_t)int_stride,
+ offsets_dispatched,
+ ws_total_count,
+ };
+
+ ff_vk_exec_bind_shader(vkctx, exec, &s->shd_weights);
+ ff_vk_shader_update_push_const(vkctx, exec, &s->shd_weights,
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ 0, sizeof(wpd), &wpd);
+
+ nb_buf_bar = 0;
+ buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = integral_vk->stage,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ .srcAccessMask = integral_vk->access,
+ .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = integral_vk->buf,
+ .size = integral_vk->size,
+ .offset = 0,
+ };
+ buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = ws_vk->stage,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ .srcAccessMask = ws_vk->access,
+ .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = ws_vk->buf,
+ .size = ws_vk->size,
+ .offset = 0,
+ };
+ vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .pBufferMemoryBarriers = buf_bar,
+ .bufferMemoryBarrierCount = nb_buf_bar,
+ });
+ integral_vk->stage = buf_bar[0].dstStageMask;
+ integral_vk->access = buf_bar[0].dstAccessMask;
+ ws_vk->stage = buf_bar[1].dstStageMask;
+ ws_vk->access = buf_bar[1].dstAccessMask;
+
+ /* End of weights pass */
+ vk->CmdDispatch(exec->buf,
+ FFALIGN(vkctx->output_width,
s->shd_weights.lg_size[0])/s->shd_weights.lg_size[0],
+ FFALIGN(vkctx->output_height,
s->shd_weights.lg_size[1])/s->shd_weights.lg_size[1],
+ wg_invoc * desc->nb_components);
offsets_dispatched += wg_invoc * TYPE_ELEMS;
} while (offsets_dispatched < s->nb_offsets);
- RET(denoise_pass(s, exec, ws_vk, ws_stride));
+ RET(denoise_pass(s, exec, ws_vk, comp_offs, comp_planes, ws_offset,
ws_stride,
+ ws_total_count, s->opts.t));
err = ff_vk_exec_submit(vkctx, exec);
if (err < 0)
@@ -1051,6 +1127,8 @@ static void nlmeans_vulkan_uninit(AVFilterContext *avctx)
FFVulkanContext *vkctx = &s->vkctx;
ff_vk_exec_pool_free(vkctx, &s->e);
+ ff_vk_shader_free(vkctx, &s->shd_horizontal);
+ ff_vk_shader_free(vkctx, &s->shd_vertical);
ff_vk_shader_free(vkctx, &s->shd_weights);
ff_vk_shader_free(vkctx, &s->shd_denoise);
@@ -1071,7 +1149,7 @@ static const AVOption nlmeans_vulkan_options[] = {
{ "s", "denoising strength for all components", OFFSET(opts.s),
AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
{ "p", "patch size for all components", OFFSET(opts.p), AV_OPT_TYPE_INT,
{ .i64 = 3*2+1 }, 0, 99, FLAGS },
{ "r", "research window radius", OFFSET(opts.r), AV_OPT_TYPE_INT, { .i64
= 7*2+1 }, 0, 99, FLAGS },
-    { "t", "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 36 }, 1, 168, FLAGS },
+    { "t", "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 8 }, 1, 64, FLAGS },
{ "s1", "denoising strength for component 1", OFFSET(opts.sc[0]),
AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
{ "s2", "denoising strength for component 2", OFFSET(opts.sc[1]),
AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
-----------------------------------------------------------------------
Summary of changes:
doc/filters.texi | 13 +-
libavfilter/vf_nlmeans_vulkan.c | 950 ++++++++++++++++++++++------------------
2 files changed, 534 insertions(+), 429 deletions(-)
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]