On 21/03/2021 23:22, Lucas Clemente Vella wrote:
This filter originally quantized OpenCL float images fetchs in 256 levels,
and computed the integral image of squared differences in 32 bit integers.
This had two consequences:
1) it could overflow if the image resolution was big enough (I got overflows
in a 4K video);
2) it dropped precision from bit depths higher than 8 bits.
Now the integral image is computed with float values in range [0, 1], instead
of integers in range [0, 255] (then squared), so there is no longer the risk
of overflow.
And even with the accumulated floating point error over the integral image,
the resulting difference between this float implementation and an experimental
uint64 implementation with 65535 quantization levels is less than 0.08%
on the worst difference (per component), and less than 0.002% on average. For
reference, the smallest variation possible on a 10-bit quantization is 0.098%
of the total intensity. This was tested on a 4K frame from an 10-bit source.
Does the sum actually avoid floating-point error beyond simple rounding?
Given only 23 bits of precision, it looks like there is a danger of losing
things by adding a set of small numbers to a large number and the large number
being unchanged.
Signed-off-by: Lucas Clemente Vella <lve...@gmail.com>
---
libavfilter/opencl/nlmeans.cl | 31 ++++++++++++++----------------
libavfilter/vf_nlmeans_opencl.c | 34 +++++----------------------------
2 files changed, 19 insertions(+), 46 deletions(-)
diff --git a/libavfilter/opencl/nlmeans.cl b/libavfilter/opencl/nlmeans.cl
index 72bd681fd6..6d78a41e46 100644
--- a/libavfilter/opencl/nlmeans.cl
+++ b/libavfilter/opencl/nlmeans.cl
@@ -20,7 +20,7 @@ const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_FILTER_NEAREST);
-kernel void horiz_sum(__global uint4 *integral_img,
+kernel void horiz_sum(__global float4 *integral_img,
__read_only image2d_t src,
int width,
int height,
@@ -31,7 +31,7 @@ kernel void horiz_sum(__global uint4 *integral_img,
int y = get_global_id(0);
int work_size = get_global_size(0);
- uint4 sum = (uint4)(0);
+ float4 sum = 0.0;
Surprise double.
float4 s2;
for (int i = 0; i < width; i++) {
float s1 = read_imagef(src, sampler, (int2)(i, y)).x;
@@ -39,28 +39,25 @@ kernel void horiz_sum(__global uint4 *integral_img,
s2.y = read_imagef(src, sampler, (int2)(i + dx.y, y + dy.y)).x;
s2.z = read_imagef(src, sampler, (int2)(i + dx.z, y + dy.z)).x;
s2.w = read_imagef(src, sampler, (int2)(i + dx.w, y + dy.w)).x;
- sum += convert_uint4((s1 - s2) * (s1 - s2) * 255 * 255);
+ sum += (s1 - s2) * (s1 - s2);
integral_img[y * width + i] = sum;
}
}
-kernel void vert_sum(__global uint4 *integral_img,
- __global int *overflow,
+kernel void vert_sum(__global float4 *integral_img,
int width,
int height)
{
int x = get_global_id(0);
- uint4 sum = 0;
+ float4 sum = 0;
Please write floats as floats (0.0f), even if the implicit conversion is
correct.
for (int i = 0; i < height; i++) {
- if (any((uint4)UINT_MAX - integral_img[i * width + x] < sum))
- atomic_inc(overflow);
integral_img[i * width + x] += sum;
sum = integral_img[i * width + x];
}
}
kernel void weight_accum(global float *sum, global float *weight,
- global uint4 *integral_img, __read_only image2d_t src,
+ global float4 *integral_img, __read_only image2d_t
src,
int width, int height, int p, float h,
int4 dx, int4 dy)
{
@@ -75,16 +72,16 @@ kernel void weight_accum(global float *sum, global float
*weight,
int y = get_global_id(1);
int4 xoff = x + dx;
int4 yoff = y + dy;
- uint4 a = 0, b = 0, c = 0, d = 0;
- uint4 src_pix = 0;
+ float4 a = 0, b = 0, c = 0, d = 0;
+ float4 src_pix = 0;
// out-of-bounding-box?
int oobb = (x - p) < 0 || (y - p) < 0 || (y + p) >= height || (x + p) >=
width;
- src_pix.x = (int)(255 * read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x);
- src_pix.y = (int)(255 * read_imagef(src, sampler, (int2)(xoff.y,
yoff.y)).x);
- src_pix.z = (int)(255 * read_imagef(src, sampler, (int2)(xoff.z,
yoff.z)).x);
- src_pix.w = (int)(255 * read_imagef(src, sampler, (int2)(xoff.w,
yoff.w)).x);
+ src_pix.x = read_imagef(src, sampler, (int2)(xoff.x, yoff.x)).x;
+ src_pix.y = read_imagef(src, sampler, (int2)(xoff.y, yoff.y)).x;
+ src_pix.z = read_imagef(src, sampler, (int2)(xoff.z, yoff.z)).x;
+ src_pix.w = read_imagef(src, sampler, (int2)(xoff.w, yoff.w)).x;
if (!oobb) {
a = integral_img[(y - p) * width + x - p];
b = integral_img[(y + p) * width + x - p];
@@ -93,7 +90,7 @@ kernel void weight_accum(global float *sum, global float
*weight,
}
float4 patch_diff = convert_float4(d + a - c - b);
- float4 w = native_exp(-patch_diff / (h * h));
+ float4 w = native_exp(-patch_diff * (float4)(255.0f * 255.0f) / (h * h));
float w_sum = w.x + w.y + w.z + w.w;
weight[y * width + x] += w_sum;
sum[y * width + x] += dot(w, convert_float4(src_pix));
@@ -109,7 +106,7 @@ kernel void average(__write_only image2d_t dst,
float w = weight[y * dim.x + x];
float s = sum[y * dim.x + x];
float src_pix = read_imagef(src, sampler, (int2)(x, y)).x;
- float r = (s + src_pix * 255) / (1.0f + w) / 255.0f;
+ float r = (s + src_pix) / (1.0f + w);
if (x < dim.x && y < dim.y)
write_imagef(dst, (int2)(x, y), (float4)(r, 0.0f, 0.0f, 1.0f));
}
diff --git a/libavfilter/vf_nlmeans_opencl.c b/libavfilter/vf_nlmeans_opencl.c
index e57b5e0873..0b69f3b6c4 100644
--- a/libavfilter/vf_nlmeans_opencl.c
+++ b/libavfilter/vf_nlmeans_opencl.c
@@ -30,11 +30,9 @@
#include "opencl_source.h"
#include "video.h"
-// TODO:
-// the integral image may overflow 32bit, consider using 64bit
-
static const enum AVPixelFormat supported_formats[] = {
AV_PIX_FMT_YUV420P,
+ AV_PIX_FMT_YUV420P16LE,
AV_PIX_FMT_YUV444P,
Is this the only format it has added support for? Feels like it could do
others which in the MSBs.
AV_PIX_FMT_GBRP,
};
@@ -59,7 +57,6 @@ typedef struct NLMeansOpenCLContext {
cl_mem integral_img;
cl_mem weight;
cl_mem sum;
- cl_mem overflow; // overflow in integral image?
double sigma;
float h;
int chroma_w;
@@ -129,7 +126,7 @@ static int nlmeans_opencl_init(AVFilterContext *avctx, int
width, int height)
"average kernel %d.\n", cle);
ctx->integral_img = clCreateBuffer(ctx->ocf.hwctx->context, 0,
- 4 * width * height * sizeof(cl_int),
+ 4 * width * height * sizeof(cl_float),
NULL, &cle);
CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
"integral image %d.\n", cle);
@@ -144,11 +141,6 @@ static int nlmeans_opencl_init(AVFilterContext *avctx, int
width, int height)
CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
"sum buffer %d.\n", cle);
- ctx->overflow = clCreateBuffer(ctx->ocf.hwctx->context, 0,
- sizeof(cl_int), NULL, &cle);
- CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create "
- "overflow buffer %d.\n", cle);
-
ctx->initialised = 1;
return 0;
@@ -161,7 +153,6 @@ fail:
CL_RELEASE_MEMORY(ctx->integral_img);
CL_RELEASE_MEMORY(ctx->weight);
CL_RELEASE_MEMORY(ctx->sum);
- CL_RELEASE_MEMORY(ctx->overflow);
CL_RELEASE_QUEUE(ctx->command_queue);
return err;
@@ -239,9 +230,8 @@ static int nlmeans_plane(AVFilterContext *avctx, cl_mem
dst, cl_mem src,
// vertical pass
// integral(x, y) = sum(integral(x, v)) for v in [0, y]
CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ctx->integral_img);
- CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_mem, &ctx->overflow);
- CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &width);
- CL_SET_KERNEL_ARG(ctx->vert_kernel, 3, cl_int, &height);
+ CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_int, &width);
+ CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &height);
Not affecting this patch, but a side thought: are these arguments actually
needed? They are used as the work sizes, so might be accessible inside the
kernel already.
cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel,
1, NULL, worksize2, NULL, 0, NULL, NULL);
CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: %d.\n",
@@ -293,8 +283,7 @@ static int nlmeans_opencl_filter_frame(AVFilterLink
*inlink, AVFrame *input)
const AVPixFmtDescriptor *desc;
enum AVPixelFormat in_format;
cl_mem src, dst;
- const cl_int zero = 0;
- int w, h, err, cle, overflow, p, patch, research;
+ int w, h, err, cle, p, patch, research;
av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
av_get_pix_fmt_name(input->format),
@@ -331,11 +320,6 @@ static int nlmeans_opencl_filter_frame(AVFilterLink
*inlink, AVFrame *input)
goto fail;
}
- cle = clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
- 0, sizeof(cl_int), &zero, 0, NULL, NULL);
- CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to initialize overflow"
- "detection buffer %d.\n", cle);
-
for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) {
src = (cl_mem) input->data[p];
dst = (cl_mem) output->data[p];
@@ -351,17 +335,10 @@ static int nlmeans_opencl_filter_frame(AVFilterLink
*inlink, AVFrame *input)
if (err < 0)
goto fail;
}
- // overflow occurred?
- cle = clEnqueueReadBuffer(ctx->command_queue, ctx->overflow, CL_FALSE,
- 0, sizeof(cl_int), &overflow, 0, NULL, NULL);
- CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read overflow: %d.\n", cle);
cle = clFinish(ctx->command_queue);
CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle);
- if (overflow > 0)
- av_log(avctx, AV_LOG_ERROR, "integral image overflow %d\n", overflow);
-
av_frame_free(&input);
av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
@@ -390,7 +367,6 @@ static av_cold void nlmeans_opencl_uninit(AVFilterContext
*avctx)
CL_RELEASE_MEMORY(ctx->integral_img);
CL_RELEASE_MEMORY(ctx->weight);
CL_RELEASE_MEMORY(ctx->sum);
- CL_RELEASE_MEMORY(ctx->overflow);
CL_RELEASE_QUEUE(ctx->command_queue);
Thanks,
- Mark
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".