On Tue, Oct 28, 2014 at 06:30:34PM +0100, Stefano Sabatini wrote:
[...]
> How much effort would it take to implement the remaining scaling modes?
> 
According to
https://ffmpeg.org/pipermail/ffmpeg-devel/2014-October/164574.html,
"I think 4x can be done fast enough, but 3x will take time."

[...]
> > +typedef struct {
> > +    uint32_t rgbtoyuv[1<<24];
> 
> We should avoid this 64MiB. Also the table should be possibly static,
> so you don't have to fill it per each xBR instance.

So, I requested to do it exactly the same way as in HQx because this
part is common according to the specifications. This should be kept the
same as in vf_hqx, and then factorized.

Now about removing this allocation: I did benchmark this LUT vs
computation (see the attached patch for the computed version), and the
problem is that it's slightly slower, probably due to the /1000.

I wasn't able to make it bit-exact with the current code using bit
hacks, and while this sounds like a tolerable inaccuracy, it actually
isn't and has an impact on the output. For example, doing this (on top
of the attached patch):

diff --git a/libavfilter/vf_hqx.c b/libavfilter/vf_hqx.c
index 41a77cf..f4d8006 100644
--- a/libavfilter/vf_hqx.c
+++ b/libavfilter/vf_hqx.c
@@ -29,6 +29,7 @@
 
 #include "libavutil/opt.h"
 #include "libavutil/avassert.h"
+#include "libavutil/colorspace.h"
 #include "libavutil/pixdesc.h"
 #include "internal.h"
 
@@ -58,9 +59,9 @@ static av_always_inline uint32_t rgb2yuv(uint32_t c)
     const int r = c >> 16 & 0xff;
     const int g = c >>  8 & 0xff;
     const int b = c       & 0xff;
-    const uint32_t y = (uint32_t)(( 299*r + 587*g + 114*b)/1000);
-    const uint32_t u = (uint32_t)((-169*r - 331*g + 500*b)/1000) + 128;
-    const uint32_t v = (uint32_t)(( 500*r - 419*g -  81*b)/1000) + 128;
+    const uint32_t y = RGB_TO_Y(r, g, b);
+    const uint32_t u = RGB_TO_U(r, g, b, 0);
+    const uint32_t v = RGB_TO_V(r, g, b, 0);
     return y<<16 | u<<8 | v;
 }
 

...leads to this:
https://lut.im/S9sJXgGU/ttB0B1j1 vs https://lut.im/9iRC6VMx/ef3PKqYd
(look at the sorcerers in particular, or Bomberman).

Even with a higher bit depth and checking the rounding, I had
differences. So for now, I prefer to keep the LUT unless someone has a
better idea. And anyway, this is orthogonal to this patch.

[...]

-- 
Clément B.
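For illustration, the rounding mismatch described above can be reproduced
outside FFmpeg with a standalone sketch such as the one below. The
FIX()/SCALEBITS fixed-point variant is only modeled on the
libavutil/colorspace.h macros (using the same 3-digit coefficients as the
/1000 version so that only the rounding differs), not the exact FFmpeg
code; it exhaustively counts how many of the 2^24 RGB triplets the two
conversions disagree on:

/* rounding_check.c -- hypothetical standalone test, not part of the patch.
 * Compares the truncating /1000 integer conversion with a rounded,
 * SCALEBITS-style fixed-point conversion over every 24-bit RGB value.
 * Assumes arithmetic right shift of negative ints, as colorspace.h does. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SCALEBITS 10
#define ONE_HALF  (1 << (SCALEBITS - 1))
#define FIX(x)    ((int)((x) * (1 << SCALEBITS) + 0.5))

static uint32_t rgb2yuv_div(int r, int g, int b)
{
    const uint32_t y = (uint32_t)(( 299*r + 587*g + 114*b)/1000);
    const uint32_t u = (uint32_t)((-169*r - 331*g + 500*b)/1000) + 128;
    const uint32_t v = (uint32_t)(( 500*r - 419*g -  81*b)/1000) + 128;
    return y<<16 | u<<8 | v;
}

static uint32_t rgb2yuv_fix(int r, int g, int b)
{
    const uint32_t y = ( FIX(0.299)*r + FIX(0.587)*g + FIX(0.114)*b + ONE_HALF) >> SCALEBITS;
    const uint32_t u = ((-FIX(0.169)*r - FIX(0.331)*g + FIX(0.500)*b + ONE_HALF - 1) >> SCALEBITS) + 128;
    const uint32_t v = (( FIX(0.500)*r - FIX(0.419)*g - FIX(0.081)*b + ONE_HALF - 1) >> SCALEBITS) + 128;
    return y<<16 | u<<8 | v;
}

int main(void)
{
    uint64_t mismatches = 0;
    for (int r = 0; r < 256; r++)
        for (int g = 0; g < 256; g++)
            for (int b = 0; b < 256; b++)
                mismatches += rgb2yuv_div(r, g, b) != rgb2yuv_fix(r, g, b);
    printf("%"PRIu64" of %d RGB triplets map to a different YUV triplet\n",
           mismatches, 1<<24);
    return 0;
}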
From 411b4e217f893a4ca1077d9814af02cf5349054a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= <u...@pkh.me>
Date: Mon, 27 Oct 2014 23:49:47 +0100
Subject: [PATCH] avfilter/hqx/WIP: remove LUT

---
 libavfilter/vf_hqx.c | 88 ++++++++++++++++++++--------------------------------
 1 file changed, 34 insertions(+), 54 deletions(-)

diff --git a/libavfilter/vf_hqx.c b/libavfilter/vf_hqx.c
index 4783381..41a77cf 100644
--- a/libavfilter/vf_hqx.c
+++ b/libavfilter/vf_hqx.c
@@ -38,12 +38,10 @@ typedef struct {
     const AVClass *class;
     int n;
     hqxfunc_t func;
-    uint32_t rgbtoyuv[1<<24];
 } HQXContext;
 
 typedef struct ThreadData {
     AVFrame *in, *out;
-    const uint32_t *rgbtoyuv;
 } ThreadData;
 
 #define OFFSET(x) offsetof(HQXContext, x)
@@ -55,9 +53,15 @@ static const AVOption hqx_options[] = {
 
 AVFILTER_DEFINE_CLASS(hqx);
 
-static av_always_inline uint32_t rgb2yuv(const uint32_t *r2y, uint32_t c)
+static av_always_inline uint32_t rgb2yuv(uint32_t c)
 {
-    return r2y[c & 0xffffff];
+    const int r = c >> 16 & 0xff;
+    const int g = c >>  8 & 0xff;
+    const int b = c       & 0xff;
+    const uint32_t y = (uint32_t)(( 299*r + 587*g + 114*b)/1000);
+    const uint32_t u = (uint32_t)((-169*r - 331*g + 500*b)/1000) + 128;
+    const uint32_t v = (uint32_t)(( 500*r - 419*g -  81*b)/1000) + 128;
+    return y<<16 | u<<8 | v;
 }
 
 static av_always_inline int yuv_diff(uint32_t yuv1, uint32_t yuv2)
@@ -97,7 +101,7 @@ static av_always_inline uint32_t interp_3px(uint32_t c1, int w1, uint32_t c2, in
 #define SHF(x, rot, n) (((x) >> ((rot) ? 7-DROP4(n) : DROP4(n)) & 1) << DROP4(p##n))
 
 /* used to check if there is YUV difference between 2 pixels */
-#define WDIFF(c1, c2) yuv_diff(rgb2yuv(r2y, c1), rgb2yuv(r2y, c2))
+#define WDIFF(c1, c2) yuv_diff(rgb2yuv(c1), rgb2yuv(c2))
 
 /* bootstrap template for every interpolation code. It defines the shuffled
  * masks and surrounding pixels. The rot flag is used to indicate if it's a
@@ -114,8 +118,7 @@ static av_always_inline uint32_t interp_3px(uint32_t c1, int w1, uint32_t c2, in
 /* Assuming p0..p8 is mapped to pixels 0..8, this function interpolates the
  * top-left pixel in the total of the 2x2 pixels to interpolates. The function
  * is also used for the 3 other pixels */
-static av_always_inline uint32_t hq2x_interp_1x1(const uint32_t *r2y, int k,
-                                                 const uint32_t *w,
+static av_always_inline uint32_t hq2x_interp_1x1(int k, const uint32_t *w,
                                                  int p0, int p1, int p2,
                                                  int p3, int p4, int p5,
                                                  int p6, int p7, int p8)
@@ -165,8 +168,7 @@ static av_always_inline uint32_t hq2x_interp_1x1(const uint32_t *r2y, int k,
  * defining the outline. The center pixel is not defined through this function,
  * since it's just the same as the original value. */
 static av_always_inline void hq3x_interp_2x1(uint32_t *dst, int dst_linesize,
-                                             const uint32_t *r2y, int k,
-                                             const uint32_t *w,
+                                             int k, const uint32_t *w,
                                              int pos00, int pos01,
                                              int p0, int p1, int p2,
                                              int p3, int p4, int p5,
@@ -231,8 +233,7 @@ static av_always_inline void hq3x_interp_2x1(uint32_t *dst, int dst_linesize,
  * interpolates. The function is also used for the 3 other blocks of 2x2
  * pixels. */
 static av_always_inline void hq4x_interp_2x2(uint32_t *dst, int dst_linesize,
-                                             const uint32_t *r2y, int k,
-                                             const uint32_t *w,
+                                             int k, const uint32_t *w,
                                              int pos00, int pos01,
                                              int pos10, int pos11,
                                              int p0, int p1, int p2,
@@ -382,7 +383,6 @@ static av_always_inline void hqx_filter(const ThreadData *td, int jobnr, int nb_
 {
     int x, y;
     AVFrame *in = td->in, *out = td->out;
-    const uint32_t *r2y = td->rgbtoyuv;
     const int height = in->height;
     const int width  = in->width;
     const int slice_start = (height *  jobnr   ) / nb_jobs;
@@ -409,32 +409,32 @@ static av_always_inline void hqx_filter(const ThreadData *td, int jobnr, int nb_
                 src32[prevcol           ], src32[       0], src32[           nextcol],
                 src32[prevcol + nextline], src32[nextline], src32[nextline + nextcol]
             };
-            const uint32_t yuv1 = rgb2yuv(r2y, w[4]);
-            const int pattern = (w[4] != w[0] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[0]))) : 0)
-                              | (w[4] != w[1] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[1]))) : 0) << 1
-                              | (w[4] != w[2] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[2]))) : 0) << 2
-                              | (w[4] != w[3] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[3]))) : 0) << 3
-                              | (w[4] != w[5] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[5]))) : 0) << 4
-                              | (w[4] != w[6] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[6]))) : 0) << 5
-                              | (w[4] != w[7] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[7]))) : 0) << 6
-                              | (w[4] != w[8] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[8]))) : 0) << 7;
+            const uint32_t yuv1 = rgb2yuv(w[4]);
+            const int pattern = (w[4] != w[0] ? (yuv_diff(yuv1, rgb2yuv(w[0]))) : 0)
+                              | (w[4] != w[1] ? (yuv_diff(yuv1, rgb2yuv(w[1]))) : 0) << 1
+                              | (w[4] != w[2] ? (yuv_diff(yuv1, rgb2yuv(w[2]))) : 0) << 2
+                              | (w[4] != w[3] ? (yuv_diff(yuv1, rgb2yuv(w[3]))) : 0) << 3
+                              | (w[4] != w[5] ? (yuv_diff(yuv1, rgb2yuv(w[5]))) : 0) << 4
+                              | (w[4] != w[6] ? (yuv_diff(yuv1, rgb2yuv(w[6]))) : 0) << 5
+                              | (w[4] != w[7] ? (yuv_diff(yuv1, rgb2yuv(w[7]))) : 0) << 6
+                              | (w[4] != w[8] ? (yuv_diff(yuv1, rgb2yuv(w[8]))) : 0) << 7;
 
             if (n == 2) {
-                dst32[dst32_linesize*0 + 0] = hq2x_interp_1x1(r2y, pattern, w, 0,1,2,3,4,5,6,7,8);  // 00
-                dst32[dst32_linesize*0 + 1] = hq2x_interp_1x1(r2y, pattern, w, 2,1,0,5,4,3,8,7,6);  // 01 (vert mirrored)
-                dst32[dst32_linesize*1 + 0] = hq2x_interp_1x1(r2y, pattern, w, 6,7,8,3,4,5,0,1,2);  // 10 (horiz mirrored)
-                dst32[dst32_linesize*1 + 1] = hq2x_interp_1x1(r2y, pattern, w, 8,7,6,5,4,3,2,1,0);  // 11 (center mirrored)
+                dst32[dst32_linesize*0 + 0] = hq2x_interp_1x1(pattern, w, 0,1,2,3,4,5,6,7,8);  // 00
+                dst32[dst32_linesize*0 + 1] = hq2x_interp_1x1(pattern, w, 2,1,0,5,4,3,8,7,6);  // 01 (vert mirrored)
+                dst32[dst32_linesize*1 + 0] = hq2x_interp_1x1(pattern, w, 6,7,8,3,4,5,0,1,2);  // 10 (horiz mirrored)
+                dst32[dst32_linesize*1 + 1] = hq2x_interp_1x1(pattern, w, 8,7,6,5,4,3,2,1,0);  // 11 (center mirrored)
             } else if (n == 3) {
-                hq3x_interp_2x1(dst32,                        dst32_linesize, r2y, pattern, w, 0,1, 0,1,2,3,4,5,6,7,8, 0);  // 00 01
-                hq3x_interp_2x1(dst32 + 1,                    dst32_linesize, r2y, pattern, w, 1,3, 2,5,8,1,4,7,0,3,6, 1);  // 02 12 (rotated to the right)
-                hq3x_interp_2x1(dst32 + 1*dst32_linesize,     dst32_linesize, r2y, pattern, w, 2,0, 6,3,0,7,4,1,8,5,2, 1);  // 20 10 (rotated to the left)
-                hq3x_interp_2x1(dst32 + 1*dst32_linesize + 1, dst32_linesize, r2y, pattern, w, 3,2, 8,7,6,5,4,3,2,1,0, 0);  // 22 21 (center mirrored)
-                dst32[dst32_linesize + 1] = w[4];                                                                           // 11
+                hq3x_interp_2x1(dst32,                        dst32_linesize, pattern, w, 0,1, 0,1,2,3,4,5,6,7,8, 0);  // 00 01
+                hq3x_interp_2x1(dst32 + 1,                    dst32_linesize, pattern, w, 1,3, 2,5,8,1,4,7,0,3,6, 1);  // 02 12 (rotated to the right)
+                hq3x_interp_2x1(dst32 + 1*dst32_linesize,     dst32_linesize, pattern, w, 2,0, 6,3,0,7,4,1,8,5,2, 1);  // 20 10 (rotated to the left)
+                hq3x_interp_2x1(dst32 + 1*dst32_linesize + 1, dst32_linesize, pattern, w, 3,2, 8,7,6,5,4,3,2,1,0, 0);  // 22 21 (center mirrored)
+                dst32[dst32_linesize + 1] = w[4];                                                                      // 11
             } else if (n == 4) {
-                hq4x_interp_2x2(dst32,                        dst32_linesize, r2y, pattern, w, 0,1,2,3, 0,1,2,3,4,5,6,7,8);  // 00 01 10 11
-                hq4x_interp_2x2(dst32 + 2,                    dst32_linesize, r2y, pattern, w, 1,0,3,2, 2,1,0,5,4,3,8,7,6);  // 02 03 12 13 (vert mirrored)
-                hq4x_interp_2x2(dst32 + 2*dst32_linesize,     dst32_linesize, r2y, pattern, w, 2,3,0,1, 6,7,8,3,4,5,0,1,2);  // 20 21 30 31 (horiz mirrored)
-                hq4x_interp_2x2(dst32 + 2*dst32_linesize + 2, dst32_linesize, r2y, pattern, w, 3,2,1,0, 8,7,6,5,4,3,2,1,0);  // 22 23 32 33 (center mirrored)
+                hq4x_interp_2x2(dst32,                        dst32_linesize, pattern, w, 0,1,2,3, 0,1,2,3,4,5,6,7,8);  // 00 01 10 11
+                hq4x_interp_2x2(dst32 + 2,                    dst32_linesize, pattern, w, 1,0,3,2, 2,1,0,5,4,3,8,7,6);  // 02 03 12 13 (vert mirrored)
+                hq4x_interp_2x2(dst32 + 2*dst32_linesize,     dst32_linesize, pattern, w, 2,3,0,1, 6,7,8,3,4,5,0,1,2);  // 20 21 30 31 (horiz mirrored)
+                hq4x_interp_2x2(dst32 + 2*dst32_linesize + 2, dst32_linesize, pattern, w, 3,2,1,0, 8,7,6,5,4,3,2,1,0);  // 22 23 32 33 (center mirrored)
             } else {
                 av_assert0(0);
             }
@@ -497,7 +497,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 
     td.in = in;
     td.out = out;
-    td.rgbtoyuv = hqx->rgbtoyuv;
     ctx->internal->execute(ctx, hqx->func, &td, NULL, FFMIN(inlink->h, ctx->graph->nb_threads));
 
     av_frame_free(&in);
@@ -508,25 +507,6 @@ static av_cold int init(AVFilterContext *ctx)
 {
     HQXContext *hqx = ctx->priv;
     static const hqxfunc_t hqxfuncs[] = {hq2x, hq3x, hq4x};
-
-    uint32_t c;
-    int bg, rg, g;
-
-    for (bg=-255; bg<256; bg++) {
-        for (rg=-255; rg<256; rg++) {
-            const uint32_t u = (uint32_t)((-169*rg + 500*bg)/1000) + 128;
-            const uint32_t v = (uint32_t)(( 500*rg -  81*bg)/1000) + 128;
-            int startg = FFMAX3(-bg, -rg, 0);
-            int endg = FFMIN3(255-bg, 255-rg, 255);
-            uint32_t y = (uint32_t)(( 299*rg + 1000*startg + 114*bg)/1000);
-            c = bg + (rg<<16) + 0x010101 * startg;
-            for (g = startg; g <= endg; g++) {
-                hqx->rgbtoyuv[c] = ((y++) << 16) + (u << 8) + v;
-                c+= 0x010101;
-            }
-        }
-    }
-
     hqx->func = hqxfuncs[hqx->n - 2];
     return 0;
 }
-- 
2.1.2
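For reference, a minimal LUT-vs-computation micro-benchmark along the
lines of the comparison mentioned in the mail could look like the sketch
below. It is illustrative only, not the harness actually used: the file
name and pixel count are arbitrary, the LUT is filled naively rather than
with the incremental trick removed above, and the numbers will depend
heavily on cache behaviour, the pixel distribution and the compiler.

/* lut_vs_compute.c -- hypothetical micro-benchmark, not part of the patch.
 * Fills the 64MiB RGB->YUV table naively, then times table lookups against
 * direct computation over a buffer of pseudo-random packed RGB pixels. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NB_PIXELS (1 << 22)

static uint32_t rgb2yuv_compute(uint32_t c)
{
    const int r = c >> 16 & 0xff;
    const int g = c >>  8 & 0xff;
    const int b = c       & 0xff;
    const uint32_t y = (uint32_t)(( 299*r + 587*g + 114*b)/1000);
    const uint32_t u = (uint32_t)((-169*r - 331*g + 500*b)/1000) + 128;
    const uint32_t v = (uint32_t)(( 500*r - 419*g -  81*b)/1000) + 128;
    return y<<16 | u<<8 | v;
}

int main(void)
{
    uint32_t *lut = malloc((1 << 24) * sizeof(*lut));   /* 64MiB table */
    uint32_t *px  = malloc(NB_PIXELS * sizeof(*px));
    uint64_t sum_lut = 0, sum_cmp = 0;                  /* keep loops alive */
    clock_t t0, t1, t2;

    if (!lut || !px)
        return 1;
    for (uint32_t c = 0; c < 1 << 24; c++)              /* naive LUT fill */
        lut[c] = rgb2yuv_compute(c);
    for (int i = 0; i < NB_PIXELS; i++)                 /* random test pixels */
        px[i] = (uint32_t)rand() & 0xffffff;

    t0 = clock();
    for (int i = 0; i < NB_PIXELS; i++)
        sum_lut += lut[px[i]];
    t1 = clock();
    for (int i = 0; i < NB_PIXELS; i++)
        sum_cmp += rgb2yuv_compute(px[i]);
    t2 = clock();

    printf("lut: %ld ticks, compute: %ld ticks (checksums %"PRIu64" %"PRIu64")\n",
           (long)(t1 - t0), (long)(t2 - t1), sum_lut, sum_cmp);
    free(lut);
    free(px);
    return 0;
}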