In previous patch there was missing braces {} in for (; j < length; j++) loop. Please ignore that patch.
New patch with braces and without FFMIN macro. Mateusz W dniu 2017-03-18 o 21:28, Mateusz Brzostek pisze: > W dniu 2017-03-16 o 19:17, Michael Niedermayer pisze: >> On Wed, Mar 15, 2017 at 10:52:29PM +0100, Mateusz Brzostek wrote: >>> Hello! >>> >>> There are 3 problems with DITHER_COPY macro in >>> libswscale/swscale_unscaled.c: >>> 1) there is overflow in dithering from 12-bit to 10-bit (output value > >>> 1023); >>> 2) for limit range the lower limit is not respected, for example from >>> 10-bit to 8-bit value 64 is converted to 15; >>> 3) for many iteration of downscale/upscale of the same image the 200th >>> iteration is significantly darker. >>> >>> The first bug is due to wrong dithers table (now it is OK only for 8-bit >>> destination), fix is: >>> - const uint8_t *dither= dithers[src_depth-9][i&7];\ >>> + const uint8_t *dither= dithers[src_depth-dst_depth-1][i&7];\ >>> >>> For bugs 2) and 3) it is needed formula that do not make images darker (in >>> attachment). So please review. >> does your code maintain white and black levels ? >> with 4 bits white is 15, with 7 bits white is 127 for example >> white should stay white >> black should stay black >> in both directions >> >> [...] > After speed tests I've prepared faster version of this patch (do exactly the > same but faster). > > ffmpeg0 - clean 64-bit gcc 6.3 build for Windows from snapshot 2017-03-18, > ffmpeg1 - the same + old patch, > ffmpeg2 - the same + new patch. > > Average speed was (from 10 laps): > ffmpeg0 -- 2.625 > ffmpeg1 -- 2.540 > ffmpeg2 -- 2.628 > > New patch is faster than old and is not slower than current (buggy) code. > > My speed test looks like this (copy from 324 frames movie 2400x1350, > yuv444p12): > f:\speed\ff2>for /L %n in (1 1 10) do (for /L %i in (0 1 2) do (ffmpeg%i -y > -stats -i ../original_vc3.mxf -v error -strict -1 -p > ix_fmt yuv444p10 -f yuv4mpegpipe NUL ) ) > > f:\speed\ff2>(for /L %i in (0 1 2) do (ffmpeg%i -y -stats -i > ../original_vc3.mxf -v error -strict -1 -pix_fmt yuv444p10 -f yuv4m > pegpipe NUL ) ) > > f:\speed\ff2>(ffmpeg0 -y -stats -i ../original_vc3.mxf -v error -strict -1 > -pix_fmt yuv444p10 -f yuv4mpegpipe NUL ) > frame= 324 fps= 63 q=-0.0 Lsize= 6150939kB time=00:00:13.50 > bitrate=3732481.2kbits/s speed=2.64x > > f:\speed\ff2>(ffmpeg1 -y -stats -i ../original_vc3.mxf -v error -strict -1 > -pix_fmt yuv444p10 -f yuv4mpegpipe NUL ) > frame= 324 fps= 61 q=-0.0 Lsize= 6150939kB time=00:00:13.50 > bitrate=3732481.2kbits/s speed=2.54x > > f:\speed\ff2>(ffmpeg2 -y -stats -i ../original_vc3.mxf -v error -strict -1 > -pix_fmt yuv444p10 -f yuv4mpegpipe NUL ) > frame= 324 fps= 63 q=-0.0 Lsize= 6150939kB time=00:00:13.50 > bitrate=3732481.2kbits/s speed=2.62x > > New patch inline (+ in attachment for easy apply): > libswscale/swscale_unscaled.c | 41 +++++++++++------------------------------ > 1 file changed, 11 insertions(+), 30 deletions(-) > > diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c > index ba3d688..a1d0a8d 100644 > --- a/libswscale/swscale_unscaled.c > +++ b/libswscale/swscale_unscaled.c > @@ -110,24 +110,6 @@ DECLARE_ALIGNED(8, static const uint8_t, > dithers)[8][8][8]={ > { 112, 16,104, 8,118, 22,110, 14,}, > }}; > > -static const uint16_t dither_scale[15][16]={ > -{ 2, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, > 5, 5, 5, 5,}, > -{ 2, 3, 7, 7, 13, 13, 25, 25, 25, 25, 25, 25, > 25, 25, 25, 25,}, > -{ 3, 3, 4, 15, 15, 29, 57, 57, 57, 113, 113, 113, > 113, 113, 113, 113,}, > -{ 3, 4, 4, 5, 31, 31, 61, 121, 241, 241, 241, 241, > 481, 481, 481, 481,}, > -{ 3, 4, 5, 5, 6, 63, 63, 125, 249, 497, 993, 993, > 993, 993, 993, 1985,}, > -{ 3, 5, 6, 6, 6, 7, 127, 127, 253, 505, 1009, 2017, > 4033, 4033, 4033, 4033,}, > -{ 3, 5, 6, 7, 7, 7, 8, 255, 255, 509, 1017, 2033, > 4065, 8129,16257,16257,}, > -{ 3, 5, 6, 8, 8, 8, 8, 9, 511, 511, 1021, 2041, > 4081, 8161,16321,32641,}, > -{ 3, 5, 7, 8, 9, 9, 9, 9, 10, 1023, 1023, 2045, > 4089, 8177,16353,32705,}, > -{ 3, 5, 7, 8, 10, 10, 10, 10, 10, 11, 2047, 2047, > 4093, 8185,16369,32737,}, > -{ 3, 5, 7, 8, 10, 11, 11, 11, 11, 11, 12, 4095, > 4095, 8189,16377,32753,}, > -{ 3, 5, 7, 9, 10, 12, 12, 12, 12, 12, 12, 13, > 8191, 8191,16381,32761,}, > -{ 3, 5, 7, 9, 10, 12, 13, 13, 13, 13, 13, 13, > 14,16383,16383,32765,}, > -{ 3, 5, 7, 9, 10, 12, 14, 14, 14, 14, 14, 14, > 14, 15,32767,32767,}, > -{ 3, 5, 7, 9, 11, 12, 14, 15, 15, 15, 15, 15, > 15, 15, 16,65535,}, > -}; > - > > static void fillPlane(uint8_t *plane, int stride, int width, int height, int > y, > uint8_t val) > @@ -1485,22 +1467,21 @@ static int packedCopyWrapper(SwsContext *c, const > uint8_t *src[], > } > > #define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\ > - uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\ > - int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\ > + unsigned shift= src_depth-dst_depth, dst_max= (1<<dst_depth)-1, tmp;\ > for (i = 0; i < height; i++) {\ > - const uint8_t *dither= dithers[src_depth-9][i&7];\ > + const uint8_t *dither= dithers[shift-1][i&7];\ > for (j = 0; j < length-7; j+=8){\ > - dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\ > - dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\ > - dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\ > - dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\ > - dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\ > - dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\ > - dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\ > - dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\ > + tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = > dbswap(FFMIN(tmp, dst_max));\ > + tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = > dbswap(FFMIN(tmp, dst_max));\ > + tmp = (bswap(src[j+2]) + dither[2])>>shift; dst[j+2] = > dbswap(FFMIN(tmp, dst_max));\ > + tmp = (bswap(src[j+3]) + dither[3])>>shift; dst[j+3] = > dbswap(FFMIN(tmp, dst_max));\ > + tmp = (bswap(src[j+4]) + dither[4])>>shift; dst[j+4] = > dbswap(FFMIN(tmp, dst_max));\ > + tmp = (bswap(src[j+5]) + dither[5])>>shift; dst[j+5] = > dbswap(FFMIN(tmp, dst_max));\ > + tmp = (bswap(src[j+6]) + dither[6])>>shift; dst[j+6] = > dbswap(FFMIN(tmp, dst_max));\ > + tmp = (bswap(src[j+7]) + dither[7])>>shift; dst[j+7] = > dbswap(FFMIN(tmp, dst_max));\ > }\ > for (; j < length; j++)\ > - dst[j] = dbswap((bswap(src[j]) + dither[j&7])*scale>>shift);\ > + tmp = (bswap(src[j]) + dither[j&7])>>shift; dst[j] = > dbswap(FFMIN(tmp, dst_max));\ > dst += dstStride;\ > src += srcStride;\ > } > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
libswscale/swscale_unscaled.c | 44 +++++++++++++------------------------------ 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index ba3d688..ee1844d 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -110,24 +110,6 @@ DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={ { 112, 16,104, 8,118, 22,110, 14,}, }}; -static const uint16_t dither_scale[15][16]={ -{ 2, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,}, -{ 2, 3, 7, 7, 13, 13, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,}, -{ 3, 3, 4, 15, 15, 29, 57, 57, 57, 113, 113, 113, 113, 113, 113, 113,}, -{ 3, 4, 4, 5, 31, 31, 61, 121, 241, 241, 241, 241, 481, 481, 481, 481,}, -{ 3, 4, 5, 5, 6, 63, 63, 125, 249, 497, 993, 993, 993, 993, 993, 1985,}, -{ 3, 5, 6, 6, 6, 7, 127, 127, 253, 505, 1009, 2017, 4033, 4033, 4033, 4033,}, -{ 3, 5, 6, 7, 7, 7, 8, 255, 255, 509, 1017, 2033, 4065, 8129,16257,16257,}, -{ 3, 5, 6, 8, 8, 8, 8, 9, 511, 511, 1021, 2041, 4081, 8161,16321,32641,}, -{ 3, 5, 7, 8, 9, 9, 9, 9, 10, 1023, 1023, 2045, 4089, 8177,16353,32705,}, -{ 3, 5, 7, 8, 10, 10, 10, 10, 10, 11, 2047, 2047, 4093, 8185,16369,32737,}, -{ 3, 5, 7, 8, 10, 11, 11, 11, 11, 11, 12, 4095, 4095, 8189,16377,32753,}, -{ 3, 5, 7, 9, 10, 12, 12, 12, 12, 12, 12, 13, 8191, 8191,16381,32761,}, -{ 3, 5, 7, 9, 10, 12, 13, 13, 13, 13, 13, 13, 14,16383,16383,32765,}, -{ 3, 5, 7, 9, 10, 12, 14, 14, 14, 14, 14, 14, 14, 15,32767,32767,}, -{ 3, 5, 7, 9, 11, 12, 14, 15, 15, 15, 15, 15, 15, 15, 16,65535,}, -}; - static void fillPlane(uint8_t *plane, int stride, int width, int height, int y, uint8_t val) @@ -1485,22 +1467,22 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[], } #define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\ - uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\ - int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\ + unsigned shift= src_depth-dst_depth, tmp;\ for (i = 0; i < height; i++) {\ - const uint8_t *dither= dithers[src_depth-9][i&7];\ + const uint8_t *dither= dithers[shift-1][i&7];\ for (j = 0; j < length-7; j+=8){\ - dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\ - dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\ - dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\ - dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\ - dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\ - dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\ - dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\ - dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\ + tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = dbswap(tmp - (tmp>>dst_depth));\ + tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = dbswap(tmp - (tmp>>dst_depth));\ + tmp = (bswap(src[j+2]) + dither[2])>>shift; dst[j+2] = dbswap(tmp - (tmp>>dst_depth));\ + tmp = (bswap(src[j+3]) + dither[3])>>shift; dst[j+3] = dbswap(tmp - (tmp>>dst_depth));\ + tmp = (bswap(src[j+4]) + dither[4])>>shift; dst[j+4] = dbswap(tmp - (tmp>>dst_depth));\ + tmp = (bswap(src[j+5]) + dither[5])>>shift; dst[j+5] = dbswap(tmp - (tmp>>dst_depth));\ + tmp = (bswap(src[j+6]) + dither[6])>>shift; dst[j+6] = dbswap(tmp - (tmp>>dst_depth));\ + tmp = (bswap(src[j+7]) + dither[7])>>shift; dst[j+7] = dbswap(tmp - (tmp>>dst_depth));\ + }\ + for (; j < length; j++){\ + tmp = (bswap(src[j]) + dither[j&7])>>shift; dst[j] = dbswap(tmp - (tmp>>dst_depth));\ }\ - for (; j < length; j++)\ - dst[j] = dbswap((bswap(src[j]) + dither[j&7])*scale>>shift);\ dst += dstStride;\ src += srcStride;\ }
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel