This is a somewhat large patchset, mostly because I've tried to break each patch down into the smallest set of changes I could. On my machine (x86_64 Linux, with AVX2), everything compiles after each patch. Whenever I configure FFmpeg, libpostproc is always disabled and I have to manually change config.mak to fix this. I'm not sure if this is an issue on my end or just the default setting.
This is ultimately a qualifying task for the Google Summer of Code, so I know I still have a lot of things to work on; I just want to make sure I'm following the proper coding conventions and everything for now. This moves the C functions that process blocks horizontally into a separate file, so that none of the postprocessing algorithms are in the main postprocess.c file --- libpostproc/postprocess.c | 352 +---------------------------------------- libpostproc/postprocess_c.c | 373 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 374 insertions(+), 351 deletions(-) create mode 100644 libpostproc/postprocess_c.c diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c index 9d89782..86c0520 100644 --- a/libpostproc/postprocess.c +++ b/libpostproc/postprocess.c @@ -199,357 +199,7 @@ static inline void prefetcht2(const void *p) } #endif -/* The horizontal functions exist only in C because the MMX - * code is faster with vertical filters and transposing. */ - -/** - * Check if the given 8x8 Block is mostly "flat" - */ -static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c) -{ - int numEq= 0; - int y; - const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; - const int dcThreshold= dcOffset*2 + 1; - - for(y=0; y<BLOCK_SIZE; y++){ - numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold; - src+= stride; - } - return numEq > c->ppMode.flatnessThreshold; -} - -/** - * Check if the middle 8x8 Block in the given 8x16 block is flat - */ -static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c) -{ 
- int numEq= 0; - int y; - const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; - const int dcThreshold= dcOffset*2 + 1; - - src+= stride*4; // src points to begin of the 8x8 Block - for(y=0; y<BLOCK_SIZE-1; y++){ - numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold; - src+= stride; - } - return numEq > c->ppMode.flatnessThreshold; -} - -static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP) -{ - int i; - for(i=0; i<2; i++){ - if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0; - src += stride; - if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0; - src += stride; - if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0; - src += stride; - if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0; - src += stride; - } - return 1; -} - -static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP) -{ - int x; - src+= stride*4; - for(x=0; x<BLOCK_SIZE; x+=4){ - if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0; - if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0; - if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0; - if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0; - } - return 1; -} - -static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c) -{ - if( isHorizDC_C(src, stride, c) ){ - return isHorizMinMaxOk_C(src, stride, c->QP); - }else{ 
- return 2; - } -} - -static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c) -{ - if( isVertDC_C(src, stride, c) ){ - return isVertMinMaxOk_C(src, stride, c->QP); - }else{ - return 2; - } -} - -static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c) -{ - int y; - for(y=0; y<BLOCK_SIZE; y++){ - const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]); - - if(FFABS(middleEnergy) < 8*c->QP){ - const int q=(dst[3] - dst[4])/2; - const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); - const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); - - int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); - d= FFMAX(d, 0); - - d= (5*d + 32) >> 6; - d*= FFSIGN(-middleEnergy); - - if(q>0) - { - d = FFMAX(d, 0); - d = FFMIN(d, q); - } - else - { - d = FFMIN(d, 0); - d = FFMAX(d, q); - } - - dst[3]-= d; - dst[4]+= d; - } - dst+= stride; - } -} - -/** - * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) - * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) - */ -static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c) -{ - int y; - for(y=0; y<BLOCK_SIZE; y++){ - const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0]; - const int last= FFABS(dst[8] - dst[7]) < c->QP ? 
dst[8] : dst[7]; - - int sums[10]; - sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4; - sums[1] = sums[0] - first + dst[3]; - sums[2] = sums[1] - first + dst[4]; - sums[3] = sums[2] - first + dst[5]; - sums[4] = sums[3] - first + dst[6]; - sums[5] = sums[4] - dst[0] + dst[7]; - sums[6] = sums[5] - dst[1] + last; - sums[7] = sums[6] - dst[2] + last; - sums[8] = sums[7] - dst[3] + last; - sums[9] = sums[8] - dst[4] + last; - - dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4; - dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4; - dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4; - dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4; - dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4; - dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4; - dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4; - dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4; - - dst+= stride; - } -} - -/** - * Experimental Filter 1 (Horizontal) - * will not damage linear gradients - * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter - * can only smooth blocks at the expected locations (it cannot smooth them if they did move) - * MMX2 version does correct clipping C version does not - * not identical with the vertical one - */ -static inline void horizX1Filter(uint8_t *src, int stride, int QP) -{ - int y; - static uint64_t lut[256]; - if(!lut[255]) - { - int i; - for(i=0; i<256; i++) - { - int v= i < 128 ? 
2*i : 2*(i-256); -/* -//Simulate 112242211 9-Tap filter - uint64_t a= (v/16) & 0xFF; - uint64_t b= (v/8) & 0xFF; - uint64_t c= (v/4) & 0xFF; - uint64_t d= (3*v/8) & 0xFF; -*/ -//Simulate piecewise linear interpolation - uint64_t a= (v/16) & 0xFF; - uint64_t b= (v*3/16) & 0xFF; - uint64_t c= (v*5/16) & 0xFF; - uint64_t d= (7*v/16) & 0xFF; - uint64_t A= (0x100 - a)&0xFF; - uint64_t B= (0x100 - b)&0xFF; - uint64_t C= (0x100 - c)&0xFF; - uint64_t D= (0x100 - c)&0xFF; - - lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | - (D<<24) | (C<<16) | (B<<8) | (A); - //lut[i] = (v<<32) | (v<<24); - } - } - - for(y=0; y<BLOCK_SIZE; y++){ - int a= src[1] - src[2]; - int b= src[3] - src[4]; - int c= src[5] - src[6]; - - int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0); - - if(d < QP){ - int v = d * FFSIGN(-b); - - src[1] +=v/8; - src[2] +=v/4; - src[3] +=3*v/8; - src[4] -=3*v/8; - src[5] -=v/4; - src[6] -=v/8; - } - src+=stride; - } -} - -/** - * accurate deblock filter - */ -static av_always_inline void do_a_deblock_C(uint8_t *src, int step, - int stride, const PPContext *c, int mode) -{ - int y; - const int QP= c->QP; - const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; - const int dcThreshold= dcOffset*2 + 1; -//START_TIMER - src+= step*4; // src points to begin of the 8x8 Block - for(y=0; y<8; y++){ - int numEq= 0; - - numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold; - numEq += ((unsigned)(src[ 7*step] 
- src[8*step] + dcOffset)) < dcThreshold; - if(numEq > c->ppMode.flatnessThreshold){ - int min, max, x; - - if(src[0] > src[step]){ - max= src[0]; - min= src[step]; - }else{ - max= src[step]; - min= src[0]; - } - for(x=2; x<8; x+=2){ - if(src[x*step] > src[(x+1)*step]){ - if(src[x *step] > max) max= src[ x *step]; - if(src[(x+1)*step] < min) min= src[(x+1)*step]; - }else{ - if(src[(x+1)*step] > max) max= src[(x+1)*step]; - if(src[ x *step] < min) min= src[ x *step]; - } - } - if(max-min < 2*QP){ - const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; - const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; - - int sums[10]; - sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; - sums[1] = sums[0] - first + src[3*step]; - sums[2] = sums[1] - first + src[4*step]; - sums[3] = sums[2] - first + src[5*step]; - sums[4] = sums[3] - first + src[6*step]; - sums[5] = sums[4] - src[0*step] + src[7*step]; - sums[6] = sums[5] - src[1*step] + last; - sums[7] = sums[6] - src[2*step] + last; - sums[8] = sums[7] - src[3*step] + last; - sums[9] = sums[8] - src[4*step] + last; - - if (mode & VISUALIZE) { - src[0*step] = - src[1*step] = - src[2*step] = - src[3*step] = - src[4*step] = - src[5*step] = - src[6*step] = - src[7*step] = 128; - } - src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; - src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; - src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; - src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; - src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; - src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; - src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; - src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; - } - }else{ - const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]); - - if(FFABS(middleEnergy) < 8*QP){ - const int q=(src[3*step] - src[4*step])/2; - const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - 
src[3*step]); - const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]); - - int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); - d= FFMAX(d, 0); - - d= (5*d + 32) >> 6; - d*= FFSIGN(-middleEnergy); - - if(q>0){ - d = FFMAX(d, 0); - d = FFMIN(d, q); - }else{ - d = FFMIN(d, 0); - d = FFMAX(d, q); - } - - if ((mode & VISUALIZE) && d) { - d= (d < 0) ? 32 : -32; - src[3*step]= av_clip_uint8(src[3*step] - d); - src[4*step]= av_clip_uint8(src[4*step] + d); - d = 0; - } - - src[3*step]-= d; - src[4*step]+= d; - } - } - - src += stride; - } -/*if(step==16){ - STOP_TIMER("step16") -}else{ - STOP_TIMER("stepX") -}*/ -} +#include "postprocess_c.c" //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one //Plain C versions diff --git a/libpostproc/postprocess_c.c b/libpostproc/postprocess_c.c new file mode 100644 index 0000000..bf22e95 --- /dev/null +++ b/libpostproc/postprocess_c.c @@ -0,0 +1,373 @@ +/****************************************************************************** +* C implementation of postprocessing routines +* Copyright (C) 2001-2002 Michael Niedermayer (michae...@gmx.at) +* Copyright (c) 2015 Tucker DiNapoli +* +* This file is part of FFmpeg. +* +* FFmpeg is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License as published by the Free Software Foundation; either +* version 2.1 of the License, or (at your option) any later version. +* +* FFmpeg is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* Lesser General Public License for more details. 
+* +* You should have received a copy of the GNU Lesser General Public +* License along with FFmpeg; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +******************************************************************************/ + +/* The horizontal functions exist only in C because the MMX + * code is faster with vertical filters and transposing. */ + +/** + * Check if the given 8x8 Block is mostly "flat" + */ +static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c) +{ + int numEq= 0; + int y; + const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; + const int dcThreshold= dcOffset*2 + 1; + + for(y=0; y<BLOCK_SIZE; y++){ + numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold; + src+= stride; + } + return numEq > c->ppMode.flatnessThreshold; +} + +/** + * Check if the middle 8x8 Block in the given 8x16 block is flat + */ +static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c) +{ + int numEq= 0; + int y; + const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; + const int dcThreshold= dcOffset*2 + 1; + + src+= stride*4; // src points to begin of the 8x8 Block + for(y=0; y<BLOCK_SIZE-1; y++){ + numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[4] - 
src[4+stride] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold; + src+= stride; + } + return numEq > c->ppMode.flatnessThreshold; +} + +static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP) +{ + int i; + for(i=0; i<2; i++){ + if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0; + src += stride; + if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0; + src += stride; + if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0; + src += stride; + if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0; + src += stride; + } + return 1; +} + +static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP) +{ + int x; + src+= stride*4; + for(x=0; x<BLOCK_SIZE; x+=4){ + if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0; + if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0; + if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0; + if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0; + } + return 1; +} + +static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c) +{ + if( isHorizDC_C(src, stride, c) ){ + return isHorizMinMaxOk_C(src, stride, c->QP); + }else{ + return 2; + } +} + +static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c) +{ + if( isVertDC_C(src, stride, c) ){ + return isVertMinMaxOk_C(src, stride, c->QP); + }else{ + return 2; + } +} + +static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c) +{ + int y; + for(y=0; y<BLOCK_SIZE; y++){ + const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]); + + if(FFABS(middleEnergy) < 8*c->QP){ + const int q=(dst[3] - dst[4])/2; + const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] 
- dst[3]); + const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); + + int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); + d= FFMAX(d, 0); + + d= (5*d + 32) >> 6; + d*= FFSIGN(-middleEnergy); + + if(q>0) + { + d = FFMAX(d, 0); + d = FFMIN(d, q); + } + else + { + d = FFMIN(d, 0); + d = FFMAX(d, q); + } + + dst[3]-= d; + dst[4]+= d; + } + dst+= stride; + } +} + +/** + * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) + * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) + */ +static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c) +{ + int y; + for(y=0; y<BLOCK_SIZE; y++){ + const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0]; + const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7]; + + int sums[10]; + sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4; + sums[1] = sums[0] - first + dst[3]; + sums[2] = sums[1] - first + dst[4]; + sums[3] = sums[2] - first + dst[5]; + sums[4] = sums[3] - first + dst[6]; + sums[5] = sums[4] - dst[0] + dst[7]; + sums[6] = sums[5] - dst[1] + last; + sums[7] = sums[6] - dst[2] + last; + sums[8] = sums[7] - dst[3] + last; + sums[9] = sums[8] - dst[4] + last; + + dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4; + dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4; + dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4; + dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4; + dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4; + dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4; + dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4; + dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4; + + dst+= stride; + } +} + +/** + * Experimental Filter 1 (Horizontal) + * will not damage linear gradients + * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter + * can only smooth blocks at the expected locations (it cannot smooth them if they did move) + * MMX2 version does correct clipping C version does not + * not identical with the vertical one + */ 
+static inline void horizX1Filter(uint8_t *src, int stride, int QP) +{ + int y; + static uint64_t lut[256]; + if(!lut[255]) + { + int i; + for(i=0; i<256; i++) + { + int v= i < 128 ? 2*i : 2*(i-256); +/* +//Simulate 112242211 9-Tap filter + uint64_t a= (v/16) & 0xFF; + uint64_t b= (v/8) & 0xFF; + uint64_t c= (v/4) & 0xFF; + uint64_t d= (3*v/8) & 0xFF; +*/ +//Simulate piecewise linear interpolation + uint64_t a= (v/16) & 0xFF; + uint64_t b= (v*3/16) & 0xFF; + uint64_t c= (v*5/16) & 0xFF; + uint64_t d= (7*v/16) & 0xFF; + uint64_t A= (0x100 - a)&0xFF; + uint64_t B= (0x100 - b)&0xFF; + uint64_t C= (0x100 - c)&0xFF; + uint64_t D= (0x100 - c)&0xFF; + + lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | + (D<<24) | (C<<16) | (B<<8) | (A); + //lut[i] = (v<<32) | (v<<24); + } + } + + for(y=0; y<BLOCK_SIZE; y++){ + int a= src[1] - src[2]; + int b= src[3] - src[4]; + int c= src[5] - src[6]; + + int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0); + + if(d < QP){ + int v = d * FFSIGN(-b); + + src[1] +=v/8; + src[2] +=v/4; + src[3] +=3*v/8; + src[4] -=3*v/8; + src[5] -=v/4; + src[6] -=v/8; + } + src+=stride; + } +} + +/** + * accurate deblock filter + */ +static av_always_inline void do_a_deblock_C(uint8_t *src, int step, + int stride, const PPContext *c, int mode) +{ + int y; + const int QP= c->QP; + const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; + const int dcThreshold= dcOffset*2 + 1; +//START_TIMER + src+= step*4; // src points to begin of the 8x8 Block + for(y=0; y<8; y++){ + int numEq= 0; + + numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold; + numEq 
+= ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold; + numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold; + if(numEq > c->ppMode.flatnessThreshold){ + int min, max, x; + + if(src[0] > src[step]){ + max= src[0]; + min= src[step]; + }else{ + max= src[step]; + min= src[0]; + } + for(x=2; x<8; x+=2){ + if(src[x*step] > src[(x+1)*step]){ + if(src[x *step] > max) max= src[ x *step]; + if(src[(x+1)*step] < min) min= src[(x+1)*step]; + }else{ + if(src[(x+1)*step] > max) max= src[(x+1)*step]; + if(src[ x *step] < min) min= src[ x *step]; + } + } + if(max-min < 2*QP){ + const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; + const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; + + int sums[10]; + sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; + sums[1] = sums[0] - first + src[3*step]; + sums[2] = sums[1] - first + src[4*step]; + sums[3] = sums[2] - first + src[5*step]; + sums[4] = sums[3] - first + src[6*step]; + sums[5] = sums[4] - src[0*step] + src[7*step]; + sums[6] = sums[5] - src[1*step] + last; + sums[7] = sums[6] - src[2*step] + last; + sums[8] = sums[7] - src[3*step] + last; + sums[9] = sums[8] - src[4*step] + last; + + if (mode & VISUALIZE) { + src[0*step] = + src[1*step] = + src[2*step] = + src[3*step] = + src[4*step] = + src[5*step] = + src[6*step] = + src[7*step] = 128; + } + src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; + src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; + src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; + src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; + src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; + src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; + src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; + src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; + } + }else{ + const int middleEnergy= 5*(src[4*step] - src[3*step]) + 
2*(src[2*step] - src[5*step]); + + if(FFABS(middleEnergy) < 8*QP){ + const int q=(src[3*step] - src[4*step])/2; + const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]); + const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]); + + int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); + d= FFMAX(d, 0); + + d= (5*d + 32) >> 6; + d*= FFSIGN(-middleEnergy); + + if(q>0){ + d = FFMAX(d, 0); + d = FFMIN(d, q); + }else{ + d = FFMIN(d, 0); + d = FFMAX(d, q); + } + + if ((mode & VISUALIZE) && d) { + d= (d < 0) ? 32 : -32; + src[3*step]= av_clip_uint8(src[3*step] - d); + src[4*step]= av_clip_uint8(src[4*step] + d); + d = 0; + } + + src[3*step]-= d; + src[4*step]+= d; + } + } + + src += stride; + } +/*if(step==16){ + STOP_TIMER("step16") +}else{ + STOP_TIMER("stepX") +}*/ +} -- 2.2.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel